diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2010-11-15 06:49:30 +0000 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2010-11-15 06:49:30 +0000 |
commit | c94f28c383f58c9de74678e0f1624db9c5f8a8cb (patch) | |
tree | 3281184f026cb79cee6c20fe29c994ba654cbbe4 /fs | |
parent | df15315899c0641412bd54b29565a70b078a6ac8 (diff) | |
parent | 1bb95834bbcdc969e477a9284cf96c17a4c2616f (diff) |
Merge branch 'drm-intel-fixes' into drm-intel-next
Conflicts:
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/intel_ringbuffer.c
Diffstat (limited to 'fs')
260 files changed, 7793 insertions, 12558 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 795233702a4e..7e0511476797 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -17,3 +17,16 @@ config 9P_FSCACHE Choose Y here to enable persistent, read-only local caching support for 9p clients using FS-Cache + +config 9P_FS_POSIX_ACL + bool "9P POSIX Access Control Lists" + depends on 9P_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N diff --git a/fs/9p/Makefile b/fs/9p/Makefile index 91fba025fcbe..f8ba37effd1b 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_9P_FS) := 9p.o xattr_user.o 9p-$(CONFIG_9P_FSCACHE) += cache.o +9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o diff --git a/fs/9p/acl.c b/fs/9p/acl.c new file mode 100644 index 000000000000..12d602351dbe --- /dev/null +++ b/fs/9p/acl.c @@ -0,0 +1,392 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/posix_acl_xattr.h> +#include "xattr.h" +#include "acl.h" +#include "v9fs_vfs.h" +#include "v9fs.h" + +static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) +{ + ssize_t size; + void *value = NULL; + struct posix_acl *acl = NULL;; + + size = v9fs_fid_xattr_get(fid, name, NULL, 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = v9fs_fid_xattr_get(fid, name, value, size); + if (size > 0) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + goto err_out; + } + } else if (size == -ENODATA || size == 0 || + size == -ENOSYS || size == -EOPNOTSUPP) { + acl = NULL; + } else + acl = ERR_PTR(-EIO); + +err_out: + kfree(value); + return acl; +} + +int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) +{ + int retval = 0; + struct posix_acl *pacl, *dacl; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); + set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); + return 0; + } + /* get the default/access acl values and cache them */ + dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT); + pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS); + + if (!IS_ERR(dacl) && !IS_ERR(pacl)) { + set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); + set_cached_acl(inode, ACL_TYPE_ACCESS, pacl); + posix_acl_release(dacl); + posix_acl_release(pacl); + } else + retval = -EIO; + + return retval; +} + +static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + /* + * 9p Always cache the acl value when + * instantiating the inode (v9fs_inode_from_fid) + */ + acl = get_cached_acl(inode, type); + BUG_ON(acl == ACL_NOT_CACHED); + return acl; +} + +int 
v9fs_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + /* + * On access = client mode get the acl + * values from the server + */ + return 0; + } + acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + return -EAGAIN; +} + +static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) +{ + int retval; + char *name; + size_t size; + void *buffer; + struct inode *inode = dentry->d_inode; + + set_cached_acl(inode, type, acl); + /* Set a setxattr request to server */ + size = posix_acl_xattr_size(acl->a_count); + buffer = kmalloc(size, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + retval = posix_acl_to_xattr(acl, buffer, size); + if (retval < 0) + goto err_free_out; + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + retval = v9fs_xattr_set(dentry, name, buffer, size, 0); +err_free_out: + kfree(buffer); + return retval; +} + +int v9fs_acl_chmod(struct dentry *dentry) +{ + int retval = 0; + struct posix_acl *acl, *clone; + struct inode *inode = dentry->d_inode; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); + if (acl) { + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + retval = posix_acl_chmod_masq(clone, inode->i_mode); + if (!retval) + retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone); + posix_acl_release(clone); + } + return retval; +} + +int v9fs_set_create_acl(struct dentry *dentry, + struct posix_acl *dpacl, struct posix_acl *pacl) +{ + if (dpacl) + v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); + 
if (pacl) + v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl); + posix_acl_release(dpacl); + posix_acl_release(pacl); + return 0; +} + +int v9fs_acl_mode(struct inode *dir, mode_t *modep, + struct posix_acl **dpacl, struct posix_acl **pacl) +{ + int retval = 0; + mode_t mode = *modep; + struct posix_acl *acl = NULL; + + if (!S_ISLNK(mode)) { + acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) + mode &= ~current_umask(); + } + if (acl) { + struct posix_acl *clone; + + if (S_ISDIR(mode)) + *dpacl = acl; + clone = posix_acl_clone(acl, GFP_NOFS); + retval = -ENOMEM; + if (!clone) + goto cleanup; + + retval = posix_acl_create_masq(clone, &mode); + if (retval < 0) { + posix_acl_release(clone); + goto cleanup; + } + if (retval > 0) + *pacl = clone; + } + *modep = mode; + return 0; +cleanup: + posix_acl_release(acl); + return retval; + +} + +static int v9fs_remote_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char *full_name; + + switch (type) { + case ACL_TYPE_ACCESS: + full_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + full_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + return v9fs_xattr_get(dentry, full_name, buffer, size); +} + +static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct v9fs_session_info *v9ses; + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + + v9ses = v9fs_inode2v9ses(dentry->d_inode); + /* + * We allow set/get/list of acl when access=client is not specified + */ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_remote_get_acl(dentry, name, buffer, size, type); + + acl = v9fs_get_cached_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + 
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, + int flags, int type) +{ + char *full_name; + + switch (type) { + case ACL_TYPE_ACCESS: + full_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + full_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + return v9fs_xattr_set(dentry, full_name, value, size, flags); +} + + +static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, + int flags, int type) +{ + int retval; + struct posix_acl *acl; + struct v9fs_session_info *v9ses; + struct inode *inode = dentry->d_inode; + + if (strcmp(name, "") != 0) + return -EINVAL; + + v9ses = v9fs_inode2v9ses(dentry->d_inode); + /* + * set the attribute on the remote. Without even looking at the + * xattr value. We leave it to the server to validate + */ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_remote_set_acl(dentry, name, + value, size, flags, type); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!is_owner_or_cap(inode)) + return -EPERM; + if (value) { + /* update the cached acl value */ + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + retval = posix_acl_valid(acl); + if (retval) + goto err_out; + } + } else + acl = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + retval = posix_acl_equiv_mode(acl, &mode); + if (retval < 0) + goto err_out; + else { + struct iattr iattr; + if (retval == 0) { + /* + * ACL can be represented + * by the mode bits. So don't + * update ACL. + */ + acl = NULL; + value = NULL; + size = 0; + } + /* Updte the mode bits */ + iattr.ia_mode = ((mode & S_IALLUGO) | + (inode->i_mode & ~S_IALLUGO)); + iattr.ia_valid = ATTR_MODE; + /* FIXME should we update ctime ? + * What is the following setxattr update the + * mode ? 
+ */ + v9fs_vfs_setattr_dotl(dentry, &iattr); + } + } + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) { + retval = -EINVAL; + goto err_out; + } + break; + default: + BUG(); + } + retval = v9fs_xattr_set(dentry, name, value, size, flags); + if (!retval) + set_cached_acl(inode, type, acl); +err_out: + posix_acl_release(acl); + return retval; +} + +const struct xattr_handler v9fs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = v9fs_xattr_get_acl, + .set = v9fs_xattr_set_acl, +}; + +const struct xattr_handler v9fs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = v9fs_xattr_get_acl, + .set = v9fs_xattr_set_acl, +}; diff --git a/fs/9p/acl.h b/fs/9p/acl.h new file mode 100644 index 000000000000..59e18c2e8c7e --- /dev/null +++ b/fs/9p/acl.h @@ -0,0 +1,49 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ +#ifndef FS_9P_ACL_H +#define FS_9P_ACL_H + +#ifdef CONFIG_9P_FS_POSIX_ACL +extern int v9fs_get_acl(struct inode *, struct p9_fid *); +extern int v9fs_check_acl(struct inode *inode, int mask); +extern int v9fs_acl_chmod(struct dentry *); +extern int v9fs_set_create_acl(struct dentry *, + struct posix_acl *, struct posix_acl *); +extern int v9fs_acl_mode(struct inode *dir, mode_t *modep, + struct posix_acl **dpacl, struct posix_acl **pacl); +#else +#define v9fs_check_acl NULL +static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) +{ + return 0; +} +static inline int v9fs_acl_chmod(struct dentry *dentry) +{ + return 0; +} +static inline int v9fs_set_create_acl(struct dentry *dentry, + struct posix_acl *dpacl, + struct posix_acl *pacl) +{ + return 0; +} +static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep, + struct posix_acl **dpacl, + struct posix_acl **pacl) +{ + return 0; +} + +#endif +#endif /* FS_9P_XATTR_H */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 6406f896bf95..b00223c99d70 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -149,6 +149,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) switch (access) { case V9FS_ACCESS_SINGLE: case V9FS_ACCESS_USER: + case V9FS_ACCESS_CLIENT: uid = current_fsuid(); any = 0; break; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 38dc0e067599..2f77cd33ba83 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -193,7 +193,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) v9ses->flags |= V9FS_ACCESS_USER; else if (strcmp(s, "any") == 0) v9ses->flags |= V9FS_ACCESS_ANY; - else { + else if (strcmp(s, "client") == 0) { +#ifdef CONFIG_9P_FS_POSIX_ACL + v9ses->flags |= V9FS_ACCESS_CLIENT; +#else + P9_DPRINTK(P9_DEBUG_ERROR, + "access=client option not supported\n"); + kfree(s); + ret = -EINVAL; + goto free_and_return; +#endif + } else { v9ses->flags |= V9FS_ACCESS_SINGLE; v9ses->uid = simple_strtoul(s, &e, 10); if (*e != '\0') @@ -278,6 +288,16 @@ struct p9_fid 
*v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; + if (!v9fs_proto_dotl(v9ses) && + ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { + /* + * We support ACCESS_CLIENT only for dotl. + * Fall back to ACCESS_USER + */ + v9ses->flags &= ~V9FS_ACCESS_MASK; + v9ses->flags |= V9FS_ACCESS_USER; + } + /*FIXME !! */ /* for legacy mode, fall back to V9FS_ACCESS_ANY */ if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 4c963c9fc41f..cb6396855e2d 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -33,13 +33,17 @@ * * Session flags reflect options selected by users at mount time */ +#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \ + V9FS_ACCESS_USER | \ + V9FS_ACCESS_CLIENT) +#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY + enum p9_session_flags { V9FS_PROTO_2000U = 0x01, V9FS_PROTO_2000L = 0x02, V9FS_ACCESS_SINGLE = 0x04, V9FS_ACCESS_USER = 0x08, - V9FS_ACCESS_ANY = 0x0C, - V9FS_ACCESS_MASK = 0x0C, + V9FS_ACCESS_CLIENT = 0x10 }; /* possible values of ->cache */ @@ -113,8 +117,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses); void v9fs_session_cancel(struct v9fs_session_info *v9ses); void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); -#define V9FS_MAGIC 0x01021997 - /* other default globals */ #define V9FS_PORT 564 #define V9FS_DEFUSER "nobody" diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 88418c419ea7..bab0eac873f4 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -64,3 +64,7 @@ int v9fs_uflags2omode(int uflags, int extended); ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); void v9fs_blank_wstat(struct p9_wstat *wstat); +int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); +int v9fs_file_fsync_dotl(struct file *filp, int datasync); + +#define P9_LOCK_TIMEOUT (30*HZ) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 
90e38449f4b3..b7f2a8e3863e 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -154,10 +154,40 @@ static int v9fs_launder_page(struct page *page) return 0; } +/** + * v9fs_direct_IO - 9P address space operation for direct I/O + * @rw: direction (read or write) + * @iocb: target I/O control block + * @iov: array of vectors that define I/O buffer + * @pos: offset in file to begin the operation + * @nr_segs: size of iovec array + * + * The presence of v9fs_direct_IO() in the address space ops vector + * allowes open() O_DIRECT flags which would have failed otherwise. + * + * In the non-cached mode, we shunt off direct read and write requests before + * the VFS gets them, so this method should never be called. + * + * Direct IO is not 'yet' supported in the cached mode. Hence when + * this routine is called through generic_file_aio_read(), the read/write fails + * with an error. + * + */ +ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t pos, unsigned long nr_segs) +{ + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " + "off/no(%lld/%lu) EINVAL\n", + iocb->ki_filp->f_path.dentry->d_name.name, + (long long) pos, nr_segs); + + return -EINVAL; +} const struct address_space_operations v9fs_addr_operations = { .readpage = v9fs_vfs_readpage, .readpages = v9fs_vfs_readpages, .releasepage = v9fs_release_page, .invalidatepage = v9fs_invalidate_page, .launder_page = v9fs_launder_page, + .direct_IO = v9fs_direct_IO, }; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 899f168fd19c..b84ebe8cefed 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -242,7 +242,8 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (rdir->head < rdir->tail) { err = p9dirent_read(rdir->buf + rdir->head, - buflen - rdir->head, &curdirent, + rdir->tail - rdir->head, + &curdirent, fid->clnt->proto_version); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); @@ -314,4 +315,5 @@ const struct file_operations 
v9fs_dir_operations_dotl = { .readdir = v9fs_dir_readdir_dotl, .open = v9fs_file_open, .release = v9fs_dir_release, + .fsync = v9fs_file_fsync_dotl, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index e97c92bd6f16..240c30674396 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -33,6 +33,7 @@ #include <linux/inet.h> #include <linux/list.h> #include <linux/pagemap.h> +#include <linux/utsname.h> #include <asm/uaccess.h> #include <linux/idr.h> #include <net/9p/9p.h> @@ -44,6 +45,7 @@ #include "cache.h" static const struct file_operations v9fs_cached_file_operations; +static const struct file_operations v9fs_cached_file_operations_dotl; /** * v9fs_file_open - open a file (or directory) @@ -92,6 +94,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) /* enable cached file options */ if(file->f_op == &v9fs_file_operations) file->f_op = &v9fs_cached_file_operations; + else if (file->f_op == &v9fs_file_operations_dotl) + file->f_op = &v9fs_cached_file_operations_dotl; #ifdef CONFIG_9P_FSCACHE v9fs_cache_inode_set_cookie(inode, file); @@ -130,6 +134,206 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) return res; } +static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct p9_flock flock; + struct p9_fid *fid; + uint8_t status; + int res = 0; + unsigned char fl_type; + + fid = filp->private_data; + BUG_ON(fid == NULL); + + if ((fl->fl_flags & FL_POSIX) != FL_POSIX) + BUG(); + + res = posix_lock_file_wait(filp, fl); + if (res < 0) + goto out; + + /* convert posix lock to p9 tlock args */ + memset(&flock, 0, sizeof(flock)); + flock.type = fl->fl_type; + flock.start = fl->fl_start; + if (fl->fl_end == OFFSET_MAX) + flock.length = 0; + else + flock.length = fl->fl_end - fl->fl_start + 1; + flock.proc_id = fl->fl_pid; + flock.client_id = utsname()->nodename; + if (IS_SETLKW(cmd)) + flock.flags = P9_LOCK_FLAGS_BLOCK; + + /* + * if its a blocked request and we get P9_LOCK_BLOCKED as the status + 
* for lock request, keep on trying + */ + for (;;) { + res = p9_client_lock_dotl(fid, &flock, &status); + if (res < 0) + break; + + if (status != P9_LOCK_BLOCKED) + break; + if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) + break; + schedule_timeout_interruptible(P9_LOCK_TIMEOUT); + } + + /* map 9p status to VFS status */ + switch (status) { + case P9_LOCK_SUCCESS: + res = 0; + break; + case P9_LOCK_BLOCKED: + res = -EAGAIN; + break; + case P9_LOCK_ERROR: + case P9_LOCK_GRACE: + res = -ENOLCK; + break; + default: + BUG(); + } + + /* + * incase server returned error for lock request, revert + * it locally + */ + if (res < 0 && fl->fl_type != F_UNLCK) { + fl_type = fl->fl_type; + fl->fl_type = F_UNLCK; + res = posix_lock_file_wait(filp, fl); + fl->fl_type = fl_type; + } +out: + return res; +} + +static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) +{ + struct p9_getlock glock; + struct p9_fid *fid; + int res = 0; + + fid = filp->private_data; + BUG_ON(fid == NULL); + + posix_test_lock(filp, fl); + /* + * if we have a conflicting lock locally, no need to validate + * with server + */ + if (fl->fl_type != F_UNLCK) + return res; + + /* convert posix lock to p9 tgetlock args */ + memset(&glock, 0, sizeof(glock)); + glock.type = fl->fl_type; + glock.start = fl->fl_start; + if (fl->fl_end == OFFSET_MAX) + glock.length = 0; + else + glock.length = fl->fl_end - fl->fl_start + 1; + glock.proc_id = fl->fl_pid; + glock.client_id = utsname()->nodename; + + res = p9_client_getlock_dotl(fid, &glock); + if (res < 0) + return res; + if (glock.type != F_UNLCK) { + fl->fl_type = glock.type; + fl->fl_start = glock.start; + if (glock.length == 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = glock.start + glock.length - 1; + fl->fl_pid = glock.proc_id; + } else + fl->fl_type = F_UNLCK; + + return res; +} + +/** + * v9fs_file_lock_dotl - lock a file (or directory) + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + */ + +static int 
v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = -ENOLCK; + + P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, + cmd, fl, filp->f_path.dentry->d_name.name); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + + if (IS_SETLK(cmd) || IS_SETLKW(cmd)) + ret = v9fs_file_do_lock(filp, cmd, fl); + else if (IS_GETLK(cmd)) + ret = v9fs_file_getlock(filp, fl); + else + ret = -EINVAL; +out_err: + return ret; +} + +/** + * v9fs_file_flock_dotl - lock a file + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + */ + +static int v9fs_file_flock_dotl(struct file *filp, int cmd, + struct file_lock *fl) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = -ENOLCK; + + P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, + cmd, fl, filp->f_path.dentry->d_name.name); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if (!(fl->fl_flags & FL_FLOCK)) + goto out_err; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + /* Convert flock to posix lock */ + fl->fl_owner = (fl_owner_t)filp; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + fl->fl_flags |= FL_POSIX; + fl->fl_flags ^= FL_FLOCK; + + if (IS_SETLK(cmd) | IS_SETLKW(cmd)) + ret = v9fs_file_do_lock(filp, cmd, fl); + else + ret = -EINVAL; +out_err: + return ret; +} + /** * v9fs_file_readn - read from a file * @filp: file pointer to read @@ -219,7 +423,9 @@ static ssize_t v9fs_file_write(struct file *filp, const char __user * data, size_t count, loff_t * offset) { - int n, 
rsize, total = 0; + ssize_t retval; + size_t total = 0; + int n; struct p9_fid *fid; struct p9_client *clnt; struct inode *inode = filp->f_path.dentry->d_inode; @@ -232,14 +438,19 @@ v9fs_file_write(struct file *filp, const char __user * data, fid = filp->private_data; clnt = fid->clnt; - rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ; + retval = generic_write_checks(filp, &origin, &count, 0); + if (retval) + goto out; - do { - if (count < rsize) - rsize = count; + retval = -EINVAL; + if ((ssize_t) count < 0) + goto out; + retval = 0; + if (!count) + goto out; - n = p9_client_write(fid, NULL, data+total, origin+total, - rsize); + do { + n = p9_client_write(fid, NULL, data+total, origin+total, count); if (n <= 0) break; count -= n; @@ -258,9 +469,11 @@ v9fs_file_write(struct file *filp, const char __user * data, } if (n < 0) - return n; - - return total; + retval = n; + else + retval = total; +out: + return retval; } static int v9fs_file_fsync(struct file *filp, int datasync) @@ -278,6 +491,20 @@ static int v9fs_file_fsync(struct file *filp, int datasync) return retval; } +int v9fs_file_fsync_dotl(struct file *filp, int datasync) +{ + struct p9_fid *fid; + int retval; + + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n", + filp, datasync); + + fid = filp->private_data; + + retval = p9_client_fsync(fid, datasync); + return retval; +} + static const struct file_operations v9fs_cached_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -290,6 +517,19 @@ static const struct file_operations v9fs_cached_file_operations = { .fsync = v9fs_file_fsync, }; +static const struct file_operations v9fs_cached_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = v9fs_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = generic_file_readonly_mmap, + .fsync = 
v9fs_file_fsync_dotl, +}; + const struct file_operations v9fs_file_operations = { .llseek = generic_file_llseek, .read = v9fs_file_read, @@ -307,7 +547,8 @@ const struct file_operations v9fs_file_operations_dotl = { .write = v9fs_file_write, .open = v9fs_file_open, .release = v9fs_dir_release, - .lock = v9fs_file_lock, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, .mmap = generic_file_readonly_mmap, - .fsync = v9fs_file_fsync, + .fsync = v9fs_file_fsync_dotl, }; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index ef5905f7c8a3..34bf71b56542 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -36,6 +36,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/posix_acl.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -44,6 +45,7 @@ #include "fid.h" #include "cache.h" #include "xattr.h" +#include "acl.h" static const struct inode_operations v9fs_dir_inode_operations; static const struct inode_operations v9fs_dir_inode_operations_dotu; @@ -53,6 +55,10 @@ static const struct inode_operations v9fs_file_inode_operations_dotl; static const struct inode_operations v9fs_symlink_inode_operations; static const struct inode_operations v9fs_symlink_inode_operations_dotl; +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, + dev_t rdev); + /** * unixmode2p9mode - convert unix mode bits to plan 9 * @v9ses: v9fs session information @@ -500,6 +506,11 @@ v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, v9fs_vcookie_set_qid(ret, &st->qid); v9fs_cache_inode_get_cookie(ret); #endif + err = v9fs_get_acl(ret, fid); + if (err) { + iput(ret); + goto error; + } kfree(st); return ret; error: @@ -553,13 +564,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) return retval; } -static int -v9fs_open_created(struct inode *inode, struct file *file) -{ - return 0; -} - - /** * v9fs_create - Create a file * @v9ses: session information @@ -655,29 
+659,37 @@ error: */ static int -v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode, +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, struct nameidata *nd) { int err = 0; char *name = NULL; gid_t gid; int flags; + mode_t mode; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL; struct p9_fid *dfid, *ofid; struct file *filp; struct p9_qid qid; struct inode *inode; + struct posix_acl *pacl = NULL, *dacl = NULL; v9ses = v9fs_inode2v9ses(dir); if (nd && nd->flags & LOOKUP_OPEN) flags = nd->intent.open.flags - 1; - else - flags = O_RDWR; + else { + /* + * create call without LOOKUP_OPEN is due + * to mknod of regular files. So use mknod + * operation. + */ + return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); + } name = (char *) dentry->d_name.name; P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " - "mode:0x%x\n", name, flags, mode); + "mode:0x%x\n", name, flags, omode); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { @@ -695,6 +707,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode, } gid = v9fs_get_fsgid_for_create(dir); + + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in creat %d\n", err); + goto error; + } err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, @@ -702,46 +723,52 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode, err); goto error; } + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE || + (nd && nd->flags & LOOKUP_OPEN)) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } - /* No need to populate the inode if we are not opening 
the file AND - * not in cached mode. - */ - if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) { - /* Not in cached mode. No need to populate inode with stat */ - dentry->d_op = &v9fs_dentry_operations; - p9_client_clunk(ofid); - d_instantiate(dentry, NULL); - return 0; - } - - /* Now walk from the parent so we can get an unopened fid. */ - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - fid = NULL; - goto error; - } - - /* instantiate inode and assign the unopened fid to dentry */ - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); - goto error; - } - if (v9ses->cache) + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } dentry->d_op = &v9fs_cached_dentry_operations; - else + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + /* The fid would get clunked via a dput */ + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate + * inode with stat. 
We need to get an inode + * so that we can set the acl with dentry + */ + inode = v9fs_get_inode(dir->i_sb, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } dentry->d_op = &v9fs_dentry_operations; - d_instantiate(dentry, inode); - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; + d_instantiate(dentry, inode); + } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, dacl, pacl); /* if we are opening a file, assign the open fid to the file */ if (nd && nd->flags & LOOKUP_OPEN) { - filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); + filp = lookup_instantiate_filp(nd, dentry, generic_file_open); if (IS_ERR(filp)) { p9_client_clunk(ofid); return PTR_ERR(filp); @@ -800,7 +827,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, /* if we are opening a file, assign the open fid to the file */ if (nd && nd->flags & LOOKUP_OPEN) { - filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); + filp = lookup_instantiate_filp(nd, dentry, generic_file_open); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto error; @@ -859,23 +886,28 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) * */ -static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry, - int mode) +static int v9fs_vfs_mkdir_dotl(struct inode *dir, + struct dentry *dentry, int omode) { int err; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; gid_t gid; char *name; + mode_t mode; struct inode *inode; struct p9_qid qid; struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; v9ses = v9fs_inode2v9ses(dir); - mode |= S_IFDIR; + omode |= S_IFDIR; + if (dir->i_mode & S_ISGID) + omode |= S_ISGID; + dir_dentry = v9fs_dentry_from_dir_inode(dir); dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { @@ -886,11 +918,14 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct 
dentry *dentry, } gid = v9fs_get_fsgid_for_create(dir); - if (gid < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n"); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in mkdir %d\n", err); goto error; } - name = (char *) dentry->d_name.name; err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); if (err < 0) @@ -920,7 +955,23 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry, if (err < 0) goto error; fid = NULL; + } else { + /* + * Not in cached mode. No need to populate + * inode with stat. We need to get an inode + * so that we can set the acl with dentry + */ + inode = v9fs_get_inode(dir->i_sb, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, dacl, pacl); + error: if (fid) p9_client_clunk(fid); @@ -979,7 +1030,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, result = v9fs_fid_add(dentry, fid); if (result < 0) - goto error; + goto error_iput; inst_out: if (v9ses->cache) @@ -990,6 +1041,8 @@ inst_out: d_add(dentry, inode); return NULL; +error_iput: + iput(inode); error: p9_client_clunk(fid); @@ -1237,7 +1290,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) * */ -static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) { int retval; struct v9fs_session_info *v9ses; @@ -1279,6 +1332,12 @@ static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) setattr_copy(dentry->d_inode, iattr); mark_inode_dirty(dentry->d_inode); + if (iattr->ia_valid & ATTR_MODE) { + /* We also want to update ACL when we update mode bits */ + retval = v9fs_acl_chmod(dentry); + if 
(retval < 0) + return retval; + } return 0; } @@ -1473,7 +1532,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) if (IS_ERR(fid)) return PTR_ERR(fid); - if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) + if (!v9fs_proto_dotu(v9ses)) return -EBADF; st = p9_client_stat(fid); @@ -1616,11 +1675,6 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, gid = v9fs_get_fsgid_for_create(dir); - if (gid < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid); - goto error; - } - /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); @@ -1855,21 +1909,23 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) * */ static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode, +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, dev_t rdev) { int err; char *name; + mode_t mode; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; gid_t gid; struct p9_qid qid; struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) return -EINVAL; @@ -1885,11 +1941,14 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode, } gid = v9fs_get_fsgid_for_create(dir); - if (gid < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n"); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in mknod %d\n", err); goto error; } - name = (char *) dentry->d_name.name; err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); @@ -1933,13 +1992,68 @@ v9fs_vfs_mknod_dotl(struct 
inode *dir, struct dentry *dentry, int mode, dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); } - + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, dacl, pacl); error: if (fid) p9_client_clunk(fid); return err; } +static int +v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen) +{ + int retval; + struct p9_fid *fid; + char *target = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); + retval = -EPERM; + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + retval = p9_client_readlink(fid, &target); + if (retval < 0) + return retval; + + strncpy(buffer, target, buflen); + P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer); + + retval = strnlen(buffer, buflen); + return retval; +} + +/** + * v9fs_vfs_follow_link_dotl - follow a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * + */ + +static void * +v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) +{ + int len = 0; + char *link = __getname(); + + P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name); + + if (!link) + link = ERR_PTR(-ENOMEM); + else { + len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX); + if (len < 0) { + __putname(link); + link = ERR_PTR(len); + } else + link[min(len, PATH_MAX-1)] = 0; + } + nd_set_link(nd, link); + + return NULL; +} + static const struct inode_operations v9fs_dir_inode_operations_dotu = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, @@ -1970,7 +2084,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotl = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, - + .check_acl = v9fs_check_acl, }; static const struct inode_operations v9fs_dir_inode_operations = { @@ -1997,6 +2111,7 @@ static const struct inode_operations v9fs_file_inode_operations_dotl = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, + 
.check_acl = v9fs_check_acl, }; static const struct inode_operations v9fs_symlink_inode_operations = { @@ -2008,8 +2123,8 @@ static const struct inode_operations v9fs_symlink_inode_operations = { }; static const struct inode_operations v9fs_symlink_inode_operations_dotl = { - .readlink = generic_readlink, - .follow_link = v9fs_vfs_follow_link, + .readlink = v9fs_vfs_readlink_dotl, + .follow_link = v9fs_vfs_follow_link_dotl, .put_link = v9fs_vfs_put_link, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 1d12ba0ed3db..c55c614500ad 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -39,6 +39,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/statfs.h> +#include <linux/magic.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -46,6 +47,7 @@ #include "v9fs_vfs.h" #include "fid.h" #include "xattr.h" +#include "acl.h" static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; @@ -66,7 +68,7 @@ static int v9fs_set_super(struct super_block *s, void *data) * v9fs_fill_super - populate superblock with info * @sb: superblock * @v9ses: session information - * @flags: flags propagated from v9fs_get_sb() + * @flags: flags propagated from v9fs_mount() * */ @@ -88,22 +90,25 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | MS_NOATIME; +#ifdef CONFIG_9P_FS_POSIX_ACL + if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT) + sb->s_flags |= MS_POSIXACL; +#endif + save_mount_options(sb, data); } /** - * v9fs_get_sb - mount a superblock + * v9fs_mount - mount a superblock * @fs_type: file system type * @flags: mount flags * @dev_name: device name that was mounted * @data: mount options - * @mnt: mountpoint record to be instantiated * */ -static int v9fs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, - struct vfsmount *mnt) +static 
struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { struct super_block *sb = NULL; struct inode *inode = NULL; @@ -117,7 +122,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) - return -ENOMEM; + return ERR_PTR(-ENOMEM); fid = v9fs_session_init(v9ses, dev_name, data); if (IS_ERR(fid)) { @@ -149,7 +154,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, goto release_sb; } sb->s_root = root; - if (v9fs_proto_dotl(v9ses)) { struct p9_stat_dotl *st = NULL; st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); @@ -174,19 +178,21 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, p9stat_free(st); kfree(st); } - + retval = v9fs_get_acl(inode, fid); + if (retval) + goto release_sb; v9fs_fid_add(root, fid); P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); clunk_fid: p9_client_clunk(fid); close_session: v9fs_session_close(v9ses); kfree(v9ses); - return retval; + return ERR_PTR(retval); + release_sb: /* * we will do the session_close and root dentry release @@ -196,7 +202,7 @@ release_sb: */ p9_client_clunk(fid); deactivate_locked_super(sb); - return retval; + return ERR_PTR(retval); } /** @@ -249,7 +255,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) if (v9fs_proto_dotl(v9ses)) { res = p9_client_statfs(fid, &rs); if (res == 0) { - buf->f_type = rs.type; + buf->f_type = V9FS_MAGIC; buf->f_bsize = rs.bsize; buf->f_blocks = rs.blocks; buf->f_bfree = rs.bfree; @@ -292,7 +298,7 @@ static const struct super_operations v9fs_super_ops_dotl = { struct file_system_type v9fs_fs_type = { .name = "9p", - .get_sb = v9fs_get_sb, + .mount = v9fs_mount, .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, .fs_flags = FS_RENAME_DOES_D_MOVE, diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 
f88e5c2dc873..43ec7df84336 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -21,30 +21,13 @@ #include "fid.h" #include "xattr.h" -/* - * v9fs_xattr_get() - * - * Copy an extended attribute into the buffer - * provided, or compute the buffer size required. - * Buffer is NULL to compute the size of the buffer required. - * - * Returns a negative error number on failure, or the number of bytes - * used / required on success. - */ -ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, - void *buffer, size_t buffer_size) +ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, + void *buffer, size_t buffer_size) { ssize_t retval; int msize, read_count; u64 offset = 0, attr_size; - struct p9_fid *fid, *attr_fid; - - P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", - __func__, name, buffer_size); - - fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) - return PTR_ERR(fid); + struct p9_fid *attr_fid; attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { @@ -88,6 +71,31 @@ error: } + +/* + * v9fs_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. 
+ */ +ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t buffer_size) +{ + struct p9_fid *fid; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", + __func__, name, buffer_size); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + return v9fs_fid_xattr_get(fid, name, buffer, buffer_size); +} + /* * v9fs_xattr_set() * @@ -156,5 +164,9 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) const struct xattr_handler *v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, +#ifdef CONFIG_9P_FS_POSIX_ACL + &v9fs_xattr_acl_access_handler, + &v9fs_xattr_acl_default_handler, +#endif NULL }; diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index 9ddf672ae5c4..eaa837c53bd5 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -15,10 +15,16 @@ #define FS_9P_XATTR_H #include <linux/xattr.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> extern const struct xattr_handler *v9fs_xattr_handlers[]; extern struct xattr_handler v9fs_xattr_user_handler; +extern const struct xattr_handler v9fs_xattr_acl_access_handler; +extern const struct xattr_handler v9fs_xattr_acl_default_handler; +extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *, + void *, size_t); extern ssize_t v9fs_xattr_get(struct dentry *, const char *, void *, size_t); extern int v9fs_xattr_set(struct dentry *, const char *, diff --git a/fs/Kconfig b/fs/Kconfig index b5e582bd769d..771f457402d4 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -53,7 +53,6 @@ config EXPORTFS config FILE_LOCKING bool "Enable POSIX file locking API" if EMBEDDED default y - select BKL # while lockd still uses it. 
help This option enables standard file locking support, required for filesystems like NFS and for the flock() system @@ -63,7 +62,6 @@ source "fs/notify/Kconfig" source "fs/quota/Kconfig" -source "fs/autofs/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" @@ -235,7 +233,6 @@ config NFS_COMMON default y source "net/sunrpc/Kconfig" -source "fs/smbfs/Kconfig" source "fs/ceph/Kconfig" source "fs/cifs/Kconfig" source "fs/ncpfs/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index bb4cc5b8abc8..79e2ca7973b7 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC config CORE_DUMP_DEFAULT_ELF_HEADERS bool "Write ELF core dumps with partial segments" - default n + default y depends on BINFMT_ELF && ELF_CORE help ELF core dump files describe each memory mapping of the crashed @@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS inherited. See Documentation/filesystems/proc.txt for details. This config option changes the default setting of coredump_filter - seen at boot time. If unsure, say N. + seen at boot time. If unsure, say Y. 
config BINFMT_FLAT bool "Kernel support for flat binaries" diff --git a/fs/Makefile b/fs/Makefile index 26956fcec917..a7f7cef0c0c8 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -88,7 +88,6 @@ obj-$(CONFIG_NFSD) += nfsd/ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ obj-$(CONFIG_SYSV_FS) += sysv/ -obj-$(CONFIG_SMB_FS) += smbfs/ obj-$(CONFIG_CIFS) += cifs/ obj-$(CONFIG_NCP_FS) += ncpfs/ obj-$(CONFIG_HPFS_FS) += hpfs/ @@ -101,7 +100,6 @@ obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ obj-$(CONFIG_QNX4FS_FS) += qnx4/ -obj-$(CONFIG_AUTOFS_FS) += autofs/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index d9803f73236f..959dbff2d42d 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -490,17 +490,16 @@ error: return -EINVAL; } -static int adfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *adfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super); } static struct file_system_type adfs_fs_type = { .owner = THIS_MODULE, .name = "adfs", - .get_sb = adfs_get_sb, + .mount = adfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/affs/super.c b/fs/affs/super.c index fa4fbe1e238a..0cf7f4384cbd 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -573,17 +573,16 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int affs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *affs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, 
affs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); } static struct file_system_type affs_fs_type = { .owner = THIS_MODULE, .name = "affs", - .get_sb = affs_get_sb, + .mount = affs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/afs/super.c b/fs/afs/super.c index eacf76d98ae0..27201cffece4 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -29,9 +29,8 @@ #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ static void afs_i_init_once(void *foo); -static int afs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, struct vfsmount *mnt); +static struct dentry *afs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data); static struct inode *afs_alloc_inode(struct super_block *sb); static void afs_put_super(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); @@ -40,7 +39,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); struct file_system_type afs_fs_type = { .owner = THIS_MODULE, .name = "afs", - .get_sb = afs_get_sb, + .mount = afs_mount, .kill_sb = kill_anon_super, .fs_flags = 0, }; @@ -359,11 +358,8 @@ error: /* * get an AFS superblock */ -static int afs_get_sb(struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *options, - struct vfsmount *mnt) +static struct dentry *afs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *options) { struct afs_mount_params params; struct super_block *sb; @@ -427,12 +423,11 @@ static int afs_get_sb(struct file_system_type *fs_type, ASSERTCMP(sb->s_flags, &, MS_ACTIVE); } - simple_set_mnt(mnt, sb); afs_put_volume(params.volume); afs_put_cell(params.cell); kfree(new_opts); _leave(" = 0 [%p]", sb); - return 0; + return dget(sb->s_root); error: afs_put_volume(params.volume); @@ -440,7 +435,7 @@ error: key_put(params.key); kfree(new_opts); _leave(" = %d", ret); - return ret; + return ERR_PTR(ret); 
} /* diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 5365527ca43f..57ce55b2564c 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -26,12 +26,10 @@ static struct vfsmount *anon_inode_mnt __read_mostly; static struct inode *anon_inode_inode; static const struct file_operations anon_inode_fops; -static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC, - mnt); + return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC); } /* @@ -45,7 +43,7 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) static struct file_system_type anon_inode_fs_type = { .name = "anon_inodefs", - .get_sb = anon_inodefs_get_sb, + .mount = anon_inodefs_mount, .kill_sb = kill_anon_super, }; static const struct dentry_operations anon_inodefs_dentry_operations = { diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig deleted file mode 100644 index 480e210c83ab..000000000000 --- a/fs/autofs/Kconfig +++ /dev/null @@ -1,22 +0,0 @@ -config AUTOFS_FS - tristate "Kernel automounter support" - depends on BKL # unfixable, just use autofs4 - help - The automounter is a tool to automatically mount remote file systems - on demand. This implementation is partially kernel-based to reduce - overhead in the already-mounted case; this is unlike the BSD - automounter (amd), which is a pure user space daemon. - - To use the automounter you need the user-space tools from the autofs - package; you can find the location in <file:Documentation/Changes>. - You also want to answer Y to "NFS file system support", below. - - If you want to use the newer version of the automounter with more - features, say N here and say Y to "Kernel automounter v4 support", - below. 
- - To compile this support as a module, choose M here: the module will be - called autofs. - - If you are not a part of a fairly large, distributed network, you - probably do not need an automounter, and can say N here. diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile deleted file mode 100644 index 453a60f46d05..000000000000 --- a/fs/autofs/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# -# Makefile for the linux autofs-filesystem routines. -# - -obj-$(CONFIG_AUTOFS_FS) += autofs.o - -autofs-objs := dirhash.o init.o inode.o root.o symlink.o waitq.o diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h deleted file mode 100644 index 901a3e67ec45..000000000000 --- a/fs/autofs/autofs_i.h +++ /dev/null @@ -1,165 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * linux/fs/autofs/autofs_i.h - * - * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. 
- * - * ----------------------------------------------------------------------- */ - -/* Internal header file for autofs */ - -#include <linux/auto_fs.h> - -/* This is the range of ioctl() numbers we claim as ours */ -#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY -#define AUTOFS_IOC_COUNT 32 - -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/string.h> -#include <linux/wait.h> -#include <linux/dcache.h> -#include <linux/namei.h> -#include <linux/mount.h> -#include <linux/sched.h> - -#include <asm/current.h> -#include <asm/uaccess.h> - -#ifdef DEBUG -#define DPRINTK(D) (printk D) -#else -#define DPRINTK(D) ((void)0) -#endif - -/* - * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the - * kernel will keep the negative response cached for up to the time given - * here, although the time can be shorter if the kernel throws the dcache - * entry away. This probably should be settable from user space. - */ -#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ) /* 1 minute */ - -/* Structures associated with the root directory hash table */ - -#define AUTOFS_HASH_SIZE 67 - -struct autofs_dir_ent { - int hash; - char *name; - int len; - ino_t ino; - struct dentry *dentry; - /* Linked list of entries */ - struct autofs_dir_ent *next; - struct autofs_dir_ent **back; - /* The following entries are for the expiry system */ - unsigned long last_usage; - struct list_head exp; -}; - -struct autofs_dirhash { - struct autofs_dir_ent *h[AUTOFS_HASH_SIZE]; - struct list_head expiry_head; -}; - -struct autofs_wait_queue { - wait_queue_head_t queue; - struct autofs_wait_queue *next; - autofs_wqt_t wait_queue_token; - /* We use the following to see what we are waiting for */ - int hash; - int len; - char *name; - /* This is for status reporting upon return */ - int status; - int wait_ctr; -}; - -struct autofs_symlink { - char *data; - int len; - time_t mtime; -}; - -#define AUTOFS_MAX_SYMLINKS 256 - -#define AUTOFS_ROOT_INO 1 -#define 
AUTOFS_FIRST_SYMLINK 2 -#define AUTOFS_FIRST_DIR_INO (AUTOFS_FIRST_SYMLINK+AUTOFS_MAX_SYMLINKS) - -#define AUTOFS_SYMLINK_BITMAP_LEN \ - ((AUTOFS_MAX_SYMLINKS+((sizeof(long)*1)-1))/(sizeof(long)*8)) - -#define AUTOFS_SBI_MAGIC 0x6d4a556d - -struct autofs_sb_info { - u32 magic; - struct file *pipe; - struct pid *oz_pgrp; - int catatonic; - struct super_block *sb; - unsigned long exp_timeout; - ino_t next_dir_ino; - struct autofs_wait_queue *queues; /* Wait queue pointer */ - struct autofs_dirhash dirhash; /* Root directory hash */ - struct autofs_symlink symlink[AUTOFS_MAX_SYMLINKS]; - unsigned long symlink_bitmap[AUTOFS_SYMLINK_BITMAP_LEN]; -}; - -static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) -{ - return (struct autofs_sb_info *)(sb->s_fs_info); -} - -/* autofs_oz_mode(): do we see the man behind the curtain? (The - processes which do manipulations for us in user space sees the raw - filesystem without "magic".) */ - -static inline int autofs_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; -} - -/* Hash operations */ - -void autofs_initialize_hash(struct autofs_dirhash *); -struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *,struct qstr *); -void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *); -void autofs_hash_delete(struct autofs_dir_ent *); -struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *); -void autofs_hash_dputall(struct autofs_dirhash *); -void autofs_hash_nuke(struct autofs_sb_info *); - -/* Expiration-handling functions */ - -void autofs_update_usage(struct autofs_dirhash *,struct autofs_dir_ent *); -struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info *, struct vfsmount *mnt); - -/* Operations structures */ - -extern const struct inode_operations autofs_root_inode_operations; -extern const struct inode_operations autofs_symlink_inode_operations; -extern const 
struct file_operations autofs_root_operations; - -/* Initializing function */ - -int autofs_fill_super(struct super_block *, void *, int); -void autofs_kill_sb(struct super_block *sb); -struct inode *autofs_iget(struct super_block *, unsigned long); - -/* Queue management functions */ - -int autofs_wait(struct autofs_sb_info *,struct qstr *); -int autofs_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); -void autofs_catatonic_mode(struct autofs_sb_info *); - -#ifdef DEBUG -void autofs_say(const char *name, int len); -#else -#define autofs_say(n,l) ((void)0) -#endif diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c deleted file mode 100644 index e947915109e5..000000000000 --- a/fs/autofs/dirhash.c +++ /dev/null @@ -1,250 +0,0 @@ -/* -*- linux-c -*- --------------------------------------------------------- * - * - * linux/fs/autofs/dirhash.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. 
- * - * ------------------------------------------------------------------------- */ - -#include "autofs_i.h" - -/* Functions for maintenance of expiry queue */ - -static void autofs_init_usage(struct autofs_dirhash *dh, - struct autofs_dir_ent *ent) -{ - list_add_tail(&ent->exp, &dh->expiry_head); - ent->last_usage = jiffies; -} - -static void autofs_delete_usage(struct autofs_dir_ent *ent) -{ - list_del(&ent->exp); -} - -void autofs_update_usage(struct autofs_dirhash *dh, - struct autofs_dir_ent *ent) -{ - autofs_delete_usage(ent); /* Unlink from current position */ - autofs_init_usage(dh,ent); /* Relink at queue tail */ -} - -struct autofs_dir_ent *autofs_expire(struct super_block *sb, - struct autofs_sb_info *sbi, - struct vfsmount *mnt) -{ - struct autofs_dirhash *dh = &sbi->dirhash; - struct autofs_dir_ent *ent; - unsigned long timeout = sbi->exp_timeout; - - while (1) { - struct path path; - int umount_ok; - - if ( list_empty(&dh->expiry_head) || sbi->catatonic ) - return NULL; /* No entries */ - /* We keep the list sorted by last_usage and want old stuff */ - ent = list_entry(dh->expiry_head.next, struct autofs_dir_ent, exp); - if (jiffies - ent->last_usage < timeout) - break; - /* Move to end of list in case expiry isn't desirable */ - autofs_update_usage(dh, ent); - - /* Check to see that entry is expirable */ - if ( ent->ino < AUTOFS_FIRST_DIR_INO ) - return ent; /* Symlinks are always expirable */ - - /* Get the dentry for the autofs subdirectory */ - path.dentry = ent->dentry; - - if (!path.dentry) { - /* Should only happen in catatonic mode */ - printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name); - autofs_delete_usage(ent); - continue; - } - - if (!path.dentry->d_inode) { - dput(path.dentry); - printk("autofs: negative dentry on expiry queue: %s\n", - ent->name); - autofs_delete_usage(ent); - continue; - } - - /* Make sure entry is mounted and unused; note that dentry will - point to the mounted-on-top root. 
*/ - if (!S_ISDIR(path.dentry->d_inode->i_mode) || - !d_mountpoint(path.dentry)) { - DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); - continue; - } - path.mnt = mnt; - path_get(&path); - if (!follow_down(&path)) { - path_put(&path); - DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); - continue; - } - while (d_mountpoint(path.dentry) && follow_down(&path)) - ; - umount_ok = may_umount(path.mnt); - path_put(&path); - - if (umount_ok) { - DPRINTK(("autofs: signaling expire on %s\n", ent->name)); - return ent; /* Expirable! */ - } - DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name)); - } - return NULL; /* No expirable entries */ -} - -void autofs_initialize_hash(struct autofs_dirhash *dh) { - memset(&dh->h, 0, AUTOFS_HASH_SIZE*sizeof(struct autofs_dir_ent *)); - INIT_LIST_HEAD(&dh->expiry_head); -} - -struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *dh, struct qstr *name) -{ - struct autofs_dir_ent *dhn; - - DPRINTK(("autofs_hash_lookup: hash = 0x%08x, name = ", name->hash)); - autofs_say(name->name,name->len); - - for ( dhn = dh->h[(unsigned) name->hash % AUTOFS_HASH_SIZE] ; dhn ; dhn = dhn->next ) { - if ( name->hash == dhn->hash && - name->len == dhn->len && - !memcmp(name->name, dhn->name, name->len) ) - break; - } - - return dhn; -} - -void autofs_hash_insert(struct autofs_dirhash *dh, struct autofs_dir_ent *ent) -{ - struct autofs_dir_ent **dhnp; - - DPRINTK(("autofs_hash_insert: hash = 0x%08x, name = ", ent->hash)); - autofs_say(ent->name,ent->len); - - autofs_init_usage(dh,ent); - if (ent->dentry) - dget(ent->dentry); - - dhnp = &dh->h[(unsigned) ent->hash % AUTOFS_HASH_SIZE]; - ent->next = *dhnp; - ent->back = dhnp; - *dhnp = ent; - if ( ent->next ) - ent->next->back = &(ent->next); -} - -void autofs_hash_delete(struct autofs_dir_ent *ent) -{ - *(ent->back) = ent->next; - if ( ent->next ) - ent->next->back = ent->back; - - autofs_delete_usage(ent); - - if ( 
ent->dentry ) - dput(ent->dentry); - kfree(ent->name); - kfree(ent); -} - -/* - * Used by readdir(). We must validate "ptr", so we can't simply make it - * a pointer. Values below 0xffff are reserved; calling with any value - * <= 0x10000 will return the first entry found. - * - * "last" can be NULL or the value returned by the last search *if* we - * want the next sequential entry. - */ -struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *dh, - off_t *ptr, struct autofs_dir_ent *last) -{ - int bucket, ecount, i; - struct autofs_dir_ent *ent; - - bucket = (*ptr >> 16) - 1; - ecount = *ptr & 0xffff; - - if ( bucket < 0 ) { - bucket = ecount = 0; - } - - DPRINTK(("autofs_hash_enum: bucket %d, entry %d\n", bucket, ecount)); - - ent = last ? last->next : NULL; - - if ( ent ) { - ecount++; - } else { - while ( bucket < AUTOFS_HASH_SIZE ) { - ent = dh->h[bucket]; - for ( i = ecount ; ent && i ; i-- ) - ent = ent->next; - - if (ent) { - ecount++; /* Point to *next* entry */ - break; - } - - bucket++; ecount = 0; - } - } - -#ifdef DEBUG - if ( !ent ) - printk("autofs_hash_enum: nothing found\n"); - else { - printk("autofs_hash_enum: found hash %08x, name", ent->hash); - autofs_say(ent->name,ent->len); - } -#endif - - *ptr = ((bucket+1) << 16) + ecount; - return ent; -} - -/* Iterate over all the ents, and remove all dentry pointers. Used on - entering catatonic mode, in order to make the filesystem unmountable. */ -void autofs_hash_dputall(struct autofs_dirhash *dh) -{ - int i; - struct autofs_dir_ent *ent; - - for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) { - for ( ent = dh->h[i] ; ent ; ent = ent->next ) { - if ( ent->dentry ) { - dput(ent->dentry); - ent->dentry = NULL; - } - } - } -} - -/* Delete everything. 
This is used on filesystem destruction, so we - make no attempt to keep the pointers valid */ -void autofs_hash_nuke(struct autofs_sb_info *sbi) -{ - int i; - struct autofs_dir_ent *ent, *nent; - - for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) { - for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) { - nent = ent->next; - if ( ent->dentry ) - dput(ent->dentry); - kfree(ent->name); - kfree(ent); - } - } -} diff --git a/fs/autofs/init.c b/fs/autofs/init.c deleted file mode 100644 index cea5219b4f37..000000000000 --- a/fs/autofs/init.c +++ /dev/null @@ -1,52 +0,0 @@ -/* -*- linux-c -*- --------------------------------------------------------- * - * - * linux/fs/autofs/init.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ - -#include <linux/module.h> -#include <linux/init.h> -#include "autofs_i.h" - -static int autofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt); -} - -static struct file_system_type autofs_fs_type = { - .owner = THIS_MODULE, - .name = "autofs", - .get_sb = autofs_get_sb, - .kill_sb = autofs_kill_sb, -}; - -static int __init init_autofs_fs(void) -{ - return register_filesystem(&autofs_fs_type); -} - -static void __exit exit_autofs_fs(void) -{ - unregister_filesystem(&autofs_fs_type); -} - -module_init(init_autofs_fs); -module_exit(exit_autofs_fs); - -#ifdef DEBUG -void autofs_say(const char *name, int len) -{ - printk("(%d: ", len); - while ( len-- ) - printk("%c", *name++); - printk(")\n"); -} -#endif -MODULE_LICENSE("GPL"); diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c deleted file mode 100644 index 
e1734f2d6e26..000000000000 --- a/fs/autofs/inode.c +++ /dev/null @@ -1,288 +0,0 @@ -/* -*- linux-c -*- --------------------------------------------------------- * - * - * linux/fs/autofs/inode.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ - -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/file.h> -#include <linux/parser.h> -#include <linux/bitops.h> -#include <linux/magic.h> -#include "autofs_i.h" -#include <linux/module.h> - -void autofs_kill_sb(struct super_block *sb) -{ - struct autofs_sb_info *sbi = autofs_sbi(sb); - unsigned int n; - - /* - * In the event of a failure in get_sb_nodev the superblock - * info is not present so nothing else has been setup, so - * just call kill_anon_super when we are called from - * deactivate_super. 
- */ - if (!sbi) - goto out_kill_sb; - - if (!sbi->catatonic) - autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */ - - put_pid(sbi->oz_pgrp); - - autofs_hash_nuke(sbi); - for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) { - if (test_bit(n, sbi->symlink_bitmap)) - kfree(sbi->symlink[n].data); - } - - kfree(sb->s_fs_info); - -out_kill_sb: - DPRINTK(("autofs: shutting down\n")); - kill_anon_super(sb); -} - -static const struct super_operations autofs_sops = { - .statfs = simple_statfs, - .show_options = generic_show_options, -}; - -enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto}; - -static const match_table_t autofs_tokens = { - {Opt_fd, "fd=%u"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_pgrp, "pgrp=%u"}, - {Opt_minproto, "minproto=%u"}, - {Opt_maxproto, "maxproto=%u"}, - {Opt_err, NULL} -}; - -static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, - pid_t *pgrp, int *minproto, int *maxproto) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - *uid = current_uid(); - *gid = current_gid(); - *pgrp = task_pgrp_nr(current); - - *minproto = *maxproto = AUTOFS_PROTO_VERSION; - - *pipefd = -1; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, autofs_tokens, args); - switch (token) { - case Opt_fd: - if (match_int(&args[0], &option)) - return 1; - *pipefd = option; - break; - case Opt_uid: - if (match_int(&args[0], &option)) - return 1; - *uid = option; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return 1; - *gid = option; - break; - case Opt_pgrp: - if (match_int(&args[0], &option)) - return 1; - *pgrp = option; - break; - case Opt_minproto: - if (match_int(&args[0], &option)) - return 1; - *minproto = option; - break; - case Opt_maxproto: - if (match_int(&args[0], &option)) - return 1; - *maxproto = option; - break; - default: - return 1; - } - } - return (*pipefd < 0); -} - -int 
autofs_fill_super(struct super_block *s, void *data, int silent) -{ - struct inode * root_inode; - struct dentry * root; - struct file * pipe; - int pipefd; - struct autofs_sb_info *sbi; - int minproto, maxproto; - pid_t pgid; - - save_mount_options(s, data); - - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto fail_unlock; - DPRINTK(("autofs: starting up, sbi = %p\n",sbi)); - - s->s_fs_info = sbi; - sbi->magic = AUTOFS_SBI_MAGIC; - sbi->pipe = NULL; - sbi->catatonic = 1; - sbi->exp_timeout = 0; - autofs_initialize_hash(&sbi->dirhash); - sbi->queues = NULL; - memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN); - sbi->next_dir_ino = AUTOFS_FIRST_DIR_INO; - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; - s->s_magic = AUTOFS_SUPER_MAGIC; - s->s_op = &autofs_sops; - s->s_time_gran = 1; - sbi->sb = s; - - root_inode = autofs_iget(s, AUTOFS_ROOT_INO); - if (IS_ERR(root_inode)) - goto fail_free; - root = d_alloc_root(root_inode); - pipe = NULL; - - if (!root) - goto fail_iput; - - /* Can this call block? - WTF cares? s is locked. */ - if (parse_options(data, &pipefd, &root_inode->i_uid, - &root_inode->i_gid, &pgid, &minproto, - &maxproto)) { - printk("autofs: called with bogus options\n"); - goto fail_dput; - } - - /* Couldn't this be tested earlier? */ - if (minproto > AUTOFS_PROTO_VERSION || - maxproto < AUTOFS_PROTO_VERSION) { - printk("autofs: kernel does not match daemon version\n"); - goto fail_dput; - } - - DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid)); - sbi->oz_pgrp = find_get_pid(pgid); - - if (!sbi->oz_pgrp) { - printk("autofs: could not find process group %d\n", pgid); - goto fail_dput; - } - - pipe = fget(pipefd); - - if (!pipe) { - printk("autofs: could not open pipe file descriptor\n"); - goto fail_put_pid; - } - - if (!pipe->f_op || !pipe->f_op->write) - goto fail_fput; - sbi->pipe = pipe; - sbi->catatonic = 0; - - /* - * Success! Install the root dentry now to indicate completion. 
- */ - s->s_root = root; - return 0; - -fail_fput: - printk("autofs: pipe file descriptor does not contain proper ops\n"); - fput(pipe); -fail_put_pid: - put_pid(sbi->oz_pgrp); -fail_dput: - dput(root); - goto fail_free; -fail_iput: - printk("autofs: get root dentry failed\n"); - iput(root_inode); -fail_free: - kfree(sbi); - s->s_fs_info = NULL; -fail_unlock: - return -EINVAL; -} - -struct inode *autofs_iget(struct super_block *sb, unsigned long ino) -{ - unsigned int n; - struct autofs_sb_info *sbi = autofs_sbi(sb); - struct inode *inode; - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; - - /* Initialize to the default case (stub directory) */ - - inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - inode->i_nlink = 2; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - - if (ino == AUTOFS_ROOT_INO) { - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; - inode->i_op = &autofs_root_inode_operations; - inode->i_fop = &autofs_root_operations; - goto done; - } - - inode->i_uid = inode->i_sb->s_root->d_inode->i_uid; - inode->i_gid = inode->i_sb->s_root->d_inode->i_gid; - - if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) { - /* Symlink inode - should be in symlink list */ - struct autofs_symlink *sl; - - n = ino - AUTOFS_FIRST_SYMLINK; - if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) { - printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino); - goto done; - } - - inode->i_op = &autofs_symlink_inode_operations; - sl = &sbi->symlink[n]; - inode->i_private = sl; - inode->i_mode = S_IFLNK | S_IRWXUGO; - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; - inode->i_size = sl->len; - inode->i_nlink = 1; - } - -done: - unlock_new_inode(inode); - return inode; -} diff --git 
a/fs/autofs/root.c b/fs/autofs/root.c deleted file mode 100644 index 0c4ca81aeaeb..000000000000 --- a/fs/autofs/root.c +++ /dev/null @@ -1,645 +0,0 @@ -/* -*- linux-c -*- --------------------------------------------------------- * - * - * linux/fs/autofs/root.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ - -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/stat.h> -#include <linux/slab.h> -#include <linux/param.h> -#include <linux/time.h> -#include <linux/compat.h> -#include <linux/smp_lock.h> -#include "autofs_i.h" - -static int autofs_root_readdir(struct file *,void *,filldir_t); -static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *); -static int autofs_root_symlink(struct inode *,struct dentry *,const char *); -static int autofs_root_unlink(struct inode *,struct dentry *); -static int autofs_root_rmdir(struct inode *,struct dentry *); -static int autofs_root_mkdir(struct inode *,struct dentry *,int); -static long autofs_root_ioctl(struct file *,unsigned int,unsigned long); -#ifdef CONFIG_COMPAT -static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long); -#endif - -const struct file_operations autofs_root_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = autofs_root_readdir, - .unlocked_ioctl = autofs_root_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = autofs_root_compat_ioctl, -#endif -}; - -const struct inode_operations autofs_root_inode_operations = { - .lookup = autofs_root_lookup, - .unlink = autofs_root_unlink, - .symlink = autofs_root_symlink, - .mkdir = autofs_root_mkdir, - .rmdir = autofs_root_rmdir, -}; - -static 
int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct autofs_dir_ent *ent = NULL; - struct autofs_dirhash *dirhash; - struct autofs_sb_info *sbi; - struct inode * inode = filp->f_path.dentry->d_inode; - off_t onr, nr; - - lock_kernel(); - - sbi = autofs_sbi(inode->i_sb); - dirhash = &sbi->dirhash; - nr = filp->f_pos; - - switch(nr) - { - case 0: - if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos = ++nr; - /* fall through */ - case 1: - if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos = ++nr; - /* fall through */ - default: - while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) { - if (!ent->dentry || d_mountpoint(ent->dentry)) { - if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0) - goto out; - filp->f_pos = nr; - } - } - break; - } - -out: - unlock_kernel(); - return 0; -} - -static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, struct autofs_sb_info *sbi) -{ - struct inode * inode; - struct autofs_dir_ent *ent; - int status = 0; - - if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) { - do { - if (status && dentry->d_inode) { - if (status != -ENOENT) - printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name); - return 0; /* Try to get the kernel to invalidate this dentry */ - } - - /* Turn this into a real negative dentry? 
*/ - if (status == -ENOENT) { - dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT; - dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; - return 1; - } else if (status) { - /* Return a negative dentry, but leave it "pending" */ - return 1; - } - status = autofs_wait(sbi, &dentry->d_name); - } while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))); - } - - /* Abuse this field as a pointer to the directory entry, used to - find the expire list pointers */ - dentry->d_time = (unsigned long) ent; - - if (!dentry->d_inode) { - inode = autofs_iget(sb, ent->ino); - if (IS_ERR(inode)) { - /* Failed, but leave pending for next time */ - return 1; - } - dentry->d_inode = inode; - } - - /* If this is a directory that isn't a mount point, bitch at the - daemon and fix it in user space */ - if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) { - return !autofs_wait(sbi, &dentry->d_name); - } - - /* We don't update the usages for the autofs daemon itself, this - is necessary for recursive autofs mounts */ - if (!autofs_oz_mode(sbi)) { - autofs_update_usage(&sbi->dirhash,ent); - } - - dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; - return 1; -} - - -/* - * Revalidate is called on every cache lookup. Some of those - * cache lookups may actually happen while the dentry is not - * yet completely filled in, and revalidate has to delay such - * lookups.. - */ -static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd) -{ - struct inode * dir; - struct autofs_sb_info *sbi; - struct autofs_dir_ent *ent; - int res; - - lock_kernel(); - dir = dentry->d_parent->d_inode; - sbi = autofs_sbi(dir->i_sb); - - /* Pending dentry */ - if (dentry->d_flags & DCACHE_AUTOFS_PENDING) { - if (autofs_oz_mode(sbi)) - res = 1; - else - res = try_to_fill_dentry(dentry, dir->i_sb, sbi); - unlock_kernel(); - return res; - } - - /* Negative dentry.. 
invalidate if "old" */ - if (!dentry->d_inode) { - unlock_kernel(); - return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT); - } - - /* Check for a non-mountpoint directory */ - if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) { - if (autofs_oz_mode(sbi)) - res = 1; - else - res = try_to_fill_dentry(dentry, dir->i_sb, sbi); - unlock_kernel(); - return res; - } - - /* Update the usage list */ - if (!autofs_oz_mode(sbi)) { - ent = (struct autofs_dir_ent *) dentry->d_time; - if (ent) - autofs_update_usage(&sbi->dirhash,ent); - } - unlock_kernel(); - return 1; -} - -static const struct dentry_operations autofs_dentry_operations = { - .d_revalidate = autofs_revalidate, -}; - -static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - struct autofs_sb_info *sbi; - int oz_mode; - - DPRINTK(("autofs_root_lookup: name = ")); - lock_kernel(); - autofs_say(dentry->d_name.name,dentry->d_name.len); - - if (dentry->d_name.len > NAME_MAX) { - unlock_kernel(); - return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */ - } - - sbi = autofs_sbi(dir->i_sb); - - oz_mode = autofs_oz_mode(sbi); - DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, " - "oz_mode = %d\n", task_pid_nr(current), - task_pgrp_nr(current), sbi->catatonic, - oz_mode)); - - /* - * Mark the dentry incomplete, but add it. This is needed so - * that the VFS layer knows about the dentry, and we can count - * on catching any lookups through the revalidate. - * - * Let all the hard work be done by the revalidate function that - * needs to be able to do this anyway.. - * - * We need to do this before we release the directory semaphore. - */ - dentry->d_op = &autofs_dentry_operations; - dentry->d_flags |= DCACHE_AUTOFS_PENDING; - d_add(dentry, NULL); - - mutex_unlock(&dir->i_mutex); - autofs_revalidate(dentry, nd); - mutex_lock(&dir->i_mutex); - - /* - * If we are still pending, check if we had to handle - * a signal. 
If so we can force a restart.. - */ - if (dentry->d_flags & DCACHE_AUTOFS_PENDING) { - /* See if we were interrupted */ - if (signal_pending(current)) { - sigset_t *sigset = &current->pending.signal; - if (sigismember (sigset, SIGKILL) || - sigismember (sigset, SIGQUIT) || - sigismember (sigset, SIGINT)) { - unlock_kernel(); - return ERR_PTR(-ERESTARTNOINTR); - } - } - } - unlock_kernel(); - - /* - * If this dentry is unhashed, then we shouldn't honour this - * lookup even if the dentry is positive. Returning ENOENT here - * doesn't do the right thing for all system calls, but it should - * be OK for the operations we permit from an autofs. - */ - if (dentry->d_inode && d_unhashed(dentry)) - return ERR_PTR(-ENOENT); - - return NULL; -} - -static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const char *symname) -{ - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); - struct autofs_dirhash *dh = &sbi->dirhash; - struct autofs_dir_ent *ent; - unsigned int n; - int slsize; - struct autofs_symlink *sl; - struct inode *inode; - - DPRINTK(("autofs_root_symlink: %s <- ", symname)); - autofs_say(dentry->d_name.name,dentry->d_name.len); - - lock_kernel(); - if (!autofs_oz_mode(sbi)) { - unlock_kernel(); - return -EACCES; - } - - if (autofs_hash_lookup(dh, &dentry->d_name)) { - unlock_kernel(); - return -EEXIST; - } - - n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS); - if (n >= AUTOFS_MAX_SYMLINKS) { - unlock_kernel(); - return -ENOSPC; - } - - set_bit(n,sbi->symlink_bitmap); - sl = &sbi->symlink[n]; - sl->len = strlen(symname); - sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL); - if (!sl->data) { - clear_bit(n,sbi->symlink_bitmap); - unlock_kernel(); - return -ENOSPC; - } - - ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL); - if (!ent) { - kfree(sl->data); - clear_bit(n,sbi->symlink_bitmap); - unlock_kernel(); - return -ENOSPC; - } - - ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL); - if (!ent->name) { - kfree(sl->data);
- kfree(ent); - clear_bit(n,sbi->symlink_bitmap); - unlock_kernel(); - return -ENOSPC; - } - - memcpy(sl->data,symname,slsize); - sl->mtime = get_seconds(); - - ent->ino = AUTOFS_FIRST_SYMLINK + n; - ent->hash = dentry->d_name.hash; - memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len)); - ent->dentry = NULL; /* We don't keep the dentry for symlinks */ - - autofs_hash_insert(dh,ent); - - inode = autofs_iget(dir->i_sb, ent->ino); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - d_instantiate(dentry, inode); - unlock_kernel(); - return 0; -} - -/* - * NOTE! - * - * Normal filesystems would do a "d_delete()" to tell the VFS dcache - * that the file no longer exists. However, doing that means that the - * VFS layer can turn the dentry into a negative dentry, which we - * obviously do not want (we're dropping the entry not because it - * doesn't exist, but because it has timed out). - * - * Also see autofs_root_rmdir().. - */ -static int autofs_root_unlink(struct inode *dir, struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); - struct autofs_dirhash *dh = &sbi->dirhash; - struct autofs_dir_ent *ent; - unsigned int n; - - /* This allows root to remove symlinks */ - lock_kernel(); - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) { - unlock_kernel(); - return -EACCES; - } - - ent = autofs_hash_lookup(dh, &dentry->d_name); - if (!ent) { - unlock_kernel(); - return -ENOENT; - } - - n = ent->ino - AUTOFS_FIRST_SYMLINK; - if (n >= AUTOFS_MAX_SYMLINKS) { - unlock_kernel(); - return -EISDIR; /* It's a directory, dummy */ - } - if (!test_bit(n,sbi->symlink_bitmap)) { - unlock_kernel(); - return -EINVAL; /* Nonexistent symlink? 
Shouldn't happen */ - } - - dentry->d_time = (unsigned long)(struct autofs_dirhash *)NULL; - autofs_hash_delete(ent); - clear_bit(n,sbi->symlink_bitmap); - kfree(sbi->symlink[n].data); - d_drop(dentry); - - unlock_kernel(); - return 0; -} - -static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); - struct autofs_dirhash *dh = &sbi->dirhash; - struct autofs_dir_ent *ent; - - lock_kernel(); - if (!autofs_oz_mode(sbi)) { - unlock_kernel(); - return -EACCES; - } - - ent = autofs_hash_lookup(dh, &dentry->d_name); - if (!ent) { - unlock_kernel(); - return -ENOENT; - } - - if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) { - unlock_kernel(); - return -ENOTDIR; /* Not a directory */ - } - - if (ent->dentry != dentry) { - printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name); - } - - dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL; - autofs_hash_delete(ent); - drop_nlink(dir); - d_drop(dentry); - unlock_kernel(); - - return 0; -} - -static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); - struct autofs_dirhash *dh = &sbi->dirhash; - struct autofs_dir_ent *ent; - struct inode *inode; - ino_t ino; - - lock_kernel(); - if (!autofs_oz_mode(sbi)) { - unlock_kernel(); - return -EACCES; - } - - ent = autofs_hash_lookup(dh, &dentry->d_name); - if (ent) { - unlock_kernel(); - return -EEXIST; - } - - if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) { - printk("autofs: Out of inode numbers -- what the heck did you do??\n"); - unlock_kernel(); - return -ENOSPC; - } - ino = sbi->next_dir_ino++; - - ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL); - if (!ent) { - unlock_kernel(); - return -ENOSPC; - } - - ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL); - if (!ent->name) { - kfree(ent); - unlock_kernel(); - return -ENOSPC; - } - - ent->hash = dentry->d_name.hash; - memcpy(ent->name, 
dentry->d_name.name, 1+(ent->len = dentry->d_name.len)); - ent->ino = ino; - ent->dentry = dentry; - autofs_hash_insert(dh,ent); - - inc_nlink(dir); - - inode = autofs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) { - drop_nlink(dir); - return PTR_ERR(inode); - } - - d_instantiate(dentry, inode); - unlock_kernel(); - - return 0; -} - -/* Get/set timeout ioctl() operation */ -#ifdef CONFIG_COMPAT -static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi, - unsigned int __user *p) -{ - unsigned long ntimeout; - - if (get_user(ntimeout, p) || - put_user(sbi->exp_timeout / HZ, p)) - return -EFAULT; - - if (ntimeout > UINT_MAX/HZ) - sbi->exp_timeout = 0; - else - sbi->exp_timeout = ntimeout * HZ; - - return 0; -} -#endif - -static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, - unsigned long __user *p) -{ - unsigned long ntimeout; - - if (get_user(ntimeout, p) || - put_user(sbi->exp_timeout / HZ, p)) - return -EFAULT; - - if (ntimeout > ULONG_MAX/HZ) - sbi->exp_timeout = 0; - else - sbi->exp_timeout = ntimeout * HZ; - - return 0; -} - -/* Return protocol version */ -static inline int autofs_get_protover(int __user *p) -{ - return put_user(AUTOFS_PROTO_VERSION, p); -} - -/* Perform an expiry operation */ -static inline int autofs_expire_run(struct super_block *sb, - struct autofs_sb_info *sbi, - struct vfsmount *mnt, - struct autofs_packet_expire __user *pkt_p) -{ - struct autofs_dir_ent *ent; - struct autofs_packet_expire pkt; - - memset(&pkt,0,sizeof pkt); - - pkt.hdr.proto_version = AUTOFS_PROTO_VERSION; - pkt.hdr.type = autofs_ptype_expire; - - if (!sbi->exp_timeout || !(ent = autofs_expire(sb,sbi,mnt))) - return -EAGAIN; - - pkt.len = ent->len; - memcpy(pkt.name, ent->name, pkt.len); - pkt.name[pkt.len] = '\0'; - - if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) - return -EFAULT; - - return 0; -} - -/* - * ioctl()'s on the root directory is the chief method for the daemon to - * generate kernel reactions - */ 
-static int autofs_do_root_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); - void __user *argp = (void __user *)arg; - - DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,task_pgrp_nr(current))); - - if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || - _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) - return -ENOTTY; - - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch(cmd) { - case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ - return autofs_wait_release(sbi,(autofs_wqt_t)arg,0); - case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ - return autofs_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT); - case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ - autofs_catatonic_mode(sbi); - return 0; - case AUTOFS_IOC_PROTOVER: /* Get protocol version */ - return autofs_get_protover(argp); -#ifdef CONFIG_COMPAT - case AUTOFS_IOC_SETTIMEOUT32: - return autofs_compat_get_set_timeout(sbi, argp); -#endif - case AUTOFS_IOC_SETTIMEOUT: - return autofs_get_set_timeout(sbi, argp); - case AUTOFS_IOC_EXPIRE: - return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt, - argp); - default: - return -ENOSYS; - } - -} - -static long autofs_root_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg) -{ - int ret; - - lock_kernel(); - ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode, - filp, cmd, arg); - unlock_kernel(); - - return ret; -} - -#ifdef CONFIG_COMPAT -static long autofs_root_compat_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - int ret; - - lock_kernel(); - if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) - ret = autofs_do_root_ioctl(inode, filp, cmd, arg); - else - ret = autofs_do_root_ioctl(inode, filp, cmd, - (unsigned long)compat_ptr(arg)); - unlock_kernel(); - - 
return ret; -} -#endif diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c deleted file mode 100644 index 7ce9cb2c9ce2..000000000000 --- a/fs/autofs/symlink.c +++ /dev/null @@ -1,26 +0,0 @@ -/* -*- linux-c -*- --------------------------------------------------------- * - * - * linux/fs/autofs/symlink.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ - -#include "autofs_i.h" - -/* Nothing to release.. */ -static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data; - nd_set_link(nd, s); - return NULL; -} - -const struct inode_operations autofs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = autofs_follow_link -}; diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c deleted file mode 100644 index be46805972f0..000000000000 --- a/fs/autofs/waitq.c +++ /dev/null @@ -1,205 +0,0 @@ -/* -*- linux-c -*- --------------------------------------------------------- * - * - * linux/fs/autofs/waitq.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. 
- * - * ------------------------------------------------------------------------- */ - -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/signal.h> -#include <linux/file.h> -#include "autofs_i.h" - -/* We make this a static variable rather than a part of the superblock; it - is better if we don't reassign numbers easily even across filesystems */ -static autofs_wqt_t autofs_next_wait_queue = 1; - -/* These are the signals we allow interrupting a pending mount */ -#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT)) - -void autofs_catatonic_mode(struct autofs_sb_info *sbi) -{ - struct autofs_wait_queue *wq, *nwq; - - DPRINTK(("autofs: entering catatonic mode\n")); - - sbi->catatonic = 1; - wq = sbi->queues; - sbi->queues = NULL; /* Erase all wait queues */ - while ( wq ) { - nwq = wq->next; - wq->status = -ENOENT; /* Magic is gone - report failure */ - kfree(wq->name); - wq->name = NULL; - wake_up(&wq->queue); - wq = nwq; - } - fput(sbi->pipe); /* Close the pipe */ - sbi->pipe = NULL; - autofs_hash_dputall(&sbi->dirhash); /* Remove all dentry pointers */ -} - -static int autofs_write(struct file *file, const void *addr, int bytes) -{ - unsigned long sigpipe, flags; - mm_segment_t fs; - const char *data = (const char *)addr; - ssize_t wr = 0; - - /** WARNING: this is not safe for writing more than PIPE_BUF bytes! 
**/ - - sigpipe = sigismember(&current->pending.signal, SIGPIPE); - - /* Save pointer to user space and point back to kernel space */ - fs = get_fs(); - set_fs(KERNEL_DS); - - while (bytes && - (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) { - data += wr; - bytes -= wr; - } - - set_fs(fs); - - /* Keep the currently executing process from receiving a - SIGPIPE unless it was already supposed to get one */ - if (wr == -EPIPE && !sigpipe) { - spin_lock_irqsave(&current->sighand->siglock, flags); - sigdelset(&current->pending.signal, SIGPIPE); - recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, flags); - } - - return (bytes > 0); -} - -static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq) -{ - struct autofs_packet_missing pkt; - - DPRINTK(("autofs_wait: wait id = 0x%08lx, name = ", wq->wait_queue_token)); - autofs_say(wq->name,wq->len); - - memset(&pkt,0,sizeof pkt); /* For security reasons */ - - pkt.hdr.proto_version = AUTOFS_PROTO_VERSION; - pkt.hdr.type = autofs_ptype_missing; - pkt.wait_queue_token = wq->wait_queue_token; - pkt.len = wq->len; - memcpy(pkt.name, wq->name, pkt.len); - pkt.name[pkt.len] = '\0'; - - if ( autofs_write(sbi->pipe,&pkt,sizeof(struct autofs_packet_missing)) ) - autofs_catatonic_mode(sbi); -} - -int autofs_wait(struct autofs_sb_info *sbi, struct qstr *name) -{ - struct autofs_wait_queue *wq; - int status; - - /* In catatonic mode, we don't wait for nobody */ - if ( sbi->catatonic ) - return -ENOENT; - - /* We shouldn't be able to get here, but just in case */ - if ( name->len > NAME_MAX ) - return -ENOENT; - - for ( wq = sbi->queues ; wq ; wq = wq->next ) { - if ( wq->hash == name->hash && - wq->len == name->len && - wq->name && !memcmp(wq->name,name->name,name->len) ) - break; - } - - if ( !wq ) { - /* Create a new wait queue */ - wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); - if ( !wq ) - return -ENOMEM; - - wq->name = kmalloc(name->len,GFP_KERNEL); - if ( !wq->name ) { -
kfree(wq); - return -ENOMEM; - } - wq->wait_queue_token = autofs_next_wait_queue++; - init_waitqueue_head(&wq->queue); - wq->hash = name->hash; - wq->len = name->len; - wq->status = -EINTR; /* Status return if interrupted */ - memcpy(wq->name, name->name, name->len); - wq->next = sbi->queues; - sbi->queues = wq; - - /* autofs_notify_daemon() may block */ - wq->wait_ctr = 2; - autofs_notify_daemon(sbi,wq); - } else - wq->wait_ctr++; - - /* wq->name is NULL if and only if the lock is already released */ - - if ( sbi->catatonic ) { - /* We might have slept, so check again for catatonic mode */ - wq->status = -ENOENT; - kfree(wq->name); - wq->name = NULL; - } - - if ( wq->name ) { - /* Block all but "shutdown" signals while waiting */ - sigset_t sigmask; - - siginitsetinv(&sigmask, SHUTDOWN_SIGS); - sigprocmask(SIG_BLOCK, &sigmask, &sigmask); - - interruptible_sleep_on(&wq->queue); - - sigprocmask(SIG_SETMASK, &sigmask, NULL); - } else { - DPRINTK(("autofs_wait: skipped sleeping\n")); - } - - status = wq->status; - - if ( ! --wq->wait_ctr ) /* Are we the last process to need status? */ - kfree(wq); - - return status; -} - - -int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) -{ - struct autofs_wait_queue *wq, **wql; - - for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { - if ( wq->wait_queue_token == wait_queue_token ) - break; - } - if ( !wq ) - return -EINVAL; - - *wql = wq->next; /* Unlink from chain */ - kfree(wq->name); - wq->name = NULL; /* Do not wait on this queue */ - - wq->status = status; - - if ( ! --wq->wait_ctr ) /* Is anyone still waiting for this guy? 
*/ - kfree(wq); - else - wake_up(&wq->queue); - - return 0; -} - diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index 9722e4bd8957..c038727b4050 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -14,16 +14,16 @@ #include <linux/init.h> #include "autofs_i.h" -static int autofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *autofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt); + return mount_nodev(fs_type, flags, data, autofs4_fill_super); } static struct file_system_type autofs_fs_type = { .owner = THIS_MODULE, .name = "autofs", - .get_sb = autofs_get_sb, + .mount = autofs_mount, .kill_sb = autofs4_kill_sb, }; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index dc39d2824885..aa4e7c7ae3c6 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -913,18 +913,17 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int -befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) +static struct dentry * +befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super); } static struct file_system_type befs_fs_type = { .owner = THIS_MODULE, .name = "befs", - .get_sb = befs_get_sb, + .mount = befs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 883e77acd5a8..76db6d7d49bb 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -450,16 +450,16 @@ out: return ret; } -static int bfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *bfs_mount(struct 
file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super); } static struct file_system_type bfs_fs_type = { .owner = THIS_MODULE, .name = "bfs", - .get_sb = bfs_get_sb, + .mount = bfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 29990f0eee0c..1befe2ec8186 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -706,10 +706,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent) return err; } -static int bm_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *bm_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, bm_fill_super, mnt); + return mount_single(fs_type, flags, data, bm_fill_super); } static struct linux_binfmt misc_format = { @@ -720,7 +720,7 @@ static struct linux_binfmt misc_format = { static struct file_system_type bm_fs_type = { .owner = THIS_MODULE, .name = "binfmt_misc", - .get_sb = bm_get_sb, + .mount = bm_mount, .kill_sb = kill_litter_super, }; @@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { struct bio *bio; + if (nr_iovecs > UIO_MAXIOV) + return NULL; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), gfp_mask); if (unlikely(!bio)) @@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd) static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); + struct bio_map_data *bmd; + if (iov_count > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; @@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q, end = 
(uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; len += iov[i].iov_len; } @@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; /* * buffer must be aligned to at least hardsector size for now @@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long start = uaddr >> PAGE_SHIFT; const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; - + ret = get_user_pages_fast(uaddr, local_nr_pages, write_to_vm, &pages[cur_page]); if (ret < local_nr_pages) { diff --git a/fs/block_dev.c b/fs/block_dev.c index dea3b628a6ce..06e8ff12b97c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -464,15 +464,15 @@ static const struct super_operations bdev_sops = { .evict_inode = bdev_evict_inode, }; -static int bd_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *bd_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); + return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); } static struct file_system_type bd_type = { .name = "bdev", - .get_sb = bd_get_sb, + .mount = bd_mount, .kill_sb = kill_anon_super, }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 396039b3a8a2..7845d1f7d1d9 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -163,7 +163,6 @@ fail: */ static void end_compressed_bio_read(struct bio *bio, int err) { - struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; struct inode *inode; 
struct page *page; @@ -187,7 +186,6 @@ static void end_compressed_bio_read(struct bio *bio, int err) /* ok, we're the last bio for this extent, lets start * the decompression. */ - tree = &BTRFS_I(inode)->io_tree; ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, cb->start, cb->orig_bio->bi_io_vec, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c3df14ce2cc2..9ac171599258 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -200,7 +200,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid) { struct extent_buffer *cow; - u32 nritems; int ret = 0; int level; struct btrfs_disk_key disk_key; @@ -210,7 +209,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(root->ref_cows && trans->transid != root->last_trans); level = btrfs_header_level(buf); - nritems = btrfs_header_nritems(buf); if (level == 0) btrfs_item_key(buf, &disk_key, 0); else @@ -1008,7 +1006,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, int wret; int pslot; int orig_slot = path->slots[level]; - int err_on_enospc = 0; u64 orig_ptr; if (level == 0) @@ -1071,8 +1068,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (btrfs_header_nritems(mid) < 2) - err_on_enospc = 1; + btrfs_header_nritems(mid); left = read_node_slot(root, parent, pslot - 1); if (left) { @@ -1103,8 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, wret = push_node_left(trans, root, left, mid, 1); if (wret < 0) ret = wret; - if (btrfs_header_nritems(mid) < 2) - err_on_enospc = 1; + btrfs_header_nritems(mid); } /* @@ -1224,14 +1219,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, int wret; int pslot; int orig_slot = path->slots[level]; - u64 orig_ptr; if (level == 0) return 1; mid = path->nodes[level]; WARN_ON(btrfs_header_generation(mid) != trans->transid); - orig_ptr = btrfs_node_blockptr(mid, 
orig_slot); if (level < BTRFS_MAX_LEVEL - 1) parent = path->nodes[level + 1]; @@ -1577,13 +1570,33 @@ read_block_for_search(struct btrfs_trans_handle *trans, blocksize = btrfs_level_size(root, level - 1); tmp = btrfs_find_tree_block(root, blocknr, blocksize); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { - /* - * we found an up to date block without sleeping, return - * right away - */ - *eb_ret = tmp; - return 0; + if (tmp) { + if (btrfs_buffer_uptodate(tmp, 0)) { + if (btrfs_buffer_uptodate(tmp, gen)) { + /* + * we found an up to date block without + * sleeping, return + * right away + */ + *eb_ret = tmp; + return 0; + } + /* the pages were up to date, but we failed + * the generation number check. Do a full + * read for the generation number that is correct. + * We must do this without dropping locks so + * we can trust our generation number + */ + free_extent_buffer(tmp); + tmp = read_tree_block(root, blocknr, blocksize, gen); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + *eb_ret = tmp; + return 0; + } + free_extent_buffer(tmp); + btrfs_release_path(NULL, p); + return -EIO; + } } /* @@ -1596,8 +1609,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_unlock_up_safe(p, level + 1); btrfs_set_path_blocking(p); - if (tmp) - free_extent_buffer(tmp); + free_extent_buffer(tmp); if (p->reada) reada_for_search(root, p, level, slot, key->objectid); @@ -2548,7 +2560,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; - int slot; int i; int push_space = 0; int push_items = 0; @@ -2560,8 +2571,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 this_item_size; u32 old_left_item_size; - slot = path->slots[1]; - if (empty) nr = min(right_nritems, max_slot); else @@ -3330,7 +3339,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, { int ret = 0; int slot; - int slot_orig; struct extent_buffer *leaf; struct 
btrfs_item *item; u32 nritems; @@ -3340,7 +3348,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int size_diff; int i; - slot_orig = path->slots[0]; leaf = path->nodes[0]; slot = path->slots[0]; @@ -3445,7 +3452,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, { int ret = 0; int slot; - int slot_orig; struct extent_buffer *leaf; struct btrfs_item *item; u32 nritems; @@ -3454,7 +3460,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_size; int i; - slot_orig = path->slots[0]; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -3787,7 +3792,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_key *cpu_key, u32 *data_size, int nr) { - struct extent_buffer *leaf; int ret = 0; int slot; int i; @@ -3804,7 +3808,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, if (ret < 0) goto out; - leaf = path->nodes[0]; slot = path->slots[0]; BUG_ON(slot < 0); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eaf286abad17..8db9234f6b41 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -99,6 +99,9 @@ struct btrfs_ordered_sum; */ #define BTRFS_EXTENT_CSUM_OBJECTID -10ULL +/* For storing free space cache */ +#define BTRFS_FREE_SPACE_OBJECTID -11ULL + /* dummy objectid represents multiple objectids */ #define BTRFS_MULTIPLE_OBJECTIDS -255ULL @@ -265,6 +268,22 @@ struct btrfs_chunk { /* additional stripes go here */ } __attribute__ ((__packed__)); +#define BTRFS_FREE_SPACE_EXTENT 1 +#define BTRFS_FREE_SPACE_BITMAP 2 + +struct btrfs_free_space_entry { + __le64 offset; + __le64 bytes; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_free_space_header { + struct btrfs_disk_key location; + __le64 generation; + __le64 num_entries; + __le64 num_bitmaps; +} __attribute__ ((__packed__)); + static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -365,8 +384,10 @@ struct btrfs_super_block { char 
label[BTRFS_LABEL_SIZE]; + __le64 cache_generation; + /* future expansion */ - __le64 reserved[32]; + __le64 reserved[31]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; } __attribute__ ((__packed__)); @@ -375,13 +396,15 @@ struct btrfs_super_block { * ones specified below then we will fail to mount */ #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) -#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0) +#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) +#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL) +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) /* * A leaf is full of items. offset and size tell us where to find @@ -675,7 +698,8 @@ struct btrfs_block_group_item { struct btrfs_space_info { u64 flags; - u64 total_bytes; /* total bytes in the space */ + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ u64 bytes_used; /* total bytes used, this does't take mirrors into account */ u64 bytes_pinned; /* total bytes pinned, will be freed when the @@ -687,6 +711,8 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ int full; /* indicates that we cannot allocate any more chunks for this space */ @@ -750,6 +776,14 @@ enum btrfs_caching_type { BTRFS_CACHE_FINISHED = 2, }; +enum btrfs_disk_cache_state { + BTRFS_DC_WRITTEN = 0, + BTRFS_DC_ERROR = 1, + BTRFS_DC_CLEAR = 2, + BTRFS_DC_SETUP = 3, + BTRFS_DC_NEED_WRITE = 4, +}; + struct btrfs_caching_control { struct list_head list; struct mutex mutex; @@ -763,6 
+797,7 @@ struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; struct btrfs_fs_info *fs_info; + struct inode *inode; spinlock_t lock; u64 pinned; u64 reserved; @@ -773,8 +808,11 @@ struct btrfs_block_group_cache { int extents_thresh; int free_extents; int total_bitmaps; - int ro; - int dirty; + int ro:1; + int dirty:1; + int iref:1; + + int disk_cache_state; /* cache tracking stuff */ int cached; @@ -863,6 +901,7 @@ struct btrfs_fs_info { struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; struct btrfs_super_block super_copy; @@ -949,6 +988,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_workers; struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; + struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; /* * fixup workers take dirty pages that didn't properly go through @@ -1192,6 +1232,9 @@ struct btrfs_root { #define BTRFS_MOUNT_NOSSD (1 << 9) #define BTRFS_MOUNT_DISCARD (1 << 10) #define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) +#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) +#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) +#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1665,6 +1708,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, write_eb_member(eb, item, struct btrfs_dir_item, location, key); } +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(struct extent_buffer *eb, + struct 
btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + /* struct btrfs_disk_key */ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); @@ -1876,6 +1940,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, incompat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); static inline int btrfs_super_csum_size(struct btrfs_super_block *s) { @@ -1988,6 +2054,12 @@ static inline struct dentry *fdentry(struct file *file) return file->f_path.dentry; } +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +{ + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + /* extent-tree.c */ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, @@ -2079,7 +2151,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int num_items, int *retries); + int num_items); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, @@ -2100,7 +2172,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries); + u64 
num_bytes); int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, @@ -2115,6 +2187,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache); +void btrfs_put_block_group_cache(struct btrfs_fs_info *info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2373,7 +2446,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput); +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, + int sync); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_writepages(struct address_space *mapping, @@ -2426,6 +2500,10 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root); int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index e9103b3baa49..f0cad5ae5be7 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -427,5 +427,5 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, ret = btrfs_truncate_item(trans, root, path, item_len - sub_item_len, 1); } - return 0; + return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5e789f4a3ed0..fb827d0d7181 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -338,7 +338,6 @@ static 
int csum_dirty_buffer(struct btrfs_root *root, struct page *page) struct extent_io_tree *tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 found_start; - int found_level; unsigned long len; struct extent_buffer *eb; int ret; @@ -369,8 +368,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(1); goto err; } - found_level = btrfs_header_level(eb); - csum_tree_block(root, eb, 0); err: free_extent_buffer(eb); @@ -481,9 +478,12 @@ static void end_workqueue_bio(struct bio *bio, int err) end_io_wq->work.flags = 0; if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata) + if (end_io_wq->metadata == 1) btrfs_queue_worker(&fs_info->endio_meta_write_workers, &end_io_wq->work); + else if (end_io_wq->metadata == 2) + btrfs_queue_worker(&fs_info->endio_freespace_worker, + &end_io_wq->work); else btrfs_queue_worker(&fs_info->endio_write_workers, &end_io_wq->work); @@ -497,6 +497,13 @@ static void end_workqueue_bio(struct bio *bio, int err) } } +/* + * For the metadata arg you want + * + * 0 - if data + * 1 - if normal metadta + * 2 - if writing to the free space cache area + */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata) { @@ -533,11 +540,9 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone) static void run_one_async_start(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; struct async_submit_bio *async; async = container_of(work, struct async_submit_bio, work); - fs_info = BTRFS_I(async->inode)->root->fs_info; async->submit_bio_start(async->inode, async->rw, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); @@ -850,12 +855,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid) { struct extent_buffer *buf = NULL; - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_io_tree *io_tree; int ret; - io_tree = &BTRFS_I(btree_inode)->io_tree; - buf = 
btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) return NULL; @@ -1377,7 +1378,6 @@ static int bio_ready_for_csum(struct bio *bio) u64 start = 0; struct page *page; struct extent_io_tree *io_tree = NULL; - struct btrfs_fs_info *info = NULL; struct bio_vec *bvec; int i; int ret; @@ -1396,7 +1396,6 @@ static int bio_ready_for_csum(struct bio *bio) buf_len = page->private >> 2; start = page_offset(page) + bvec->bv_offset; io_tree = &BTRFS_I(page->mapping->host)->io_tree; - info = BTRFS_I(page->mapping->host)->root->fs_info; } /* are we fully contained in this bio? */ if (buf_len <= length) @@ -1680,12 +1679,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); - bh = btrfs_read_dev_super(fs_devices->latest_bdev); if (!bh) goto fail_iput; @@ -1775,6 +1774,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write", + 1, &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1795,6 +1796,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_meta_workers, 1); btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->endio_freespace_worker, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1993,6 +1995,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY)) { down_read(&fs_info->cleanup_work_sem); 
btrfs_orphan_cleanup(fs_info->fs_root); + btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); } @@ -2035,6 +2038,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); @@ -2410,6 +2414,7 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + btrfs_put_block_group_cache(fs_info); if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) @@ -2456,6 +2461,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); btrfs_close_devices(fs_info->fs_devices); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0b81ecdb101c..0c097f3aec41 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -242,6 +242,12 @@ get_caching_control(struct btrfs_block_group_cache *cache) return NULL; } + /* We're loading it the fast way, so we don't have a caching_ctl. 
*/ + if (!cache->caching_ctl) { + spin_unlock(&cache->lock); + return NULL; + } + ctl = cache->caching_ctl; atomic_inc(&ctl->count); spin_unlock(&cache->lock); @@ -421,7 +427,9 @@ err: return 0; } -static int cache_block_group(struct btrfs_block_group_cache *cache) +static int cache_block_group(struct btrfs_block_group_cache *cache, + struct btrfs_trans_handle *trans, + int load_cache_only) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; @@ -432,6 +440,36 @@ static int cache_block_group(struct btrfs_block_group_cache *cache) if (cache->cached != BTRFS_CACHE_NO) return 0; + /* + * We can't do the read from on-disk cache during a commit since we need + * to have the normal tree locking. + */ + if (!trans->transaction->in_commit) { + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + return 0; + } + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + ret = load_free_space_cache(fs_info, cache); + + spin_lock(&cache->lock); + if (ret == 1) { + cache->cached = BTRFS_CACHE_FINISHED; + cache->last_byte_to_unpin = (u64)-1; + } else { + cache->cached = BTRFS_CACHE_NO; + } + spin_unlock(&cache->lock); + if (ret == 1) + return 0; + } + + if (load_cache_only) + return 0; + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); BUG_ON(!caching_ctl); @@ -509,7 +547,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { - if (found->flags == flags) { + if (found->flags & flags) { rcu_read_unlock(); return found; } @@ -542,6 +580,15 @@ static u64 div_factor(u64 num, int factor) return num; } +static u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + do_div(num, 100); + return num; +} + u64 btrfs_find_block_group(struct btrfs_root *root, u64 search_start, u64 search_hint, int owner) { @@ -2687,6 +2734,109 @@ next_block_group(struct 
btrfs_root *root, return cache; } +static int cache_save_setup(struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_root *root = block_group->fs_info->tree_root; + struct inode *inode = NULL; + u64 alloc_hint = 0; + int num_pages = 0; + int retries = 0; + int ret = 0; + + /* + * If this block group is smaller than 100 megs don't bother caching the + * block group. + */ + if (block_group->key.offset < (100 * 1024 * 1024)) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + +again: + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + btrfs_release_path(root, path); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retries); + retries++; + + if (block_group->ro) + goto out_free; + + ret = create_free_space_inode(root, trans, block_group, path); + if (ret) + goto out_free; + goto again; + } + + /* + * We want to set the generation to 0, that way if anything goes wrong + * from here on out we know not to trust this cache when we load up next + * time. + */ + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + WARN_ON(ret); + + if (i_size_read(inode) > 0) { + ret = btrfs_truncate_free_space_cache(root, trans, path, + inode); + if (ret) + goto out_put; + } + + spin_lock(&block_group->lock); + if (block_group->cached != BTRFS_CACHE_FINISHED) { + spin_unlock(&block_group->lock); + goto out_put; + } + spin_unlock(&block_group->lock); + + num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); + if (!num_pages) + num_pages = 1; + + /* + * Just to make absolutely sure we have enough space, we're going to + * preallocate 12 pages worth of space for each block group. 
In + * practice we ought to use at most 8, but we need extra space so we can + * add our header and have a terminator between the extents and the + * bitmaps. + */ + num_pages *= 16; + num_pages *= PAGE_CACHE_SIZE; + + ret = btrfs_check_data_free_space(inode, num_pages); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, + num_pages, num_pages, + &alloc_hint); + btrfs_free_reserved_data_space(inode, num_pages); +out_put: + iput(inode); +out_free: + btrfs_release_path(root, path); +out: + spin_lock(&block_group->lock); + if (ret) + block_group->disk_cache_state = BTRFS_DC_ERROR; + else + block_group->disk_cache_state = BTRFS_DC_SETUP; + spin_unlock(&block_group->lock); + + return ret; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -2699,6 +2849,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + err = cache_save_setup(cache, trans, path); + last = cache->key.objectid + cache->key.offset; + btrfs_put_block_group(cache); + } + while (1) { if (last == 0) { err = btrfs_run_delayed_refs(trans, root, @@ -2708,6 +2877,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache = btrfs_lookup_first_block_group(root->fs_info, last); while (cache) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) { + btrfs_put_block_group(cache); + goto again; + } + if (cache->dirty) break; cache = next_block_group(root, cache); @@ -2719,6 +2893,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, continue; } + if (cache->disk_cache_state == BTRFS_DC_SETUP) + cache->disk_cache_state = BTRFS_DC_NEED_WRITE; cache->dirty = 0; 
last = cache->key.objectid + cache->key.offset; @@ -2727,6 +2903,52 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, btrfs_put_block_group(cache); } + while (1) { + /* + * I don't think this is needed since we're just marking our + * preallocated extent as written, but just in case it can't + * hurt. + */ + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); + } + + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + /* + * Really this shouldn't happen, but it could if we + * couldn't write the entire preallocated extent and + * splitting the extent resulted in a new block. + */ + if (cache->dirty) { + btrfs_put_block_group(cache); + goto again; + } + if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + + btrfs_write_out_cache(root, trans, cache, path); + + /* + * If we didn't have an error then the cache state is still + * NEED_WRITE, so we can set it to WRITTEN. 
+ */ + if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + cache->disk_cache_state = BTRFS_DC_WRITTEN; + last = cache->key.objectid + cache->key.offset; + btrfs_put_block_group(cache); + } + btrfs_free_path(path); return 0; } @@ -2762,6 +2984,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (found) { spin_lock(&found->lock); found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; found->full = 0; @@ -2781,6 +3004,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA); found->total_bytes = total_bytes; + found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; found->disk_used = bytes_used * factor; found->bytes_pinned = 0; @@ -2882,11 +3106,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; u64 used; - int ret = 0, committed = 0; + int ret = 0, committed = 0, alloc_chunk = 1; /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + if (root == root->fs_info->tree_root) { + alloc_chunk = 0; + committed = 1; + } + data_sinfo = BTRFS_I(inode)->space_info; if (!data_sinfo) goto alloc; @@ -2905,7 +3134,7 @@ again: * if we don't have enough free bytes in this space then we need * to alloc a new chunk. 
*/ - if (!data_sinfo->full) { + if (!data_sinfo->full && alloc_chunk) { u64 alloc_target; data_sinfo->force_alloc = 1; @@ -2997,10 +3226,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } -static int should_alloc_chunk(struct btrfs_space_info *sinfo, - u64 alloc_bytes) +static int should_alloc_chunk(struct btrfs_root *root, + struct btrfs_space_info *sinfo, u64 alloc_bytes) { u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + u64 thresh; if (sinfo->bytes_used + sinfo->bytes_reserved + alloc_bytes + 256 * 1024 * 1024 < num_bytes) @@ -3010,6 +3240,12 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo, alloc_bytes < div_factor(num_bytes, 8)) return 0; + thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); + + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) + return 0; + return 1; } @@ -3041,13 +3277,21 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } - if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { + if (!force && !should_alloc_chunk(extent_root, space_info, + alloc_bytes)) { spin_unlock(&space_info->lock); goto out; } spin_unlock(&space_info->lock); /* + * If we have mixed data/metadata chunks we want to make sure we keep + * allocating mixed chunks instead of individual chunks. + */ + if (btrfs_mixed_space_info(space_info)) + flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); + + /* * if we're doing a data chunk, go ahead and make sure that * we keep a reasonable number of metadata chunks allocated in the * FS as well. 
@@ -3072,55 +3316,25 @@ out: return ret; } -static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 num_bytes) -{ - int ret; - int end_trans = 0; - - if (sinfo->full) - return 0; - - spin_lock(&sinfo->lock); - ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024); - spin_unlock(&sinfo->lock); - if (!ret) - return 0; - - if (!trans) { - trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - end_trans = 1; - } - - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, - get_alloc_profile(root, sinfo->flags), 0); - - if (end_trans) - btrfs_end_transaction(trans, root); - - return ret == 1 ? 1 : 0; -} - /* * shrink metadata reservation for delalloc */ static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim) + struct btrfs_root *root, u64 to_reclaim, int sync) { struct btrfs_block_rsv *block_rsv; + struct btrfs_space_info *space_info; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; int pause = 1; - int ret; + int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; block_rsv = &root->fs_info->delalloc_block_rsv; - spin_lock(&block_rsv->lock); - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); + space_info = block_rsv->space_info; + + smp_mb(); + reserved = space_info->bytes_reserved; if (reserved == 0) return 0; @@ -3128,104 +3342,169 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, max_reclaim = min(reserved, to_reclaim); while (1) { - ret = btrfs_start_one_delalloc_inode(root, trans ? 
1 : 0); - if (!ret) { - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(pause); - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; - } else { - pause = 1; - } + /* have the flusher threads jump in and do some IO */ + smp_mb(); + nr_pages = min_t(unsigned long, nr_pages, + root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); - spin_lock(&block_rsv->lock); - if (reserved > block_rsv->reserved) - reclaimed = reserved - block_rsv->reserved; - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); + spin_lock(&space_info->lock); + if (reserved > space_info->bytes_reserved) + reclaimed += reserved - space_info->bytes_reserved; + reserved = space_info->bytes_reserved; + spin_unlock(&space_info->lock); if (reserved == 0 || reclaimed >= max_reclaim) break; if (trans && trans->transaction->blocked) return -EAGAIN; + + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(pause); + pause <<= 1; + if (pause > HZ / 10) + pause = HZ / 10; + } return reclaimed >= to_reclaim; } -static int should_retry_reserve(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries) +/* + * Retries tells us how many times we've called reserve_metadata_bytes. The + * idea is if this is the first call (retries == 0) then we will add to our + * reserved count if we can't make the allocation in order to hold our place + * while we go and try and free up space. That way for retries > 1 we don't try + * and add space, we just check to see if the amount of unused space is >= the + * total space, meaning that our reservation is valid. + * + * However if we don't intend to retry this reservation, pass -1 as retries so + * that it short circuits this logic. 
+ */ +static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - int ret; + u64 unused; + u64 num_bytes = orig_bytes; + int retries = 0; + int ret = 0; + bool reserved = false; + bool committed = false; - if ((*retries) > 2) - return -ENOSPC; +again: + ret = -ENOSPC; + if (reserved) + num_bytes = 0; - ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); - if (ret) - return 1; + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; - if (trans && trans->transaction->in_commit) - return -ENOSPC; + /* + * The idea here is that we've not already over-reserved the block group + * then we can go ahead and save our reservation first and then start + * flushing if we need to. Otherwise if we've already overcommitted + * lets start flushing stuff first and then come back and try to make + * our reservation. + */ + if (unused <= space_info->total_bytes) { + unused -= space_info->total_bytes; + if (unused >= num_bytes) { + if (!reserved) + space_info->bytes_reserved += orig_bytes; + ret = 0; + } else { + /* + * Ok set num_bytes to orig_bytes since we aren't + * overocmmitted, this way we only try and reclaim what + * we need. + */ + num_bytes = orig_bytes; + } + } else { + /* + * Ok we're over committed, set num_bytes to the overcommitted + * amount plus the amount of bytes that we need for this + * reservation. + */ + num_bytes = unused - space_info->total_bytes + + (orig_bytes * (retries + 1)); + } - ret = shrink_delalloc(trans, root, num_bytes); - if (ret) - return ret; + /* + * Couldn't make our reservation, save our place so while we're trying + * to reclaim space we can actually use it instead of somebody else + * stealing it from us. 
+ */ + if (ret && !reserved) { + space_info->bytes_reserved += orig_bytes; + reserved = true; + } - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < num_bytes) - ret = 1; spin_unlock(&space_info->lock); - if (ret) - return -ENOSPC; - - (*retries)++; - if (trans) - return -EAGAIN; + if (!ret) + return 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + if (!flush) + goto out; - return 1; -} + /* + * We do synchronous shrinking since we don't actually unreserve + * metadata until after the IO is completed. + */ + ret = shrink_delalloc(trans, root, num_bytes, 1); + if (ret > 0) + return 0; + else if (ret < 0) + goto out; -static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - u64 unused; - int ret = -ENOSPC; + /* + * So if we were overcommitted it's possible that somebody else flushed + * out enough space and we simply didn't have enough space to reclaim, + * so go back around and try again. + */ + if (retries < 2) { + retries++; + goto again; + } spin_lock(&space_info->lock); - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly; + /* + * Not enough space to be reclaimed, don't bother committing the + * transaction. 
+ */ + if (space_info->bytes_pinned < orig_bytes) + ret = -ENOSPC; + spin_unlock(&space_info->lock); + if (ret) + goto out; - if (unused < space_info->total_bytes) - unused = space_info->total_bytes - unused; - else - unused = 0; + ret = -EAGAIN; + if (trans || committed) + goto out; - if (unused >= num_bytes) { - if (block_rsv->priority >= 10) { - space_info->bytes_reserved += num_bytes; - ret = 0; - } else { - if ((unused + block_rsv->reserved) * - block_rsv->priority >= - (num_bytes + block_rsv->reserved) * 10) { - space_info->bytes_reserved += num_bytes; - ret = 0; - } - } + ret = -ENOSPC; + trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + goto out; + ret = btrfs_commit_transaction(trans, root); + if (!ret) { + trans = NULL; + committed = true; + goto again; + } + +out: + if (reserved) { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= orig_bytes; + spin_unlock(&space_info->lock); } - spin_unlock(&space_info->lock); return ret; } @@ -3327,18 +3606,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) { struct btrfs_block_rsv *block_rsv; struct btrfs_fs_info *fs_info = root->fs_info; - u64 alloc_target; block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); if (!block_rsv) return NULL; btrfs_init_block_rsv(block_rsv); - - alloc_target = btrfs_get_alloc_profile(root, 0); block_rsv->space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - return block_rsv; } @@ -3369,23 +3644,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries) + u64 num_bytes) { int ret; if (num_bytes == 0) return 0; -again: - ret = reserve_metadata_bytes(block_rsv, num_bytes); + + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; } - ret = should_retry_reserve(trans, root, block_rsv, 
num_bytes, retries); - if (ret > 0) - goto again; - return ret; } @@ -3420,7 +3691,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(block_rsv, num_bytes); + ret = reserve_metadata_bytes(trans, root, block_rsv, + num_bytes, 0); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -3499,6 +3771,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&sinfo->lock); + if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) + data_used = 0; meta_used = sinfo->bytes_used; spin_unlock(&sinfo->lock); @@ -3526,7 +3800,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->size = num_bytes; num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + - sinfo->bytes_reserved + sinfo->bytes_readonly; + sinfo->bytes_reserved + sinfo->bytes_readonly + + sinfo->bytes_may_use; if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; @@ -3597,7 +3872,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int num_items, int *retries) + int num_items) { u64 num_bytes; int ret; @@ -3607,7 +3882,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = calc_trans_metadata_size(root, num_items); ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, - num_bytes, retries); + num_bytes); if (!ret) { trans->bytes_reserved += num_bytes; trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -3681,14 +3956,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve; int nr_extents; - int retries = 0; int ret; if (btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = 
ALIGN(num_bytes, root->sectorsize); -again: + spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; if (nr_extents > BTRFS_I(inode)->reserved_extents) { @@ -3698,18 +3972,14 @@ again: nr_extents = 0; to_reserve = 0; } + spin_unlock(&BTRFS_I(inode)->accounting_lock); to_reserve += calc_csum_metadata_size(inode, num_bytes); - ret = reserve_metadata_bytes(block_rsv, to_reserve); - if (ret) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - ret = should_retry_reserve(NULL, root, block_rsv, to_reserve, - &retries); - if (ret > 0) - goto again; + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + if (ret) return ret; - } + spin_lock(&BTRFS_I(inode)->accounting_lock); BTRFS_I(inode)->reserved_extents += nr_extents; atomic_inc(&BTRFS_I(inode)->outstanding_extents); spin_unlock(&BTRFS_I(inode)->accounting_lock); @@ -3717,7 +3987,7 @@ again: block_rsv_add_bytes(block_rsv, to_reserve, 1); if (block_rsv->size > 512 * 1024 * 1024) - shrink_delalloc(NULL, root, to_reserve); + shrink_delalloc(NULL, root, to_reserve, 0); return 0; } @@ -3776,12 +4046,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group_cache *cache = NULL; struct btrfs_fs_info *info = root->fs_info; - int factor; u64 total = num_bytes; u64 old_val; u64 byte_in_group; + int factor; /* block accounting for super block */ spin_lock(&info->delalloc_lock); @@ -3803,11 +4073,25 @@ static int update_block_group(struct btrfs_trans_handle *trans, factor = 2; else factor = 1; + /* + * If this block group has free space cache written out, we + * need to make sure to load it if we are removing space. This + * is because we need the unpinning stage to actually add the + * space back to the block group, otherwise we will leak space. 
+ */ + if (!alloc && cache->cached == BTRFS_CACHE_NO) + cache_block_group(cache, trans, 1); + byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); + + if (btrfs_super_cache_generation(&info->super_copy) != 0 && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); @@ -4554,6 +4838,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; + bool use_cluster = true; u64 ideal_cache_percent = 0; u64 ideal_cache_offset = 0; @@ -4568,16 +4853,24 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, return -ENOSPC; } + /* + * If the space info is for both data and metadata it means we have a + * small filesystem and we can't use the clustering stuff. 
+ */ + if (btrfs_mixed_space_info(space_info)) + use_cluster = false; + if (orig_root->ref_cows || empty_size) allowed_chunk_alloc = 1; - if (data & BTRFS_BLOCK_GROUP_METADATA) { + if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && + btrfs_test_opt(root, SSD)) { last_ptr = &root->fs_info->data_alloc_cluster; } @@ -4641,6 +4934,10 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { u64 free_percent; + ret = cache_block_group(block_group, trans, 1); + if (block_group->cached == BTRFS_CACHE_FINISHED) + goto have_block_group; + free_percent = btrfs_block_group_used(&block_group->item); free_percent *= 100; free_percent = div64_u64(free_percent, @@ -4661,7 +4958,7 @@ have_block_group: if (loop > LOOP_CACHING_NOWAIT || (loop > LOOP_FIND_IDEAL && atomic_read(&space_info->caching_threads) < 2)) { - ret = cache_block_group(block_group); + ret = cache_block_group(block_group, trans, 0); BUG_ON(ret); } found_uncached_bg = true; @@ -5218,7 +5515,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group); + cache_block_group(block_group, trans, 0); caching_ctl = get_caching_control(block_group); if (!caching_ctl) { @@ -5308,7 +5605,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(block_rsv, blocksize); + ret = reserve_metadata_bytes(trans, root, block_rsv, + blocksize, 0); if (ret) return ERR_PTR(ret); return block_rsv; @@ -5318,11 +5616,6 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; - WARN_ON(1); - printk(KERN_INFO"block_rsv size %llu 
reserved %llu freed %llu %llu\n", - block_rsv->size, block_rsv->reserved, - block_rsv->freed[0], block_rsv->freed[1]); - return ERR_PTR(-ENOSPC); } @@ -5421,7 +5714,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, u64 generation; u64 refs; u64 flags; - u64 last = 0; u32 nritems; u32 blocksize; struct btrfs_key key; @@ -5489,7 +5781,6 @@ reada: generation); if (ret) break; - last = bytenr + blocksize; nread++; } wc->reada_slot = slot; @@ -7813,6 +8104,40 @@ out: return ret; } +void btrfs_put_block_group_cache(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + u64 last = 0; + + while (1) { + struct inode *inode; + + block_group = btrfs_lookup_first_block_group(info, last); + while (block_group) { + spin_lock(&block_group->lock); + if (block_group->iref) + break; + spin_unlock(&block_group->lock); + block_group = next_block_group(info->tree_root, + block_group); + } + if (!block_group) { + if (last == 0) + break; + last = 0; + continue; + } + + inode = block_group->inode; + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + last = block_group->key.objectid + block_group->key.offset; + btrfs_put_block_group(block_group); + } +} + int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; @@ -7896,6 +8221,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; + int need_clear = 0; + u64 cache_gen; root = info->extent_root; key.objectid = 0; @@ -7905,6 +8232,15 @@ int btrfs_read_block_groups(struct btrfs_root *root) if (!path) return -ENOMEM; + cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + if (cache_gen != 0 && + btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + need_clear = 1; + if (btrfs_test_opt(root, CLEAR_CACHE)) + need_clear = 1; + if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) + 
printk(KERN_INFO "btrfs: disk space caching is enabled\n"); + while (1) { ret = find_first_block_group(root, path, &key); if (ret > 0) @@ -7927,6 +8263,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); + if (need_clear) + cache->disk_cache_state = BTRFS_DC_CLEAR; + /* * we only want to have 32k of ram per block group for keeping * track of free space, and if we pass 1/2 of that we want to @@ -8031,6 +8370,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.offset = size; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; /* * we only want to have 32k of ram per block group for keeping track @@ -8087,8 +8427,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_block_group_cache *block_group; struct btrfs_free_cluster *cluster; + struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_key key; + struct inode *inode; int ret; + int factor; root = root->fs_info->extent_root; @@ -8097,6 +8440,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, BUG_ON(!block_group->ro); memcpy(&key, &block_group->key, sizeof(key)); + if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; /* make sure this block group isn't part of an allocation cluster */ cluster = &root->fs_info->data_alloc_cluster; @@ -8116,6 +8465,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + inode = lookup_free_space_inode(root, block_group, path); + if (!IS_ERR(inode)) { + btrfs_orphan_add(trans, inode); + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + 
iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for our lookup ref */ + iput(inode); + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) + btrfs_release_path(tree_root, path); + if (ret == 0) { + ret = btrfs_del_item(trans, tree_root, path); + if (ret) + goto out; + btrfs_release_path(tree_root, path); + } + spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); @@ -8137,8 +8520,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; + block_group->space_info->disk_total -= block_group->key.offset * factor; spin_unlock(&block_group->space_info->lock); + memcpy(&key, &block_group->key, sizeof(key)); + btrfs_clear_space_info_full(root->fs_info); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d74e6af9b53a..eac10e3260a9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -104,7 +104,7 @@ void extent_io_tree_init(struct extent_io_tree *tree, struct address_space *mapping, gfp_t mask) { tree->state = RB_ROOT; - tree->buffer = RB_ROOT; + INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); @@ -235,50 +235,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree, return ret; } -static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, - u64 offset, struct rb_node *node) -{ - struct rb_root *root = &tree->buffer; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct extent_buffer *eb; - - while (*p) { - parent = *p; - eb = rb_entry(parent, struct 
extent_buffer, rb_node); - - if (offset < eb->start) - p = &(*p)->rb_left; - else if (offset > eb->start) - p = &(*p)->rb_right; - else - return eb; - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static struct extent_buffer *buffer_search(struct extent_io_tree *tree, - u64 offset) -{ - struct rb_root *root = &tree->buffer; - struct rb_node *n = root->rb_node; - struct extent_buffer *eb; - - while (n) { - eb = rb_entry(n, struct extent_buffer, rb_node); - if (offset < eb->start) - n = n->rb_left; - else if (offset > eb->start) - n = n->rb_right; - else - return eb; - } - return NULL; -} - static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, struct extent_state *other) { @@ -1901,10 +1857,8 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; u64 start; - u64 end; start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; - end = start + bvec->bv_len - 1; bio->bi_private = NULL; @@ -2204,7 +2158,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 last_byte = i_size_read(inode); u64 block_start; u64 iosize; - u64 unlock_start; sector_t sector; struct extent_state *cached_state = NULL; struct extent_map *em; @@ -2329,7 +2282,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); - unlock_start = page_end + 1; goto done; } @@ -2340,7 +2292,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); - unlock_start = page_end + 1; break; } em = epd->get_extent(inode, page, pg_offset, cur, @@ -2387,7 +2338,6 @@ static int __extent_writepage(struct page *page, struct writeback_control 
*wbc, cur += iosize; pg_offset += iosize; - unlock_start = cur; continue; } /* leave this out until we have a page_mkwrite call */ @@ -2473,7 +2423,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; - int range_whole = 0; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -2482,8 +2431,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; scanned = 1; } retry: @@ -2823,6 +2770,8 @@ int extent_prepare_write(struct extent_io_tree *tree, NULL, 1, end_bio_extent_preparewrite, 0, 0, 0); + if (ret && !err) + err = ret; iocount++; block_start = block_start + iosize; } else { @@ -3104,6 +3053,39 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } +/* + * Helper for releasing extent buffer page. + */ +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, + unsigned long start_idx) +{ + unsigned long index; + struct page *page; + + if (!eb->first_page) + return; + + index = num_extent_pages(eb->start, eb->len); + if (start_idx >= index) + return; + + do { + index--; + page = extent_buffer_page(eb, index); + if (page) + page_cache_release(page); + } while (index != start_idx); +} + +/* + * Helper for releasing the extent buffer. 
+ */ +static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) +{ + btrfs_release_extent_buffer_page(eb, 0); + __free_extent_buffer(eb); +} + struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, struct page *page0, @@ -3117,16 +3099,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct page *p; struct address_space *mapping = tree->mapping; int uptodate = 1; + int ret; - spin_lock(&tree->buffer_lock); - eb = buffer_search(tree, start); - if (eb) { - atomic_inc(&eb->refs); - spin_unlock(&tree->buffer_lock); + rcu_read_lock(); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); mark_page_accessed(eb->first_page); return eb; } - spin_unlock(&tree->buffer_lock); + rcu_read_unlock(); eb = __alloc_extent_buffer(tree, start, len, mask); if (!eb) @@ -3165,26 +3147,31 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + spin_lock(&tree->buffer_lock); - exists = buffer_tree_insert(tree, start, &eb->rb_node); - if (exists) { + ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); + if (ret == -EEXIST) { + exists = radix_tree_lookup(&tree->buffer, + start >> PAGE_CACHE_SHIFT); /* add one reference for the caller */ atomic_inc(&exists->refs); spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); goto free_eb; } /* add one reference for the tree */ atomic_inc(&eb->refs); spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); return eb; free_eb: if (!atomic_dec_and_test(&eb->refs)) return exists; - for (index = 1; index < i; index++) - page_cache_release(extent_buffer_page(eb, index)); - page_cache_release(extent_buffer_page(eb, 0)); - __free_extent_buffer(eb); + btrfs_release_extent_buffer(eb); return exists; } @@ 
-3194,16 +3181,16 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, { struct extent_buffer *eb; - spin_lock(&tree->buffer_lock); - eb = buffer_search(tree, start); - if (eb) - atomic_inc(&eb->refs); - spin_unlock(&tree->buffer_lock); - - if (eb) + rcu_read_lock(); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); mark_page_accessed(eb->first_page); + return eb; + } + rcu_read_unlock(); - return eb; + return NULL; } void free_extent_buffer(struct extent_buffer *eb) @@ -3833,34 +3820,45 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); + + btrfs_release_extent_buffer(eb); +} + int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) { u64 start = page_offset(page); struct extent_buffer *eb; int ret = 1; - unsigned long i; - unsigned long num_pages; spin_lock(&tree->buffer_lock); - eb = buffer_search(tree, start); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (!eb) goto out; - if (atomic_read(&eb->refs) > 1) { + if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { ret = 0; goto out; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + + /* + * set @eb->refs to 0 if it is already 1, and then release the @eb. + * Or go back. 
+ */ + if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { ret = 0; goto out; } - /* at this point we can safely release the extent buffer */ - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) - page_cache_release(extent_buffer_page(eb, i)); - rb_erase(&eb->rb_node, &tree->buffer); - __free_extent_buffer(eb); + + radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); out: spin_unlock(&tree->buffer_lock); + + /* at this point we can safely release the extent buffer */ + if (atomic_read(&eb->refs) == 0) + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5691c7b590da..1c6d4f342ef7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -85,7 +85,7 @@ struct extent_io_ops { struct extent_io_tree { struct rb_root state; - struct rb_root buffer; + struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; spinlock_t lock; @@ -123,7 +123,7 @@ struct extent_buffer { unsigned long bflags; atomic_t refs; struct list_head leak_list; - struct rb_node rb_node; + struct rcu_head rcu_head; /* the spinlock is used to protect most operations */ spinlock_t lock; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 454ca52d6451..23cb8da3ff66 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -335,7 +335,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, goto out; } if (IS_ERR(rb_node)) { - em = ERR_PTR(PTR_ERR(rb_node)); + em = ERR_CAST(rb_node); goto out; } em = rb_entry(rb_node, struct extent_map, rb_node); @@ -384,7 +384,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, goto out; } if (IS_ERR(rb_node)) { - em = ERR_PTR(PTR_ERR(rb_node)); + em = ERR_CAST(rb_node); goto out; } em = rb_entry(rb_node, struct extent_map, rb_node); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f488fac04d99..22ee0dc2e6b8 100644 --- 
a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -23,10 +23,761 @@ #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" +#include "disk-io.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) +static void recalculate_thresholds(struct btrfs_block_group_cache + *block_group); +static int link_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info); + +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct inode *inode = NULL; + int ret; + + spin_lock(&block_group->lock); + if (block_group->inode) + inode = igrab(block_group->inode); + spin_unlock(&block_group->lock); + if (inode) + return inode; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + btrfs_release_path(root, path); + return ERR_PTR(-ENOENT); + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_free_space_key(leaf, header, &disk_key); + btrfs_disk_key_to_cpu(&location, &disk_key); + btrfs_release_path(root, path); + + inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); + if (!inode) + return ERR_PTR(-ENOENT); + if (IS_ERR(inode)) + return inode; + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } + + spin_lock(&block_group->lock); + if (!root->fs_info->closing) { + block_group->inode = igrab(inode); + block_group->iref = 1; + } + spin_unlock(&block_group->lock); + + return inode; +} + +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct 
btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + u64 objectid; + int ret; + + ret = btrfs_find_free_objectid(trans, root, 0, &objectid); + if (ret < 0) + return ret; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + return ret; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + btrfs_item_key(leaf, &disk_key, path->slots[0]); + memset_extent_buffer(leaf, 0, (unsigned long)inode_item, + sizeof(*inode_item)); + btrfs_set_inode_generation(leaf, inode_item, trans->transid); + btrfs_set_inode_size(leaf, inode_item, 0); + btrfs_set_inode_nbytes(leaf, inode_item, 0); + btrfs_set_inode_uid(leaf, inode_item, 0); + btrfs_set_inode_gid(leaf, inode_item, 0); + btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); + btrfs_set_inode_nlink(leaf, inode_item, 1); + btrfs_set_inode_transid(leaf, inode_item, trans->transid); + btrfs_set_inode_block_group(leaf, inode_item, + block_group->key.objectid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_free_space_header)); + if (ret < 0) { + btrfs_release_path(root, path); + return ret; + } + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); + btrfs_set_free_space_key(leaf, header, &disk_key); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); + + return 0; +} + +int btrfs_truncate_free_space_cache(struct 
btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + loff_t oldsize; + int ret = 0; + + trans->block_rsv = root->orphan_block_rsv; + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, + 0, 5); + if (ret) + return ret; + + oldsize = i_size_read(inode); + btrfs_i_size_write(inode, 0); + truncate_pagecache(inode, oldsize, 0); + + /* + * We don't need an orphan item because truncating the free space cache + * will never be split across transactions. + */ + ret = btrfs_truncate_inode_items(trans, root, inode, + 0, BTRFS_EXTENT_DATA_KEY); + if (ret) { + WARN_ON(1); + return ret; + } + + return btrfs_update_inode(trans, root, inode); +} + +static int readahead_cache(struct inode *inode) +{ + struct file_ra_state *ra; + unsigned long last_index; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + file_ra_state_init(ra, inode->i_mapping); + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); + + kfree(ra); + + return 0; +} + +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_root *root = fs_info->tree_root; + struct inode *inode; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct page *page; + struct btrfs_path *path; + u32 *checksums = NULL, *crc; + char *disk_crcs = NULL; + struct btrfs_key key; + struct list_head bitmaps; + u64 num_entries; + u64 num_bitmaps; + u64 generation; + u32 cur_crc = ~(u32)0; + pgoff_t index = 0; + unsigned long first_page_offset; + int num_checksums; + int ret = 0; + + /* + * If we're unmounting then just return, since this does a search on the + * normal root and not the commit root and we could deadlock. 
+ */ + smp_mb(); + if (fs_info->closing) + return 0; + + /* + * If this block group has been marked to be cleared for one reason or + * another then we can't trust the on disk cache, so just return. + */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + INIT_LIST_HEAD(&bitmaps); + + path = btrfs_alloc_path(); + if (!path) + return 0; + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) { + btrfs_free_path(path); + return 0; + } + + /* Nothing in the space cache, goodbye */ + if (!i_size_read(inode)) { + btrfs_free_path(path); + goto out; + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret) { + btrfs_free_path(path); + goto out; + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + num_entries = btrfs_free_space_entries(leaf, header); + num_bitmaps = btrfs_free_space_bitmaps(leaf, header); + generation = btrfs_free_space_generation(leaf, header); + btrfs_free_path(path); + + if (BTRFS_I(inode)->generation != generation) { + printk(KERN_ERR "btrfs: free space inode generation (%llu) did" + " not match free space cache generation (%llu) for " + "block group %llu\n", + (unsigned long long)BTRFS_I(inode)->generation, + (unsigned long long)generation, + (unsigned long long)block_group->key.objectid); + goto out; + } + + if (!num_entries) + goto out; + + /* Setup everything for doing checksumming */ + num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; + checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); + if (!checksums) + goto out; + first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); + disk_crcs = kzalloc(first_page_offset, GFP_NOFS); + if (!disk_crcs) + goto out; + + ret = readahead_cache(inode); 
+ if (ret) { + ret = 0; + goto out; + } + + while (1) { + struct btrfs_free_space_entry *entry; + struct btrfs_free_space *e; + void *addr; + unsigned long offset = 0; + unsigned long start_offset = 0; + int need_loop = 0; + + if (!num_entries && !num_bitmaps) + break; + + if (index == 0) { + start_offset = first_page_offset; + offset = start_offset; + } + + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + ret = 0; + goto free_cache; + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + printk(KERN_ERR "btrfs: error reading free " + "space cache: %llu\n", + (unsigned long long) + block_group->key.objectid); + goto free_cache; + } + } + addr = kmap(page); + + if (index == 0) { + u64 *gen; + + memcpy(disk_crcs, addr, first_page_offset); + gen = addr + (sizeof(u32) * num_checksums); + if (*gen != BTRFS_I(inode)->generation) { + printk(KERN_ERR "btrfs: space cache generation" + " (%llu) does not match inode (%llu) " + "for block group %llu\n", + (unsigned long long)*gen, + (unsigned long long) + BTRFS_I(inode)->generation, + (unsigned long long) + block_group->key.objectid); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + crc = (u32 *)disk_crcs; + } + entry = addr + start_offset; + + /* First lets check our crc before we do anything fun */ + cur_crc = ~(u32)0; + cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, + PAGE_CACHE_SIZE - start_offset); + btrfs_csum_final(cur_crc, (char *)&cur_crc); + if (cur_crc != *crc) { + printk(KERN_ERR "btrfs: crc mismatch for page %lu in " + "block group %llu\n", index, + (unsigned long long)block_group->key.objectid); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + crc++; + + while (1) { + if (!num_entries) + break; + + need_loop = 1; + e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!e) { + 
kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + + e->offset = le64_to_cpu(entry->offset); + e->bytes = le64_to_cpu(entry->bytes); + if (!e->bytes) { + kunmap(page); + kfree(e); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + + if (entry->type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&block_group->tree_lock); + ret = link_free_space(block_group, e); + spin_unlock(&block_group->tree_lock); + BUG_ON(ret); + } else { + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kunmap(page); + kfree(e); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + spin_lock(&block_group->tree_lock); + ret = link_free_space(block_group, e); + block_group->total_bitmaps++; + recalculate_thresholds(block_group); + spin_unlock(&block_group->tree_lock); + list_add_tail(&e->list, &bitmaps); + } + + num_entries--; + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + break; + entry++; + } + + /* + * We read an entry out of this page, we need to move on to the + * next page. + */ + if (need_loop) { + kunmap(page); + goto next; + } + + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. 
+ */ + e = list_entry(bitmaps.next, struct btrfs_free_space, list); + list_del_init(&e->list); + memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); + kunmap(page); + num_bitmaps--; +next: + unlock_page(page); + page_cache_release(page); + index++; + } + + ret = 1; +out: + kfree(checksums); + kfree(disk_crcs); + iput(inode); + return ret; + +free_cache: + /* This cache is bogus, make sure it gets cleared */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + btrfs_remove_free_space_cache(block_group); + goto out; +} + +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct inode *inode; + struct rb_node *node; + struct list_head *pos, *n; + struct page *page; + struct extent_state *cached_state = NULL; + struct list_head bitmap_list; + struct btrfs_key key; + u64 bytes = 0; + u32 *crc, *checksums; + pgoff_t index = 0, last_index = 0; + unsigned long first_page_offset; + int num_checksums; + int entries = 0; + int bitmaps = 0; + int ret = 0; + + root = root->fs_info->tree_root; + + INIT_LIST_HEAD(&bitmap_list); + + spin_lock(&block_group->lock); + if (block_group->disk_cache_state < BTRFS_DC_SETUP) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) + return 0; + + if (!i_size_read(inode)) { + iput(inode); + return 0; + } + + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + filemap_write_and_wait(inode->i_mapping); + btrfs_wait_ordered_range(inode, inode->i_size & + ~(root->sectorsize - 1), (u64)-1); + + /* We need a checksum per page. 
*/ + num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; + crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); + if (!crc) { + iput(inode); + return 0; + } + + /* Since the first page has all of our checksums and our generation we + * need to calculate the offset into the page that we can start writing + * our entries. + */ + first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); + + node = rb_first(&block_group->free_space_offset); + if (!node) + goto out_free; + + /* + * Lock all pages first so we can lock the extent safely. + * + * NOTE: Because we hold the ref the entire time we're going to write to + * the page find_get_page should never fail, so we don't do a check + * after find_get_page at this point. Just putting this here so people + * know and don't freak out. + */ + while (index <= last_index) { + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + pgoff_t i = 0; + + while (i < index) { + page = find_get_page(inode->i_mapping, i); + unlock_page(page); + page_cache_release(page); + page_cache_release(page); + i++; + } + goto out_free; + } + index++; + } + + index = 0; + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + 0, &cached_state, GFP_NOFS); + + /* Write out the extent entries */ + do { + struct btrfs_free_space_entry *entry; + void *addr; + unsigned long offset = 0; + unsigned long start_offset = 0; + + if (index == 0) { + start_offset = first_page_offset; + offset = start_offset; + } + + page = find_get_page(inode->i_mapping, index); + + addr = kmap(page); + entry = addr + start_offset; + + memset(addr, 0, PAGE_CACHE_SIZE); + while (1) { + struct btrfs_free_space *e; + + e = rb_entry(node, struct btrfs_free_space, offset_index); + entries++; + + entry->offset = cpu_to_le64(e->offset); + entry->bytes = cpu_to_le64(e->bytes); + if (e->bitmap) { + entry->type = BTRFS_FREE_SPACE_BITMAP; + list_add_tail(&e->list, &bitmap_list); + bitmaps++; + } else { + entry->type = 
BTRFS_FREE_SPACE_EXTENT; + } + node = rb_next(node); + if (!node) + break; + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + break; + entry++; + } + *crc = ~(u32)0; + *crc = btrfs_csum_data(root, addr + start_offset, *crc, + PAGE_CACHE_SIZE - start_offset); + kunmap(page); + + btrfs_csum_final(*crc, (char *)crc); + crc++; + + bytes += PAGE_CACHE_SIZE; + + ClearPageChecked(page); + set_page_extent_mapped(page); + SetPageUptodate(page); + set_page_dirty(page); + + /* + * We need to release our reference we got for grab_cache_page, + * except for the first page which will hold our checksums, we + * do that below. + */ + if (index != 0) { + unlock_page(page); + page_cache_release(page); + } + + page_cache_release(page); + + index++; + } while (node); + + /* Write out the bitmaps */ + list_for_each_safe(pos, n, &bitmap_list) { + void *addr; + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + + page = find_get_page(inode->i_mapping, index); + + addr = kmap(page); + memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); + *crc = ~(u32)0; + *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); + kunmap(page); + btrfs_csum_final(*crc, (char *)crc); + crc++; + bytes += PAGE_CACHE_SIZE; + + ClearPageChecked(page); + set_page_extent_mapped(page); + SetPageUptodate(page); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + page_cache_release(page); + list_del_init(&entry->list); + index++; + } + + /* Zero out the rest of the pages just to make sure */ + while (index <= last_index) { + void *addr; + + page = find_get_page(inode->i_mapping, index); + + addr = kmap(page); + memset(addr, 0, PAGE_CACHE_SIZE); + kunmap(page); + ClearPageChecked(page); + set_page_extent_mapped(page); + SetPageUptodate(page); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + page_cache_release(page); + bytes += PAGE_CACHE_SIZE; + index++; + } + + 
btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state); + + /* Write the checksums and trans id to the first page */ + { + void *addr; + u64 *gen; + + page = find_get_page(inode->i_mapping, 0); + + addr = kmap(page); + memcpy(addr, checksums, sizeof(u32) * num_checksums); + gen = addr + (sizeof(u32) * num_checksums); + *gen = trans->transid; + kunmap(page); + ClearPageChecked(page); + set_page_extent_mapped(page); + SetPageUptodate(page); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + page_cache_release(page); + } + BTRFS_I(inode)->generation = trans->transid; + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + + filemap_write_and_wait(inode->i_mapping); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + if (ret < 0) { + ret = 0; + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + goto out_free; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + BUG_ON(!path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != block_group->key.objectid) { + ret = 0; + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, + GFP_NOFS); + btrfs_release_path(root, path); + goto out_free; + } + } + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); + + ret = 1; + +out_free: + if (ret == 0) { + 
invalidate_inode_pages2_range(inode->i_mapping, 0, index); + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + BTRFS_I(inode)->generation = 0; + } + kfree(checksums); + btrfs_update_inode(trans, root, inode); + iput(inode); + return ret; +} + static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, u64 offset) { diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 890a8e79011b..e49ca5c321b5 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -27,6 +27,24 @@ struct btrfs_free_space { struct list_head list; }; +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path); +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode); +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 64f99cf69ce0..558cac2dfa54 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -319,8 +319,6 @@ static noinline int compress_file_range(struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 num_bytes; - u64 orig_start; - u64 disk_num_bytes; u64 blocksize = root->sectorsize; u64 actual_end; u64 isize = i_size_read(inode); @@ -335,8 +333,6 @@ 
static noinline int compress_file_range(struct inode *inode, int i; int will_compress; - orig_start = start; - actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; @@ -371,7 +367,6 @@ again: total_compressed = min(total_compressed, max_uncompressed); num_bytes = (end - start + blocksize) & ~(blocksize - 1); num_bytes = max(blocksize, num_bytes); - disk_num_bytes = num_bytes; total_in = 0; ret = 0; @@ -467,7 +462,6 @@ again: if (total_compressed >= total_in) { will_compress = 0; } else { - disk_num_bytes = total_compressed; num_bytes = total_in; } } @@ -757,20 +751,17 @@ static noinline int cow_file_range(struct inode *inode, u64 disk_num_bytes; u64 cur_alloc_size; u64 blocksize = root->sectorsize; - u64 actual_end; - u64 isize = i_size_read(inode); struct btrfs_key ins; struct extent_map *em; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; + BUG_ON(root == root->fs_info->tree_root); trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; - actual_end = min_t(u64, isize, end + 1); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; @@ -1035,10 +1026,16 @@ static noinline int run_delalloc_nocow(struct inode *inode, int type; int nocow; int check_prev = 1; + bool nolock = false; path = btrfs_alloc_path(); BUG_ON(!path); - trans = btrfs_join_transaction(root, 1); + if (root == root->fs_info->tree_root) { + nolock = true; + trans = btrfs_join_transaction_nolock(root, 1); + } else { + trans = btrfs_join_transaction(root, 1); + } BUG_ON(!trans); cow_start = (u64)-1; @@ -1211,8 +1208,13 @@ out_check: BUG_ON(ret); } - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); + if (nolock) { + ret = btrfs_end_transaction_nolock(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + } btrfs_free_path(path); return 0; } 
@@ -1289,6 +1291,8 @@ static int btrfs_set_bit_hook(struct inode *inode, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; + int do_list = (root->root_key.objectid != + BTRFS_ROOT_TREE_OBJECTID); if (*bits & EXTENT_FIRST_DELALLOC) *bits &= ~EXTENT_FIRST_DELALLOC; @@ -1298,7 +1302,7 @@ static int btrfs_set_bit_hook(struct inode *inode, spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += len; root->fs_info->delalloc_bytes += len; - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_add_tail(&BTRFS_I(inode)->delalloc_inodes, &root->fs_info->delalloc_inodes); } @@ -1321,6 +1325,8 @@ static int btrfs_clear_bit_hook(struct inode *inode, if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; + int do_list = (root->root_key.objectid != + BTRFS_ROOT_TREE_OBJECTID); if (*bits & EXTENT_FIRST_DELALLOC) *bits &= ~EXTENT_FIRST_DELALLOC; @@ -1330,14 +1336,15 @@ static int btrfs_clear_bit_hook(struct inode *inode, if (*bits & EXTENT_DO_ACCOUNTING) btrfs_delalloc_release_metadata(inode, len); - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && do_list) btrfs_free_reserved_data_space(inode, len); spin_lock(&root->fs_info->delalloc_lock); root->fs_info->delalloc_bytes -= len; BTRFS_I(inode)->delalloc_bytes -= len; - if (BTRFS_I(inode)->delalloc_bytes == 0 && + if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_del_init(&BTRFS_I(inode)->delalloc_inodes); } @@ -1372,7 +1379,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, if (map_length < length + size) return 1; - return 0; + return ret; } /* @@ -1426,7 +1433,10 @@ static int 
btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (root == root->fs_info->tree_root) + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); + else + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); if (!(rw & REQ_WRITE)) { @@ -1662,6 +1672,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct extent_state *cached_state = NULL; int compressed = 0; int ret; + bool nolock = false; ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, end - start + 1); @@ -1669,11 +1680,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) return 0; BUG_ON(!ordered_extent); + nolock = (root == root->fs_info->tree_root); + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { - trans = btrfs_join_transaction(root, 1); + if (nolock) + trans = btrfs_join_transaction_nolock(root, 1); + else + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); @@ -1686,7 +1703,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->file_offset + ordered_extent->len - 1, 0, &cached_state, GFP_NOFS); - trans = btrfs_join_transaction(root, 1); + if (nolock) + trans = btrfs_join_transaction_nolock(root, 1); + else + trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1700,6 +1720,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->len); BUG_ON(ret); } else { + BUG_ON(root == root->fs_info->tree_root); ret = insert_reserved_file_extent(trans, inode, 
ordered_extent->file_offset, ordered_extent->start, @@ -1724,9 +1745,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: - btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (trans) - btrfs_end_transaction(trans, root); + if (nolock) { + if (trans) + btrfs_end_transaction_nolock(trans, root); + } else { + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); + } + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -2237,7 +2264,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_item *item; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; @@ -2275,7 +2301,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) /* pull out the item */ leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* make sure the item matches what we want */ @@ -2651,7 +2676,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); - BUG_ON(ret); + if (ret == -ENOENT) + ret = 0; err: btrfs_free_path(path); if (ret) @@ -2672,8 +2698,8 @@ static int check_path_shared(struct btrfs_root *root, { struct extent_buffer *eb; int level; - int ret; u64 refs = 1; + int uninitialized_var(ret); for (level = 0; level < BTRFS_MAX_LEVEL; level++) { if (!path->nodes[level]) @@ -2686,7 +2712,7 @@ static int check_path_shared(struct btrfs_root *root, if (refs > 1) return 1; } - return 0; + return ret; /* XXX callers? 
*/ } /* @@ -3196,7 +3222,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); - if (root->ref_cows) + if (root->ref_cows || root == root->fs_info->tree_root) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); path = btrfs_alloc_path(); @@ -3344,7 +3370,8 @@ delete: } else { break; } - if (found_extent && root->ref_cows) { + if (found_extent && (root->ref_cows || + root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, @@ -3675,7 +3702,8 @@ void btrfs_evict_inode(struct inode *inode) int ret; truncate_inode_pages(&inode->i_data, 0); - if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0) + if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || + root == root->fs_info->tree_root)) goto no_delete; if (is_bad_inode(inode)) { @@ -3888,7 +3916,14 @@ static void inode_tree_del(struct inode *inode) } spin_unlock(&root->inode_lock); - if (empty && btrfs_root_refs(&root->root_item) == 0) { + /* + * Free space cache has inodes in the tree root, but the tree root has a + * root_refs of 0, so this could end up dropping the tree root as a + * snapshot, so we need the extra !root->fs_info->tree_root check to + * make sure we don't drop it. 
+ */ + if (empty && btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) { synchronize_srcu(&root->fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); @@ -4282,14 +4317,24 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; int ret = 0; + bool nolock = false; if (BTRFS_I(inode)->dummy_inode) return 0; + smp_mb(); + nolock = (root->fs_info->closing && root == root->fs_info->tree_root); + if (wbc->sync_mode == WB_SYNC_ALL) { - trans = btrfs_join_transaction(root, 1); + if (nolock) + trans = btrfs_join_transaction_nolock(root, 1); + else + trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); - ret = btrfs_commit_transaction(trans, root); + if (nolock) + ret = btrfs_end_transaction_nolock(trans, root); + else + ret = btrfs_commit_transaction(trans, root); } return ret; } @@ -5645,7 +5690,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; struct bio_vec *bvec = bio->bi_io_vec; - u64 start; int skip_sum; int write = rw & REQ_WRITE; int ret = 0; @@ -5671,7 +5715,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, dip->inode = inode; dip->logical_offset = file_offset; - start = dip->logical_offset; dip->bytes = 0; do { dip->bytes += bvec->bv_len; @@ -6308,6 +6351,21 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } + if (root == root->fs_info->tree_root) { + struct btrfs_block_group_cache *block_group; + + block_group = btrfs_lookup_block_group(root->fs_info, + BTRFS_I(inode)->block_group); + if (block_group && block_group->inode == inode) { + spin_lock(&block_group->lock); + block_group->inode = NULL; + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); + } 
else if (block_group) { + btrfs_put_block_group(block_group); + } + } + spin_lock(&root->orphan_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", @@ -6340,7 +6398,8 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - if (btrfs_root_refs(&root->root_item) == 0) + if (btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) return 1; else return generic_drop_inode(inode); @@ -6609,7 +6668,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return 0; } -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, + int sync) { struct btrfs_inode *binode; struct inode *inode = NULL; @@ -6631,7 +6691,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) spin_unlock(&root->fs_info->delalloc_lock); if (inode) { - write_inode_now(inode, 0); + if (sync) { + filemap_write_and_wait(inode->i_mapping); + /* + * We have to do this because compression doesn't + * actually set PG_writeback until it submits the pages + * for IO, which happens in an async thread, so we could + * race and not actually wait for any writeback pages + * because they've not been submitted yet. Technically + * this could still be the case for the ordered stuff + * since the async thread may not have started to do its + * work yet. If this becomes the case then we need to + * figure out a way to make sure that in writepage we + * wait for any async pages to be submitted before + * returning so that fdatawait does what its supposed to + * do. 
+ */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + } else { + filemap_flush(inode->i_mapping); + } if (delay_iput) btrfs_add_delayed_iput(inode); else @@ -6757,27 +6836,33 @@ out_unlock: return err; } -int btrfs_prealloc_file_range(struct inode *inode, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint) +static int __btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint, + struct btrfs_trans_handle *trans) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; int ret = 0; + bool own_trans = true; + if (trans) + own_trans = false; while (num_bytes > 0) { - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; + if (own_trans) { + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } } ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 0, *alloc_hint, (u64)-1, &ins, 1); if (ret) { - btrfs_end_transaction(trans, root); + if (own_trans) + btrfs_end_transaction(trans, root); break; } @@ -6810,11 +6895,30 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode, ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); + if (own_trans) + btrfs_end_transaction(trans, root); } return ret; } +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, + NULL); +} + +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, 
alloc_hint, trans); +} + static long btrfs_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9254b3d58dbe..463d91b4dd3a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -224,7 +224,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen) + char *name, int namelen, + u64 *async_transid) { struct btrfs_trans_handle *trans; struct btrfs_key key; @@ -338,13 +339,19 @@ static noinline int create_subvol(struct btrfs_root *root, d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: - err = btrfs_commit_transaction(trans, root); + if (async_transid) { + *async_transid = trans->transid; + err = btrfs_commit_transaction_async(trans, root, 1); + } else { + err = btrfs_commit_transaction(trans, root); + } if (err && !ret) ret = err; return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen, u64 *async_transid) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -373,7 +380,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); + if (async_transid) { + *async_transid = trans->transid; + ret = btrfs_commit_transaction_async(trans, + root->fs_info->extent_root, 1); + } else { + ret = btrfs_commit_transaction(trans, + root->fs_info->extent_root); + } BUG_ON(ret); ret = pending_snapshot->error; @@ -395,6 +409,76 @@ fail: return ret; } +/* copy of check_sticky in fs/namei.c() +* It's inline, so penalty for filesystems that don't use sticky bit is +* minimal. 
+*/ +static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) +{ + uid_t fsuid = current_fsuid(); + + if (!(dir->i_mode & S_ISVTX)) + return 0; + if (inode->i_uid == fsuid) + return 0; + if (dir->i_uid == fsuid) + return 0; + return !capable(CAP_FOWNER); +} + +/* copy of may_delete in fs/namei.c() + * Check whether we can remove a link victim from directory dir, check + * whether the type of victim is right. + * 1. We can't do it if dir is read-only (done in permission()) + * 2. We should have write and exec permissions on dir + * 3. We can't remove anything from append-only dir + * 4. We can't do anything with immutable dir (done in permission()) + * 5. If the sticky bit on dir is set we should either + * a. be owner of dir, or + * b. be owner of victim, or + * c. have CAP_FOWNER capability + * 6. If the victim is append-only or immutable we can't do antyhing with + * links pointing to it. + * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 9. We can't remove a root or mountpoint. + * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * nfs_async_unlink(). 
+ */ + +static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir) +{ + int error; + + if (!victim->d_inode) + return -ENOENT; + + BUG_ON(victim->d_parent->d_inode != dir); + audit_inode_child(victim, dir); + + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) + return error; + if (IS_APPEND(dir)) + return -EPERM; + if (btrfs_check_sticky(dir, victim->d_inode)|| + IS_APPEND(victim->d_inode)|| + IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + return -EPERM; + if (isdir) { + if (!S_ISDIR(victim->d_inode->i_mode)) + return -ENOTDIR; + if (IS_ROOT(victim)) + return -EBUSY; + } else if (S_ISDIR(victim->d_inode->i_mode)) + return -EISDIR; + if (IS_DEADDIR(dir)) + return -ENOENT; + if (victim->d_flags & DCACHE_NFSFS_RENAMED) + return -EBUSY; + return 0; +} + /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) { @@ -412,7 +496,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) */ static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, - struct btrfs_root *snap_src) + struct btrfs_root *snap_src, + u64 *async_transid) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -443,10 +528,11 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry); + error = create_snapshot(snap_src, dentry, + name, namelen, async_transid); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, - name, namelen); + name, namelen, async_transid); } if (!error) fsnotify_mkdir(dir, dentry); @@ -708,7 +794,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, char *sizestr; char *devstr = NULL; int ret = 0; - int namelen; int mod = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) @@ -722,7 +807,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, return PTR_ERR(vol_args); 
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - namelen = strlen(vol_args->name); mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; @@ -801,11 +885,13 @@ out_unlock: return ret; } -static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol) +static noinline int btrfs_ioctl_snap_create_transid(struct file *file, + char *name, + unsigned long fd, + int subvol, + u64 *transid) { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; - struct btrfs_ioctl_vol_args *vol_args; struct file *src_file; int namelen; int ret = 0; @@ -813,23 +899,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); - - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - namelen = strlen(vol_args->name); - if (strchr(vol_args->name, '/')) { + namelen = strlen(name); + if (strchr(name, '/')) { ret = -EINVAL; goto out; } if (subvol) { - ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, - NULL); + ret = btrfs_mksubvol(&file->f_path, name, namelen, + NULL, transid); } else { struct inode *src_inode; - src_file = fget(vol_args->fd); + src_file = fget(fd); if (!src_file) { ret = -EINVAL; goto out; @@ -843,12 +924,56 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, fput(src_file); goto out; } - ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, - BTRFS_I(src_inode)->root); + ret = btrfs_mksubvol(&file->f_path, name, namelen, + BTRFS_I(src_inode)->root, + transid); fput(src_file); } out: + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol, + int async) +{ + struct btrfs_ioctl_vol_args *vol_args = NULL; + struct btrfs_ioctl_async_vol_args *async_vol_args = NULL; + char *name; + u64 fd; + u64 transid = 0; + int ret; + + if (async) { + async_vol_args = memdup_user(arg, 
sizeof(*async_vol_args)); + if (IS_ERR(async_vol_args)) + return PTR_ERR(async_vol_args); + + name = async_vol_args->name; + fd = async_vol_args->fd; + async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0'; + } else { + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + name = vol_args->name; + fd = vol_args->fd; + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + } + + ret = btrfs_ioctl_snap_create_transid(file, name, fd, + subvol, &transid); + + if (!ret && async) { + if (copy_to_user(arg + + offsetof(struct btrfs_ioctl_async_vol_args, + transid), &transid, sizeof(transid))) + return -EFAULT; + } + kfree(vol_args); + kfree(async_vol_args); + return ret; } @@ -1073,14 +1198,10 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - args = kmalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return -ENOMEM; + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); - if (copy_from_user(args, argp, sizeof(*args))) { - kfree(args); - return -EFAULT; - } inode = fdentry(file)->d_inode; ret = search_ioctl(inode, args); if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) @@ -1188,14 +1309,10 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - args = kmalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return -ENOMEM; + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); - if (copy_from_user(args, argp, sizeof(*args))) { - kfree(args); - return -EFAULT; - } inode = fdentry(file)->d_inode; if (args->treeid == 0) @@ -1227,9 +1344,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int ret; int err = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -1259,13 +1373,51 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } 
inode = dentry->d_inode; + dest = BTRFS_I(inode)->root; + if (!capable(CAP_SYS_ADMIN)){ + /* + * Regular user. Only allow this with a special mount + * option, when the user has write+exec access to the + * subvol root, and when rmdir(2) would have been + * allowed. + * + * Note that this is _not_ check that the subvol is + * empty or doesn't contain data that we wouldn't + * otherwise be able to delete. + * + * Users who want to delete empty subvols should try + * rmdir(2). + */ + err = -EPERM; + if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + goto out_dput; + + /* + * Do not allow deletion if the parent dir is the same + * as the dir to be deleted. That means the ioctl + * must be called on the dentry referencing the root + * of the subvol, not a random directory contained + * within it. + */ + err = -EINVAL; + if (root == dest) + goto out_dput; + + err = inode_permission(inode, MAY_WRITE | MAY_EXEC); + if (err) + goto out_dput; + + /* check if subvolume may be deleted by a non-root user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + } + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; } - dest = BTRFS_I(inode)->root; - mutex_lock(&inode->i_mutex); err = d_invalidate(dentry); if (err) @@ -1304,7 +1456,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, BUG_ON(ret); } - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_end_transaction(trans, root); BUG_ON(ret); inode->i_flags |= S_DEAD; out_up_write: @@ -1502,11 +1654,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, path->reada = 2; if (inode < src) { - mutex_lock(&inode->i_mutex); - mutex_lock(&src->i_mutex); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); } else { - mutex_lock(&src->i_mutex); - mutex_lock(&inode->i_mutex); + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); } /* 
determine range to clone */ @@ -1530,13 +1682,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, while (1) { struct btrfs_ordered_extent *ordered; lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, off+len); - if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) + ordered = btrfs_lookup_first_ordered_extent(src, off+len); + if (!ordered && + !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, + EXTENT_DELALLOC, 0, NULL)) break; unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); if (ordered) btrfs_put_ordered_extent(ordered); - btrfs_wait_ordered_range(src, off, off+len); + btrfs_wait_ordered_range(src, off, len); } /* clone data */ @@ -1605,7 +1759,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, } btrfs_release_path(root, path); - if (key.offset + datal < off || + if (key.offset + datal <= off || key.offset >= off+len) goto next; @@ -1879,6 +2033,22 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return 0; } +static void get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) +{ + struct btrfs_block_group_cache *block_group; + + space->total_bytes = 0; + space->used_bytes = 0; + space->flags = 0; + list_for_each_entry(block_group, groups_list, list) { + space->flags = block_group->flags; + space->total_bytes += block_group->key.offset; + space->used_bytes += + btrfs_block_group_used(&block_group->item); + } +} + long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_space_args space_args; @@ -1887,27 +2057,56 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) struct btrfs_ioctl_space_info *dest_orig; struct btrfs_ioctl_space_info *user_dest; struct btrfs_space_info *info; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | 
BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; int alloc_size; int ret = 0; int slot_count = 0; + int i, c; if (copy_from_user(&space_args, (struct btrfs_ioctl_space_args __user *)arg, sizeof(space_args))) return -EFAULT; - /* first we count slots */ - rcu_read_lock(); - list_for_each_entry_rcu(info, &root->fs_info->space_info, list) - slot_count++; - rcu_read_unlock(); + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) + slot_count++; + } + up_read(&info->groups_sem); + } /* space_slots == 0 means they are asking for a count */ if (space_args.space_slots == 0) { space_args.total_spaces = slot_count; goto out; } + + slot_count = min_t(int, space_args.space_slots, slot_count); + alloc_size = sizeof(*dest) * slot_count; + /* we generally have at most 6 or so space infos, one for each raid * level. 
So, a whole page should be more than enough for everyone */ @@ -1921,27 +2120,34 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) dest_orig = dest; /* now we have a buffer to copy into */ - rcu_read_lock(); - list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { - /* make sure we don't copy more than we allocated - * in our buffer - */ - if (slot_count == 0) - break; - slot_count--; - - /* make sure userland has enough room in their buffer */ - if (space_args.total_spaces >= space_args.space_slots) - break; + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); - space.flags = info->flags; - space.total_bytes = info->total_bytes; - space.used_bytes = info->bytes_used; - memcpy(dest, &space, sizeof(space)); - dest++; - space_args.total_spaces++; + if (!info) + continue; + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) { + get_block_group_info(&info->block_groups[c], + &space); + memcpy(dest, &space, sizeof(space)); + dest++; + space_args.total_spaces++; + } + } + up_read(&info->groups_sem); } - rcu_read_unlock(); user_dest = (struct btrfs_ioctl_space_info *) (arg + sizeof(struct btrfs_ioctl_space_args)); @@ -1984,6 +2190,36 @@ long btrfs_ioctl_trans_end(struct file *file) return 0; } +static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) +{ + struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; + struct btrfs_trans_handle *trans; + u64 transid; + + trans = btrfs_start_transaction(root, 0); + transid = trans->transid; + btrfs_commit_transaction_async(trans, root, 0); + + if (argp) + if (copy_to_user(argp, &transid, sizeof(transid))) + return -EFAULT; + return 0; +} + +static noinline long btrfs_ioctl_wait_sync(struct file 
*file, void __user *argp) +{ + struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; + u64 transid; + + if (argp) { + if (copy_from_user(&transid, argp, sizeof(transid))) + return -EFAULT; + } else { + transid = 0; /* current trans */ + } + return btrfs_wait_for_commit(root, transid); +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1998,9 +2234,11 @@ long btrfs_ioctl(struct file *file, unsigned int case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0); + return btrfs_ioctl_snap_create(file, argp, 0, 0); + case BTRFS_IOC_SNAP_CREATE_ASYNC: + return btrfs_ioctl_snap_create(file, argp, 0, 1); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1); + return btrfs_ioctl_snap_create(file, argp, 1, 0); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -2034,6 +2272,10 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: btrfs_sync_fs(file->f_dentry->d_sb, 1); return 0; + case BTRFS_IOC_START_SYNC: + return btrfs_ioctl_start_sync(file, argp); + case BTRFS_IOC_WAIT_SYNC: + return btrfs_ioctl_wait_sync(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 424694aa517f..17c99ebdf960 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -22,14 +22,21 @@ #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_VOL_NAME_MAX 255 -#define BTRFS_PATH_NAME_MAX 4087 /* this should be 4k */ +#define BTRFS_PATH_NAME_MAX 4087 struct btrfs_ioctl_vol_args { __s64 fd; char name[BTRFS_PATH_NAME_MAX + 1]; }; +#define BTRFS_SNAPSHOT_NAME_MAX 4079 +struct btrfs_ioctl_async_vol_args { + __s64 fd; + __u64 transid; + char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; +}; + #define BTRFS_INO_LOOKUP_PATH_MAX 4080 struct btrfs_ioctl_ino_lookup_args { __u64 treeid; @@ -178,4 +185,8 @@ struct btrfs_ioctl_space_args { #define BTRFS_IOC_DEFAULT_SUBVOL 
_IOW(BTRFS_IOCTL_MAGIC, 19, u64) #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ struct btrfs_ioctl_space_args) +#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) +#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) +#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \ + struct btrfs_ioctl_async_vol_args) #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e56c72bc5add..f4621f6deca1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -526,7 +526,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { u64 end; u64 orig_end; - u64 wait_end; struct btrfs_ordered_extent *ordered; int found; @@ -537,7 +536,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (orig_end > INT_LIMIT(loff_t)) orig_end = INT_LIMIT(loff_t); } - wait_end = orig_end; again: /* start IO across the range first to instantiate any delalloc * extents diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b37d723b9d4a..045c9c2b2d7e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -29,6 +29,7 @@ #include "locking.h" #include "btrfs_inode.h" #include "async-thread.h" +#include "free-space-cache.h" /* * backref_node, mapping_node and tree_block start with this @@ -178,8 +179,6 @@ struct reloc_control { u64 search_start; u64 extents_found; - int block_rsv_retries; - unsigned int stage:8; unsigned int create_reloc_tree:1; unsigned int merge_reloc_tree:1; @@ -2133,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err) LIST_HEAD(reloc_roots); u64 num_bytes = 0; int ret; - int retries = 0; mutex_lock(&root->fs_info->trans_mutex); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; @@ -2143,7 +2141,7 @@ again: if (!err) { num_bytes = rc->merging_rsv_size; ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes, &retries); + num_bytes); if (ret) err = ret; } @@ -2155,7 +2153,6 @@ again: 
btrfs_end_transaction(trans, rc->extent_root); btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); - retries = 0; goto again; } } @@ -2405,15 +2402,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, - &rc->block_rsv_retries); + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; return ret; } - rc->block_rsv_retries = 0; return 0; } @@ -3099,6 +3094,8 @@ static int add_tree_block(struct reloc_control *rc, BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); ret = get_ref_objectid_v0(rc, path, extent_key, &ref_owner, NULL); + if (ret < 0) + return ret; BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); level = (int)ref_owner; /* FIXME: get real generation */ @@ -3191,6 +3188,54 @@ static int block_use_full_backref(struct reloc_control *rc, return ret; } +static int delete_block_group_cache(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 ino) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + unsigned long nr; + int ret = 0; + + if (inode) + goto truncate; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (!inode || IS_ERR(inode) || is_bad_inode(inode)) { + if (inode && !IS_ERR(inode)) + iput(inode); + return -ENOENT; + } + +truncate: + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + trans = btrfs_join_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + goto out; + } + + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + + btrfs_free_path(path); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +out: + iput(inode); 
+ return ret; +} + /* * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY * this function scans fs tree to find blocks reference the data extent @@ -3217,15 +3262,27 @@ static int find_data_references(struct reloc_control *rc, int counted; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ref_root = btrfs_extent_data_ref_root(leaf, ref); ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); ref_offset = btrfs_extent_data_ref_offset(leaf, ref); ref_count = btrfs_extent_data_ref_count(leaf, ref); + /* + * This is an extent belonging to the free space cache, lets just delete + * it and redo the search. + */ + if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { + ret = delete_block_group_cache(rc->extent_root->fs_info, + NULL, ref_objectid); + if (ret != -ENOENT) + return ret; + ret = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -3554,8 +3611,7 @@ int prepare_to_relocate(struct reloc_control *rc) * is no reservation in transaction handle. 
*/ ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256, - &rc->block_rsv_retries); + rc->extent_root->nodesize * 256); if (ret) return ret; @@ -3567,7 +3623,6 @@ int prepare_to_relocate(struct reloc_control *rc) rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; - rc->block_rsv_retries = 0; rc->create_reloc_tree = 1; set_reloc_control(rc); @@ -3860,6 +3915,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) { struct btrfs_fs_info *fs_info = extent_root->fs_info; struct reloc_control *rc; + struct inode *inode; + struct btrfs_path *path; int ret; int rw = 0; int err = 0; @@ -3882,6 +3939,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) rw = 1; } + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group, + path); + btrfs_free_path(path); + + if (!IS_ERR(inode)) + ret = delete_block_group_cache(fs_info, inode, 0); + else + ret = PTR_ERR(inode); + + if (ret && ret != -ENOENT) { + err = ret; + goto out; + } + rc->data_inode = create_reloc_inode(fs_info, rc->block_group); if (IS_ERR(rc->data_inode)) { err = PTR_ERR(rc->data_inode); @@ -4143,7 +4220,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) btrfs_add_ordered_sum(inode, ordered, sums); } btrfs_put_ordered_extent(ordered); - return 0; + return ret; } void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 2d958be761c8..6a1086e83ffc 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -181,7 +181,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) { struct btrfs_root *dead_root; - struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_key key; struct btrfs_key found_key; @@ -214,7 
+213,6 @@ again: nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; } - item = btrfs_item_nr(leaf, slot); btrfs_item_key_to_cpu(leaf, &key, slot); if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) goto next; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 144f8a5730f5..8299a25ffc8f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -61,6 +61,8 @@ static void btrfs_put_super(struct super_block *sb) ret = close_ctree(root); sb->s_fs_info = NULL; + + (void)ret; /* FIXME: need to fix VFS to return error? */ } enum { @@ -68,7 +70,8 @@ enum { Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, - Opt_discard, Opt_err, + Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, + Opt_user_subvol_rm_allowed, }; static match_table_t tokens = { @@ -92,6 +95,9 @@ static match_table_t tokens = { {Opt_flushoncommit, "flushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, {Opt_discard, "discard"}, + {Opt_space_cache, "space_cache"}, + {Opt_clear_cache, "clear_cache"}, + {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_err, NULL}, }; @@ -235,6 +241,16 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_discard: btrfs_set_opt(info->mount_opt, DISCARD); break; + case Opt_space_cache: + printk(KERN_INFO "btrfs: enabling disk space caching\n"); + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + case Opt_clear_cache: + printk(KERN_INFO "btrfs: force clearing of disk cache\n"); + btrfs_set_opt(info->mount_opt, CLEAR_CACHE); + break; + case Opt_user_subvol_rm_allowed: + btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -380,7 +396,7 @@ static struct dentry *get_default_root(struct super_block *sb, find_root: new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); if 
(IS_ERR(new_root)) - return ERR_PTR(PTR_ERR(new_root)); + return ERR_CAST(new_root); if (btrfs_root_refs(&new_root->root_item) == 0) return ERR_PTR(-ENOENT); @@ -436,7 +452,6 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct dentry *root_dentry; - struct btrfs_super_block *disk_super; struct btrfs_root *tree_root; struct btrfs_key key; int err; @@ -458,7 +473,6 @@ static int btrfs_fill_super(struct super_block *sb, return PTR_ERR(tree_root); } sb->s_fs_info = tree_root; - disk_super = &tree_root->fs_info->super_copy; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; @@ -560,8 +574,8 @@ static int btrfs_test_super(struct super_block *s, void *data) * Note: This is based on get_sb_bdev from fs/super.c with a few additions * for multiple device setup. Make sure to keep it in sync. */ -static int btrfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { struct block_device *bdev = NULL; struct super_block *s; @@ -571,7 +585,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, char *subvol_name = NULL; u64 subvol_objectid = 0; int error = 0; - int found = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; @@ -580,7 +593,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, &subvol_name, &subvol_objectid, &fs_devices); if (error) - return error; + return ERR_PTR(error); error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); if (error) @@ -607,7 +620,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, goto error_close_devices; } - found = 1; btrfs_close_devices(fs_devices); } else { char b[BDEVNAME_SIZE]; @@ -629,7 +641,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, if (IS_ERR(root)) { error = PTR_ERR(root); deactivate_locked_super(s); - 
goto error; + goto error_free_subvol_name; } /* if they gave us a subvolume name bind mount into that */ if (strcmp(subvol_name, ".")) { @@ -643,24 +655,21 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, deactivate_locked_super(s); error = PTR_ERR(new_root); dput(root); - goto error_close_devices; + goto error_free_subvol_name; } if (!new_root->d_inode) { dput(root); dput(new_root); deactivate_locked_super(s); error = -ENXIO; - goto error_close_devices; + goto error_free_subvol_name; } dput(root); root = new_root; } - mnt->mnt_sb = s; - mnt->mnt_root = root; - kfree(subvol_name); - return 0; + return root; error_s: error = PTR_ERR(s); @@ -668,8 +677,7 @@ error_close_devices: btrfs_close_devices(fs_devices); error_free_subvol_name: kfree(subvol_name); -error: - return error; + return ERR_PTR(error); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) @@ -716,18 +724,25 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; + u64 total_used_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) + list_for_each_entry_rcu(found, head, list) { + if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_SYSTEM)) + total_used_data += found->disk_total; + else + total_used_data += found->disk_used; total_used += found->disk_used; + } rcu_read_unlock(); buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_bfree; + buf->f_bavail = buf->f_blocks - (total_used_data >> bits); buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; @@ -746,7 +761,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) static struct file_system_type btrfs_fs_type = 
{ .owner = THIS_MODULE, .name = "btrfs", - .get_sb = btrfs_get_sb, + .mount = btrfs_mount, .kill_sb = kill_anon_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 66e4c66cc63b..1fffbc017bdf 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -163,6 +163,7 @@ enum btrfs_trans_type { TRANS_START, TRANS_JOIN, TRANS_USERSPACE, + TRANS_JOIN_NOLOCK, }; static int may_wait_transaction(struct btrfs_root *root, int type) @@ -179,14 +180,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; - int retries = 0; int ret; again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) return ERR_PTR(-ENOMEM); - mutex_lock(&root->fs_info->trans_mutex); + if (type != TRANS_JOIN_NOLOCK) + mutex_lock(&root->fs_info->trans_mutex); if (may_wait_transaction(root, type)) wait_current_trans(root); @@ -195,7 +196,8 @@ again: cur_trans = root->fs_info->running_transaction; cur_trans->use_count++; - mutex_unlock(&root->fs_info->trans_mutex); + if (type != TRANS_JOIN_NOLOCK) + mutex_unlock(&root->fs_info->trans_mutex); h->transid = cur_trans->transid; h->transaction = cur_trans; @@ -212,8 +214,7 @@ again: } if (num_items > 0) { - ret = btrfs_trans_reserve_metadata(h, root, num_items, - &retries); + ret = btrfs_trans_reserve_metadata(h, root, num_items); if (ret == -EAGAIN) { btrfs_commit_transaction(h, root); goto again; @@ -224,9 +225,11 @@ again: } } - mutex_lock(&root->fs_info->trans_mutex); + if (type != TRANS_JOIN_NOLOCK) + mutex_lock(&root->fs_info->trans_mutex); record_root_in_trans(h, root); - mutex_unlock(&root->fs_info->trans_mutex); + if (type != TRANS_JOIN_NOLOCK) + mutex_unlock(&root->fs_info->trans_mutex); if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; @@ -244,6 +247,12 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, return 
start_transaction(root, 0, TRANS_JOIN); } +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, 0, TRANS_JOIN_NOLOCK); +} + struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, int num_blocks) { @@ -270,6 +279,58 @@ static noinline int wait_for_commit(struct btrfs_root *root, return 0; } +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) +{ + struct btrfs_transaction *cur_trans = NULL, *t; + int ret; + + mutex_lock(&root->fs_info->trans_mutex); + + ret = 0; + if (transid) { + if (transid <= root->fs_info->last_trans_committed) + goto out_unlock; + + /* find specified transaction */ + list_for_each_entry(t, &root->fs_info->trans_list, list) { + if (t->transid == transid) { + cur_trans = t; + break; + } + if (t->transid > transid) + break; + } + ret = -EINVAL; + if (!cur_trans) + goto out_unlock; /* bad transid */ + } else { + /* find newest transaction that is committing | committed */ + list_for_each_entry_reverse(t, &root->fs_info->trans_list, + list) { + if (t->in_commit) { + if (t->commit_done) + goto out_unlock; + cur_trans = t; + break; + } + } + if (!cur_trans) + goto out_unlock; /* nothing committing|committed */ + } + + cur_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + wait_for_commit(root, cur_trans); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(cur_trans); + ret = 0; +out_unlock: + mutex_unlock(&root->fs_info->trans_mutex); + return ret; +} + #if 0 /* * rate limit against the drop_snapshot code. 
This helps to slow down new @@ -348,7 +409,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int throttle) + struct btrfs_root *root, int throttle, int lock) { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; @@ -376,26 +437,29 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); - if (!root->fs_info->open_ioctl_trans && + if (lock && !root->fs_info->open_ioctl_trans && should_end_transaction(trans, root)) trans->transaction->blocked = 1; - if (cur_trans->blocked && !cur_trans->in_commit) { + if (lock && cur_trans->blocked && !cur_trans->in_commit) { if (throttle) return btrfs_commit_transaction(trans, root); else wake_up_process(info->transaction_kthread); } - mutex_lock(&info->trans_mutex); + if (lock) + mutex_lock(&info->trans_mutex); WARN_ON(cur_trans != info->running_transaction); WARN_ON(cur_trans->num_writers < 1); cur_trans->num_writers--; + smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); put_transaction(cur_trans); - mutex_unlock(&info->trans_mutex); + if (lock) + mutex_unlock(&info->trans_mutex); if (current->journal_info == trans) current->journal_info = NULL; @@ -411,13 +475,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - return __btrfs_end_transaction(trans, root, 0); + return __btrfs_end_transaction(trans, root, 0, 1); } int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - return __btrfs_end_transaction(trans, root, 1); + return __btrfs_end_transaction(trans, root, 1, 1); +} + +int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 0, 
0); } /* @@ -836,7 +906,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct extent_buffer *tmp; struct extent_buffer *old; int ret; - int retries = 0; u64 to_reserve = 0; u64 index = 0; u64 objectid; @@ -858,7 +927,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (to_reserve > 0) { ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, - to_reserve, &retries); + to_reserve); if (ret) { pending->error = ret; goto fail; @@ -966,6 +1035,8 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; + if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) + super->cache_generation = root_item->generation; } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) @@ -988,11 +1059,127 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) return ret; } +/* + * wait for the current transaction commit to start and block subsequent + * transaction joins + */ +static void wait_current_trans_commit_start(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + DEFINE_WAIT(wait); + + if (trans->in_commit) + return; + + while (1) { + prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (trans->in_commit) { + finish_wait(&root->fs_info->transaction_blocked_wait, + &wait); + break; + } + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&root->fs_info->transaction_blocked_wait, &wait); + } +} + +/* + * wait for the current transaction to start and then become unblocked. + * caller holds ref. 
+ */ +static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + DEFINE_WAIT(wait); + + if (trans->commit_done || (trans->in_commit && !trans->blocked)) + return; + + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (trans->commit_done || + (trans->in_commit && !trans->blocked)) { + finish_wait(&root->fs_info->transaction_wait, + &wait); + break; + } + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&root->fs_info->transaction_wait, + &wait); + } +} + +/* + * commit transactions asynchronously. once btrfs_commit_transaction_async + * returns, any subsequent transaction will not be allowed to join. + */ +struct btrfs_async_commit { + struct btrfs_trans_handle *newtrans; + struct btrfs_root *root; + struct delayed_work work; +}; + +static void do_async_commit(struct work_struct *work) +{ + struct btrfs_async_commit *ac = + container_of(work, struct btrfs_async_commit, work.work); + + btrfs_commit_transaction(ac->newtrans, ac->root); + kfree(ac); +} + +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock) +{ + struct btrfs_async_commit *ac; + struct btrfs_transaction *cur_trans; + + ac = kmalloc(sizeof(*ac), GFP_NOFS); + BUG_ON(!ac); + + INIT_DELAYED_WORK(&ac->work, do_async_commit); + ac->root = root; + ac->newtrans = btrfs_join_transaction(root, 0); + + /* take transaction reference */ + mutex_lock(&root->fs_info->trans_mutex); + cur_trans = trans->transaction; + cur_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + btrfs_end_transaction(trans, root); + schedule_delayed_work(&ac->work, 0); + + /* wait for transaction to start and unblock */ + mutex_lock(&root->fs_info->trans_mutex); + if (wait_for_unblock) + wait_current_trans_commit_start_and_unblock(root, cur_trans); + else + 
wait_current_trans_commit_start(root, cur_trans); + put_transaction(cur_trans); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; +} + +/* + * btrfs_transaction state sequence: + * in_commit = 0, blocked = 0 (initial) + * in_commit = 1, blocked = 1 + * blocked = 0 + * commit_done = 1 + */ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { unsigned long joined = 0; - unsigned long timeout = 1; struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); @@ -1039,6 +1226,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->in_commit = 1; trans->transaction->blocked = 1; + wake_up(&root->fs_info->transaction_blocked_wait); + if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); @@ -1063,11 +1252,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - if (cur_trans->num_writers > 1) - timeout = MAX_SCHEDULE_TIMEOUT; - else if (should_grow) - timeout = 1; - mutex_unlock(&root->fs_info->trans_mutex); if (flush_on_commit || snap_pending) { @@ -1089,8 +1273,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, TASK_UNINTERRUPTIBLE); smp_mb(); - if (cur_trans->num_writers > 1 || should_grow) - schedule_timeout(timeout); + if (cur_trans->num_writers > 1) + schedule_timeout(MAX_SCHEDULE_TIMEOUT); + else if (should_grow) + schedule_timeout(1); mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index e104986d0bfd..f104b57ad4ef 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -87,12 +87,17 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int 
btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, + struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, int num_blocks); +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, + int num_blocks); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, int num_blocks); +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, @@ -104,6 +109,9 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index f7ac8e013ed7..992ab425599d 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int ret = 0; int wret; int level; - int orig_level; int is_extent = 0; int next_key_ret = 0; u64 last_ret = 0; @@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, return -ENOMEM; level = btrfs_header_level(root->node); - orig_level = level; if (level == 0) goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fb102a9aee9c..a29f19384a27 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -786,7 +786,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, { struct inode *dir; int 
ret; - struct btrfs_key location; struct btrfs_inode_ref *ref; struct btrfs_dir_item *di; struct inode *inode; @@ -795,10 +794,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, unsigned long ref_ptr; unsigned long ref_end; - location.objectid = key->objectid; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - /* * it is possible that we didn't log all the parent directories * for a given inode. If we don't find the dir, just don't @@ -1583,7 +1578,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; - u32 item_size; int level; int i; int ret; @@ -1601,7 +1595,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, nritems = btrfs_header_nritems(eb); for (i = 0; i < nritems; i++) { btrfs_item_key_to_cpu(eb, &key, i); - item_size = btrfs_item_size_nr(eb, i); /* inode keys are done during the first stage */ if (key.type == BTRFS_INODE_ITEM_KEY && @@ -1668,7 +1661,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { u64 root_owner; - u64 root_gen; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; @@ -1698,7 +1690,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); next = btrfs_find_create_tree_block(root, bytenr, blocksize); @@ -1749,7 +1740,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { u64 root_owner; - u64 root_gen; int i; int slot; int ret; @@ -1757,8 +1747,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i]; if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { - struct extent_buffer *node; - node = path->nodes[i]; 
path->slots[i]++; *level = i; WARN_ON(*level == 0); @@ -1771,7 +1759,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level + 1]; root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); wc->process_func(root, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level])); if (wc->free) { @@ -2273,7 +2260,7 @@ fail: } btrfs_end_log_trans(root); - return 0; + return err; } /* see comments for btrfs_del_dir_entries_in_log */ @@ -2729,7 +2716,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; - u32 size; int err = 0; int ret; int nritems; @@ -2793,7 +2779,6 @@ again: break; src = path->nodes[0]; - size = btrfs_item_size_nr(src, path->slots[0]); if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; goto next_slot; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e25e46a8b4e2..cc04dc1445d6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1898,7 +1898,6 @@ int btrfs_balance(struct btrfs_root *dev_root) u64 size_to_free; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_chunk *chunk; struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; struct btrfs_trans_handle *trans; struct btrfs_key found_key; @@ -1962,9 +1961,6 @@ int btrfs_balance(struct btrfs_root *dev_root) if (found_key.objectid != key.objectid) break; - chunk = btrfs_item_ptr(path->nodes[0], - path->slots[0], - struct btrfs_chunk); /* chunk zero is special */ if (found_key.offset == 0) break; @@ -3031,8 +3027,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } bio->bi_sector = multi->stripes[dev_nr].physical >> 9; dev = multi->stripes[dev_nr].dev; - BUG_ON(rw == WRITE && !dev->writeable); - if (dev && dev->bdev) { + if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { bio->bi_bdev = dev->bdev; if (async_submit) 
schedule_bio(root, dev, rw, bio); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 88ecbb215878..698fdd2c739c 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -178,7 +178,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; - struct btrfs_item *item; struct extent_buffer *leaf; struct btrfs_dir_item *di; int ret = 0, slot, advance; @@ -234,7 +233,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) } advance = 1; - item = btrfs_item_nr(leaf, slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); /* check to make sure this item is what we want */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 3e2b90eaa239..b9cd5445f71c 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -199,8 +199,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, int nr_pages = 0; struct page *in_page = NULL; struct page *out_page = NULL; - int out_written = 0; - int in_read = 0; unsigned long bytes_left; *out_pages = 0; @@ -233,9 +231,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, workspace->def_strm.avail_out = PAGE_CACHE_SIZE; workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); - out_written = 0; - in_read = 0; - while (workspace->def_strm.total_in < len) { ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); if (ret != Z_OK) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d6e0e0421891..08b460ae0539 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -635,7 +635,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, /* * mount: join the ceph cluster, and open root directory. 
*/ -static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt, +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, const char *path) { int err; @@ -678,16 +678,14 @@ static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt, } } - mnt->mnt_root = root; - mnt->mnt_sb = fsc->sb; - fsc->mount_state = CEPH_MOUNT_MOUNTED; dout("mount success\n"); - err = 0; + mutex_unlock(&fsc->client->mount_mutex); + return root; out: mutex_unlock(&fsc->client->mount_mutex); - return err; + return ERR_PTR(err); fail: if (first) { @@ -777,41 +775,45 @@ static int ceph_register_bdi(struct super_block *sb, return err; } -static int ceph_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *ceph_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { struct super_block *sb; struct ceph_fs_client *fsc; + struct dentry *res; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; const char *path = NULL; struct ceph_mount_options *fsopt = NULL; struct ceph_options *opt = NULL; - dout("ceph_get_sb\n"); + dout("ceph_mount\n"); err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); - if (err < 0) + if (err < 0) { + res = ERR_PTR(err); goto out_final; + } /* create client (which we may/may not use) */ fsc = create_fs_client(fsopt, opt); if (IS_ERR(fsc)) { - err = PTR_ERR(fsc); + res = ERR_CAST(fsc); kfree(fsopt); kfree(opt); goto out_final; } err = ceph_mdsc_init(fsc); - if (err < 0) + if (err < 0) { + res = ERR_PTR(err); goto out; + } if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; sb = sget(fs_type, compare_super, ceph_set_super, fsc); if (IS_ERR(sb)) { - err = PTR_ERR(sb); + res = ERR_CAST(sb); goto out; } @@ -823,16 +825,18 @@ static int ceph_get_sb(struct file_system_type *fs_type, } else { dout("get_sb using new client %p\n", fsc); err = ceph_register_bdi(sb, fsc); - if (err < 0) + 
if (err < 0) { + res = ERR_PTR(err); goto out_splat; + } } - err = ceph_mount(fsc, mnt, path); - if (err < 0) + res = ceph_real_mount(fsc, path); + if (IS_ERR(res)) goto out_splat; - dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, - mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode)); - return 0; + dout("root %p inode %p ino %llx.%llx\n", res, + res->d_inode, ceph_vinop(res->d_inode)); + return res; out_splat: ceph_mdsc_close_sessions(fsc->mdsc); @@ -843,8 +847,8 @@ out: ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); out_final: - dout("ceph_get_sb fail %d\n", err); - return err; + dout("ceph_mount fail %ld\n", PTR_ERR(res)); + return res; } static void ceph_kill_sb(struct super_block *s) @@ -860,7 +864,7 @@ static void ceph_kill_sb(struct super_block *s) static struct file_system_type ceph_fs_type = { .owner = THIS_MODULE, .name = "ceph", - .get_sb = ceph_get_sb, + .mount = ceph_mount, .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE, }; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 917b7d449bb2..0ed213970ced 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -2,6 +2,9 @@ config CIFS tristate "CIFS support (advanced network filesystem, SMBFS successor)" depends on INET select NLS + select CRYPTO + select CRYPTO_MD5 + select CRYPTO_ARC4 help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block diff --git a/fs/cifs/TODO b/fs/cifs/TODO index 5aff46c61e52..355abcdcda98 100644 --- a/fs/cifs/TODO +++ b/fs/cifs/TODO @@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for v) mount check for unmatched uids -w) Add support for new vfs entry points for setlease and fallocate +w) Add support for new vfs entry point for fallocate x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of processes can proceed better in parallel (on the server) diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 
525ba59a4105..e9a393c9c2ca 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -15,7 +15,7 @@ * the GNU Lesser General Public License for more details. * */ -#include <linux/radix-tree.h> +#include <linux/rbtree.h> #ifndef _CIFS_FS_SB_H #define _CIFS_FS_SB_H @@ -42,9 +42,9 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ struct cifs_sb_info { - struct radix_tree_root tlink_tree; -#define CIFS_TLINK_MASTER_TAG 0 /* is "master" (mount) tcon */ + struct rb_root tlink_tree; spinlock_t tlink_tree_lock; + struct tcon_link *master_tlink; struct nls_table *local_nls; unsigned int rsize; unsigned int wsize; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 7ac0056294cf..f856732161ab 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -43,18 +43,32 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, - const struct session_key *key, char *signature) + struct TCP_Server_Info *server, char *signature) { - struct MD5Context context; + int rc; - if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) + if (cifs_pdu == NULL || signature == NULL || server == NULL) return -EINVAL; - cifs_MD5_init(&context); - cifs_MD5_update(&context, (char *)&key->data, key->len); - cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); + if (!server->secmech.sdescmd5) { + cERROR(1, "%s: Can't generate signature\n", __func__); + return -1; + } + + rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Could not init md5\n", __func__); + return rc; + } + + crypto_shash_update(&server->secmech.sdescmd5->shash, + server->session_key.response, server->session_key.len); + + crypto_shash_update(&server->secmech.sdescmd5->shash, + cifs_pdu->Protocol, cifs_pdu->smb_buf_length); + + rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); - cifs_MD5_final(signature, 
&context); return 0; } @@ -79,8 +93,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, server->sequence_number++; spin_unlock(&GlobalMid_Lock); - rc = cifs_calculate_signature(cifs_pdu, &server->session_key, - smb_signature); + rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -90,16 +103,28 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, } static int cifs_calc_signature2(const struct kvec *iov, int n_vec, - const struct session_key *key, char *signature) + struct TCP_Server_Info *server, char *signature) { - struct MD5Context context; int i; + int rc; - if ((iov == NULL) || (signature == NULL) || (key == NULL)) + if (iov == NULL || signature == NULL || server == NULL) return -EINVAL; - cifs_MD5_init(&context); - cifs_MD5_update(&context, (char *)&key->data, key->len); + if (!server->secmech.sdescmd5) { + cERROR(1, "%s: Can't generate signature\n", __func__); + return -1; + } + + rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Could not init md5\n", __func__); + return rc; + } + + crypto_shash_update(&server->secmech.sdescmd5->shash, + server->session_key.response, server->session_key.len); + for (i = 0; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; @@ -112,18 +137,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ - cifs_MD5_update(&context, iov[0].iov_base+4, - iov[0].iov_len-4); + crypto_shash_update(&server->secmech.sdescmd5->shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); } else - cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len); + crypto_shash_update(&server->secmech.sdescmd5->shash, + iov[i].iov_base, iov[i].iov_len); } - cifs_MD5_final(signature, &context); + rc = crypto_shash_final(&server->secmech.sdescmd5->shash, 
signature); - return 0; + return rc; } - int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { @@ -146,8 +171,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, server->sequence_number++; spin_unlock(&GlobalMid_Lock); - rc = cifs_calc_signature2(iov, n_vec, &server->session_key, - smb_signature); + rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -157,14 +181,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, } int cifs_verify_signature(struct smb_hdr *cifs_pdu, - const struct session_key *session_key, + struct TCP_Server_Info *server, __u32 expected_sequence_number) { unsigned int rc; char server_response_sig[8]; char what_we_think_sig_should_be[20]; - if (cifs_pdu == NULL || session_key == NULL) + if (cifs_pdu == NULL || server == NULL) return -EINVAL; if (cifs_pdu->Command == SMB_COM_NEGOTIATE) @@ -193,7 +217,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, cpu_to_le32(expected_sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; - rc = cifs_calculate_signature(cifs_pdu, session_key, + rc = cifs_calculate_signature(cifs_pdu, server, what_we_think_sig_should_be); if (rc) @@ -209,18 +233,28 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, } -/* We fill in key by putting in 40 byte array which was allocated by caller */ -int cifs_calculate_session_key(struct session_key *key, const char *rn, - const char *password) +/* first calculate 24 bytes ntlm response and then 16 byte session key */ +int setup_ntlm_response(struct cifsSesInfo *ses) { - char temp_key[16]; - if ((key == NULL) || (rn == NULL)) + unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; + char temp_key[CIFS_SESS_KEY_SIZE]; + + if (!ses) return -EINVAL; - E_md4hash(password, temp_key); - mdfour(key->data.ntlm, temp_key, 16); - 
memcpy(key->data.ntlm+16, rn, CIFS_SESS_KEY_SIZE); - key->len = 40; + ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL); + if (!ses->auth_key.response) { + cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len); + return -ENOMEM; + } + ses->auth_key.len = temp_len; + + SMBNTencrypt(ses->password, ses->server->cryptkey, + ses->auth_key.response + CIFS_SESS_KEY_SIZE); + + E_md4hash(ses->password, temp_key); + mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); + return 0; } @@ -294,15 +328,15 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp) * two times the unicode length of a server name + * size of a timestamp (which is 8 bytes). */ - ses->tilen = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8; - ses->tiblob = kzalloc(ses->tilen, GFP_KERNEL); - if (!ses->tiblob) { - ses->tilen = 0; + ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8; + ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); + if (!ses->auth_key.response) { + ses->auth_key.len = 0; cERROR(1, "Challenge target info allocation failure"); return -ENOMEM; } - blobptr = ses->tiblob; + blobptr = ses->auth_key.response; attrptr = (struct ntlmssp2_name *) blobptr; attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); @@ -357,7 +391,7 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp) * about target string i.e. for some, just user name might suffice. 
*/ static int -find_domain_name(struct cifsSesInfo *ses) +find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp) { unsigned int attrsize; unsigned int type; @@ -366,11 +400,11 @@ find_domain_name(struct cifsSesInfo *ses) unsigned char *blobend; struct ntlmssp2_name *attrptr; - if (!ses->tilen || !ses->tiblob) + if (!ses->auth_key.len || !ses->auth_key.response) return 0; - blobptr = ses->tiblob; - blobend = ses->tiblob + ses->tilen; + blobptr = ses->auth_key.response; + blobend = blobptr + ses->auth_key.len; while (blobptr + onesize < blobend) { attrptr = (struct ntlmssp2_name *) blobptr; @@ -386,16 +420,13 @@ find_domain_name(struct cifsSesInfo *ses) if (!attrsize) break; if (!ses->domainName) { - struct nls_table *default_nls; ses->domainName = kmalloc(attrsize + 1, GFP_KERNEL); if (!ses->domainName) return -ENOMEM; - default_nls = load_nls_default(); cifs_from_ucs2(ses->domainName, (__le16 *)blobptr, attrsize, attrsize, - default_nls, false); - unload_nls(default_nls); + nls_cp, false); break; } } @@ -405,82 +436,136 @@ find_domain_name(struct cifsSesInfo *ses) return 0; } -static int calc_ntlmv2_hash(struct cifsSesInfo *ses, +static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash, const struct nls_table *nls_cp) { int rc = 0; int len; - char nt_hash[16]; - struct HMACMD5Context *pctxt; + char nt_hash[CIFS_NTHASH_SIZE]; wchar_t *user; wchar_t *domain; + wchar_t *server; - pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL); - - if (pctxt == NULL) - return -ENOMEM; + if (!ses->server->secmech.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash); - /* convert Domainname to unicode and uppercase */ - hmac_md5_init_limK_to_64(nt_hash, 16, pctxt); + crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + CIFS_NTHASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + 
cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n"); + return rc; + } /* convert ses->userName to unicode and uppercase */ len = strlen(ses->userName); user = kmalloc(2 + (len * 2), GFP_KERNEL); - if (user == NULL) + if (user == NULL) { + cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); + rc = -ENOMEM; goto calc_exit_2; + } len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); UniStrupr(user); - hmac_md5_update((char *)user, 2*len, pctxt); + + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)user, 2 * len); /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { len = strlen(ses->domainName); domain = kmalloc(2 + (len * 2), GFP_KERNEL); - if (domain == NULL) + if (domain == NULL) { + cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); + rc = -ENOMEM; goto calc_exit_1; + } len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, nls_cp); - /* the following line was removed since it didn't work well - with lower cased domain name that passed as an option. - Maybe converting the domain name earlier makes sense */ - /* UniStrupr(domain); */ - - hmac_md5_update((char *)domain, 2*len, pctxt); - + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)domain, 2 * len); kfree(domain); + } else if (ses->serverName) { + len = strlen(ses->serverName); + + server = kmalloc(2 + (len * 2), GFP_KERNEL); + if (server == NULL) { + cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); + rc = -ENOMEM; + goto calc_exit_1; + } + len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, + nls_cp); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)server, 2 * len); + kfree(server); } + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ntlmv2_hash); + calc_exit_1: kfree(user); calc_exit_2: - /* BB FIXME what about bytes 24 through 40 of the signing key? 
- compare with the NTLM example */ - hmac_md5_final(ses->ntlmv2_hash, pctxt); + return rc; +} + +static int +CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash) +{ + int rc; + unsigned int offset = CIFS_SESS_KEY_SIZE + 8; + + if (!ses->server->secmech.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } + + crypto_shash_setkey(ses->server->secmech.hmacmd5, + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "CalcNTLMv2_response: could not init hmacmd5"); + return rc; + } + + if (ses->server->secType == RawNTLMSSP) + memcpy(ses->auth_key.response + offset, + ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + else + memcpy(ses->auth_key.response + offset, + ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + offset, ses->auth_key.len - offset); + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + CIFS_SESS_KEY_SIZE); - kfree(pctxt); return rc; } + int -setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, - const struct nls_table *nls_cp) +setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp) { int rc; - struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf; - struct HMACMD5Context context; - - buf->blob_signature = cpu_to_le32(0x00000101); - buf->reserved = 0; - buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); - buf->reserved2 = 0; + int baselen; + unsigned int tilen; + struct ntlmv2_resp *buf; + char ntlmv2_hash[16]; + unsigned char *tiblob = NULL; /* target info blob */ if (ses->server->secType == RawNTLMSSP) { if (!ses->domainName) { - rc = find_domain_name(ses); + rc = find_domain_name(ses, nls_cp); if (rc) { cERROR(1, "error %d finding domain name", rc); goto setup_ntlmv2_rsp_ret; 
@@ -490,51 +575,179 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, rc = build_avpair_blob(ses, nls_cp); if (rc) { cERROR(1, "error %d building av pair blob", rc); - return rc; + goto setup_ntlmv2_rsp_ret; } } - /* calculate buf->ntlmv2_hash */ - rc = calc_ntlmv2_hash(ses, nls_cp); + baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); + tilen = ses->auth_key.len; + tiblob = ses->auth_key.response; + + ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL); + if (!ses->auth_key.response) { + rc = ENOMEM; + ses->auth_key.len = 0; + cERROR(1, "%s: Can't allocate auth blob", __func__); + goto setup_ntlmv2_rsp_ret; + } + ses->auth_key.len += baselen; + + buf = (struct ntlmv2_resp *) + (ses->auth_key.response + CIFS_SESS_KEY_SIZE); + buf->blob_signature = cpu_to_le32(0x00000101); + buf->reserved = 0; + buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); + buf->reserved2 = 0; + + memcpy(ses->auth_key.response + baselen, tiblob, tilen); + + /* calculate ntlmv2_hash */ + rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); if (rc) { cERROR(1, "could not get v2 hash rc %d", rc); goto setup_ntlmv2_rsp_ret; } - CalcNTLMv2_response(ses, resp_buf); + + /* calculate first part of the client response (CR1) */ + rc = CalcNTLMv2_response(ses, ntlmv2_hash); + if (rc) { + cERROR(1, "Could not calculate CR1 rc: %d", rc); + goto setup_ntlmv2_rsp_ret; + } /* now calculate the session key for NTLMv2 */ - hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context); - hmac_md5_update(resp_buf, 16, &context); - hmac_md5_final(ses->auth_key.data.ntlmv2.key, &context); + crypto_shash_setkey(ses->server->secmech.hmacmd5, + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "%s: Could not init hmacmd5\n", __func__); + goto setup_ntlmv2_rsp_ret; + } - memcpy(&ses->auth_key.data.ntlmv2.resp, resp_buf, - sizeof(struct 
ntlmv2_resp)); - ses->auth_key.len = 16 + sizeof(struct ntlmv2_resp); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_HMAC_MD5_HASH_SIZE); - return 0; + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response); setup_ntlmv2_rsp_ret: - kfree(ses->tiblob); - ses->tiblob = NULL; - ses->tilen = 0; + kfree(tiblob); return rc; } -void CalcNTLMv2_response(const struct cifsSesInfo *ses, - char *v2_session_response) +int +calc_seckey(struct cifsSesInfo *ses) { - struct HMACMD5Context context; - /* rest of v2 struct already generated */ - memcpy(v2_session_response + 8, ses->cryptKey, 8); - hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context); + int rc; + struct crypto_blkcipher *tfm_arc4; + struct scatterlist sgin, sgout; + struct blkcipher_desc desc; + unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */ + + get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); + + tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + if (!tfm_arc4 || IS_ERR(tfm_arc4)) { + cERROR(1, "could not allocate crypto API arc4\n"); + return PTR_ERR(tfm_arc4); + } - hmac_md5_update(v2_session_response+8, - sizeof(struct ntlmv2_resp) - 8, &context); + desc.tfm = tfm_arc4; - if (ses->tilen) - hmac_md5_update(ses->tiblob, ses->tilen, &context); + crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, + CIFS_SESS_KEY_SIZE); - hmac_md5_final(v2_session_response, &context); -/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */ + sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); + sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); + + rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); + if (rc) { + cERROR(1, "could not encrypt session key rc: %d\n", rc); + crypto_free_blkcipher(tfm_arc4); + return rc; + } + + /* make secondary_key/nonce as session key */ + memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE); + /* and make len as 
that of session key only */ + ses->auth_key.len = CIFS_SESS_KEY_SIZE; + + crypto_free_blkcipher(tfm_arc4); + + return 0; +} + +void +cifs_crypto_shash_release(struct TCP_Server_Info *server) +{ + if (server->secmech.md5) + crypto_free_shash(server->secmech.md5); + + if (server->secmech.hmacmd5) + crypto_free_shash(server->secmech.hmacmd5); + + kfree(server->secmech.sdeschmacmd5); + + kfree(server->secmech.sdescmd5); +} + +int +cifs_crypto_shash_allocate(struct TCP_Server_Info *server) +{ + int rc; + unsigned int size; + + server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); + if (!server->secmech.hmacmd5 || + IS_ERR(server->secmech.hmacmd5)) { + cERROR(1, "could not allocate crypto hmacmd5\n"); + return PTR_ERR(server->secmech.hmacmd5); + } + + server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); + if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) { + cERROR(1, "could not allocate crypto md5\n"); + rc = PTR_ERR(server->secmech.md5); + goto crypto_allocate_md5_fail; + } + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.hmacmd5); + server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdeschmacmd5) { + cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n"); + rc = -ENOMEM; + goto crypto_allocate_hmacmd5_sdesc_fail; + } + server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5; + server->secmech.sdeschmacmd5->shash.flags = 0x0; + + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.md5); + server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdescmd5) { + cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n"); + rc = -ENOMEM; + goto crypto_allocate_md5_sdesc_fail; + } + server->secmech.sdescmd5->shash.tfm = server->secmech.md5; + server->secmech.sdescmd5->shash.flags = 0x0; + + return 0; + +crypto_allocate_md5_sdesc_fail: + kfree(server->secmech.sdeschmacmd5); + +crypto_allocate_hmacmd5_sdesc_fail: + 
crypto_free_shash(server->secmech.md5); + +crypto_allocate_md5_fail: + crypto_free_shash(server->secmech.hmacmd5); + + return rc; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 34371637f210..9c3789762ab7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -116,7 +116,7 @@ cifs_read_super(struct super_block *sb, void *data, return -ENOMEM; spin_lock_init(&cifs_sb->tlink_tree_lock); - INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL); + cifs_sb->tlink_tree = RB_ROOT; rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); if (rc) { @@ -318,12 +318,10 @@ cifs_alloc_inode(struct super_block *sb) return NULL; cifs_inode->cifsAttrs = 0x20; /* default */ cifs_inode->time = 0; - cifs_inode->write_behind_rc = 0; /* Until the file is open and we have gotten oplock info back from the server, can not assume caching of file data or metadata */ - cifs_inode->clientCanCacheRead = false; - cifs_inode->clientCanCacheAll = false; + cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ @@ -545,9 +543,9 @@ static const struct super_operations cifs_super_ops = { #endif }; -static int -cifs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry * +cifs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { int rc; struct super_block *sb; @@ -557,18 +555,17 @@ cifs_get_sb(struct file_system_type *fs_type, cFYI(1, "Devname: %s flags: %d ", dev_name, flags); if (IS_ERR(sb)) - return PTR_ERR(sb); + return ERR_CAST(sb); sb->s_flags = flags; rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 
1 : 0); if (rc) { deactivate_locked_super(sb); - return rc; + return ERR_PTR(rc); } sb->s_flags |= MS_ACTIVE; - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); } static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, @@ -634,7 +631,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) struct file_system_type cifs_fs_type = { .owner = THIS_MODULE, .name = "cifs", - .get_sb = cifs_get_sb, + .mount = cifs_do_mount, .kill_sb = kill_anon_super, /* .fs_flags */ }; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index f35795a16b42..897b2b2b28b5 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -112,5 +112,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.67" +#define CIFS_VERSION "1.68" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 3365e77f6f24..b577bf0a1bb3 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -25,6 +25,9 @@ #include <linux/workqueue.h> #include "cifs_fs_sb.h" #include "cifsacl.h" +#include <crypto/internal/hash.h> +#include <linux/scatterlist.h> + /* * The sizes of various internal tables and strings */ @@ -74,7 +77,7 @@ * CIFS vfs client Status information (based on what we know.) */ - /* associated with each tcp and smb session */ +/* associated with each tcp and smb session */ enum statusEnum { CifsNew = 0, CifsGood, @@ -99,14 +102,29 @@ enum protocolEnum { struct session_key { unsigned int len; - union { - char ntlm[CIFS_SESS_KEY_SIZE + 16]; - char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? 
*/ - struct { - char key[16]; - struct ntlmv2_resp resp; - } ntlmv2; - } data; + char *response; +}; + +/* crypto security descriptor definition */ +struct sdesc { + struct shash_desc shash; + char ctx[]; +}; + +/* crypto hashing related structure/fields, not specific to a sec mech */ +struct cifs_secmech { + struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ + struct crypto_shash *md5; /* md5 hash function */ + struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ + struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ +}; + +/* per smb session structure/fields */ +struct ntlmssp_auth { + __u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */ + __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */ + unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */ }; struct cifs_cred { @@ -179,12 +197,14 @@ struct TCP_Server_Info { int capabilities; /* allow selective disabling of caps by smb sess */ int timeAdj; /* Adjust for difference in server time zone in sec */ __u16 CurrentMid; /* multiplex id - rotating counter */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* needed for CIFS PDU signature */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ u16 dialect; /* dialect index that server chose */ + struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ /* extended security flavors that server supports */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ @@ -222,11 +242,8 @@ struct cifsSesInfo { char userName[MAX_USERNAME_SIZE + 1]; char *domainName; char *password; - char cryptKey[CIFS_CRYPTO_KEY_SIZE]; struct session_key auth_key; - char ntlmv2_hash[16]; - 
unsigned int tilen; /* length of the target info blob */ - unsigned char *tiblob; /* target info blob in challenge response */ + struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ bool need_reconnect:1; /* connection reset, uid now invalid */ }; /* no more than one of the following three session flags may be set */ @@ -319,7 +336,8 @@ struct cifsTconInfo { * "get" on the container. */ struct tcon_link { - unsigned long tl_index; + struct rb_node tl_rbnode; + uid_t tl_uid; unsigned long tl_flags; #define TCON_LINK_MASTER 0 #define TCON_LINK_PENDING 1 @@ -395,16 +413,19 @@ struct cifsFileInfo { struct list_head llist; /* list of byte range locks we have. */ bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; - atomic_t count; /* reference count */ + int count; /* refcount protected by cifs_file_list_lock */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ }; -/* Take a reference on the file private data */ +/* + * Take a reference on the file private data. Must be called with + * cifs_file_list_lock held. + */ static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) { - atomic_inc(&cifs_file->count); + ++cifs_file->count; } void cifsFileInfo_put(struct cifsFileInfo *cifs_file); @@ -417,7 +438,6 @@ struct cifsInodeInfo { struct list_head lockList; /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; - int write_behind_rc; __u32 cifsAttrs; /* e.g. 
DOS archive bit, sparse, compressed, system */ unsigned long time; /* jiffies of last update/check of inode */ bool clientCanCacheRead:1; /* read oplock */ @@ -668,7 +688,7 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers - * GlobalSMBSesLock protects: + * cifs_file_list_lock protects: * list operations on tcp and SMB session lists and tCon lists * f_owner.lock protects certain per file struct operations * mapping->page_lock protects certain per page operations diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b0f4b5656d4c..de36b09763a8 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -131,9 +131,20 @@ #define CIFS_CRYPTO_KEY_SIZE (8) /* + * Size of the ntlm client response + */ +#define CIFS_AUTH_RESP_SIZE (24) + +/* * Size of the session key (crypto key encrypted with the password */ -#define CIFS_SESS_KEY_SIZE (24) +#define CIFS_SESS_KEY_SIZE (16) + +#define CIFS_CLIENT_CHALLENGE_SIZE (8) +#define CIFS_SERVER_CHALLENGE_SIZE (8) +#define CIFS_HMAC_MD5_HASH_SIZE (16) +#define CIFS_CPHTXT_SIZE (16) +#define CIFS_NTHASH_SIZE (16) /* * Maximum user name length diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e593c40ba7ba..7ed69b6b5fe6 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -104,6 +104,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); +extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, struct file *file, struct tcon_link *tlink, @@ -362,13 +363,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, __u32 *); extern int 
cifs_verify_signature(struct smb_hdr *, - const struct session_key *session_key, + struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern int cifs_calculate_session_key(struct session_key *key, const char *rn, - const char *pass); -extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); -extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *, - const struct nls_table *); +extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); +extern int setup_ntlm_response(struct cifsSesInfo *); +extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *); +extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); +extern void cifs_crypto_shash_release(struct TCP_Server_Info *); +extern int calc_seckey(struct cifsSesInfo *); + #ifdef CONFIG_CIFS_WEAK_PW_HASH extern void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e98f1f317b15..2f2632b6df5a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -503,7 +503,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) if (rsp->EncryptionKeyLength == cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { - memcpy(ses->cryptKey, rsp->EncryptionKey, + memcpy(ses->server->cryptkey, rsp->EncryptionKey, CIFS_CRYPTO_KEY_SIZE); } else if (server->secMode & SECMODE_PW_ENCRYPT) { rc = -EIO; /* need cryptkey unless plain text */ @@ -574,7 +574,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); server->timeAdj *= 60; if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { - memcpy(ses->cryptKey, pSMBr->u.EncryptionKey, + memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, CIFS_CRYPTO_KEY_SIZE); } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) && (pSMBr->EncryptionKeyLength == 0)) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7e73176acb58..251a17c03545 100644 --- 
a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -116,6 +116,7 @@ struct smb_vol { static int ipv4_connect(struct TCP_Server_Info *server); static int ipv6_connect(struct TCP_Server_Info *server); +static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); /* @@ -175,6 +176,9 @@ cifs_reconnect(struct TCP_Server_Info *server) } server->sequence_number = 0; server->session_estab = false; + kfree(server->session_key.response); + server->session_key.response = NULL; + server->session_key.len = 0; spin_lock(&GlobalMid_Lock); list_for_each(tmp, &server->pending_mid_q) { @@ -1064,7 +1068,7 @@ cifs_parse_mount_options(char *options, const char *devname, } i = cifs_convert_address((struct sockaddr *)&vol->srcaddr, value, strlen(value)); - if (i < 0) { + if (i == 0) { printk(KERN_WARNING "CIFS: Could not parse" " srcaddr: %s\n", value); @@ -1560,8 +1564,13 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); + cifs_crypto_shash_release(server); cifs_fscache_release_client_cookie(server); + kfree(server->session_key.response); + server->session_key.response = NULL; + server->session_key.len = 0; + task = xchg(&server->tsk, NULL); if (task) force_sig(SIGKILL, task); @@ -1614,10 +1623,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info) goto out_err; } + rc = cifs_crypto_shash_allocate(tcp_ses); + if (rc) { + cERROR(1, "could not setup hash structures rc %d", rc); + goto out_err; + } + tcp_ses->hostname = extract_hostname(volume_info->UNC); if (IS_ERR(tcp_ses->hostname)) { rc = PTR_ERR(tcp_ses->hostname); - goto out_err; + goto out_err_crypto_release; } tcp_ses->noblocksnd = volume_info->noblocksnd; @@ -1661,7 +1676,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) } if (rc < 0) { cERROR(1, "Error connecting to socket. 
Aborting operation"); - goto out_err; + goto out_err_crypto_release; } /* @@ -1675,7 +1690,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) rc = PTR_ERR(tcp_ses->tsk); cERROR(1, "error %d create cifsd thread", rc); module_put(THIS_MODULE); - goto out_err; + goto out_err_crypto_release; } /* thread spawned, put it on the list */ @@ -1687,6 +1702,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info) return tcp_ses; +out_err_crypto_release: + cifs_crypto_shash_release(tcp_ses); + out_err: if (tcp_ses) { if (!IS_ERR(tcp_ses->hostname)) @@ -1801,8 +1819,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) if (ses == NULL) goto get_ses_fail; - ses->tilen = 0; - ses->tiblob = NULL; /* new SMB session uses our server ref */ ses->server = server; if (server->addr.sockAddr6.sin6_family == AF_INET6) @@ -1823,10 +1839,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) goto get_ses_fail; } if (volume_info->domainname) { - int len = strlen(volume_info->domainname); - ses->domainName = kmalloc(len + 1, GFP_KERNEL); - if (ses->domainName) - strcpy(ses->domainName, volume_info->domainname); + ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL); + if (!ses->domainName) + goto get_ses_fail; } ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; @@ -2886,24 +2901,16 @@ remote_path_check: goto mount_fail_check; } - tlink->tl_index = pSesInfo->linux_uid; + tlink->tl_uid = pSesInfo->linux_uid; tlink->tl_tcon = tcon; tlink->tl_time = jiffies; set_bit(TCON_LINK_MASTER, &tlink->tl_flags); set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); - rc = radix_tree_preload(GFP_KERNEL); - if (rc == -ENOMEM) { - kfree(tlink); - goto mount_fail_check; - } - + cifs_sb->master_tlink = tlink; spin_lock(&cifs_sb->tlink_tree_lock); - radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink); - radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid, - CIFS_TLINK_MASTER_TAG); + 
tlink_rb_insert(&cifs_sb->tlink_tree, tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); @@ -2985,13 +2992,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, #ifdef CONFIG_CIFS_WEAK_PW_HASH if ((global_secflags & CIFSSEC_MAY_LANMAN) && (ses->server->secType == LANMAN)) - calc_lanman_hash(tcon->password, ses->cryptKey, + calc_lanman_hash(tcon->password, ses->server->cryptkey, ses->server->secMode & SECMODE_PW_ENCRYPT ? true : false, bcc_ptr); else #endif /* CIFS_WEAK_PW_HASH */ - SMBNTencrypt(tcon->password, ses->cryptKey, bcc_ptr); + SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); bcc_ptr += CIFS_SESS_KEY_SIZE; if (ses->capabilities & CAP_UNICODE) { @@ -3093,32 +3100,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, int cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) { - int i, ret; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node; + struct tcon_link *tlink; char *tmp; - struct tcon_link *tlink[8]; - unsigned long index = 0; cancel_delayed_work_sync(&cifs_sb->prune_tlinks); - do { - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, - (void **)tlink, index, - ARRAY_SIZE(tlink)); - /* increment index for next pass */ - if (ret > 0) - index = tlink[ret - 1]->tl_index + 1; - for (i = 0; i < ret; i++) { - cifs_get_tlink(tlink[i]); - clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); - radix_tree_delete(&cifs_sb->tlink_tree, - tlink[i]->tl_index); - } - spin_unlock(&cifs_sb->tlink_tree_lock); + spin_lock(&cifs_sb->tlink_tree_lock); + while ((node = rb_first(root))) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(node, root); - for (i = 0; i < ret; i++) - cifs_put_tlink(tlink[i]); - } while (ret != 0); + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + 
spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); tmp = cifs_sb->prepath; cifs_sb->prepathlen = 0; @@ -3178,10 +3178,11 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses, } else { mutex_lock(&ses->server->srv_mutex); if (!server->session_estab) { - memcpy(&server->session_key.data, - &ses->auth_key.data, ses->auth_key.len); + server->session_key.response = ses->auth_key.response; server->session_key.len = ses->auth_key.len; - ses->server->session_estab = true; + server->sequence_number = 0x2; + server->session_estab = true; + ses->auth_key.response = NULL; } mutex_unlock(&server->srv_mutex); @@ -3192,6 +3193,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses, spin_unlock(&GlobalMid_Lock); } + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + return rc; } @@ -3250,22 +3257,10 @@ out: return tcon; } -static struct tcon_link * +static inline struct tcon_link * cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) { - struct tcon_link *tlink; - unsigned int ret; - - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink, - 0, 1, CIFS_TLINK_MASTER_TAG); - spin_unlock(&cifs_sb->tlink_tree_lock); - - /* the master tcon should always be present */ - if (ret == 0) - BUG(); - - return tlink; + return cifs_sb->master_tlink; } struct cifsTconInfo * @@ -3281,6 +3276,47 @@ cifs_sb_tcon_pending_wait(void *unused) return signal_pending(current) ? 
-ERESTARTSYS : 0; } +/* find and return a tlink with given uid */ +static struct tcon_link * +tlink_rb_search(struct rb_root *root, uid_t uid) +{ + struct rb_node *node = root->rb_node; + struct tcon_link *tlink; + + while (node) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + + if (tlink->tl_uid > uid) + node = node->rb_left; + else if (tlink->tl_uid < uid) + node = node->rb_right; + else + return tlink; + } + return NULL; +} + +/* insert a tcon_link into the tree */ +static void +tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct tcon_link *tlink; + + while (*new) { + tlink = rb_entry(*new, struct tcon_link, tl_rbnode); + parent = *new; + + if (tlink->tl_uid > new_tlink->tl_uid) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_tlink->tl_rbnode, parent, new); + rb_insert_color(&new_tlink->tl_rbnode, root); +} + /* * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the * current task. @@ -3288,7 +3324,7 @@ cifs_sb_tcon_pending_wait(void *unused) * If the superblock doesn't refer to a multiuser mount, then just return * the master tcon for the mount. * - * First, search the radix tree for an existing tcon for this fsuid. If one + * First, search the rbtree for an existing tcon for this fsuid. If one * exists, then check to see if it's pending construction. If it is then wait * for construction to complete. 
Once it's no longer pending, check to see if * it failed and either return an error or retry construction, depending on @@ -3301,14 +3337,14 @@ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) { int ret; - unsigned long fsuid = (unsigned long) current_fsuid(); + uid_t fsuid = current_fsuid(); struct tcon_link *tlink, *newtlink; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); spin_lock(&cifs_sb->tlink_tree_lock); - tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); if (tlink) cifs_get_tlink(tlink); spin_unlock(&cifs_sb->tlink_tree_lock); @@ -3317,36 +3353,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb) newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); if (newtlink == NULL) return ERR_PTR(-ENOMEM); - newtlink->tl_index = fsuid; + newtlink->tl_uid = fsuid; newtlink->tl_tcon = ERR_PTR(-EACCES); set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); cifs_get_tlink(newtlink); - ret = radix_tree_preload(GFP_KERNEL); - if (ret != 0) { - kfree(newtlink); - return ERR_PTR(ret); - } - spin_lock(&cifs_sb->tlink_tree_lock); /* was one inserted after previous search? 
*/ - tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); if (tlink) { cifs_get_tlink(tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); kfree(newtlink); goto wait_for_construction; } - ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink); - spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); - if (ret) { - kfree(newtlink); - return ERR_PTR(ret); - } tlink = newtlink; + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); } else { wait_for_construction: ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, @@ -3392,39 +3416,39 @@ cifs_prune_tlinks(struct work_struct *work) { struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, prune_tlinks.work); - struct tcon_link *tlink[8]; - unsigned long now = jiffies; - unsigned long index = 0; - int i, ret; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node = rb_first(root); + struct rb_node *tmp; + struct tcon_link *tlink; - do { - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, - (void **)tlink, index, - ARRAY_SIZE(tlink)); - /* increment index for next pass */ - if (ret > 0) - index = tlink[ret - 1]->tl_index + 1; - for (i = 0; i < ret; i++) { - if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) || - atomic_read(&tlink[i]->tl_count) != 0 || - time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE, - now)) { - tlink[i] = NULL; - continue; - } - cifs_get_tlink(tlink[i]); - clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); - radix_tree_delete(&cifs_sb->tlink_tree, - tlink[i]->tl_index); - } - spin_unlock(&cifs_sb->tlink_tree_lock); + /* + * Because we drop the spinlock in the loop in order to put the tlink + * it's not guarded against removal of links from the tree. The only + * places that remove entries from the tree are this function and + * umounts. 
Because this function is non-reentrant and is canceled + * before umount can proceed, this is safe. + */ + spin_lock(&cifs_sb->tlink_tree_lock); + node = rb_first(root); + while (node != NULL) { + tmp = node; + node = rb_next(tmp); + tlink = rb_entry(tmp, struct tcon_link, tl_rbnode); + + if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) || + atomic_read(&tlink->tl_count) != 0 || + time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies)) + continue; - for (i = 0; i < ret; i++) { - if (tlink[i] != NULL) - cifs_put_tlink(tlink[i]); - } - } while (ret != 0); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(tmp, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 45af003865d2..06c3e83fa387 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -131,8 +131,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, /* BB no need to lock inode until after invalidate since namei code should already have it locked? 
*/ rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) - pCifsInode->write_behind_rc = rc; + mapping_set_error(inode->i_mapping, rc); } cFYI(1, "invalidating remote inode since open detected it " "changed"); @@ -147,12 +146,7 @@ client_can_cache: rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, xid, NULL); - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + cifs_set_oplock_level(pCifsInode, oplock); return rc; } @@ -232,6 +226,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, if (pCifsFile == NULL) return pCifsFile; + pCifsFile->count = 1; pCifsFile->netfid = fileHandle; pCifsFile->pid = current->tgid; pCifsFile->uid = current_fsuid(); @@ -242,7 +237,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, mutex_init(&pCifsFile->fh_mutex); mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->count, 1); INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); spin_lock(&cifs_file_list_lock); @@ -254,12 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); spin_unlock(&cifs_file_list_lock); - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + cifs_set_oplock_level(pCifsInode, oplock); file->private_data = pCifsFile; return pCifsFile; @@ -267,16 +256,18 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, /* * Release a reference on the file private data. This may involve closing - * the filehandle out on the server. + * the filehandle out on the server. 
Must be called without holding + * cifs_file_list_lock. */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { + struct inode *inode = cifs_file->dentry->d_inode; struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); - struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsLockInfo *li, *tmp; spin_lock(&cifs_file_list_lock); - if (!atomic_dec_and_test(&cifs_file->count)) { + if (--cifs_file->count > 0) { spin_unlock(&cifs_file_list_lock); return; } @@ -288,8 +279,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cFYI(1, "closing last open instance for inode %p", cifs_file->dentry->d_inode); - cifsi->clientCanCacheRead = false; - cifsi->clientCanCacheAll = false; + cifs_set_oplock_level(cifsi, 0); } spin_unlock(&cifs_file_list_lock); @@ -605,11 +595,8 @@ reopen_success: if (can_flush) { rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) - CIFS_I(inode)->write_behind_rc = rc; + mapping_set_error(inode->i_mapping, rc); - pCifsInode->clientCanCacheAll = false; - pCifsInode->clientCanCacheRead = false; if (tcon->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, xid); @@ -623,18 +610,9 @@ reopen_success: invalidate the current end of file on the server we can not go to the server to get the new inod info */ - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - pCifsFile->dentry->d_inode); - } else if ((oplock & 0xF) == OPLOCK_READ) { - pCifsInode->clientCanCacheRead = true; - pCifsInode->clientCanCacheAll = false; - } else { - pCifsInode->clientCanCacheRead = false; - pCifsInode->clientCanCacheAll = false; - } + + cifs_set_oplock_level(pCifsInode, oplock); + cifs_relock_file(pCifsFile); reopen_error_exit: @@ -776,12 +754,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) 
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); - - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } netfid = ((struct cifsFileInfo *)file->private_data)->netfid; if ((tcon->ses->capabilities & CAP_UNIX) && @@ -957,6 +929,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset) { + struct inode *inode = file->f_path.dentry->d_inode; int rc = 0; unsigned int bytes_written = 0; unsigned int total_written; @@ -964,7 +937,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, struct cifsTconInfo *pTcon; int xid, long_op; struct cifsFileInfo *open_file; - struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); @@ -1030,21 +1003,17 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, cifs_stats_bytes_written(pTcon, total_written); - /* since the write may have blocked check these pointers again */ - if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) { - struct inode *inode = file->f_path.dentry->d_inode; /* Do not update local mtime - server will set its actual value on write - * inode->i_ctime = inode->i_mtime = - * current_fs_time(inode->i_sb);*/ - if (total_written > 0) { - spin_lock(&inode->i_lock); - if (*poffset > file->f_path.dentry->d_inode->i_size) - i_size_write(file->f_path.dentry->d_inode, - *poffset); - spin_unlock(&inode->i_lock); - } - mark_inode_dirty_sync(file->f_path.dentry->d_inode); + * inode->i_ctime = inode->i_mtime = + * current_fs_time(inode->i_sb);*/ + if (total_written > 0) { + spin_lock(&inode->i_lock); + if (*poffset > inode->i_size) + i_size_write(inode, *poffset); + spin_unlock(&inode->i_lock); } + mark_inode_dirty_sync(inode); + FreeXid(xid); return total_written; 
} @@ -1179,7 +1148,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { struct cifsFileInfo *open_file; - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb; bool any_available = false; int rc; @@ -1193,6 +1162,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, return NULL; } + cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; @@ -1353,6 +1324,7 @@ static int cifs_writepages(struct address_space *mapping, if (!experimEnabled && tcon->ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { cifsFileInfo_put(open_file); + kfree(iov); return generic_writepages(mapping, wbc); } cifsFileInfo_put(open_file); @@ -1478,12 +1450,7 @@ retry: if (rc || bytes_written < bytes_to_write) { cERROR(1, "Write2 ret %d, wrote %d", rc, bytes_written); - /* BB what if continued retry is - requested via mount flags? 
*/ - if (rc == -ENOSPC) - set_bit(AS_ENOSPC, &mapping->flags); - else - set_bit(AS_EIO, &mapping->flags); + mapping_set_error(mapping, rc); } else { cifs_stats_bytes_written(tcon, bytes_written); } @@ -1628,11 +1595,10 @@ int cifs_fsync(struct file *file, int datasync) rc = filemap_write_and_wait(inode->i_mapping); if (rc == 0) { - rc = CIFS_I(inode)->write_behind_rc; - CIFS_I(inode)->write_behind_rc = 0; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + tcon = tlink_tcon(smbfile->tlink); - if (!rc && tcon && smbfile && - !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); } @@ -1677,21 +1643,8 @@ int cifs_flush(struct file *file, fl_owner_t id) struct inode *inode = file->f_path.dentry->d_inode; int rc = 0; - /* Rather than do the steps manually: - lock the inode for writing - loop through pages looking for write behind data (dirty pages) - coalesce into contiguous 16K (or smaller) chunks to write to server - send to server (prefer in parallel) - deal with writebehind errors - unlock inode for writing - filemapfdatawrite appears easier for the time being */ - - rc = filemap_fdatawrite(inode->i_mapping); - /* reset wb rc if we were able to write out dirty pages */ - if (!rc) { - rc = CIFS_I(inode)->write_behind_rc; - CIFS_I(inode)->write_behind_rc = 0; - } + if (file->f_mode & FMODE_WRITE) + rc = filemap_write_and_wait(inode->i_mapping); cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc); @@ -2270,7 +2223,7 @@ void cifs_oplock_break(struct work_struct *work) oplock_break); struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); - int rc, waitrc = 0; + int rc = 0; if (inode && S_ISREG(inode->i_mode)) { if (cinode->clientCanCacheRead) @@ -2279,13 +2232,10 @@ void cifs_oplock_break(struct work_struct *work) break_lease(inode, O_WRONLY); rc = filemap_fdatawrite(inode->i_mapping); if (cinode->clientCanCacheRead 
== 0) { - waitrc = filemap_fdatawait(inode->i_mapping); + rc = filemap_fdatawait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); invalidate_remote_inode(inode); } - if (!rc) - rc = waitrc; - if (rc) - cinode->write_behind_rc = rc; cFYI(1, "Oplock flush inode %p rc %d", inode, rc); } @@ -2304,7 +2254,7 @@ void cifs_oplock_break(struct work_struct *work) /* * We might have kicked in before is_valid_oplock_break() * finished grabbing reference for us. Make sure it's done by - * waiting for GlobalSMSSeslock. + * waiting for cifs_file_list_lock. */ spin_lock(&cifs_file_list_lock); spin_unlock(&cifs_file_list_lock); @@ -2312,6 +2262,7 @@ void cifs_oplock_break(struct work_struct *work) cifs_oplock_break_put(cfile); } +/* must be called while holding cifs_file_list_lock */ void cifs_oplock_break_get(struct cifsFileInfo *cfile) { cifs_sb_active(cfile->dentry->d_sb); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 94979309698a..ef3a55bf86b6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1682,8 +1682,7 @@ cifs_invalidate_mapping(struct inode *inode) /* write back any cached data */ if (inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = filemap_write_and_wait(inode->i_mapping); - if (rc) - cifs_i->write_behind_rc = rc; + mapping_set_error(inode->i_mapping, rc); } invalidate_remote_inode(inode); cifs_fscache_reset_inode_cookie(inode); @@ -1943,10 +1942,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) * the flush returns error? */ rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) { - cifsInode->write_behind_rc = rc; - rc = 0; - } + mapping_set_error(inode->i_mapping, rc); + rc = 0; if (attrs->ia_valid & ATTR_SIZE) { rc = cifs_set_file_size(inode, attrs, xid, full_path); @@ -2087,10 +2084,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) * the flush returns error? 
*/ rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) { - cifsInode->write_behind_rc = rc; - rc = 0; - } + mapping_set_error(inode->i_mapping, rc); + rc = 0; if (attrs->ia_valid & ATTR_SIZE) { rc = cifs_set_file_size(inode, attrs, xid, full_path); @@ -2182,7 +2177,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) setattr_copy(inode, attrs); mark_inode_dirty(inode); - return 0; cifs_setattr_exit: kfree(full_path); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 077bf756f342..0c98672d0122 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) struct cifs_sb_info *cifs_sb; #ifdef CONFIG_CIFS_POSIX struct cifsFileInfo *pSMBFile = filep->private_data; - struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink); + struct cifsTconInfo *tcon; __u64 ExtAttrBits = 0; __u64 ExtAttrMask = 0; - __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + __u64 caps; #endif /* CONFIG_CIFS_POSIX */ xid = GetXid(); @@ -62,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; #ifdef CONFIG_CIFS_POSIX case FS_IOC_GETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { - if (pSMBFile == NULL) - break; rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, &ExtAttrBits, &ExtAttrMask); if (rc == 0) @@ -75,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; case FS_IOC_SETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { if (get_user(ExtAttrBits, (int __user *)arg)) { rc = -EFAULT; break; } - if (pSMBFile == NULL) - break; /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, extAttrBits, &ExtAttrMask);*/ } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 
1c681f6a6803..43f10281bc19 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -569,15 +569,14 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) cFYI(1, "file id match, oplock break"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - pCifsInode->clientCanCacheAll = false; - if (pSMB->OplockLevel == 0) - pCifsInode->clientCanCacheRead = false; + cifs_set_oplock_level(pCifsInode, + pSMB->OplockLevel); /* * cifs_oplock_break_put() can't be called * from here. Get reference after queueing * succeeded. cifs_oplock_break() will - * synchronize using GlobalSMSSeslock. + * synchronize using cifs_file_list_lock. */ if (queue_work(system_nrt_wq, &netfile->oplock_break)) @@ -722,3 +721,23 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) cifs_sb_master_tcon(cifs_sb)->treeName); } } + +void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) +{ + oplock &= 0xF; + + if (oplock == OPLOCK_EXCLUSIVE) { + cinode->clientCanCacheAll = true; + cinode->clientCanCacheRead = true; + cFYI(1, "Exclusive Oplock granted on inode %p", + &cinode->vfs_inode); + } else if (oplock == OPLOCK_READ) { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = true; + cFYI(1, "Level II Oplock granted on inode %p", + &cinode->vfs_inode); + } else { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = false; + } +} diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 2a11efd96592..7b01d3f6eed6 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -32,9 +32,6 @@ #include <linux/slab.h> #include "cifs_spnego.h" -extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, - unsigned char *p24); - /* * Checks if this is the first smb session to be reconnected after * the socket has been reestablished (so we know whether to use vc 0). 
@@ -402,23 +399,22 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, return -EINVAL; } - memcpy(ses->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); + memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); /* BB we could decode pblob->NegotiateFlags; some may be useful */ /* In particular we can examine sign flags */ /* BB spec says that if AvId field of MsvAvTimestamp is populated then we must set the MIC field of the AUTHENTICATE_MESSAGE */ - + ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset); tilen = cpu_to_le16(pblob->TargetInfoArray.Length); - ses->tilen = tilen; - if (ses->tilen) { - ses->tiblob = kmalloc(tilen, GFP_KERNEL); - if (!ses->tiblob) { + if (tilen) { + ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); + if (!ses->auth_key.response) { cERROR(1, "Challenge target info allocation failure"); - ses->tilen = 0; return -ENOMEM; } - memcpy(ses->tiblob, bcc_ptr + tioffset, ses->tilen); + memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen); + ses->auth_key.len = tilen; } return 0; @@ -443,10 +439,12 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM; if (ses->server->secMode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { flags |= NTLMSSP_NEGOTIATE_SIGN; - if (ses->server->secMode & SECMODE_SIGN_REQUIRED) - flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; + if (!ses->server->session_estab) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH | + NTLMSSP_NEGOTIATE_EXTENDED_SEC; + } sec_blob->NegotiateFlags |= cpu_to_le32(flags); @@ -469,11 +467,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, const struct nls_table *nls_cp) { int rc; - unsigned int size; AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; __u32 flags; unsigned char *tmp; - struct ntlmv2_resp ntlmv2_response = {}; 
memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmAuthenticate; @@ -497,25 +493,19 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->LmChallengeResponse.MaximumLength = 0; sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); - rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp); + rc = setup_ntlmv2_rsp(ses, nls_cp); if (rc) { cERROR(1, "Error %d during NTLMSSP authentication", rc); goto setup_ntlmv2_ret; } - size = sizeof(struct ntlmv2_resp); - memcpy(tmp, (char *)&ntlmv2_response, size); - tmp += size; - if (ses->tilen > 0) { - memcpy(tmp, ses->tiblob, ses->tilen); - tmp += ses->tilen; - } + memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - sec_blob->NtChallengeResponse.Length = cpu_to_le16(size + ses->tilen); + sec_blob->NtChallengeResponse.Length = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); sec_blob->NtChallengeResponse.MaximumLength = - cpu_to_le16(size + ses->tilen); - kfree(ses->tiblob); - ses->tiblob = NULL; - ses->tilen = 0; + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); if (ses->domainName == NULL) { sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); @@ -554,9 +544,19 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->WorkstationName.MaximumLength = 0; tmp += 2; - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); - sec_blob->SessionKey.Length = 0; - sec_blob->SessionKey.MaximumLength = 0; + if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) && + !calc_seckey(ses)) { + memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.MaximumLength = + cpu_to_le16(CIFS_CPHTXT_SIZE); + tmp += CIFS_CPHTXT_SIZE; + } else { + sec_blob->SessionKey.BufferOffset = 
cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = 0; + sec_blob->SessionKey.MaximumLength = 0; + } setup_ntlmv2_ret: *buflen = tmp - pbuffer; @@ -600,8 +600,16 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, return -EINVAL; type = ses->server->secType; - cFYI(1, "sess setup type %d", type); + if (type == RawNTLMSSP) { + /* if memory allocation is successful, caller of this function + * frees it. + */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) + return -ENOMEM; + } + ssetup_ntlmssp_authenticate: if (phase == NtLmChallenge) phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ @@ -666,10 +674,14 @@ ssetup_ntlmssp_authenticate: /* no capabilities flags in old lanman negotiation */ pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); - /* BB calculate hash with password */ - /* and copy into bcc */ - calc_lanman_hash(ses->password, ses->cryptKey, + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + + calc_lanman_hash(ses->password, ses->server->cryptkey, ses->server->secMode & SECMODE_PW_ENCRYPT ? 
true : false, lnm_session_key); @@ -687,24 +699,27 @@ ssetup_ntlmssp_authenticate: ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); #endif } else if (type == NTLM) { - char ntlm_session_key[CIFS_SESS_KEY_SIZE]; - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); pSMB->req_no_secext.CaseInsensitivePasswordLength = - cpu_to_le16(CIFS_SESS_KEY_SIZE); + cpu_to_le16(CIFS_AUTH_RESP_SIZE); pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(CIFS_SESS_KEY_SIZE); + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses); + if (rc) { + cERROR(1, "Error %d during NTLM authentication", rc); + goto ssetup_exit; + } - /* calculate session key */ - SMBNTencrypt(ses->password, ses->cryptKey, ntlm_session_key); + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; - cifs_calculate_session_key(&ses->auth_key, - ntlm_session_key, ses->password); - /* copy session key */ - memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE); - bcc_ptr += CIFS_SESS_KEY_SIZE; - memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE); - bcc_ptr += CIFS_SESS_KEY_SIZE; if (ses->capabilities & CAP_UNICODE) { /* unicode strings must be word aligned */ if (iov[0].iov_len % 2) { @@ -715,47 +730,26 @@ ssetup_ntlmssp_authenticate: } else ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); } else if (type == NTLMv2) { - char *v2_sess_key = - kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL); - - /* BB FIXME change all users of v2_sess_key to - struct ntlmv2_resp */ - - if (v2_sess_key == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); /* LM2 password would be here if we supported it */ pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; - /* 
cpu_to_le16(LM2_SESS_KEY_SIZE); */ - /* calculate session key */ - rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, nls_cp); if (rc) { cERROR(1, "Error %d during NTLMv2 authentication", rc); - kfree(v2_sess_key); goto ssetup_exit; } - memcpy(bcc_ptr, (char *)v2_sess_key, - sizeof(struct ntlmv2_resp)); - bcc_ptr += sizeof(struct ntlmv2_resp); - kfree(v2_sess_key); + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + /* set case sensitive password length after tilen may get * assigned, tilen is 0 otherwise. */ pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(sizeof(struct ntlmv2_resp) + ses->tilen); - if (ses->tilen > 0) { - memcpy(bcc_ptr, ses->tiblob, ses->tilen); - bcc_ptr += ses->tilen; - /* we never did allocate ses->domainName to free */ - kfree(ses->tiblob); - ses->tiblob = NULL; - ses->tilen = 0; - } + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); if (ses->capabilities & CAP_UNICODE) { if (iov[0].iov_len % 2) { @@ -768,6 +762,7 @@ ssetup_ntlmssp_authenticate: } else if (type == Kerberos) { #ifdef CONFIG_CIFS_UPCALL struct cifs_spnego_msg *msg; + spnego_key = cifs_get_spnego_key(ses); if (IS_ERR(spnego_key)) { rc = PTR_ERR(spnego_key); @@ -785,16 +780,17 @@ ssetup_ntlmssp_authenticate: rc = -EKEYREJECTED; goto ssetup_exit; } - /* bail out if key is too long */ - if (msg->sesskey_len > - sizeof(ses->auth_key.data.krb5)) { - cERROR(1, "Kerberos signing key too long (%u bytes)", - msg->sesskey_len); - rc = -EOVERFLOW; + + ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL); + if (!ses->auth_key.response) { + cERROR(1, "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; goto ssetup_exit; } + memcpy(ses->auth_key.response, msg->data, msg->sesskey_len); ses->auth_key.len = msg->sesskey_len; - 
memcpy(ses->auth_key.data.krb5, msg->data, msg->sesskey_len); + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; capabilities |= CAP_EXTENDED_SECURITY; pSMB->req.Capabilities = cpu_to_le32(capabilities); @@ -897,8 +893,6 @@ ssetup_ntlmssp_authenticate: CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); /* SMB request buf freed in SendReceive2 */ - cFYI(1, "ssetup rc from sendrecv2 is %d", rc); - pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; smb_buf = (struct smb_hdr *)iov[0].iov_base; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index a66c91eb6eb4..e0588cdf4cc5 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(midQ->resp_buf, - &ses->server->session_key, + ses->server, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); @@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(out_buf, - &ses->server->session_key, + ses->server, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); @@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(out_buf, - &ses->server->session_key, + ses->server, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 7993b96ca348..5ea57c8c7f97 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -306,16 +306,16 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) /* init_coda: used by filesystems.c to register coda */ -static int coda_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct 
dentry *coda_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt); + return mount_nodev(fs_type, flags, data, coda_fill_super); } struct file_system_type coda_fs_type = { .owner = THIS_MODULE, .name = "coda", - .get_sb = coda_get_sb, + .mount = coda_mount, .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; diff --git a/fs/compat.c b/fs/compat.c index f03abdadc401..c580c322fa6b 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -29,8 +29,6 @@ #include <linux/vfs.h> #include <linux/ioctl.h> #include <linux/init.h> -#include <linux/smb.h> -#include <linux/smb_mount.h> #include <linux/ncp_mount.h> #include <linux/nfs4_mount.h> #include <linux/syscalls.h> @@ -51,6 +49,7 @@ #include <linux/eventpoll.h> #include <linux/fs_struct.h> #include <linux/slab.h> +#include <linux/pagemap.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -608,14 +607,14 @@ ssize_t compat_rw_copy_check_uvector(int type, /* * Single unix specification: * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. The total length is fitting an ssize_t + * ssize_t. * - * Be careful here because iov_len is a size_t not an ssize_t + * In Linux, the total length is limited to MAX_RW_COUNT, there is + * no overflow possibility. */ tot_len = 0; ret = -EINVAL; for (seg = 0; seg < nr_segs; seg++) { - compat_ssize_t tmp = tot_len; compat_uptr_t buf; compat_ssize_t len; @@ -626,13 +625,13 @@ ssize_t compat_rw_copy_check_uvector(int type, } if (len < 0) /* size_t not fitting in compat_ssize_t .. 
*/ goto out; - tot_len += len; - if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ - goto out; if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { ret = -EFAULT; goto out; } + if (len > MAX_RW_COUNT - tot_len) + len = MAX_RW_COUNT - tot_len; + tot_len += len; iov->iov_base = compat_ptr(buf); iov->iov_len = (compat_size_t) len; uvector++; @@ -745,30 +744,6 @@ static void *do_ncp_super_data_conv(void *raw_data) return raw_data; } -struct compat_smb_mount_data { - compat_int_t version; - __compat_uid_t mounted_uid; - __compat_uid_t uid; - __compat_gid_t gid; - compat_mode_t file_mode; - compat_mode_t dir_mode; -}; - -static void *do_smb_super_data_conv(void *raw_data) -{ - struct smb_mount_data *s = raw_data; - struct compat_smb_mount_data *c_s = raw_data; - - if (c_s->version != SMB_MOUNT_OLDVERSION) - goto out; - s->dir_mode = c_s->dir_mode; - s->file_mode = c_s->file_mode; - s->gid = c_s->gid; - s->uid = c_s->uid; - s->mounted_uid = c_s->mounted_uid; - out: - return raw_data; -} struct compat_nfs_string { compat_uint_t len; @@ -835,7 +810,6 @@ static int do_nfs4_super_data_conv(void *raw_data) return 0; } -#define SMBFS_NAME "smbfs" #define NCPFS_NAME "ncpfs" #define NFS4_NAME "nfs4" @@ -870,9 +844,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name, retval = -EINVAL; if (kernel_type && data_page) { - if (!strcmp(kernel_type, SMBFS_NAME)) { - do_smb_super_data_conv((void *)data_page); - } else if (!strcmp(kernel_type, NCPFS_NAME)) { + if (!strcmp(kernel_type, NCPFS_NAME)) { do_ncp_super_data_conv((void *)data_page); } else if (!strcmp(kernel_type, NFS4_NAME)) { if (do_nfs4_super_data_conv((void *) data_page)) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index d0ad09d57789..410ed188faa1 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -46,7 +46,6 @@ #include <linux/videodev.h> #include <linux/netdevice.h> #include <linux/raw.h> -#include <linux/smb_fs.h> #include <linux/blkdev.h> #include <linux/elevator.h> #include 
<linux/rtc.h> @@ -558,25 +557,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) #endif /* CONFIG_BLOCK */ -static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, - compat_uid_t __user *argp) -{ - mm_segment_t old_fs = get_fs(); - __kernel_uid_t kuid; - int err; - - cmd = SMB_IOC_GETMOUNTUID; - - set_fs(KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long)&kuid); - set_fs(old_fs); - - if (err >= 0) - err = put_user(kuid, argp); - - return err; -} - /* Bluetooth ioctls */ #define HCIUARTSETPROTO _IOW('U', 200, int) #define HCIUARTGETPROTO _IOR('U', 201, int) @@ -1199,8 +1179,9 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5) COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) COMPATIBLE_IOCTL(OSS_GETVERSION) -/* SMB ioctls which do not need any translations */ -COMPATIBLE_IOCTL(SMB_IOC_NEWCONN) +/* Raw devices */ +COMPATIBLE_IOCTL(RAW_SETBIND) +COMPATIBLE_IOCTL(RAW_GETBIND) /* Watchdog */ COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) COMPATIBLE_IOCTL(WDIOC_GETSTATUS) @@ -1458,10 +1439,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case MTIOCPOS32: return mt_ioctl_trans(fd, cmd, argp); #endif - /* One SMB ioctl needs translations. 
*/ -#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) - case SMB_IOC_GETMOUNTUID_32: - return do_smb_getmountuid(fd, cmd, argp); /* Serial */ case TIOCGSERIAL: case TIOCSSERIAL: diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 8c8d64230c2d..7d3607febe1c 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -104,16 +104,16 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static int configfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *configfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt); + return mount_single(fs_type, flags, data, configfs_fill_super); } static struct file_system_type configfs_fs_type = { .owner = THIS_MODULE, .name = "configfs", - .get_sb = configfs_get_sb, + .mount = configfs_do_mount, .kill_sb = kill_litter_super, }; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 1e7a33028d33..32fd5fe9ca0e 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -533,17 +533,16 @@ static const struct super_operations cramfs_ops = { .statfs = cramfs_statfs, }; -static int cramfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *cramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super); } static struct file_system_type cramfs_fs_type = { .owner = THIS_MODULE, .name = "cramfs", - .get_sb = cramfs_get_sb, + .mount = cramfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index a4ed8380e98a..37a8ca7c1222 100644 --- a/fs/debugfs/inode.c +++ 
b/fs/debugfs/inode.c @@ -135,17 +135,17 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent) return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); } -static int debug_get_sb(struct file_system_type *fs_type, +static struct dentry *debug_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_single(fs_type, flags, data, debug_fill_super, mnt); + return mount_single(fs_type, flags, data, debug_fill_super); } static struct file_system_type debug_fs_type = { .owner = THIS_MODULE, .name = "debugfs", - .get_sb = debug_get_sb, + .mount = debug_mount, .kill_sb = kill_litter_super, }; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 8b3ffd5b5235..1bb547c9cad6 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -331,7 +331,7 @@ static int compare_init_pts_sb(struct super_block *s, void *p) } /* - * devpts_get_sb() + * devpts_mount() * * If the '-o newinstance' mount option was specified, mount a new * (private) instance of devpts. PTYs created in this instance are @@ -345,20 +345,20 @@ static int compare_init_pts_sb(struct super_block *s, void *p) * semantics in devpts while preserving backward compatibility of the * current 'single-namespace' semantics. i.e all mounts of devpts * without the 'newinstance' mount option should bind to the initial - * kernel mount, like get_sb_single(). + * kernel mount, like mount_single(). * * Mounts with 'newinstance' option create a new, private namespace. * * NOTE: * - * For single-mount semantics, devpts cannot use get_sb_single(), - * because get_sb_single()/sget() find and use the super-block from + * For single-mount semantics, devpts cannot use mount_single(), + * because mount_single()/sget() find and use the super-block from * the most recent mount of devpts. 
But that recent mount may be a - * 'newinstance' mount and get_sb_single() would pick the newinstance + * 'newinstance' mount and mount_single() would pick the newinstance * super-block instead of the initial super-block. */ -static int devpts_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *devpts_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { int error; struct pts_mount_opts opts; @@ -366,7 +366,7 @@ static int devpts_get_sb(struct file_system_type *fs_type, error = parse_mount_options(data, PARSE_MOUNT, &opts); if (error) - return error; + return ERR_PTR(error); if (opts.newinstance) s = sget(fs_type, NULL, set_anon_super, NULL); @@ -374,7 +374,7 @@ static int devpts_get_sb(struct file_system_type *fs_type, s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); if (IS_ERR(s)) - return PTR_ERR(s); + return ERR_CAST(s); if (!s->s_root) { s->s_flags = flags; @@ -390,13 +390,11 @@ static int devpts_get_sb(struct file_system_type *fs_type, if (error) goto out_undo_sget; - simple_set_mnt(mnt, s); - - return 0; + return dget(s->s_root); out_undo_sget: deactivate_locked_super(s); - return error; + return ERR_PTR(error); } #else @@ -404,10 +402,10 @@ out_undo_sget: * This supports only the legacy single-instance semantics (no * multiple-instance semantics) */ -static int devpts_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); + return mount_single(fs_type, flags, data, devpts_fill_super); } #endif @@ -421,7 +419,7 @@ static void devpts_kill_sb(struct super_block *sb) static struct file_system_type devpts_fs_type = { .name = "devpts", - .get_sb = devpts_get_sb, + .mount = devpts_mount, .kill_sb = 
devpts_kill_sb, }; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 40186b959429..413a3c48f0bb 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -377,6 +377,7 @@ struct ecryptfs_mount_crypt_stat { #define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 #define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 #define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 +#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY 0x00000080 u32 flags; struct list_head global_auth_tok_list; struct mutex global_auth_tok_list_mutex; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 3fbc94203380..9d1a22d62765 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -32,6 +32,7 @@ #include <linux/crypto.h> #include <linux/fs_stack.h> #include <linux/slab.h> +#include <linux/xattr.h> #include <asm/unaligned.h> #include "ecryptfs_kernel.h" @@ -70,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); struct dentry *dentry_save; struct vfsmount *vfsmount_save; + unsigned int flags_save; int rc; dentry_save = nd->path.dentry; vfsmount_save = nd->path.mnt; + flags_save = nd->flags; nd->path.dentry = lower_dentry; nd->path.mnt = lower_mnt; + nd->flags &= ~LOOKUP_OPEN; rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); nd->path.dentry = dentry_save; nd->path.mnt = vfsmount_save; + nd->flags = flags_save; return rc; } @@ -1108,10 +1113,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value, - size, flags); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + + rc = vfs_setxattr(lower_dentry, name, value, size, flags); out: return rc; } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 73811cfa2ea4..b1f6858a5223 100644 --- a/fs/ecryptfs/keystore.c +++ 
b/fs/ecryptfs/keystore.c @@ -446,6 +446,7 @@ out: */ static int ecryptfs_find_auth_tok_for_sig( + struct key **auth_tok_key, struct ecryptfs_auth_tok **auth_tok, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) @@ -453,12 +454,21 @@ ecryptfs_find_auth_tok_for_sig( struct ecryptfs_global_auth_tok *global_auth_tok; int rc = 0; + (*auth_tok_key) = NULL; (*auth_tok) = NULL; if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, mount_crypt_stat, sig)) { - struct key *auth_tok_key; - rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, + /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the + * mount_crypt_stat structure, we prevent to use auth toks that + * are not inserted through the ecryptfs_add_global_auth_tok + * function. + */ + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY) + return -EINVAL; + + rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok, sig); } else (*auth_tok) = global_auth_tok->global_auth_tok; @@ -509,6 +519,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, char *filename, size_t filename_size) { struct ecryptfs_write_tag_70_packet_silly_stack *s; + struct key *auth_tok_key = NULL; int rc = 0; s = kmalloc(sizeof(*s), GFP_KERNEL); @@ -606,6 +617,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, } dest[s->i++] = s->cipher_code; rc = ecryptfs_find_auth_tok_for_sig( + &auth_tok_key, &s->auth_tok, mount_crypt_stat, mount_crypt_stat->global_default_fnek_sig); if (rc) { @@ -753,6 +765,8 @@ out_free_unlock: out_unlock: mutex_unlock(s->tfm_mutex); out: + if (auth_tok_key) + key_put(auth_tok_key); kfree(s); return rc; } @@ -798,6 +812,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, char *data, size_t max_packet_size) { struct ecryptfs_parse_tag_70_packet_silly_stack *s; + struct key *auth_tok_key = NULL; int rc = 0; (*packet_size) = 0; @@ -910,7 +925,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t 
*filename_size, * >= ECRYPTFS_MAX_IV_BYTES. */ memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); s->desc.info = s->iv; - rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &s->auth_tok, mount_crypt_stat, s->fnek_sig_hex); if (rc) { printk(KERN_ERR "%s: Error attempting to find auth tok for " @@ -986,6 +1002,8 @@ out: (*filename_size) = 0; (*filename) = NULL; } + if (auth_tok_key) + key_put(auth_tok_key); kfree(s); return rc; } @@ -1557,14 +1575,19 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, ECRYPTFS_VERSION_MAJOR, ECRYPTFS_VERSION_MINOR); rc = -EINVAL; - goto out; + goto out_release_key; } if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { printk(KERN_ERR "Invalid auth_tok structure " "returned from key query\n"); rc = -EINVAL; - goto out; + goto out_release_key; + } +out_release_key: + if (rc) { + key_put(*auth_tok_key); + (*auth_tok_key) = NULL; } out: return rc; @@ -1688,6 +1711,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_auth_tok_list_item *auth_tok_list_item; size_t tag_11_contents_size; size_t tag_11_packet_size; + struct key *auth_tok_key = NULL; int rc = 0; INIT_LIST_HEAD(&auth_tok_list); @@ -1784,6 +1808,10 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, * just one will be sufficient to decrypt to get the FEK. 
*/ find_next_matching_auth_tok: found_auth_tok = 0; + if (auth_tok_key) { + key_put(auth_tok_key); + auth_tok_key = NULL; + } list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { candidate_auth_tok = &auth_tok_list_item->auth_tok; if (unlikely(ecryptfs_verbosity > 0)) { @@ -1800,10 +1828,11 @@ find_next_matching_auth_tok: rc = -EINVAL; goto out_wipe_list; } - ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &matching_auth_tok, crypt_stat->mount_crypt_stat, candidate_auth_tok_sig); - if (matching_auth_tok) { + if (!rc) { found_auth_tok = 1; goto found_matching_auth_tok; } @@ -1866,6 +1895,8 @@ found_matching_auth_tok: out_wipe_list: wipe_auth_tok_list(&auth_tok_list); out: + if (auth_tok_key) + key_put(auth_tok_key); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index cbd4e18adb20..a9dbd62518e6 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -208,7 +208,8 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, - ecryptfs_opt_unlink_sigs, ecryptfs_opt_err }; + ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, + ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, @@ -223,6 +224,7 @@ static const match_table_t tokens = { {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, + {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, {ecryptfs_opt_err, NULL} }; @@ -406,6 +408,10 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) case ecryptfs_opt_unlink_sigs: mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; break; + case ecryptfs_opt_mount_auth_tok_only: + mount_crypt_stat->flags |= + 
ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; + break; case ecryptfs_opt_err: default: printk(KERN_WARNING @@ -540,9 +546,8 @@ out: * ecryptfs_interpose to perform most of the linking * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) */ -static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data, - struct vfsmount *mnt) +static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { struct super_block *s; struct ecryptfs_sb_info *sbi; @@ -607,8 +612,7 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, err = "Reading sb failed"; goto out; } - simple_set_mnt(mnt, s); - return 0; + return dget(s->s_root); out: if (sbi) { @@ -616,7 +620,7 @@ out: kmem_cache_free(ecryptfs_sb_info_cache, sbi); } printk(KERN_ERR "%s; rc = [%d]\n", err, rc); - return rc; + return ERR_PTR(rc); } /** @@ -639,7 +643,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb) static struct file_system_type ecryptfs_fs_type = { .owner = THIS_MODULE, .name = "ecryptfs", - .get_sb = ecryptfs_get_sb, + .mount = ecryptfs_mount, .kill_sb = ecryptfs_kill_block_super, .fs_flags = 0 }; diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index f7fc286a3aa9..253732382d37 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -180,6 +180,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",ecryptfs_encrypted_view"); if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) seq_printf(m, ",ecryptfs_unlink_sigs"); + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY) + seq_printf(m, ",ecryptfs_mount_auth_tok_only"); return 0; } diff --git a/fs/efs/super.c b/fs/efs/super.c index f04942810818..5073a07652cc 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -20,16 +20,16 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); static int efs_fill_super(struct super_block *s, 
void *d, int silent); -static int efs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *efs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); } static struct file_system_type efs_fs_type = { .owner = THIS_MODULE, .name = "efs", - .get_sb = efs_get_sb, + .mount = efs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 256bb7bb102a..8cf07242067d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -77,9 +77,6 @@ /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 -/* Maximum msec timeout value storeable in a long int */ -#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) - #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) #define EP_UNACTIVE_PTR ((void *) -1L) @@ -1117,18 +1114,22 @@ static int ep_send_events(struct eventpoll *ep, static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout) { - int res, eavail; + int res, eavail, timed_out = 0; unsigned long flags; - long jtimeout; + long slack; wait_queue_t wait; - - /* - * Calculate the timeout by checking for the "infinite" value (-1) - * and the overflow condition. The passed timeout is in milliseconds, - * that why (t * HZ) / 1000. - */ - jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ? 
- MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000; + struct timespec end_time; + ktime_t expires, *to = NULL; + + if (timeout > 0) { + ktime_get_ts(&end_time); + timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC); + slack = select_estimate_accuracy(&end_time); + to = &expires; + *to = timespec_to_ktime(end_time); + } else if (timeout == 0) { + timed_out = 1; + } retry: spin_lock_irqsave(&ep->lock, flags); @@ -1150,7 +1151,7 @@ retry: * to TASK_INTERRUPTIBLE before doing the checks. */ set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&ep->rdllist) || !jtimeout) + if (!list_empty(&ep->rdllist) || timed_out) break; if (signal_pending(current)) { res = -EINTR; @@ -1158,7 +1159,9 @@ retry: } spin_unlock_irqrestore(&ep->lock, flags); - jtimeout = schedule_timeout(jtimeout); + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + timed_out = 1; + spin_lock_irqsave(&ep->lock, flags); } __remove_wait_queue(&ep->wq, &wait); @@ -1176,7 +1179,7 @@ retry: * more luck. */ if (!res && eavail && - !(res = ep_send_events(ep, events, maxevents)) && jtimeout) + !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto retry; return res; diff --git a/fs/exec.c b/fs/exec.c index 3aa75b8888a1..99d33a1371e9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -66,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core"; unsigned int core_pipe_limit; int suid_dumpable = 0; +struct core_name { + char *corename; + int used, size; +}; +static atomic_t call_count = ATOMIC_INIT(1); + /* The maximal length of core_pattern is also specified in sysctl.c */ static LIST_HEAD(formats); @@ -1003,7 +1009,7 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ - current->flags &= ~PF_RANDOMIZE; + current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD); flush_thread(); current->personality &= ~bprm->per_clear; @@ -1083,14 +1089,14 @@ EXPORT_SYMBOL(setup_new_exec); */ int prepare_bprm_creds(struct linux_binprm *bprm) { - if 
(mutex_lock_interruptible(&current->cred_guard_mutex)) + if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; - mutex_unlock(&current->cred_guard_mutex); + mutex_unlock(&current->signal->cred_guard_mutex); return -ENOMEM; } @@ -1098,7 +1104,7 @@ void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { - mutex_unlock(&current->cred_guard_mutex); + mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } kfree(bprm); @@ -1119,13 +1125,13 @@ void install_exec_creds(struct linux_binprm *bprm) * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); - mutex_unlock(&current->cred_guard_mutex); + mutex_unlock(&current->signal->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); /* * determine how safe it is to execute the proposed program - * - the caller must hold current->cred_guard_mutex to protect against + * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH */ int check_unsafe_exec(struct linux_binprm *bprm) @@ -1406,7 +1412,6 @@ int do_execve(const char * filename, if (retval < 0) goto out; - current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); if (retval < 0) goto out; @@ -1459,127 +1464,148 @@ void set_binfmt(struct linux_binfmt *new) EXPORT_SYMBOL(set_binfmt); +static int expand_corename(struct core_name *cn) +{ + char *old_corename = cn->corename; + + cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); + cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); + + if (!cn->corename) { + kfree(old_corename); + return -ENOMEM; + } + + return 0; +} + +static int cn_printf(struct core_name *cn, const char *fmt, ...) 
+{ + char *cur; + int need; + int ret; + va_list arg; + + va_start(arg, fmt); + need = vsnprintf(NULL, 0, fmt, arg); + va_end(arg); + + if (likely(need < cn->size - cn->used - 1)) + goto out_printf; + + ret = expand_corename(cn); + if (ret) + goto expand_fail; + +out_printf: + cur = cn->corename + cn->used; + va_start(arg, fmt); + vsnprintf(cur, need + 1, fmt, arg); + va_end(arg); + cn->used += need; + return 0; + +expand_fail: + return ret; +} + /* format_corename will inspect the pattern parameter, and output a * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(char *corename, long signr) +static int format_corename(struct core_name *cn, long signr) { const struct cred *cred = current_cred(); const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); - char *out_ptr = corename; - char *const out_end = corename + CORENAME_MAX_SIZE; - int rc; int pid_in_pattern = 0; + int err = 0; + + cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); + cn->corename = kmalloc(cn->size, GFP_KERNEL); + cn->used = 0; + + if (!cn->corename) + return -ENOMEM; /* Repeat as long as we have more pattern to process and more output space */ while (*pat_ptr) { if (*pat_ptr != '%') { - if (out_ptr == out_end) + if (*pat_ptr == 0) goto out; - *out_ptr++ = *pat_ptr++; + err = cn_printf(cn, "%c", *pat_ptr++); } else { switch (*++pat_ptr) { + /* single % at the end, drop that */ case 0: goto out; /* Double percent, output one percent */ case '%': - if (out_ptr == out_end) - goto out; - *out_ptr++ = '%'; + err = cn_printf(cn, "%c", '%'); break; /* pid */ case 'p': pid_in_pattern = 1; - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", task_tgid_vnr(current)); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%d", + task_tgid_vnr(current)); break; /* uid */ case 'u': - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", cred->uid); - if (rc > out_end - 
out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%d", cred->uid); break; /* gid */ case 'g': - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", cred->gid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%d", cred->gid); break; /* signal that caused the coredump */ case 's': - rc = snprintf(out_ptr, out_end - out_ptr, - "%ld", signr); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%ld", signr); break; /* UNIX time of coredump */ case 't': { struct timeval tv; do_gettimeofday(&tv); - rc = snprintf(out_ptr, out_end - out_ptr, - "%lu", tv.tv_sec); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%lu", tv.tv_sec); break; } /* hostname */ case 'h': down_read(&uts_sem); - rc = snprintf(out_ptr, out_end - out_ptr, - "%s", utsname()->nodename); + err = cn_printf(cn, "%s", + utsname()->nodename); up_read(&uts_sem); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; break; /* executable */ case 'e': - rc = snprintf(out_ptr, out_end - out_ptr, - "%s", current->comm); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%s", current->comm); break; /* core limit size */ case 'c': - rc = snprintf(out_ptr, out_end - out_ptr, - "%lu", rlimit(RLIMIT_CORE)); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%lu", + rlimit(RLIMIT_CORE)); break; default: break; } ++pat_ptr; } + + if (err) + return err; } + /* Backward compatibility with core_uses_pid: * * If core_pattern does not include a %p (as is the default) * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. 
*/ if (!ispipe && !pid_in_pattern && core_uses_pid) { - rc = snprintf(out_ptr, out_end - out_ptr, - ".%d", task_tgid_vnr(current)); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, ".%d", task_tgid_vnr(current)); + if (err) + return err; } out: - *out_ptr = 0; return ispipe; } @@ -1856,7 +1882,7 @@ static int umh_pipe_setup(struct subprocess_info *info) void do_coredump(long signr, int exit_code, struct pt_regs *regs) { struct core_state core_state; - char corename[CORENAME_MAX_SIZE + 1]; + struct core_name cn; struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; const struct cred *old_cred; @@ -1911,7 +1937,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) */ clear_thread_flag(TIF_SIGPENDING); - ispipe = format_corename(corename, signr); + ispipe = format_corename(&cn, signr); + + if (ispipe == -ENOMEM) { + printk(KERN_WARNING "format_corename failed\n"); + printk(KERN_WARNING "Aborting core\n"); + goto fail_corename; + } if (ispipe) { int dump_count; @@ -1948,7 +1980,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) goto fail_dropcount; } - helper_argv = argv_split(GFP_KERNEL, corename+1, NULL); + helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); if (!helper_argv) { printk(KERN_WARNING "%s failed to allocate memory\n", __func__); @@ -1961,7 +1993,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) argv_free(helper_argv); if (retval) { printk(KERN_INFO "Core dump to %s pipe failed\n", - corename); + cn.corename); goto close_fail; } } else { @@ -1970,7 +2002,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) if (cprm.limit < binfmt->min_coredump) goto fail_unlock; - cprm.file = filp_open(corename, + cprm.file = filp_open(cn.corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); if (IS_ERR(cprm.file)) @@ -2012,6 +2044,8 @@ fail_dropcount: if (ispipe) atomic_dec(&core_dump_count); fail_unlock: + kfree(cn.corename); 
+fail_corename: coredump_finish(mm); revert_creds(old_cred); fail_creds: diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 047e92fa3af8..79c3ae6e0456 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -659,19 +659,19 @@ free_bdi: /* * Set up the superblock (calls exofs_fill_super eventually) */ -static int exofs_get_sb(struct file_system_type *type, +static struct dentry *exofs_mount(struct file_system_type *type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { struct exofs_mountopt opts; int ret; ret = parse_options(data, &opts); if (ret) - return ret; + return ERR_PTR(ret); opts.dev_name = dev_name; - return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt); + return mount_nodev(type, flags, &opts, exofs_fill_super); } /* @@ -809,7 +809,7 @@ static const struct export_operations exofs_export_ops = { static struct file_system_type exofs_type = { .owner = THIS_MODULE, .name = "exofs", - .get_sb = exofs_get_sb, + .mount = exofs_mount, .kill_sb = generic_shutdown_super, }; diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index c6c684b44ea1..0d06f4e75699 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -646,10 +646,9 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks) return here; } -/* +/** * ext2_try_to_allocate() * @sb: superblock - * @handle: handle to this transaction * @group: given allocation block group * @bitmap_bh: bufferhead holds the block bitmap * @grp_goal: given target block within the group diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 0901320671da..d89e0b6a2d78 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1356,10 +1356,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) return 0; } -static int ext2_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ext2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - 
return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super); } #ifdef CONFIG_QUOTA @@ -1473,7 +1473,7 @@ out: static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", - .get_sb = ext2_get_sb, + .mount = ext2_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 4a32511f4ded..b3db22649426 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -792,9 +792,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, if (here < 0) here = 0; - p = ((char *)bh->b_data) + (here >> 3); + p = bh->b_data + (here >> 3); r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); - next = (r - ((char *)bh->b_data)) << 3; + next = (r - bh->b_data) << 3; if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) return next; @@ -810,8 +810,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, /** * claim_block() + * @lock: the spin lock for this block group * @block: the free block (group relative) to allocate - * @bh: the bufferhead containts the block group bitmap + * @bh: the buffer_head contains the block group bitmap * * We think we can allocate this block in this bitmap. Try to set the bit. * If that succeeds then check that nobody has allocated and then freed the @@ -956,9 +957,11 @@ fail_access: * but we will shift to the place where start_block is, * then start from there, when looking for a reservable space. 
* - * @size: the target new reservation window size + * @my_rsv: the reservation window * - * @group_first_block: the first block we consider to start + * @sb: the super block + * + * @start_block: the first block we consider to start * the real search from * * @last_block: @@ -1084,7 +1087,7 @@ static int find_next_reservable_window( * * failed: we failed to find a reservation window in this group * - * @rsv: the reservation + * @my_rsv: the reservation window * * @grp_goal: The goal (group-relative). It is where the search for a * free reservable space should start from. @@ -1273,8 +1276,8 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, * @group: given allocation block group * @bitmap_bh: bufferhead holds the block bitmap * @grp_goal: given target block within the group - * @count: target number of blocks to allocate * @my_rsv: reservation window + * @count: target number of blocks to allocate * @errp: pointer to store the error code * * This is the main function used to allocate a new block and its reservation diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 4ab72db3559e..9724aef22460 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -570,9 +570,14 @@ got: ei->i_state_flags = 0; ext3_set_inode_state(inode, EXT3_STATE_NEW); - ei->i_extra_isize = - (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 
- sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; + /* See comment in ext3_iget for explanation */ + if (ino >= EXT3_FIRST_INO(sb) + 1 && + EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = + sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE; + } else { + ei->i_extra_isize = 0; + } ret = inode; dquot_initialize(inode); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ad05353040a1..a9580617edd2 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -498,7 +498,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, } /** - * ext3_blks_to_allocate: Look up the block map and count the number + * ext3_blks_to_allocate - Look up the block map and count the number * of direct blocks need to be allocated for the given branch. * * @branch: chain of indirect blocks @@ -536,14 +536,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, } /** - * ext3_alloc_blocks: multiple allocate blocks needed for a branch + * ext3_alloc_blocks - multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: owner + * @goal: preferred place for allocation * @indirect_blks: the number of blocks need to allocate for indirect * blocks - * + * @blks: number of blocks need to allocated for direct blocks * @new_blocks: on return it will store the new block numbers for * the indirect blocks(if needed) and the first direct block, - * @blks: on return it will store the total number of allocated - * direct blocks + * @err: here we store the error value + * + * return the number of direct blocks allocated */ static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t goal, int indirect_blks, int blks, @@ -598,9 +602,11 @@ failed_out: /** * ext3_alloc_branch - allocate and set up a chain of blocks. 
+ * @handle: handle for this transaction * @inode: owner * @indirect_blks: number of allocated indirect blocks * @blks: number of allocated direct blocks + * @goal: preferred place for allocation * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * @@ -700,10 +706,9 @@ failed: /** * ext3_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction * @inode: owner * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext3_alloc_branch) * @where: location of missing link * @num: number of indirect blocks we are adding * @blks: number of direct blocks we are adding @@ -2530,7 +2535,6 @@ void ext3_truncate(struct inode *inode) */ } else { /* Shared branch grows from an indirect block */ - BUFFER_TRACE(partial->bh, "get_write_access"); ext3_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 0ccd7b12b73c..e746d30b1232 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -977,7 +977,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, o_blocks_count = le32_to_cpu(es->s_blocks_count); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", + printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK + " upto "E3FSBLK" blocks\n", o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) @@ -985,7 +986,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { printk(KERN_ERR "EXT3-fs: filesystem on %s:" - " too large to resize to %lu blocks safely\n", + " too large to resize to "E3FSBLK" blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) ext3_warning(sb, __func__, @@ -1065,11 +1066,11 @@ 
int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, es->s_blocks_count = cpu_to_le32(o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); mutex_unlock(&EXT3_SB(sb)->s_resize_lock); - ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, - o_blocks_count + add); + ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", + o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); - ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, - o_blocks_count + add); + ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", + o_blocks_count, o_blocks_count + add); if ((err = ext3_journal_stop(handle))) goto exit_put; if (test_opt(sb, DEBUG)) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 377768009106..2fedaf8b5012 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1301,9 +1301,9 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, ext3_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); - else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= - (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + le16_to_cpu(es->s_max_mnt_count)) ext3_msg(sb, KERN_WARNING, "warning: maximal mount count reached, " "running e2fsck is recommended"); @@ -1320,7 +1320,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, valid forever! 
:) */ es->s_state &= cpu_to_le16(~EXT3_VALID_FS); #endif - if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + if (!le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); es->s_mtime = cpu_to_le32(get_seconds()); @@ -1647,7 +1647,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) * Note: s_es must be initialized as soon as possible because * some ext3 macro-instructions depend on its value */ - es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext3_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT3_SUPER_MAGIC) @@ -1758,7 +1758,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) "error: can't read superblock on 2nd try"); goto failed_mount; } - es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); + es = (struct ext3_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { ext3_msg(sb, KERN_ERR, @@ -1857,13 +1857,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - le32_to_cpu(es->s_first_data_block) - 1) / EXT3_BLOCKS_PER_GROUP(sb)) + 1; - db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / - EXT3_DESC_PER_BLOCK(sb); + db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb)); sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { ext3_msg(sb, KERN_ERR, "error: not enough memory"); + ret = -ENOMEM; goto failed_mount; } @@ -1951,6 +1951,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } if (err) { ext3_msg(sb, KERN_ERR, "error: insufficient memory"); + ret = err; goto failed_mount3; } @@ -2159,7 +2160,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, goto out_bdev; 
} - es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext3_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { @@ -2352,6 +2353,21 @@ static int ext3_commit_super(struct super_block *sb, if (!sbh) return error; + + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext3_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock @@ -2368,8 +2384,15 @@ static int ext3_commit_super(struct super_block *sb, es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); - if (sync) + if (sync) { error = sync_dirty_buffer(sbh); + if (buffer_write_io_error(sbh)) { + ext3_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } return error; } @@ -2997,16 +3020,16 @@ out: #endif -static int ext3_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ext3_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super); } static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", - .get_sb = ext3_get_sb, + .mount = ext3_mount, .kill_sb = 
kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 8867b2a1e5fe..c947e36eda6c 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o -ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index bd30799a43ed..14c3af26c671 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * less than the blocksize * 8 ( which is the size * of bitmap ), set rest of the block bitmap to 1 */ - mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); + ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8, + bh->b_data); } return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); } @@ -489,7 +490,7 @@ error_return: * Check if filesystem has nblocks free & available for allocation. * On success return 1, return 0 on failure. 
*/ -int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) +static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) { s64 free_blocks, dirty_blocks, root_blocks; struct percpu_counter *fbc = &sbi->s_freeblocks_counter; diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 3db5084db9bd..fac90f3fba80 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -29,16 +29,15 @@ struct ext4_system_zone { static struct kmem_cache *ext4_system_zone_cachep; -int __init init_ext4_system_zone(void) +int __init ext4_init_system_zone(void) { - ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, - SLAB_RECLAIM_ACCOUNT); + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); if (ext4_system_zone_cachep == NULL) return -ENOMEM; return 0; } -void exit_ext4_system_zone(void) +void ext4_exit_system_zone(void) { kmem_cache_destroy(ext4_system_zone_cachep); } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 374510f72baa..ece76fb6a40c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp); const struct file_operations ext4_dir_operations = { - .llseek = generic_file_llseek, + .llseek = ext4_llseek, .read = generic_read_dir, .readdir = ext4_readdir, /* we take BKL. 
needed?*/ .unlocked_ioctl = ext4_ioctl, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 889ec9d5e6ad..6a5edea2d70b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -168,7 +168,20 @@ struct mpage_da_data { int pages_written; int retval; }; -#define EXT4_IO_UNWRITTEN 0x1 + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_ERROR 0x0002 + +struct ext4_io_page { + struct page *p_page; + atomic_t p_count; +}; + +#define MAX_IO_PAGES 128 + typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ @@ -179,8 +192,18 @@ typedef struct ext4_io_end { struct work_struct work; /* data work queue */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ + int num_io_pages; + struct ext4_io_page *pages[MAX_IO_PAGES]; } ext4_io_end_t; +struct ext4_io_submit { + int io_op; + struct bio *io_bio; + ext4_io_end_t *io_end; + struct ext4_io_page *io_page; + sector_t io_next_block; +}; + /* * Special inodes numbers */ @@ -205,6 +228,7 @@ typedef struct ext4_io_end { #define EXT4_MIN_BLOCK_SIZE 1024 #define EXT4_MAX_BLOCK_SIZE 65536 #define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) #else @@ -834,6 +858,7 @@ struct ext4_inode_info { spinlock_t i_completed_io_lock; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* * Transactions that contain inode's metadata needed to complete @@ -889,6 +914,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ #define clear_opt(o, opt) o &= 
~EXT4_MOUNT_##opt #define set_opt(o, opt) o |= EXT4_MOUNT_##opt @@ -1087,7 +1113,6 @@ struct ext4_sb_info { struct completion s_kobj_unregister; /* Journaling */ - struct inode *s_journal_inode; struct journal_s *s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; @@ -1120,10 +1145,7 @@ struct ext4_sb_info { /* for buddy allocator */ struct ext4_group_info ***s_group_info; struct inode *s_buddy_cache; - long s_blocks_reserved; - spinlock_t s_reserve_lock; spinlock_t s_md_lock; - tid_t s_last_transaction; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; @@ -1141,7 +1163,6 @@ struct ext4_sb_info { unsigned long s_mb_last_start; /* stats for buddy allocator */ - spinlock_t s_mb_pa_lock; atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ @@ -1172,6 +1193,11 @@ struct ext4_sb_info { /* timer for periodic error stats printing */ struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1533,7 +1559,42 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); -extern struct proc_dir_entry *ext4_proc_root; +/* + * Timeout and state flag for lazy initialization inode thread. 
+ */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + + wait_queue_head_t li_wait_daemon; + wait_queue_head_t li_wait_task; + struct timer_list li_timer; + struct task_struct *li_task; + + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; /* * Function prototypes @@ -1561,7 +1622,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb, extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); -extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); @@ -1605,11 +1665,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_check_inodes_bitmap(struct super_block *); -extern unsigned ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, 
int barrier); /* mballoc.c */ extern long ext4_mb_stats; @@ -1620,16 +1678,15 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *); -extern int __init init_ext4_mballoc(void); -extern void exit_ext4_mballoc(void); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); -extern void ext4_mb_put_buddy_cache_lock(struct super_block *, - ext4_group_t, int); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + /* inode.c */ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); @@ -1657,13 +1714,11 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); -extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); -extern int flush_completed_IO(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); /* ioctl.c */ @@ -1960,6 +2015,7 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const 
struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; @@ -1973,8 +2029,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations; /* block_validity */ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); -extern int __init init_ext4_system_zone(void); -extern void exit_ext4_system_zone(void); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); extern int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count); @@ -2002,6 +2058,18 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern void ext4_ioend_wait(struct inode *); +extern void ext4_free_io_end(ext4_io_end_t *io); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern int ext4_end_io_nolock(ext4_io_end_t *io); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc); /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ enum ext4_state_bits { diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index bdb6ce7e2eb4..28ce70fd9cd0 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -225,11 +225,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); } +/* + * ext4_ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) +{ + ext4_fsblk_t block; + + block = 
le32_to_cpu(ex->ee_start_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; +} + +/* + * ext4_idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ix->ei_leaf_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; + return block; +} + +/* + * ext4_ext_store_pblock: + * stores a large physical block number into an extent struct, + * breaking it into parts + */ +static inline void ext4_ext_store_pblock(struct ext4_extent *ex, + ext4_fsblk_t pb) +{ + ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts + */ +static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, + ext4_fsblk_t pb) +{ + ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + extern int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblocks); -extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); -extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); -extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, @@ -237,19 +286,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, extern int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2); -extern int ext4_ext_try_to_merge(struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *); -extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, 
struct ext4_ext_path *); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); -extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, - ext_prepare_callback, void *); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); -extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, - ext4_lblk_t *, ext4_fsblk_t *); -extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, - ext4_lblk_t *, ext4_fsblk_t *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 06328d3e5717..0554c48cb1fd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -44,55 +44,6 @@ #include "ext4_jbd2.h" #include "ext4_extents.h" - -/* - * ext_pblock: - * combine low and high parts of physical block number into ext4_fsblk_t - */ -ext4_fsblk_t ext_pblock(struct ext4_extent *ex) -{ - ext4_fsblk_t block; - - block = le32_to_cpu(ex->ee_start_lo); - block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; - return block; -} - -/* - * idx_pblock: - * combine low and high parts of a leaf physical block number into ext4_fsblk_t - */ -ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix) -{ - ext4_fsblk_t block; - - block = le32_to_cpu(ix->ei_leaf_lo); - block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; - return block; -} - -/* - * ext4_ext_store_pblock: - * stores a large physical block number into an extent struct, - * breaking it into parts - */ -void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) -{ - ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); - ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); -} - -/* - * ext4_idx_store_pblock: - * stores a large physical block number into an index struct, - * breaking it into parts 
- */ -static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) -{ - ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); - ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); -} - static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -169,7 +120,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, /* try to predict block placement */ ex = path[depth].p_ext; if (ex) - return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block)); + return (ext4_ext_pblock(ex) + + (block - le32_to_cpu(ex->ee_block))); /* it looks like index is empty; * try to find starting block from index itself */ @@ -354,7 +306,7 @@ ext4_ext_max_entries(struct inode *inode, int depth) static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { - ext4_fsblk_t block = ext_pblock(ext); + ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); @@ -363,7 +315,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { - ext4_fsblk_t block = idx_pblock(ext_idx); + ext4_fsblk_t block = ext4_idx_pblock(ext_idx); return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); } @@ -463,13 +415,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) for (k = 0; k <= l; k++, path++) { if (path->p_idx) { ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), - idx_pblock(path->p_idx)); + ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_uninitialized(path->p_ext), ext4_ext_get_actual_len(path->p_ext), - ext_pblock(path->p_ext)); + ext4_ext_pblock(path->p_ext)); } else ext_debug(" []"); } @@ -494,7 +446,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct 
ext4_ext_path *path) for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), ext_pblock(ex)); + ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug("\n"); } @@ -545,7 +497,7 @@ ext4_ext_binsearch_idx(struct inode *inode, path->p_idx = l - 1; ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), - idx_pblock(path->p_idx)); + ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH { @@ -614,7 +566,7 @@ ext4_ext_binsearch(struct inode *inode, path->p_ext = l - 1; ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), - ext_pblock(path->p_ext), + ext4_ext_pblock(path->p_ext), ext4_ext_is_uninitialized(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); @@ -682,7 +634,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); - path[ppos].p_block = idx_pblock(path[ppos].p_idx); + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); path[ppos].p_depth = i; path[ppos].p_ext = NULL; @@ -721,7 +673,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, ext4_ext_binsearch(inode, path + ppos, block); /* if not an empty leaf */ if (path[ppos].p_ext) - path[ppos].p_block = ext_pblock(path[ppos].p_ext); + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); @@ -739,9 +691,9 @@ err: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ -int ext4_ext_insert_index(handle_t *handle, struct inode *inode, - struct ext4_ext_path *curp, - int logical, ext4_fsblk_t ptr) +static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext4_ext_path *curp, + int logical, ext4_fsblk_t ptr) { struct ext4_extent_idx *ix; int len, err; @@ -917,7 +869,7 @@ static int ext4_ext_split(handle_t *handle, 
struct inode *inode, EXT_MAX_EXTENT(path[depth].p_hdr)) { ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(path[depth].p_ext->ee_block), - ext_pblock(path[depth].p_ext), + ext4_ext_pblock(path[depth].p_ext), ext4_ext_is_uninitialized(path[depth].p_ext), ext4_ext_get_actual_len(path[depth].p_ext), newblock); @@ -1007,7 +959,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ext_debug("%d: move %d:%llu in new index %llu\n", i, le32_to_cpu(path[i].p_idx->ei_block), - idx_pblock(path[i].p_idx), + ext4_idx_pblock(path[i].p_idx), newblock); /*memmove(++fidx, path[i].p_idx++, sizeof(struct ext4_extent_idx)); @@ -1146,7 +1098,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), - idx_pblock(EXT_FIRST_INDEX(neh))); + ext4_idx_pblock(EXT_FIRST_INDEX(neh))); neh->eh_depth = cpu_to_le16(path->p_depth + 1); err = ext4_ext_dirty(handle, inode, curp); @@ -1232,9 +1184,9 @@ out: * returns 0 at @phys * return value contains 0 (success) or error code */ -int -ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) +static int ext4_ext_search_left(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) { struct ext4_extent_idx *ix; struct ext4_extent *ex; @@ -1286,7 +1238,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; - *phys = ext_pblock(ex) + ee_len - 1; + *phys = ext4_ext_pblock(ex) + ee_len - 1; return 0; } @@ -1297,9 +1249,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, * returns 0 at @phys * return value contains 0 (success) or error code */ -int -ext4_ext_search_right(struct inode *inode, struct ext4_ext_path 
*path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) +static int ext4_ext_search_right(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1342,7 +1294,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, } } *logical = le32_to_cpu(ex->ee_block); - *phys = ext_pblock(ex); + *phys = ext4_ext_pblock(ex); return 0; } @@ -1357,7 +1309,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, /* next allocated block in this leaf */ ex++; *logical = le32_to_cpu(ex->ee_block); - *phys = ext_pblock(ex); + *phys = ext4_ext_pblock(ex); return 0; } @@ -1376,7 +1328,7 @@ got_index: * follow it and find the closest allocated * block to the right */ ix++; - block = idx_pblock(ix); + block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { bh = sb_bread(inode->i_sb, block); if (bh == NULL) @@ -1388,7 +1340,7 @@ got_index: return -EIO; } ix = EXT_FIRST_INDEX(eh); - block = idx_pblock(ix); + block = ext4_idx_pblock(ix); put_bh(bh); } @@ -1402,7 +1354,7 @@ got_index: } ex = EXT_FIRST_EXTENT(eh); *logical = le32_to_cpu(ex->ee_block); - *phys = ext_pblock(ex); + *phys = ext4_ext_pblock(ex); put_bh(bh); return 0; } @@ -1573,7 +1525,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, return 0; #endif - if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) + if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) return 1; return 0; } @@ -1585,9 +1537,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. 
*/ -int ext4_ext_try_to_merge(struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *ex) +static int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth, len; @@ -1632,9 +1584,9 @@ int ext4_ext_try_to_merge(struct inode *inode, * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ -unsigned int ext4_ext_check_overlap(struct inode *inode, - struct ext4_extent *newext, - struct ext4_ext_path *path) +static unsigned int ext4_ext_check_overlap(struct inode *inode, + struct ext4_extent *newext, + struct ext4_ext_path *path) { ext4_lblk_t b1, b2; unsigned int depth, len1; @@ -1706,11 +1658,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), ext_pblock(ex)); + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; @@ -1780,7 +1733,7 @@ has_space: /* there is no extent in this leaf, create first one */ ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), - ext_pblock(newext), + ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext)); path[depth].p_ext = EXT_FIRST_EXTENT(eh); @@ -1794,7 +1747,7 @@ has_space: ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), - ext_pblock(newext), + ext4_ext_pblock(newext), 
ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); @@ -1808,7 +1761,7 @@ has_space: ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), - ext_pblock(newext), + ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); @@ -1819,7 +1772,7 @@ has_space: le16_add_cpu(&eh->eh_entries, 1); nearex = path[depth].p_ext; nearex->ee_block = newext->ee_block; - ext4_ext_store_pblock(nearex, ext_pblock(newext)); + ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; merge: @@ -1845,9 +1798,9 @@ cleanup: return err; } -int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, - ext4_lblk_t num, ext_prepare_callback func, - void *cbdata) +static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) { struct ext4_ext_path *path = NULL; struct ext4_ext_cache cbex; @@ -1923,7 +1876,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, } else { cbex.ec_block = le32_to_cpu(ex->ee_block); cbex.ec_len = ext4_ext_get_actual_len(ex); - cbex.ec_start = ext_pblock(ex); + cbex.ec_start = ext4_ext_pblock(ex); cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } @@ -2073,7 +2026,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, /* free index block */ path--; - leaf = idx_pblock(path->p_idx); + leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); return -EIO; @@ -2181,7 +2134,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t start; num = le32_to_cpu(ex->ee_block) + ee_len - from; - start = ext_pblock(ex) + ee_len - num; + start = ext4_ext_pblock(ex) + ee_len - num; ext_debug("free last %u blocks starting %llu\n", num, start); 
ext4_free_blocks(handle, inode, 0, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) @@ -2310,7 +2263,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, goto out; ext_debug("new extent: %u:%u:%llu\n", block, num, - ext_pblock(ex)); + ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2421,9 +2374,9 @@ again: struct buffer_head *bh; /* go to the next level */ ext_debug("move to level %d (block %llu)\n", - i + 1, idx_pblock(path[i].p_idx)); + i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - bh = sb_bread(sb, idx_pblock(path[i].p_idx)); + bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); if (!bh) { /* should we reset i_size? */ err = -EIO; @@ -2535,77 +2488,21 @@ void ext4_ext_release(struct super_block *sb) #endif } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { + ext4_fsblk_t ee_pblock; + unsigned int ee_len; int ret; - struct bio *bio; - int blkbits, blocksize; - sector_t ee_pblock; - struct completion event; - unsigned int ee_len, len, done, offset; - - blkbits = inode->i_blkbits; - blocksize = inode->i_sb->s_blocksize; ee_len = ext4_ext_get_actual_len(ex); - ee_pblock = ext_pblock(ex); - - /* convert ee_pblock to 512 byte sectors */ - ee_pblock = ee_pblock << (blkbits - 9); - - while (ee_len > 0) { - - if (ee_len > BIO_MAX_PAGES) - len = BIO_MAX_PAGES; - else - len = ee_len; - - bio = bio_alloc(GFP_NOIO, len); - if (!bio) - return -ENOMEM; - - bio->bi_sector = ee_pblock; - bio->bi_bdev = inode->i_sb->s_bdev; - - done = 0; - offset = 0; - while (done < len) { - ret = bio_add_page(bio, ZERO_PAGE(0), - blocksize, offset); - if (ret != blocksize) { - /* - * We can't add any more pages because of - * hardware limitations. Start a new bio. 
- */ - break; - } - done++; - offset += blocksize; - if (offset >= PAGE_CACHE_SIZE) - offset = 0; - } + ee_pblock = ext4_ext_pblock(ex); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(WRITE, bio); - wait_for_completion(&event); + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); + if (ret > 0) + ret = 0; - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { - bio_put(bio); - return -EIO; - } - bio_put(bio); - ee_len -= done; - ee_pblock += done << (blkbits - 9); - } - return 0; + return ret; } #define EXT4_EXT_ZERO_LEN 7 @@ -2651,12 +2548,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); - newblock = map->m_lblk - ee_block + ext_pblock(ex); + newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); ex2 = ex; orig_ex.ee_block = ex->ee_block; orig_ex.ee_len = cpu_to_le16(ee_len); - ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); /* * It is safe to convert extent to initialized via explicit @@ -2675,7 +2572,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ return allocated; @@ -2710,7 +2607,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_block = orig_ex.ee_block; ex->ee_len = cpu_to_le16(ee_len - allocated); ext4_ext_mark_uninitialized(ex); - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); ex3 = &newex; @@ -2725,7 +2622,8 @@ static int ext4_ext_convert_to_initialized(handle_t 
*handle, goto fix_extent_len; ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, + ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* blocks available from map->m_lblk */ return allocated; @@ -2782,7 +2680,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ /* blocks available from map->m_lblk */ @@ -2833,7 +2731,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ /* blocks available from map->m_lblk */ @@ -2902,7 +2800,7 @@ insert: /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ return allocated; @@ -2915,7 +2813,7 @@ out: fix_extent_len: ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_mark_uninitialized(ex); ext4_ext_dirty(handle, inode, path + depth); return err; @@ -2973,12 +2871,12 @@ static int ext4_split_unwritten_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - 
ee_block); - newblock = map->m_lblk - ee_block + ext_pblock(ex); + newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); ex2 = ex; orig_ex.ee_block = ex->ee_block; orig_ex.ee_len = cpu_to_le16(ee_len); - ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); /* * It is safe to convert extent to initialized via explicit @@ -3027,7 +2925,7 @@ static int ext4_split_unwritten_extents(handle_t *handle, /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ /* blocks available from map->m_lblk */ @@ -3099,7 +2997,7 @@ insert: /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ return allocated; @@ -3112,7 +3010,7 @@ out: fix_extent_len: ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_mark_uninitialized(ex); ext4_ext_dirty(handle, inode, path + depth); return err; @@ -3180,6 +3078,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev, unmap_underlying_metadata(bdev, block + i); } +/* + * Handle EOFBLOCKS_FL flag, clearing it if necessary + */ +static int check_eofblocks_fl(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, + unsigned int len) +{ + int i, depth; + struct ext4_extent_header *eh; + struct ext4_extent *ex, *last_ex; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + return 0; + + depth = ext_depth(inode); + eh = 
path[depth].p_hdr; + ex = path[depth].p_ext; + + if (unlikely(!eh->eh_entries)) { + EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " + "EOFBLOCKS_FL set"); + return -EIO; + } + last_ex = EXT_LAST_EXTENT(eh); + /* + * We should clear the EOFBLOCKS_FL flag if we are writing the + * last block in the last extent in the file. We test this by + * first checking to see if the caller to + * ext4_ext_get_blocks() was interested in the last block (or + * a block beyond the last block) in the current extent. If + * this turns out to be false, we can bail out from this + * function immediately. + */ + if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + return 0; + /* + * If the caller does appear to be planning to write at or + * beyond the end of the current extent, we then test to see + * if the current extent is the last extent in the file, by + * checking to make sure it was reached via the rightmost node + * at each level of the tree. + */ + for (i = depth-1; i >= 0; i--) + if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) + return 0; + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + return ext4_mark_inode_dirty(handle, inode); +} + static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3206,7 +3155,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * completed */ if (io) - io->flag = EXT4_IO_UNWRITTEN; + io->flag = EXT4_IO_END_UNWRITTEN; else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) @@ -3217,8 +3166,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if ((flags & EXT4_GET_BLOCKS_CONVERT)) { ret = ext4_convert_unwritten_extents_endio(handle, inode, path); - if (ret >= 0) + if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map, path, + map->m_len); + } else + err = ret; goto out2; } /* 
buffered IO case */ @@ -3244,8 +3197,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, /* buffered write, writepage time, convert*/ ret = ext4_ext_convert_to_initialized(handle, inode, map, path); - if (ret >= 0) + if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map, path, map->m_len); + if (err < 0) + goto out2; + } + out: if (ret <= 0) { err = ret; @@ -3292,6 +3250,7 @@ out2: } return err ? err : allocated; } + /* * Block allocation/map/preallocation routine for extents based files * @@ -3315,9 +3274,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; - struct ext4_extent newex, *ex, *last_ex; + struct ext4_extent newex, *ex; ext4_fsblk_t newblock; - int i, err = 0, depth, ret, cache_type; + int err = 0, depth, ret, cache_type; unsigned int allocated = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; @@ -3341,7 +3300,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* block is already allocated */ newblock = map->m_lblk - le32_to_cpu(newex.ee_block) - + ext_pblock(&newex); + + ext4_ext_pblock(&newex); /* number of remaining blocks in the extent */ allocated = ext4_ext_get_actual_len(&newex) - (map->m_lblk - le32_to_cpu(newex.ee_block)); @@ -3379,7 +3338,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ex = path[depth].p_ext; if (ex) { ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); - ext4_fsblk_t ee_start = ext_pblock(ex); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; /* @@ -3488,7 +3447,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { if (io) - io->flag = EXT4_IO_UNWRITTEN; + io->flag = EXT4_IO_END_UNWRITTEN; else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); @@ -3497,44 +3456,23 @@ int ext4_ext_map_blocks(handle_t 
*handle, struct inode *inode, map->m_flags |= EXT4_MAP_UNINIT; } - if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { - if (unlikely(!eh->eh_entries)) { - EXT4_ERROR_INODE(inode, - "eh->eh_entries == 0 and " - "EOFBLOCKS_FL set"); - err = -EIO; - goto out2; - } - last_ex = EXT_LAST_EXTENT(eh); - /* - * If the current leaf block was reached by looking at - * the last index block all the way down the tree, and - * we are extending the inode beyond the last extent - * in the current leaf block, then clear the - * EOFBLOCKS_FL flag. - */ - for (i = depth-1; i >= 0; i--) { - if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) - break; - } - if ((i < 0) && - (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) + - ext4_ext_get_actual_len(last_ex))) - ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - } + err = check_eofblocks_fl(handle, inode, map, path, ar.len); + if (err) + goto out2; + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), + ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex), ext4_ext_get_actual_len(&newex), 0); goto out2; } /* previous routine could use block we allocated */ - newblock = ext_pblock(&newex); + newblock = ext4_ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); if (allocated > map->m_len) allocated = map->m_len; @@ -3729,7 +3667,7 @@ retry: printk(KERN_ERR "%s: ext4_ext_map_blocks " "returned error inode#%lu, block=%u, " "max_blocks=%u", __func__, - inode->i_ino, block, max_blocks); + inode->i_ino, map.m_lblk, max_blocks); #endif ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ee92b66d4558..5a5c55ddceef 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -130,8 
+130,50 @@ static int ext4_file_open(struct inode * inode, struct file * filp) return dquot_file_open(inode, filp); } +/* + * ext4_llseek() copied from generic_file_llseek() to handle both + * block-mapped and extent-mapped maxbytes values. This should + * otherwise be identical with generic_file_llseek(). + */ +loff_t ext4_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + else + maxbytes = inode->i_sb->s_maxbytes; + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + if (offset == 0) { + mutex_unlock(&inode->i_mutex); + return file->f_pos; + } + offset += file->f_pos; + break; + } + + if (offset < 0 || offset > maxbytes) { + mutex_unlock(&inode->i_mutex); + return -EINVAL; + } + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + mutex_unlock(&inode->i_mutex); + + return offset; +} + const struct file_operations ext4_file_operations = { - .llseek = generic_file_llseek, + .llseek = ext4_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 3f3ff5ee8f9d..c1a7bc923cf6 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -34,6 +34,89 @@ #include <trace/events/ext4.h> +static void dump_completed_IO(struct inode * inode) +{ +#ifdef EXT4_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + unsigned long flags; + + if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ + ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ + cur = &io->list; + 
before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); +#endif +} + +/* + * This function is called from ext4_sync_file(). + * + * When IO is completed, the work to convert unwritten extents to + * written is queued on workqueue but may not get immediately + * scheduled. When fsync is called, we need to ensure the + * conversion is complete before fsync returns. + * The inode keeps track of a list of pending/completed IO that + * might need to do the conversion. This function walks through + * the list and converts the related unwritten extents for completed IO + * to written. + * The function returns 0 on success or a negative error on failure. + */ +static int flush_completed_IO(struct inode *inode) +{ + ext4_io_end_t *io; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret = 0; + int ret2 = 0; + + if (list_empty(&ei->i_completed_io_list)) + return ret; + + dump_completed_IO(inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&ei->i_completed_io_list)){ + io = list_entry(ei->i_completed_io_list.next, + ext4_io_end_t, list); + /* + * Calling ext4_end_io_nolock() to convert completed + * IO to written. + * + * When ext4_sync_file() is called, run_queue() may already be + * about to flush the work corresponding to this io structure. + * It will be upset if it finds the io structure related + * to the work-to-be scheduled is freed. + * + * Thus we need to keep the io structure still valid here after + * conversion finished. The io structure has a flag to + * avoid double converting from both fsync and background work + * queue work. 
+ */ + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + ret = ext4_end_io_nolock(io); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (ret < 0) + ret2 = ret; + else + list_del_init(&io->list); + } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + return (ret2 < 0) ? ret2 : 0; +} + /* * If we're not journaling and this is a just-created file, we have to * sync our parent directory (if it was freshly created) since diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 45853e0d1f21..1ce240a23ebb 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -50,7 +50,7 @@ * need to use it within a single byte (to ensure we get endianness right). * We can use memset for the rest of the bitmap as there are no other users. */ -void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) { int i; @@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) } /* Initializes an uninitialized inode bitmap */ -unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +static unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); return EXT4_INODES_PER_GROUP(sb); @@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return NULL; + bitmap_blk = ext4_inode_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { @@ -123,6 +125,7 @@ 
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); @@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } ext4_unlock_group(sb, block_group); + if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -411,8 +415,8 @@ struct orlov_stats { * for a particular block group or flex_bg. If flex_size is 1, then g * is a block group number; otherwise it is flex_bg number. */ -void get_orlov_stats(struct super_block *sb, ext4_group_t g, - int flex_size, struct orlov_stats *stats) +static void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) { struct ext4_group_desc *desc; struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; @@ -712,8 +716,17 @@ static int ext4_claim_inode(struct super_block *sb, { int free = 0, retval = 0, count; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + /* + * We have to be sure that new inode allocation does not race with + * inode table initialization, because otherwise we may end up + * allocating and writing new inode right before sb_issue_zeroout + * takes place and overwriting our new inode with zeroes. So we + * take alloc_sem to prevent it. 
+ */ + down_read(&grp->alloc_sem); ext4_lock_group(sb, group); if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ @@ -724,6 +737,7 @@ static int ext4_claim_inode(struct super_block *sb, if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { ext4_unlock_group(sb, group); + up_read(&grp->alloc_sem); ext4_error(sb, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, ino + group * EXT4_INODES_PER_GROUP(sb)); @@ -772,6 +786,7 @@ static int ext4_claim_inode(struct super_block *sb, gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: ext4_unlock_group(sb, group); + up_read(&grp->alloc_sem); return retval; } @@ -1205,3 +1220,109 @@ unsigned long ext4_count_dirs(struct super_block * sb) } return count; } + +/* + * Zeroes not yet zeroed inode table - just write zeroes through the whole + * inode table. Must be called without any spinlock held. The only place + * where it is called from on active part of filesystem is ext4lazyinit + * thread, so we do not need any special locks, however we have to prevent + * inode allocation from the current group, so we take alloc_sem lock, to + * block ext4_claim_inode until we are finished. + */ +extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, + int barrier) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *group_desc_bh; + handle_t *handle; + ext4_fsblk_t blk; + int num, ret = 0, used_blks = 0; + + /* This should not happen, but just to be sure check this */ + if (sb->s_flags & MS_RDONLY) { + ret = 1; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * We do not need to lock this, because we are the only one + * handling this flag. 
+ */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + goto out; + + handle = ext4_journal_start_sb(sb, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + down_write(&grp->alloc_sem); + /* + * If inode bitmap was already initialized there may be some + * used inodes so we need to skip blocks with used inodes in + * inode table. + */ + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)), + sbi->s_inodes_per_block); + + if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { + ext4_error(sb, "Something is wrong with group %u\n" + "Used itable blocks: %d" + "itable unused count: %u\n", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto out; + } + + blk = ext4_inode_table(sb, gdp) + used_blks; + num = sbi->s_itb_per_group - used_blks; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + ret = ext4_journal_get_write_access(handle, + group_desc_bh); + if (ret) + goto err_out; + + /* + * Skip zeroout if the inode table is full. But we set the ZEROED + * flag anyway, because obviously, when it is full it does not need + * further zeroing. 
+ */ + if (unlikely(num == 0)) + goto skip_zeroout; + + ext4_debug("going to zero out inode table in group %d\n", + group); + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); + if (ret < 0) + goto err_out; + if (barrier) + blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); + +skip_zeroout: + ext4_lock_group(sb, group); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, + "call ext4_handle_dirty_metadata"); + ret = ext4_handle_dirty_metadata(handle, NULL, + group_desc_bh); + +err_out: + up_write(&grp->alloc_sem); + ext4_journal_stop(handle); +out: + return ret; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 49635ef236f8..bdbe69902207 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -53,6 +53,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { + trace_ext4_begin_ordered_truncate(inode, new_size); return jbd2_journal_begin_ordered_truncate( EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode, @@ -60,6 +61,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, } static void ext4_invalidatepage(struct page *page, unsigned long offset); +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); +static int __ext4_journalled_writepage(struct page *page, unsigned int len); +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); /* * Test whether an inode is a fast symlink. 
@@ -172,6 +179,7 @@ void ext4_evict_inode(struct inode *inode) handle_t *handle; int err; + trace_ext4_evict_inode(inode); if (inode->i_nlink) { truncate_inode_pages(&inode->i_data, 0); goto no_delete; @@ -755,6 +763,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, * parent to disk. */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); @@ -1207,8 +1220,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, break; idx++; num++; - if (num >= max_pages) + if (num >= max_pages) { + done = 1; break; + } } pagevec_release(&pvec); } @@ -1995,16 +2010,23 @@ static void ext4_da_page_release_reservation(struct page *page, * * As pages are already locked by write_cache_pages(), we can't use it */ -static int mpage_da_submit_io(struct mpage_da_data *mpd) +static int mpage_da_submit_io(struct mpage_da_data *mpd, + struct ext4_map_blocks *map) { - long pages_skipped; struct pagevec pvec; unsigned long index, end; int ret = 0, err, nr_pages, i; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; + loff_t size = i_size_read(inode); + unsigned int len, block_start; + struct buffer_head *bh, *page_bufs = NULL; + int journal_data = ext4_should_journal_data(inode); + sector_t pblock = 0, cur_logical = 0; + struct ext4_io_submit io_submit; BUG_ON(mpd->next_page <= mpd->first_page); + memset(&io_submit, 0, sizeof(io_submit)); /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. 
@@ -2020,122 +2042,108 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { + int commit_write = 0, redirty_page = 0; struct page *page = pvec.pages[i]; index = page->index; if (index > end) break; + + if (index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + if (map) { + cur_logical = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + pblock = map->m_pblk + (cur_logical - + map->m_lblk); + } index++; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - pages_skipped = mpd->wbc->pages_skipped; - err = mapping->a_ops->writepage(page, mpd->wbc); - if (!err && (pages_skipped == mpd->wbc->pages_skipped)) - /* - * have successfully written the page - * without skipping the same - */ - mpd->pages_written++; /* - * In error case, we have to continue because - * remaining pages are still locked - * XXX: unlock and re-dirty them? + * If the page does not have buffers (for + * whatever reason), try to create them using + * __block_write_begin. If this fails, + * redirty the page and move on. 
*/ - if (ret == 0) - ret = err; - } - pagevec_release(&pvec); - } - return ret; -} - -/* - * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers - * - * the function goes through all passed space and put actual disk - * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten - */ -static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, - struct ext4_map_blocks *map) -{ - struct inode *inode = mpd->inode; - struct address_space *mapping = inode->i_mapping; - int blocks = map->m_len; - sector_t pblock = map->m_pblk, cur_logical; - struct buffer_head *head, *bh; - pgoff_t index, end; - struct pagevec pvec; - int nr_pages, i; - - index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - - pagevec_init(&pvec, 0); - - while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - index = page->index; - if (index > end) - break; - index++; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(!page_has_buffers(page)); - - bh = page_buffers(page); - head = bh; - - /* skip blocks out of the range */ - do { - if (cur_logical >= map->m_lblk) - break; - cur_logical++; - } while ((bh = bh->b_this_page) != head); + if (!page_has_buffers(page)) { + if (__block_write_begin(page, 0, len, + noalloc_get_block_write)) { + redirty_page: + redirty_page_for_writepage(mpd->wbc, + page); + unlock_page(page); + continue; + } + commit_write = 1; + } + bh = page_bufs = page_buffers(page); + block_start = 0; do { - if (cur_logical >= map->m_lblk + blocks) - break; - - if (buffer_delay(bh) || buffer_unwritten(bh)) { - - BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); - + if (!bh) + goto redirty_page; + if (map && (cur_logical >= map->m_lblk) && + 
(cur_logical <= (map->m_lblk + + (map->m_len - 1)))) { if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock; - } else { - /* - * unwritten already should have - * blocknr assigned. Verify that - */ - clear_buffer_unwritten(bh); - BUG_ON(bh->b_blocknr != pblock); } + if (buffer_unwritten(bh) || + buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + if (map->m_flags & EXT4_MAP_UNINIT) + set_buffer_uninit(bh); + clear_buffer_unwritten(bh); + } - } else if (buffer_mapped(bh)) - BUG_ON(bh->b_blocknr != pblock); - - if (map->m_flags & EXT4_MAP_UNINIT) - set_buffer_uninit(bh); + /* redirty page if block allocation undone */ + if (buffer_delay(bh) || buffer_unwritten(bh)) + redirty_page = 1; + bh = bh->b_this_page; + block_start += bh->b_size; cur_logical++; pblock++; - } while ((bh = bh->b_this_page) != head); + } while (bh != page_bufs); + + if (redirty_page) + goto redirty_page; + + if (commit_write) + /* mark the buffer_heads as dirty & uptodate */ + block_commit_write(page, 0, len); + + /* + * Delalloc doesn't support data journalling, + * but eventually maybe we'll lift this + * restriction. + */ + if (unlikely(journal_data && PageChecked(page))) + err = __ext4_journalled_writepage(page, len); + else + err = ext4_bio_write_page(&io_submit, page, + len, mpd->wbc); + + if (!err) + mpd->pages_written++; + /* + * In error case, we have to continue because + * remaining pages are still locked + */ + if (ret == 0) + ret = err; } pagevec_release(&pvec); } + ext4_io_submit(&io_submit); + return ret; } - static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, sector_t logical, long blk_cnt) { @@ -2187,35 +2195,32 @@ static void ext4_print_free_blocks(struct inode *inode) } /* - * mpage_da_map_blocks - go through given space + * mpage_da_map_and_submit - go through given space, map them + * if necessary, and then submit them for I/O * * @mpd - bh describing space * * The function skips space we know is already mapped to disk blocks. 
* */ -static int mpage_da_map_blocks(struct mpage_da_data *mpd) +static void mpage_da_map_and_submit(struct mpage_da_data *mpd) { int err, blks, get_blocks_flags; - struct ext4_map_blocks map; + struct ext4_map_blocks map, *mapp = NULL; sector_t next = mpd->b_blocknr; unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; loff_t disksize = EXT4_I(mpd->inode)->i_disksize; handle_t *handle = NULL; /* - * We consider only non-mapped and non-allocated blocks + * If the blocks are mapped already, or we couldn't accumulate + * any blocks, then proceed immediately to the submission stage. */ - if ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay)) && - !(mpd->b_state & (1 << BH_Unwritten))) - return 0; - - /* - * If we didn't accumulate anything to write simply return - */ - if (!mpd->b_size) - return 0; + if ((mpd->b_size == 0) || + ((mpd->b_state & (1 << BH_Mapped)) && + !(mpd->b_state & (1 << BH_Delay)) && + !(mpd->b_state & (1 << BH_Unwritten)))) + goto submit_io; handle = ext4_journal_current_handle(); BUG_ON(!handle); @@ -2252,17 +2257,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) err = blks; /* - * If get block returns with error we simply - * return. Later writepage will redirty the page and - * writepages will find the dirty page again + * If get block returns EAGAIN or ENOSPC and there + * appears to be free blocks we will call + * ext4_writepage() for all of the pages which will + * just redirty the pages. 
*/ if (err == -EAGAIN) - return 0; + goto submit_io; if (err == -ENOSPC && ext4_count_free_blocks(sb)) { mpd->retval = err; - return 0; + goto submit_io; } /* @@ -2287,10 +2293,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) /* invalidate all the pages */ ext4_da_block_invalidatepages(mpd, next, mpd->b_size >> mpd->inode->i_blkbits); - return err; + return; } BUG_ON(blks == 0); + mapp = &map; if (map.m_flags & EXT4_MAP_NEW) { struct block_device *bdev = mpd->inode->i_sb->s_bdev; int i; @@ -2299,18 +2306,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) unmap_underlying_metadata(bdev, map.m_pblk + i); } - /* - * If blocks are delayed marked, we need to - * put actual blocknr and drop delayed bit - */ - if ((mpd->b_state & (1 << BH_Delay)) || - (mpd->b_state & (1 << BH_Unwritten))) - mpage_put_bnr_to_bhs(mpd, &map); - if (ext4_should_order_data(mpd->inode)) { err = ext4_jbd2_file_inode(handle, mpd->inode); if (err) - return err; + /* This only happens if the journal is aborted */ + return; } /* @@ -2321,10 +2321,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) disksize = i_size_read(mpd->inode); if (disksize > EXT4_I(mpd->inode)->i_disksize) { ext4_update_i_disksize(mpd->inode, disksize); - return ext4_mark_inode_dirty(handle, mpd->inode); + err = ext4_mark_inode_dirty(handle, mpd->inode); + if (err) + ext4_error(mpd->inode->i_sb, + "Failed to mark inode %lu dirty", + mpd->inode->i_ino); } - return 0; +submit_io: + mpage_da_submit_io(mpd, mapp); + mpd->io_done = 1; } #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ @@ -2401,9 +2407,7 @@ flush_it: * We couldn't merge the block to our extent, so we * need to flush current extent and start new one */ - if (mpage_da_map_blocks(mpd) == 0) - mpage_da_submit_io(mpd); - mpd->io_done = 1; + mpage_da_map_and_submit(mpd); return; } @@ -2422,9 +2426,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) * The function finds extents of pages and 
scan them for all blocks. */ static int __mpage_da_writepage(struct page *page, - struct writeback_control *wbc, void *data) + struct writeback_control *wbc, + struct mpage_da_data *mpd) { - struct mpage_da_data *mpd = data; struct inode *inode = mpd->inode; struct buffer_head *bh, *head; sector_t logical; @@ -2435,15 +2439,13 @@ static int __mpage_da_writepage(struct page *page, if (mpd->next_page != page->index) { /* * Nope, we can't. So, we map non-allocated blocks - * and start IO on them using writepage() + * and start IO on them */ if (mpd->next_page != mpd->first_page) { - if (mpage_da_map_blocks(mpd) == 0) - mpage_da_submit_io(mpd); + mpage_da_map_and_submit(mpd); /* * skip rest of the page in the page_vec */ - mpd->io_done = 1; redirty_page_for_writepage(wbc, page); unlock_page(page); return MPAGE_DA_EXTENT_TAIL; @@ -2622,6 +2624,7 @@ static int __ext4_journalled_writepage(struct page *page, int ret = 0; int err; + ClearPageChecked(page); page_bufs = page_buffers(page); BUG_ON(!page_bufs); walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); @@ -2699,7 +2702,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int ext4_writepage(struct page *page, struct writeback_control *wbc) { - int ret = 0; + int ret = 0, commit_write = 0; loff_t size; unsigned int len; struct buffer_head *page_bufs = NULL; @@ -2712,71 +2715,44 @@ static int ext4_writepage(struct page *page, else len = PAGE_CACHE_SIZE; - if (page_has_buffers(page)) { - page_bufs = page_buffers(page); - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - /* - * We don't want to do block allocation - * So redirty the page and return - * We may reach here when we do a journal commit - * via journal_submit_inode_data_buffers. - * If we don't have mapping block we just ignore - * them. 
We can also reach here via shrink_page_list - */ + /* + * If the page does not have buffers (for whatever reason), + * try to create them using __block_write_begin. If this + * fails, redirty the page and move on. + */ + if (!page_has_buffers(page)) { + if (__block_write_begin(page, 0, len, + noalloc_get_block_write)) { + redirty_page: redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; } - } else { + commit_write = 1; + } + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { /* - * The test for page_has_buffers() is subtle: - * We know the page is dirty but it lost buffers. That means - * that at some moment in time after write_begin()/write_end() - * has been called all buffers have been clean and thus they - * must have been written at least once. So they are all - * mapped and we can happily proceed with mapping them - * and writing the page. - * - * Try to initialize the buffer_heads and check whether - * all are mapped and non delay. We don't want to - * do block allocation here. + * We don't want to do block allocation, so redirty + * the page and return. We may reach here when we do + * a journal commit via journal_submit_inode_data_buffers. 
+ * We can also reach here via shrink_page_list */ - ret = __block_write_begin(page, 0, len, - noalloc_get_block_write); - if (!ret) { - page_bufs = page_buffers(page); - /* check whether all are mapped and non delay */ - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - } else { - /* - * We can't do block allocation here - * so just redity the page and unlock - * and return - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } + goto redirty_page; + } + if (commit_write) /* now mark the buffer_heads as dirty and uptodate */ block_commit_write(page, 0, len); - } - if (PageChecked(page) && ext4_should_journal_data(inode)) { + if (PageChecked(page) && ext4_should_journal_data(inode)) /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ - ClearPageChecked(page); return __ext4_journalled_writepage(page, len); - } - if (page_bufs && buffer_uninit(page_bufs)) { + if (buffer_uninit(page_bufs)) { ext4_set_bh_endio(page_bufs, inode); ret = block_write_full_page_endio(page, noalloc_get_block_write, wbc, ext4_end_io_buffer_write); @@ -2823,25 +2799,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) */ static int write_cache_pages_da(struct address_space *mapping, struct writeback_control *wbc, - struct mpage_da_data *mpd) + struct mpage_da_data *mpd, + pgoff_t *done_index) { int ret = 0; int done = 0; struct pagevec pvec; - int nr_pages; + unsigned nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ long nr_to_write = wbc->nr_to_write; + int tag; pagevec_init(&pvec, 0); index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + + *done_index = index; while (!done && (index <= end)) { int i; - nr_pages = 
pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; @@ -2861,6 +2844,8 @@ static int write_cache_pages_da(struct address_space *mapping, break; } + *done_index = page->index + 1; + lock_page(page); /* @@ -2946,6 +2931,8 @@ static int ext4_da_writepages(struct address_space *mapping, long desired_nr_to_write, nr_to_writebump = 0; loff_t range_start = wbc->range_start; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + pgoff_t done_index = 0; + pgoff_t end; trace_ext4_da_writepages(inode, wbc); @@ -2981,8 +2968,11 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->range_start = index << PAGE_CACHE_SHIFT; wbc->range_end = LLONG_MAX; wbc->range_cyclic = 0; - } else + end = -1; + } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + } /* * This works around two forms of stupidity. The first is in @@ -3001,9 +2991,12 @@ static int ext4_da_writepages(struct address_space *mapping, * sbi->max_writeback_mb_bump whichever is smaller. 
*/ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); - if (!range_cyclic && range_whole) - desired_nr_to_write = wbc->nr_to_write * 8; - else + if (!range_cyclic && range_whole) { + if (wbc->nr_to_write == LONG_MAX) + desired_nr_to_write = wbc->nr_to_write; + else + desired_nr_to_write = wbc->nr_to_write * 8; + } else desired_nr_to_write = ext4_num_dirty_pages(inode, index, max_pages); if (desired_nr_to_write > max_pages) @@ -3020,6 +3013,9 @@ static int ext4_da_writepages(struct address_space *mapping, pages_skipped = wbc->pages_skipped; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!ret && wbc->nr_to_write > 0) { /* @@ -3058,16 +3054,14 @@ retry: mpd.io_done = 0; mpd.pages_written = 0; mpd.retval = 0; - ret = write_cache_pages_da(mapping, wbc, &mpd); + ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); /* * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit * them for I/O. 
*/ if (!mpd.io_done && mpd.next_page != mpd.first_page) { - if (mpage_da_map_blocks(&mpd) == 0) - mpage_da_submit_io(&mpd); - mpd.io_done = 1; + mpage_da_map_and_submit(&mpd); ret = MPAGE_DA_EXTENT_TAIL; } trace_ext4_da_write_pages(inode, &mpd); @@ -3114,14 +3108,13 @@ retry: __func__, wbc->nr_to_write, ret); /* Update index */ - index += pages_written; wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = index; + mapping->writeback_index = done_index; out_writepages: wbc->nr_to_write -= nr_to_writebump; @@ -3456,15 +3449,6 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } -static void ext4_free_io_end(ext4_io_end_t *io) -{ - BUG_ON(!io); - if (io->page) - put_page(io->page); - iput(io->inode); - kfree(io); -} - static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) { struct buffer_head *head, *bh; @@ -3641,173 +3625,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_IO_CREATE_EXT); } -static void dump_completed_IO(struct inode * inode) -{ -#ifdef EXT4_DEBUG - struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; - unsigned long flags; - - if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ - ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); - return; - } - - ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ - cur = &io->list; - before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); - after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); - - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); - } - 
spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); -#endif -} - -/* - * check a range of space and convert unwritten extents to written. - */ -static int ext4_end_io_nolock(ext4_io_end_t *io) -{ - struct inode *inode = io->inode; - loff_t offset = io->offset; - ssize_t size = io->size; - int ret = 0; - - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," - "list->prev 0x%p\n", - io, inode->i_ino, io->list.next, io->list.prev); - - if (list_empty(&io->list)) - return ret; - - if (io->flag != EXT4_IO_UNWRITTEN) - return ret; - - ret = ext4_convert_unwritten_extents(inode, offset, size); - if (ret < 0) { - printk(KERN_EMERG "%s: failed to convert unwritten" - "extents to written extents, error is %d" - " io is still on inode %lu aio dio list\n", - __func__, ret, inode->i_ino); - return ret; - } - - if (io->iocb) - aio_complete(io->iocb, io->result, 0); - /* clear the DIO AIO unwritten flag */ - io->flag = 0; - return ret; -} - -/* - * work on completed aio dio IO, to convert unwritten extents to extents - */ -static void ext4_end_io_work(struct work_struct *work) -{ - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - int ret; - - mutex_lock(&inode->i_mutex); - ret = ext4_end_io_nolock(io); - if (ret < 0) { - mutex_unlock(&inode->i_mutex); - return; - } - - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (!list_empty(&io->list)) - list_del_init(&io->list); - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - mutex_unlock(&inode->i_mutex); - ext4_free_io_end(io); -} - -/* - * This function is called from ext4_sync_file(). - * - * When IO is completed, the work to convert unwritten extents to - * written is queued on workqueue but may not get immediately - * scheduled. When fsync is called, we need to ensure the - * conversion is complete before fsync returns. 
- * The inode keeps track of a list of pending/completed IO that - * might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents for completed IO - * to written. - * The function return the number of pending IOs on success. - */ -int flush_completed_IO(struct inode *inode) -{ - ext4_io_end_t *io; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - int ret = 0; - int ret2 = 0; - - if (list_empty(&ei->i_completed_io_list)) - return ret; - - dump_completed_IO(inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - while (!list_empty(&ei->i_completed_io_list)){ - io = list_entry(ei->i_completed_io_list.next, - ext4_io_end_t, list); - /* - * Calling ext4_end_io_nolock() to convert completed - * IO to written. - * - * When ext4_sync_file() is called, run_queue() may already - * about to flush the work corresponding to this io structure. - * It will be upset if it founds the io structure related - * to the work-to-be schedule is freed. - * - * Thus we need to keep the io structure still valid here after - * convertion finished. The io structure has a flag to - * avoid double converting from both fsync and background work - * queue work. - */ - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - ret = ext4_end_io_nolock(io); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (ret < 0) - ret2 = ret; - else - list_del_init(&io->list); - } - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - return (ret2 < 0) ? 
ret2 : 0; -} - -static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) -{ - ext4_io_end_t *io = NULL; - - io = kmalloc(sizeof(*io), flags); - - if (io) { - igrab(inode); - io->inode = inode; - io->flag = 0; - io->offset = 0; - io->size = 0; - io->page = NULL; - io->iocb = NULL; - io->result = 0; - INIT_WORK(&io->work, ext4_end_io_work); - INIT_LIST_HEAD(&io->list); - } - - return io; -} - static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private, int ret, bool is_async) @@ -3827,7 +3644,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); /* if not aio dio with unwritten extents, just free io and return */ - if (io_end->flag != EXT4_IO_UNWRITTEN){ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { ext4_free_io_end(io_end); iocb->private = NULL; out: @@ -3844,14 +3661,14 @@ out: } wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); - /* Add the io_end to per-inode completed aio dio list*/ ei = EXT4_I(io_end->inode); spin_lock_irqsave(&ei->i_completed_io_lock, flags); list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); iocb->private = NULL; } @@ -3872,7 +3689,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) goto out; } - io_end->flag = EXT4_IO_UNWRITTEN; + io_end->flag = EXT4_IO_END_UNWRITTEN; inode = io_end->inode; /* Add the io_end to per-inode completed io list*/ @@ -5463,6 +5280,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; int error, rc = 0; + int orphan = 0; const unsigned int ia_valid = attr->ia_valid; error = inode_change_ok(inode, attr); @@ -5518,8 +5336,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); 
goto err_out; } - - error = ext4_orphan_add(handle, inode); + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; + } EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); if (!error) @@ -5537,6 +5357,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } ext4_orphan_del(handle, inode); + orphan = 0; ext4_journal_stop(handle); goto err_out; } @@ -5559,7 +5380,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * If the call to ext4_truncate failed to get a transaction handle at * all, we need to clean up the in-core orphan list manually. */ - if (inode->i_nlink) + if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!rc && (ia_valid & ATTR_MODE)) @@ -5591,9 +5412,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, * will return the blocks that include the delayed allocation * blocks for this file. */ - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; return 0; @@ -5642,7 +5461,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) * * Also account for superblock, inode, quota and xattr blocks */ -int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -5830,6 +5649,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) int err, ret; might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 42f77b1dc72d..5b4d4e3a4d58 
100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -338,6 +338,14 @@ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_ext_cachep; + +/* We create slab caches for groupinfo data structures based on the + * superblock block size. There will be one per mounted filesystem for + * each unique s_blocksize_bits */ +#define NR_GRPINFO_CACHES \ + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1) +static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; + static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, @@ -939,6 +947,85 @@ out: } /* + * lock the group_info alloc_sem of all the groups + * belonging to the same buddy cache page. This + * make sure other parallel operation on the buddy + * cache doesn't happen whild holding the buddy cache + * lock + */ +static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, + ext4_group_t group) +{ + int i; + int block, pnum; + int blocks_per_page; + int groups_per_page; + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + + if ((first_group + i) >= ngroups) + break; + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. 
This make sure there is + * no block allocation going on in any + * of that groups + */ + down_write_nested(&grp->alloc_sem, i); + } + return i; +} + +static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, + ext4_group_t group, int locked_group) +{ + int i; + int block, pnum; + int blocks_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + /* release locks on all the groups */ + for (i = 0; i < locked_group; i++) { + + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. This make sure there is + * no block allocation going on in any + * of that groups + */ + up_write(&grp->alloc_sem); + } + +} + +/* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! @@ -1915,84 +2002,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } -/* - * lock the group_info alloc_sem of all the groups - * belonging to the same buddy cache page. This - * make sure other parallel operation on the buddy - * cache doesn't happen whild holding the buddy cache - * lock - */ -int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) -{ - int i; - int block, pnum; - int blocks_per_page; - int groups_per_page; - ext4_group_t ngroups = ext4_get_groups_count(sb); - ext4_group_t first_group; - struct ext4_group_info *grp; - - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. 
- */ - block = group * 2; - pnum = block / blocks_per_page; - first_group = pnum * blocks_per_page / 2; - - groups_per_page = blocks_per_page >> 1; - if (groups_per_page == 0) - groups_per_page = 1; - /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { - - if ((first_group + i) >= ngroups) - break; - grp = ext4_get_group_info(sb, first_group + i); - /* take all groups write allocation - * semaphore. This make sure there is - * no block allocation going on in any - * of that groups - */ - down_write_nested(&grp->alloc_sem, i); - } - return i; -} - -void ext4_mb_put_buddy_cache_lock(struct super_block *sb, - ext4_group_t group, int locked_group) -{ - int i; - int block, pnum; - int blocks_per_page; - ext4_group_t first_group; - struct ext4_group_info *grp; - - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - first_group = pnum * blocks_per_page / 2; - /* release locks on all the groups */ - for (i = 0; i < locked_group; i++) { - - grp = ext4_get_group_info(sb, first_group + i); - /* take all groups write allocation - * semaphore. This make sure there is - * no block allocation going on in any - * of that groups - */ - up_write(&grp->alloc_sem); - } - -} - static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { @@ -2233,15 +2242,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = { .release = seq_release, }; +static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) +{ + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; + + BUG_ON(!cachep); + return cachep; +} /* Create and initialize ext4_group_info data for the given group. 
*/ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { - int i, len; + int i; int metalen = 0; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); /* * First check if this group is the first of a reserved block. @@ -2261,22 +2279,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info; } - /* - * calculate needed size. if change bb_counters size, - * don't forget about ext4_mb_generate_buddy() - */ - len = offsetof(typeof(**meta_group_info), - bb_counters[sb->s_blocksize_bits + 2]); - meta_group_info = sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); - meta_group_info[i] = kzalloc(len, GFP_KERNEL); + meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); goto exit_group_info; } + memset(meta_group_info[i], 0, kmem_cache_size(cachep)); set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); @@ -2331,6 +2343,7 @@ static int ext4_mb_init_backend(struct super_block *sb) int num_meta_group_infos_max; int array_size; struct ext4_group_desc *desc; + struct kmem_cache *cachep; /* This is the number of blocks used by GDT */ num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - @@ -2389,8 +2402,9 @@ static int ext4_mb_init_backend(struct super_block *sb) return 0; err_freebuddy: + cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) - kfree(ext4_get_group_info(sb, i)); + kmem_cache_free(cachep, ext4_get_group_info(sb, i)); i = num_meta_group_infos; while (i-- > 0) kfree(sbi->s_group_info[i]); @@ -2407,19 +2421,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) unsigned offset; unsigned max; int ret; + int cache_index; + struct kmem_cache *cachep; + char *namep = NULL; i = (sb->s_blocksize_bits 
+ 2) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { - return -ENOMEM; + ret = -ENOMEM; + goto out; } i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - kfree(sbi->s_mb_offsets); - return -ENOMEM; + ret = -ENOMEM; + goto out; + } + + cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + cachep = ext4_groupinfo_caches[cache_index]; + if (!cachep) { + char name[32]; + int len = offsetof(struct ext4_group_info, + bb_counters[sb->s_blocksize_bits + 2]); + + sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits); + namep = kstrdup(name, GFP_KERNEL); + if (!namep) { + ret = -ENOMEM; + goto out; + } + + /* Need to free the kmem_cache_name() when we + * destroy the slab */ + cachep = kmem_cache_create(namep, len, 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!cachep) { + ret = -ENOMEM; + goto out; + } + ext4_groupinfo_caches[cache_index] = cachep; } /* order 0 is regular bitmap */ @@ -2440,9 +2483,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return ret; + goto out; } spin_lock_init(&sbi->s_md_lock); @@ -2457,9 +2498,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return -ENOMEM; + ret = -ENOMEM; + goto out; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -2476,7 +2516,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) if (sbi->s_journal) sbi->s_journal->j_commit_callback = release_blocks_on_commit; - return 0; +out: + if (ret) { + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + kfree(namep); + } + return ret; } /* need to called with the ext4 group lock held */ @@ 
-2504,6 +2550,7 @@ int ext4_mb_release(struct super_block *sb) int num_meta_group_infos; struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { @@ -2514,7 +2561,7 @@ int ext4_mb_release(struct super_block *sb) ext4_lock_group(sb, i); ext4_mb_cleanup_pa(grinfo); ext4_unlock_group(sb, i); - kfree(grinfo); + kmem_cache_free(cachep, grinfo); } num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> @@ -2558,7 +2605,7 @@ int ext4_mb_release(struct super_block *sb) return 0; } -static inline void ext4_issue_discard(struct super_block *sb, +static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t block, int count) { int ret; @@ -2568,10 +2615,11 @@ static inline void ext4_issue_discard(struct super_block *sb, trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); - if (ret == EOPNOTSUPP) { + if (ret == -EOPNOTSUPP) { ext4_warning(sb, "discard not supported, disabling"); clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); } + return ret; } /* @@ -2659,28 +2707,22 @@ static void ext4_remove_debugfs_entry(void) #endif -int __init init_ext4_mballoc(void) +int __init ext4_init_mballoc(void) { - ext4_pspace_cachep = - kmem_cache_create("ext4_prealloc_space", - sizeof(struct ext4_prealloc_space), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, + SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) return -ENOMEM; - ext4_ac_cachep = - kmem_cache_create("ext4_alloc_context", - sizeof(struct ext4_allocation_context), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, + SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } - ext4_free_ext_cachep = - 
kmem_cache_create("ext4_free_block_extents", - sizeof(struct ext4_free_data), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); if (ext4_free_ext_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); @@ -2690,8 +2732,9 @@ int __init init_ext4_mballoc(void) return 0; } -void exit_ext4_mballoc(void) +void ext4_exit_mballoc(void) { + int i; /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. @@ -2700,6 +2743,15 @@ void exit_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_ext_cachep); + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + struct kmem_cache *cachep = ext4_groupinfo_caches[i]; + if (cachep) { + char *name = (char *)kmem_cache_name(cachep); + kmem_cache_destroy(cachep); + kfree(name); + } + } ext4_remove_debugfs_entry(); } @@ -3536,8 +3588,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) */ static noinline_for_stack int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, - struct ext4_prealloc_space *pa, - struct ext4_allocation_context *ac) + struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3555,11 +3606,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = pa->pa_inode; - } - while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) @@ -3570,16 +3616,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, (unsigned) next - bit, (unsigned) group); free += next - bit; - if (ac) { - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = next - bit; - ac->ac_b_ex.fe_logical = 0; - 
trace_ext4_mballoc_discard(ac); - } - - trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit, - next - bit); + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa, + grp_blk_start + bit, next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3602,29 +3641,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, static noinline_for_stack int ext4_mb_release_group_pa(struct ext4_buddy *e4b, - struct ext4_prealloc_space *pa, - struct ext4_allocation_context *ac) + struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(sb, ac, pa); + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); - - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = NULL; - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = pa->pa_len; - ac->ac_b_ex.fe_logical = 0; - trace_ext4_mballoc_discard(ac); - } + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); return 0; } @@ -3645,7 +3674,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; struct list_head list; struct ext4_buddy e4b; int err; @@ -3674,9 +3702,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; INIT_LIST_HEAD(&list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) - ac->ac_sb = sb; repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, @@ -3731,9 +3756,9 @@ repeat: 
spin_unlock(pa->pa_obj_lock); if (pa->pa_type == MB_GROUP_PA) - ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_mb_release_group_pa(&e4b, pa); else - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); @@ -3741,8 +3766,6 @@ repeat: out: ext4_unlock_group(sb, group); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); return free; @@ -3763,7 +3786,6 @@ void ext4_discard_preallocations(struct inode *inode) struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; @@ -3779,11 +3801,6 @@ void ext4_discard_preallocations(struct inode *inode) INIT_LIST_HEAD(&list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = inode; - } repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); @@ -3853,7 +3870,7 @@ repeat: ext4_lock_group(sb, group); list_del(&pa->pa_group_list); - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -3862,8 +3879,6 @@ repeat: list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); } /* @@ -4061,14 +4076,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_buddy e4b; struct list_head discard_list; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; mb_debug(1, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) - ac->ac_sb = sb; spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], @@ -4120,15 +4131,13 @@ 
ext4_mb_discard_lg_preallocations(struct super_block *sb, } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); - ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); } /* @@ -4492,7 +4501,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; - struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; unsigned long freed = 0; unsigned int overflow; @@ -4532,6 +4540,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!bh) tbh = sb_find_get_block(inode->i_sb, block + i); + if (unlikely(!tbh)) + continue; ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, tbh, block + i); } @@ -4547,12 +4557,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!ext4_should_writeback_data(inode)) flags |= EXT4_FREE_BLOCKS_METADATA; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_inode = inode; - ac->ac_sb = sb; - } - do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -4610,12 +4614,7 @@ do_more: BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - if (ac) { - ac->ac_b_ex.fe_group = block_group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = count; - trace_ext4_mballoc_free(ac); - } + trace_ext4_mballoc_free(sb, inode, block_group, bit, count); err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) @@ -4645,8 +4644,6 @@ do_more: mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, block_group, bit, count); } ret = ext4_free_blks_count(sb, gdp) + count; @@ -4686,7 +4683,190 @@ error_return: 
dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); return; } + +/** + * ext4_trim_extent -- function to TRIM one single free extent in the group + * @sb: super block for the file system + * @start: starting block of the free extent in the alloc. group + * @count: number of blocks to TRIM + * @group: alloc. group we are working with + * @e4b: ext4 buddy for the group + * + * Trim "count" blocks starting at "start" in the "group". To assure that no + * one will allocate those blocks, mark it as used in buddy bitmap. This must + * be called with under the group lock. + */ +static int ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct ext4_free_extent ex; + int ret = 0; + + assert_spin_locked(ext4_group_lock_ptr(sb, group)); + + ex.fe_start = start; + ex.fe_group = group; + ex.fe_len = count; + + /* + * Mark blocks used, so no one can reuse them while + * being trimmed. + */ + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, group); + + ret = ext4_issue_discard(sb, group, start, count); + if (ret) + ext4_std_error(sb, ret); + + ext4_lock_group(sb, group); + mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; +} + +/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @e4b: ext4 buddy + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through group's buddy bitmap searching for free + * extents. When the free block is found, ext4_trim_extent is called to TRIM + * the extent. + * + * + * ext4_trim_all_free walks through group's block bitmap searching for free + * extents. When the free extent is found, mark it as used in group buddy + * bitmap. Then issue a TRIM command on this extent and free the extent in + * the group buddy bitmap. 
This is done until whole group is scanned. + */ +ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) +{ + void *bitmap; + ext4_grpblk_t next, count = 0; + ext4_group_t group; + int ret = 0; + + BUG_ON(e4b == NULL); + + bitmap = e4b->bd_bitmap; + group = e4b->bd_group; + start = (e4b->bd_info->bb_first_free > start) ? + e4b->bd_info->bb_first_free : start; + ext4_lock_group(sb, group); + + while (start < max) { + start = mb_find_next_zero_bit(bitmap, max, start); + if (start >= max) + break; + next = mb_find_next_bit(bitmap, max, start); + + if ((next - start) >= minblocks) { + ret = ext4_trim_extent(sb, start, + next - start, group, e4b); + if (ret < 0) + break; + count += next - start; + } + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if (need_resched()) { + ext4_unlock_group(sb, group); + cond_resched(); + ext4_lock_group(sb, group); + } + + if ((e4b->bd_info->bb_free - count) < minblocks) + break; + } + ext4_unlock_group(sb, group); + + ext4_debug("trimmed %d blocks in the group %d\n", + count, group); + + if (ret < 0) + count = ret; + + return count; +} + +/** + * ext4_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * ext4_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext4_trim_all_free function + * is invoked to trim all free space. 
+ */ +int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ext4_buddy e4b; + ext4_group_t first_group, last_group; + ext4_group_t group, ngroups = ext4_get_groups_count(sb); + ext4_grpblk_t cnt = 0, first_block, last_block; + uint64_t start, len, minlen, trimmed; + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + len = range->len >> sb->s_blocksize_bits; + minlen = range->minlen >> sb->s_blocksize_bits; + trimmed = 0; + + if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) + return -EINVAL; + + /* Determine first and last group to examine based on start and len */ + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, + &first_group, &first_block); + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), + &last_group, &last_block); + last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; + last_block = EXT4_BLOCKS_PER_GROUP(sb); + + if (first_group > last_group) + return -EINVAL; + + for (group = first_group; group <= last_group; group++) { + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + break; + } + + if (len >= EXT4_BLOCKS_PER_GROUP(sb)) + len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); + else + last_block = len; + + if (e4b.bd_info->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, &e4b, first_block, + last_block, minlen); + if (cnt < 0) { + ret = cnt; + ext4_mb_unload_buddy(&e4b); + break; + } + } + ext4_mb_unload_buddy(&e4b); + trimmed += cnt; + first_block = 0; + } + range->len = trimmed * sb->s_blocksize; + + return ret; +} diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 1765c2c50a9b..25f3a974b725 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, struct buffer_head *bh; struct ext4_extent_header *eh; - block = idx_pblock(ix); + block = ext4_idx_pblock(ix); bh = sb_bread(inode->i_sb, block); if (!bh) return 
-EIO; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 5f1ed9fc913c..b9f3e7862f13 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { /* leaf block */ *extent = ++path[ppos].p_ext; - path[ppos].p_block = ext_pblock(path[ppos].p_ext); + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); return 0; } @@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, /* index block */ path[ppos].p_idx++; - path[ppos].p_block = idx_pblock(path[ppos].p_idx); + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); if (path[ppos+1].p_bh) brelse(path[ppos+1].p_bh); path[ppos+1].p_bh = @@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, path[cur_ppos].p_idx = EXT_FIRST_INDEX(path[cur_ppos].p_hdr); path[cur_ppos].p_block = - idx_pblock(path[cur_ppos].p_idx); + ext4_idx_pblock(path[cur_ppos].p_idx); if (path[cur_ppos+1].p_bh) brelse(path[cur_ppos+1].p_bh); path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, @@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, path[leaf_ppos].p_ext = *extent = EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); path[leaf_ppos].p_block = - ext_pblock(path[leaf_ppos].p_ext); + ext4_ext_pblock(path[leaf_ppos].p_ext); return 0; } } @@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, */ o_end->ee_block = end_ext->ee_block; o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); } o_start->ee_len = start_ext->ee_len; @@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, */ o_end->ee_block = end_ext->ee_block; o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); 
/* * Set 0 to the extent block if new_ext was @@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start, /* Insert new entry */ if (new_ext->ee_len) { o_start[i] = *new_ext; - ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); + ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); } /* Insert end entry */ @@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, start_ext.ee_len = end_ext.ee_len = 0; new_ext.ee_block = cpu_to_le32(*from); - ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); + ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); new_ext.ee_len = dext->ee_len; new_ext_alen = ext4_ext_get_actual_len(&new_ext); new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; @@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, copy_extent_status(oext, &end_ext); end_ext_alen = ext4_ext_get_actual_len(&end_ext); ext4_ext_store_pblock(&end_ext, - (ext_pblock(o_end) + oext_alen - end_ext_alen)); + (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); end_ext.ee_block = cpu_to_le32(le32_to_cpu(o_end->ee_block) + oext_alen - end_ext_alen); @@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, /* When tmp_dext is too large, pick up the target range. 
*/ diff = donor_off - le32_to_cpu(tmp_dext->ee_block); - ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff); + ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); tmp_dext->ee_block = cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); @@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, tmp_dext->ee_len = cpu_to_le16(max_count); orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); - ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff); + ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); /* Adjust extent length if donor extent is larger than orig */ if (ext4_ext_get_actual_len(tmp_dext) > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bd39885b5998..92203b8a099f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -856,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block, b; + const u8 *name = d_name->name; int ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ int ra_ptr = 0; /* Current index into readahead @@ -870,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == '0')) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." 
should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } if (is_dx(dir)) { bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); /* @@ -960,55 +971,35 @@ cleanup_and_exit: static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *err) { - struct super_block * sb; + struct super_block * sb = dir->i_sb; struct dx_hash_info hinfo; - u32 hash; struct dx_frame frames[2], *frame; - struct ext4_dir_entry_2 *de, *top; struct buffer_head *bh; ext4_lblk_t block; int retval; - int namelen = d_name->len; - const u8 *name = d_name->name; - sb = dir->i_sb; - /* NFS may look up ".." - look at dx_root directory block */ - if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ - if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) - return NULL; - } else { - frame = frames; - frame->bh = NULL; /* for dx_release() */ - frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ - dx_set_block(frame->at, 0); /* dx_root block is 0 */ - } - hash = hinfo.hash; + if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) + return NULL; do { block = dx_get_block(frame->at); - if (!(bh = ext4_bread (NULL,dir, block, 0, err))) + if (!(bh = ext4_bread(NULL, dir, block, 0, err))) goto errout; - de = (struct ext4_dir_entry_2 *) bh->b_data; - top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) { - int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) - + ((char *) de - bh->b_data); - - if (!ext4_check_dir_entry(dir, de, bh, off)) { - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto errout; - } - if (ext4_match(namelen, name, de)) { - *res_dir = de; - dx_release(frames); - return bh; - } + retval = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { /* Success! 
*/ + dx_release(frames); + return bh; } brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + /* Check to see if we should continue to search */ - retval = ext4_htree_next_block(dir, hash, frame, + retval = ext4_htree_next_block(dir, hinfo.hash, frame, frames, NULL); if (retval < 0) { ext4_warning(sb, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c new file mode 100644 index 000000000000..7f5451cd1d38 --- /dev/null +++ b/fs/ext4/page-io.c @@ -0,0 +1,431 @@ +/* + * linux/fs/ext4/page-io.c + * + * This contains the new page_io functions for ext4 + * + * Written by Theodore Ts'o, 2010. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/jbd2.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/uio.h> +#include <linux/bio.h> +#include <linux/workqueue.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "ext4_extents.h" + +static struct kmem_cache *io_page_cachep, *io_end_cachep; + +#define WQ_HASH_SZ 37 +#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ]) +static wait_queue_head_t ioend_wq[WQ_HASH_SZ]; + +int __init ext4_init_pageio(void) +{ + int i; + + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); + if (io_page_cachep == NULL) + return -ENOMEM; + io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); + if (io_page_cachep == NULL) { + kmem_cache_destroy(io_page_cachep); + return -ENOMEM; + } + for (i = 0; i < WQ_HASH_SZ; i++) + init_waitqueue_head(&ioend_wq[i]); + + return 0; +} + +void ext4_exit_pageio(void) +{ + kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_page_cachep); +} + +void ext4_ioend_wait(struct inode *inode) +{ + wait_queue_head_t *wq 
= to_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); +} + +static void put_io_page(struct ext4_io_page *io_page) +{ + if (atomic_dec_and_test(&io_page->p_count)) { + end_page_writeback(io_page->p_page); + put_page(io_page->p_page); + kmem_cache_free(io_page_cachep, io_page); + } +} + +void ext4_free_io_end(ext4_io_end_t *io) +{ + int i; + wait_queue_head_t *wq; + + BUG_ON(!io); + if (io->page) + put_page(io->page); + for (i = 0; i < io->num_io_pages; i++) + put_io_page(io->pages[i]); + io->num_io_pages = 0; + wq = to_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && + waitqueue_active(wq)) + wake_up_all(wq); + kmem_cache_free(io_end_cachep, io); +} + +/* + * check a range of space and convert unwritten extents to written. + */ +int ext4_end_io_nolock(ext4_io_end_t *io) +{ + struct inode *inode = io->inode; + loff_t offset = io->offset; + ssize_t size = io->size; + int ret = 0; + + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + if (list_empty(&io->list)) + return ret; + + if (!(io->flag & EXT4_IO_END_UNWRITTEN)) + return ret; + + ret = ext4_convert_unwritten_extents(inode, offset, size); + if (ret < 0) { + printk(KERN_EMERG "%s: failed to convert unwritten " + "extents to written extents, error is %d " + "io is still on inode %lu aio dio list\n", + __func__, ret, inode->i_ino); + return ret; + } + + if (io->iocb) + aio_complete(io->iocb, io->result, 0); + /* clear the DIO AIO unwritten flag */ + io->flag &= ~EXT4_IO_END_UNWRITTEN; + return ret; +} + +/* + * work on completed aio dio IO, to convert unwritten extents to extents + */ +static void ext4_end_io_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret; + + 
mutex_lock(&inode->i_mutex); + ret = ext4_end_io_nolock(io); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + return; + } + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (!list_empty(&io->list)) + list_del_init(&io->list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + mutex_unlock(&inode->i_mutex); + ext4_free_io_end(io); +} + +ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) +{ + ext4_io_end_t *io = NULL; + + io = kmem_cache_alloc(io_end_cachep, flags); + if (io) { + memset(io, 0, sizeof(*io)); + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; + INIT_WORK(&io->work, ext4_end_io_work); + INIT_LIST_HEAD(&io->list); + } + return io; +} + +/* + * Print an buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... 
+ */ +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +static void ext4_end_bio(struct bio *bio, int error) +{ + ext4_io_end_t *io_end = bio->bi_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + int i; + + BUG_ON(!io_end); + bio->bi_private = NULL; + bio->bi_end_io = NULL; + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = 0; + bio_put(bio); + + for (i = 0; i < io_end->num_io_pages; i++) { + struct page *page = io_end->pages[i]->p_page; + struct buffer_head *bh, *head; + int partial_write = 0; + + head = page_buffers(page); + if (error) + SetPageError(page); + BUG_ON(!head); + if (head->b_size == PAGE_CACHE_SIZE) + clear_buffer_dirty(head); + else { + loff_t offset; + loff_t io_end_offset = io_end->offset + io_end->size; + + offset = (sector_t) page->index << PAGE_CACHE_SHIFT; + bh = head; + do { + if ((offset >= io_end->offset) && + (offset+bh->b_size <= io_end_offset)) { + if (error) + buffer_io_error(bh); + + clear_buffer_dirty(bh); + } + if (buffer_delay(bh)) + partial_write = 1; + else if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + else if (buffer_dirty(bh)) + partial_write = 1; + offset += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + put_io_page(io_end->pages[i]); + + /* + * If this is a partial write which happened to make + * all buffers uptodate then we can optimize away a + * bogus readpage() for the next read(). Here we + * 'discover' whether the page went uptodate as a + * result of this (potentially partial) write. 
+ */ + if (!partial_write) + SetPageUptodate(page); + } + io_end->num_io_pages = 0; + inode = io_end->inode; + + if (error) { + io_end->flag |= EXT4_IO_END_ERROR; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bio->bi_sector >> (inode->i_blkbits - 9)); + } + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +} + +void ext4_io_submit(struct ext4_io_submit *io) +{ + struct bio *bio = io->io_bio; + + if (bio) { + bio_get(io->io_bio); + submit_bio(io->io_op, io->io_bio); + BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); + bio_put(io->io_bio); + } + io->io_bio = 0; + io->io_op = 0; + io->io_end = 0; +} + +static int io_submit_init(struct ext4_io_submit *io, + struct inode *inode, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio; + + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) + return -ENOMEM; + do { + bio = bio_alloc(GFP_NOIO, nvecs); + nvecs >>= 1; + } while (bio == NULL); + + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_private = io->io_end = io_end; + bio->bi_end_io = ext4_end_bio; + + io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); + + io->io_bio = bio; + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
+			WRITE_SYNC_PLUG : WRITE);
+	io->io_next_block = bh->b_blocknr;
+	return 0;
+}
+
+/*
+ * Add buffer @bh (belonging to @io_page) to the bio being built in @io.
+ *
+ * Unmapped or delayed buffers are not written: any pending bio is
+ * submitted and 0 is returned.  A pending bio is also submitted, and a
+ * new one started, when @bh is not physically contiguous with it, when
+ * the io_end already tracks MAX_IO_PAGES other pages, or when
+ * bio_add_page() cannot take the buffer.
+ *
+ * Returns 0 on success or the error from io_submit_init().
+ */
+static int io_submit_add_bh(struct ext4_io_submit *io,
+			    struct ext4_io_page *io_page,
+			    struct inode *inode,
+			    struct writeback_control *wbc,
+			    struct buffer_head *bh)
+{
+	ext4_io_end_t *io_end;
+	int ret;
+
+	if (buffer_new(bh)) {
+		clear_buffer_new(bh);
+		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+	}
+
+	if (!buffer_mapped(bh) || buffer_delay(bh)) {
+		if (!buffer_mapped(bh))
+			clear_buffer_dirty(bh);
+		if (io->io_bio)
+			ext4_io_submit(io);
+		return 0;
+	}
+
+	if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+submit_and_retry:
+		ext4_io_submit(io);
+	}
+	if (io->io_bio == NULL) {
+		ret = io_submit_init(io, inode, wbc, bh);
+		if (ret)
+			return ret;
+	}
+	io_end = io->io_end;
+	if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
+	    (io_end->pages[io_end->num_io_pages-1] != io_page))
+		goto submit_and_retry;
+	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+	if (ret != bh->b_size)
+		goto submit_and_retry;
+	/*
+	 * Account for the buffer only once it really is part of the bio.
+	 * Doing it before bio_add_page() would leave the io_end that is
+	 * submitted on the retry path covering a buffer it never wrote.
+	 */
+	if (buffer_uninit(bh))
+		io_end->flag |= EXT4_IO_END_UNWRITTEN;
+	io_end->size += bh->b_size;
+	io->io_next_block++;
+	if ((io_end->num_io_pages == 0) ||
+	    (io_end->pages[io_end->num_io_pages-1] != io_page)) {
+		io_end->pages[io_end->num_io_pages++] = io_page;
+		atomic_inc(&io_page->p_count);
+	}
+	return 0;
+}
+
+/*
+ * Queue the buffers of @page for writeback through @io.
+ *
+ * Buffers that start at or beyond @len are not submitted; they are only
+ * marked clean and uptodate.  The page is unlocked before returning.
+ *
+ * Returns 0 on success or -ENOMEM; in the error case the page has been
+ * re-dirtied so that it is written again later.
+ */
+int ext4_bio_write_page(struct ext4_io_submit *io,
+			struct page *page,
+			int len,
+			struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned block_start, block_end, blocksize;
+	struct ext4_io_page *io_page;
+	struct buffer_head *bh, *head;
+	int ret = 0;
+
+	blocksize = 1 << inode->i_blkbits;
+
+	BUG_ON(PageWriteback(page));
+
+	io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
+	if (!io_page) {
+		set_page_dirty(page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
+	io_page->p_page = page;
+	atomic_set(&io_page->p_count, 1);
+	get_page(page);
+	/*
+	 * Mark the page under writeback only after the allocation above
+	 * has succeeded: the -ENOMEM return has no io_page to drop, so
+	 * PageWriteback would otherwise stay set forever.
+	 */
+	set_page_writeback(page);
+	ClearPageError(page);
+
+	for (bh = head = page_buffers(page), block_start = 0;
+	     bh != head || !block_start;
+	     block_start = block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_start >= len) {
+			clear_buffer_dirty(bh);
+			set_buffer_uptodate(bh);
+			continue;
+		}
+		ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+		if (ret) {
+			/*
+			 * We only get here on ENOMEM.  Not much else
+			 * we can do but mark the page as dirty, and
+			 * better luck next time.
+			 */
+			set_page_dirty(page);
+			break;
+		}
+	}
+	unlock_page(page);
+	/*
+	 * If the page was truncated before we could do the writeback,
+	 * or we had a memory allocation error while trying to write
+	 * the first buffer head, we won't have submitted any pages for
+	 * I/O.  In that case we need to make sure we've cleared the
+	 * PageWriteback bit from the page to prevent the system from
+	 * wedging later on.
+	 */
+	put_io_page(io_page);
+	return ret;
+}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca5c8aa00a2f..dc963929de65 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -226,23 +226,13 @@ static int setup_new_group_blocks(struct super_block *sb,
 	}
 
 	/* Zero out all of the reserved backup group descriptor table blocks */
-	for (i = 0, bit = gdblocks + 1, block = start + bit;
-	     i < reserved_gdb; i++, block++, bit++) {
-		struct buffer_head *gdb;
-
-		ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
-
-		if ((err = extend_or_restart_transaction(handle, 1, bh)))
-			goto exit_bh;
+	ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+		   block, sbi->s_itb_per_group);
+	err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
+			       GFP_NOFS);
+	if (err)
+		goto exit_bh;
 
-		if (IS_ERR(gdb = bclean(handle, sb, block))) {
-			err = PTR_ERR(gdb);
-			goto exit_bh;
-		}
-		ext4_handle_dirty_metadata(handle, NULL, gdb);
-		ext4_set_bit(bit, bh->b_data);
-		brelse(gdb);
-	}
 	ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
 		   input->block_bitmap - start);
ext4_set_bit(input->block_bitmap - start, bh->b_data); @@ -251,28 +241,18 @@ static int setup_new_group_blocks(struct super_block *sb, ext4_set_bit(input->inode_bitmap - start, bh->b_data); /* Zero out all of the inode table blocks */ - for (i = 0, block = input->inode_table, bit = block - start; - i < sbi->s_itb_per_group; i++, bit++, block++) { - struct buffer_head *it; - - ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); - - if ((err = extend_or_restart_transaction(handle, 1, bh))) - goto exit_bh; - - if (IS_ERR(it = bclean(handle, sb, block))) { - err = PTR_ERR(it); - goto exit_bh; - } - ext4_handle_dirty_metadata(handle, NULL, it); - brelse(it); - ext4_set_bit(bit, bh->b_data); - } + block = input->inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); + if (err) + goto exit_bh; if ((err = extend_or_restart_transaction(handle, 2, bh))) goto exit_bh; - mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); + ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, + bh->b_data); ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); /* Mark unused entries in inode bitmap used */ @@ -283,8 +263,8 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_journal; } - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + bh->b_data); ext4_handle_dirty_metadata(handle, NULL, bh); exit_bh: brelse(bh); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8ecc1e590303..61182fe6254e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -40,6 +40,9 @@ #include <linux/crc16.h> #include <asm/uaccess.h> +#include <linux/kthread.h> +#include <linux/freezer.h> + #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -49,8 +52,11 @@ #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> -struct 
proc_dir_entry *ext4_proc_root; +static struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; +struct ext4_lazy_init *ext4_li_info; +struct mutex ext4_li_mtx; +struct ext4_features *ext4_feat; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -67,14 +73,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); -static int ext4_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt); +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data); +static void ext4_destroy_lazyinit_thread(void); +static void ext4_unregister_li_request(struct super_block *sb); #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", - .get_sb = ext4_get_sb, + .mount = ext4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -701,6 +709,7 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + ext4_unregister_li_request(sb); dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); flush_workqueue(sbi->dio_unwritten_wq); @@ -717,6 +726,7 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, "Couldn't clean up the journal"); } + del_timer(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); @@ -818,12 +828,22 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); return &ei->vfs_inode; } +static int ext4_drop_inode(struct inode *inode) +{ + int drop = 
generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + static void ext4_destroy_inode(struct inode *inode) { + ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", @@ -1042,6 +1062,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) seq_puts(seq, ",block_validity"); + if (!test_opt(sb, INIT_INODE_TABLE)) + seq_puts(seq, ",noinit_inode_table"); + else if (sbi->s_li_wait_mult) + seq_printf(seq, ",init_inode_table=%u", + (unsigned) sbi->s_li_wait_mult); + ext4_show_quota_options(seq, sb); return 0; @@ -1157,6 +1183,7 @@ static const struct super_operations ext4_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, @@ -1170,6 +1197,7 @@ static const struct super_operations ext4_sops = { .quota_write = ext4_quota_write, #endif .bdev_try_to_free_page = bdev_try_to_free_page, + .trim_fs = ext4_trim_fs }; static const struct super_operations ext4_nojournal_sops = { @@ -1177,6 +1205,7 @@ static const struct super_operations ext4_nojournal_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .write_super = ext4_write_super, .put_super = ext4_put_super, @@ -1216,6 +1245,7 @@ enum { Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, + Opt_init_inode_table, Opt_noinit_inode_table, }; static const match_table_t tokens = { @@ -1286,6 +1316,9 @@ static const match_table_t tokens = { {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, + {Opt_init_inode_table, "init_itable=%u"}, + 
{Opt_init_inode_table, "init_itable"}, + {Opt_noinit_inode_table, "noinit_itable"}, {Opt_err, NULL}, }; @@ -1756,6 +1789,20 @@ set_qf_format: case Opt_dioread_lock: clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); break; + case Opt_init_inode_table: + set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = EXT4_DEF_LI_WAIT_MULT; + if (option < 0) + return 0; + sbi->s_li_wait_mult = option; + break; + case Opt_noinit_inode_table: + clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + break; default: ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " @@ -1939,7 +1986,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, } /* Called at mount-time, super-block is locked */ -static int ext4_check_descriptors(struct super_block *sb) +static int ext4_check_descriptors(struct super_block *sb, + ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); @@ -1948,7 +1996,7 @@ static int ext4_check_descriptors(struct super_block *sb) ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; - ext4_group_t i; + ext4_group_t i, grp = sbi->s_groups_count; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) flexbg_flag = 1; @@ -1964,6 +2012,10 @@ static int ext4_check_descriptors(struct super_block *sb) last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); + if ((grp == sbi->s_groups_count) && + !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + grp = i; + block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -2001,6 +2053,8 @@ static int ext4_check_descriptors(struct super_block *sb) if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } + if (NULL != first_not_zeroed) + *first_not_zeroed = grp; ext4_free_blocks_count_set(sbi->s_es, 
ext4_count_free_blocks(sb)); sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); @@ -2373,6 +2427,7 @@ static struct ext4_attr ext4_attr_##_name = { \ #define EXT4_ATTR(name, mode, show, store) \ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) +#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) #define EXT4_RW_ATTR_SBI_UI(name, elname) \ @@ -2409,6 +2464,16 @@ static struct attribute *ext4_attrs[] = { NULL, }; +/* Features this copy of ext4 supports */ +EXT4_INFO_ATTR(lazy_itable_init); +EXT4_INFO_ATTR(batched_discard); + +static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), + ATTR_LIST(batched_discard), + NULL, +}; + static ssize_t ext4_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -2437,7 +2502,6 @@ static void ext4_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } - static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, @@ -2449,6 +2513,17 @@ static struct kobj_type ext4_ktype = { .release = ext4_sb_release, }; +static void ext4_feat_release(struct kobject *kobj) +{ + complete(&ext4_feat->f_kobj_unregister); +} + +static struct kobj_type ext4_feat_ktype = { + .default_attrs = ext4_feat_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_feat_release, +}; + /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. 
@@ -2539,6 +2614,371 @@ static void print_daily_error_info(unsigned long arg)
 	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
 }
 
+/* Timer callback: wake the task whose task_struct pointer is in @data. */
+static void ext4_lazyinode_timeout(unsigned long data)
+{
+	struct task_struct *p = (struct task_struct *)data;
+	wake_up_process(p);
+}
+
+/*
+ * Find next suitable group and run ext4_init_inode_table
+ *
+ * Scans forward from elr->lr_next_group for the first group whose
+ * descriptor lacks EXT4_BG_INODE_ZEROED and initialises its inode table.
+ * On the first run (lr_timeout == 0) the time that took, multiplied by
+ * s_li_wait_mult (or 20 when that is zero), becomes the request's
+ * rescheduling interval.
+ *
+ * Returns 1 when a descriptor cannot be read or no group is left,
+ * otherwise the return value of ext4_init_inode_table().
+ */
+static int ext4_run_li_request(struct ext4_li_request *elr)
+{
+	struct ext4_group_desc *gdp = NULL;
+	ext4_group_t group, ngroups;
+	struct super_block *sb;
+	unsigned long timeout = 0;
+	int ret = 0;
+
+	sb = elr->lr_super;
+	ngroups = EXT4_SB(sb)->s_groups_count;
+
+	for (group = elr->lr_next_group; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp) {
+			ret = 1;
+			break;
+		}
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+	}
+
+	if (group == ngroups)
+		ret = 1;
+
+	if (!ret) {
+		timeout = jiffies;
+		ret = ext4_init_inode_table(sb, group,
+					    elr->lr_timeout ? 0 : 1);
+		if (elr->lr_timeout == 0) {
+			timeout = jiffies - timeout;
+			if (elr->lr_sbi->s_li_wait_mult)
+				timeout *= elr->lr_sbi->s_li_wait_mult;
+			else
+				timeout *= 20;
+			elr->lr_timeout = timeout;
+		}
+		elr->lr_next_sched = jiffies + elr->lr_timeout;
+		elr->lr_next_group = group + 1;
+	}
+
+	return ret;
+}
+
+/*
+ * Remove lr_request from the list_request and free the
+ * request structure. Should be called with li_list_mtx held.
+ * A NULL @elr is ignored.
+ */
+static void ext4_remove_li_request(struct ext4_li_request *elr)
+{
+	struct ext4_sb_info *sbi;
+
+	if (!elr)
+		return;
+
+	sbi = elr->lr_sbi;
+
+	list_del(&elr->lr_request);
+	sbi->s_li_request = NULL;
+	kfree(elr);
+}
+
+/*
+ * Drop the lazy-init request of @sb, if it has one.
+ *
+ * s_li_request is read under li_list_mtx instead of being sampled
+ * beforehand: the lazyinit thread removes and frees requests under that
+ * same lock, so a pointer read earlier could already be stale.
+ * ext4_li_mtx is held as well so that an exiting lazyinit thread cannot
+ * free ext4_li_info while it is in use here.
+ */
+static void ext4_unregister_li_request(struct super_block *sb)
+{
+	mutex_lock(&ext4_li_mtx);
+	if (!ext4_li_info) {
+		mutex_unlock(&ext4_li_mtx);
+		return;
+	}
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+	mutex_unlock(&ext4_li_mtx);
+}
+
+/*
+ * This is the function where ext4lazyinit thread lives.
It walks
+ * through the request list searching for next scheduled filesystem.
+ * When such a fs is found, run the lazy initialization request
+ * (ext4_run_li_request) and keep track of the time spent in this
+ * function. Based on that time we compute next schedule time of
+ * the request. When walking through the list is complete, compute
+ * next waking time and put itself into sleep.
+ */
+static int ext4_lazyinit_thread(void *arg)
+{
+	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+	struct list_head *pos, *n;
+	struct ext4_li_request *elr;
+	unsigned long next_wakeup;
+	DEFINE_WAIT(wait);
+
+	BUG_ON(NULL == eli);
+
+	eli->li_timer.data = (unsigned long)current;
+	eli->li_timer.function = ext4_lazyinode_timeout;
+
+	/* publish ourselves; ext4_run_lazyinit_thread() waits for this */
+	eli->li_task = current;
+	wake_up(&eli->li_wait_task);
+
+cont_thread:
+	while (true) {
+		next_wakeup = MAX_JIFFY_OFFSET;
+
+		mutex_lock(&eli->li_list_mtx);
+		if (list_empty(&eli->li_request_list)) {
+			mutex_unlock(&eli->li_list_mtx);
+			goto exit_thread;
+		}
+
+		list_for_each_safe(pos, n, &eli->li_request_list) {
+			elr = list_entry(pos, struct ext4_li_request,
+					 lr_request);
+
+			if (time_after_eq(jiffies, elr->lr_next_sched)) {
+				if (ext4_run_li_request(elr) != 0) {
+					/* error, remove the lazy_init job */
+					ext4_remove_li_request(elr);
+					continue;
+				}
+			}
+
+			if (time_before(elr->lr_next_sched, next_wakeup))
+				next_wakeup = elr->lr_next_sched;
+		}
+		mutex_unlock(&eli->li_list_mtx);
+
+		if (freezing(current))
+			refrigerator();
+
+		if ((time_after_eq(jiffies, next_wakeup)) ||
+		    (MAX_JIFFY_OFFSET == next_wakeup)) {
+			cond_resched();
+			continue;
+		}
+
+		/*
+		 * NOTE(review): add_timer() is issued on every pass;
+		 * presumably li_timer must not still be pending here -
+		 * confirm no early wakeup can reach this point while
+		 * requests remain on the list.
+		 */
+		eli->li_timer.expires = next_wakeup;
+		add_timer(&eli->li_timer);
+		prepare_to_wait(&eli->li_wait_daemon, &wait,
+				TASK_INTERRUPTIBLE);
+		if (time_before(jiffies, next_wakeup))
+			schedule();
+		finish_wait(&eli->li_wait_daemon, &wait);
+	}
+
+exit_thread:
+	/*
+	 * It looks like the request list is empty, but we need
+	 * to check it under the li_list_mtx lock, to prevent any
+	 * additions into it, and of course we should lock ext4_li_mtx
+	 * to atomically free the list and ext4_li_info, because at
+	 * this point another ext4 filesystem could be registering
+	 * new one.
+	 */
+	mutex_lock(&ext4_li_mtx);
+	mutex_lock(&eli->li_list_mtx);
+	if (!list_empty(&eli->li_request_list)) {
+		mutex_unlock(&eli->li_list_mtx);
+		mutex_unlock(&ext4_li_mtx);
+		goto cont_thread;
+	}
+	mutex_unlock(&eli->li_list_mtx);
+	del_timer_sync(&ext4_li_info->li_timer);
+	eli->li_task = NULL;
+	wake_up(&eli->li_wait_task);
+
+	kfree(ext4_li_info);
+	ext4_li_info = NULL;
+	mutex_unlock(&ext4_li_mtx);
+
+	return 0;
+}
+
+/*
+ * Remove and free every request on ext4_li_info's request list.
+ *
+ * There is deliberately no early return for an empty list:
+ * list_for_each_safe() simply does nothing in that case, whereas
+ * returning early would leave li_list_mtx locked.
+ */
+static void ext4_clear_request_list(void)
+{
+	struct list_head *pos, *n;
+	struct ext4_li_request *elr;
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+		elr = list_entry(pos, struct ext4_li_request,
+				 lr_request);
+		ext4_remove_li_request(elr);
+	}
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+/*
+ * Start the "ext4lazyinit" kernel thread and wait until it has stored
+ * itself in ext4_li_info->li_task.
+ *
+ * If the thread cannot be created, every queued request and
+ * ext4_li_info itself are freed and the kthread_run() error is returned.
+ * Returns 0 on success.
+ */
+static int ext4_run_lazyinit_thread(void)
+{
+	struct task_struct *t;
+
+	t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+	if (IS_ERR(t)) {
+		int err = PTR_ERR(t);
+		ext4_clear_request_list();
+		del_timer_sync(&ext4_li_info->li_timer);
+		kfree(ext4_li_info);
+		ext4_li_info = NULL;
+		printk(KERN_CRIT "EXT4: error %d creating inode table "
+				 "initialization thread\n",
+				 err);
+		return err;
+	}
+	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+
+	wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
+	return 0;
+}
+
+/*
+ * Check whether it make sense to run itable init. thread or not.
+ * If there is at least one uninitialized inode table, return
+ * corresponding group number, else the loop goes through all
+ * groups and return total number of groups.
+ */
+static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+{
+	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
+	struct ext4_group_desc *gdp = NULL;
+
+	for (group = 0; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		/* unreadable descriptors are skipped, not reported */
+		if (!gdp)
+			continue;
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+	}
+
+	return group;
+}
+
+/*
+ * Allocate the global ext4_li_info and initialise its request list,
+ * list mutex, wait queues and timer.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int ext4_li_info_new(void)
+{
+	struct ext4_lazy_init *eli = NULL;
+
+	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
+	if (!eli)
+		return -ENOMEM;
+
+	eli->li_task = NULL;
+	INIT_LIST_HEAD(&eli->li_request_list);
+	mutex_init(&eli->li_list_mtx);
+
+	init_waitqueue_head(&eli->li_wait_daemon);
+	init_waitqueue_head(&eli->li_wait_task);
+	init_timer(&eli->li_timer);
+	eli->li_state |= EXT4_LAZYINIT_QUIT;
+
+	ext4_li_info = eli;
+
+	return 0;
+}
+
+/*
+ * Allocate a lazy-init request for @sb that starts at group @start.
+ *
+ * Returns the new request, or NULL if the allocation fails.
+ */
+static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+					    ext4_group_t start)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_li_request *elr;
+	unsigned long rnd;
+
+	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+	if (!elr)
+		return NULL;
+
+	elr->lr_super = sb;
+	elr->lr_sbi = sbi;
+	elr->lr_next_group = start;
+
+	/*
+	 * Randomize first schedule time of the request to
+	 * spread the inode table initialization requests
+	 * better.
+	 */
+	get_random_bytes(&rnd, sizeof(rnd));
+	elr->lr_next_sched = jiffies + (unsigned long)rnd %
+			     (EXT4_DEF_LI_MAX_START_DELAY * HZ);
+
+	return elr;
+}
+
+/*
+ * Queue a lazy inode-table initialisation request for @sb, creating
+ * ext4_li_info and starting the lazyinit thread when necessary.
+ *
+ * Nothing is queued (and 0 is returned) when @sb already has a request,
+ * when @first_not_zeroed equals the group count, when the filesystem is
+ * read-only, or when the INIT_INODE_TABLE mount option is off.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error
+ * from ext4_run_lazyinit_thread().
+ */
+static int ext4_register_li_request(struct super_block *sb,
+				    ext4_group_t first_not_zeroed)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_li_request *elr;
+	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	/* must start at 0: the "thread already running" path never sets it */
+	int ret = 0;
+
+	if (sbi->s_li_request != NULL)
+		return 0;
+
+	if (first_not_zeroed == ngroups ||
+	    (sb->s_flags & MS_RDONLY) ||
+	    !test_opt(sb, INIT_INODE_TABLE)) {
+		sbi->s_li_request = NULL;
+		return 0;
+	}
+
+	elr = ext4_li_request_new(sb, first_not_zeroed);
+	if (!elr)
+		return -ENOMEM;
+
+	mutex_lock(&ext4_li_mtx);
+
+	if (NULL == ext4_li_info) {
+		ret = ext4_li_info_new();
+		if (ret)
+			goto out;
+	}
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+
+	sbi->s_li_request = elr;
+	/*
+	 * The request now belongs to the list.  If starting the thread
+	 * fails, ext4_clear_request_list() frees it, so forget our
+	 * pointer to keep the kfree() below from freeing it again.
+	 */
+	elr = NULL;
+
+	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+		ret = ext4_run_lazyinit_thread();
+		if (ret)
+			goto out;
+	}
+out:
+	mutex_unlock(&ext4_li_mtx);
+	if (ret)
+		kfree(elr);
+	return ret;
+}
+
+/*
+ * We do not need to lock anything since this is called on
+ * module unload.
+ */
+static void ext4_destroy_lazyinit_thread(void)
+{
+	/*
+	 * If thread exited earlier
+	 * there's nothing to be done.
+ */ + if (!ext4_li_info) + return; + + ext4_clear_request_list(); + + while (ext4_li_info->li_task) { + wake_up(&ext4_li_info->li_wait_daemon); + wait_event(ext4_li_info->li_wait_task, + ext4_li_info->li_task == NULL); + } +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2564,6 +3004,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) __u64 blocks_count; int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ext4_group_t first_not_zeroed; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -2624,6 +3065,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sbi->s_mount_opt, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { @@ -2901,7 +3343,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - if (!ext4_check_descriptors(sb)) { + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); goto failed_mount2; } @@ -2917,6 +3359,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount3; + } + sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 
128; @@ -3015,22 +3475,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); -no_journal: - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); - goto failed_mount_wq; - } + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. + */ + percpu_counter_set(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); +no_journal: EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); @@ -3122,6 +3579,10 @@ no_journal: goto failed_mount4; } + err = ext4_register_li_request(sb, first_not_zeroed); + if (err) + goto failed_mount4; + sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, @@ -3176,10 +3637,6 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount3: if (sbi->s_flex_groups) { if (is_vmalloc_addr(sbi->s_flex_groups)) @@ -3187,6 +3644,10 @@ failed_mount3: else kfree(sbi->s_flex_groups); } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + 
percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -3461,7 +3922,7 @@ static int ext4_load_journal(struct super_block *sb, EXT4_SB(sb)->s_journal = journal; ext4_clear_journal_err(sb, es); - if (journal_devnum && + if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); @@ -3515,9 +3976,10 @@ static int ext4_commit_super(struct super_block *sb, int sync) es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); - es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeinodes_counter)); + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); @@ -3835,6 +4297,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) enable_quota = 1; } } + + /* + * Reinitialize lazy itable initialization thread based on + * current settings + */ + if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) + ext4_unregister_li_request(sb); + else { + ext4_group_t first_not_zeroed; + first_not_zeroed = ext4_has_uninit_itable(sb); + ext4_register_li_request(sb, first_not_zeroed); + } + ext4_setup_system_zone(sb); if (sbi->s_journal == NULL) ext4_commit_super(sb, 1); @@ -4105,12 +4580,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, static int ext4_quota_off(struct super_block *sb, int type) { - /* Force all delayed allocation blocks to be allocated */ - if (test_opt(sb, DELALLOC)) { - down_read(&sb->s_umount); + /* Force all delayed allocation blocks to be 
allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - up_read(&sb->s_umount); - } return dquot_quota_off(sb, type); } @@ -4216,17 +4689,17 @@ out: #endif -static int ext4_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); + return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); } #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", - .get_sb = ext4_get_sb, + .mount = ext4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -4271,28 +4744,58 @@ static inline void unregister_as_ext3(void) { } static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", - .get_sb = ext4_get_sb, + .mount = ext4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; -static int __init init_ext4_fs(void) +int __init ext4_init_feat_adverts(void) +{ + struct ext4_features *ef; + int ret = -ENOMEM; + + ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); + if (!ef) + goto out; + + ef->f_kobj.kset = ext4_kset; + init_completion(&ef->f_kobj_unregister); + ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, + "features"); + if (ret) { + kfree(ef); + goto out; + } + + ext4_feat = ef; + ret = 0; +out: + return ret; +} + +static int __init ext4_init_fs(void) { int err; ext4_check_flag_values(); - err = init_ext4_system_zone(); + err = ext4_init_pageio(); if (err) return err; + err = ext4_init_system_zone(); + if (err) + goto out5; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) goto out4; ext4_proc_root = proc_mkdir("fs/ext4", NULL); - err = 
init_ext4_mballoc(); + + err = ext4_init_feat_adverts(); + + err = ext4_init_mballoc(); if (err) goto out3; - err = init_ext4_xattr(); + err = ext4_init_xattr(); if (err) goto out2; err = init_inodecache(); @@ -4303,38 +4806,46 @@ static int __init init_ext4_fs(void) err = register_filesystem(&ext4_fs_type); if (err) goto out; + + ext4_li_info = NULL; + mutex_init(&ext4_li_mtx); return 0; out: unregister_as_ext2(); unregister_as_ext3(); destroy_inodecache(); out1: - exit_ext4_xattr(); + ext4_exit_xattr(); out2: - exit_ext4_mballoc(); + ext4_exit_mballoc(); out3: + kfree(ext4_feat); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); out4: - exit_ext4_system_zone(); + ext4_exit_system_zone(); +out5: + ext4_exit_pageio(); return err; } -static void __exit exit_ext4_fs(void) +static void __exit ext4_exit_fs(void) { + ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); destroy_inodecache(); - exit_ext4_xattr(); - exit_ext4_mballoc(); + ext4_exit_xattr(); + ext4_exit_mballoc(); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); - exit_ext4_system_zone(); + ext4_exit_system_zone(); + ext4_exit_pageio(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); -module_init(init_ext4_fs) -module_exit(exit_ext4_fs) +module_init(ext4_init_fs) +module_exit(ext4_exit_fs) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a8cd8dff1ad..fa4b899da4b3 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #undef BLOCK_HASH_SHIFT int __init -init_ext4_xattr(void) +ext4_init_xattr(void) { ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); if (!ext4_xattr_cache) @@ -1597,7 +1597,7 @@ init_ext4_xattr(void) } void -exit_ext4_xattr(void) +ext4_exit_xattr(void) { if (ext4_xattr_cache) 
mb_cache_destroy(ext4_xattr_cache); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 518e96e43905..1ef16520b950 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -extern int init_ext4_xattr(void); -extern void exit_ext4_xattr(void); +extern int __init ext4_init_xattr(void); +extern void ext4_exit_xattr(void); extern const struct xattr_handler *ext4_xattr_handlers[]; @@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb) { } -static inline int -init_ext4_xattr(void) +static __init inline int +ext4_init_xattr(void) { return 0; } static inline void -exit_ext4_xattr(void) +ext4_exit_xattr(void) { } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index bbca5c186ae7..3345aabd1dd7 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -675,18 +675,17 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static int msdos_get_sb(struct file_system_type *fs_type, +static struct dentry *msdos_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super); } static struct file_system_type msdos_fs_type = { .owner = THIS_MODULE, .name = "msdos", - .get_sb = msdos_get_sb, + .mount = msdos_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6f0f6c9a0152..b936703b8924 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1071,18 +1071,17 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static int vfat_get_sb(struct file_system_type *fs_type, +static struct dentry 
*vfat_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super); } static struct file_system_type vfat_fs_type = { .owner = THIS_MODULE, .name = "vfat", - .get_sb = vfat_get_sb, + .mount = vfat_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/fcntl.c b/fs/fcntl.c index f8cc34f542c3..ecc8b3954ed6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head) * match the state "is the filp on a fasync list". * */ -static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) +int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; int result = 0; @@ -666,21 +666,31 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) return result; } +struct fasync_struct *fasync_alloc(void) +{ + return kmem_cache_alloc(fasync_cache, GFP_KERNEL); +} + /* - * Add a fasync entry. Return negative on error, positive if - * added, and zero if did nothing but change an existing one. + * NOTE! This can be used only for unused fasync entries: + * entries that actually got inserted on the fasync list + * need to be released by rcu - see fasync_remove_entry. + */ +void fasync_free(struct fasync_struct *new) +{ + kmem_cache_free(fasync_cache, new); +} + +/* + * Insert a new entry into the fasync list. Return the pointer to the + * old one if we didn't use the new one. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". 
*/ -static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new) { - struct fasync_struct *new, *fa, **fp; - int result = 0; - - new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); - if (!new) - return -ENOMEM; + struct fasync_struct *fa, **fp; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); @@ -691,8 +701,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa spin_lock_irq(&fa->fa_lock); fa->fa_fd = fd; spin_unlock_irq(&fa->fa_lock); - - kmem_cache_free(fasync_cache, new); goto out; } @@ -702,13 +710,39 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa new->fa_fd = fd; new->fa_next = *fapp; rcu_assign_pointer(*fapp, new); - result = 1; filp->f_flags |= FASYNC; out: spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); - return result; + return fa; +} + +/* + * Add a fasync entry. Return negative on error, positive if + * added, and zero if did nothing but change an existing one. + */ +static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *new; + + new = fasync_alloc(); + if (!new) + return -ENOMEM; + + /* + * fasync_insert_entry() returns the old (update) entry if + * it existed. + * + * So free the (unused) new entry and return 0 to let the + * caller know that we didn't add any new fasync entries. + */ + if (fasync_insert_entry(fd, filp, fapp, new)) { + fasync_free(new); + return 0; + } + + return 1; } /* diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 71b0148b8784..9d1c99558389 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -246,17 +246,16 @@ out: /* * The usual module blurb. 
*/ -static int vxfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *vxfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super); } static struct file_system_type vxfs_fs_type = { .owner = THIS_MODULE, .name = "vxfs", - .get_sb = vxfs_get_sb, + .mount = vxfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index aed881a76b22..3d06ccc953aa 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -707,6 +707,17 @@ get_next_work_item(struct backing_dev_info *bdi) return work; } +/* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ +static unsigned long get_nr_dirty_pages(void) +{ + return global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + get_nr_dirty_inodes(); +} + static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; @@ -724,13 +735,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; wb->last_old_flush = jiffies; - /* - * Add in the number of potentially dirty inodes, because each inode - * write can dirty pagecache in the underlying blockdev. 
- */ - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + - get_nr_dirty_inodes(); + nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { @@ -1076,32 +1081,42 @@ static void wait_sb_inodes(struct super_block *sb) } /** - * writeback_inodes_sb - writeback dirty inodes from given super_block + * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock + * @nr: the number of pages to write * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait - * for IO completion of submitted IO. The number of pages submitted is - * returned. + * for IO completion of submitted IO. */ -void writeback_inodes_sb(struct super_block *sb) +void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, .done = &done, + .nr_pages = nr, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - - work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes(); - bdi_queue_work(sb->s_bdi, &work); wait_for_completion(&done); } +EXPORT_SYMBOL(writeback_inodes_sb_nr); + +/** + * writeback_inodes_sb - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. 
+ */ +void writeback_inodes_sb(struct super_block *sb) +{ + return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); +} EXPORT_SYMBOL(writeback_inodes_sb); /** @@ -1124,6 +1139,27 @@ int writeback_inodes_sb_if_idle(struct super_block *sb) EXPORT_SYMBOL(writeback_inodes_sb_if_idle); /** + * writeback_inodes_sb_if_idle - start writeback if none underway + * @sb: the superblock + * @nr: the number of pages to write + * + * Invoke writeback_inodes_sb if no writeback is currently underway. + * Returns 1 if writeback was started, 0 if not. + */ +int writeback_inodes_sb_nr_if_idle(struct super_block *sb, + unsigned long nr) +{ + if (!writeback_in_progress(sb->s_bdi)) { + down_read(&sb->s_umount); + writeback_inodes_sb_nr(sb, nr); + up_read(&sb->s_umount); + return 1; + } else + return 0; +} +EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); + +/** * sync_inodes_sb - sync sb inode pages * @sb: the superblock * diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 4eba07661e5c..85542a7daf40 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -322,12 +322,10 @@ static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data, - struct vfsmount *mnt) +static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) { - return get_sb_single(fs_type, flags, raw_data, - fuse_ctl_fill_super, mnt); + return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super); } static void fuse_ctl_kill_sb(struct super_block *sb) @@ -346,7 +344,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb) static struct file_system_type fuse_ctl_fs_type = { .owner = THIS_MODULE, .name = "fusectl", - .get_sb = fuse_ctl_get_sb, + .mount = fuse_ctl_mount, .kill_sb = fuse_ctl_kill_sb, }; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b98664275f02..6e07696308dc 100644 --- a/fs/fuse/dev.c +++ 
b/fs/fuse/dev.c @@ -1334,12 +1334,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - int i; - - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; - page_cache_release(page); - } + release_pages(req->pages, req->num_pages, 0); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index da9e6e11374c..cfce3ad86a92 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1041,11 +1041,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) return err; } -static int fuse_get_sb(struct file_system_type *fs_type, +static struct dentry *fuse_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *raw_data, struct vfsmount *mnt) + void *raw_data) { - return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); + return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); } static void fuse_kill_sb_anon(struct super_block *sb) @@ -1065,17 +1065,16 @@ static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", .fs_flags = FS_HAS_SUBTYPE, - .get_sb = fuse_get_sb, + .mount = fuse_mount, .kill_sb = fuse_kill_sb_anon, }; #ifdef CONFIG_BLOCK -static int fuse_get_sb_blk(struct file_system_type *fs_type, +static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, int flags, const char *dev_name, - void *raw_data, struct vfsmount *mnt) + void *raw_data) { - return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); } static void fuse_kill_sb_blk(struct super_block *sb) @@ -1094,7 +1093,7 @@ static void fuse_kill_sb_blk(struct super_block *sb) static struct file_system_type fuseblk_fs_type = { .owner = THIS_MODULE, .name = "fuseblk", - .get_sb = fuse_get_sb_blk, + .mount = fuse_mount_blk, .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, }; 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index cade1acbcea9..3eb1393f7b81 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1250,12 +1250,11 @@ static int test_gfs2_super(struct super_block *s, void *ptr) } /** - * gfs2_get_sb - Get the GFS2 superblock + * gfs2_mount - Get the GFS2 superblock * @fs_type: The GFS2 filesystem type * @flags: Mount flags * @dev_name: The name of the device * @data: The mount arguments - * @mnt: The vfsmnt for this mount * * Q. Why not use get_sb_bdev() ? * A. We need to select one of two root directories to mount, independent @@ -1264,8 +1263,8 @@ static int test_gfs2_super(struct super_block *s, void *ptr) * Returns: 0 or -ve on error */ -static int gfs2_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { struct block_device *bdev; struct super_block *s; @@ -1279,7 +1278,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, bdev = open_bdev_exclusive(dev_name, mode, fs_type); if (IS_ERR(bdev)) - return PTR_ERR(bdev); + return ERR_CAST(bdev); /* * once the super is inserted into the list by sget, s_umount @@ -1298,6 +1297,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, if (IS_ERR(s)) goto error_bdev; + if (s->s_root) + close_bdev_exclusive(bdev, mode); + memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; args.ar_data = GFS2_DATA_DEFAULT; @@ -1309,17 +1311,13 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, error = gfs2_mount_args(&args, data); if (error) { printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); - if (s->s_root) - goto error_super; - deactivate_locked_super(s); - return error; + goto error_super; } if (s->s_root) { error = -EBUSY; if ((flags ^ s->s_flags) & MS_RDONLY) goto error_super; - close_bdev_exclusive(bdev, mode); } else { 
char b[BDEVNAME_SIZE]; @@ -1328,27 +1326,24 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return error; - } + if (error) + goto error_super; s->s_flags |= MS_ACTIVE; bdev->bd_super = s; } sdp = s->s_fs_info; - mnt->mnt_sb = s; if (args.ar_meta) - mnt->mnt_root = dget(sdp->sd_master_dir); + return dget(sdp->sd_master_dir); else - mnt->mnt_root = dget(sdp->sd_root_dir); - return 0; + return dget(sdp->sd_root_dir); error_super: deactivate_locked_super(s); + return ERR_PTR(error); error_bdev: close_bdev_exclusive(bdev, mode); - return error; + return ERR_PTR(error); } static int set_meta_super(struct super_block *s, void *ptr) @@ -1356,8 +1351,8 @@ static int set_meta_super(struct super_block *s, void *ptr) return -EINVAL; } -static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { struct super_block *s; struct gfs2_sbd *sdp; @@ -1368,23 +1363,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, if (error) { printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", dev_name, error); - return error; + return ERR_PTR(error); } s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, path.dentry->d_inode->i_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); - return PTR_ERR(s); + return ERR_CAST(s); } if ((flags ^ s->s_flags) & MS_RDONLY) { deactivate_locked_super(s); - return -EBUSY; + return ERR_PTR(-EBUSY); } sdp = s->s_fs_info; - mnt->mnt_sb = s; - mnt->mnt_root = dget(sdp->sd_master_dir); - return 0; + return dget(sdp->sd_master_dir); } static void gfs2_kill_sb(struct 
super_block *sb) @@ -1410,7 +1403,7 @@ static void gfs2_kill_sb(struct super_block *sb) struct file_system_type gfs2_fs_type = { .name = "gfs2", .fs_flags = FS_REQUIRES_DEV, - .get_sb = gfs2_get_sb, + .mount = gfs2_mount, .kill_sb = gfs2_kill_sb, .owner = THIS_MODULE, }; @@ -1418,7 +1411,7 @@ struct file_system_type gfs2_fs_type = { struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", .fs_flags = FS_REQUIRES_DEV, - .get_sb = gfs2_get_sb_meta, + .mount = gfs2_mount_meta, .owner = THIS_MODULE, }; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 6ee1586f2334..4824c27cebb8 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -441,17 +441,16 @@ bail: return res; } -static int hfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *hfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super); } static struct file_system_type hfs_fs_type = { .owner = THIS_MODULE, .name = "hfs", - .get_sb = hfs_get_sb, + .mount = hfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index e318bbc0daf6..9d59c0571f59 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -317,8 +317,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) res = hfsplus_rename_cat(inode->i_ino, dir, &dentry->d_name, sbi->hidden_dir, &str); - if (!res) + if (!res) { inode->i_flags |= S_DEAD; + drop_nlink(inode); + } goto out; } res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 5b4667e08ef7..40a85a3ded6e 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -92,7 +92,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) mark_inode_dirty(inode); out_unlock_inode: - 
mutex_lock(&inode->i_mutex); + mutex_unlock(&inode->i_mutex); out_drop_write: mnt_drop_write(file->f_path.mnt); out: diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 9a88d7536103..52cc746d3ba3 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -495,18 +495,16 @@ static void hfsplus_destroy_inode(struct inode *inode) #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) -static int hfsplus_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *hfsplus_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super); } static struct file_system_type hfsplus_fs_type = { .owner = THIS_MODULE, .name = "hfsplus", - .get_sb = hfsplus_get_sb, + .mount = hfsplus_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cd7c93917cc7..2c0f148a49e6 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -962,11 +962,11 @@ out: return err; } -static int hostfs_read_sb(struct file_system_type *type, +static struct dentry *hostfs_read_sb(struct file_system_type *type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt); + return mount_nodev(type, flags, data, hostfs_fill_sb_common); } static void hostfs_kill_sb(struct super_block *s) @@ -978,7 +978,7 @@ static void hostfs_kill_sb(struct super_block *s) static struct file_system_type hostfs_type = { .owner = THIS_MODULE, .name = "hostfs", - .get_sb = hostfs_read_sb, + .mount = hostfs_read_sb, .kill_sb = hostfs_kill_sb, .fs_flags = 0, }; diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index eac5f96323e3..793cb9d943d2 100644 --- a/fs/hpfs/buffer.c +++ 
b/fs/hpfs/buffer.c @@ -14,7 +14,7 @@ void hpfs_lock_creation(struct super_block *s) #ifdef DEBUG_LOCKS printk("lock creation\n"); #endif - down(&hpfs_sb(s)->hpfs_creation_de); + mutex_lock(&hpfs_sb(s)->hpfs_creation_de); } void hpfs_unlock_creation(struct super_block *s) @@ -22,7 +22,7 @@ void hpfs_unlock_creation(struct super_block *s) #ifdef DEBUG_LOCKS printk("unlock creation\n"); #endif - up(&hpfs_sb(s)->hpfs_creation_de); + mutex_unlock(&hpfs_sb(s)->hpfs_creation_de); } /* Map a sector into a buffer and return pointers to it and to the buffer. */ diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index b59eac0232a0..2fee17d0d9ab 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -87,7 +87,7 @@ struct hpfs_sb_info { unsigned *sb_bmp_dir; /* main bitmap directory */ unsigned sb_c_bitmap; /* current bitmap */ unsigned sb_max_fwd_alloc; /* max forwad allocation */ - struct semaphore hpfs_creation_de; /* when creating dirents, nobody else + struct mutex hpfs_creation_de; /* when creating dirents, nobody else can alloc blocks */ /*unsigned sb_mounting : 1;*/ int sb_timeshift; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index c969a1aa163a..6c5f01597c3a 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -491,7 +491,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_bmp_dir = NULL; sbi->sb_cp_table = NULL; - init_MUTEX(&sbi->hpfs_creation_de); + mutex_init(&sbi->hpfs_creation_de); uid = current_uid(); gid = current_gid(); @@ -686,17 +686,16 @@ bail0: return -EINVAL; } -static int hpfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *hpfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super); } static struct file_system_type hpfs_fs_type = { .owner = 
THIS_MODULE, .name = "hpfs", - .get_sb = hpfs_get_sb, + .mount = hpfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 4e2a45ea6140..f702b5f713fc 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -748,17 +748,17 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) return(err); } -static int hppfs_read_super(struct file_system_type *type, +static struct dentry *hppfs_read_super(struct file_system_type *type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt); + return mount_nodev(type, flags, data, hppfs_fill_super); } static struct file_system_type hppfs_type = { .owner = THIS_MODULE, .name = "hppfs", - .get_sb = hppfs_read_super, + .mount = hppfs_read_super, .kill_sb = kill_anon_super, .fs_flags = 0, }; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b14be3f781c7..a5fe68189eed 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -896,15 +896,15 @@ void hugetlb_put_quota(struct address_space *mapping, long delta) } } -static int hugetlbfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt); + return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); } static struct file_system_type hugetlbfs_fs_type = { .name = "hugetlbfs", - .get_sb = hugetlbfs_get_sb, + .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; @@ -932,8 +932,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { *user = current_user(); if (user_shm_lock(size, *user)) { - WARN_ONCE(1, - "Using mlock ulimits for SHM_HUGETLB 
deprecated\n"); + printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); } else { *user = NULL; return ERR_PTR(-EPERM); diff --git a/fs/internal.h b/fs/internal.h index ebad3b90752d..e43b9a4dbf4e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -106,5 +106,5 @@ extern void release_open_intent(struct nameidata *); * inode.c */ extern int get_nr_dirty_inodes(void); -extern int evict_inodes(struct super_block *); +extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *); diff --git a/fs/ioctl.c b/fs/ioctl.c index f855ea4fc888..e92fdbb3bc3a 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -530,6 +530,41 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } +static int ioctl_fstrim(struct file *filp, void __user *argp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* If filesystem doesn't support trim feature, return. */ + if (sb->s_op->trim_fs == NULL) + return -EOPNOTSUPP; + + /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ + if (sb->s_bdev == NULL) + return -EINVAL; + + if (argp == NULL) { + range.start = 0; + range.len = ULLONG_MAX; + range.minlen = 0; + } else if (copy_from_user(&range, argp, sizeof(range))) + return -EFAULT; + + ret = sb->s_op->trim_fs(sb, &range); + if (ret < 0) + return ret; + + if ((argp != NULL) && + (copy_to_user(argp, &range, sizeof(range)))) + return -EFAULT; + + return 0; +} + /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. 
@@ -580,6 +615,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, error = ioctl_fsthaw(filp); break; + case FITRIM: + error = ioctl_fstrim(filp, argp); + break; + case FS_IOC_FIEMAP: return ioctl_fiemap(filp, arg); diff --git a/fs/ioprio.c b/fs/ioprio.c index 748cfb92dcc6..2f7d05c89922 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -111,12 +111,14 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) read_lock(&tasklist_lock); switch (which) { case IOPRIO_WHO_PROCESS: + rcu_read_lock(); if (!who) p = current; else p = find_task_by_vpid(who); if (p) ret = set_task_ioprio(p, ioprio); + rcu_read_unlock(); break; case IOPRIO_WHO_PGRP: if (!who) @@ -139,7 +141,12 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - if (__task_cred(p)->uid != who) + int match; + + rcu_read_lock(); + match = __task_cred(p)->uid == who; + rcu_read_unlock(); + if (!match) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -200,12 +207,14 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) read_lock(&tasklist_lock); switch (which) { case IOPRIO_WHO_PROCESS: + rcu_read_lock(); if (!who) p = current; else p = find_task_by_vpid(who); if (p) ret = get_task_ioprio(p); + rcu_read_unlock(); break; case IOPRIO_WHO_PGRP: if (!who) @@ -232,7 +241,12 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - if (__task_cred(p)->uid != user->uid) + int match; + + rcu_read_lock(); + match = __task_cred(p)->uid == user->uid; + rcu_read_unlock(); + if (!match) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 60c2b944d762..bfdeb82a53be 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -544,6 +544,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) } /* + * Check if root directory is empty (has less than 3 files). 
+ * + * Used to detect broken CDs where ISO root directory is empty but Joliet root + * directory is OK. If such CD has Rock Ridge extensions, they will be disabled + * (and Joliet used instead) or else no files would be visible. + */ +static bool rootdir_empty(struct super_block *sb, unsigned long block) +{ + int offset = 0, files = 0, de_len; + struct iso_directory_record *de; + struct buffer_head *bh; + + bh = sb_bread(sb, block); + if (!bh) + return true; + while (files < 3) { + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + if (de_len == 0) + break; + files++; + offset += de_len; + } + brelse(bh); + return files < 3; +} + +/* * Initialize the superblock and read the root inode. * * Note: a check_disk_change() has been done immediately prior @@ -843,6 +871,18 @@ root_found: goto out_no_root; /* + * Fix for broken CDs with Rock Ridge and empty ISO root directory but + * correct Joliet root directory. + */ + if (sbi->s_rock == 1 && joliet_level && + rootdir_empty(s, sbi->s_firstdatazone)) { + printk(KERN_NOTICE + "ISOFS: primary root directory is empty. " + "Disabling Rock Ridge and switching to Joliet."); + sbi->s_rock = 0; + } + + /* * If this disk has both Rock Ridge and Joliet on it, then we * want to use Rock Ridge by default. This can be overridden * by using the norock mount option. 
There is still one other @@ -1467,17 +1507,16 @@ struct inode *isofs_iget(struct super_block *sb, return inode; } -static int isofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *isofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); } static struct file_system_type iso9660_fs_type = { .owner = THIS_MODULE, .name = "iso9660", - .get_sb = isofs_get_sb, + .mount = isofs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 05a38b9c4c0e..e4b87bc1fa56 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -221,7 +221,7 @@ restart: goto restart; } if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); + get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -283,7 +283,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, int ret = 0; if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); + get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 85a6883c0aca..34a4861c14b8 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -587,13 +587,13 @@ void journal_commit_transaction(journal_t *journal) /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get rid of the BJ_IO/BJ_Shadow pairing of buffers. */ - atomic_inc(&jh2bh(jh)->b_count); + get_bh(jh2bh(jh)); /* Make a temporary IO buffer with which to write it out (this will requeue both the metadata buffer and the temporary IO buffer). 
new_bh goes on BJ_IO*/ - set_bit(BH_JWrite, &jh2bh(jh)->b_state); + set_buffer_jwrite(jh2bh(jh)); /* * akpm: journal_write_metadata_buffer() sets * new_bh->b_transaction to commit_transaction. @@ -603,7 +603,7 @@ void journal_commit_transaction(journal_t *journal) JBUFFER_TRACE(jh, "ph3: write metadata"); flags = journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); - set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); + set_buffer_jwrite(jh2bh(new_jh)); wbuf[bufs++] = jh2bh(new_jh); /* Record the new block's tag in the current descriptor @@ -713,7 +713,7 @@ wait_for_iobuf: shadowed buffer */ jh = commit_transaction->t_shadow_list->b_tprev; bh = jh2bh(jh); - clear_bit(BH_JWrite, &bh->b_state); + clear_buffer_jwrite(bh); J_ASSERT_BH(bh, buffer_jbddirty(bh)); /* The metadata is now released for reuse, but we need diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 2c4b1f109da9..da1b5e4ffce1 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -36,6 +36,7 @@ #include <linux/poison.h> #include <linux/proc_fs.h> #include <linux/debugfs.h> +#include <linux/ratelimit.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -84,6 +85,7 @@ EXPORT_SYMBOL(journal_force_commit); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); +static const char *journal_dev_name(journal_t *journal, char *buffer); /* * Helper function used to manage commit timeouts @@ -439,7 +441,7 @@ int __log_start_commit(journal_t *journal, tid_t target) */ if (!tid_geq(journal->j_commit_request, target)) { /* - * We want a new commit: OK, mark the request and wakup the + * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. 
*/ @@ -950,6 +952,8 @@ int journal_create(journal_t *journal) if (err) return err; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (unlikely(!bh)) + return -ENOMEM; lock_buffer(bh); memset (bh->b_data, 0, journal->j_blocksize); BUFFER_TRACE(bh, "marking dirty"); @@ -1010,6 +1014,23 @@ void journal_update_superblock(journal_t *journal, int wait) goto out; } + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "JBD: previous I/O error detected " + "for journal superblock update for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + spin_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1021,9 +1042,17 @@ void journal_update_superblock(journal_t *journal, int wait) BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); - if (wait) + if (wait) { sync_dirty_buffer(bh); - else + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: I/O error detected " + "when updating journal superblock for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + } else write_dirty_buffer(bh, WRITE); out: @@ -1719,7 +1748,6 @@ static void journal_destroy_journal_head_cache(void) static struct journal_head *journal_alloc_journal_head(void) { struct journal_head *ret; - static unsigned long last_warning; #ifdef CONFIG_JBD_DEBUG atomic_inc(&nr_journal_heads); @@ -1727,11 +1755,9 @@ static struct journal_head *journal_alloc_journal_head(void) ret = kmem_cache_alloc(journal_head_cache, 
GFP_NOFS); if (ret == NULL) { jbd_debug(1, "out of memory for journal_head\n"); - if (time_after(jiffies, last_warning + 5*HZ)) { - printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __func__); - last_warning = jiffies; - } + printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", + __func__); + while (ret == NULL) { yield(); ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 81051dafebf5..5b43e96788e6 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -296,10 +296,10 @@ int journal_skip_recovery(journal_t *journal) #ifdef CONFIG_JBD_DEBUG int dropped = info.end_transaction - be32_to_cpu(journal->j_superblock->s_sequence); -#endif jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); +#endif journal->j_transaction_sequence = ++info.end_transaction; } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 5ae71e75a491..846a3f314111 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -293,9 +293,7 @@ handle_t *journal_start(journal_t *journal, int nblocks) jbd_free_handle(handle); current->journal_info = NULL; handle = ERR_PTR(err); - goto out; } -out: return handle; } @@ -528,7 +526,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, transaction = handle->h_transaction; journal = transaction->t_journal; - jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: @@ -713,7 +711,7 @@ done: J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), "Possible IO failure.\n"); page = jh2bh(jh)->b_page; - offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; + offset = offset_in_page(jh2bh(jh)->b_data); source = kmap_atomic(page, KM_USER0); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); kunmap_atomic(source, KM_USER0); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 
6571a056e55d..6a79fd0a1a32 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -299,6 +299,16 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, transaction->t_chp_stats.cs_forced_to_close++; spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); + if (unlikely(journal->j_flags & JBD2_UNMOUNT)) + /* + * The journal thread is dead; so starting and + * waiting for a commit to finish will cause + * us to wait for a _very_ long time. + */ + printk(KERN_ERR "JBD2: %s: " + "Waiting for Godot: block %llu\n", + journal->j_devname, + (unsigned long long) bh->b_blocknr); jbd2_log_start_commit(journal, tid); jbd2_log_wait_commit(journal, tid); ret = 1; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index bc6be8bda1cc..f3ad1598b201 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -26,7 +26,9 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/bitops.h> #include <trace/events/jbd2.h> +#include <asm/system.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -201,7 +203,7 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { mapping = jinode->i_vfs_inode->i_mapping; - jinode->i_flags |= JI_COMMIT_RUNNING; + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); spin_unlock(&journal->j_list_lock); /* * submit the inode data buffers. 
We use writepage @@ -216,7 +218,8 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); commit_transaction->t_flushed_data_blocks = 1; - jinode->i_flags &= ~JI_COMMIT_RUNNING; + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + smp_mb__after_clear_bit(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } spin_unlock(&journal->j_list_lock); @@ -237,7 +240,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { - jinode->i_flags |= JI_COMMIT_RUNNING; + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); spin_unlock(&journal->j_list_lock); err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); if (err) { @@ -253,7 +256,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, ret = err; } spin_lock(&journal->j_list_lock); - jinode->i_flags &= ~JI_COMMIT_RUNNING; + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + smp_mb__after_clear_bit(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 262419f83d80..c590d155c095 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -42,12 +42,14 @@ #include <linux/log2.h> #include <linux/vmalloc.h> #include <linux/backing-dev.h> +#include <linux/bitops.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> #include <asm/uaccess.h> #include <asm/page.h> +#include <asm/system.h> EXPORT_SYMBOL(jbd2_journal_extend); EXPORT_SYMBOL(jbd2_journal_stop); @@ -478,7 +480,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) */ if (!tid_geq(journal->j_commit_request, target)) { /* - * We want a new commit: OK, mark the request and wakup the + * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. 
*/ @@ -1836,7 +1838,6 @@ size_t journal_tag_bytes(journal_t *journal) */ #define JBD2_MAX_SLABS 8 static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; -static DECLARE_MUTEX(jbd2_slab_create_sem); static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", @@ -1857,6 +1858,7 @@ static void jbd2_journal_destroy_slabs(void) static int jbd2_journal_create_slab(size_t size) { + static DEFINE_MUTEX(jbd2_slab_create_mutex); int i = order_base_2(size) - 10; size_t slab_size; @@ -1868,16 +1870,16 @@ static int jbd2_journal_create_slab(size_t size) if (unlikely(i < 0)) i = 0; - down(&jbd2_slab_create_sem); + mutex_lock(&jbd2_slab_create_mutex); if (jbd2_slab[i]) { - up(&jbd2_slab_create_sem); + mutex_unlock(&jbd2_slab_create_mutex); return 0; /* Already created */ } slab_size = 1 << (i+10); jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, slab_size, 0, NULL); - up(&jbd2_slab_create_sem); + mutex_unlock(&jbd2_slab_create_mutex); if (!jbd2_slab[i]) { printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); return -ENOMEM; @@ -2210,7 +2212,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal, restart: spin_lock(&journal->j_list_lock); /* Is commit writing out inode - we have to wait */ - if (jinode->i_flags & JI_COMMIT_RUNNING) { + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index f3479d6e0a83..6bf0a242613e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -156,6 +156,7 @@ alloc_transaction: */ repeat: read_lock(&journal->j_state_lock); + BUG_ON(journal->j_flags & JBD2_UNMOUNT); if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { read_unlock(&journal->j_state_lock); diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 
a906f538d11c..85c6be2db02f 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -23,7 +23,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, static inline struct jffs2_inode_cache * first_inode_chain(int *i, struct jffs2_sb_info *c) { - for (; *i < INOCACHE_HASHSIZE; (*i)++) { + for (; *i < c->inocache_hashsize; (*i)++) { if (c->inocache_list[*i]) return c->inocache_list[*i]; } diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 617a1e5694c1..de4247021d25 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -103,7 +103,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, spin_unlock(&jffs2_compressor_list_lock); *datalen = orig_slen; *cdatalen = orig_dlen; - compr_ret = this->compress(data_in, output_buf, datalen, cdatalen, NULL); + compr_ret = this->compress(data_in, output_buf, datalen, cdatalen); spin_lock(&jffs2_compressor_list_lock); this->usecount--; if (!compr_ret) { @@ -152,7 +152,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, spin_unlock(&jffs2_compressor_list_lock); *datalen = orig_slen; *cdatalen = orig_dlen; - compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen, NULL); + compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen); spin_lock(&jffs2_compressor_list_lock); this->usecount--; if (!compr_ret) { @@ -220,7 +220,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, if (comprtype == this->compr) { this->usecount++; spin_unlock(&jffs2_compressor_list_lock); - ret = this->decompress(cdata_in, data_out, cdatalen, datalen, NULL); + ret = this->decompress(cdata_in, data_out, cdatalen, datalen); spin_lock(&jffs2_compressor_list_lock); if (ret) { printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret); diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h index e471a9106fd9..13bb7597ab39 100644 --- a/fs/jffs2/compr.h +++ b/fs/jffs2/compr.h @@ -49,9 +49,9 @@ struct jffs2_compressor { char 
*name; char compr; /* JFFS2_COMPR_XXX */ int (*compress)(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *srclen, uint32_t *destlen, void *model); + uint32_t *srclen, uint32_t *destlen); int (*decompress)(unsigned char *cdata_in, unsigned char *data_out, - uint32_t cdatalen, uint32_t datalen, void *model); + uint32_t cdatalen, uint32_t datalen); int usecount; int disabled; /* if set the compressor won't compress */ unsigned char *compr_buf; /* used by size compr. mode */ diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c index ed25ae7c98eb..af186ee674d8 100644 --- a/fs/jffs2/compr_lzo.c +++ b/fs/jffs2/compr_lzo.c @@ -42,7 +42,7 @@ static int __init alloc_workspace(void) } static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *sourcelen, uint32_t *dstlen, void *model) + uint32_t *sourcelen, uint32_t *dstlen) { size_t compress_size; int ret; @@ -67,7 +67,7 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, } static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t srclen, uint32_t destlen, void *model) + uint32_t srclen, uint32_t destlen) { size_t dl = destlen; int ret; diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c index 9696ad9ef5f7..16a5047903a6 100644 --- a/fs/jffs2/compr_rtime.c +++ b/fs/jffs2/compr_rtime.c @@ -31,8 +31,7 @@ /* _compress returns the compressed size, -1 if bigger */ static int jffs2_rtime_compress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *sourcelen, uint32_t *dstlen, - void *model) + uint32_t *sourcelen, uint32_t *dstlen) { short positions[256]; int outpos = 0; @@ -73,8 +72,7 @@ static int jffs2_rtime_compress(unsigned char *data_in, static int jffs2_rtime_decompress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t srclen, uint32_t destlen, - void *model) + uint32_t srclen, uint32_t destlen) { short positions[256]; int outpos = 0; diff --git a/fs/jffs2/compr_rubin.c 
b/fs/jffs2/compr_rubin.c index a12b4f763373..9e7cec808c4c 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c @@ -298,7 +298,7 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, #if 0 /* _compress returns the compressed size, -1 if bigger */ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *sourcelen, uint32_t *dstlen, void *model) + uint32_t *sourcelen, uint32_t *dstlen) { return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); @@ -306,8 +306,7 @@ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, #endif static int jffs2_dynrubin_compress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *sourcelen, uint32_t *dstlen, - void *model) + uint32_t *sourcelen, uint32_t *dstlen) { int bits[8]; unsigned char histo[256]; @@ -387,8 +386,7 @@ static void rubin_do_decompress(int bit_divider, int *bits, static int jffs2_rubinmips_decompress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t sourcelen, uint32_t dstlen, - void *model) + uint32_t sourcelen, uint32_t dstlen) { rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); @@ -397,8 +395,7 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in, static int jffs2_dynrubin_decompress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t sourcelen, uint32_t dstlen, - void *model) + uint32_t sourcelen, uint32_t dstlen) { int bits[8]; int c; diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c index 97fc45de6f81..fd05a0b9431d 100644 --- a/fs/jffs2/compr_zlib.c +++ b/fs/jffs2/compr_zlib.c @@ -68,8 +68,7 @@ static void free_workspaces(void) static int jffs2_zlib_compress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *sourcelen, uint32_t *dstlen, - void *model) + uint32_t *sourcelen, uint32_t *dstlen) { int ret; @@ -136,8 +135,7 @@ static int jffs2_zlib_compress(unsigned char *data_in, 
static int jffs2_zlib_decompress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t srclen, uint32_t destlen, - void *model) + uint32_t srclen, uint32_t destlen) { int ret; int wbits = MAX_WBITS; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 79121aa5858b..92978658ed18 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -367,7 +367,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char } /* We use f->target field to store the target path. */ - f->target = kmalloc(targetlen + 1, GFP_KERNEL); + f->target = kmemdup(target, targetlen + 1, GFP_KERNEL); if (!f->target) { printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); mutex_unlock(&f->sem); @@ -376,7 +376,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char goto fail; } - memcpy(f->target, target, targetlen + 1); D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target)); /* No data here. Only a metadata node, which will be diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index abac961f617b..e513f1913c15 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -151,7 +151,7 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) } /* Be nice */ - yield(); + cond_resched(); mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index d9beb06e6fca..e896e67767eb 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -474,6 +474,25 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i return inode; } +static int calculate_inocache_hashsize(uint32_t flash_size) +{ + /* + * Pick a inocache hash size based on the size of the medium. + * Count how many megabytes we're dealing with, apply a hashsize twice + * that size, but rounding down to the usual big powers of 2. And keep + * to sensible bounds. 
+ */ + + int size_mb = flash_size / 1024 / 1024; + int hashsize = (size_mb * 2) & ~0x3f; + + if (hashsize < INOCACHE_HASHSIZE_MIN) + return INOCACHE_HASHSIZE_MIN; + if (hashsize > INOCACHE_HASHSIZE_MAX) + return INOCACHE_HASHSIZE_MAX; + + return hashsize; +} int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) { @@ -520,7 +539,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) if (ret) return ret; - c->inocache_list = kcalloc(INOCACHE_HASHSIZE, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); + c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size); + c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); if (!c->inocache_list) { ret = -ENOMEM; goto out_wbuf; diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 846a79452497..31dce611337c 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -219,13 +219,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) if (!list_empty(&c->erase_complete_list) || !list_empty(&c->erase_pending_list)) { spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); - if (jffs2_erase_pending_blocks(c, 1)) { - mutex_unlock(&c->alloc_sem); + if (jffs2_erase_pending_blocks(c, 1)) return 0; - } + D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); spin_lock(&c->erase_completion_lock); + mutex_lock(&c->alloc_sem); } /* First, work out which block we're garbage-collecting */ diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 6784bc89add1..f864005de64c 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -100,6 +100,7 @@ struct jffs2_sb_info { wait_queue_head_t erase_wait; /* For waiting for erases to complete */ wait_queue_head_t inocache_wq; + int inocache_hashsize; struct jffs2_inode_cache **inocache_list; spinlock_t inocache_lock; diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index 
af02bd138469..5e03233c2363 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -420,7 +420,7 @@ struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t { struct jffs2_inode_cache *ret; - ret = c->inocache_list[ino % INOCACHE_HASHSIZE]; + ret = c->inocache_list[ino % c->inocache_hashsize]; while (ret && ret->ino < ino) { ret = ret->next; } @@ -441,7 +441,7 @@ void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new dbg_inocache("add %p (ino #%u)\n", new, new->ino); - prev = &c->inocache_list[new->ino % INOCACHE_HASHSIZE]; + prev = &c->inocache_list[new->ino % c->inocache_hashsize]; while ((*prev) && (*prev)->ino < new->ino) { prev = &(*prev)->next; @@ -462,7 +462,7 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old) dbg_inocache("del %p (ino #%u)\n", old, old->ino); spin_lock(&c->inocache_lock); - prev = &c->inocache_list[old->ino % INOCACHE_HASHSIZE]; + prev = &c->inocache_list[old->ino % c->inocache_hashsize]; while ((*prev) && (*prev)->ino < old->ino) { prev = &(*prev)->next; @@ -487,7 +487,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c) int i; struct jffs2_inode_cache *this, *next; - for (i=0; i<INOCACHE_HASHSIZE; i++) { + for (i=0; i < c->inocache_hashsize; i++) { this = c->inocache_list[i]; while (this) { next = this->next; diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 523a91691052..5a53d9bdb2b5 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -199,7 +199,8 @@ struct jffs2_inode_cache { #define RAWNODE_CLASS_XATTR_DATUM 1 #define RAWNODE_CLASS_XATTR_REF 2 -#define INOCACHE_HASHSIZE 128 +#define INOCACHE_HASHSIZE_MIN 128 +#define INOCACHE_HASHSIZE_MAX 1024 #define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size) diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 46f870d1cc36..b632dddcb482 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -20,7 +20,7 @@ #include "summary.h" #include "debug.h" 
-#define DEFAULT_EMPTY_SCAN_SIZE 1024 +#define DEFAULT_EMPTY_SCAN_SIZE 256 #define noisy_printk(noise, args...) do { \ if (*(noise)) { \ @@ -435,7 +435,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) { struct jffs2_unknown_node *node; struct jffs2_unknown_node crcnode; - uint32_t ofs, prevofs; + uint32_t ofs, prevofs, max_ofs; uint32_t hdr_crc, buf_ofs, buf_len; int err; int noise = 0; @@ -550,12 +550,12 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo /* We temporarily use 'ofs' as a pointer into the buffer/jeb */ ofs = 0; - - /* Scan only 4KiB of 0xFF before declaring it's empty */ - while(ofs < EMPTY_SCAN_SIZE(c->sector_size) && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) + max_ofs = EMPTY_SCAN_SIZE(c->sector_size); + /* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */ + while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) ofs += 4; - if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) { + if (ofs == max_ofs) { #ifdef CONFIG_JFFS2_FS_WRITEBUFFER if (jffs2_cleanmarker_oob(c)) { /* scan oob, take care of cleanmarker */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d1ae5dfc22b9..c86041b866a4 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -179,12 +179,11 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) return ret; } -static int jffs2_get_sb(struct file_system_type *fs_type, +static struct dentry *jffs2_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super, - mnt); + return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super); } static void jffs2_put_super (struct super_block *sb) @@ -229,7 +228,7 @@ static void jffs2_kill_sb(struct super_block *sb) static struct file_system_type jffs2_fs_type = { .owner = THIS_MODULE, .name = 
"jffs2", - .get_sb = jffs2_get_sb, + .mount = jffs2_mount, .kill_sb = jffs2_kill_sb, }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 68eee2bf629e..0669fc1cc3bf 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -583,11 +583,10 @@ static int jfs_unfreeze(struct super_block *sb) return 0; } -static int jfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *jfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super); } static int jfs_sync_fs(struct super_block *sb, int wait) @@ -770,7 +769,7 @@ static const struct export_operations jfs_export_operations = { static struct file_system_type jfs_fs_type = { .owner = THIS_MODULE, .name = "jfs", - .get_sb = jfs_get_sb, + .mount = jfs_do_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/libfs.c b/fs/libfs.c index 304a5132ca27..a3accdf528ad 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -201,9 +201,8 @@ static const struct super_operations simple_super_operations = { * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ -int get_sb_pseudo(struct file_system_type *fs_type, char *name, - const struct super_operations *ops, unsigned long magic, - struct vfsmount *mnt) +struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, + const struct super_operations *ops, unsigned long magic) { struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); struct dentry *dentry; @@ -211,7 +210,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, struct qstr d_name = {.name = name, .len = strlen(name)}; if (IS_ERR(s)) - return PTR_ERR(s); + return ERR_CAST(s); s->s_flags = MS_NOUSER; s->s_maxbytes = MAX_LFS_FILESIZE; @@ -241,12 +240,11 @@ int 
get_sb_pseudo(struct file_system_type *fs_type, char *name, d_instantiate(dentry, root); s->s_root = dentry; s->s_flags |= MS_ACTIVE; - simple_set_mnt(mnt, s); - return 0; + return dget(s->s_root); Enomem: deactivate_locked_super(s); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) @@ -951,7 +949,7 @@ EXPORT_SYMBOL(dcache_dir_lseek); EXPORT_SYMBOL(dcache_dir_open); EXPORT_SYMBOL(dcache_readdir); EXPORT_SYMBOL(generic_read_dir); -EXPORT_SYMBOL(get_sb_pseudo); +EXPORT_SYMBOL(mount_pseudo); EXPORT_SYMBOL(simple_write_begin); EXPORT_SYMBOL(simple_write_end); EXPORT_SYMBOL(simple_dir_inode_operations); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index b13aabc12298..abfff9d7979d 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -22,7 +22,6 @@ #include <linux/in.h> #include <linux/uio.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/mutex.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -130,15 +129,6 @@ lockd(void *vrqstp) dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); - /* - * FIXME: it would be nice if lockd didn't spend its entire life - * running under the BKL. At the very least, it would be good to - * have someone clarify what it's intended to protect here. I've - * seen some handwavy posts about posix locking needing to be - * done under the BKL, but it's far from clear. 
- */ - lock_kernel(); - if (!nlm_timeout) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; @@ -195,7 +185,6 @@ lockd(void *vrqstp) if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); - unlock_kernel(); return 0; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 6f1ef000975a..c462d346acbd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -700,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl) struct nlm_block *block; dprintk("lockd: VFS unblock notification for block %p\n", fl); + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { - nlmsvc_insert_block(block, 0); + nlmsvc_insert_block_locked(block, 0); + spin_unlock(&nlm_blocked_lock); svc_wake_up(block->b_daemon); return; } } - + spin_unlock(&nlm_blocked_lock); printk(KERN_WARNING "lockd: notification for unknown block!\n"); } diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index d0ef94cfb3da..1ca0679c80bf 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, again: file->f_locks = 0; + lock_flocks(); /* protects i_flock list */ for (fl = inode->i_flock; fl; fl = fl->fl_next) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; @@ -181,6 +182,7 @@ again: if (match(lockhost, host)) { struct file_lock lock = *fl; + unlock_flocks(); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; @@ -192,6 +194,7 @@ again: goto again; } } + unlock_flocks(); return 0; } @@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file) if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; + lock_flocks(); for (fl = inode->i_flock; fl; fl = fl->fl_next) { - if (fl->fl_lmops == &nlmsvc_lock_operations) + if (fl->fl_lmops == &nlmsvc_lock_operations) { + unlock_flocks(); return 1; + } } + unlock_flocks(); file->f_locks = 0; return 0; } diff --git a/fs/locks.c 
b/fs/locks.c index 4de3a2666810..0e62dd35d088 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -142,6 +142,7 @@ int lease_break_time = 45; static LIST_HEAD(file_lock_list); static LIST_HEAD(blocked_list); +static DEFINE_SPINLOCK(file_lock_lock); /* * Protects the two list heads above, plus the inode->i_flock list @@ -149,23 +150,24 @@ static LIST_HEAD(blocked_list); */ void lock_flocks(void) { - lock_kernel(); + spin_lock(&file_lock_lock); } EXPORT_SYMBOL_GPL(lock_flocks); void unlock_flocks(void) { - unlock_kernel(); + spin_unlock(&file_lock_lock); } EXPORT_SYMBOL_GPL(unlock_flocks); static struct kmem_cache *filelock_cache __read_mostly; /* Allocate an empty lock structure. */ -static struct file_lock *locks_alloc_lock(void) +struct file_lock *locks_alloc_lock(void) { return kmem_cache_alloc(filelock_cache, GFP_KERNEL); } +EXPORT_SYMBOL_GPL(locks_alloc_lock); void locks_release_private(struct file_lock *fl) { @@ -184,7 +186,7 @@ void locks_release_private(struct file_lock *fl) EXPORT_SYMBOL_GPL(locks_release_private); /* Free a lock which is not in use. 
*/ -static void locks_free_lock(struct file_lock *fl) +void locks_free_lock(struct file_lock *fl) { BUG_ON(waitqueue_active(&fl->fl_wait)); BUG_ON(!list_empty(&fl->fl_block)); @@ -193,6 +195,7 @@ static void locks_free_lock(struct file_lock *fl) locks_release_private(fl); kmem_cache_free(filelock_cache, fl); } +EXPORT_SYMBOL(locks_free_lock); void locks_init_lock(struct file_lock *fl) { @@ -232,11 +235,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl) fl->fl_ops->fl_copy_lock(new, fl); new->fl_ops = fl->fl_ops; } - if (fl->fl_lmops) { - if (fl->fl_lmops->fl_copy_lock) - fl->fl_lmops->fl_copy_lock(new, fl); + if (fl->fl_lmops) new->fl_lmops = fl->fl_lmops; - } } /* @@ -1365,31 +1365,27 @@ int fcntl_getlease(struct file *filp) int generic_setlease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; - struct file_lock *new_fl = NULL; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; int error, rdlease_count = 0, wrlease_count = 0; + lease = *flp; + + error = -EACCES; if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) - return -EACCES; + goto out; + error = -EINVAL; if (!S_ISREG(inode->i_mode)) - return -EINVAL; + goto out; error = security_file_lock(filp, arg); if (error) - return error; + goto out; time_out_leases(inode); BUG_ON(!(*flp)->fl_lmops->fl_break); - lease = *flp; - if (arg != F_UNLCK) { - error = -ENOMEM; - new_fl = locks_alloc_lock(); - if (new_fl == NULL) - goto out; - error = -EAGAIN; if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; @@ -1429,12 +1425,12 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) goto out; if (my_before != NULL) { - *flp = *my_before; error = lease->fl_lmops->fl_change(my_before, arg); + if (!error) + *flp = *my_before; goto out; } - error = 0; if (arg == F_UNLCK) goto out; @@ -1442,15 +1438,10 @@ int generic_setlease(struct file *filp, long 
arg, struct file_lock **flp) if (!leases_enable) goto out; - locks_copy_lock(new_fl, lease); - locks_insert_lock(before, new_fl); - - *flp = new_fl; + locks_insert_lock(before, lease); return 0; out: - if (new_fl != NULL) - locks_free_lock(new_fl); return error; } EXPORT_SYMBOL(generic_setlease); @@ -1502,6 +1493,59 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease) } EXPORT_SYMBOL_GPL(vfs_setlease); +static int do_fcntl_delete_lease(struct file *filp) +{ + struct file_lock fl, *flp = &fl; + + lease_init(filp, F_UNLCK, flp); + + return vfs_setlease(filp, F_UNLCK, &flp); +} + +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) +{ + struct file_lock *fl, *ret; + struct fasync_struct *new; + int error; + + fl = lease_alloc(filp, arg); + if (IS_ERR(fl)) + return PTR_ERR(fl); + + new = fasync_alloc(); + if (!new) { + locks_free_lock(fl); + return -ENOMEM; + } + ret = fl; + lock_flocks(); + error = __vfs_setlease(filp, arg, &ret); + if (error) { + unlock_flocks(); + locks_free_lock(fl); + goto out_free_fasync; + } + if (ret != fl) + locks_free_lock(fl); + + /* + * fasync_insert_entry() returns the old entry if any. + * If there was no old entry, then it used 'new' and + * inserted it into the fasync list. Clear new so that + * we don't release it here. 
+ */ + if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new)) + new = NULL; + + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + unlock_flocks(); + +out_free_fasync: + if (new) + fasync_free(new); + return error; +} + /** * fcntl_setlease - sets a lease on an open file * @fd: open file descriptor @@ -1514,34 +1558,9 @@ EXPORT_SYMBOL_GPL(vfs_setlease); */ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) { - struct file_lock fl, *flp = &fl; - struct inode *inode = filp->f_path.dentry->d_inode; - int error; - - locks_init_lock(&fl); - error = lease_init(filp, arg, &fl); - if (error) - return error; - - lock_flocks(); - - error = __vfs_setlease(filp, arg, &flp); - if (error || arg == F_UNLCK) - goto out_unlock; - - error = fasync_helper(fd, filp, 1, &flp->fl_fasync); - if (error < 0) { - /* remove lease just inserted by setlease */ - flp->fl_type = F_UNLCK | F_INPROGRESS; - flp->fl_break_time = jiffies - 10; - time_out_leases(inode); - goto out_unlock; - } - - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); -out_unlock: - unlock_flocks(); - return error; + if (arg == F_UNLCK) + return do_fcntl_delete_lease(filp); + return do_fcntl_add_lease(fd, filp, arg); } /** diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 9bd2ce2a3040..92ca6fbe09bd 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -298,9 +298,9 @@ static int bdev_write_sb(struct super_block *sb, struct page *page) return sync_request(page, bdev, WRITE); } -static void bdev_put_device(struct super_block *sb) +static void bdev_put_device(struct logfs_super *s) { - close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); + close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE); } static int bdev_can_write_buf(struct super_block *sb, u64 ofs) @@ -320,8 +320,8 @@ static const struct logfs_device_ops bd_devops = { .put_device = bdev_put_device, }; -int logfs_get_sb_bdev(struct file_system_type *type, int flags, - const char *devname, 
struct vfsmount *mnt) +int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, + const char *devname) { struct block_device *bdev; @@ -332,8 +332,11 @@ int logfs_get_sb_bdev(struct file_system_type *type, int flags, if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { int mtdnr = MINOR(bdev->bd_dev); close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); - return logfs_get_sb_mtd(type, flags, mtdnr, mnt); + return logfs_get_sb_mtd(p, mtdnr); } - return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt); + p->s_bdev = bdev; + p->s_mtd = NULL; + p->s_devops = &bd_devops; + return 0; } diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index a85d47d13e4b..7466e9dcc8c5 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -230,9 +230,9 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); } -static void mtd_put_device(struct super_block *sb) +static void mtd_put_device(struct logfs_super *s) { - put_mtd_device(logfs_super(sb)->s_mtd); + put_mtd_device(s->s_mtd); } static int mtd_can_write_buf(struct super_block *sb, u64 ofs) @@ -265,14 +265,14 @@ static const struct logfs_device_ops mtd_devops = { .put_device = mtd_put_device, }; -int logfs_get_sb_mtd(struct file_system_type *type, int flags, - int mtdnr, struct vfsmount *mnt) +int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) { - struct mtd_info *mtd; - const struct logfs_device_ops *devops = &mtd_devops; - - mtd = get_mtd_device(NULL, mtdnr); + struct mtd_info *mtd = get_mtd_device(NULL, mtdnr); if (IS_ERR(mtd)) return PTR_ERR(mtd); - return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); + + s->s_bdev = NULL; + s->s_mtd = mtd; + s->s_devops = &mtd_devops; + return 0; } diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index b8786264d243..57afd4a6fabb 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -136,6 +136,7 @@ struct logfs_area_ops { int (*erase_segment)(struct logfs_area *area); 
}; +struct logfs_super; /* forward */ /** * struct logfs_device_ops - device access operations * @@ -156,7 +157,7 @@ struct logfs_device_ops { int ensure_write); int (*can_write_buf)(struct super_block *sb, u64 ofs); void (*sync)(struct super_block *sb); - void (*put_device)(struct super_block *sb); + void (*put_device)(struct logfs_super *s); }; /** @@ -471,11 +472,13 @@ void logfs_compr_exit(void); /* dev_bdev.c */ #ifdef CONFIG_BLOCK -int logfs_get_sb_bdev(struct file_system_type *type, int flags, - const char *devname, struct vfsmount *mnt); +int logfs_get_sb_bdev(struct logfs_super *s, + struct file_system_type *type, + const char *devname); #else -static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, - const char *devname, struct vfsmount *mnt) +static inline int logfs_get_sb_bdev(struct logfs_super *s, + struct file_system_type *type, + const char *devname) { return -ENODEV; } @@ -483,11 +486,9 @@ static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, /* dev_mtd.c */ #ifdef CONFIG_MTD -int logfs_get_sb_mtd(struct file_system_type *type, int flags, - int mtdnr, struct vfsmount *mnt); +int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); #else -static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags, - int mtdnr, struct vfsmount *mnt) +static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) { return -ENODEV; } @@ -619,9 +620,6 @@ void emergency_read_end(struct page *page); void logfs_crash_dump(struct super_block *sb); void *memchr_inv(const void *s, int c, size_t n); int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); -int logfs_get_sb_device(struct file_system_type *type, int flags, - struct mtd_info *mtd, struct block_device *bdev, - const struct logfs_device_ops *devops, struct vfsmount *mnt); int logfs_check_ds(struct logfs_disk_super *ds); int logfs_write_sb(struct super_block *sb); diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 
5336155c5d81..33435e4b14d2 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -325,7 +325,7 @@ static int logfs_make_writeable(struct super_block *sb) return 0; } -static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) +static int logfs_get_sb_final(struct super_block *sb) { struct logfs_super *super = logfs_super(sb); struct inode *rootdir; @@ -356,7 +356,6 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) } log_super("LogFS: Finished mounting\n"); - simple_set_mnt(mnt, sb); return 0; fail: @@ -529,43 +528,37 @@ static void logfs_kill_sb(struct super_block *sb) logfs_cleanup_rw(sb); if (super->s_erase_page) __free_page(super->s_erase_page); - super->s_devops->put_device(sb); + super->s_devops->put_device(super); logfs_mempool_destroy(super->s_btree_pool); logfs_mempool_destroy(super->s_alias_pool); kfree(super); log_super("LogFS: Finished unmounting\n"); } -int logfs_get_sb_device(struct file_system_type *type, int flags, - struct mtd_info *mtd, struct block_device *bdev, - const struct logfs_device_ops *devops, struct vfsmount *mnt) +static struct dentry *logfs_get_sb_device(struct logfs_super *super, + struct file_system_type *type, int flags) { - struct logfs_super *super; struct super_block *sb; int err = -ENOMEM; static int mount_count; log_super("LogFS: Start mount %x\n", mount_count++); - super = kzalloc(sizeof(*super), GFP_KERNEL); - if (!super) - goto err0; - super->s_mtd = mtd; - super->s_bdev = bdev; err = -EINVAL; sb = sget(type, logfs_sb_test, logfs_sb_set, super); - if (IS_ERR(sb)) - goto err0; + if (IS_ERR(sb)) { + super->s_devops->put_device(super); + kfree(super); + return ERR_CAST(sb); + } if (sb->s_root) { /* Device is already in use */ - err = 0; - simple_set_mnt(mnt, sb); - goto err0; + super->s_devops->put_device(super); + kfree(super); + return dget(sb->s_root); } - super->s_devops = devops; - /* * sb->s_maxbytes is limited to 8TB. 
On 32bit systems, the page cache * only covers 16TB and the upper 8TB are used for indirect blocks. @@ -581,10 +574,12 @@ int logfs_get_sb_device(struct file_system_type *type, int flags, goto err1; sb->s_flags |= MS_ACTIVE; - err = logfs_get_sb_final(sb, mnt); - if (err) + err = logfs_get_sb_final(sb); + if (err) { deactivate_locked_super(sb); - return err; + return ERR_PTR(err); + } + return dget(sb->s_root); err1: /* no ->s_root, no ->put_super() */ @@ -592,37 +587,45 @@ err1: iput(super->s_segfile_inode); iput(super->s_mapping_inode); deactivate_locked_super(sb); - return err; -err0: - kfree(super); - //devops->put_device(sb); - return err; + return ERR_PTR(err); } -static int logfs_get_sb(struct file_system_type *type, int flags, - const char *devname, void *data, struct vfsmount *mnt) +static struct dentry *logfs_mount(struct file_system_type *type, int flags, + const char *devname, void *data) { ulong mtdnr; + struct logfs_super *super; + int err; - if (!devname) - return logfs_get_sb_bdev(type, flags, devname, mnt); - if (strncmp(devname, "mtd", 3)) - return logfs_get_sb_bdev(type, flags, devname, mnt); + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return ERR_PTR(-ENOMEM); - { + if (!devname) + err = logfs_get_sb_bdev(super, type, devname); + else if (strncmp(devname, "mtd", 3)) + err = logfs_get_sb_bdev(super, type, devname); + else { char *garbage; mtdnr = simple_strtoul(devname+3, &garbage, 0); if (*garbage) - return -EINVAL; + err = -EINVAL; + else + err = logfs_get_sb_mtd(super, mtdnr); + } + + if (err) { + kfree(super); + return ERR_PTR(err); } - return logfs_get_sb_mtd(type, flags, mtdnr, mnt); + return logfs_get_sb_device(super, type, flags); } static struct file_system_type logfs_fs_type = { .owner = THIS_MODULE, .name = "logfs", - .get_sb = logfs_get_sb, + .mount = logfs_mount, .kill_sb = logfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e39d6bf2e8fb..fb2020858a34 100644 --- 
a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -614,17 +614,16 @@ void minix_truncate(struct inode * inode) V2_minix_truncate(inode); } -static int minix_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *minix_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super); } static struct file_system_type minix_fs_type = { .owner = THIS_MODULE, .name = "minix", - .get_sb = minix_get_sb, + .mount = minix_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/namei.c b/fs/namei.c index f7dbc06857ab..5362af9b7372 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1574,6 +1574,7 @@ static struct file *finish_open(struct nameidata *nd, */ if (will_truncate) mnt_drop_write(nd->path.mnt); + path_put(&nd->path); return filp; exit: @@ -1675,6 +1676,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } filp = nameidata_to_filp(nd); mnt_drop_write(nd->path.mnt); + path_put(&nd->path); if (!IS_ERR(filp)) { error = ima_file_check(filp, acc_mode); if (error) { diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 985fabb26aca..d290545aa0c4 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -1020,16 +1020,16 @@ out: return result; } -static int ncp_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ncp_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt); + return mount_nodev(fs_type, flags, data, ncp_fill_super); } static struct file_system_type ncp_fs_type = { .owner = THIS_MODULE, .name = "ncpfs", - .get_sb = ncp_get_sb, + .mount = ncp_mount, .kill_sb = kill_anon_super, .fs_flags = 
FS_BINARY_MOUNTDATA, }; diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index fd667652c502..ba306658a6db 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -1,7 +1,6 @@ config NFS_FS tristate "NFS client support" depends on INET && FILE_LOCKING - depends on BKL # fix as soon as lockd is done select LOCKD select SUNRPC select NFS_ACL_SUPPORT if NFS_V3_ACL diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 064a80961677..84d3c8b90206 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -873,7 +873,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq->inode = inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); dreq->l_ctx = nfs_get_lock_context(dreq->ctx); - if (dreq->l_ctx != NULL) + if (dreq->l_ctx == NULL) goto out_release; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e756075637b0..60677f9f1311 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -884,6 +884,5 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) dprintk("NFS: setlease(%s/%s, arg=%ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, arg); - return -EINVAL; } diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index dec47ed8b6b9..4e2d9b6b1380 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -123,7 +123,7 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, size_t desclen = typelen + namelen + 2; *desc = kmalloc(desclen, GFP_KERNEL); - if (!desc) + if (!*desc) return -ENOMEM; cp = *desc; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 32c8758c99fd..0f24cdf2cb13 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -429,7 +429,7 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * * returned NFS4ERR_DELAY as per Section 2.10.6.2 * of RFC5661. 
*/ - dprintk("%s: slot=%ld seq=%d: Operation in progress\n", + dprintk("%s: slot=%td seq=%d: Operation in progress\n", __func__, res->sr_slot - res->sr_session->fc_slot_table.slots, res->sr_slot->seq_nr); @@ -573,7 +573,7 @@ int nfs4_setup_sequence(const struct nfs_server *server, goto out; } - dprintk("--> %s clp %p session %p sr_slot %ld\n", + dprintk("--> %s clp %p session %p sr_slot %td\n", __func__, session->clp, session, res->sr_slot ? res->sr_slot - session->fc_slot_table.slots : -1); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 919490232e17..137b549e63db 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -65,6 +65,13 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, if (req == NULL) return ERR_PTR(-ENOMEM); + /* get lock context early so we can deal with alloc failures */ + req->wb_lock_context = nfs_get_lock_context(ctx); + if (req->wb_lock_context == NULL) { + nfs_page_free(req); + return ERR_PTR(-ENOMEM); + } + /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. 
*/ @@ -79,7 +86,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, req->wb_pgbase = offset; req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); - req->wb_lock_context = nfs_get_lock_context(ctx); kref_init(&req->wb_kref); return req; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3600ec700d58..0a42e8f4adcb 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -260,8 +260,8 @@ static int nfs_statfs(struct dentry *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); static int nfs_show_stats(struct seq_file *, struct vfsmount *); static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); -static int nfs_xdev_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); static void nfs_put_super(struct super_block *); static void nfs_kill_super(struct super_block *); static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); @@ -277,7 +277,7 @@ static struct file_system_type nfs_fs_type = { struct file_system_type nfs_xdev_fs_type = { .owner = THIS_MODULE, .name = "nfs", - .get_sb = nfs_xdev_get_sb, + .mount = nfs_xdev_mount, .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -302,14 +302,14 @@ static int nfs4_try_mount(int flags, const char *dev_name, struct nfs_parsed_mount_data *data, struct vfsmount *mnt); static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static int nfs4_remote_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static int nfs4_xdev_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static 
struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); static void nfs4_kill_super(struct super_block *sb); static struct file_system_type nfs4_fs_type = { @@ -323,7 +323,7 @@ static struct file_system_type nfs4_fs_type = { static struct file_system_type nfs4_remote_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs4_remote_get_sb, + .mount = nfs4_remote_mount, .kill_sb = nfs4_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -331,7 +331,7 @@ static struct file_system_type nfs4_remote_fs_type = { struct file_system_type nfs4_xdev_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs4_xdev_get_sb, + .mount = nfs4_xdev_mount, .kill_sb = nfs4_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -339,7 +339,7 @@ struct file_system_type nfs4_xdev_fs_type = { static struct file_system_type nfs4_remote_referral_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs4_remote_referral_get_sb, + .mount = nfs4_remote_referral_mount, .kill_sb = nfs4_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -2397,9 +2397,9 @@ static void nfs_kill_super(struct super_block *s) /* * Clone an NFS2/3 server record on xdev traversal (FSID-change) */ -static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void 
*raw_data, - struct vfsmount *mnt) +static struct dentry * +nfs_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { struct nfs_clone_mount *data = raw_data; struct super_block *s; @@ -2411,7 +2411,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, }; int error; - dprintk("--> nfs_xdev_get_sb()\n"); + dprintk("--> nfs_xdev_mount()\n"); /* create a new volume representation */ server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); @@ -2458,28 +2458,26 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, } s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; /* clone any lsm security options from the parent to the new sb */ security_sb_clone_mnt_opts(data->sb, s); - dprintk("<-- nfs_xdev_get_sb() = 0\n"); - return 0; + dprintk("<-- nfs_xdev_mount() = 0\n"); + return mntroot; out_err_nosb: nfs_free_server(server); out_err_noserver: - dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error); - return error; + dprintk("<-- nfs_xdev_mount() = %d [error]\n", error); + return ERR_PTR(error); error_splat_super: if (server && !s->s_root) bdi_unregister(&server->backing_dev_info); error_splat_bdi: deactivate_locked_super(s); - dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); - return error; + dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error); + return ERR_PTR(error); } #ifdef CONFIG_NFS_V4 @@ -2649,8 +2647,9 @@ out_no_address: /* * Get the superblock for the NFS4 root partition */ -static int nfs4_remote_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +static struct dentry * +nfs4_remote_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { struct nfs_parsed_mount_data *data = raw_data; struct super_block *s; @@ -2714,15 +2713,16 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type, goto error_splat_root; s->s_flags |= MS_ACTIVE; 
- mnt->mnt_sb = s; - mnt->mnt_root = mntroot; - error = 0; + + security_free_mnt_opts(&data->lsm_opts); + nfs_free_fhandle(mntfh); + return mntroot; out: security_free_mnt_opts(&data->lsm_opts); out_free_fh: nfs_free_fhandle(mntfh); - return error; + return ERR_PTR(error); out_free: nfs_free_server(server); @@ -2968,9 +2968,9 @@ static void nfs4_kill_super(struct super_block *sb) /* * Clone an NFS4 server record on xdev traversal (FSID-change) */ -static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data, - struct vfsmount *mnt) +static struct dentry * +nfs4_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { struct nfs_clone_mount *data = raw_data; struct super_block *s; @@ -2982,7 +2982,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, }; int error; - dprintk("--> nfs4_xdev_get_sb()\n"); + dprintk("--> nfs4_xdev_mount()\n"); /* create a new volume representation */ server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); @@ -3029,32 +3029,30 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, } s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; security_sb_clone_mnt_opts(data->sb, s); - dprintk("<-- nfs4_xdev_get_sb() = 0\n"); - return 0; + dprintk("<-- nfs4_xdev_mount() = 0\n"); + return mntroot; out_err_nosb: nfs_free_server(server); out_err_noserver: - dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error); - return error; + dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error); + return ERR_PTR(error); error_splat_super: if (server && !s->s_root) bdi_unregister(&server->backing_dev_info); error_splat_bdi: deactivate_locked_super(s); - dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); - return error; + dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error); + return ERR_PTR(error); } -static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, - int flags, 
const char *dev_name, void *raw_data, - struct vfsmount *mnt) +static struct dentry * +nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { struct nfs_clone_mount *data = raw_data; struct super_block *s; @@ -3118,14 +3116,12 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, } s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; security_sb_clone_mnt_opts(data->sb, s); nfs_free_fhandle(mntfh); dprintk("<-- nfs4_referral_get_sb() = 0\n"); - return 0; + return mntroot; out_err_nosb: nfs_free_server(server); @@ -3133,7 +3129,7 @@ out_err_noserver: nfs_free_fhandle(mntfh); out_err_nofh: dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); - return error; + return ERR_PTR(error); error_splat_super: if (server && !s->s_root) @@ -3142,7 +3138,7 @@ error_splat_bdi: deactivate_locked_super(s); nfs_free_fhandle(mntfh); dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); - return error; + return ERR_PTR(error); } /* diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 9a16bad5d2ea..7bdec8531400 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -444,9 +444,9 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, /* set up nfs_renamedata */ data->old_dir = old_dir; - atomic_inc(&old_dir->i_count); + ihold(old_dir); data->new_dir = new_dir; - atomic_inc(&new_dir->i_count); + ihold(new_dir); data->old_dentry = dget(old_dentry); data->new_dentry = dget(new_dentry); nfs_fattr_init(&data->old_fattr); diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 31a78fce4732..18b3e8975fe0 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -2,7 +2,6 @@ config NFSD tristate "NFS server support" depends on INET depends on FILE_LOCKING - depends on BKL # fix as soon as lockd is done select LOCKD select SUNRPC select EXPORTFS diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9019e8ec9dc8..ad2bfa68d534 100644 --- a/fs/nfsd/nfs4state.c +++ 
b/fs/nfsd/nfs4state.c @@ -673,16 +673,17 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) spin_unlock(&clp->cl_lock); } -static void nfsd4_register_conn(struct nfsd4_conn *conn) +static int nfsd4_register_conn(struct nfsd4_conn *conn) { conn->cn_xpt_user.callback = nfsd4_conn_lost; - register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); + return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) { struct nfsd4_conn *conn; u32 flags = NFS4_CDFC4_FORE; + int ret; if (ses->se_flags & SESSION4_BACK_CHAN) flags |= NFS4_CDFC4_BACK; @@ -690,7 +691,10 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) if (!conn) return nfserr_jukebox; nfsd4_hash_conn(conn, ses); - nfsd4_register_conn(conn); + ret = nfsd4_register_conn(conn); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&conn->cn_xpt_user); return nfs_ok; } @@ -1644,6 +1648,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi { struct nfs4_client *clp = ses->se_client; struct nfsd4_conn *c; + int ret; spin_lock(&clp->cl_lock); c = __nfsd4_find_conn(new->cn_xprt, ses); @@ -1654,7 +1659,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi } __nfsd4_hash_conn(new, ses); spin_unlock(&clp->cl_lock); - nfsd4_register_conn(new); + ret = nfsd4_register_conn(new); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&new->cn_xpt_user); return; } @@ -2310,22 +2318,6 @@ void nfsd_release_deleg_cb(struct file_lock *fl) } /* - * Set the delegation file_lock back pointer. - * - * Called from setlease() with lock_kernel() held. 
- */ -static -void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl) -{ - struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner; - - dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp); - if (!dp) - return; - dp->dl_flock = new; -} - -/* * Called from setlease() with lock_kernel() held */ static @@ -2355,7 +2347,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) static const struct lock_manager_operations nfsd_lease_mng_ops = { .fl_break = nfsd_break_deleg_cb, .fl_release_private = nfsd_release_deleg_cb, - .fl_copy_lock = nfsd_copy_lock_deleg_cb, .fl_mylease = nfsd_same_client_deleg_cb, .fl_change = nfsd_change_deleg_cb, }; @@ -2614,7 +2605,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta struct nfs4_delegation *dp; struct nfs4_stateowner *sop = stp->st_stateowner; int cb_up = atomic_read(&sop->so_client->cl_cb_set); - struct file_lock fl, *flp = &fl; + struct file_lock *fl; int status, flag = 0; flag = NFS4_OPEN_DELEGATE_NONE; @@ -2648,21 +2639,28 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta flag = NFS4_OPEN_DELEGATE_NONE; goto out; } - locks_init_lock(&fl); - fl.fl_lmops = &nfsd_lease_mng_ops; - fl.fl_flags = FL_LEASE; - fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; - fl.fl_end = OFFSET_MAX; - fl.fl_owner = (fl_owner_t)dp; - fl.fl_file = find_readable_file(stp->st_file); - BUG_ON(!fl.fl_file); - fl.fl_pid = current->tgid; + status = -ENOMEM; + fl = locks_alloc_lock(); + if (!fl) + goto out; + locks_init_lock(fl); + fl->fl_lmops = &nfsd_lease_mng_ops; + fl->fl_flags = FL_LEASE; + fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = (fl_owner_t)dp; + fl->fl_file = find_readable_file(stp->st_file); + BUG_ON(!fl->fl_file); + fl->fl_pid = current->tgid; + dp->dl_flock = fl; /* vfs_setlease checks to see if delegation should be handed out. 
* the lock_manager callbacks fl_mylease and fl_change are used */ - if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) { + if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { dprintk("NFSD: setlease failed [%d], no delegation\n", status); + dp->dl_flock = NULL; + locks_free_lock(fl); unhash_delegation(dp); flag = NFS4_OPEN_DELEGATE_NONE; goto out; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index d6dc3f61f8ba..4514ebbee4d6 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1405,16 +1405,16 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) return simple_fill_super(sb, 0x6e667364, nfsd_files); } -static int nfsd_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *nfsd_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt); + return mount_single(fs_type, flags, data, nfsd_fill_super); } static struct file_system_type nfsd_fs_type = { .owner = THIS_MODULE, .name = "nfsd", - .get_sb = nfsd_get_sb, + .mount = nfsd_mount, .kill_sb = kill_litter_super, }; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 35ae03c0db86..f804d41ec9d3 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1141,9 +1141,9 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data) return (void *)s->s_bdev == data; } -static int -nilfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry * +nilfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { struct nilfs_super_data sd; struct super_block *s; @@ -1156,7 +1156,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); if (IS_ERR(sd.bdev)) - return PTR_ERR(sd.bdev); + return ERR_CAST(sd.bdev); sd.cno = 0; 
sd.flags = flags; @@ -1235,9 +1235,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, if (!s_new) close_bdev_exclusive(sd.bdev, mode); - mnt->mnt_sb = s; - mnt->mnt_root = root_dentry; - return 0; + return root_dentry; failed_super: deactivate_locked_super(s); @@ -1245,13 +1243,13 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, failed: if (!s_new) close_bdev_exclusive(sd.bdev, mode); - return err; + return ERR_PTR(err); } struct file_system_type nilfs_fs_type = { .owner = THIS_MODULE, .name = "nilfs2", - .get_sb = nilfs_get_sb, + .mount = nilfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index b388443c3a09..22c629eedd82 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -3,4 +3,4 @@ config FSNOTIFY source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" -#source "fs/notify/fanotify/Kconfig" +source "fs/notify/fanotify/Kconfig" diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 85366c78cc37..b04f88eed09e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -131,6 +131,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -160,20 +161,21 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, __u32 event_mask, void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; + struct path *path = data; pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, inode_mark, vfsmnt_mark, event_mask, data, data_type); - /* sorry, fanotify only gives a damn about files and dirs */ - if (!S_ISREG(to_tell->i_mode) && - !S_ISDIR(to_tell->i_mode)) - return false; - 
/* if we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) return false; + /* sorry, fanotify only gives a damn about files and dirs */ + if (!S_ISREG(path->dentry->d_inode->i_mode) && + !S_ISDIR(path->dentry->d_inode->i_mode)) + return false; + if (inode_mark && vfsmnt_mark) { marks_mask = (vfsmnt_mark->mask | inode_mark->mask); marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); @@ -194,16 +196,29 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, BUG(); } + if (S_ISDIR(path->dentry->d_inode->i_mode) && + (marks_ignored_mask & FS_ISDIR)) + return false; + if (event_mask & marks_mask & ~marks_ignored_mask) return true; return false; } +static void fanotify_free_group_priv(struct fsnotify_group *group) +{ + struct user_struct *user; + + user = group->fanotify_data.user; + atomic_dec(&user->fanotify_listeners); + free_uid(user); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .should_send_event = fanotify_should_send_event, - .free_group_priv = NULL, + .free_group_priv = fanotify_free_group_priv, .free_event_priv = NULL, .freeing_mark = NULL, }; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index bbcb98e7fcc6..063224812b7e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -16,6 +16,10 @@ #include <asm/ioctls.h> +#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 +#define FANOTIFY_DEFAULT_MAX_MARKS 8192 +#define FANOTIFY_DEFAULT_MAX_LISTENERS 128 + extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; @@ -326,7 +330,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; - ret = -EINTR; + ret = -ERESTARTSYS; if (signal_pending(current)) break; @@ -372,11 +376,10 @@ static ssize_t fanotify_write(struct file *file, const char 
__user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; - struct fanotify_response_event *re, *lre; - - pr_debug("%s: file=%p group=%p\n", __func__, file, group); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + struct fanotify_response_event *re, *lre; + mutex_lock(&group->fanotify_data.access_mutex); group->fanotify_data.bypass_perm = true; @@ -554,18 +557,24 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, __u32 mask, unsigned int flags) { - __u32 oldmask; + __u32 oldmask = -1; spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { oldmask = fsn_mark->mask; fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); } else { - oldmask = fsn_mark->ignored_mask; - fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask)); + __u32 tmask = fsn_mark->ignored_mask | mask; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } + + if (!(flags & FAN_MARK_ONDIR)) { + __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + } + spin_unlock(&fsn_mark->lock); return mask & ~oldmask; @@ -582,6 +591,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, if (!fsn_mark) { int ret; + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return -ENOSPC; + fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); if (!fsn_mark) return -ENOMEM; @@ -610,10 +622,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); + /* + * If some other task has this inode open for write we should not add + * an ignored mark, unless that ignored mark is supposed to survive + * modification changes anyway. 
+ */ + if ((flags & FAN_MARK_IGNORED_MASK) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && + (atomic_read(&inode->i_writecount) > 0)) + return 0; + fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { int ret; + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return -ENOSPC; + fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); if (!fsn_mark) return -ENOMEM; @@ -637,6 +662,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) { struct fsnotify_group *group; int f_flags, fd; + struct user_struct *user; pr_debug("%s: flags=%d event_f_flags=%d\n", __func__, flags, event_f_flags); @@ -647,6 +673,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (flags & ~FAN_ALL_INIT_FLAGS) return -EINVAL; + user = get_current_user(); + if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { + free_uid(user); + return -EMFILE; + } + f_flags = O_RDWR | FMODE_NONOTIFY; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; @@ -658,12 +690,47 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (IS_ERR(group)) return PTR_ERR(group); + group->fanotify_data.user = user; + atomic_inc(&user->fanotify_listeners); + group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS mutex_init(&group->fanotify_data.access_mutex); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); #endif + switch (flags & FAN_ALL_CLASS_BITS) { + case FAN_CLASS_NOTIF: + group->priority = FS_PRIO_0; + break; + case FAN_CLASS_CONTENT: + group->priority = FS_PRIO_1; + break; + case FAN_CLASS_PRE_CONTENT: + group->priority = FS_PRIO_2; + break; + default: + fd = -EINVAL; + goto out_put_group; + } + + if (flags & FAN_UNLIMITED_QUEUE) { + fd = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out_put_group; + group->max_events = UINT_MAX; + } else { + group->max_events = 
FANOTIFY_DEFAULT_MAX_EVENTS; + } + + if (flags & FAN_UNLIMITED_MARKS) { + fd = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out_put_group; + group->fanotify_data.max_marks = UINT_MAX; + } else { + group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; + } fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); if (fd < 0) @@ -704,6 +771,12 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, default: return -EINVAL; } + + if (mask & FAN_ONDIR) { + flags |= FAN_MARK_ONDIR; + mask &= ~FAN_ONDIR; + } + #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD)) #else @@ -719,6 +792,16 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, ret = -EINVAL; if (unlikely(filp->f_op != &fanotify_fops)) goto fput_and_out; + group = filp->private_data; + + /* + * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not + * allowed to set permissions events. + */ + ret = -EINVAL; + if (mask & FAN_ALL_PERM_EVENTS && + group->priority == FS_PRIO_0) + goto fput_and_out; ret = fanotify_find_path(dfd, pathname, &path, flags); if (ret) @@ -729,7 +812,6 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, inode = path.dentry->d_inode; else mnt = path.mnt; - group = filp->private_data; /* create/update an inode mark */ switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4498a208df94..20dc218707ca 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -84,16 +84,17 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) } /* Notify this dentry's parent about a child's events. 
*/ -void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) { struct dentry *parent; struct inode *p_inode; + int ret = 0; if (!dentry) dentry = path->dentry; if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) - return; + return 0; parent = dget_parent(dentry); p_inode = parent->d_inode; @@ -106,14 +107,16 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) mask |= FS_EVENT_ON_CHILD; if (path) - fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, - dentry->d_name.name, 0); + ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, + dentry->d_name.name, 0); else - fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - dentry->d_name.name, 0); + ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, + dentry->d_name.name, 0); } dput(parent); + + return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); @@ -252,20 +255,23 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_group > vfsmount_group) { /* handle inode */ - send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, + data_is, cookie, file_name, &event); /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, + data_is, cookie, file_name, &event); inode_group = NULL; } else { - send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, - mask, data, data_is, cookie, file_name, - &event); + ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, + mask, data, data_is, cookie, file_name, + &event); } + if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) + goto out; + if (inode_group) inode_node = 
srcu_dereference(inode_node->next, &fsnotify_mark_srcu); @@ -273,7 +279,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, vfsmount_node = srcu_dereference(vfsmount_node->next, &fsnotify_mark_srcu); } - + ret = 0; +out: srcu_read_unlock(&fsnotify_mark_srcu, idx); /* * fsnotify_create_event() took a reference so the event can't be cleaned diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 21ed10660b80..4c29fcf557d1 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -177,7 +177,8 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, * Attach an initialized mark to a given inode. * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group and for which inodes. These - * marks are ordered according to the group's location in memory. + * marks are ordered according to priority, highest number first, and then by + * the group's location in memory. */ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, @@ -211,7 +212,11 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, goto out; } - if (mark->group < lmark->group) + if (mark->group->priority < lmark->group->priority) + continue; + + if ((mark->group->priority == lmark->group->priority) && + (mark->group < lmark->group)) continue; hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 24edc1185d53..444c305a468c 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -862,7 +862,7 @@ static int __init inotify_user_setup(void) BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK); - BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); + BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); 
BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 56772b578fbd..85eebff6d0d7 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -169,7 +169,11 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, goto out; } - if (mark->group < lmark->group) + if (mark->group->priority < lmark->group->priority) + continue; + + if ((mark->group->priority == lmark->group->priority) && + (mark->group < lmark->group)) continue; hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index d3fbe5730bfc..a30ecacc01f2 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3059,17 +3059,16 @@ struct kmem_cache *ntfs_index_ctx_cache; /* Driver wide mutex. */ DEFINE_MUTEX(ntfs_lock); -static int ntfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ntfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); } static struct file_system_type ntfs_fs_type = { .owner = THIS_MODULE, .name = "ntfs", - .get_sb = ntfs_get_sb, + .mount = ntfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 75e115f1bd73..b2df490a19ed 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -643,16 +643,16 @@ static const struct inode_operations dlmfs_file_inode_operations = { .setattr = dlmfs_file_setattr, }; -static int dlmfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *dlmfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); 
+ return mount_nodev(fs_type, flags, data, dlmfs_fill_super); } static struct file_system_type dlmfs_fs_type = { .owner = THIS_MODULE, .name = "ocfs2_dlmfs", - .get_sb = dlmfs_get_sb, + .mount = dlmfs_mount, .kill_sb = kill_litter_super, }; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 56f0cb395820..f02c0ef31578 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1236,14 +1236,12 @@ read_super_error: return status; } -static int ocfs2_get_sb(struct file_system_type *fs_type, +static struct dentry *ocfs2_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, - struct vfsmount *mnt) + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); } static void ocfs2_kill_sb(struct super_block *sb) @@ -1267,8 +1265,7 @@ out: static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", - .get_sb = ocfs2_get_sb, /* is this called when we mount - * the fs? 
*/ + .mount = ocfs2_mount, .kill_sb = ocfs2_kill_sb, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 14a22863291a..e043c4cb9a97 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -557,17 +557,16 @@ end: return ret; } -static int omfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, struct vfsmount *m) +static struct dentry *omfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m); + return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super); } static struct file_system_type omfs_fs_type = { .owner = THIS_MODULE, .name = "omfs", - .get_sb = omfs_get_sb, + .mount = omfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/open.c b/fs/open.c index d74e1983e8dc..4197b9ed023d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -786,11 +786,11 @@ struct file *nameidata_to_filp(struct nameidata *nd) /* Pick up the filp from the open intent */ filp = nd->intent.open.file; /* Has the filesystem initialised the file for us? 
*/ - if (filp->f_path.dentry == NULL) + if (filp->f_path.dentry == NULL) { + path_get(&nd->path); filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, NULL, cred); - else - path_put(&nd->path); + } return filp; } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index ffcd04f0012c..911e61f348fc 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -415,16 +415,16 @@ out_no_root: return ret; } -static int openprom_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *openprom_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt); + return mount_single(fs_type, flags, data, openprom_fill_super); } static struct file_system_type openprom_fs_type = { .owner = THIS_MODULE, .name = "openpromfs", - .get_sb = openprom_get_sb, + .mount = openprom_mount, .kill_sb = kill_anon_super, }; diff --git a/fs/pipe.c b/fs/pipe.c index d2d7566ce68e..a8012a955720 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1247,16 +1247,15 @@ out: * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. 
*/ -static int pipefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *pipefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); + return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); } static struct file_system_type pipe_fs_type = { .name = "pipefs", - .get_sb = pipefs_get_sb, + .mount = pipefs_mount, .kill_sb = kill_anon_super, }; diff --git a/fs/proc/base.c b/fs/proc/base.c index 9b094c1c8465..f3d02ca461ec 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -226,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task) { struct mm_struct *mm; - if (mutex_lock_killable(&task->cred_guard_mutex)) + if (mutex_lock_killable(&task->signal->cred_guard_mutex)) return NULL; mm = get_task_mm(task); @@ -235,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task) mmput(mm); mm = NULL; } - mutex_unlock(&task->cred_guard_mutex); + mutex_unlock(&task->signal->cred_guard_mutex); return mm; } @@ -2354,14 +2354,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, goto out_free; /* Guard against adverse ptrace interaction */ - length = mutex_lock_interruptible(&task->cred_guard_mutex); + length = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (length < 0) goto out_free; length = security_setprocattr(task, (char*)file->f_path.dentry->d_name.name, (void*)page, count); - mutex_unlock(&task->cred_guard_mutex); + mutex_unlock(&task->signal->cred_guard_mutex); out_free: free_page((unsigned long) page); out: diff --git a/fs/proc/root.c b/fs/proc/root.c index 93d99b316325..ef9fa8e24ad6 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -35,8 +35,8 @@ static int proc_set_super(struct super_block *sb, void *data) return set_anon_super(sb, NULL); } -static int proc_get_sb(struct file_system_type *fs_type, - int flags, const char 
*dev_name, void *data, struct vfsmount *mnt) +static struct dentry *proc_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { int err; struct super_block *sb; @@ -61,14 +61,14 @@ static int proc_get_sb(struct file_system_type *fs_type, sb = sget(fs_type, proc_test_super, proc_set_super, ns); if (IS_ERR(sb)) - return PTR_ERR(sb); + return ERR_CAST(sb); if (!sb->s_root) { sb->s_flags = flags; err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); - return err; + return ERR_PTR(err); } ei = PROC_I(sb->s_root->d_inode); @@ -79,11 +79,9 @@ static int proc_get_sb(struct file_system_type *fs_type, } sb->s_flags |= MS_ACTIVE; - ns->proc_mnt = mnt; } - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); } static void proc_kill_sb(struct super_block *sb) @@ -97,7 +95,7 @@ static void proc_kill_sb(struct super_block *sb) static struct file_system_type proc_fs_type = { .name = "proc", - .get_sb = proc_get_sb, + .mount = proc_mount, .kill_sb = proc_kill_sb, }; @@ -115,6 +113,7 @@ void __init proc_root_init(void) return; } + init_pid_ns.proc_mnt = proc_mnt; proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); @@ -213,6 +212,7 @@ int pid_ns_prepare_proc(struct pid_namespace *ns) if (IS_ERR(mnt)) return PTR_ERR(mnt); + ns->proc_mnt = mnt; return 0; } diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 1807c2419f17..37994737c983 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -10,13 +10,13 @@ static int show_softirqs(struct seq_file *p, void *v) { int i, j; - seq_printf(p, " "); + seq_printf(p, " "); for_each_possible_cpu(i) seq_printf(p, "CPU%-8d", i); seq_printf(p, "\n"); for (i = 0; i < NR_SOFTIRQS; i++) { - seq_printf(p, "%8s:", softirq_to_name[i]); + seq_printf(p, "%12s:", softirq_to_name[i]); for_each_possible_cpu(j) seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); seq_printf(p, "\n"); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index bf31b03fc275..e15a19c93bae 100644 --- 
a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v) u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; - unsigned int per_irq_sum; user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; @@ -52,9 +51,7 @@ static int show_stat(struct seq_file *p, void *v) guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); guest_nice = cputime64_add(guest_nice, kstat_cpu(i).cpustat.guest_nice); - for_each_irq_nr(j) { - sum += kstat_irqs_cpu(j, i); - } + sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { @@ -110,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "intr %llu", (unsigned long long)sum); /* sum again ? it could be updated? */ - for_each_irq_nr(j) { - per_irq_sum = 0; - for_each_possible_cpu(i) - per_irq_sum += kstat_irqs_cpu(j, i); - - seq_printf(p, " %u", per_irq_sum); - } + for_each_irq_nr(j) + seq_printf(p, " %u", kstat_irqs(j)); seq_printf(p, "\nctxt %llu\n" diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 871e25ed0069..da6b01d70f01 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -327,6 +327,7 @@ struct mem_size_stats { unsigned long private_clean; unsigned long private_dirty; unsigned long referenced; + unsigned long anonymous; unsigned long swap; u64 pss; }; @@ -357,6 +358,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!page) continue; + if (PageAnon(page)) + mss->anonymous += PAGE_SIZE; + mss->resident += PAGE_SIZE; /* Accumulate the size in pages that have been accessed. 
*/ if (pte_young(ptent) || PageReferenced(page)) @@ -410,6 +414,7 @@ static int show_smap(struct seq_file *m, void *v) "Private_Clean: %8lu kB\n" "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" + "Anonymous: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" "MMUPageSize: %8lu kB\n", @@ -421,6 +426,7 @@ static int show_smap(struct seq_file *m, void *v) mss.private_clean >> 10, mss.private_dirty >> 10, mss.referenced >> 10, + mss.anonymous >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, vma_mmu_pagesize(vma) >> 10); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 01bad30026fc..fcada42f1aa3 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -454,17 +454,16 @@ static void destroy_inodecache(void) kmem_cache_destroy(qnx4_inode_cachep); } -static int qnx4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *qnx4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); } static struct file_system_type qnx4_fs_type = { .owner = THIS_MODULE, .name = "qnx4", - .get_sb = qnx4_get_sb, + .mount = qnx4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index 3e21b1e2ad3a..880fd9884366 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -4,6 +4,7 @@ config QUOTA bool "Quota support" + select QUOTACTL help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). 
Currently, it works for the @@ -65,8 +66,7 @@ config QFMT_V2 config QUOTACTL bool - depends on XFS_QUOTA || QUOTA - default y + default n config QUOTACTL_COMPAT bool diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index aad1316a977f..0fed41e6efcd 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1386,6 +1386,9 @@ static void __dquot_initialize(struct inode *inode, int type) /* Avoid races with quotaoff() */ if (!sb_has_quota_active(sb, cnt)) continue; + /* We could race with quotaon or dqget() could have failed */ + if (!got[cnt]) + continue; if (!inode->i_dquot[cnt]) { inode->i_dquot[cnt] = got[cnt]; got[cnt] = NULL; @@ -1736,6 +1739,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) qsize_t rsv_space = 0; struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, ret = 0; + char is_valid[MAXQUOTAS] = {}; char warntype_to[MAXQUOTAS]; char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; @@ -1757,8 +1761,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) space = cur_space + rsv_space; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + /* + * Skip changes for same uid or gid or for turned off quota-type. + */ if (!transfer_to[cnt]) continue; + /* Avoid races with quotaoff() */ + if (!sb_has_quota_active(inode->i_sb, cnt)) + continue; + is_valid[cnt] = 1; transfer_from[cnt] = inode->i_dquot[cnt]; ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); if (ret) @@ -1772,12 +1783,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) * Finally perform the needed transfer from transfer_from to transfer_to */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - /* - * Skip changes for same uid or gid or for turned off quota-type. 
- */ - if (!transfer_to[cnt]) + if (!is_valid[cnt]) continue; - /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { warntype_from_inodes[cnt] = @@ -1801,18 +1808,19 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) mark_all_dquot_dirty(transfer_from); mark_all_dquot_dirty(transfer_to); - /* Pass back references to put */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - transfer_to[cnt] = transfer_from[cnt]; -warn: flush_warnings(transfer_to, warntype_to); flush_warnings(transfer_from, warntype_from_inodes); flush_warnings(transfer_from, warntype_from_space); - return ret; + /* Pass back references to put */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (is_valid[cnt]) + transfer_to[cnt] = transfer_from[cnt]; + return 0; over_quota: spin_unlock(&dq_data_lock); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - goto warn; + flush_warnings(transfer_to, warntype_to); + return ret; } EXPORT_SYMBOL(__dquot_transfer); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 67fadb1ad2c1..eacb166fb259 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -255,17 +255,16 @@ fail: return err; } -int ramfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +struct dentry *ramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt); + return mount_nodev(fs_type, flags, data, ramfs_fill_super); } -static int rootfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *rootfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super, - mnt); + return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super); } static void ramfs_kill_sb(struct super_block *sb) @@ -276,12 +275,12 @@ static 
void ramfs_kill_sb(struct super_block *sb) static struct file_system_type ramfs_fs_type = { .name = "ramfs", - .get_sb = ramfs_get_sb, + .mount = ramfs_mount, .kill_sb = ramfs_kill_sb, }; static struct file_system_type rootfs_fs_type = { .name = "rootfs", - .get_sb = rootfs_get_sb, + .mount = rootfs_mount, .kill_sb = kill_litter_super, }; diff --git a/fs/read_write.c b/fs/read_write.c index 9cd9d148105d..431a0ed610c8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -243,8 +243,6 @@ bad: * them to something that fits in "int" so that others * won't have to do range checks all the time. */ -#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK) - int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) { struct inode *inode; @@ -584,65 +582,71 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, struct iovec **ret_pointer) - { +{ unsigned long seg; - ssize_t ret; + ssize_t ret; struct iovec *iov = fast_pointer; - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... 
+ */ if (nr_segs == 0) { ret = 0; - goto out; + goto out; } - /* - * First get the "struct iovec" from user memory and - * verify all the pointers - */ + /* + * First get the "struct iovec" from user memory and + * verify all the pointers + */ if (nr_segs > UIO_MAXIOV) { ret = -EINVAL; - goto out; + goto out; } if (nr_segs > fast_segs) { - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); if (iov == NULL) { ret = -ENOMEM; - goto out; + goto out; } - } + } if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { ret = -EFAULT; - goto out; + goto out; } - /* + /* * According to the Single Unix Specification we should return EINVAL * if an element length is < 0 when cast to ssize_t or if the * total length would overflow the ssize_t return value of the * system call. - */ + * + * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the + * overflow case. + */ ret = 0; - for (seg = 0; seg < nr_segs; seg++) { - void __user *buf = iov[seg].iov_base; - ssize_t len = (ssize_t)iov[seg].iov_len; + for (seg = 0; seg < nr_segs; seg++) { + void __user *buf = iov[seg].iov_base; + ssize_t len = (ssize_t)iov[seg].iov_len; /* see if we we're about to use an invalid len or if * it's about to overflow ssize_t */ - if (len < 0 || (ret + len < ret)) { + if (len < 0) { ret = -EINVAL; - goto out; + goto out; } if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { ret = -EFAULT; - goto out; + goto out; + } + if (len > MAX_RW_COUNT - ret) { + len = MAX_RW_COUNT - ret; + iov[seg].iov_len = len; } - ret += len; - } + } out: *ret_pointer = iov; return ret; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e15ff612002d..3bf7a6457f4d 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2213,12 +2213,11 @@ out: #endif -static int get_super_block(struct file_system_type *fs_type, +static struct dentry *get_super_block(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, 
struct vfsmount *mnt) + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); } static int __init init_reiserfs_fs(void) @@ -2253,7 +2252,7 @@ static void __exit exit_reiserfs_fs(void) struct file_system_type reiserfs_fs_type = { .owner = THIS_MODULE, .name = "reiserfs", - .get_sb = get_super_block, + .mount = get_super_block, .kill_sb = reiserfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 268580535c92..6647f90e55cd 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -552,20 +552,19 @@ error_rsb: /* * get a superblock for mounting */ -static int romfs_get_sb(struct file_system_type *fs_type, +static struct dentry *romfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - int ret = -EINVAL; + struct dentry *ret = ERR_PTR(-EINVAL); #ifdef CONFIG_ROMFS_ON_MTD - ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super, - mnt); + ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super); #endif #ifdef CONFIG_ROMFS_ON_BLOCK - if (ret == -EINVAL) - ret = get_sb_bdev(fs_type, flags, dev_name, data, - romfs_fill_super, mnt); + if (ret == ERR_PTR(-EINVAL)) + ret = mount_bdev(fs_type, flags, dev_name, data, + romfs_fill_super); #endif return ret; } @@ -592,7 +591,7 @@ static void romfs_kill_sb(struct super_block *sb) static struct file_system_type romfs_fs_type = { .owner = THIS_MODULE, .name = "romfs", - .get_sb = romfs_get_sb, + .mount = romfs_mount, .kill_sb = romfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/select.c b/fs/select.c index 500a669f7790..b7b10aa30861 100644 --- a/fs/select.c +++ b/fs/select.c @@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -static long estimate_accuracy(struct timespec *tv) +long select_estimate_accuracy(struct timespec *tv) { 
unsigned long ret; struct timespec now; @@ -417,7 +417,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) } if (end_time && !timed_out) - slack = estimate_accuracy(end_time); + slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { @@ -769,7 +769,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, } if (end_time && !timed_out) - slack = estimate_accuracy(end_time); + slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig deleted file mode 100644 index 2bc24a8c4039..000000000000 --- a/fs/smbfs/Kconfig +++ /dev/null @@ -1,56 +0,0 @@ -config SMB_FS - tristate "SMB file system support (OBSOLETE, please use CIFS)" - depends on BKL # probably unfixable - depends on INET - select NLS - help - SMB (Server Message Block) is the protocol Windows for Workgroups - (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share - files and printers over local networks. Saying Y here allows you to - mount their file systems (often called "shares" in this context) and - access them just like any other Unix directory. Currently, this - works only if the Windows machines use TCP/IP as the underlying - transport protocol, and not NetBEUI. For details, read - <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO, - available from <http://www.tldp.org/docs.html#howto>. - - Note: if you just want your box to act as an SMB *server* and make - files and printing services available to Windows clients (which need - to have a TCP/IP stack), you don't need to say Y here; you can use - the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>) - for that. - - General information about how to connect Linux, Windows machines and - Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. - - To compile the SMB support as a module, choose M here: - the module will be called smbfs. Most people say N, however. 
- -config SMB_NLS_DEFAULT - bool "Use a default NLS" - depends on SMB_FS - help - Enabling this will make smbfs use nls translations by default. You - need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls - settings and you need to give the default nls for the SMB server as - CONFIG_SMB_NLS_REMOTE. - - The nls settings can be changed at mount time, if your smbmount - supports that, using the codepage and iocharset parameters. - - smbmount from samba 2.2.0 or later supports this. - -config SMB_NLS_REMOTE - string "Default Remote NLS Option" - depends on SMB_NLS_DEFAULT - default "cp437" - help - This setting allows you to specify a default value for which - codepage the server uses. If this field is left blank no - translations will be done by default. The local codepage/charset - default to CONFIG_NLS_DEFAULT. - - The nls settings can be changed at mount time, if your smbmount - supports that, using the codepage and iocharset parameters. - - smbmount from samba 2.2.0 or later supports this. diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile deleted file mode 100644 index 4faf8c4722c3..000000000000 --- a/fs/smbfs/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# -# Makefile for the linux smb-filesystem routines. -# - -obj-$(CONFIG_SMB_FS) += smbfs.o - -smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \ - symlink.o smbiod.o request.o - -# If you want debugging output, you may add these flags to the EXTRA_CFLAGS -# SMBFS_PARANOIA should normally be enabled. - -EXTRA_CFLAGS += -DSMBFS_PARANOIA -#EXTRA_CFLAGS += -DSMBFS_DEBUG -#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE -#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP -#EXTRA_CFLAGS += -Werror - diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c deleted file mode 100644 index 8c177eb7e344..000000000000 --- a/fs/smbfs/cache.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * cache.c - * - * Copyright (C) 1997 by Bill Hawes - * - * Routines to support directory cacheing using the page cache. 
- * This cache code is almost directly taken from ncpfs. - * - * Please add a note about your changes to smbfs in the ChangeLog file. - */ - -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/smb_fs.h> -#include <linux/pagemap.h> -#include <linux/net.h> - -#include <asm/page.h> - -#include "smb_debug.h" -#include "proto.h" - -/* - * Force the next attempt to use the cache to be a timeout. - * If we can't find the page that's fine, it will cause a refresh. - */ -void -smb_invalid_dir_cache(struct inode * dir) -{ - struct smb_sb_info *server = server_from_inode(dir); - union smb_dir_cache *cache = NULL; - struct page *page = NULL; - - page = grab_cache_page(&dir->i_data, 0); - if (!page) - goto out; - - if (!PageUptodate(page)) - goto out_unlock; - - cache = kmap(page); - cache->head.time = jiffies - SMB_MAX_AGE(server); - - kunmap(page); - SetPageUptodate(page); -out_unlock: - unlock_page(page); - page_cache_release(page); -out: - return; -} - -/* - * Mark all dentries for 'parent' as invalid, forcing them to be re-read - */ -void -smb_invalidate_dircache_entries(struct dentry *parent) -{ - struct smb_sb_info *server = server_from_dentry(parent); - struct list_head *next; - struct dentry *dentry; - - spin_lock(&dcache_lock); - next = parent->d_subdirs.next; - while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_u.d_child); - dentry->d_fsdata = NULL; - smb_age_dentry(server, dentry); - next = next->next; - } - spin_unlock(&dcache_lock); -} - -/* - * dget, but require that fpos and parent matches what the dentry contains. - * dentry is not known to be a valid pointer at entry. 
- */ -struct dentry * -smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) -{ - struct dentry *dent = dentry; - struct list_head *next; - - if (d_validate(dent, parent)) { - if (dent->d_name.len <= SMB_MAXNAMELEN && - (unsigned long)dent->d_fsdata == fpos) { - if (!dent->d_inode) { - dput(dent); - dent = NULL; - } - return dent; - } - dput(dent); - } - - /* If a pointer is invalid, we search the dentry. */ - spin_lock(&dcache_lock); - next = parent->d_subdirs.next; - while (next != &parent->d_subdirs) { - dent = list_entry(next, struct dentry, d_u.d_child); - if ((unsigned long)dent->d_fsdata == fpos) { - if (dent->d_inode) - dget_locked(dent); - else - dent = NULL; - goto out_unlock; - } - next = next->next; - } - dent = NULL; -out_unlock: - spin_unlock(&dcache_lock); - return dent; -} - - -/* - * Create dentry/inode for this file and add it to the dircache. - */ -int -smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - struct smb_cache_control *ctrl, struct qstr *qname, - struct smb_fattr *entry) -{ - struct dentry *newdent, *dentry = filp->f_path.dentry; - struct inode *newino, *inode = dentry->d_inode; - struct smb_cache_control ctl = *ctrl; - int valid = 0; - int hashed = 0; - ino_t ino = 0; - - qname->hash = full_name_hash(qname->name, qname->len); - - if (dentry->d_op && dentry->d_op->d_hash) - if (dentry->d_op->d_hash(dentry, qname) != 0) - goto end_advance; - - newdent = d_lookup(dentry, qname); - - if (!newdent) { - newdent = d_alloc(dentry, qname); - if (!newdent) - goto end_advance; - } else { - hashed = 1; - memcpy((char *) newdent->d_name.name, qname->name, - newdent->d_name.len); - } - - if (!newdent->d_inode) { - smb_renew_times(newdent); - entry->f_ino = iunique(inode->i_sb, 2); - newino = smb_iget(inode->i_sb, entry); - if (newino) { - smb_new_dentry(newdent); - d_instantiate(newdent, newino); - if (!hashed) - d_rehash(newdent); - } - } else - smb_set_inode_attr(newdent->d_inode, entry); - - if 
(newdent->d_inode) { - ino = newdent->d_inode->i_ino; - newdent->d_fsdata = (void *) ctl.fpos; - smb_new_dentry(newdent); - } - - if (ctl.idx >= SMB_DIRCACHE_SIZE) { - if (ctl.page) { - kunmap(ctl.page); - SetPageUptodate(ctl.page); - unlock_page(ctl.page); - page_cache_release(ctl.page); - } - ctl.cache = NULL; - ctl.idx -= SMB_DIRCACHE_SIZE; - ctl.ofs += 1; - ctl.page = grab_cache_page(&inode->i_data, ctl.ofs); - if (ctl.page) - ctl.cache = kmap(ctl.page); - } - if (ctl.cache) { - ctl.cache->dentry[ctl.idx] = newdent; - valid = 1; - } - dput(newdent); - -end_advance: - if (!valid) - ctl.valid = 0; - if (!ctl.filled && (ctl.fpos == filp->f_pos)) { - if (!ino) - ino = find_inode_number(dentry, qname); - if (!ino) - ino = iunique(inode->i_sb, 2); - ctl.filled = filldir(dirent, qname->name, qname->len, - filp->f_pos, ino, DT_UNKNOWN); - if (!ctl.filled) - filp->f_pos += 1; - } - ctl.fpos += 1; - ctl.idx += 1; - *ctrl = ctl; - return (ctl.valid || !ctl.filled); -} diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c deleted file mode 100644 index f678d421e541..000000000000 --- a/fs/smbfs/dir.c +++ /dev/null @@ -1,696 +0,0 @@ -/* - * dir.c - * - * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. 
- */ - -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/smp_lock.h> -#include <linux/ctype.h> -#include <linux/net.h> -#include <linux/sched.h> - -#include <linux/smb_fs.h> -#include <linux/smb_mount.h> -#include <linux/smbno.h> - -#include "smb_debug.h" -#include "proto.h" - -static int smb_readdir(struct file *, void *, filldir_t); -static int smb_dir_open(struct inode *, struct file *); - -static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *); -static int smb_create(struct inode *, struct dentry *, int, struct nameidata *); -static int smb_mkdir(struct inode *, struct dentry *, int); -static int smb_rmdir(struct inode *, struct dentry *); -static int smb_unlink(struct inode *, struct dentry *); -static int smb_rename(struct inode *, struct dentry *, - struct inode *, struct dentry *); -static int smb_make_node(struct inode *,struct dentry *,int,dev_t); -static int smb_link(struct dentry *, struct inode *, struct dentry *); - -const struct file_operations smb_dir_operations = -{ - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = smb_readdir, - .unlocked_ioctl = smb_ioctl, - .open = smb_dir_open, -}; - -const struct inode_operations smb_dir_inode_operations = -{ - .create = smb_create, - .lookup = smb_lookup, - .unlink = smb_unlink, - .mkdir = smb_mkdir, - .rmdir = smb_rmdir, - .rename = smb_rename, - .getattr = smb_getattr, - .setattr = smb_notify_change, -}; - -const struct inode_operations smb_dir_inode_operations_unix = -{ - .create = smb_create, - .lookup = smb_lookup, - .unlink = smb_unlink, - .mkdir = smb_mkdir, - .rmdir = smb_rmdir, - .rename = smb_rename, - .getattr = smb_getattr, - .setattr = smb_notify_change, - .symlink = smb_symlink, - .mknod = smb_make_node, - .link = smb_link, -}; - -/* - * Read a directory, using filldir to fill the dirent memory. - * smb_proc_readdir does the actual reading from the smb server. 
- * - * The cache code is almost directly taken from ncpfs - */ -static int -smb_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct dentry *dentry = filp->f_path.dentry; - struct inode *dir = dentry->d_inode; - struct smb_sb_info *server = server_from_dentry(dentry); - union smb_dir_cache *cache = NULL; - struct smb_cache_control ctl; - struct page *page = NULL; - int result; - - ctl.page = NULL; - ctl.cache = NULL; - - VERBOSE("reading %s/%s, f_pos=%d\n", - DENTRY_PATH(dentry), (int) filp->f_pos); - - result = 0; - - lock_kernel(); - - switch ((unsigned int) filp->f_pos) { - case 0: - if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos = 1; - /* fallthrough */ - case 1: - if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0) - goto out; - filp->f_pos = 2; - } - - /* - * Make sure our inode is up-to-date. - */ - result = smb_revalidate_inode(dentry); - if (result) - goto out; - - - page = grab_cache_page(&dir->i_data, 0); - if (!page) - goto read_really; - - ctl.cache = cache = kmap(page); - ctl.head = cache->head; - - if (!PageUptodate(page) || !ctl.head.eof) { - VERBOSE("%s/%s, page uptodate=%d, eof=%d\n", - DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof); - goto init_cache; - } - - if (filp->f_pos == 2) { - if (jiffies - ctl.head.time >= SMB_MAX_AGE(server)) - goto init_cache; - - /* - * N.B. ncpfs checks mtime of dentry too here, we don't. - * 1. common smb servers do not update mtime on dir changes - * 2. it requires an extra smb request - * (revalidate has the same timeout as ctl.head.time) - * - * Instead smbfs invalidates its own cache on local changes - * and remote changes are not seen until timeout. 
- */ - } - - if (filp->f_pos > ctl.head.end) - goto finished; - - ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2); - ctl.ofs = ctl.fpos / SMB_DIRCACHE_SIZE; - ctl.idx = ctl.fpos % SMB_DIRCACHE_SIZE; - - for (;;) { - if (ctl.ofs != 0) { - ctl.page = find_lock_page(&dir->i_data, ctl.ofs); - if (!ctl.page) - goto invalid_cache; - ctl.cache = kmap(ctl.page); - if (!PageUptodate(ctl.page)) - goto invalid_cache; - } - while (ctl.idx < SMB_DIRCACHE_SIZE) { - struct dentry *dent; - int res; - - dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx], - dentry, filp->f_pos); - if (!dent) - goto invalid_cache; - - res = filldir(dirent, dent->d_name.name, - dent->d_name.len, filp->f_pos, - dent->d_inode->i_ino, DT_UNKNOWN); - dput(dent); - if (res) - goto finished; - filp->f_pos += 1; - ctl.idx += 1; - if (filp->f_pos > ctl.head.end) - goto finished; - } - if (ctl.page) { - kunmap(ctl.page); - SetPageUptodate(ctl.page); - unlock_page(ctl.page); - page_cache_release(ctl.page); - ctl.page = NULL; - } - ctl.idx = 0; - ctl.ofs += 1; - } -invalid_cache: - if (ctl.page) { - kunmap(ctl.page); - unlock_page(ctl.page); - page_cache_release(ctl.page); - ctl.page = NULL; - } - ctl.cache = cache; -init_cache: - smb_invalidate_dircache_entries(dentry); - ctl.head.time = jiffies; - ctl.head.eof = 0; - ctl.fpos = 2; - ctl.ofs = 0; - ctl.idx = SMB_DIRCACHE_START; - ctl.filled = 0; - ctl.valid = 1; -read_really: - result = server->ops->readdir(filp, dirent, filldir, &ctl); - if (result == -ERESTARTSYS && page) - ClearPageUptodate(page); - if (ctl.idx == -1) - goto invalid_cache; /* retry */ - ctl.head.end = ctl.fpos - 1; - ctl.head.eof = ctl.valid; -finished: - if (page) { - cache->head = ctl.head; - kunmap(page); - if (result != -ERESTARTSYS) - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); - } - if (ctl.page) { - kunmap(ctl.page); - SetPageUptodate(ctl.page); - unlock_page(ctl.page); - page_cache_release(ctl.page); - } -out: - unlock_kernel(); - return result; -} - 
-static int -smb_dir_open(struct inode *dir, struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct smb_sb_info *server; - int error = 0; - - VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name, - file->f_path.dentry->d_name.name); - - /* - * Directory timestamps in the core protocol aren't updated - * when a file is added, so we give them a very short TTL. - */ - lock_kernel(); - server = server_from_dentry(dentry); - if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) { - unsigned long age = jiffies - SMB_I(dir)->oldmtime; - if (age > 2*HZ) - smb_invalid_dir_cache(dir); - } - - /* - * Note: in order to allow the smbmount process to open the - * mount point, we only revalidate if the connection is valid or - * if the process is trying to access something other than the root. - */ - if (server->state == CONN_VALID || !IS_ROOT(dentry)) - error = smb_revalidate_inode(dentry); - unlock_kernel(); - return error; -} - -/* - * Dentry operations routines - */ -static int smb_lookup_validate(struct dentry *, struct nameidata *); -static int smb_hash_dentry(struct dentry *, struct qstr *); -static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *); -static int smb_delete_dentry(struct dentry *); - -static const struct dentry_operations smbfs_dentry_operations = -{ - .d_revalidate = smb_lookup_validate, - .d_hash = smb_hash_dentry, - .d_compare = smb_compare_dentry, - .d_delete = smb_delete_dentry, -}; - -static const struct dentry_operations smbfs_dentry_operations_case = -{ - .d_revalidate = smb_lookup_validate, - .d_delete = smb_delete_dentry, -}; - - -/* - * This is the callback when the dcache has a lookup hit. 
- */ -static int -smb_lookup_validate(struct dentry * dentry, struct nameidata *nd) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - struct inode * inode = dentry->d_inode; - unsigned long age = jiffies - dentry->d_time; - int valid; - - /* - * The default validation is based on dentry age: - * we believe in dentries for a few seconds. (But each - * successful server lookup renews the timestamp.) - */ - valid = (age <= SMB_MAX_AGE(server)); -#ifdef SMBFS_DEBUG_VERBOSE - if (!valid) - VERBOSE("%s/%s not valid, age=%lu\n", - DENTRY_PATH(dentry), age); -#endif - - if (inode) { - lock_kernel(); - if (is_bad_inode(inode)) { - PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry)); - valid = 0; - } else if (!valid) - valid = (smb_revalidate_inode(dentry) == 0); - unlock_kernel(); - } else { - /* - * What should we do for negative dentries? - */ - } - return valid; -} - -static int -smb_hash_dentry(struct dentry *dir, struct qstr *this) -{ - unsigned long hash; - int i; - - hash = init_name_hash(); - for (i=0; i < this->len ; i++) - hash = partial_name_hash(tolower(this->name[i]), hash); - this->hash = end_name_hash(hash); - - return 0; -} - -static int -smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b) -{ - int i, result = 1; - - if (a->len != b->len) - goto out; - for (i=0; i < a->len; i++) { - if (tolower(a->name[i]) != tolower(b->name[i])) - goto out; - } - result = 0; -out: - return result; -} - -/* - * This is the callback from dput() when d_count is going to 0. - * We use this to unhash dentries with bad inodes. - */ -static int -smb_delete_dentry(struct dentry * dentry) -{ - if (dentry->d_inode) { - if (is_bad_inode(dentry->d_inode)) { - PARANOIA("bad inode, unhashing %s/%s\n", - DENTRY_PATH(dentry)); - return 1; - } - } else { - /* N.B. Unhash negative dentries? 
*/ - } - return 0; -} - -/* - * Initialize a new dentry - */ -void -smb_new_dentry(struct dentry *dentry) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - - if (server->mnt->flags & SMB_MOUNT_CASE) - dentry->d_op = &smbfs_dentry_operations_case; - else - dentry->d_op = &smbfs_dentry_operations; - dentry->d_time = jiffies; -} - - -/* - * Whenever a lookup succeeds, we know the parent directories - * are all valid, so we want to update the dentry timestamps. - * N.B. Move this to dcache? - */ -void -smb_renew_times(struct dentry * dentry) -{ - dget(dentry); - dentry->d_time = jiffies; - - while (!IS_ROOT(dentry)) { - struct dentry *parent = dget_parent(dentry); - dput(dentry); - dentry = parent; - - dentry->d_time = jiffies; - } - dput(dentry); -} - -static struct dentry * -smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - struct smb_fattr finfo; - struct inode *inode; - int error; - struct smb_sb_info *server; - - error = -ENAMETOOLONG; - if (dentry->d_name.len > SMB_MAXNAMELEN) - goto out; - - /* Do not allow lookup of names with backslashes in */ - error = -EINVAL; - if (memchr(dentry->d_name.name, '\\', dentry->d_name.len)) - goto out; - - lock_kernel(); - error = smb_proc_getattr(dentry, &finfo); -#ifdef SMBFS_PARANOIA - if (error && error != -ENOENT) - PARANOIA("find %s/%s failed, error=%d\n", - DENTRY_PATH(dentry), error); -#endif - - inode = NULL; - if (error == -ENOENT) - goto add_entry; - if (!error) { - error = -EACCES; - finfo.f_ino = iunique(dentry->d_sb, 2); - inode = smb_iget(dir->i_sb, &finfo); - if (inode) { - add_entry: - server = server_from_dentry(dentry); - if (server->mnt->flags & SMB_MOUNT_CASE) - dentry->d_op = &smbfs_dentry_operations_case; - else - dentry->d_op = &smbfs_dentry_operations; - - d_add(dentry, inode); - smb_renew_times(dentry); - error = 0; - } - } - unlock_kernel(); -out: - return ERR_PTR(error); -} - -/* - * This code is common to all routines creating a new inode. 
- */ -static int -smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - struct inode *inode; - int error; - struct smb_fattr fattr; - - VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid); - - error = smb_proc_getattr(dentry, &fattr); - if (error) - goto out_close; - - smb_renew_times(dentry); - fattr.f_ino = iunique(dentry->d_sb, 2); - inode = smb_iget(dentry->d_sb, &fattr); - if (!inode) - goto out_no_inode; - - if (have_id) { - struct smb_inode_info *ei = SMB_I(inode); - ei->fileid = fileid; - ei->access = SMB_O_RDWR; - ei->open = server->generation; - } - d_instantiate(dentry, inode); -out: - return error; - -out_no_inode: - error = -EACCES; -out_close: - if (have_id) { - PARANOIA("%s/%s failed, error=%d, closing %u\n", - DENTRY_PATH(dentry), error, fileid); - smb_close_fileid(dentry, fileid); - } - goto out; -} - -/* N.B. How should the mode argument be used? */ -static int -smb_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - __u16 fileid; - int error; - struct iattr attr; - - VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode); - - lock_kernel(); - smb_invalid_dir_cache(dir); - error = smb_proc_create(dentry, 0, get_seconds(), &fileid); - if (!error) { - if (server->opt.capabilities & SMB_CAP_UNIX) { - /* Set attributes for new file */ - attr.ia_valid = ATTR_MODE; - attr.ia_mode = mode; - error = smb_proc_setattr_unix(dentry, &attr, 0, 0); - } - error = smb_instantiate(dentry, fileid, 1); - } else { - PARANOIA("%s/%s failed, error=%d\n", - DENTRY_PATH(dentry), error); - } - unlock_kernel(); - return error; -} - -/* N.B. How should the mode argument be used? 
*/ -static int -smb_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - int error; - struct iattr attr; - - lock_kernel(); - smb_invalid_dir_cache(dir); - error = smb_proc_mkdir(dentry); - if (!error) { - if (server->opt.capabilities & SMB_CAP_UNIX) { - /* Set attributes for new directory */ - attr.ia_valid = ATTR_MODE; - attr.ia_mode = mode; - error = smb_proc_setattr_unix(dentry, &attr, 0, 0); - } - error = smb_instantiate(dentry, 0, 0); - } - unlock_kernel(); - return error; -} - -static int -smb_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - int error; - - /* - * Close the directory if it's open. - */ - lock_kernel(); - smb_close(inode); - - /* - * Check that nobody else is using the directory.. - */ - error = -EBUSY; - if (!d_unhashed(dentry)) - goto out; - - smb_invalid_dir_cache(dir); - error = smb_proc_rmdir(dentry); - -out: - unlock_kernel(); - return error; -} - -static int -smb_unlink(struct inode *dir, struct dentry *dentry) -{ - int error; - - /* - * Close the file if it's open. - */ - lock_kernel(); - smb_close(dentry->d_inode); - - smb_invalid_dir_cache(dir); - error = smb_proc_unlink(dentry); - if (!error) - smb_renew_times(dentry); - unlock_kernel(); - return error; -} - -static int -smb_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - int error; - - /* - * Close any open files, and check whether to delete the - * target before attempting the rename. 
- */ - lock_kernel(); - if (old_dentry->d_inode) - smb_close(old_dentry->d_inode); - if (new_dentry->d_inode) { - smb_close(new_dentry->d_inode); - error = smb_proc_unlink(new_dentry); - if (error) { - VERBOSE("unlink %s/%s, error=%d\n", - DENTRY_PATH(new_dentry), error); - goto out; - } - /* FIXME */ - d_delete(new_dentry); - } - - smb_invalid_dir_cache(old_dir); - smb_invalid_dir_cache(new_dir); - error = smb_proc_mv(old_dentry, new_dentry); - if (!error) { - smb_renew_times(old_dentry); - smb_renew_times(new_dentry); - } -out: - unlock_kernel(); - return error; -} - -/* - * FIXME: samba servers won't let you create device nodes unless uid/gid - * matches the connection credentials (and we don't know which those are ...) - */ -static int -smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) -{ - int error; - struct iattr attr; - - attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID; - attr.ia_mode = mode; - current_euid_egid(&attr.ia_uid, &attr.ia_gid); - - if (!new_valid_dev(dev)) - return -EINVAL; - - smb_invalid_dir_cache(dir); - error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev)); - if (!error) { - error = smb_instantiate(dentry, 0, 0); - } - return error; -} - -/* - * dentry = existing file - * new_dentry = new file - */ -static int -smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry) -{ - int error; - - DEBUG1("smb_link old=%s/%s new=%s/%s\n", - DENTRY_PATH(dentry), DENTRY_PATH(new_dentry)); - smb_invalid_dir_cache(dir); - error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry); - if (!error) { - smb_renew_times(dentry); - error = smb_instantiate(new_dentry, 0, 0); - } - return error; -} diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c deleted file mode 100644 index 8e187a0f94bb..000000000000 --- a/fs/smbfs/file.c +++ /dev/null @@ -1,454 +0,0 @@ -/* - * file.c - * - * Copyright (C) 1995, 1996, 1997 by Paal-Kr. 
Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. - */ - -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/fcntl.h> -#include <linux/stat.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/smp_lock.h> -#include <linux/net.h> -#include <linux/aio.h> - -#include <asm/uaccess.h> -#include <asm/system.h> - -#include <linux/smbno.h> -#include <linux/smb_fs.h> - -#include "smb_debug.h" -#include "proto.h" - -static int -smb_fsync(struct file *file, int datasync) -{ - struct dentry *dentry = file->f_path.dentry; - struct smb_sb_info *server = server_from_dentry(dentry); - int result; - - VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry)); - - /* - * The VFS will writepage() all dirty pages for us, but we - * should send a SMBflush to the server, letting it know that - * we want things synchronized with actual storage. - * - * Note: this function requires all pages to have been written already - * (should be ok with writepage_sync) - */ - result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid); - return result; -} - -/* - * Read a page synchronously. 
- */ -static int -smb_readpage_sync(struct dentry *dentry, struct page *page) -{ - char *buffer = kmap(page); - loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - struct smb_sb_info *server = server_from_dentry(dentry); - unsigned int rsize = smb_get_rsize(server); - int count = PAGE_SIZE; - int result; - - VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n", - DENTRY_PATH(dentry), count, offset, rsize); - - result = smb_open(dentry, SMB_O_RDONLY); - if (result < 0) - goto io_error; - - do { - if (count < rsize) - rsize = count; - - result = server->ops->read(dentry->d_inode,offset,rsize,buffer); - if (result < 0) - goto io_error; - - count -= result; - offset += result; - buffer += result; - dentry->d_inode->i_atime = - current_fs_time(dentry->d_inode->i_sb); - if (result < rsize) - break; - } while (count); - - memset(buffer, 0, count); - flush_dcache_page(page); - SetPageUptodate(page); - result = 0; - -io_error: - kunmap(page); - unlock_page(page); - return result; -} - -/* - * We are called with the page locked and we unlock it when done. - */ -static int -smb_readpage(struct file *file, struct page *page) -{ - int error; - struct dentry *dentry = file->f_path.dentry; - - page_cache_get(page); - error = smb_readpage_sync(dentry, page); - page_cache_release(page); - return error; -} - -/* - * Write a page synchronously. - * Offset is the data offset within the page. 
- */ -static int -smb_writepage_sync(struct inode *inode, struct page *page, - unsigned long pageoffset, unsigned int count) -{ - loff_t offset; - char *buffer = kmap(page) + pageoffset; - struct smb_sb_info *server = server_from_inode(inode); - unsigned int wsize = smb_get_wsize(server); - int ret = 0; - - offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset; - VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n", - inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize); - - do { - int write_ret; - - if (count < wsize) - wsize = count; - - write_ret = server->ops->write(inode, offset, wsize, buffer); - if (write_ret < 0) { - PARANOIA("failed write, wsize=%d, write_ret=%d\n", - wsize, write_ret); - ret = write_ret; - break; - } - /* N.B. what if result < wsize?? */ -#ifdef SMBFS_PARANOIA - if (write_ret < wsize) - PARANOIA("short write, wsize=%d, write_ret=%d\n", - wsize, write_ret); -#endif - buffer += wsize; - offset += wsize; - count -= wsize; - /* - * Update the inode now rather than waiting for a refresh. - */ - inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb); - SMB_I(inode)->flags |= SMB_F_LOCALWRITE; - if (offset > inode->i_size) - inode->i_size = offset; - } while (count); - - kunmap(page); - return ret; -} - -/* - * Write a page to the server. This will be used for NFS swapping only - * (for now), and we currently do this synchronously only. - * - * We are called with the page locked and we unlock it when done. - */ -static int -smb_writepage(struct page *page, struct writeback_control *wbc) -{ - struct address_space *mapping = page->mapping; - struct inode *inode; - unsigned long end_index; - unsigned offset = PAGE_CACHE_SIZE; - int err; - - BUG_ON(!mapping); - inode = mapping->host; - BUG_ON(!inode); - - end_index = inode->i_size >> PAGE_CACHE_SHIFT; - - /* easy case */ - if (page->index < end_index) - goto do_it; - /* things got complicated... 
*/ - offset = inode->i_size & (PAGE_CACHE_SIZE-1); - /* OK, are we completely out? */ - if (page->index >= end_index+1 || !offset) - return 0; /* truncated - don't care */ -do_it: - page_cache_get(page); - err = smb_writepage_sync(inode, page, 0, offset); - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); - return err; -} - -static int -smb_updatepage(struct file *file, struct page *page, unsigned long offset, - unsigned int count) -{ - struct dentry *dentry = file->f_path.dentry; - - DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count, - ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset); - - return smb_writepage_sync(dentry->d_inode, page, offset, count); -} - -static ssize_t -smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file * file = iocb->ki_filp; - struct dentry * dentry = file->f_path.dentry; - ssize_t status; - - VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry), - (unsigned long) iocb->ki_left, (unsigned long) pos); - - status = smb_revalidate_inode(dentry); - if (status) { - PARANOIA("%s/%s validation failed, error=%Zd\n", - DENTRY_PATH(dentry), status); - goto out; - } - - VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n", - (long)dentry->d_inode->i_size, - dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec); - - status = generic_file_aio_read(iocb, iov, nr_segs, pos); -out: - return status; -} - -static int -smb_file_mmap(struct file * file, struct vm_area_struct * vma) -{ - struct dentry * dentry = file->f_path.dentry; - int status; - - VERBOSE("file %s/%s, address %lu - %lu\n", - DENTRY_PATH(dentry), vma->vm_start, vma->vm_end); - - status = smb_revalidate_inode(dentry); - if (status) { - PARANOIA("%s/%s validation failed, error=%d\n", - DENTRY_PATH(dentry), status); - goto out; - } - status = generic_file_mmap(file, vma); -out: - return status; -} - -static ssize_t -smb_file_splice_read(struct file *file, loff_t *ppos, - 
struct pipe_inode_info *pipe, size_t count, - unsigned int flags) -{ - struct dentry *dentry = file->f_path.dentry; - ssize_t status; - - VERBOSE("file %s/%s, pos=%Ld, count=%lu\n", - DENTRY_PATH(dentry), *ppos, count); - - status = smb_revalidate_inode(dentry); - if (status) { - PARANOIA("%s/%s validation failed, error=%Zd\n", - DENTRY_PATH(dentry), status); - goto out; - } - status = generic_file_splice_read(file, ppos, pipe, count, flags); -out: - return status; -} - -/* - * This does the "real" work of the write. The generic routine has - * allocated the page, locked it, done all the page alignment stuff - * calculations etc. Now we should just copy the data from user - * space and write it back to the real medium.. - * - * If the writer ends up delaying the write, the writer needs to - * increment the page use counts until he is done with the page. - */ -static int smb_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = grab_cache_page_write_begin(mapping, index, flags); - if (!*pagep) - return -ENOMEM; - return 0; -} - -static int smb_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - int status; - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - - lock_kernel(); - status = smb_updatepage(file, page, offset, copied); - unlock_kernel(); - - if (!status) { - if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) - SetPageUptodate(page); - status = copied; - } - - unlock_page(page); - page_cache_release(page); - - return status; -} - -const struct address_space_operations smb_file_aops = { - .readpage = smb_readpage, - .writepage = smb_writepage, - .write_begin = smb_write_begin, - .write_end = smb_write_end, -}; - -/* - * Write to a file (through the page cache). 
- */ -static ssize_t -smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file * file = iocb->ki_filp; - struct dentry * dentry = file->f_path.dentry; - ssize_t result; - - VERBOSE("file %s/%s, count=%lu@%lu\n", - DENTRY_PATH(dentry), - (unsigned long) iocb->ki_left, (unsigned long) pos); - - result = smb_revalidate_inode(dentry); - if (result) { - PARANOIA("%s/%s validation failed, error=%Zd\n", - DENTRY_PATH(dentry), result); - goto out; - } - - result = smb_open(dentry, SMB_O_WRONLY); - if (result) - goto out; - - if (iocb->ki_left > 0) { - result = generic_file_aio_write(iocb, iov, nr_segs, pos); - VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n", - (long) file->f_pos, (long) dentry->d_inode->i_size, - dentry->d_inode->i_mtime.tv_sec, - dentry->d_inode->i_atime.tv_sec); - } -out: - return result; -} - -static int -smb_file_open(struct inode *inode, struct file * file) -{ - int result; - struct dentry *dentry = file->f_path.dentry; - int smb_mode = (file->f_mode & O_ACCMODE) - 1; - - lock_kernel(); - result = smb_open(dentry, smb_mode); - if (result) - goto out; - SMB_I(inode)->openers++; -out: - unlock_kernel(); - return result; -} - -static int -smb_file_release(struct inode *inode, struct file * file) -{ - lock_kernel(); - if (!--SMB_I(inode)->openers) { - /* We must flush any dirty pages now as we won't be able to - write anything after close. mmap can trigger this. - "openers" should perhaps include mmap'ers ... */ - filemap_write_and_wait(inode->i_mapping); - smb_close(inode); - } - unlock_kernel(); - return 0; -} - -/* - * Check whether the required access is compatible with - * an inode's permission. SMB doesn't recognize superuser - * privileges, so we need our own check for this. 
- */ -static int -smb_file_permission(struct inode *inode, int mask) -{ - int mode = inode->i_mode; - int error = 0; - - VERBOSE("mode=%x, mask=%x\n", mode, mask); - - /* Look at user permissions */ - mode >>= 6; - if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) - error = -EACCES; - return error; -} - -static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin) -{ - loff_t ret; - lock_kernel(); - ret = generic_file_llseek_unlocked(file, offset, origin); - unlock_kernel(); - return ret; -} - -const struct file_operations smb_file_operations = -{ - .llseek = smb_remote_llseek, - .read = do_sync_read, - .aio_read = smb_file_aio_read, - .write = do_sync_write, - .aio_write = smb_file_aio_write, - .unlocked_ioctl = smb_ioctl, - .mmap = smb_file_mmap, - .open = smb_file_open, - .release = smb_file_release, - .fsync = smb_fsync, - .splice_read = smb_file_splice_read, -}; - -const struct inode_operations smb_file_inode_operations = -{ - .permission = smb_file_permission, - .getattr = smb_getattr, - .setattr = smb_notify_change, -}; diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c deleted file mode 100644 index 7ae0f5273ab1..000000000000 --- a/fs/smbfs/getopt.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * getopt.c - */ - -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/net.h> - -#include "getopt.h" - -/** - * smb_getopt - option parser - * @caller: name of the caller, for error messages - * @options: the options string - * @opts: an array of &struct option entries controlling parser operations - * @optopt: output; will contain the current option - * @optarg: output; will contain the value (if one exists) - * @flag: output; may be NULL; should point to a long for or'ing flags - * @value: output; may be NULL; will be overwritten with the integer value - * of the current argument. - * - * Helper to parse options on the format used by mount ("a=b,c=d,e,f"). 
- * Returns opts->val if a matching entry in the 'opts' array is found, - * 0 when no more tokens are found, -1 if an error is encountered. - */ -int smb_getopt(char *caller, char **options, struct option *opts, - char **optopt, char **optarg, unsigned long *flag, - unsigned long *value) -{ - char *token; - char *val; - int i; - - do { - if ((token = strsep(options, ",")) == NULL) - return 0; - } while (*token == '\0'); - *optopt = token; - - *optarg = NULL; - if ((val = strchr (token, '=')) != NULL) { - *val++ = 0; - if (value) - *value = simple_strtoul(val, NULL, 0); - *optarg = val; - } - - for (i = 0; opts[i].name != NULL; i++) { - if (!strcmp(opts[i].name, token)) { - if (!opts[i].flag && (!val || !*val)) { - printk("%s: the %s option requires an argument\n", - caller, token); - return -1; - } - - if (flag && opts[i].flag) - *flag |= opts[i].flag; - - return opts[i].val; - } - } - printk("%s: Unrecognized mount option %s\n", caller, token); - return -1; -} diff --git a/fs/smbfs/getopt.h b/fs/smbfs/getopt.h deleted file mode 100644 index 146219ac7c46..000000000000 --- a/fs/smbfs/getopt.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _LINUX_GETOPT_H -#define _LINUX_GETOPT_H - -struct option { - const char *name; - unsigned long flag; - int val; -}; - -extern int smb_getopt(char *caller, char **options, struct option *opts, - char **optopt, char **optarg, unsigned long *flag, - unsigned long *value); - -#endif /* _LINUX_GETOPT_H */ diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c deleted file mode 100644 index f6e9ee59757e..000000000000 --- a/fs/smbfs/inode.c +++ /dev/null @@ -1,843 +0,0 @@ -/* - * inode.c - * - * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. 
- */ - -#include <linux/module.h> -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/file.h> -#include <linux/dcache.h> -#include <linux/smp_lock.h> -#include <linux/nls.h> -#include <linux/seq_file.h> -#include <linux/mount.h> -#include <linux/net.h> -#include <linux/vfs.h> -#include <linux/highuid.h> -#include <linux/sched.h> -#include <linux/smb_fs.h> -#include <linux/smbno.h> -#include <linux/smb_mount.h> - -#include <asm/system.h> -#include <asm/uaccess.h> - -#include "smb_debug.h" -#include "getopt.h" -#include "proto.h" - -/* Always pick a default string */ -#ifdef CONFIG_SMB_NLS_REMOTE -#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE -#else -#define SMB_NLS_REMOTE "" -#endif - -#define SMB_TTL_DEFAULT 1000 - -static void smb_evict_inode(struct inode *); -static void smb_put_super(struct super_block *); -static int smb_statfs(struct dentry *, struct kstatfs *); -static int smb_show_options(struct seq_file *, struct vfsmount *); - -static struct kmem_cache *smb_inode_cachep; - -static struct inode *smb_alloc_inode(struct super_block *sb) -{ - struct smb_inode_info *ei; - ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL); - if (!ei) - return NULL; - return &ei->vfs_inode; -} - -static void smb_destroy_inode(struct inode *inode) -{ - kmem_cache_free(smb_inode_cachep, SMB_I(inode)); -} - -static void init_once(void *foo) -{ - struct smb_inode_info *ei = (struct smb_inode_info *) foo; - - inode_init_once(&ei->vfs_inode); -} - -static int init_inodecache(void) -{ - smb_inode_cachep = kmem_cache_create("smb_inode_cache", - sizeof(struct smb_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (smb_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - kmem_cache_destroy(smb_inode_cachep); -} - -static int 
smb_remount(struct super_block *sb, int *flags, char *data) -{ - *flags |= MS_NODIRATIME; - return 0; -} - -static const struct super_operations smb_sops = -{ - .alloc_inode = smb_alloc_inode, - .destroy_inode = smb_destroy_inode, - .drop_inode = generic_delete_inode, - .evict_inode = smb_evict_inode, - .put_super = smb_put_super, - .statfs = smb_statfs, - .show_options = smb_show_options, - .remount_fs = smb_remount, -}; - - -/* We are always generating a new inode here */ -struct inode * -smb_iget(struct super_block *sb, struct smb_fattr *fattr) -{ - struct smb_sb_info *server = SMB_SB(sb); - struct inode *result; - - DEBUG1("smb_iget: %p\n", fattr); - - result = new_inode(sb); - if (!result) - return result; - result->i_ino = fattr->f_ino; - SMB_I(result)->open = 0; - SMB_I(result)->fileid = 0; - SMB_I(result)->access = 0; - SMB_I(result)->flags = 0; - SMB_I(result)->closed = 0; - SMB_I(result)->openers = 0; - smb_set_inode_attr(result, fattr); - if (S_ISREG(result->i_mode)) { - result->i_op = &smb_file_inode_operations; - result->i_fop = &smb_file_operations; - result->i_data.a_ops = &smb_file_aops; - } else if (S_ISDIR(result->i_mode)) { - if (server->opt.capabilities & SMB_CAP_UNIX) - result->i_op = &smb_dir_inode_operations_unix; - else - result->i_op = &smb_dir_inode_operations; - result->i_fop = &smb_dir_operations; - } else if (S_ISLNK(result->i_mode)) { - result->i_op = &smb_link_inode_operations; - } else { - init_special_inode(result, result->i_mode, fattr->f_rdev); - } - insert_inode_hash(result); - return result; -} - -/* - * Copy the inode data to a smb_fattr structure. 
- */ -void -smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr) -{ - memset(fattr, 0, sizeof(struct smb_fattr)); - fattr->f_mode = inode->i_mode; - fattr->f_nlink = inode->i_nlink; - fattr->f_ino = inode->i_ino; - fattr->f_uid = inode->i_uid; - fattr->f_gid = inode->i_gid; - fattr->f_size = inode->i_size; - fattr->f_mtime = inode->i_mtime; - fattr->f_ctime = inode->i_ctime; - fattr->f_atime = inode->i_atime; - fattr->f_blocks = inode->i_blocks; - - fattr->attr = SMB_I(inode)->attr; - /* - * Keep the attributes in sync with the inode permissions. - */ - if (fattr->f_mode & S_IWUSR) - fattr->attr &= ~aRONLY; - else - fattr->attr |= aRONLY; -} - -/* - * Update the inode, possibly causing it to invalidate its pages if mtime/size - * is different from last time. - */ -void -smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr) -{ - struct smb_inode_info *ei = SMB_I(inode); - - /* - * A size change should have a different mtime, or same mtime - * but different size. - */ - time_t last_time = inode->i_mtime.tv_sec; - loff_t last_sz = inode->i_size; - - inode->i_mode = fattr->f_mode; - inode->i_nlink = fattr->f_nlink; - inode->i_uid = fattr->f_uid; - inode->i_gid = fattr->f_gid; - inode->i_ctime = fattr->f_ctime; - inode->i_blocks = fattr->f_blocks; - inode->i_size = fattr->f_size; - inode->i_mtime = fattr->f_mtime; - inode->i_atime = fattr->f_atime; - ei->attr = fattr->attr; - - /* - * Update the "last time refreshed" field for revalidation. - */ - ei->oldmtime = jiffies; - - if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) { - VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n", - inode->i_ino, - (long) last_time, (long) inode->i_mtime.tv_sec, - (long) last_sz, (long) inode->i_size); - - if (!S_ISDIR(inode->i_mode)) - invalidate_remote_inode(inode); - } -} - -/* - * This is called if the connection has gone bad ... - * try to kill off all the current inodes. 
- */ -void -smb_invalidate_inodes(struct smb_sb_info *server) -{ - VERBOSE("\n"); - shrink_dcache_sb(SB_of(server)); -} - -/* - * This is called to update the inode attributes after - * we've made changes to a file or directory. - */ -static int -smb_refresh_inode(struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - int error; - struct smb_fattr fattr; - - error = smb_proc_getattr(dentry, &fattr); - if (!error) { - smb_renew_times(dentry); - /* - * Check whether the type part of the mode changed, - * and don't update the attributes if it did. - * - * And don't dick with the root inode - */ - if (inode->i_ino == 2) - return error; - if (S_ISLNK(inode->i_mode)) - return error; /* VFS will deal with it */ - - if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) { - smb_set_inode_attr(inode, &fattr); - } else { - /* - * Big trouble! The inode has become a new object, - * so any operations attempted on it are invalid. - * - * To limit damage, mark the inode as bad so that - * subsequent lookup validations will fail. - */ - PARANOIA("%s/%s changed mode, %07o to %07o\n", - DENTRY_PATH(dentry), - inode->i_mode, fattr.f_mode); - - fattr.f_mode = inode->i_mode; /* save mode */ - make_bad_inode(inode); - inode->i_mode = fattr.f_mode; /* restore mode */ - /* - * No need to worry about unhashing the dentry: the - * lookup validation will see that the inode is bad. - * But we do want to invalidate the caches ... - */ - if (!S_ISDIR(inode->i_mode)) - invalidate_remote_inode(inode); - else - smb_invalid_dir_cache(inode); - error = -EIO; - } - } - return error; -} - -/* - * This is called when we want to check whether the inode - * has changed on the server. If it has changed, we must - * invalidate our local caches. 
- */ -int -smb_revalidate_inode(struct dentry *dentry) -{ - struct smb_sb_info *s = server_from_dentry(dentry); - struct inode *inode = dentry->d_inode; - int error = 0; - - DEBUG1("smb_revalidate_inode\n"); - lock_kernel(); - - /* - * Check whether we've recently refreshed the inode. - */ - if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) { - VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n", - inode->i_ino, jiffies, SMB_I(inode)->oldmtime); - goto out; - } - - error = smb_refresh_inode(dentry); -out: - unlock_kernel(); - return error; -} - -/* - * This routine is called when i_nlink == 0 and i_count goes to 0. - * All blocking cleanup operations need to go here to avoid races. - */ -static void -smb_evict_inode(struct inode *ino) -{ - DEBUG1("ino=%ld\n", ino->i_ino); - truncate_inode_pages(&ino->i_data, 0); - end_writeback(ino); - lock_kernel(); - if (smb_close(ino)) - PARANOIA("could not close inode %ld\n", ino->i_ino); - unlock_kernel(); -} - -static struct option opts[] = { - { "version", 0, 'v' }, - { "win95", SMB_MOUNT_WIN95, 1 }, - { "oldattr", SMB_MOUNT_OLDATTR, 1 }, - { "dirattr", SMB_MOUNT_DIRATTR, 1 }, - { "case", SMB_MOUNT_CASE, 1 }, - { "uid", 0, 'u' }, - { "gid", 0, 'g' }, - { "file_mode", 0, 'f' }, - { "dir_mode", 0, 'd' }, - { "iocharset", 0, 'i' }, - { "codepage", 0, 'c' }, - { "ttl", 0, 't' }, - { NULL, 0, 0} -}; - -static int -parse_options(struct smb_mount_data_kernel *mnt, char *options) -{ - int c; - unsigned long flags; - unsigned long value; - char *optarg; - char *optopt; - - flags = 0; - while ( (c = smb_getopt("smbfs", &options, opts, - &optopt, &optarg, &flags, &value)) > 0) { - - VERBOSE("'%s' -> '%s'\n", optopt, optarg ? 
optarg : "<none>"); - switch (c) { - case 1: - /* got a "flag" option */ - break; - case 'v': - if (value != SMB_MOUNT_VERSION) { - printk ("smbfs: Bad mount version %ld, expected %d\n", - value, SMB_MOUNT_VERSION); - return 0; - } - mnt->version = value; - break; - case 'u': - mnt->uid = value; - flags |= SMB_MOUNT_UID; - break; - case 'g': - mnt->gid = value; - flags |= SMB_MOUNT_GID; - break; - case 'f': - mnt->file_mode = (value & S_IRWXUGO) | S_IFREG; - flags |= SMB_MOUNT_FMODE; - break; - case 'd': - mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR; - flags |= SMB_MOUNT_DMODE; - break; - case 'i': - strlcpy(mnt->codepage.local_name, optarg, - SMB_NLS_MAXNAMELEN); - break; - case 'c': - strlcpy(mnt->codepage.remote_name, optarg, - SMB_NLS_MAXNAMELEN); - break; - case 't': - mnt->ttl = value; - break; - default: - printk ("smbfs: Unrecognized mount option %s\n", - optopt); - return -1; - } - } - mnt->flags = flags; - return c; -} - -/* - * smb_show_options() is for displaying mount options in /proc/mounts. - * It tries to avoid showing settings that were not changed from their - * defaults. - */ -static int -smb_show_options(struct seq_file *s, struct vfsmount *m) -{ - struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt; - int i; - - for (i = 0; opts[i].name != NULL; i++) - if (mnt->flags & opts[i].flag) - seq_printf(s, ",%s", opts[i].name); - - if (mnt->flags & SMB_MOUNT_UID) - seq_printf(s, ",uid=%d", mnt->uid); - if (mnt->flags & SMB_MOUNT_GID) - seq_printf(s, ",gid=%d", mnt->gid); - if (mnt->mounted_uid != 0) - seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid); - - /* - * Defaults for file_mode and dir_mode are unknown to us; they - * depend on the current umask of the user doing the mount. 
- */ - if (mnt->flags & SMB_MOUNT_FMODE) - seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO); - if (mnt->flags & SMB_MOUNT_DMODE) - seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO); - - if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT)) - seq_printf(s, ",iocharset=%s", mnt->codepage.local_name); - if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE)) - seq_printf(s, ",codepage=%s", mnt->codepage.remote_name); - - if (mnt->ttl != SMB_TTL_DEFAULT) - seq_printf(s, ",ttl=%d", mnt->ttl); - - return 0; -} - -static void -smb_unload_nls(struct smb_sb_info *server) -{ - unload_nls(server->remote_nls); - unload_nls(server->local_nls); -} - -static void -smb_put_super(struct super_block *sb) -{ - struct smb_sb_info *server = SMB_SB(sb); - - lock_kernel(); - - smb_lock_server(server); - server->state = CONN_INVALID; - smbiod_unregister_server(server); - - smb_close_socket(server); - - if (server->conn_pid) - kill_pid(server->conn_pid, SIGTERM, 1); - - bdi_destroy(&server->bdi); - kfree(server->ops); - smb_unload_nls(server); - sb->s_fs_info = NULL; - smb_unlock_server(server); - put_pid(server->conn_pid); - kfree(server); - - unlock_kernel(); -} - -static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) -{ - struct smb_sb_info *server; - struct smb_mount_data_kernel *mnt; - struct smb_mount_data *oldmnt; - struct inode *root_inode; - struct smb_fattr root; - int ver; - void *mem; - static int warn_count; - - lock_kernel(); - - if (warn_count < 5) { - warn_count++; - printk(KERN_EMERG "smbfs is deprecated and will be removed" - " from the 2.6.27 kernel. Please migrate to cifs\n"); - } - - if (!raw_data) - goto out_no_data; - - oldmnt = (struct smb_mount_data *) raw_data; - ver = oldmnt->version; - if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII) - goto out_wrong_data; - - sb->s_flags |= MS_NODIRATIME; - sb->s_blocksize = 1024; /* Eh... Is this correct? 
*/ - sb->s_blocksize_bits = 10; - sb->s_magic = SMB_SUPER_MAGIC; - sb->s_op = &smb_sops; - sb->s_time_gran = 100; - - server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL); - if (!server) - goto out_no_server; - sb->s_fs_info = server; - - if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY)) - goto out_bdi; - - sb->s_bdi = &server->bdi; - - server->super_block = sb; - server->mnt = NULL; - server->sock_file = NULL; - init_waitqueue_head(&server->conn_wq); - init_MUTEX(&server->sem); - INIT_LIST_HEAD(&server->entry); - INIT_LIST_HEAD(&server->xmitq); - INIT_LIST_HEAD(&server->recvq); - server->conn_error = 0; - server->conn_pid = NULL; - server->state = CONN_INVALID; /* no connection yet */ - server->generation = 0; - - /* Allocate the global temp buffer and some superblock helper structs */ - /* FIXME: move these to the smb_sb_info struct */ - VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) + - sizeof(struct smb_mount_data_kernel)); - mem = kmalloc(sizeof(struct smb_ops) + - sizeof(struct smb_mount_data_kernel), GFP_KERNEL); - if (!mem) - goto out_no_mem; - - server->ops = mem; - smb_install_null_ops(server->ops); - server->mnt = mem + sizeof(struct smb_ops); - - /* Setup NLS stuff */ - server->remote_nls = NULL; - server->local_nls = NULL; - - mnt = server->mnt; - - memset(mnt, 0, sizeof(struct smb_mount_data_kernel)); - strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT, - SMB_NLS_MAXNAMELEN); - strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE, - SMB_NLS_MAXNAMELEN); - - mnt->ttl = SMB_TTL_DEFAULT; - if (ver == SMB_MOUNT_OLDVERSION) { - mnt->version = oldmnt->version; - - SET_UID(mnt->uid, oldmnt->uid); - SET_GID(mnt->gid, oldmnt->gid); - - mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG; - mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR; - - mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID | - SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE; - } else { - mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP | - 
S_IROTH | S_IXOTH | S_IFREG; - mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP | - S_IROTH | S_IXOTH | S_IFDIR; - if (parse_options(mnt, raw_data)) - goto out_bad_option; - } - mnt->mounted_uid = current_uid(); - smb_setcodepage(server, &mnt->codepage); - - /* - * Display the enabled options - * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2) - */ - if (mnt->flags & SMB_MOUNT_OLDATTR) - printk("SMBFS: Using core getattr (Win 95 speedup)\n"); - else if (mnt->flags & SMB_MOUNT_DIRATTR) - printk("SMBFS: Using dir ff getattr\n"); - - if (smbiod_register_server(server) < 0) { - printk(KERN_ERR "smbfs: failed to start smbiod\n"); - goto out_no_smbiod; - } - - /* - * Keep the super block locked while we get the root inode. - */ - smb_init_root_dirent(server, &root, sb); - root_inode = smb_iget(sb, &root); - if (!root_inode) - goto out_no_root; - - sb->s_root = d_alloc_root(root_inode); - if (!sb->s_root) - goto out_no_root; - - smb_new_dentry(sb->s_root); - - unlock_kernel(); - return 0; - -out_no_root: - iput(root_inode); -out_no_smbiod: - smb_unload_nls(server); -out_bad_option: - kfree(mem); -out_no_mem: - bdi_destroy(&server->bdi); -out_bdi: - if (!server->mnt) - printk(KERN_ERR "smb_fill_super: allocation failure\n"); - sb->s_fs_info = NULL; - kfree(server); - goto out_fail; -out_wrong_data: - printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver); - goto out_fail; -out_no_data: - printk(KERN_ERR "smb_fill_super: missing data argument\n"); -out_fail: - unlock_kernel(); - return -EINVAL; -out_no_server: - printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n"); - unlock_kernel(); - return -ENOMEM; -} - -static int -smb_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - int result; - - lock_kernel(); - - result = smb_proc_dskattr(dentry, buf); - - unlock_kernel(); - - buf->f_type = SMB_SUPER_MAGIC; - buf->f_namelen = SMB_MAXPATHLEN; - return result; -} - -int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, 
struct kstat *stat) -{ - int err = smb_revalidate_inode(dentry); - if (!err) - generic_fillattr(dentry->d_inode, stat); - return err; -} - -int -smb_notify_change(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = dentry->d_inode; - struct smb_sb_info *server = server_from_dentry(dentry); - unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO); - int error, changed, refresh = 0; - struct smb_fattr fattr; - - lock_kernel(); - - error = smb_revalidate_inode(dentry); - if (error) - goto out; - - if ((error = inode_change_ok(inode, attr)) < 0) - goto out; - - error = -EPERM; - if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid)) - goto out; - - if ((attr->ia_valid & ATTR_GID) && (attr->ia_uid != server->mnt->gid)) - goto out; - - if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask)) - goto out; - - if ((attr->ia_valid & ATTR_SIZE) != 0) { - VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n", - DENTRY_PATH(dentry), - (long) inode->i_size, (long) attr->ia_size); - - filemap_write_and_wait(inode->i_mapping); - - error = smb_open(dentry, O_WRONLY); - if (error) - goto out; - error = server->ops->truncate(inode, attr->ia_size); - if (error) - goto out; - truncate_setsize(inode, attr->ia_size); - refresh = 1; - } - - if (server->opt.capabilities & SMB_CAP_UNIX) { - /* For now we don't want to set the size with setattr_unix */ - attr->ia_valid &= ~ATTR_SIZE; - /* FIXME: only call if we actually want to set something? */ - error = smb_proc_setattr_unix(dentry, attr, 0, 0); - if (!error) - refresh = 1; - - goto out; - } - - /* - * Initialize the fattr and check for changed fields. - * Note: CTIME under SMB is creation time rather than - * change time, so we don't attempt to change it. 
- */ - smb_get_inode_attr(inode, &fattr); - - changed = 0; - if ((attr->ia_valid & ATTR_MTIME) != 0) { - fattr.f_mtime = attr->ia_mtime; - changed = 1; - } - if ((attr->ia_valid & ATTR_ATIME) != 0) { - fattr.f_atime = attr->ia_atime; - /* Earlier protocols don't have an access time */ - if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) - changed = 1; - } - if (changed) { - error = smb_proc_settime(dentry, &fattr); - if (error) - goto out; - refresh = 1; - } - - /* - * Check for mode changes ... we're extremely limited in - * what can be set for SMB servers: just the read-only bit. - */ - if ((attr->ia_valid & ATTR_MODE) != 0) { - VERBOSE("%s/%s mode change, old=%x, new=%x\n", - DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode); - changed = 0; - if (attr->ia_mode & S_IWUSR) { - if (fattr.attr & aRONLY) { - fattr.attr &= ~aRONLY; - changed = 1; - } - } else { - if (!(fattr.attr & aRONLY)) { - fattr.attr |= aRONLY; - changed = 1; - } - } - if (changed) { - error = smb_proc_setattr(dentry, &fattr); - if (error) - goto out; - refresh = 1; - } - } - error = 0; - -out: - if (refresh) - smb_refresh_inode(dentry); - unlock_kernel(); - return error; -} - -static int smb_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt); -} - -static struct file_system_type smb_fs_type = { - .owner = THIS_MODULE, - .name = "smbfs", - .get_sb = smb_get_sb, - .kill_sb = kill_anon_super, - .fs_flags = FS_BINARY_MOUNTDATA, -}; - -static int __init init_smb_fs(void) -{ - int err; - DEBUG1("registering ...\n"); - - err = init_inodecache(); - if (err) - goto out_inode; - err = smb_init_request_cache(); - if (err) - goto out_request; - err = register_filesystem(&smb_fs_type); - if (err) - goto out; - return 0; -out: - smb_destroy_request_cache(); -out_request: - destroy_inodecache(); -out_inode: - return err; -} - -static void __exit exit_smb_fs(void) -{ - 
DEBUG1("unregistering ...\n"); - unregister_filesystem(&smb_fs_type); - smb_destroy_request_cache(); - destroy_inodecache(); -} - -module_init(init_smb_fs) -module_exit(exit_smb_fs) -MODULE_LICENSE("GPL"); diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c deleted file mode 100644 index 07215312ad39..000000000000 --- a/fs/smbfs/ioctl.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * ioctl.c - * - * Copyright (C) 1995, 1996 by Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. - */ - -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/ioctl.h> -#include <linux/time.h> -#include <linux/mm.h> -#include <linux/highuid.h> -#include <linux/smp_lock.h> -#include <linux/net.h> - -#include <linux/smb_fs.h> -#include <linux/smb_mount.h> - -#include <asm/uaccess.h> - -#include "proto.h" - -long -smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode); - struct smb_conn_opt opt; - int result = -EINVAL; - - lock_kernel(); - switch (cmd) { - uid16_t uid16; - uid_t uid32; - case SMB_IOC_GETMOUNTUID: - SET_UID(uid16, server->mnt->mounted_uid); - result = put_user(uid16, (uid16_t __user *) arg); - break; - case SMB_IOC_GETMOUNTUID32: - SET_UID(uid32, server->mnt->mounted_uid); - result = put_user(uid32, (uid_t __user *) arg); - break; - - case SMB_IOC_NEWCONN: - /* arg is smb_conn_opt, or NULL if no connection was made */ - if (!arg) { - result = 0; - smb_lock_server(server); - server->state = CONN_RETRIED; - printk(KERN_ERR "Connection attempt failed! 
[%d]\n", - server->conn_error); - smbiod_flush(server); - smb_unlock_server(server); - break; - } - - result = -EFAULT; - if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt))) - result = smb_newconn(server, &opt); - break; - default: - break; - } - unlock_kernel(); - - return result; -} diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c deleted file mode 100644 index 3dcf638d4d3a..000000000000 --- a/fs/smbfs/proc.c +++ /dev/null @@ -1,3503 +0,0 @@ -/* - * proc.c - * - * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. - */ - -#include <linux/types.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/dcache.h> -#include <linux/nls.h> -#include <linux/smp_lock.h> -#include <linux/net.h> -#include <linux/vfs.h> -#include <linux/smb_fs.h> -#include <linux/smbno.h> -#include <linux/smb_mount.h> - -#include <net/sock.h> - -#include <asm/string.h> -#include <asm/div64.h> - -#include "smb_debug.h" -#include "proto.h" -#include "request.h" - - -/* Features. Undefine if they cause problems, this should perhaps be a - config option. */ -#define SMBFS_POSIX_UNLINK 1 - -/* Allow smb_retry to be interrupted. 
*/ -#define SMB_RETRY_INTR - -#define SMB_VWV(packet) ((packet) + SMB_HEADER_LEN) -#define SMB_CMD(packet) (*(packet+8)) -#define SMB_WCT(packet) (*(packet+SMB_HEADER_LEN - 1)) - -#define SMB_DIRINFO_SIZE 43 -#define SMB_STATUS_SIZE 21 - -#define SMB_ST_BLKSIZE (PAGE_SIZE) -#define SMB_ST_BLKSHIFT (PAGE_SHIFT) - -static struct smb_ops smb_ops_core; -static struct smb_ops smb_ops_os2; -static struct smb_ops smb_ops_win95; -static struct smb_ops smb_ops_winNT; -static struct smb_ops smb_ops_unix; -static struct smb_ops smb_ops_null; - -static void -smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr); -static void -smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr); -static int -smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *fattr); -static int -smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry, - struct smb_fattr *fattr); -static int -smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry, - u16 attr); -static int -smb_proc_setattr_ext(struct smb_sb_info *server, - struct inode *inode, struct smb_fattr *fattr); -static int -smb_proc_query_cifsunix(struct smb_sb_info *server); -static void -install_ops(struct smb_ops *dst, struct smb_ops *src); - - -static void -str_upper(char *name, int len) -{ - while (len--) - { - if (*name >= 'a' && *name <= 'z') - *name -= ('a' - 'A'); - name++; - } -} - -#if 0 -static void -str_lower(char *name, int len) -{ - while (len--) - { - if (*name >= 'A' && *name <= 'Z') - *name += ('a' - 'A'); - name++; - } -} -#endif - -/* reverse a string inline. This is used by the dircache walking routines */ -static void reverse_string(char *buf, int len) -{ - char c; - char *end = buf+len-1; - - while(buf < end) { - c = *buf; - *(buf++) = *end; - *(end--) = c; - } -} - -/* no conversion, just a wrapper for memcpy. 
*/ -static int convert_memcpy(unsigned char *output, int olen, - const unsigned char *input, int ilen, - struct nls_table *nls_from, - struct nls_table *nls_to) -{ - if (olen < ilen) - return -ENAMETOOLONG; - memcpy(output, input, ilen); - return ilen; -} - -static inline int write_char(unsigned char ch, char *output, int olen) -{ - if (olen < 4) - return -ENAMETOOLONG; - sprintf(output, ":x%02x", ch); - return 4; -} - -static inline int write_unichar(wchar_t ch, char *output, int olen) -{ - if (olen < 5) - return -ENAMETOOLONG; - sprintf(output, ":%04x", ch); - return 5; -} - -/* convert from one "codepage" to another (possibly being utf8). */ -static int convert_cp(unsigned char *output, int olen, - const unsigned char *input, int ilen, - struct nls_table *nls_from, - struct nls_table *nls_to) -{ - int len = 0; - int n; - wchar_t ch; - - while (ilen > 0) { - /* convert by changing to unicode and back to the new cp */ - n = nls_from->char2uni(input, ilen, &ch); - if (n == -EINVAL) { - ilen--; - n = write_char(*input++, output, olen); - if (n < 0) - goto fail; - output += n; - olen -= n; - len += n; - continue; - } else if (n < 0) - goto fail; - input += n; - ilen -= n; - - n = nls_to->uni2char(ch, output, olen); - if (n == -EINVAL) - n = write_unichar(ch, output, olen); - if (n < 0) - goto fail; - output += n; - olen -= n; - - len += n; - } - return len; -fail: - return n; -} - -/* ----------------------------------------------------------- */ - -/* - * nls_unicode - * - * This encodes/decodes little endian unicode format - */ - -static int uni2char(wchar_t uni, unsigned char *out, int boundlen) -{ - if (boundlen < 2) - return -EINVAL; - *out++ = uni & 0xff; - *out++ = uni >> 8; - return 2; -} - -static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) -{ - if (boundlen < 2) - return -EINVAL; - *uni = (rawstring[1] << 8) | rawstring[0]; - return 2; -} - -static struct nls_table unicode_table = { - .charset = "unicode", - .uni2char = 
uni2char, - .char2uni = char2uni, -}; - -/* ----------------------------------------------------------- */ - -static int setcodepage(struct nls_table **p, char *name) -{ - struct nls_table *nls; - - if (!name || !*name) { - nls = NULL; - } else if ( (nls = load_nls(name)) == NULL) { - printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name); - return -EINVAL; - } - - /* if already set, unload the previous one. */ - if (*p && *p != &unicode_table) - unload_nls(*p); - *p = nls; - - return 0; -} - -/* Handles all changes to codepage settings. */ -int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp) -{ - int n = 0; - - smb_lock_server(server); - - /* Don't load any nls_* at all, if no remote is requested */ - if (!*cp->remote_name) - goto out; - - /* local */ - n = setcodepage(&server->local_nls, cp->local_name); - if (n != 0) - goto out; - - /* remote */ - if (!strcmp(cp->remote_name, "unicode")) { - server->remote_nls = &unicode_table; - } else { - n = setcodepage(&server->remote_nls, cp->remote_name); - if (n != 0) - setcodepage(&server->local_nls, NULL); - } - -out: - if (server->local_nls != NULL && server->remote_nls != NULL) - server->ops->convert = convert_cp; - else - server->ops->convert = convert_memcpy; - - smb_unlock_server(server); - return n; -} - - -/*****************************************************************************/ -/* */ -/* Encoding/Decoding section */ -/* */ -/*****************************************************************************/ - -static __u8 * -smb_encode_smb_length(__u8 * p, __u32 len) -{ - *p = 0; - *(p+1) = 0; - *(p+2) = (len & 0xFF00) >> 8; - *(p+3) = (len & 0xFF); - if (len > 0xFFFF) - { - *(p+1) = 1; - } - return p + 4; -} - -/* - * smb_build_path: build the path to entry and name storing it in buf. - * The path returned will have the trailing '\0'. 
- */ -static int smb_build_path(struct smb_sb_info *server, unsigned char *buf, - int maxlen, - struct dentry *entry, struct qstr *name) -{ - unsigned char *path = buf; - int len; - int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0; - - if (maxlen < (2<<unicode)) - return -ENAMETOOLONG; - - if (maxlen > SMB_MAXPATHLEN + 1) - maxlen = SMB_MAXPATHLEN + 1; - - if (entry == NULL) - goto test_name_and_out; - - /* - * If IS_ROOT, we have to do no walking at all. - */ - if (IS_ROOT(entry) && !name) { - *path++ = '\\'; - if (unicode) *path++ = '\0'; - *path++ = '\0'; - if (unicode) *path++ = '\0'; - return path-buf; - } - - /* - * Build the path string walking the tree backward from end to ROOT - * and store it in reversed order [see reverse_string()] - */ - dget(entry); - while (!IS_ROOT(entry)) { - struct dentry *parent; - - if (maxlen < (3<<unicode)) { - dput(entry); - return -ENAMETOOLONG; - } - - spin_lock(&entry->d_lock); - len = server->ops->convert(path, maxlen-2, - entry->d_name.name, entry->d_name.len, - server->local_nls, server->remote_nls); - if (len < 0) { - spin_unlock(&entry->d_lock); - dput(entry); - return len; - } - reverse_string(path, len); - path += len; - if (unicode) { - /* Note: reverse order */ - *path++ = '\0'; - maxlen--; - } - *path++ = '\\'; - maxlen -= len+1; - spin_unlock(&entry->d_lock); - - parent = dget_parent(entry); - dput(entry); - entry = parent; - } - dput(entry); - reverse_string(buf, path-buf); - - /* maxlen has space for at least one char */ -test_name_and_out: - if (name) { - if (maxlen < (3<<unicode)) - return -ENAMETOOLONG; - *path++ = '\\'; - if (unicode) { - *path++ = '\0'; - maxlen--; - } - len = server->ops->convert(path, maxlen-2, - name->name, name->len, - server->local_nls, server->remote_nls); - if (len < 0) - return len; - path += len; - maxlen -= len+1; - } - /* maxlen has space for at least one char */ - *path++ = '\0'; - if (unicode) *path++ = '\0'; - return path-buf; -} - -static int 
smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen, - struct dentry *dir, struct qstr *name) -{ - int result; - - result = smb_build_path(server, buf, maxlen, dir, name); - if (result < 0) - goto out; - if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS) - str_upper(buf, result); -out: - return result; -} - -/* encode_path for non-trans2 request SMBs */ -static int smb_simple_encode_path(struct smb_request *req, char **p, - struct dentry * entry, struct qstr * name) -{ - struct smb_sb_info *server = req->rq_server; - char *s = *p; - int res; - int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s; - int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE); - - if (!maxlen) - return -ENAMETOOLONG; - *s++ = 4; /* ASCII data format */ - - /* - * SMB Unicode strings must be 16bit aligned relative the start of the - * packet. If they are not they must be padded with 0. - */ - if (unicode) { - int align = s - (char *)req->rq_buffer; - if (!(align & 1)) { - *s++ = '\0'; - maxlen--; - } - } - - res = smb_encode_path(server, s, maxlen-1, entry, name); - if (res < 0) - return res; - *p = s + res; - return 0; -} - -/* The following are taken directly from msdos-fs */ - -/* Linear day numbers of the respective 1sts in non-leap years. */ - -static int day_n[] = -{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0}; - /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */ - - -static time_t -utc2local(struct smb_sb_info *server, time_t time) -{ - return time - server->opt.serverzone*60; -} - -static time_t -local2utc(struct smb_sb_info *server, time_t time) -{ - return time + server->opt.serverzone*60; -} - -/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ - -static time_t -date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time) -{ - int month, year; - time_t secs; - - /* first subtract and mask after that... 
Otherwise, if - date == 0, bad things happen */ - month = ((date >> 5) - 1) & 15; - year = date >> 9; - secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + 86400 * - ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365 - ((year & 3) == 0 && - month < 2 ? 1 : 0) + 3653); - /* days since 1.1.70 plus 80's leap day */ - return local2utc(server, secs); -} - - -/* Convert linear UNIX date to a MS-DOS time/date pair. */ - -static void -date_unix2dos(struct smb_sb_info *server, - int unix_date, __u16 *date, __u16 *time) -{ - int day, year, nl_day, month; - - unix_date = utc2local(server, unix_date); - if (unix_date < 315532800) - unix_date = 315532800; - - *time = (unix_date % 60) / 2 + - (((unix_date / 60) % 60) << 5) + - (((unix_date / 3600) % 24) << 11); - - day = unix_date / 86400 - 3652; - year = day / 365; - if ((year + 3) / 4 + 365 * year > day) - year--; - day -= (year + 3) / 4 + 365 * year; - if (day == 59 && !(year & 3)) { - nl_day = day; - month = 2; - } else { - nl_day = (year & 3) || day <= 59 ? day : day - 1; - for (month = 1; month < 12; month++) - if (day_n[month] > nl_day) - break; - } - *date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9); -} - -/* The following are taken from fs/ntfs/util.c */ - -#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) - -/* - * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) - * into Unix UTC (based 1970-01-01, in seconds). - */ -static struct timespec -smb_ntutc2unixutc(u64 ntutc) -{ - struct timespec ts; - /* FIXME: what about the timezone difference? */ - /* Subtract the NTFS time offset, then convert to 1s intervals. */ - u64 t = ntutc - NTFS_TIME_OFFSET; - ts.tv_nsec = do_div(t, 10000000) * 100; - ts.tv_sec = t; - return ts; -} - -/* Convert the Unix UTC into NT time */ -static u64 -smb_unixutc2ntutc(struct timespec ts) -{ - /* Note: timezone conversion is probably wrong. 
*/ - /* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */ - return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET; -} - -#define MAX_FILE_MODE 6 -static mode_t file_mode[] = { - S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK -}; - -static int smb_filetype_to_mode(u32 filetype) -{ - if (filetype > MAX_FILE_MODE) { - PARANOIA("Filetype out of range: %d\n", filetype); - return S_IFREG; - } - return file_mode[filetype]; -} - -static u32 smb_filetype_from_mode(int mode) -{ - if (S_ISREG(mode)) - return UNIX_TYPE_FILE; - if (S_ISDIR(mode)) - return UNIX_TYPE_DIR; - if (S_ISLNK(mode)) - return UNIX_TYPE_SYMLINK; - if (S_ISCHR(mode)) - return UNIX_TYPE_CHARDEV; - if (S_ISBLK(mode)) - return UNIX_TYPE_BLKDEV; - if (S_ISFIFO(mode)) - return UNIX_TYPE_FIFO; - if (S_ISSOCK(mode)) - return UNIX_TYPE_SOCKET; - return UNIX_TYPE_UNKNOWN; -} - - -/*****************************************************************************/ -/* */ -/* Support section. */ -/* */ -/*****************************************************************************/ - -__u32 -smb_len(__u8 * p) -{ - return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3); -} - -static __u16 -smb_bcc(__u8 * packet) -{ - int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16); - return WVAL(packet, pos); -} - -/* smb_valid_packet: We check if packet fulfills the basic - requirements of a smb packet */ - -static int -smb_valid_packet(__u8 * packet) -{ - return (packet[4] == 0xff - && packet[5] == 'S' - && packet[6] == 'M' - && packet[7] == 'B' - && (smb_len(packet) + 4 == SMB_HEADER_LEN - + SMB_WCT(packet) * 2 + smb_bcc(packet))); -} - -/* smb_verify: We check if we got the answer we expected, and if we - got enough data. If bcc == -1, we don't care. 
*/ - -static int -smb_verify(__u8 * packet, int command, int wct, int bcc) -{ - if (SMB_CMD(packet) != command) - goto bad_command; - if (SMB_WCT(packet) < wct) - goto bad_wct; - if (bcc != -1 && smb_bcc(packet) < bcc) - goto bad_bcc; - return 0; - -bad_command: - printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n", - command, SMB_CMD(packet)); - goto fail; -bad_wct: - printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n", - command, wct, SMB_WCT(packet)); - goto fail; -bad_bcc: - printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n", - command, bcc, smb_bcc(packet)); -fail: - return -EIO; -} - -/* - * Returns the maximum read or write size for the "payload". Making all of the - * packet fit within the negotiated max_xmit size. - * - * N.B. Since this value is usually computed before locking the server, - * the server's packet size must never be decreased! - */ -static inline int -smb_get_xmitsize(struct smb_sb_info *server, int overhead) -{ - return server->opt.max_xmit - overhead; -} - -/* - * Calculate the maximum read size - */ -int -smb_get_rsize(struct smb_sb_info *server) -{ - /* readX has 12 parameters, read has 5 */ - int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2; - int size = smb_get_xmitsize(server, overhead); - - VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size); - - return size; -} - -/* - * Calculate the maximum write size - */ -int -smb_get_wsize(struct smb_sb_info *server) -{ - /* writeX has 14 parameters, write has 5 */ - int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2; - int size = smb_get_xmitsize(server, overhead); - - VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size); - - return size; -} - -/* - * Convert SMB error codes to -E... errno values. 
- */ -int -smb_errno(struct smb_request *req) -{ - int errcls = req->rq_rcls; - int error = req->rq_err; - char *class = "Unknown"; - - VERBOSE("errcls %d code %d from command 0x%x\n", - errcls, error, SMB_CMD(req->rq_header)); - - if (errcls == ERRDOS) { - switch (error) { - case ERRbadfunc: - return -EINVAL; - case ERRbadfile: - case ERRbadpath: - return -ENOENT; - case ERRnofids: - return -EMFILE; - case ERRnoaccess: - return -EACCES; - case ERRbadfid: - return -EBADF; - case ERRbadmcb: - return -EREMOTEIO; - case ERRnomem: - return -ENOMEM; - case ERRbadmem: - return -EFAULT; - case ERRbadenv: - case ERRbadformat: - return -EREMOTEIO; - case ERRbadaccess: - return -EACCES; - case ERRbaddata: - return -E2BIG; - case ERRbaddrive: - return -ENXIO; - case ERRremcd: - return -EREMOTEIO; - case ERRdiffdevice: - return -EXDEV; - case ERRnofiles: - return -ENOENT; - case ERRbadshare: - return -ETXTBSY; - case ERRlock: - return -EDEADLK; - case ERRfilexists: - return -EEXIST; - case ERROR_INVALID_PARAMETER: - return -EINVAL; - case ERROR_DISK_FULL: - return -ENOSPC; - case ERROR_INVALID_NAME: - return -ENOENT; - case ERROR_DIR_NOT_EMPTY: - return -ENOTEMPTY; - case ERROR_NOT_LOCKED: - return -ENOLCK; - case ERROR_ALREADY_EXISTS: - return -EEXIST; - default: - class = "ERRDOS"; - goto err_unknown; - } - } else if (errcls == ERRSRV) { - switch (error) { - /* N.B. This is wrong ... EIO ? */ - case ERRerror: - return -ENFILE; - case ERRbadpw: - return -EINVAL; - case ERRbadtype: - case ERRtimeout: - return -EIO; - case ERRaccess: - return -EACCES; - /* - * This is a fatal error, as it means the "tree ID" - * for this connection is no longer valid. We map - * to a special error code and get a new connection. 
- */ - case ERRinvnid: - return -EBADSLT; - default: - class = "ERRSRV"; - goto err_unknown; - } - } else if (errcls == ERRHRD) { - switch (error) { - case ERRnowrite: - return -EROFS; - case ERRbadunit: - return -ENODEV; - case ERRnotready: - return -EUCLEAN; - case ERRbadcmd: - case ERRdata: - return -EIO; - case ERRbadreq: - return -ERANGE; - case ERRbadshare: - return -ETXTBSY; - case ERRlock: - return -EDEADLK; - case ERRdiskfull: - return -ENOSPC; - default: - class = "ERRHRD"; - goto err_unknown; - } - } else if (errcls == ERRCMD) { - class = "ERRCMD"; - } else if (errcls == SUCCESS) { - return 0; /* This is the only valid 0 return */ - } - -err_unknown: - printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n", - class, error, SMB_CMD(req->rq_header)); - return -EIO; -} - -/* smb_request_ok: We expect the server to be locked. Then we do the - request and check the answer completely. When smb_request_ok - returns 0, you can be quite sure that everything went well. When - the answer is <=0, the returned number is a valid unix errno. */ - -static int -smb_request_ok(struct smb_request *req, int command, int wct, int bcc) -{ - int result; - - req->rq_resp_wct = wct; - req->rq_resp_bcc = bcc; - - result = smb_add_request(req); - if (result != 0) { - DEBUG1("smb_request failed\n"); - goto out; - } - - if (smb_valid_packet(req->rq_header) != 0) { - PARANOIA("invalid packet!\n"); - goto out; - } - - result = smb_verify(req->rq_header, command, wct, bcc); - -out: - return result; -} - -/* - * This implements the NEWCONN ioctl. It installs the server pid, - * sets server->state to CONN_VALID, and wakes up the waiting process. - */ -int -smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt) -{ - struct file *filp; - struct sock *sk; - int error; - - VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid); - - smb_lock_server(server); - - /* - * Make sure we don't already have a valid connection ... 
- */ - error = -EINVAL; - if (server->state == CONN_VALID) - goto out; - - error = -EACCES; - if (current_uid() != server->mnt->mounted_uid && - !capable(CAP_SYS_ADMIN)) - goto out; - - error = -EBADF; - filp = fget(opt->fd); - if (!filp) - goto out; - if (!smb_valid_socket(filp->f_path.dentry->d_inode)) - goto out_putf; - - server->sock_file = filp; - server->conn_pid = get_pid(task_pid(current)); - server->opt = *opt; - server->generation += 1; - server->state = CONN_VALID; - error = 0; - - if (server->conn_error) { - /* - * conn_error is the returncode we originally decided to - * drop the old connection on. This message should be positive - * and not make people ask questions on why smbfs is printing - * error messages ... - */ - printk(KERN_INFO "SMB connection re-established (%d)\n", - server->conn_error); - server->conn_error = 0; - } - - /* - * Store the server in sock user_data (Only used by sunrpc) - */ - sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk; - sk->sk_user_data = server; - - /* chain into the data_ready callback */ - server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready); - - /* check if we have an old smbmount that uses seconds for the - serverzone */ - if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60) - server->opt.serverzone /= 60; - - /* now that we have an established connection we can detect the server - type and enable bug workarounds */ - if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) - install_ops(server->ops, &smb_ops_core); - else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2) - install_ops(server->ops, &smb_ops_os2); - else if (server->opt.protocol == SMB_PROTOCOL_NT1 && - (server->opt.max_xmit < 0x1000) && - !(server->opt.capabilities & SMB_CAP_NT_SMBS)) { - /* FIXME: can we kill the WIN95 flag now? 
*/ - server->mnt->flags |= SMB_MOUNT_WIN95; - VERBOSE("detected WIN95 server\n"); - install_ops(server->ops, &smb_ops_win95); - } else { - /* - * Samba has max_xmit 65535 - * NT4spX has max_xmit 4536 (or something like that) - * win2k has ... - */ - VERBOSE("detected NT1 (Samba, NT4/5) server\n"); - install_ops(server->ops, &smb_ops_winNT); - } - - /* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */ - if (server->mnt->flags & SMB_MOUNT_OLDATTR) { - server->ops->getattr = smb_proc_getattr_core; - } else if (server->mnt->flags & SMB_MOUNT_DIRATTR) { - server->ops->getattr = smb_proc_getattr_ff; - } - - /* Decode server capabilities */ - if (server->opt.capabilities & SMB_CAP_LARGE_FILES) { - /* Should be ok to set this now, as no one can access the - mount until the connection has been established. */ - SB_of(server)->s_maxbytes = ~0ULL >> 1; - VERBOSE("LFS enabled\n"); - } - if (server->opt.capabilities & SMB_CAP_UNICODE) { - server->mnt->flags |= SMB_MOUNT_UNICODE; - VERBOSE("Unicode enabled\n"); - } else { - server->mnt->flags &= ~SMB_MOUNT_UNICODE; - } -#if 0 - /* flags we may test for other patches ... */ - if (server->opt.capabilities & SMB_CAP_LARGE_READX) { - VERBOSE("Large reads enabled\n"); - } - if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) { - VERBOSE("Large writes enabled\n"); - } -#endif - if (server->opt.capabilities & SMB_CAP_UNIX) { - struct inode *inode; - VERBOSE("Using UNIX CIFS extensions\n"); - install_ops(server->ops, &smb_ops_unix); - inode = SB_of(server)->s_root->d_inode; - if (inode) - inode->i_op = &smb_dir_inode_operations_unix; - } - - VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n", - server->opt.protocol, server->opt.max_xmit, - pid_nr(server->conn_pid), server->opt.capabilities); - - /* FIXME: this really should be done by smbmount. 
*/ - if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) { - server->opt.max_xmit = SMB_MAX_PACKET_SIZE; - } - - smb_unlock_server(server); - smbiod_wake_up(); - if (server->opt.capabilities & SMB_CAP_UNIX) - smb_proc_query_cifsunix(server); - - server->conn_complete++; - wake_up_interruptible_all(&server->conn_wq); - return error; - -out: - smb_unlock_server(server); - smbiod_wake_up(); - return error; - -out_putf: - fput(filp); - goto out; -} - -/* smb_setup_header: We completely set up the packet. You only have to - insert the command-specific fields */ - -__u8 * -smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc) -{ - __u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2; - __u8 *p = req->rq_header; - struct smb_sb_info *server = req->rq_server; - - p = smb_encode_smb_length(p, xmit_len - 4); - - *p++ = 0xff; - *p++ = 'S'; - *p++ = 'M'; - *p++ = 'B'; - *p++ = command; - - memset(p, '\0', 19); - p += 19; - p += 8; - - if (server->opt.protocol > SMB_PROTOCOL_CORE) { - int flags = SMB_FLAGS_CASELESS_PATHNAMES; - int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS | - SMB_FLAGS2_EXTENDED_ATTRIBUTES; /* EA? not really ... 
*/ - - *(req->rq_header + smb_flg) = flags; - if (server->mnt->flags & SMB_MOUNT_UNICODE) - flags2 |= SMB_FLAGS2_UNICODE_STRINGS; - WSET(req->rq_header, smb_flg2, flags2); - } - *p++ = wct; /* wct */ - p += 2 * wct; - WSET(p, 0, bcc); - - /* Include the header in the data to send */ - req->rq_iovlen = 1; - req->rq_iov[0].iov_base = req->rq_header; - req->rq_iov[0].iov_len = xmit_len - bcc; - - return req->rq_buffer; -} - -static void -smb_setup_bcc(struct smb_request *req, __u8 *p) -{ - u16 bcc = p - req->rq_buffer; - u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header); - - WSET(pbcc, 0, bcc); - - smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN + - 2*SMB_WCT(req->rq_header) - 2 + bcc); - - /* Include the "bytes" in the data to send */ - req->rq_iovlen = 2; - req->rq_iov[1].iov_base = req->rq_buffer; - req->rq_iov[1].iov_len = bcc; -} - -static int -smb_proc_seek(struct smb_sb_info *server, __u16 fileid, - __u16 mode, off_t offset) -{ - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBlseek, 4, 0); - WSET(req->rq_header, smb_vwv0, fileid); - WSET(req->rq_header, smb_vwv1, mode); - DSET(req->rq_header, smb_vwv2, offset); - req->rq_flags |= SMB_REQ_NORETRY; - - result = smb_request_ok(req, SMBlseek, 2, 0); - if (result < 0) { - result = 0; - goto out_free; - } - - result = DVAL(req->rq_header, smb_vwv0); -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish) -{ - struct inode *ino = dentry->d_inode; - struct smb_inode_info *ei = SMB_I(ino); - int mode, read_write = 0x42, read_only = 0x40; - int res; - char *p; - struct smb_request *req; - - /* - * Attempt to open r/w, unless there are no write privileges. - */ - mode = read_write; - if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH))) - mode = read_only; -#if 0 - /* FIXME: why is this code not in? 
below we fix it so that a caller - wanting RO doesn't get RW. smb_revalidate_inode does some - optimization based on access mode. tail -f needs it to be correct. - - We must open rw since we don't do the open if called a second time - with different 'wish'. Is that not supported by smb servers? */ - if (!(wish & (O_WRONLY | O_RDWR))) - mode = read_only; -#endif - - res = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - retry: - p = smb_setup_header(req, SMBopen, 2, 0); - WSET(req->rq_header, smb_vwv0, mode); - WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR); - res = smb_simple_encode_path(req, &p, dentry, NULL); - if (res < 0) - goto out_free; - smb_setup_bcc(req, p); - - res = smb_request_ok(req, SMBopen, 7, 0); - if (res != 0) { - if (mode == read_write && - (res == -EACCES || res == -ETXTBSY || res == -EROFS)) - { - VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n", - DENTRY_PATH(dentry), res); - mode = read_only; - req->rq_flags = 0; - goto retry; - } - goto out_free; - } - /* We should now have data in vwv[0..6]. */ - - ei->fileid = WVAL(req->rq_header, smb_vwv0); - ei->attr = WVAL(req->rq_header, smb_vwv1); - /* smb_vwv2 has mtime */ - /* smb_vwv4 has size */ - ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK); - ei->open = server->generation; - -out_free: - smb_rput(req); -out: - return res; -} - -/* - * Make sure the file is open, and check that the access - * is compatible with the desired access. 
- */ -int -smb_open(struct dentry *dentry, int wish) -{ - struct inode *inode = dentry->d_inode; - int result; - __u16 access; - - result = -ENOENT; - if (!inode) { - printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n", - DENTRY_PATH(dentry)); - goto out; - } - - if (!smb_is_open(inode)) { - struct smb_sb_info *server = server_from_inode(inode); - result = 0; - if (!smb_is_open(inode)) - result = smb_proc_open(server, dentry, wish); - if (result) - goto out; - /* - * A successful open means the path is still valid ... - */ - smb_renew_times(dentry); - } - - /* - * Check whether the access is compatible with the desired mode. - */ - result = 0; - access = SMB_I(inode)->access; - if (access != wish && access != SMB_O_RDWR) { - PARANOIA("%s/%s access denied, access=%x, wish=%x\n", - DENTRY_PATH(dentry), access, wish); - result = -EACCES; - } -out: - return result; -} - -static int -smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime) -{ - struct smb_request *req; - int result = -ENOMEM; - - if (! (req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBclose, 3, 0); - WSET(req->rq_header, smb_vwv0, fileid); - DSET(req->rq_header, smb_vwv1, utc2local(server, mtime)); - req->rq_flags |= SMB_REQ_NORETRY; - result = smb_request_ok(req, SMBclose, 0, 0); - - smb_rput(req); -out: - return result; -} - -/* - * Win NT 4.0 has an apparent bug in that it fails to update the - * modify time when writing to a file. As a workaround, we update - * both modify and access time locally, and post the times to the - * server when closing the file. - */ -static int -smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino) -{ - struct smb_inode_info *ei = SMB_I(ino); - int result = 0; - if (smb_is_open(ino)) - { - /* - * We clear the open flag in advance, in case another - * process observes the value while we block below. - */ - ei->open = 0; - - /* - * Kludge alert: SMB timestamps are accurate only to - * two seconds ... 
round the times to avoid needless - * cache invalidations! - */ - if (ino->i_mtime.tv_sec & 1) { - ino->i_mtime.tv_sec--; - ino->i_mtime.tv_nsec = 0; - } - if (ino->i_atime.tv_sec & 1) { - ino->i_atime.tv_sec--; - ino->i_atime.tv_nsec = 0; - } - /* - * If the file is open with write permissions, - * update the time stamps to sync mtime and atime. - */ - if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 && - (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) && - !(ei->access == SMB_O_RDONLY)) - { - struct smb_fattr fattr; - smb_get_inode_attr(ino, &fattr); - smb_proc_setattr_ext(server, ino, &fattr); - } - - result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec); - /* - * Force a revalidation after closing ... some servers - * don't post the size until the file has been closed. - */ - if (server->opt.protocol < SMB_PROTOCOL_NT1) - ei->oldmtime = 0; - ei->closed = jiffies; - } - return result; -} - -int -smb_close(struct inode *ino) -{ - int result = 0; - - if (smb_is_open(ino)) { - struct smb_sb_info *server = server_from_inode(ino); - result = smb_proc_close_inode(server, ino); - } - return result; -} - -/* - * This is used to close a file following a failed instantiate. - * Since we don't have an inode, we can't use any of the above. - */ -int -smb_close_fileid(struct dentry *dentry, __u16 fileid) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - int result; - - result = smb_proc_close(server, fileid, get_seconds()); - return result; -} - -/* In smb_proc_read and smb_proc_write we do not retry, because the - file-id would not be valid after a reconnection. 
*/ - -static void -smb_proc_read_data(struct smb_request *req) -{ - req->rq_iov[0].iov_base = req->rq_buffer; - req->rq_iov[0].iov_len = 3; - - req->rq_iov[1].iov_base = req->rq_page; - req->rq_iov[1].iov_len = req->rq_rsize; - req->rq_iovlen = 2; - - req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd; -} - -static int -smb_proc_read(struct inode *inode, loff_t offset, int count, char *data) -{ - struct smb_sb_info *server = server_from_inode(inode); - __u16 returned_count, data_len; - unsigned char *buf; - int result; - struct smb_request *req; - u8 rbuf[4]; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBread, 5, 0); - buf = req->rq_header; - WSET(buf, smb_vwv0, SMB_I(inode)->fileid); - WSET(buf, smb_vwv1, count); - DSET(buf, smb_vwv2, offset); - WSET(buf, smb_vwv4, 0); - - req->rq_page = data; - req->rq_rsize = count; - req->rq_callback = smb_proc_read_data; - req->rq_buffer = rbuf; - req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC; - - result = smb_request_ok(req, SMBread, 5, -1); - if (result < 0) - goto out_free; - returned_count = WVAL(req->rq_header, smb_vwv0); - - data_len = WVAL(rbuf, 1); - - if (returned_count != data_len) { - printk(KERN_NOTICE "smb_proc_read: returned != data_len\n"); - printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n", - returned_count, data_len); - } - result = data_len; - -out_free: - smb_rput(req); -out: - VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n", - inode->i_ino, SMB_I(inode)->fileid, count, result); - return result; -} - -static int -smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data) -{ - struct smb_sb_info *server = server_from_inode(inode); - int result; - u16 fileid = SMB_I(inode)->fileid; - u8 buf[4]; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, 0))) - goto out; - - VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n", - inode->i_ino, fileid, count, offset); - - smb_setup_header(req, SMBwrite, 5, count + 3); - WSET(req->rq_header, smb_vwv0, fileid); - WSET(req->rq_header, smb_vwv1, count); - DSET(req->rq_header, smb_vwv2, offset); - WSET(req->rq_header, smb_vwv4, 0); - - buf[0] = 1; - WSET(buf, 1, count); /* yes, again ... */ - req->rq_iov[1].iov_base = buf; - req->rq_iov[1].iov_len = 3; - req->rq_iov[2].iov_base = (char *) data; - req->rq_iov[2].iov_len = count; - req->rq_iovlen = 3; - req->rq_flags |= SMB_REQ_NORETRY; - - result = smb_request_ok(req, SMBwrite, 1, 0); - if (result >= 0) - result = WVAL(req->rq_header, smb_vwv0); - - smb_rput(req); -out: - return result; -} - -/* - * In smb_proc_readX and smb_proc_writeX we do not retry, because the - * file-id would not be valid after a reconnection. - */ - -#define SMB_READX_MAX_PAD 64 -static void -smb_proc_readX_data(struct smb_request *req) -{ - /* header length, excluding the netbios length (-4) */ - int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2; - int data_off = WVAL(req->rq_header, smb_vwv6); - - /* - * Some genius made the padding to the data bytes arbitrary. - * So we must first calculate the amount of padding used by the server. 
- */ - data_off -= hdrlen; - if (data_off > SMB_READX_MAX_PAD || data_off < 0) { - PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n"); - PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off); - req->rq_rlen = req->rq_bufsize + 1; - return; - } - req->rq_iov[0].iov_base = req->rq_buffer; - req->rq_iov[0].iov_len = data_off; - - req->rq_iov[1].iov_base = req->rq_page; - req->rq_iov[1].iov_len = req->rq_rsize; - req->rq_iovlen = 2; - - req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd; -} - -static int -smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data) -{ - struct smb_sb_info *server = server_from_inode(inode); - unsigned char *buf; - int result; - struct smb_request *req; - static char pad[SMB_READX_MAX_PAD]; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBreadX, 12, 0); - buf = req->rq_header; - WSET(buf, smb_vwv0, 0x00ff); - WSET(buf, smb_vwv1, 0); - WSET(buf, smb_vwv2, SMB_I(inode)->fileid); - DSET(buf, smb_vwv3, (u32)offset); /* low 32 bits */ - WSET(buf, smb_vwv5, count); - WSET(buf, smb_vwv6, 0); - DSET(buf, smb_vwv7, 0); - WSET(buf, smb_vwv9, 0); - DSET(buf, smb_vwv10, (u32)(offset >> 32)); /* high 32 bits */ - WSET(buf, smb_vwv11, 0); - - req->rq_page = data; - req->rq_rsize = count; - req->rq_callback = smb_proc_readX_data; - req->rq_buffer = pad; - req->rq_bufsize = SMB_READX_MAX_PAD; - req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY; - - result = smb_request_ok(req, SMBreadX, 12, -1); - if (result < 0) - goto out_free; - result = WVAL(req->rq_header, smb_vwv5); - -out_free: - smb_rput(req); -out: - VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n", - inode->i_ino, SMB_I(inode)->fileid, count, result); - return result; -} - -static int -smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data) -{ - struct smb_sb_info *server = server_from_inode(inode); - int result; - u8 *p; - static u8 pad[4]; - 
struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n", - inode->i_ino, SMB_I(inode)->fileid, count, offset); - - p = smb_setup_header(req, SMBwriteX, 14, count + 1); - WSET(req->rq_header, smb_vwv0, 0x00ff); - WSET(req->rq_header, smb_vwv1, 0); - WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid); - DSET(req->rq_header, smb_vwv3, (u32)offset); /* low 32 bits */ - DSET(req->rq_header, smb_vwv5, 0); - WSET(req->rq_header, smb_vwv7, 0); /* write mode */ - WSET(req->rq_header, smb_vwv8, 0); - WSET(req->rq_header, smb_vwv9, 0); - WSET(req->rq_header, smb_vwv10, count); /* data length */ - WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1); - DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32)); - - req->rq_iov[1].iov_base = pad; - req->rq_iov[1].iov_len = 1; - req->rq_iov[2].iov_base = (char *) data; - req->rq_iov[2].iov_len = count; - req->rq_iovlen = 3; - req->rq_flags |= SMB_REQ_NORETRY; - - result = smb_request_ok(req, SMBwriteX, 6, 0); - if (result >= 0) - result = WVAL(req->rq_header, smb_vwv2); - - smb_rput(req); -out: - return result; -} - -int -smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - char *p; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - p = smb_setup_header(req, SMBcreate, 3, 0); - WSET(req->rq_header, smb_vwv0, attr); - DSET(req->rq_header, smb_vwv1, utc2local(server, ctime)); - result = smb_simple_encode_path(req, &p, dentry, NULL); - if (result < 0) - goto out_free; - smb_setup_bcc(req, p); - - result = smb_request_ok(req, SMBcreate, 1, 0); - if (result < 0) - goto out_free; - - *fileid = WVAL(req->rq_header, smb_vwv0); - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -int -smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry) -{ - struct smb_sb_info *server = server_from_dentry(old_dentry); - char *p; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - p = smb_setup_header(req, SMBmv, 1, 0); - WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR); - result = smb_simple_encode_path(req, &p, old_dentry, NULL); - if (result < 0) - goto out_free; - result = smb_simple_encode_path(req, &p, new_dentry, NULL); - if (result < 0) - goto out_free; - smb_setup_bcc(req, p); - - if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0) - goto out_free; - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -/* - * Code common to mkdir and rmdir. - */ -static int -smb_proc_generic_command(struct dentry *dentry, __u8 command) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - char *p; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - p = smb_setup_header(req, command, 0, 0); - result = smb_simple_encode_path(req, &p, dentry, NULL); - if (result < 0) - goto out_free; - smb_setup_bcc(req, p); - - result = smb_request_ok(req, command, 0, 0); - if (result < 0) - goto out_free; - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -int -smb_proc_mkdir(struct dentry *dentry) -{ - return smb_proc_generic_command(dentry, SMBmkdir); -} - -int -smb_proc_rmdir(struct dentry *dentry) -{ - return smb_proc_generic_command(dentry, SMBrmdir); -} - -#if SMBFS_POSIX_UNLINK -/* - * Removes readonly attribute from a file. Used by unlink to give posix - * semantics. - */ -static int -smb_set_rw(struct dentry *dentry,struct smb_sb_info *server) -{ - int result; - struct smb_fattr fattr; - - /* FIXME: cifsUE should allow removing a readonly file. */ - - /* first get current attribute */ - smb_init_dirent(server, &fattr); - result = server->ops->getattr(server, dentry, &fattr); - smb_finish_dirent(server, &fattr); - if (result < 0) - return result; - - /* if RONLY attribute is set, remove it */ - if (fattr.attr & aRONLY) { /* read only attribute is set */ - fattr.attr &= ~aRONLY; - result = smb_proc_setattr_core(server, dentry, fattr.attr); - } - return result; -} -#endif - -int -smb_proc_unlink(struct dentry *dentry) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - int flag = 0; - char *p; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - retry: - p = smb_setup_header(req, SMBunlink, 1, 0); - WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN); - result = smb_simple_encode_path(req, &p, dentry, NULL); - if (result < 0) - goto out_free; - smb_setup_bcc(req, p); - - if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) { -#if SMBFS_POSIX_UNLINK - if (result == -EACCES && !flag) { - /* Posix semantics is for the read-only state - of a file to be ignored in unlink(). In the - SMB world a unlink() is refused on a - read-only file. To make things easier for - unix users we try to override the files - permission if the unlink fails with the - right error. - This introduces a race condition that could - lead to a file being written by someone who - shouldn't have access, but as far as I can - tell that is unavoidable */ - - /* remove RONLY attribute and try again */ - result = smb_set_rw(dentry,server); - if (result == 0) { - flag = 1; - req->rq_flags = 0; - goto retry; - } - } -#endif - goto out_free; - } - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -int -smb_proc_flush(struct smb_sb_info *server, __u16 fileid) -{ - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBflush, 1, 0); - WSET(req->rq_header, smb_vwv0, fileid); - req->rq_flags |= SMB_REQ_NORETRY; - result = smb_request_ok(req, SMBflush, 0, 0); - - smb_rput(req); -out: - return result; -} - -static int -smb_proc_trunc32(struct inode *inode, loff_t length) -{ - /* - * Writing 0bytes is old-SMB magic for truncating files. - * MAX_NON_LFS should prevent this from being called with a too - * large offset. 
- */ - return smb_proc_write(inode, length, 0, NULL); -} - -static int -smb_proc_trunc64(struct inode *inode, loff_t length) -{ - struct smb_sb_info *server = server_from_inode(inode); - int result; - char *param; - char *data; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 14))) - goto out; - - param = req->rq_buffer; - data = req->rq_buffer + 6; - - /* FIXME: must we also set allocation size? winNT seems to do that */ - WSET(param, 0, SMB_I(inode)->fileid); - WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO); - WSET(param, 4, 0); - LSET(data, 0, length); - - req->rq_trans2_command = TRANSACT2_SETFILEINFO; - req->rq_ldata = 8; - req->rq_data = data; - req->rq_lparm = 6; - req->rq_parm = param; - req->rq_flags |= SMB_REQ_NORETRY; - result = smb_add_request(req); - if (result < 0) - goto out_free; - - result = 0; - if (req->rq_rcls != 0) - result = smb_errno(req); - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_trunc95(struct inode *inode, loff_t length) -{ - struct smb_sb_info *server = server_from_inode(inode); - int result = smb_proc_trunc32(inode, length); - - /* - * win9x doesn't appear to update the size immediately. - * It will return the old file size after the truncate, - * confusing smbfs. So we force an update. - * - * FIXME: is this still necessary? 
- */ - smb_proc_flush(server, SMB_I(inode)->fileid); - return result; -} - -static void -smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr) -{ - memset(fattr, 0, sizeof(*fattr)); - - fattr->f_nlink = 1; - fattr->f_uid = server->mnt->uid; - fattr->f_gid = server->mnt->gid; - fattr->f_unix = 0; -} - -static void -smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr) -{ - if (fattr->f_unix) - return; - - fattr->f_mode = server->mnt->file_mode; - if (fattr->attr & aDIR) { - fattr->f_mode = server->mnt->dir_mode; - fattr->f_size = SMB_ST_BLKSIZE; - } - /* Check the read-only flag */ - if (fattr->attr & aRONLY) - fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH); - - /* How many 512 byte blocks do we need for this file? */ - fattr->f_blocks = 0; - if (fattr->f_size != 0) - fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9); - return; -} - -void -smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr, - struct super_block *sb) -{ - smb_init_dirent(server, fattr); - fattr->attr = aDIR; - fattr->f_ino = 2; /* traditional root inode number */ - fattr->f_mtime = current_fs_time(sb); - smb_finish_dirent(server, fattr); -} - -/* - * Decode a dirent for old protocols - * - * qname is filled with the decoded, and possibly translated, name. - * fattr receives decoded attributes - * - * Bugs Noted: - * (1) Pathworks servers may pad the name with extra spaces. - */ -static char * -smb_decode_short_dirent(struct smb_sb_info *server, char *p, - struct qstr *qname, struct smb_fattr *fattr, - unsigned char *name_buf) -{ - int len; - - /* - * SMB doesn't have a concept of inode numbers ... - */ - smb_init_dirent(server, fattr); - fattr->f_ino = 0; /* FIXME: do we need this? 
*/ - - p += SMB_STATUS_SIZE; /* reserved (search_status) */ - fattr->attr = *p; - fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1)); - fattr->f_mtime.tv_nsec = 0; - fattr->f_size = DVAL(p, 5); - fattr->f_ctime = fattr->f_mtime; - fattr->f_atime = fattr->f_mtime; - qname->name = p + 9; - len = strnlen(qname->name, 12); - - /* - * Trim trailing blanks for Pathworks servers - */ - while (len > 2 && qname->name[len-1] == ' ') - len--; - - smb_finish_dirent(server, fattr); - -#if 0 - /* FIXME: These only work for ascii chars, and recent smbmount doesn't - allow the flag to be set anyway. It kills const. Remove? */ - switch (server->opt.case_handling) { - case SMB_CASE_UPPER: - str_upper(entry->name, len); - break; - case SMB_CASE_LOWER: - str_lower(entry->name, len); - break; - default: - break; - } -#endif - - qname->len = 0; - len = server->ops->convert(name_buf, SMB_MAXNAMELEN, - qname->name, len, - server->remote_nls, server->local_nls); - if (len > 0) { - qname->len = len; - qname->name = name_buf; - DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name); - } - - return p + 22; -} - -/* - * This routine is used to read in directory entries from the network. 
- * Note that it is for short directory name seeks, i.e.: protocol < - * SMB_PROTOCOL_LANMAN2 - */ -static int -smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir, - struct smb_cache_control *ctl) -{ - struct dentry *dir = filp->f_path.dentry; - struct smb_sb_info *server = server_from_dentry(dir); - struct qstr qname; - struct smb_fattr fattr; - char *p; - int result; - int i, first, entries_seen, entries; - int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE; - __u16 bcc; - __u16 count; - char status[SMB_STATUS_SIZE]; - static struct qstr mask = { - .name = "*.*", - .len = 3, - }; - unsigned char *last_status; - struct smb_request *req; - unsigned char *name_buf; - - VERBOSE("%s/%s\n", DENTRY_PATH(dir)); - - lock_kernel(); - - result = -ENOMEM; - if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL))) - goto out; - - first = 1; - entries = 0; - entries_seen = 2; /* implicit . and .. */ - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, server->opt.max_xmit))) - goto out_name; - - while (1) { - p = smb_setup_header(req, SMBsearch, 2, 0); - WSET(req->rq_header, smb_vwv0, entries_asked); - WSET(req->rq_header, smb_vwv1, aDIR); - if (first == 1) { - result = smb_simple_encode_path(req, &p, dir, &mask); - if (result < 0) - goto out_free; - if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) { - result = -ENAMETOOLONG; - goto out_free; - } - *p++ = 5; - WSET(p, 0, 0); - p += 2; - first = 0; - } else { - if (p + 5 + SMB_STATUS_SIZE > - (char *)req->rq_buffer + req->rq_bufsize) { - result = -ENAMETOOLONG; - goto out_free; - } - - *p++ = 4; - *p++ = 0; - *p++ = 5; - WSET(p, 0, SMB_STATUS_SIZE); - p += 2; - memcpy(p, status, SMB_STATUS_SIZE); - p += SMB_STATUS_SIZE; - } - - smb_setup_bcc(req, p); - - result = smb_request_ok(req, SMBsearch, 1, -1); - if (result < 0) { - if ((req->rq_rcls == ERRDOS) && - (req->rq_err == ERRnofiles)) - break; - goto out_free; - } - count = WVAL(req->rq_header, smb_vwv0); - if (count <= 0) - 
break; - - result = -EIO; - bcc = smb_bcc(req->rq_header); - if (bcc != count * SMB_DIRINFO_SIZE + 3) - goto out_free; - p = req->rq_buffer + 3; - - - /* Make sure the response fits in the buffer. Fixed sized - entries means we don't have to check in the decode loop. */ - - last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE; - - if (last_status + SMB_DIRINFO_SIZE >= - req->rq_buffer + req->rq_bufsize) { - printk(KERN_ERR "smb_proc_readdir_short: " - "last dir entry outside buffer! " - "%d@%p %d@%p\n", SMB_DIRINFO_SIZE, last_status, - req->rq_bufsize, req->rq_buffer); - goto out_free; - } - - /* Read the last entry into the status field. */ - memcpy(status, last_status, SMB_STATUS_SIZE); - - - /* Now we are ready to parse smb directory entries. */ - - for (i = 0; i < count; i++) { - p = smb_decode_short_dirent(server, p, - &qname, &fattr, name_buf); - if (qname.len == 0) - continue; - - if (entries_seen == 2 && qname.name[0] == '.') { - if (qname.len == 1) - continue; - if (qname.name[1] == '.' && qname.len == 2) - continue; - } - if (!smb_fill_cache(filp, dirent, filldir, ctl, - &qname, &fattr)) - ; /* stop reading? */ - entries_seen++; - } - } - result = entries; - -out_free: - smb_rput(req); -out_name: - kfree(name_buf); -out: - unlock_kernel(); - return result; -} - -static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p) -{ - u64 size, disk_bytes; - - /* FIXME: verify nls support. all is sent as utf8? */ - - fattr->f_unix = 1; - fattr->f_mode = 0; - - /* FIXME: use the uniqueID from the remote instead? 
*/ - /* 0 L file size in bytes */ - /* 8 L file size on disk in bytes (block count) */ - /* 40 L uid */ - /* 48 L gid */ - /* 56 W file type */ - /* 60 L devmajor */ - /* 68 L devminor */ - /* 76 L unique ID (inode) */ - /* 84 L permissions */ - /* 92 L link count */ - - size = LVAL(p, 0); - disk_bytes = LVAL(p, 8); - - /* - * Some samba versions round up on-disk byte usage - * to 1MB boundaries, making it useless. When seeing - * that, use the size instead. - */ - if (!(disk_bytes & 0xfffff)) - disk_bytes = size+511; - - fattr->f_size = size; - fattr->f_blocks = disk_bytes >> 9; - fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16)); - fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24)); - fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32)); - - if (server->mnt->flags & SMB_MOUNT_UID) - fattr->f_uid = server->mnt->uid; - else - fattr->f_uid = LVAL(p, 40); - - if (server->mnt->flags & SMB_MOUNT_GID) - fattr->f_gid = server->mnt->gid; - else - fattr->f_gid = LVAL(p, 48); - - fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56)); - - if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) { - __u64 major = LVAL(p, 60); - __u64 minor = LVAL(p, 68); - - fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff); - if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) || - MINOR(fattr->f_rdev) != (minor & 0xffffffff)) - fattr->f_rdev = 0; - } - - fattr->f_mode |= LVAL(p, 84); - - if ( (server->mnt->flags & SMB_MOUNT_DMODE) && - (S_ISDIR(fattr->f_mode)) ) - fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR; - else if ( (server->mnt->flags & SMB_MOUNT_FMODE) && - !(S_ISDIR(fattr->f_mode)) ) - fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) | - (fattr->f_mode & S_IFMT); - -} - -/* - * Interpret a long filename structure using the specified info level: - * level 1 for anything below NT1 protocol - * level 260 for NT1 protocol - * - * qname is filled with the decoded, and possibly translated, name - * fattr receives decoded attributes. 
- * - * Bugs Noted: - * (1) Win NT 4.0 appends a null byte to names and counts it in the length! - */ -static char * -smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level, - struct qstr *qname, struct smb_fattr *fattr, - unsigned char *name_buf) -{ - char *result; - unsigned int len = 0; - int n; - __u16 date, time; - int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE); - - /* - * SMB doesn't have a concept of inode numbers ... - */ - smb_init_dirent(server, fattr); - fattr->f_ino = 0; /* FIXME: do we need this? */ - - switch (level) { - case 1: - len = *((unsigned char *) p + 22); - qname->name = p + 23; - result = p + 24 + len; - - date = WVAL(p, 0); - time = WVAL(p, 2); - fattr->f_ctime.tv_sec = date_dos2unix(server, date, time); - fattr->f_ctime.tv_nsec = 0; - - date = WVAL(p, 4); - time = WVAL(p, 6); - fattr->f_atime.tv_sec = date_dos2unix(server, date, time); - fattr->f_atime.tv_nsec = 0; - - date = WVAL(p, 8); - time = WVAL(p, 10); - fattr->f_mtime.tv_sec = date_dos2unix(server, date, time); - fattr->f_mtime.tv_nsec = 0; - fattr->f_size = DVAL(p, 12); - /* ULONG allocation size */ - fattr->attr = WVAL(p, 20); - - VERBOSE("info 1 at %p, len=%d, name=%.*s\n", - p, len, len, qname->name); - break; - case 260: - result = p + WVAL(p, 0); - len = DVAL(p, 60); - if (len > 255) len = 255; - /* NT4 null terminates, unless we are using unicode ... */ - qname->name = p + 94; - if (!unicode && len && qname->name[len-1] == '\0') - len--; - - fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8)); - fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16)); - fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24)); - /* change time (32) */ - fattr->f_size = LVAL(p, 40); - /* alloc size (48) */ - fattr->attr = DVAL(p, 56); - - VERBOSE("info 260 at %p, len=%d, name=%.*s\n", - p, len, len, qname->name); - break; - case SMB_FIND_FILE_UNIX: - result = p + WVAL(p, 0); - qname->name = p + 108; - - len = strlen(qname->name); - /* FIXME: should we check the length?? 
*/ - - p += 8; - smb_decode_unix_basic(fattr, server, p); - VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n", - p, len, len, qname->name); - break; - default: - PARANOIA("Unknown info level %d\n", level); - result = p + WVAL(p, 0); - goto out; - } - - smb_finish_dirent(server, fattr); - -#if 0 - /* FIXME: These only work for ascii chars, and recent smbmount doesn't - allow the flag to be set anyway. Remove? */ - switch (server->opt.case_handling) { - case SMB_CASE_UPPER: - str_upper(qname->name, len); - break; - case SMB_CASE_LOWER: - str_lower(qname->name, len); - break; - default: - break; - } -#endif - - qname->len = 0; - n = server->ops->convert(name_buf, SMB_MAXNAMELEN, - qname->name, len, - server->remote_nls, server->local_nls); - if (n > 0) { - qname->len = n; - qname->name = name_buf; - } - -out: - return result; -} - -/* findfirst/findnext flags */ -#define SMB_CLOSE_AFTER_FIRST (1<<0) -#define SMB_CLOSE_IF_END (1<<1) -#define SMB_REQUIRE_RESUME_KEY (1<<2) -#define SMB_CONTINUE_BIT (1<<3) - -/* - * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in - * source/libsmb/clilist.c. When looking for smb bugs in the readdir code, - * go there for advise. - * - * Bugs Noted: - * (1) When using Info Level 1 Win NT 4.0 truncates directory listings - * for certain patterns of names and/or lengths. The breakage pattern - * is completely reproducible and can be toggled by the creation of a - * single file. (E.g. echo hi >foo breaks, rm -f foo works.) - */ -static int -smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir, - struct smb_cache_control *ctl) -{ - struct dentry *dir = filp->f_path.dentry; - struct smb_sb_info *server = server_from_dentry(dir); - struct qstr qname; - struct smb_fattr fattr; - - unsigned char *p, *lastname; - char *mask, *param; - __u16 command; - int first, entries_seen; - - /* Both NT and OS/2 accept info level 1 (but see note below). 
*/ - int info_level = 260; - const int max_matches = 512; - - unsigned int ff_searchcount = 0; - unsigned int ff_eos = 0; - unsigned int ff_lastname = 0; - unsigned int ff_dir_handle = 0; - unsigned int loop_count = 0; - unsigned int mask_len, i; - int result; - struct smb_request *req; - unsigned char *name_buf; - static struct qstr star = { - .name = "*", - .len = 1, - }; - - lock_kernel(); - - /* - * We always prefer unix style. Use info level 1 for older - * servers that don't do 260. - */ - if (server->opt.capabilities & SMB_CAP_UNIX) - info_level = SMB_FIND_FILE_UNIX; - else if (server->opt.protocol < SMB_PROTOCOL_NT1) - info_level = 1; - - result = -ENOMEM; - if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL))) - goto out; - if (! (req = smb_alloc_request(server, server->opt.max_xmit))) - goto out_name; - param = req->rq_buffer; - - /* - * Encode the initial path - */ - mask = param + 12; - - result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star); - if (result <= 0) - goto out_free; - mask_len = result - 1; /* mask_len is strlen, not #bytes */ - result = 0; - first = 1; - VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask); - - entries_seen = 2; - ff_eos = 0; - - while (ff_eos == 0) { - loop_count += 1; - if (loop_count > 10) { - printk(KERN_WARNING "smb_proc_readdir_long: " - "Looping in FIND_NEXT??\n"); - result = -EIO; - break; - } - - if (first != 0) { - command = TRANSACT2_FINDFIRST; - WSET(param, 0, aSYSTEM | aHIDDEN | aDIR); - WSET(param, 2, max_matches); /* max count */ - WSET(param, 4, SMB_CLOSE_IF_END); - WSET(param, 6, info_level); - DSET(param, 8, 0); - } else { - command = TRANSACT2_FINDNEXT; - - VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n", - ff_dir_handle, ff_lastname, mask_len, mask); - - WSET(param, 0, ff_dir_handle); /* search handle */ - WSET(param, 2, max_matches); /* max count */ - WSET(param, 4, info_level); - DSET(param, 6, 0); - WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END); - } - - 
req->rq_trans2_command = command; - req->rq_ldata = 0; - req->rq_data = NULL; - req->rq_lparm = 12 + mask_len + 1; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) { - PARANOIA("error=%d, breaking\n", result); - break; - } - - if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) { - /* a damn Win95 bug - sometimes it clags if you - ask it too fast */ - schedule_timeout_interruptible(msecs_to_jiffies(200)); - continue; - } - - if (req->rq_rcls != 0) { - result = smb_errno(req); - PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n", - mask, result, req->rq_rcls, req->rq_err); - break; - } - - /* parse out some important return info */ - if (first != 0) { - ff_dir_handle = WVAL(req->rq_parm, 0); - ff_searchcount = WVAL(req->rq_parm, 2); - ff_eos = WVAL(req->rq_parm, 4); - ff_lastname = WVAL(req->rq_parm, 8); - } else { - ff_searchcount = WVAL(req->rq_parm, 0); - ff_eos = WVAL(req->rq_parm, 2); - ff_lastname = WVAL(req->rq_parm, 6); - } - - if (ff_searchcount == 0) - break; - - /* Now we are ready to parse smb directory entries. */ - - /* point to the data bytes */ - p = req->rq_data; - for (i = 0; i < ff_searchcount; i++) { - /* make sure we stay within the buffer */ - if (p >= req->rq_data + req->rq_ldata) { - printk(KERN_ERR "smb_proc_readdir_long: " - "dirent pointer outside buffer! " - "%p %d@%p\n", - p, req->rq_ldata, req->rq_data); - result = -EIO; /* always a comm. error? */ - goto out_free; - } - - p = smb_decode_long_dirent(server, p, info_level, - &qname, &fattr, name_buf); - - /* ignore . and .. from the server */ - if (entries_seen == 2 && qname.name[0] == '.') { - if (qname.len == 1) - continue; - if (qname.name[1] == '.' && qname.len == 2) - continue; - } - - if (!smb_fill_cache(filp, dirent, filldir, ctl, - &qname, &fattr)) - ; /* stop reading? */ - entries_seen++; - } - - VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos); - - /* - * We might need the lastname for continuations. 
- * - * Note that some servers (win95?) point to the filename and - * others (NT4, Samba using NT1) to the dir entry. We assume - * here that those who do not point to a filename do not need - * this info to continue the listing. - * - * OS/2 needs this and talks infolevel 1. - * NetApps want lastname with infolevel 260. - * win2k want lastname with infolevel 260, and points to - * the record not to the name. - * Samba+CifsUnixExt doesn't need lastname. - * - * Both are happy if we return the data they point to. So we do. - * (FIXME: above is not true with win2k) - */ - mask_len = 0; - if (info_level != SMB_FIND_FILE_UNIX && - ff_lastname > 0 && ff_lastname < req->rq_ldata) { - lastname = req->rq_data + ff_lastname; - - switch (info_level) { - case 260: - mask_len = req->rq_ldata - ff_lastname; - break; - case 1: - /* lastname points to a length byte */ - mask_len = *lastname++; - if (ff_lastname + 1 + mask_len > req->rq_ldata) - mask_len = req->rq_ldata - ff_lastname - 1; - break; - } - - /* - * Update the mask string for the next message. - */ - if (mask_len > 255) - mask_len = 255; - if (mask_len) - strncpy(mask, lastname, mask_len); - } - mask_len = strnlen(mask, mask_len); - VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n", - mask_len, ff_lastname, req->rq_ldata, mask_len, mask); - - first = 0; - loop_count = 0; - } - -out_free: - smb_rput(req); -out_name: - kfree(name_buf); -out: - unlock_kernel(); - return result; -} - -/* - * This version uses the trans2 TRANSACT2_FINDFIRST message - * to get the attribute data. - * - * Bugs Noted: - */ -static int -smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry, - struct smb_fattr *fattr) -{ - char *param, *mask; - __u16 date, time; - int mask_len, result; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - mask = param + 12; - - mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry,NULL); - if (mask_len < 0) { - result = mask_len; - goto out_free; - } - VERBOSE("name=%s, len=%d\n", mask, mask_len); - WSET(param, 0, aSYSTEM | aHIDDEN | aDIR); - WSET(param, 2, 1); /* max count */ - WSET(param, 4, 1); /* close after this call */ - WSET(param, 6, 1); /* info_level */ - DSET(param, 8, 0); - - req->rq_trans2_command = TRANSACT2_FINDFIRST; - req->rq_ldata = 0; - req->rq_data = NULL; - req->rq_lparm = 12 + mask_len; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - if (req->rq_rcls != 0) { - result = smb_errno(req); -#ifdef SMBFS_PARANOIA - if (result != -ENOENT) - PARANOIA("error for %s, rcls=%d, err=%d\n", - mask, req->rq_rcls, req->rq_err); -#endif - goto out_free; - } - /* Make sure we got enough data ... */ - result = -EINVAL; - if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) { - PARANOIA("bad result for %s, len=%d, count=%d\n", - mask, req->rq_ldata, WVAL(req->rq_parm, 2)); - goto out_free; - } - - /* - * Decode the response into the fattr ... 
- */ - date = WVAL(req->rq_data, 0); - time = WVAL(req->rq_data, 2); - fattr->f_ctime.tv_sec = date_dos2unix(server, date, time); - fattr->f_ctime.tv_nsec = 0; - - date = WVAL(req->rq_data, 4); - time = WVAL(req->rq_data, 6); - fattr->f_atime.tv_sec = date_dos2unix(server, date, time); - fattr->f_atime.tv_nsec = 0; - - date = WVAL(req->rq_data, 8); - time = WVAL(req->rq_data, 10); - fattr->f_mtime.tv_sec = date_dos2unix(server, date, time); - fattr->f_mtime.tv_nsec = 0; - VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n", - mask, date, time, fattr->f_mtime.tv_sec); - fattr->f_size = DVAL(req->rq_data, 12); - /* ULONG allocation size */ - fattr->attr = WVAL(req->rq_data, 20); - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *fattr) -{ - int result; - char *p; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - p = smb_setup_header(req, SMBgetatr, 0, 0); - result = smb_simple_encode_path(req, &p, dir, NULL); - if (result < 0) - goto out_free; - smb_setup_bcc(req, p); - - if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0) - goto out_free; - fattr->attr = WVAL(req->rq_header, smb_vwv0); - fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1)); - fattr->f_mtime.tv_nsec = 0; - fattr->f_size = DVAL(req->rq_header, smb_vwv3); - fattr->f_ctime = fattr->f_mtime; - fattr->f_atime = fattr->f_mtime; -#ifdef SMBFS_DEBUG_TIMESTAMP - printk("getattr_core: %s/%s, mtime=%ld\n", - DENTRY_PATH(dir), fattr->f_mtime); -#endif - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -/* - * Bugs Noted: - * (1) Win 95 swaps the date and time fields in the standard info level. 
- */ -static int -smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir, - struct smb_request *req, int infolevel) -{ - char *p, *param; - int result; - - param = req->rq_buffer; - WSET(param, 0, infolevel); - DSET(param, 2, 0); - result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL); - if (result < 0) - goto out; - p = param + 6 + result; - - req->rq_trans2_command = TRANSACT2_QPATHINFO; - req->rq_ldata = 0; - req->rq_data = NULL; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out; - if (req->rq_rcls != 0) { - VERBOSE("for %s: result=%d, rcls=%d, err=%d\n", - &param[6], result, req->rq_rcls, req->rq_err); - result = smb_errno(req); - goto out; - } - result = -ENOENT; - if (req->rq_ldata < 22) { - PARANOIA("not enough data for %s, len=%d\n", - &param[6], req->rq_ldata); - goto out; - } - - result = 0; -out: - return result; -} - -static int -smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *attr) -{ - u16 date, time; - int off_date = 0, off_time = 2; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD); - if (result < 0) - goto out_free; - - /* - * Kludge alert: Win 95 swaps the date and time field, - * contrary to the CIFS docs and Win NT practice. 
- */ - if (server->mnt->flags & SMB_MOUNT_WIN95) { - off_date = 2; - off_time = 0; - } - date = WVAL(req->rq_data, off_date); - time = WVAL(req->rq_data, off_time); - attr->f_ctime.tv_sec = date_dos2unix(server, date, time); - attr->f_ctime.tv_nsec = 0; - - date = WVAL(req->rq_data, 4 + off_date); - time = WVAL(req->rq_data, 4 + off_time); - attr->f_atime.tv_sec = date_dos2unix(server, date, time); - attr->f_atime.tv_nsec = 0; - - date = WVAL(req->rq_data, 8 + off_date); - time = WVAL(req->rq_data, 8 + off_time); - attr->f_mtime.tv_sec = date_dos2unix(server, date, time); - attr->f_mtime.tv_nsec = 0; -#ifdef SMBFS_DEBUG_TIMESTAMP - printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", - DENTRY_PATH(dir), date, time, attr->f_mtime); -#endif - attr->f_size = DVAL(req->rq_data, 12); - attr->attr = WVAL(req->rq_data, 20); - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *attr) -{ - struct smb_request *req; - int result; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - result = smb_proc_getattr_trans2(server, dir, req, - SMB_QUERY_FILE_ALL_INFO); - if (result < 0) - goto out_free; - - attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0)); - attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8)); - attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16)); - /* change (24) */ - attr->attr = WVAL(req->rq_data, 32); - /* pad? (34) */ - /* allocated size (40) */ - attr->f_size = LVAL(req->rq_data, 48); - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *attr) -{ - struct smb_request *req; - int result; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - result = smb_proc_getattr_trans2(server, dir, req, - SMB_QUERY_FILE_UNIX_BASIC); - if (result < 0) - goto out_free; - - smb_decode_unix_basic(attr, server, req->rq_data); - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *attr) -{ - struct inode *inode = dir->d_inode; - int result; - - /* FIXME: why not use the "all" version? */ - result = smb_proc_getattr_trans2_std(server, dir, attr); - if (result < 0) - goto out; - - /* - * None of the getattr versions here can make win9x return the right - * filesize if there are changes made to an open file. - * A seek-to-end does return the right size, but we only need to do - * that on files we have written. - */ - if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE && - smb_is_open(inode)) - { - __u16 fileid = SMB_I(inode)->fileid; - attr->f_size = smb_proc_seek(server, fileid, 2, 0); - } - -out: - return result; -} - -static int -smb_proc_ops_wait(struct smb_sb_info *server) -{ - int result; - - result = wait_event_interruptible_timeout(server->conn_wq, - server->conn_complete, 30*HZ); - - if (!result || signal_pending(current)) - return -EIO; - - return 0; -} - -static int -smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir, - struct smb_fattr *fattr) -{ - int result; - - if (smb_proc_ops_wait(server) < 0) - return -EIO; - - smb_init_dirent(server, fattr); - result = server->ops->getattr(server, dir, fattr); - smb_finish_dirent(server, fattr); - - return result; -} - -static int -smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir, - struct smb_cache_control *ctl) -{ - struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry); - - if (smb_proc_ops_wait(server) < 0) - return -EIO; - - return server->ops->readdir(filp, dirent, filldir, ctl); -} - -int -smb_proc_getattr(struct dentry *dir, struct smb_fattr 
*fattr) -{ - struct smb_sb_info *server = server_from_dentry(dir); - int result; - - smb_init_dirent(server, fattr); - result = server->ops->getattr(server, dir, fattr); - smb_finish_dirent(server, fattr); - - return result; -} - - -/* - * Because of bugs in the core protocol, we use this only to set - * attributes. See smb_proc_settime() below for timestamp handling. - * - * Bugs Noted: - * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail - * with an undocumented error (ERRDOS code 50). Setting - * mtime to 0 allows the attributes to be set. - * (2) The extra parameters following the name string aren't - * in the CIFS docs, but seem to be necessary for operation. - */ -static int -smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry, - __u16 attr) -{ - char *p; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - - p = smb_setup_header(req, SMBsetatr, 8, 0); - WSET(req->rq_header, smb_vwv0, attr); - DSET(req->rq_header, smb_vwv1, 0); /* mtime */ - WSET(req->rq_header, smb_vwv3, 0); /* reserved values */ - WSET(req->rq_header, smb_vwv4, 0); - WSET(req->rq_header, smb_vwv5, 0); - WSET(req->rq_header, smb_vwv6, 0); - WSET(req->rq_header, smb_vwv7, 0); - result = smb_simple_encode_path(req, &p, dentry, NULL); - if (result < 0) - goto out_free; - if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) { - result = -ENAMETOOLONG; - goto out_free; - } - *p++ = 4; - *p++ = 0; - smb_setup_bcc(req, p); - - result = smb_request_ok(req, SMBsetatr, 0, 0); - if (result < 0) - goto out_free; - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -/* - * Because of bugs in the trans2 setattr messages, we must set - * attributes and timestamps separately. The core SMBsetatr - * message seems to be the only reliable way to set attributes. 
- */ -int -smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr) -{ - struct smb_sb_info *server = server_from_dentry(dir); - int result; - - VERBOSE("setting %s/%s, open=%d\n", - DENTRY_PATH(dir), smb_is_open(dir->d_inode)); - result = smb_proc_setattr_core(server, dir, fattr->attr); - return result; -} - -/* - * Sets the timestamps for an file open with write permissions. - */ -static int -smb_proc_setattr_ext(struct smb_sb_info *server, - struct inode *inode, struct smb_fattr *fattr) -{ - __u16 date, time; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBsetattrE, 7, 0); - WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid); - /* We don't change the creation time */ - WSET(req->rq_header, smb_vwv1, 0); - WSET(req->rq_header, smb_vwv2, 0); - date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time); - WSET(req->rq_header, smb_vwv3, date); - WSET(req->rq_header, smb_vwv4, time); - date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time); - WSET(req->rq_header, smb_vwv5, date); - WSET(req->rq_header, smb_vwv6, time); -#ifdef SMBFS_DEBUG_TIMESTAMP - printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n", - date, time, fattr->f_mtime); -#endif - - req->rq_flags |= SMB_REQ_NORETRY; - result = smb_request_ok(req, SMBsetattrE, 0, 0); - if (result < 0) - goto out_free; - result = 0; -out_free: - smb_rput(req); -out: - return result; -} - -/* - * Bugs Noted: - * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't - * set the file's attribute flags. - */ -static int -smb_proc_setattr_trans2(struct smb_sb_info *server, - struct dentry *dir, struct smb_fattr *fattr) -{ - __u16 date, time; - char *p, *param; - int result; - char data[26]; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - - WSET(param, 0, 1); /* Info level SMB_INFO_STANDARD */ - DSET(param, 2, 0); - result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL); - if (result < 0) - goto out_free; - p = param + 6 + result; - - WSET(data, 0, 0); /* creation time */ - WSET(data, 2, 0); - date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time); - WSET(data, 4, date); - WSET(data, 6, time); - date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time); - WSET(data, 8, date); - WSET(data, 10, time); -#ifdef SMBFS_DEBUG_TIMESTAMP - printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", - DENTRY_PATH(dir), date, time, fattr->f_mtime); -#endif - DSET(data, 12, 0); /* size */ - DSET(data, 16, 0); /* blksize */ - WSET(data, 20, 0); /* attr */ - DSET(data, 22, 0); /* ULONG EA size */ - - req->rq_trans2_command = TRANSACT2_SETPATHINFO; - req->rq_ldata = 26; - req->rq_data = data; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - result = 0; - if (req->rq_rcls != 0) - result = smb_errno(req); - -out_free: - smb_rput(req); -out: - return result; -} - -/* - * ATTR_MODE 0x001 - * ATTR_UID 0x002 - * ATTR_GID 0x004 - * ATTR_SIZE 0x008 - * ATTR_ATIME 0x010 - * ATTR_MTIME 0x020 - * ATTR_CTIME 0x040 - * ATTR_ATIME_SET 0x080 - * ATTR_MTIME_SET 0x100 - * ATTR_FORCE 0x200 - * ATTR_ATTR_FLAG 0x400 - * - * major/minor should only be set by mknod. - */ -int -smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, - unsigned int major, unsigned int minor) -{ - struct smb_sb_info *server = server_from_dentry(d); - u64 nttime; - char *p, *param; - int result; - char data[100]; - struct smb_request *req; - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - - DEBUG1("valid flags = 0x%04x\n", attr->ia_valid); - - WSET(param, 0, SMB_SET_FILE_UNIX_BASIC); - DSET(param, 2, 0); - result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL); - if (result < 0) - goto out_free; - p = param + 6 + result; - - /* 0 L file size in bytes */ - /* 8 L file size on disk in bytes (block count) */ - /* 40 L uid */ - /* 48 L gid */ - /* 56 W file type enum */ - /* 60 L devmajor */ - /* 68 L devminor */ - /* 76 L unique ID (inode) */ - /* 84 L permissions */ - /* 92 L link count */ - LSET(data, 0, SMB_SIZE_NO_CHANGE); - LSET(data, 8, SMB_SIZE_NO_CHANGE); - LSET(data, 16, SMB_TIME_NO_CHANGE); - LSET(data, 24, SMB_TIME_NO_CHANGE); - LSET(data, 32, SMB_TIME_NO_CHANGE); - LSET(data, 40, SMB_UID_NO_CHANGE); - LSET(data, 48, SMB_GID_NO_CHANGE); - DSET(data, 56, smb_filetype_from_mode(attr->ia_mode)); - LSET(data, 60, major); - LSET(data, 68, minor); - LSET(data, 76, 0); - LSET(data, 84, SMB_MODE_NO_CHANGE); - LSET(data, 92, 0); - - if (attr->ia_valid & ATTR_SIZE) { - LSET(data, 0, attr->ia_size); - LSET(data, 8, 0); /* can't set anyway */ - } - - /* - * FIXME: check the conversion function it the correct one - * - * we can't set ctime but we might as well pass this to the server - * and let it ignore it. 
- */ - if (attr->ia_valid & ATTR_CTIME) { - nttime = smb_unixutc2ntutc(attr->ia_ctime); - LSET(data, 16, nttime); - } - if (attr->ia_valid & ATTR_ATIME) { - nttime = smb_unixutc2ntutc(attr->ia_atime); - LSET(data, 24, nttime); - } - if (attr->ia_valid & ATTR_MTIME) { - nttime = smb_unixutc2ntutc(attr->ia_mtime); - LSET(data, 32, nttime); - } - - if (attr->ia_valid & ATTR_UID) { - LSET(data, 40, attr->ia_uid); - } - if (attr->ia_valid & ATTR_GID) { - LSET(data, 48, attr->ia_gid); - } - - if (attr->ia_valid & ATTR_MODE) { - LSET(data, 84, attr->ia_mode); - } - - req->rq_trans2_command = TRANSACT2_SETPATHINFO; - req->rq_ldata = 100; - req->rq_data = data; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - -out_free: - smb_rput(req); -out: - return result; -} - - -/* - * Set the modify and access timestamps for a file. - * - * Incredibly enough, in all of SMB there is no message to allow - * setting both attributes and timestamps at once. - * - * Bugs Noted: - * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message - * with info level 1 (INFO_STANDARD). - * (2) Win 95 seems not to support setting directory timestamps. - * (3) Under the core protocol apparently the only way to set the - * timestamp is to open and close the file. - */ -int -smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr) -{ - struct smb_sb_info *server = server_from_dentry(dentry); - struct inode *inode = dentry->d_inode; - int result; - - VERBOSE("setting %s/%s, open=%d\n", - DENTRY_PATH(dentry), smb_is_open(inode)); - - /* setting the time on a Win95 server fails (tridge) */ - if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 && - !(server->mnt->flags & SMB_MOUNT_WIN95)) { - if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY) - result = smb_proc_setattr_ext(server, inode, fattr); - else - result = smb_proc_setattr_trans2(server, dentry, fattr); - } else { - /* - * Fail silently on directories ... 
timestamp can't be set? - */ - result = 0; - if (S_ISREG(inode->i_mode)) { - /* - * Set the mtime by opening and closing the file. - * Note that the file is opened read-only, but this - * still allows us to set the date (tridge) - */ - result = -EACCES; - if (!smb_is_open(inode)) - smb_proc_open(server, dentry, SMB_O_RDONLY); - if (smb_is_open(inode)) { - inode->i_mtime = fattr->f_mtime; - result = smb_proc_close_inode(server, inode); - } - } - } - - return result; -} - -int -smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr) -{ - struct smb_sb_info *server = SMB_SB(dentry->d_sb); - int result; - char *p; - long unit; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 0))) - goto out; - - smb_setup_header(req, SMBdskattr, 0, 0); - if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0) - goto out_free; - p = SMB_VWV(req->rq_header); - unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT; - attr->f_blocks = WVAL(p, 0) * unit; - attr->f_bsize = SMB_ST_BLKSIZE; - attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit; - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -int -smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, - char *buffer, int len) -{ - char *p, *param; - int result; - struct smb_request *req; - - DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d)); - - result = -ENOMEM; - if (! 
(req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - - WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK); - DSET(param, 2, 0); - result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL); - if (result < 0) - goto out_free; - p = param + 6 + result; - - req->rq_trans2_command = TRANSACT2_QPATHINFO; - req->rq_ldata = 0; - req->rq_data = NULL; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - DEBUG1("for %s: result=%d, rcls=%d, err=%d\n", - ¶m[6], result, req->rq_rcls, req->rq_err); - - /* copy data up to the \0 or buffer length */ - result = len; - if (req->rq_ldata < len) - result = req->rq_ldata; - strncpy(buffer, req->rq_data, result); - -out_free: - smb_rput(req); -out: - return result; -} - - -/* - * Create a symlink object called dentry which points to oldpath. - * Samba does not permit dangling links but returns a suitable error message. - */ -int -smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, - const char *oldpath) -{ - char *p, *param; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - - WSET(param, 0, SMB_SET_FILE_UNIX_LINK); - DSET(param, 2, 0); - result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL); - if (result < 0) - goto out_free; - p = param + 6 + result; - - req->rq_trans2_command = TRANSACT2_SETPATHINFO; - req->rq_ldata = strlen(oldpath) + 1; - req->rq_data = (char *) oldpath; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - - DEBUG1("for %s: result=%d, rcls=%d, err=%d\n", - ¶m[6], result, req->rq_rcls, req->rq_err); - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -/* - * Create a hard link object called new_dentry which points to dentry. 
- */ -int -smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, - struct dentry *new_dentry) -{ - char *p, *param; - int result; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, PAGE_SIZE))) - goto out; - param = req->rq_buffer; - - WSET(param, 0, SMB_SET_FILE_UNIX_HLINK); - DSET(param, 2, 0); - result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, - new_dentry, NULL); - if (result < 0) - goto out_free; - p = param + 6 + result; - - /* Grr, pointless separation of parameters and data ... */ - req->rq_data = p; - req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1, - dentry, NULL); - - req->rq_trans2_command = TRANSACT2_SETPATHINFO; - req->rq_lparm = p - param; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - - DEBUG1("for %s: result=%d, rcls=%d, err=%d\n", - ¶m[6], result, req->rq_rcls, req->rq_err); - result = 0; - -out_free: - smb_rput(req); -out: - return result; -} - -static int -smb_proc_query_cifsunix(struct smb_sb_info *server) -{ - int result; - int major, minor; - u64 caps; - char param[2]; - struct smb_request *req; - - result = -ENOMEM; - if (! (req = smb_alloc_request(server, 100))) - goto out; - - WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO); - - req->rq_trans2_command = TRANSACT2_QFSINFO; - req->rq_ldata = 0; - req->rq_data = NULL; - req->rq_lparm = 2; - req->rq_parm = param; - req->rq_flags = 0; - result = smb_add_request(req); - if (result < 0) - goto out_free; - - if (req->rq_ldata < 12) { - PARANOIA("Not enough data\n"); - goto out_free; - } - major = WVAL(req->rq_data, 0); - minor = WVAL(req->rq_data, 2); - - DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n", - major, minor); - /* FIXME: verify that we are ok with this major/minor? 
*/ - - caps = LVAL(req->rq_data, 4); - DEBUG1("Server capabilities 0x%016llx\n", caps); - -out_free: - smb_rput(req); -out: - return result; -} - - -static void -install_ops(struct smb_ops *dst, struct smb_ops *src) -{ - memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC); -} - -/* < LANMAN2 */ -static struct smb_ops smb_ops_core = -{ - .read = smb_proc_read, - .write = smb_proc_write, - .readdir = smb_proc_readdir_short, - .getattr = smb_proc_getattr_core, - .truncate = smb_proc_trunc32, -}; - -/* LANMAN2, OS/2, others? */ -static struct smb_ops smb_ops_os2 = -{ - .read = smb_proc_read, - .write = smb_proc_write, - .readdir = smb_proc_readdir_long, - .getattr = smb_proc_getattr_trans2_std, - .truncate = smb_proc_trunc32, -}; - -/* Win95, and possibly some NetApp versions too */ -static struct smb_ops smb_ops_win95 = -{ - .read = smb_proc_read, /* does not support 12word readX */ - .write = smb_proc_write, - .readdir = smb_proc_readdir_long, - .getattr = smb_proc_getattr_95, - .truncate = smb_proc_trunc95, -}; - -/* Samba, NT4 and NT5 */ -static struct smb_ops smb_ops_winNT = -{ - .read = smb_proc_readX, - .write = smb_proc_writeX, - .readdir = smb_proc_readdir_long, - .getattr = smb_proc_getattr_trans2_all, - .truncate = smb_proc_trunc64, -}; - -/* Samba w/ unix extensions. Others? */ -static struct smb_ops smb_ops_unix = -{ - .read = smb_proc_readX, - .write = smb_proc_writeX, - .readdir = smb_proc_readdir_long, - .getattr = smb_proc_getattr_unix, - /* FIXME: core/ext/time setattr needs to be cleaned up! 
*/ - /* .setattr = smb_proc_setattr_unix, */ - .truncate = smb_proc_trunc64, -}; - -/* Place holder until real ops are in place */ -static struct smb_ops smb_ops_null = -{ - .readdir = smb_proc_readdir_null, - .getattr = smb_proc_getattr_null, -}; - -void smb_install_null_ops(struct smb_ops *ops) -{ - install_ops(ops, &smb_ops_null); -} diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h deleted file mode 100644 index 05939a6f43e6..000000000000 --- a/fs/smbfs/proto.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Autogenerated with cproto on: Sat Sep 13 17:18:51 CEST 2003 - */ - -struct smb_request; -struct sock; -struct statfs; - -/* proc.c */ -extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp); -extern __u32 smb_len(__u8 *p); -extern int smb_get_rsize(struct smb_sb_info *server); -extern int smb_get_wsize(struct smb_sb_info *server); -extern int smb_errno(struct smb_request *req); -extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt); -extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc); -extern int smb_open(struct dentry *dentry, int wish); -extern int smb_close(struct inode *ino); -extern int smb_close_fileid(struct dentry *dentry, __u16 fileid); -extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid); -extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry); -extern int smb_proc_mkdir(struct dentry *dentry); -extern int smb_proc_rmdir(struct dentry *dentry); -extern int smb_proc_unlink(struct dentry *dentry); -extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid); -extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr, - struct super_block *sb); -extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr); -extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr); -extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int 
major, unsigned int minor); -extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr); -extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr); -extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len); -extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath); -extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry); -extern void smb_install_null_ops(struct smb_ops *ops); -/* dir.c */ -extern const struct file_operations smb_dir_operations; -extern const struct inode_operations smb_dir_inode_operations; -extern const struct inode_operations smb_dir_inode_operations_unix; -extern void smb_new_dentry(struct dentry *dentry); -extern void smb_renew_times(struct dentry *dentry); -/* cache.c */ -extern void smb_invalid_dir_cache(struct inode *dir); -extern void smb_invalidate_dircache_entries(struct dentry *parent); -extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos); -extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry); -/* sock.c */ -extern void smb_data_ready(struct sock *sk, int len); -extern int smb_valid_socket(struct inode *inode); -extern void smb_close_socket(struct smb_sb_info *server); -extern int smb_recv_available(struct smb_sb_info *server); -extern int smb_receive_header(struct smb_sb_info *server); -extern int smb_receive_drop(struct smb_sb_info *server); -extern int smb_receive(struct smb_sb_info *server, struct smb_request *req); -extern int smb_send_request(struct smb_request *req); -/* inode.c */ -extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr); -extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr); -extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr); -extern 
void smb_invalidate_inodes(struct smb_sb_info *server); -extern int smb_revalidate_inode(struct dentry *dentry); -extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -extern int smb_notify_change(struct dentry *dentry, struct iattr *attr); -/* file.c */ -extern const struct address_space_operations smb_file_aops; -extern const struct file_operations smb_file_operations; -extern const struct inode_operations smb_file_inode_operations; -/* ioctl.c */ -extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -/* smbiod.c */ -extern void smbiod_wake_up(void); -extern int smbiod_register_server(struct smb_sb_info *server); -extern void smbiod_unregister_server(struct smb_sb_info *server); -extern void smbiod_flush(struct smb_sb_info *server); -extern int smbiod_retry(struct smb_sb_info *server); -/* request.c */ -extern int smb_init_request_cache(void); -extern void smb_destroy_request_cache(void); -extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize); -extern void smb_rput(struct smb_request *req); -extern int smb_add_request(struct smb_request *req); -extern int smb_request_send_server(struct smb_sb_info *server); -extern int smb_request_recv(struct smb_sb_info *server); -/* symlink.c */ -extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname); -extern const struct inode_operations smb_link_inode_operations; diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c deleted file mode 100644 index 45f45933e862..000000000000 --- a/fs/smbfs/request.c +++ /dev/null @@ -1,818 +0,0 @@ -/* - * request.c - * - * Copyright (C) 2001 by Urban Widmark - * - * Please add a note about your changes to smbfs in the ChangeLog file. 
- */ - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/net.h> -#include <linux/sched.h> - -#include <linux/smb_fs.h> -#include <linux/smbno.h> -#include <linux/smb_mount.h> - -#include "smb_debug.h" -#include "request.h" -#include "proto.h" - -/* #define SMB_SLAB_DEBUG (SLAB_RED_ZONE | SLAB_POISON) */ -#define SMB_SLAB_DEBUG 0 - -/* cache for request structures */ -static struct kmem_cache *req_cachep; - -static int smb_request_send_req(struct smb_request *req); - -/* - /proc/slabinfo: - name, active, num, objsize, active_slabs, num_slaps, #pages -*/ - - -int smb_init_request_cache(void) -{ - req_cachep = kmem_cache_create("smb_request", - sizeof(struct smb_request), 0, - SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN, - NULL); - if (req_cachep == NULL) - return -ENOMEM; - - return 0; -} - -void smb_destroy_request_cache(void) -{ - kmem_cache_destroy(req_cachep); -} - -/* - * Allocate and initialise a request structure - */ -static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server, - int bufsize) -{ - struct smb_request *req; - unsigned char *buf = NULL; - - req = kmem_cache_zalloc(req_cachep, GFP_KERNEL); - VERBOSE("allocating request: %p\n", req); - if (!req) - goto out; - - if (bufsize > 0) { - buf = kmalloc(bufsize, GFP_NOFS); - if (!buf) { - kmem_cache_free(req_cachep, req); - return NULL; - } - } - - req->rq_buffer = buf; - req->rq_bufsize = bufsize; - req->rq_server = server; - init_waitqueue_head(&req->rq_wait); - INIT_LIST_HEAD(&req->rq_queue); - atomic_set(&req->rq_count, 1); - -out: - return req; -} - -struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize) -{ - struct smb_request *req = NULL; - - for (;;) { - atomic_inc(&server->nr_requests); - if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) { - req = smb_do_alloc_request(server, bufsize); - if (req != NULL) - break; - } - -#if 0 - /* - * Try to free up at least one request in order to stay - * 
below the hard limit - */ - if (nfs_try_to_free_pages(server)) - continue; - - if (fatal_signal_pending(current)) - return ERR_PTR(-ERESTARTSYS); - current->policy = SCHED_YIELD; - schedule(); -#else - /* FIXME: we want something like nfs does above, but that - requires changes to all callers and can wait. */ - break; -#endif - } - return req; -} - -static void smb_free_request(struct smb_request *req) -{ - atomic_dec(&req->rq_server->nr_requests); - if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC)) - kfree(req->rq_buffer); - kfree(req->rq_trans2buffer); - kmem_cache_free(req_cachep, req); -} - -/* - * What prevents a rget to race with a rput? The count must never drop to zero - * while it is in use. Only rput if it is ok that it is free'd. - */ -static void smb_rget(struct smb_request *req) -{ - atomic_inc(&req->rq_count); -} -void smb_rput(struct smb_request *req) -{ - if (atomic_dec_and_test(&req->rq_count)) { - list_del_init(&req->rq_queue); - smb_free_request(req); - } -} - -/* setup to receive the data part of the SMB */ -static int smb_setup_bcc(struct smb_request *req) -{ - int result = 0; - req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd; - - if (req->rq_rlen > req->rq_bufsize) { - PARANOIA("Packet too large %d > %d\n", - req->rq_rlen, req->rq_bufsize); - return -ENOBUFS; - } - - req->rq_iov[0].iov_base = req->rq_buffer; - req->rq_iov[0].iov_len = req->rq_rlen; - req->rq_iovlen = 1; - - return result; -} - -/* - * Prepare a "normal" request structure. 
- */ -static int smb_setup_request(struct smb_request *req) -{ - int len = smb_len(req->rq_header) + 4; - req->rq_slen = len; - - /* if we expect a data part in the reply we set the iov's to read it */ - if (req->rq_resp_bcc) - req->rq_setup_read = smb_setup_bcc; - - /* This tries to support re-using the same request */ - req->rq_bytes_sent = 0; - req->rq_rcls = 0; - req->rq_err = 0; - req->rq_errno = 0; - req->rq_fragment = 0; - kfree(req->rq_trans2buffer); - req->rq_trans2buffer = NULL; - - return 0; -} - -/* - * Prepare a transaction2 request structure - */ -static int smb_setup_trans2request(struct smb_request *req) -{ - struct smb_sb_info *server = req->rq_server; - int mparam, mdata; - static unsigned char padding[4]; - - /* I know the following is very ugly, but I want to build the - smb packet as efficiently as possible. */ - - const int smb_parameters = 15; - const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2; - const int oparam = ALIGN(header + 3, sizeof(u32)); - const int odata = ALIGN(oparam + req->rq_lparm, sizeof(u32)); - const int bcc = (req->rq_data ? odata + req->rq_ldata : - oparam + req->rq_lparm) - header; - - if ((bcc + oparam) > server->opt.max_xmit) - return -ENOMEM; - smb_setup_header(req, SMBtrans2, smb_parameters, bcc); - - /* - * max parameters + max data + max setup == bufsize to make NT4 happy - * and not abort the transfer or split into multiple responses. It also - * makes smbfs happy as handling packets larger than the buffer size - * is extra work. - * - * OS/2 is probably going to hate me for this ... - */ - mparam = SMB_TRANS2_MAX_PARAM; - mdata = req->rq_bufsize - mparam; - - mdata = server->opt.max_xmit - mparam - 100; - if (mdata < 1024) { - mdata = 1024; - mparam = 20; - } - -#if 0 - /* NT/win2k has ~4k max_xmit, so with this we request more than it wants - to return as one SMB. Useful for testing the fragmented trans2 - handling. 
*/ - mdata = 8192; -#endif - - WSET(req->rq_header, smb_tpscnt, req->rq_lparm); - WSET(req->rq_header, smb_tdscnt, req->rq_ldata); - WSET(req->rq_header, smb_mprcnt, mparam); - WSET(req->rq_header, smb_mdrcnt, mdata); - WSET(req->rq_header, smb_msrcnt, 0); /* max setup always 0 ? */ - WSET(req->rq_header, smb_flags, 0); - DSET(req->rq_header, smb_timeout, 0); - WSET(req->rq_header, smb_pscnt, req->rq_lparm); - WSET(req->rq_header, smb_psoff, oparam - 4); - WSET(req->rq_header, smb_dscnt, req->rq_ldata); - WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0); - *(req->rq_header + smb_suwcnt) = 0x01; /* setup count */ - *(req->rq_header + smb_suwcnt + 1) = 0x00; /* reserved */ - WSET(req->rq_header, smb_setup0, req->rq_trans2_command); - - req->rq_iovlen = 2; - req->rq_iov[0].iov_base = (void *) req->rq_header; - req->rq_iov[0].iov_len = oparam; - req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? padding : req->rq_parm; - req->rq_iov[1].iov_len = req->rq_lparm; - req->rq_slen = oparam + req->rq_lparm; - - if (req->rq_data) { - req->rq_iovlen += 2; - req->rq_iov[2].iov_base = padding; - req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm; - req->rq_iov[3].iov_base = req->rq_data; - req->rq_iov[3].iov_len = req->rq_ldata; - req->rq_slen = odata + req->rq_ldata; - } - - /* always a data part for trans2 replies */ - req->rq_setup_read = smb_setup_bcc; - - return 0; -} - -/* - * Add a request and tell smbiod to process it - */ -int smb_add_request(struct smb_request *req) -{ - long timeleft; - struct smb_sb_info *server = req->rq_server; - int result = 0; - - smb_setup_request(req); - if (req->rq_trans2_command) { - if (req->rq_buffer == NULL) { - PARANOIA("trans2 attempted without response buffer!\n"); - return -EIO; - } - result = smb_setup_trans2request(req); - } - if (result < 0) - return result; - -#ifdef SMB_DEBUG_PACKET_SIZE - add_xmit_stats(req); -#endif - - /* add 'req' to the queue of requests */ - if (smb_lock_server_interruptible(server)) - return 
-EINTR; - - /* - * Try to send the request as the process. If that fails we queue the - * request and let smbiod send it later. - */ - - /* FIXME: each server has a number on the maximum number of parallel - requests. 10, 50 or so. We should not allow more requests to be - active. */ - if (server->mid > 0xf000) - server->mid = 0; - req->rq_mid = server->mid++; - WSET(req->rq_header, smb_mid, req->rq_mid); - - result = 0; - if (server->state == CONN_VALID) { - if (list_empty(&server->xmitq)) - result = smb_request_send_req(req); - if (result < 0) { - /* Connection lost? */ - server->conn_error = result; - server->state = CONN_INVALID; - } - } - if (result != 1) - list_add_tail(&req->rq_queue, &server->xmitq); - smb_rget(req); - - if (server->state != CONN_VALID) - smbiod_retry(server); - - smb_unlock_server(server); - - smbiod_wake_up(); - - timeleft = wait_event_interruptible_timeout(req->rq_wait, - req->rq_flags & SMB_REQ_RECEIVED, 30*HZ); - if (!timeleft || signal_pending(current)) { - /* - * On timeout or on interrupt we want to try and remove the - * request from the recvq/xmitq. - * First check if the request is still part of a queue. 
(May - * have been removed by some error condition) - */ - smb_lock_server(server); - if (!list_empty(&req->rq_queue)) { - list_del_init(&req->rq_queue); - smb_rput(req); - } - smb_unlock_server(server); - } - - if (!timeleft) { - PARANOIA("request [%p, mid=%d] timed out!\n", - req, req->rq_mid); - VERBOSE("smb_com: %02x\n", *(req->rq_header + smb_com)); - VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls)); - VERBOSE("smb_flg: %02x\n", *(req->rq_header + smb_flg)); - VERBOSE("smb_tid: %04x\n", WVAL(req->rq_header, smb_tid)); - VERBOSE("smb_pid: %04x\n", WVAL(req->rq_header, smb_pid)); - VERBOSE("smb_uid: %04x\n", WVAL(req->rq_header, smb_uid)); - VERBOSE("smb_mid: %04x\n", WVAL(req->rq_header, smb_mid)); - VERBOSE("smb_wct: %02x\n", *(req->rq_header + smb_wct)); - - req->rq_rcls = ERRSRV; - req->rq_err = ERRtimeout; - - /* Just in case it was "stuck" */ - smbiod_wake_up(); - } - VERBOSE("woke up, rcls=%d\n", req->rq_rcls); - - if (req->rq_rcls != 0) - req->rq_errno = smb_errno(req); - if (signal_pending(current)) - req->rq_errno = -ERESTARTSYS; - return req->rq_errno; -} - -/* - * Send a request and place it on the recvq if successfully sent. - * Must be called with the server lock held. - */ -static int smb_request_send_req(struct smb_request *req) -{ - struct smb_sb_info *server = req->rq_server; - int result; - - if (req->rq_bytes_sent == 0) { - WSET(req->rq_header, smb_tid, server->opt.tid); - WSET(req->rq_header, smb_pid, 1); - WSET(req->rq_header, smb_uid, server->opt.server_uid); - } - - result = smb_send_request(req); - if (result < 0 && result != -EAGAIN) - goto out; - - result = 0; - if (!(req->rq_flags & SMB_REQ_TRANSMITTED)) - goto out; - - list_move_tail(&req->rq_queue, &server->recvq); - result = 1; -out: - return result; -} - -/* - * Sends one request for this server. (smbiod) - * Must be called with the server lock held. 
- * Returns: <0 on error - * 0 if no request could be completely sent - * 1 if all data for one request was sent - */ -int smb_request_send_server(struct smb_sb_info *server) -{ - struct list_head *head; - struct smb_request *req; - int result; - - if (server->state != CONN_VALID) - return 0; - - /* dequeue first request, if any */ - req = NULL; - head = server->xmitq.next; - if (head != &server->xmitq) { - req = list_entry(head, struct smb_request, rq_queue); - } - if (!req) - return 0; - - result = smb_request_send_req(req); - if (result < 0) { - server->conn_error = result; - list_move(&req->rq_queue, &server->xmitq); - result = -EIO; - goto out; - } - -out: - return result; -} - -/* - * Try to find a request matching this "mid". Typically the first entry will - * be the matching one. - */ -static struct smb_request *find_request(struct smb_sb_info *server, int mid) -{ - struct list_head *tmp; - struct smb_request *req = NULL; - - list_for_each(tmp, &server->recvq) { - req = list_entry(tmp, struct smb_request, rq_queue); - if (req->rq_mid == mid) { - break; - } - req = NULL; - } - - if (!req) { - VERBOSE("received reply with mid %d but no request!\n", - WVAL(server->header, smb_mid)); - server->rstate = SMB_RECV_DROP; - } - - return req; -} - -/* - * Called when we have read the smb header and believe this is a response. 
- */ -static int smb_init_request(struct smb_sb_info *server, struct smb_request *req) -{ - int hdrlen, wct; - - memcpy(req->rq_header, server->header, SMB_HEADER_LEN); - - wct = *(req->rq_header + smb_wct); - if (wct > 20) { - PARANOIA("wct too large, %d > 20\n", wct); - server->rstate = SMB_RECV_DROP; - return 0; - } - - req->rq_resp_wct = wct; - hdrlen = SMB_HEADER_LEN + wct*2 + 2; - VERBOSE("header length: %d smb_wct: %2d\n", hdrlen, wct); - - req->rq_bytes_recvd = SMB_HEADER_LEN; - req->rq_rlen = hdrlen; - req->rq_iov[0].iov_base = req->rq_header; - req->rq_iov[0].iov_len = hdrlen; - req->rq_iovlen = 1; - server->rstate = SMB_RECV_PARAM; - -#ifdef SMB_DEBUG_PACKET_SIZE - add_recv_stats(smb_len(server->header)); -#endif - return 0; -} - -/* - * Reads the SMB parameters - */ -static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req) -{ - int result; - - result = smb_receive(server, req); - if (result < 0) - return result; - if (req->rq_bytes_recvd < req->rq_rlen) - return 0; - - VERBOSE("result: %d smb_bcc: %04x\n", result, - WVAL(req->rq_header, SMB_HEADER_LEN + - (*(req->rq_header + smb_wct) * 2))); - - result = 0; - req->rq_iov[0].iov_base = NULL; - req->rq_rlen = 0; - if (req->rq_callback) - req->rq_callback(req); - else if (req->rq_setup_read) - result = req->rq_setup_read(req); - if (result < 0) { - server->rstate = SMB_RECV_DROP; - return result; - } - - server->rstate = req->rq_rlen > 0 ? 
SMB_RECV_DATA : SMB_RECV_END; - - req->rq_bytes_recvd = 0; // recvd out of the iov - - VERBOSE("rlen: %d\n", req->rq_rlen); - if (req->rq_rlen < 0) { - PARANOIA("Parameters read beyond end of packet!\n"); - server->rstate = SMB_RECV_END; - return -EIO; - } - return 0; -} - -/* - * Reads the SMB data - */ -static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req) -{ - int result; - - result = smb_receive(server, req); - if (result < 0) - goto out; - if (req->rq_bytes_recvd < req->rq_rlen) - goto out; - server->rstate = SMB_RECV_END; -out: - VERBOSE("result: %d\n", result); - return result; -} - -/* - * Receive a transaction2 response - * Return: 0 if the response has been fully read - * 1 if there are further "fragments" to read - * <0 if there is an error - */ -static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req) -{ - unsigned char *inbuf; - unsigned int parm_disp, parm_offset, parm_count, parm_tot; - unsigned int data_disp, data_offset, data_count, data_tot; - int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2; - - VERBOSE("handling trans2\n"); - - inbuf = req->rq_header; - data_tot = WVAL(inbuf, smb_tdrcnt); - parm_tot = WVAL(inbuf, smb_tprcnt); - parm_disp = WVAL(inbuf, smb_prdisp); - parm_offset = WVAL(inbuf, smb_proff); - parm_count = WVAL(inbuf, smb_prcnt); - data_disp = WVAL(inbuf, smb_drdisp); - data_offset = WVAL(inbuf, smb_droff); - data_count = WVAL(inbuf, smb_drcnt); - - /* Modify offset for the split header/buffer we use */ - if (data_count || data_offset) { - if (unlikely(data_offset < hdrlen)) - goto out_bad_data; - else - data_offset -= hdrlen; - } - if (parm_count || parm_offset) { - if (unlikely(parm_offset < hdrlen)) - goto out_bad_parm; - else - parm_offset -= hdrlen; - } - - if (parm_count == parm_tot && data_count == data_tot) { - /* - * This packet has all the trans2 data. - * - * We setup the request so that this will be the common - * case. 
It may be a server error to not return a - * response that fits. - */ - VERBOSE("single trans2 response " - "dcnt=%u, pcnt=%u, doff=%u, poff=%u\n", - data_count, parm_count, - data_offset, parm_offset); - req->rq_ldata = data_count; - req->rq_lparm = parm_count; - req->rq_data = req->rq_buffer + data_offset; - req->rq_parm = req->rq_buffer + parm_offset; - if (unlikely(parm_offset + parm_count > req->rq_rlen)) - goto out_bad_parm; - if (unlikely(data_offset + data_count > req->rq_rlen)) - goto out_bad_data; - return 0; - } - - VERBOSE("multi trans2 response " - "frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n", - req->rq_fragment, - data_count, parm_count, - data_offset, parm_offset); - - if (!req->rq_fragment) { - int buf_len; - - /* We got the first trans2 fragment */ - req->rq_fragment = 1; - req->rq_total_data = data_tot; - req->rq_total_parm = parm_tot; - req->rq_ldata = 0; - req->rq_lparm = 0; - - buf_len = data_tot + parm_tot; - if (buf_len > SMB_MAX_PACKET_SIZE) - goto out_too_long; - - req->rq_trans2bufsize = buf_len; - req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS); - if (!req->rq_trans2buffer) - goto out_no_mem; - - req->rq_parm = req->rq_trans2buffer; - req->rq_data = req->rq_trans2buffer + parm_tot; - } else if (unlikely(req->rq_total_data < data_tot || - req->rq_total_parm < parm_tot)) - goto out_data_grew; - - if (unlikely(parm_disp + parm_count > req->rq_total_parm || - parm_offset + parm_count > req->rq_rlen)) - goto out_bad_parm; - if (unlikely(data_disp + data_count > req->rq_total_data || - data_offset + data_count > req->rq_rlen)) - goto out_bad_data; - - inbuf = req->rq_buffer; - memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count); - memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count); - - req->rq_ldata += data_count; - req->rq_lparm += parm_count; - - /* - * Check whether we've received all of the data. Note that - * we use the packet totals -- total lengths might shrink! 
- */ - if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) { - req->rq_ldata = data_tot; - req->rq_lparm = parm_tot; - return 0; - } - return 1; - -out_too_long: - printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n", - data_tot, parm_tot); - goto out_EIO; -out_no_mem: - printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n", - req->rq_trans2bufsize); - req->rq_errno = -ENOMEM; - goto out; -out_data_grew: - printk(KERN_ERR "smb_trans2: data/params grew!\n"); - goto out_EIO; -out_bad_parm: - printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n", - parm_disp, parm_count, parm_tot, parm_offset); - goto out_EIO; -out_bad_data: - printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n", - data_disp, data_count, data_tot, data_offset); -out_EIO: - req->rq_errno = -EIO; -out: - return req->rq_errno; -} - -/* - * State machine for receiving responses. We handle the fact that we can't - * read the full response in one try by having states telling us how much we - * have read. - * - * Must be called with the server lock held (only called from smbiod). - * - * Return: <0 on error - */ -int smb_request_recv(struct smb_sb_info *server) -{ - struct smb_request *req = NULL; - int result = 0; - - if (smb_recv_available(server) <= 0) - return 0; - - VERBOSE("state: %d\n", server->rstate); - switch (server->rstate) { - case SMB_RECV_DROP: - result = smb_receive_drop(server); - if (result < 0) - break; - if (server->rstate == SMB_RECV_DROP) - break; - server->rstate = SMB_RECV_START; - /* fallthrough */ - case SMB_RECV_START: - server->smb_read = 0; - server->rstate = SMB_RECV_HEADER; - /* fallthrough */ - case SMB_RECV_HEADER: - result = smb_receive_header(server); - if (result < 0) - break; - if (server->rstate == SMB_RECV_HEADER) - break; - if (! 
(*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) { - server->rstate = SMB_RECV_REQUEST; - break; - } - if (server->rstate != SMB_RECV_HCOMPLETE) - break; - /* fallthrough */ - case SMB_RECV_HCOMPLETE: - req = find_request(server, WVAL(server->header, smb_mid)); - if (!req) - break; - smb_init_request(server, req); - req->rq_rcls = *(req->rq_header + smb_rcls); - req->rq_err = WVAL(req->rq_header, smb_err); - if (server->rstate != SMB_RECV_PARAM) - break; - /* fallthrough */ - case SMB_RECV_PARAM: - if (!req) - req = find_request(server,WVAL(server->header,smb_mid)); - if (!req) - break; - result = smb_recv_param(server, req); - if (result < 0) - break; - if (server->rstate != SMB_RECV_DATA) - break; - /* fallthrough */ - case SMB_RECV_DATA: - if (!req) - req = find_request(server,WVAL(server->header,smb_mid)); - if (!req) - break; - result = smb_recv_data(server, req); - if (result < 0) - break; - break; - - /* We should never be called with any of these states */ - case SMB_RECV_END: - case SMB_RECV_REQUEST: - BUG(); - } - - if (result < 0) { - /* We saw an error */ - return result; - } - - if (server->rstate != SMB_RECV_END) - return 0; - - result = 0; - if (req->rq_trans2_command && req->rq_rcls == SUCCESS) - result = smb_recv_trans2(server, req); - - /* - * Response completely read. Drop any extra bytes sent by the server. 
- * (Yes, servers sometimes add extra bytes to responses) - */ - VERBOSE("smb_len: %d smb_read: %d\n", - server->smb_len, server->smb_read); - if (server->smb_read < server->smb_len) - smb_receive_drop(server); - - server->rstate = SMB_RECV_START; - - if (!result) { - list_del_init(&req->rq_queue); - req->rq_flags |= SMB_REQ_RECEIVED; - smb_rput(req); - wake_up_interruptible(&req->rq_wait); - } - return 0; -} diff --git a/fs/smbfs/request.h b/fs/smbfs/request.h deleted file mode 100644 index efb21451e7c9..000000000000 --- a/fs/smbfs/request.h +++ /dev/null @@ -1,70 +0,0 @@ -#include <linux/list.h> -#include <linux/types.h> -#include <linux/uio.h> -#include <linux/wait.h> - -struct smb_request { - struct list_head rq_queue; /* recvq or xmitq for the server */ - - atomic_t rq_count; - - wait_queue_head_t rq_wait; - int rq_flags; - int rq_mid; /* multiplex ID, set by request.c */ - - struct smb_sb_info *rq_server; - - /* header + word count + parameter words + byte count */ - unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2]; - - int rq_bufsize; - unsigned char *rq_buffer; - - /* FIXME: this is not good enough for merging IO requests. 
*/ - unsigned char *rq_page; - int rq_rsize; - - int rq_resp_wct; - int rq_resp_bcc; - - int rq_rlen; - int rq_bytes_recvd; - - int rq_slen; - int rq_bytes_sent; - - int rq_iovlen; - struct kvec rq_iov[4]; - - int (*rq_setup_read) (struct smb_request *); - void (*rq_callback) (struct smb_request *); - - /* ------ trans2 stuff ------ */ - - u16 rq_trans2_command; /* 0 if not a trans2 request */ - unsigned int rq_ldata; - unsigned char *rq_data; - unsigned int rq_lparm; - unsigned char *rq_parm; - - int rq_fragment; - u32 rq_total_data; - u32 rq_total_parm; - int rq_trans2bufsize; - unsigned char *rq_trans2buffer; - - /* ------ response ------ */ - - unsigned short rq_rcls; - unsigned short rq_err; - int rq_errno; -}; - -#define SMB_REQ_STATIC 0x0001 /* rq_buffer is static */ -#define SMB_REQ_NORETRY 0x0002 /* request is invalid after retry */ - -#define SMB_REQ_TRANSMITTED 0x4000 /* all data has been sent */ -#define SMB_REQ_RECEIVED 0x8000 /* reply received, smbiod is done */ - -#define xSMB_REQ_NOREPLY 0x0004 /* we don't want the reply (if any) */ -#define xSMB_REQ_NORECEIVER 0x0008 /* caller doesn't wait for response */ diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h deleted file mode 100644 index fc4b1a5dd755..000000000000 --- a/fs/smbfs/smb_debug.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Defines some debug macros for smbfs. - */ - -/* This makes a dentry parent/child name pair. Useful for debugging printk's */ -#define DENTRY_PATH(dentry) \ - (dentry)->d_parent->d_name.name,(dentry)->d_name.name - -/* - * safety checks that should never happen ??? - * these are normally enabled. - */ -#ifdef SMBFS_PARANOIA -# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a) -#else -# define PARANOIA(f, a...) do { ; } while(0) -#endif - -/* lots of debug messages */ -#ifdef SMBFS_DEBUG_VERBOSE -# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a) -#else -# define VERBOSE(f, a...) 
do { ; } while(0) -#endif - -/* - * "normal" debug messages, but not with a normal DEBUG define ... way - * too common name. - */ -#ifdef SMBFS_DEBUG -#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a) -#else -#define DEBUG1(f, a...) do { ; } while(0) -#endif diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c deleted file mode 100644 index 0e39a924f10a..000000000000 --- a/fs/smbfs/smbiod.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - * smbiod.c - * - * Copyright (C) 2000, Charles Loep / Corel Corp. - * Copyright (C) 2001, Urban Widmark - */ - - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/file.h> -#include <linux/dcache.h> -#include <linux/module.h> -#include <linux/net.h> -#include <linux/kthread.h> -#include <net/ip.h> - -#include <linux/smb_fs.h> -#include <linux/smbno.h> -#include <linux/smb_mount.h> - -#include <asm/system.h> -#include <asm/uaccess.h> - -#include "smb_debug.h" -#include "request.h" -#include "proto.h" - -enum smbiod_state { - SMBIOD_DEAD, - SMBIOD_STARTING, - SMBIOD_RUNNING, -}; - -static enum smbiod_state smbiod_state = SMBIOD_DEAD; -static struct task_struct *smbiod_thread; -static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait); -static LIST_HEAD(smb_servers); -static DEFINE_SPINLOCK(servers_lock); - -#define SMBIOD_DATA_READY (1<<0) -static unsigned long smbiod_flags; - -static int smbiod(void *); -static int smbiod_start(void); - -/* - * called when there's work for us to do - */ -void smbiod_wake_up(void) -{ - if (smbiod_state == SMBIOD_DEAD) - return; - set_bit(SMBIOD_DATA_READY, &smbiod_flags); - wake_up_interruptible(&smbiod_wait); -} - -/* - * start smbiod if none is running - */ -static int smbiod_start(void) -{ - struct task_struct *tsk; - int err = 0; - - if (smbiod_state != SMBIOD_DEAD) - return 0; - smbiod_state = SMBIOD_STARTING; - __module_get(THIS_MODULE); - 
spin_unlock(&servers_lock); - tsk = kthread_run(smbiod, NULL, "smbiod"); - if (IS_ERR(tsk)) { - err = PTR_ERR(tsk); - module_put(THIS_MODULE); - } - - spin_lock(&servers_lock); - if (err < 0) { - smbiod_state = SMBIOD_DEAD; - smbiod_thread = NULL; - } else { - smbiod_state = SMBIOD_RUNNING; - smbiod_thread = tsk; - } - return err; -} - -/* - * register a server & start smbiod if necessary - */ -int smbiod_register_server(struct smb_sb_info *server) -{ - int ret; - spin_lock(&servers_lock); - list_add(&server->entry, &smb_servers); - VERBOSE("%p\n", server); - ret = smbiod_start(); - spin_unlock(&servers_lock); - return ret; -} - -/* - * Unregister a server - * Must be called with the server lock held. - */ -void smbiod_unregister_server(struct smb_sb_info *server) -{ - spin_lock(&servers_lock); - list_del_init(&server->entry); - VERBOSE("%p\n", server); - spin_unlock(&servers_lock); - - smbiod_wake_up(); - smbiod_flush(server); -} - -void smbiod_flush(struct smb_sb_info *server) -{ - struct list_head *tmp, *n; - struct smb_request *req; - - list_for_each_safe(tmp, n, &server->xmitq) { - req = list_entry(tmp, struct smb_request, rq_queue); - req->rq_errno = -EIO; - list_del_init(&req->rq_queue); - smb_rput(req); - wake_up_interruptible(&req->rq_wait); - } - list_for_each_safe(tmp, n, &server->recvq) { - req = list_entry(tmp, struct smb_request, rq_queue); - req->rq_errno = -EIO; - list_del_init(&req->rq_queue); - smb_rput(req); - wake_up_interruptible(&req->rq_wait); - } -} - -/* - * Wake up smbmount and make it reconnect to the server. - * This must be called with the server locked. 
- * - * FIXME: add smbconnect version to this - */ -int smbiod_retry(struct smb_sb_info *server) -{ - struct list_head *head; - struct smb_request *req; - struct pid *pid = get_pid(server->conn_pid); - int result = 0; - - VERBOSE("state: %d\n", server->state); - if (server->state == CONN_VALID || server->state == CONN_RETRYING) - goto out; - - smb_invalidate_inodes(server); - - /* - * Some requests are meaningless after a retry, so we abort them. - * One example are all requests using 'fileid' since the files are - * closed on retry. - */ - head = server->xmitq.next; - while (head != &server->xmitq) { - req = list_entry(head, struct smb_request, rq_queue); - head = head->next; - - req->rq_bytes_sent = 0; - if (req->rq_flags & SMB_REQ_NORETRY) { - VERBOSE("aborting request %p on xmitq\n", req); - req->rq_errno = -EIO; - list_del_init(&req->rq_queue); - smb_rput(req); - wake_up_interruptible(&req->rq_wait); - } - } - - /* - * FIXME: test the code for retrying request we already sent - */ - head = server->recvq.next; - while (head != &server->recvq) { - req = list_entry(head, struct smb_request, rq_queue); - head = head->next; -#if 0 - if (req->rq_flags & SMB_REQ_RETRY) { - /* must move the request to the xmitq */ - VERBOSE("retrying request %p on recvq\n", req); - list_move(&req->rq_queue, &server->xmitq); - continue; - } -#endif - - VERBOSE("aborting request %p on recvq\n", req); - /* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */ - req->rq_errno = -EIO; - list_del_init(&req->rq_queue); - smb_rput(req); - wake_up_interruptible(&req->rq_wait); - } - - smb_close_socket(server); - - if (!pid) { - /* FIXME: this is fatal, umount? */ - printk(KERN_ERR "smb_retry: no connection process\n"); - server->state = CONN_RETRIED; - goto out; - } - - /* - * Change state so that only one retry per server will be started. - */ - server->state = CONN_RETRYING; - - /* - * Note: use the "priv" flag, as a user process may need to reconnect. 
- */ - result = kill_pid(pid, SIGUSR1, 1); - if (result) { - /* FIXME: this is most likely fatal, umount? */ - printk(KERN_ERR "smb_retry: signal failed [%d]\n", result); - goto out; - } - VERBOSE("signalled pid %d\n", pid_nr(pid)); - - /* FIXME: The retried requests should perhaps get a "time boost". */ - -out: - put_pid(pid); - return result; -} - -/* - * Currently handles lockingX packets. - */ -static void smbiod_handle_request(struct smb_sb_info *server) -{ - PARANOIA("smbiod got a request ... and we don't implement oplocks!\n"); - server->rstate = SMB_RECV_DROP; -} - -/* - * Do some IO for one server. - */ -static void smbiod_doio(struct smb_sb_info *server) -{ - int result; - int maxwork = 7; - - if (server->state != CONN_VALID) - goto out; - - do { - result = smb_request_recv(server); - if (result < 0) { - server->state = CONN_INVALID; - smbiod_retry(server); - goto out; /* reconnecting is slow */ - } else if (server->rstate == SMB_RECV_REQUEST) - smbiod_handle_request(server); - } while (result > 0 && maxwork-- > 0); - - /* - * If there is more to read then we want to be sure to wake up again. - */ - if (server->state != CONN_VALID) - goto out; - if (smb_recv_available(server) > 0) - set_bit(SMBIOD_DATA_READY, &smbiod_flags); - - do { - result = smb_request_send_server(server); - if (result < 0) { - server->state = CONN_INVALID; - smbiod_retry(server); - goto out; /* reconnecting is slow */ - } - } while (result > 0); - - /* - * If the last request was not sent out we want to wake up again. - */ - if (!list_empty(&server->xmitq)) - set_bit(SMBIOD_DATA_READY, &smbiod_flags); - -out: - return; -} - -/* - * smbiod kernel thread - */ -static int smbiod(void *unused) -{ - VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid); - - for (;;) { - struct smb_sb_info *server; - struct list_head *pos, *n; - - /* FIXME: Use poll? 
*/ - wait_event_interruptible(smbiod_wait, - test_bit(SMBIOD_DATA_READY, &smbiod_flags)); - if (signal_pending(current)) { - spin_lock(&servers_lock); - smbiod_state = SMBIOD_DEAD; - spin_unlock(&servers_lock); - break; - } - - clear_bit(SMBIOD_DATA_READY, &smbiod_flags); - - spin_lock(&servers_lock); - if (list_empty(&smb_servers)) { - smbiod_state = SMBIOD_DEAD; - spin_unlock(&servers_lock); - break; - } - - list_for_each_safe(pos, n, &smb_servers) { - server = list_entry(pos, struct smb_sb_info, entry); - VERBOSE("checking server %p\n", server); - - if (server->state == CONN_VALID) { - spin_unlock(&servers_lock); - - smb_lock_server(server); - smbiod_doio(server); - smb_unlock_server(server); - - spin_lock(&servers_lock); - } - } - spin_unlock(&servers_lock); - } - - VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid); - module_put_and_exit(0); -} diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c deleted file mode 100644 index e37fe4deebd0..000000000000 --- a/fs/smbfs/sock.c +++ /dev/null @@ -1,386 +0,0 @@ -/* - * sock.c - * - * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke - * Copyright (C) 1997 by Volker Lendecke - * - * Please add a note about your changes to smbfs in the ChangeLog file. 
- */ - -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/socket.h> -#include <linux/fcntl.h> -#include <linux/file.h> -#include <linux/in.h> -#include <linux/net.h> -#include <linux/mm.h> -#include <linux/netdevice.h> -#include <linux/workqueue.h> -#include <net/scm.h> -#include <net/tcp_states.h> -#include <net/ip.h> - -#include <linux/smb_fs.h> -#include <linux/smb.h> -#include <linux/smbno.h> - -#include <asm/uaccess.h> -#include <asm/ioctls.h> - -#include "smb_debug.h" -#include "proto.h" -#include "request.h" - - -static int -_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags) -{ - struct kvec iov = {ubuf, size}; - struct msghdr msg = {.msg_flags = flags}; - msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL; - return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags); -} - -/* - * Return the server this socket belongs to - */ -static struct smb_sb_info * -server_from_socket(struct socket *socket) -{ - return socket->sk->sk_user_data; -} - -/* - * Called when there is data on the socket. 
- */ -void -smb_data_ready(struct sock *sk, int len) -{ - struct smb_sb_info *server = server_from_socket(sk->sk_socket); - void (*data_ready)(struct sock *, int) = server->data_ready; - - data_ready(sk, len); - VERBOSE("(%p, %d)\n", sk, len); - smbiod_wake_up(); -} - -int -smb_valid_socket(struct inode * inode) -{ - return (inode && S_ISSOCK(inode->i_mode) && - SOCKET_I(inode)->type == SOCK_STREAM); -} - -static struct socket * -server_sock(struct smb_sb_info *server) -{ - struct file *file; - - if (server && (file = server->sock_file)) - { -#ifdef SMBFS_PARANOIA - if (!smb_valid_socket(file->f_path.dentry->d_inode)) - PARANOIA("bad socket!\n"); -#endif - return SOCKET_I(file->f_path.dentry->d_inode); - } - return NULL; -} - -void -smb_close_socket(struct smb_sb_info *server) -{ - struct file * file = server->sock_file; - - if (file) { - struct socket *sock = server_sock(server); - - VERBOSE("closing socket %p\n", sock); - sock->sk->sk_data_ready = server->data_ready; - server->sock_file = NULL; - fput(file); - } -} - -static int -smb_get_length(struct socket *socket, unsigned char *header) -{ - int result; - - result = _recvfrom(socket, header, 4, MSG_PEEK); - if (result == -EAGAIN) - return -ENODATA; - if (result < 0) { - PARANOIA("recv error = %d\n", -result); - return result; - } - if (result < 4) - return -ENODATA; - - switch (header[0]) { - case 0x00: - case 0x82: - break; - - case 0x85: - DEBUG1("Got SESSION KEEP ALIVE\n"); - _recvfrom(socket, header, 4, 0); /* read away */ - return -ENODATA; - - default: - PARANOIA("Invalid NBT packet, code=%x\n", header[0]); - return -EIO; - } - - /* The length in the RFC NB header is the raw data length */ - return smb_len(header); -} - -int -smb_recv_available(struct smb_sb_info *server) -{ - mm_segment_t oldfs; - int avail, err; - struct socket *sock = server_sock(server); - - oldfs = get_fs(); - set_fs(get_ds()); - err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail); - set_fs(oldfs); - return (err >= 0) ? 
avail : err; -} - -/* - * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc) - */ -static int -smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount) -{ - struct kvec *iv = *data; - int i; - int len; - - /* - * Eat any sent kvecs - */ - while (iv->iov_len <= amount) { - amount -= iv->iov_len; - iv++; - (*num)--; - } - - /* - * And chew down the partial one - */ - vec[0].iov_len = iv->iov_len-amount; - vec[0].iov_base =((unsigned char *)iv->iov_base)+amount; - iv++; - - len = vec[0].iov_len; - - /* - * And copy any others - */ - for (i = 1; i < *num; i++) { - vec[i] = *iv++; - len += vec[i].iov_len; - } - - *data = vec; - return len; -} - -/* - * smb_receive_header - * Only called by the smbiod thread. - */ -int -smb_receive_header(struct smb_sb_info *server) -{ - struct socket *sock; - int result = 0; - unsigned char peek_buf[4]; - - result = -EIO; - sock = server_sock(server); - if (!sock) - goto out; - if (sock->sk->sk_state != TCP_ESTABLISHED) - goto out; - - if (!server->smb_read) { - result = smb_get_length(sock, peek_buf); - if (result < 0) { - if (result == -ENODATA) - result = 0; - goto out; - } - server->smb_len = result + 4; - - if (server->smb_len < SMB_HEADER_LEN) { - PARANOIA("short packet: %d\n", result); - server->rstate = SMB_RECV_DROP; - result = -EIO; - goto out; - } - if (server->smb_len > SMB_MAX_PACKET_SIZE) { - PARANOIA("long packet: %d\n", result); - server->rstate = SMB_RECV_DROP; - result = -EIO; - goto out; - } - } - - result = _recvfrom(sock, server->header + server->smb_read, - SMB_HEADER_LEN - server->smb_read, 0); - VERBOSE("_recvfrom: %d\n", result); - if (result < 0) { - VERBOSE("receive error: %d\n", result); - goto out; - } - server->smb_read += result; - - if (server->smb_read == SMB_HEADER_LEN) - server->rstate = SMB_RECV_HCOMPLETE; -out: - return result; -} - -static char drop_buffer[PAGE_SIZE]; - -/* - * smb_receive_drop - read and throw away the data - * Only called by the smbiod thread. 
- * - * FIXME: we are in the kernel, could we just tell the socket that we want - * to drop stuff from the buffer? - */ -int -smb_receive_drop(struct smb_sb_info *server) -{ - struct socket *sock; - unsigned int flags; - struct kvec iov; - struct msghdr msg; - int rlen = smb_len(server->header) - server->smb_read + 4; - int result = -EIO; - - if (rlen > PAGE_SIZE) - rlen = PAGE_SIZE; - - sock = server_sock(server); - if (!sock) - goto out; - if (sock->sk->sk_state != TCP_ESTABLISHED) - goto out; - - flags = MSG_DONTWAIT | MSG_NOSIGNAL; - iov.iov_base = drop_buffer; - iov.iov_len = PAGE_SIZE; - msg.msg_flags = flags; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - - result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags); - - VERBOSE("read: %d\n", result); - if (result < 0) { - VERBOSE("receive error: %d\n", result); - goto out; - } - server->smb_read += result; - - if (server->smb_read >= server->smb_len) - server->rstate = SMB_RECV_END; - -out: - return result; -} - -/* - * smb_receive - * Only called by the smbiod thread. 
- */ -int -smb_receive(struct smb_sb_info *server, struct smb_request *req) -{ - struct socket *sock; - unsigned int flags; - struct kvec iov[4]; - struct kvec *p = req->rq_iov; - size_t num = req->rq_iovlen; - struct msghdr msg; - int rlen; - int result = -EIO; - - sock = server_sock(server); - if (!sock) - goto out; - if (sock->sk->sk_state != TCP_ESTABLISHED) - goto out; - - flags = MSG_DONTWAIT | MSG_NOSIGNAL; - msg.msg_flags = flags; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - - /* Dont repeat bytes and count available bufferspace */ - rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd), - (req->rq_rlen - req->rq_bytes_recvd)); - - result = kernel_recvmsg(sock, &msg, p, num, rlen, flags); - - VERBOSE("read: %d\n", result); - if (result < 0) { - VERBOSE("receive error: %d\n", result); - goto out; - } - req->rq_bytes_recvd += result; - server->smb_read += result; - -out: - return result; -} - -/* - * Try to send a SMB request. This may return after sending only parts of the - * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent. 
- * - * Parts of this was taken from xprt_sendmsg from net/sunrpc/xprt.c - */ -int -smb_send_request(struct smb_request *req) -{ - struct smb_sb_info *server = req->rq_server; - struct socket *sock; - struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT}; - int slen = req->rq_slen - req->rq_bytes_sent; - int result = -EIO; - struct kvec iov[4]; - struct kvec *p = req->rq_iov; - size_t num = req->rq_iovlen; - - sock = server_sock(server); - if (!sock) - goto out; - if (sock->sk->sk_state != TCP_ESTABLISHED) - goto out; - - /* Dont repeat bytes */ - if (req->rq_bytes_sent) - smb_move_iov(&p, &num, iov, req->rq_bytes_sent); - - result = kernel_sendmsg(sock, &msg, p, num, slen); - - if (result >= 0) { - req->rq_bytes_sent += result; - if (req->rq_bytes_sent >= req->rq_slen) - req->rq_flags |= SMB_REQ_TRANSMITTED; - } -out: - return result; -} diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c deleted file mode 100644 index 00b2909bd469..000000000000 --- a/fs/smbfs/symlink.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * symlink.c - * - * Copyright (C) 2002 by John Newbigin - * - * Please add a note about your changes to smbfs in the ChangeLog file. 
- */ - -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/fcntl.h> -#include <linux/stat.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/net.h> -#include <linux/namei.h> - -#include <asm/uaccess.h> -#include <asm/system.h> - -#include <linux/smbno.h> -#include <linux/smb_fs.h> - -#include "smb_debug.h" -#include "proto.h" - -int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname) -{ - DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry)); - - return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname); -} - -static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - char *link = __getname(); - DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry)); - - if (!link) { - link = ERR_PTR(-ENOMEM); - } else { - int len = smb_proc_read_link(server_from_dentry(dentry), - dentry, link, PATH_MAX - 1); - if (len < 0) { - __putname(link); - link = ERR_PTR(len); - } else { - link[len] = 0; - } - } - nd_set_link(nd, link); - return NULL; -} - -static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p) -{ - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - __putname(s); -} - -const struct inode_operations smb_link_inode_operations = -{ - .readlink = generic_readlink, - .follow_link = smb_follow_link, - .put_link = smb_put_link, -}; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 07a4f1156048..24de30ba34c1 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -370,12 +370,10 @@ static void squashfs_put_super(struct super_block *sb) } -static int squashfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, 
data, squashfs_fill_super); } @@ -451,7 +449,7 @@ static void squashfs_destroy_inode(struct inode *inode) static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, .name = "squashfs", - .get_sb = squashfs_get_sb, + .mount = squashfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV }; diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index 652b8541f9c6..3876c36699a1 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -158,17 +158,18 @@ static int squashfs_xattr_get(struct inode *inode, int name_index, strncmp(target, name, name_size) == 0) { /* found xattr */ if (type & SQUASHFS_XATTR_VALUE_OOL) { - __le64 xattr; + __le64 xattr_val; + u64 xattr; /* val is a reference to the real location */ err = squashfs_read_metadata(sb, &val, &start, &offset, sizeof(val)); if (err < 0) goto failed; - err = squashfs_read_metadata(sb, &xattr, &start, - &offset, sizeof(xattr)); + err = squashfs_read_metadata(sb, &xattr_val, + &start, &offset, sizeof(xattr_val)); if (err < 0) goto failed; - xattr = le64_to_cpu(xattr); + xattr = le64_to_cpu(xattr_val); start = SQUASHFS_XATTR_BLK(xattr) + msblk->xattr_table; offset = SQUASHFS_XATTR_OFFSET(xattr); diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index 49fe0d719fbf..b634efce4bde 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -25,7 +25,7 @@ extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, u64 *, int *); extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, - int *, unsigned long long *); + unsigned int *, unsigned long long *); #else static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, u64 *xattr_table_start, int *xattr_ids) @@ -35,7 +35,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, } static inline int squashfs_xattr_lookup(struct super_block *sb, - unsigned int index, int *count, int *size, + unsigned int index, int *count, unsigned int *size, unsigned long long 
*xattr) { return 0; diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index cfb41106098f..d33be5dd6c32 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -34,6 +34,7 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "xattr.h" /* * Map xattr id using the xattr id look up table diff --git a/fs/super.c b/fs/super.c index b9c9869165db..ca696155cd9a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -715,15 +715,14 @@ static int ns_set_super(struct super_block *sb, void *data) return set_anon_super(sb, NULL); } -int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) +struct dentry *mount_ns(struct file_system_type *fs_type, int flags, + void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *sb; sb = sget(fs_type, ns_test_super, ns_set_super, data); if (IS_ERR(sb)) - return PTR_ERR(sb); + return ERR_CAST(sb); if (!sb->s_root) { int err; @@ -731,17 +730,16 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, err = fill_super(sb, data, flags & MS_SILENT ? 
1 : 0); if (err) { deactivate_locked_super(sb); - return err; + return ERR_PTR(err); } sb->s_flags |= MS_ACTIVE; } - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); } -EXPORT_SYMBOL(get_sb_ns); +EXPORT_SYMBOL(mount_ns); #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) @@ -762,10 +760,9 @@ static int test_bdev_super(struct super_block *s, void *data) return (void *)s->s_bdev == data; } -int get_sb_bdev(struct file_system_type *fs_type, +struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) + int (*fill_super)(struct super_block *, void *, int)) { struct block_device *bdev; struct super_block *s; @@ -777,7 +774,7 @@ int get_sb_bdev(struct file_system_type *fs_type, bdev = open_bdev_exclusive(dev_name, mode, fs_type); if (IS_ERR(bdev)) - return PTR_ERR(bdev); + return ERR_CAST(bdev); /* * once the super is inserted into the list by sget, s_umount @@ -829,15 +826,30 @@ int get_sb_bdev(struct file_system_type *fs_type, bdev->bd_super = s; } - simple_set_mnt(mnt, s); - return 0; + return dget(s->s_root); error_s: error = PTR_ERR(s); error_bdev: close_bdev_exclusive(bdev, mode); error: - return error; + return ERR_PTR(error); +} +EXPORT_SYMBOL(mount_bdev); + +int get_sb_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct dentry *root; + + root = mount_bdev(fs_type, flags, dev_name, data, fill_super); + if (IS_ERR(root)) + return PTR_ERR(root); + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; + return 0; } EXPORT_SYMBOL(get_sb_bdev); @@ -856,29 +868,42 @@ void kill_block_super(struct super_block *sb) EXPORT_SYMBOL(kill_block_super); #endif -int get_sb_nodev(struct file_system_type *fs_type, +struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void 
*data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) + int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); if (IS_ERR(s)) - return PTR_ERR(s); + return ERR_CAST(s); s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - return error; + return ERR_PTR(error); } s->s_flags |= MS_ACTIVE; - simple_set_mnt(mnt, s); - return 0; + return dget(s->s_root); } +EXPORT_SYMBOL(mount_nodev); +int get_sb_nodev(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct dentry *root; + + root = mount_nodev(fs_type, flags, data, fill_super); + if (IS_ERR(root)) + return PTR_ERR(root); + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; + return 0; +} EXPORT_SYMBOL(get_sb_nodev); static int compare_single(struct super_block *s, void *p) @@ -886,29 +911,42 @@ static int compare_single(struct super_block *s, void *p) return 1; } -int get_sb_single(struct file_system_type *fs_type, +struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) + int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; s = sget(fs_type, compare_single, set_anon_super, NULL); if (IS_ERR(s)) - return PTR_ERR(s); + return ERR_CAST(s); if (!s->s_root) { s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 
1 : 0); if (error) { deactivate_locked_super(s); - return error; + return ERR_PTR(error); } s->s_flags |= MS_ACTIVE; } else { do_remount_sb(s, flags, data, 0); } - simple_set_mnt(mnt, s); + return dget(s->s_root); +} +EXPORT_SYMBOL(mount_single); + +int get_sb_single(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct dentry *root; + root = mount_single(fs_type, flags, data, fill_super); + if (IS_ERR(root)) + return PTR_ERR(root); + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; return 0; } @@ -918,6 +956,7 @@ struct vfsmount * vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { struct vfsmount *mnt; + struct dentry *root; char *secdata = NULL; int error; @@ -942,9 +981,19 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void goto out_free_secdata; } - error = type->get_sb(type, flags, name, data, mnt); - if (error < 0) - goto out_free_secdata; + if (type->mount) { + root = type->mount(type, flags, name, data); + if (IS_ERR(root)) { + error = PTR_ERR(root); + goto out_free_secdata; + } + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; + } else { + error = type->get_sb(type, flags, name, data, mnt); + if (error < 0) + goto out_free_secdata; + } BUG_ON(!mnt->mnt_sb); WARN_ON(!mnt->mnt_sb->s_bdi); mnt->mnt_sb->s_flags |= MS_BORN; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index f2af22574c50..266895783b47 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -23,7 +23,7 @@ #include "sysfs.h" -static struct vfsmount *sysfs_mount; +static struct vfsmount *sysfs_mnt; struct kmem_cache *sysfs_dir_cachep; static const struct super_operations sysfs_ops = { @@ -95,18 +95,17 @@ static int sysfs_set_super(struct super_block *sb, void *data) return error; } -static int sysfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry 
*sysfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { struct sysfs_super_info *info; enum kobj_ns_type type; struct super_block *sb; int error; - error = -ENOMEM; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) - goto out; + return ERR_PTR(-ENOMEM); for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) info->ns[type] = kobj_ns_current(type); @@ -114,24 +113,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type, sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); if (IS_ERR(sb) || sb->s_fs_info != info) kfree(info); - if (IS_ERR(sb)) { - error = PTR_ERR(sb); - goto out; - } + if (IS_ERR(sb)) + return ERR_CAST(sb); if (!sb->s_root) { sb->s_flags = flags; error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(sb); - goto out; + return ERR_PTR(error); } sb->s_flags |= MS_ACTIVE; } - simple_set_mnt(mnt, sb); - error = 0; -out: - return error; + return dget(sb->s_root); } static void sysfs_kill_sb(struct super_block *sb) @@ -147,7 +141,7 @@ static void sysfs_kill_sb(struct super_block *sb) static struct file_system_type sysfs_fs_type = { .name = "sysfs", - .get_sb = sysfs_get_sb, + .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, }; @@ -189,11 +183,11 @@ int __init sysfs_init(void) err = register_filesystem(&sysfs_fs_type); if (!err) { - sysfs_mount = kern_mount(&sysfs_fs_type); - if (IS_ERR(sysfs_mount)) { + sysfs_mnt = kern_mount(&sysfs_fs_type); + if (IS_ERR(sysfs_mnt)) { printk(KERN_ERR "sysfs: could not mount!\n"); - err = PTR_ERR(sysfs_mount); - sysfs_mount = NULL; + err = PTR_ERR(sysfs_mnt); + sysfs_mnt = NULL; unregister_filesystem(&sysfs_fs_type); goto out_err; } diff --git a/fs/sysv/super.c b/fs/sysv/super.c index a0b0cda6927e..3d9c62be0c10 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -526,23 +526,22 @@ failed: /* Every kernel module contains stuff like this. 
*/ -static int sysv_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *sysv_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); } -static int v7_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *v7_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); } static struct file_system_type sysv_fs_type = { .owner = THIS_MODULE, .name = "sysv", - .get_sb = sysv_get_sb, + .mount = sysv_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -550,7 +549,7 @@ static struct file_system_type sysv_fs_type = { static struct file_system_type v7_fs_type = { .owner = THIS_MODULE, .name = "v7", - .get_sb = v7_get_sb, + .mount = v7_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 9a47c9f0ad07..91fac54c70e3 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2038,8 +2038,8 @@ static int sb_test(struct super_block *sb, void *data) return c->vi.cdev == *dev; } -static int ubifs_get_sb(struct file_system_type *fs_type, int flags, - const char *name, void *data, struct vfsmount *mnt) +static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, + const char *name, void *data) { struct ubi_volume_desc *ubi; struct ubi_volume_info vi; @@ -2057,7 +2057,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, if (IS_ERR(ubi)) { dbg_err("cannot open \"%s\", error %d", name, (int)PTR_ERR(ubi)); - return PTR_ERR(ubi); + return ERR_CAST(ubi); } 
ubi_get_volume_info(ubi, &vi); @@ -2095,20 +2095,19 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, /* 'fill_super()' opens ubi again so we must close it here */ ubi_close_volume(ubi); - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); out_deact: deactivate_locked_super(sb); out_close: ubi_close_volume(ubi); - return err; + return ERR_PTR(err); } static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, - .get_sb = ubifs_get_sb, + .mount = ubifs_mount, .kill_sb = kill_anon_super, }; diff --git a/fs/udf/super.c b/fs/udf/super.c index 76f3d6d97b40..4a5c7c61836a 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -107,17 +107,16 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) } /* UDF filesystem type */ -static int udf_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *udf_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super); } static struct file_system_type udf_fstype = { .owner = THIS_MODULE, .name = "udf", - .get_sb = udf_get_sb, + .mount = udf_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 6b9be90dae7d..2c47daed56da 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1454,16 +1454,16 @@ static const struct super_operations ufs_super_ops = { .show_options = ufs_show_options, }; -static int ufs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ufs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, 
data, ufs_fill_super); } static struct file_system_type ufs_fs_type = { .owner = THIS_MODULE, .name = "ufs", - .get_sb = ufs_get_sb, + .mount = ufs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 480f28127f09..6100ec0fa1d4 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -22,6 +22,7 @@ config XFS_FS config XFS_QUOTA bool "XFS Quota support" depends on XFS_FS + select QUOTACTL help If you say Y here, you will be able to set limits for disk usage on a per user and/or a per group basis under XFS. XFS considers quota diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c9af48fffcd7..7d287afccde5 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1111,11 +1111,12 @@ xfs_vm_writepage( uptodate = 0; /* - * A hole may still be marked uptodate because discard_buffer - * leaves the flag set. + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. 
*/ if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - ASSERT(!buffer_dirty(bh)); imap_valid = 0; continue; } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 63fd2c07cb57..aa1d353def29 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1781,7 +1781,6 @@ xfs_buf_delwri_split( INIT_LIST_HEAD(list); spin_lock(dwlk); list_for_each_entry_safe(bp, n, dwq, b_list) { - trace_xfs_buf_delwri_split(bp, _RET_IP_); ASSERT(bp->b_flags & XBF_DELWRI); if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { @@ -1795,6 +1794,7 @@ xfs_buf_delwri_split( _XBF_RUN_QUEUES); bp->b_flags |= XBF_WRITE; list_move_tail(&bp->b_list, list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); } else skipped++; } diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 2ea238f6d38e..ad442d9e392e 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -416,7 +416,7 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); if (!kbuf) goto out_dput; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 96107efc0c61..94d5fd6a2973 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -762,7 +762,8 @@ xfs_setup_inode( inode->i_state = I_NEW; inode_sb_list_add(inode); - insert_inode_hash(inode); + /* make the inode look hashed for the writeback code */ + hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index cf808782c065..064f964d4f3c 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -353,9 +353,6 @@ xfs_parseargs( mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { mp->m_flags |= XFS_MOUNT_DELAYLOG; - cmn_err(CE_WARN, - "Enabling EXPERIMENTAL delayed logging feature " - "- use at 
your own risk.\n"); } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { @@ -1609,16 +1606,14 @@ xfs_fs_fill_super( goto out_free_sb; } -STATIC int -xfs_fs_get_sb( +STATIC struct dentry * +xfs_fs_mount( struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, - struct vfsmount *mnt) + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); } static const struct super_operations xfs_super_operations = { @@ -1639,7 +1634,7 @@ static const struct super_operations xfs_super_operations = { static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", - .get_sb = xfs_fs_get_sb, + .mount = xfs_fs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 37d33254981d..afb0d7cfad1c 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -853,6 +853,7 @@ restart: if (trylock) { if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { skipped++; + xfs_perag_put(pag); continue; } first_index = pag->pag_ici_reclaim_cursor; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 9b715dce5699..9124425b7f2f 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -744,9 +744,15 @@ xfs_filestream_new_ag( * If the file's parent directory is known, take its iolock in exclusive * mode to prevent two sibling files from racing each other to migrate * themselves and their parent to different AGs. + * + * Note that we lock the parent directory iolock inside the child + * iolock here. That's fine as we never hold both parent and child + * iolock in any other place. This is different from the ilock, + * which requires locking of the child after the parent for namespace + * operations. 
*/ if (pip) - xfs_ilock(pip, XFS_IOLOCK_EXCL); + xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); /* * A new AG needs to be found for the file. If the file's parent diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b1498ab5a399..19e9dfa1c254 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -275,6 +275,7 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); call_rcu(&pag->rcu_head, __xfs_free_perag); } } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index e0e64b113bd6..9bb6eda4cd21 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) -#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) -#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) +static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, + struct xfs_inode *ip, long nblks, long ninos, uint flags) +{ + return 0; +} +static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, + struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, long nblks, long nions, uint flags) +{ + return 0; +} #define xfs_qm_vop_create_dqattach(tp, ip, u, g) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) @@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) #define xfs_qm_statvfs(ip, s) -#define xfs_qm_sync(mp, fl) (0) +static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) +{ + return 0; +} #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) -#define xfs_qm_unmount_quotas(mp) (0) +#define 
xfs_qm_unmount_quotas(mp) #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ |