summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorHerbert Xu <herbert@gondor.apana.org.au>2013-09-07 12:53:35 +1000
committerHerbert Xu <herbert@gondor.apana.org.au>2013-09-07 12:53:35 +1000
commiteeca9fad52fc4bfdf42c38bfcf383e932eb3e9d6 (patch)
treecc51c880459d41c0e8d7576405bef4c987bc7aa0 /fs
parentff6f83fc9d44db09997937c3475d525a6866fbb4 (diff)
parentb48a97be8e6c2afdba2f3b61fd88c3c7743fbd73 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux
Merge upstream tree in order to reinstate crct10dif.
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig13
-rw-r--r--fs/9p/Makefile4
-rw-r--r--fs/9p/vfs_inode.c2
-rw-r--r--fs/9p/xattr.c4
-rw-r--r--fs/9p/xattr.h2
-rw-r--r--fs/9p/xattr_security.c80
-rw-r--r--fs/9p/xattr_trusted.c80
-rw-r--r--fs/autofs4/expire.c8
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/binfmt_aout.c2
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/block_dev.c9
-rw-r--r--fs/btrfs/backref.c72
-rw-r--r--fs/btrfs/backref.h2
-rw-r--r--fs/btrfs/ctree.c118
-rw-r--r--fs/btrfs/ctree.h105
-rw-r--r--fs/btrfs/delayed-inode.c14
-rw-r--r--fs/btrfs/dev-replace.c6
-rw-r--r--fs/btrfs/disk-io.c483
-rw-r--r--fs/btrfs/disk-io.h32
-rw-r--r--fs/btrfs/export.c5
-rw-r--r--fs/btrfs/extent-tree.c340
-rw-r--r--fs/btrfs/extent_io.c41
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/file-item.c144
-rw-r--r--fs/btrfs/file.c150
-rw-r--r--fs/btrfs/free-space-cache.c103
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode.c501
-rw-r--r--fs/btrfs/ioctl.c74
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/ordered-data.c128
-rw-r--r--fs/btrfs/ordered-data.h27
-rw-r--r--fs/btrfs/qgroup.c283
-rw-r--r--fs/btrfs/relocation.c102
-rw-r--r--fs/btrfs/root-tree.c201
-rw-r--r--fs/btrfs/scrub.c92
-rw-r--r--fs/btrfs/send.c235
-rw-r--r--fs/btrfs/super.c25
-rw-r--r--fs/btrfs/transaction.c322
-rw-r--r--fs/btrfs/transaction.h50
-rw-r--r--fs/btrfs/tree-log.c41
-rw-r--r--fs/btrfs/ulist.c15
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/volumes.c351
-rw-r--r--fs/btrfs/volumes.h7
-rw-r--r--fs/ceph/addr.c88
-rw-r--r--fs/ceph/caps.c102
-rw-r--r--fs/ceph/file.c4
-rw-r--r--fs/ceph/inode.c18
-rw-r--r--fs/ceph/locks.c2
-rw-r--r--fs/ceph/mds_client.c8
-rw-r--r--fs/ceph/mdsmap.c42
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/ceph/super.h4
-rw-r--r--fs/ceph/xattr.c9
-rw-r--r--fs/cifs/cifsencrypt.c195
-rw-r--r--fs/cifs/cifsglob.h17
-rw-r--r--fs/cifs/cifsproto.h1
-rw-r--r--fs/cifs/connect.c6
-rw-r--r--fs/cifs/dir.c14
-rw-r--r--fs/cifs/file.c54
-rw-r--r--fs/cifs/inode.c5
-rw-r--r--fs/cifs/smb1ops.c29
-rw-r--r--fs/cifs/smb2file.c24
-rw-r--r--fs/cifs/smb2inode.c57
-rw-r--r--fs/cifs/smb2ops.c54
-rw-r--r--fs/cifs/smb2pdu.c220
-rw-r--r--fs/cifs/smb2pdu.h14
-rw-r--r--fs/cifs/smb2proto.h16
-rw-r--r--fs/cifs/smb2transport.c90
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/configfs/dir.c15
-rw-r--r--fs/ecryptfs/crypto.c337
-rw-r--r--fs/ecryptfs/file.c7
-rw-r--r--fs/ecryptfs/inode.c2
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/efivarfs/inode.c14
-rw-r--r--fs/ext3/fsync.c8
-rw-r--r--fs/ext3/namei.c2
-rw-r--r--fs/ext3/super.c13
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/extents.c23
-rw-r--r--fs/ext4/extents_status.c73
-rw-r--r--fs/ext4/inode.c19
-rw-r--r--fs/ext4/mballoc.c11
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/page-io.c35
-rw-r--r--fs/ext4/super.c14
-rw-r--r--fs/f2fs/dir.c20
-rw-r--r--fs/fat/fat.h1
-rw-r--r--fs/fat/file.c8
-rw-r--r--fs/fat/inode.c12
-rw-r--r--fs/file_table.c31
-rw-r--r--fs/fs-writeback.c10
-rw-r--r--fs/fuse/dir.c51
-rw-r--r--fs/jfs/jfs_dmap.c70
-rw-r--r--fs/jfs/jfs_dtree.c37
-rw-r--r--fs/jfs/jfs_extent.c2
-rw-r--r--fs/jfs/jfs_imap.c69
-rw-r--r--fs/jfs/jfs_metapage.c5
-rw-r--r--fs/jfs/jfs_superblock.h1
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_xtree.c62
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/resize.c2
-rw-r--r--fs/jfs/super.c22
-rw-r--r--fs/jfs/xattr.c8
-rw-r--r--fs/libfs.c3
-rw-r--r--fs/lockd/svclock.c4
-rw-r--r--fs/locks.c71
-rw-r--r--fs/namei.c2
-rw-r--r--fs/ncpfs/inode.c12
-rw-r--r--fs/nfs/Kconfig14
-rw-r--r--fs/nfs/Makefile6
-rw-r--r--fs/nfs/blocklayout/blocklayout.c3
-rw-r--r--fs/nfs/callback.c1
-rw-r--r--fs/nfs/callback.h3
-rw-r--r--fs/nfs/callback_proc.c3
-rw-r--r--fs/nfs/callback_xdr.c52
-rw-r--r--fs/nfs/client.c4
-rw-r--r--fs/nfs/dir.c87
-rw-r--r--fs/nfs/dns_resolve.c32
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/idmap.c56
-rw-r--r--fs/nfs/inode.c138
-rw-r--r--fs/nfs/internal.h3
-rw-r--r--fs/nfs/mount_clnt.c14
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3proc.c7
-rw-r--r--fs/nfs/nfs4_fs.h8
-rw-r--r--fs/nfs/nfs4client.c15
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4filelayout.c3
-rw-r--r--fs/nfs/nfs4filelayout.h3
-rw-r--r--fs/nfs/nfs4filelayoutdev.c8
-rw-r--r--fs/nfs/nfs4proc.c687
-rw-r--r--fs/nfs/nfs4session.c40
-rw-r--r--fs/nfs/nfs4session.h7
-rw-r--r--fs/nfs/nfs4state.c36
-rw-r--r--fs/nfs/nfs4super.c14
-rw-r--r--fs/nfs/nfs4xdr.c189
-rw-r--r--fs/nfs/objlayout/objlayout.c4
-rw-r--r--fs/nfs/pnfs.c42
-rw-r--r--fs/nfs/pnfs.h6
-rw-r--r--fs/nfs/proc.c13
-rw-r--r--fs/nfs/super.c199
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nfs/write.c31
-rw-r--r--fs/nfsd/Kconfig16
-rw-r--r--fs/nfsd/nfs4proc.c46
-rw-r--r--fs/nfsd/nfs4state.c225
-rw-r--r--fs/nfsd/nfs4xdr.c169
-rw-r--r--fs/nfsd/nfsd.h27
-rw-r--r--fs/nfsd/nfssvc.c13
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/vfs.c28
-rw-r--r--fs/nfsd/vfs.h7
-rw-r--r--fs/nfsd/xdr4.h4
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/notify/dnotify/dnotify.c25
-rw-r--r--fs/notify/fanotify/fanotify_user.c89
-rw-r--r--fs/notify/inotify/inotify_user.c13
-rw-r--r--fs/notify/mark.c50
-rw-r--r--fs/open.c6
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/quota/dquot.c6
-rw-r--r--fs/select.c62
-rw-r--r--fs/seq_file.c54
-rw-r--r--fs/super.c25
-rw-r--r--fs/sysfs/group.c70
-rw-r--r--fs/timerfd.c131
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/xfs_alloc.c24
-rw-r--r--fs/xfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/xfs_bmap.c199
-rw-r--r--fs/xfs/xfs_bmap.h1
-rw-r--r--fs/xfs/xfs_bmap_btree.h2
-rw-r--r--fs/xfs/xfs_buf_item.c87
-rw-r--r--fs/xfs/xfs_buf_item.h4
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dinode.h3
-rw-r--r--fs/xfs/xfs_dir2_block.c20
-rw-r--r--fs/xfs/xfs_dir2_leaf.c3
-rw-r--r--fs/xfs/xfs_dquot.c31
-rw-r--r--fs/xfs/xfs_dquot.h11
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_ialloc.c74
-rw-r--r--fs/xfs/xfs_ialloc.h8
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_icreate_item.c195
-rw-r--r--fs/xfs/xfs_icreate_item.h52
-rw-r--r--fs/xfs/xfs_inode.c82
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_ioctl.c16
-rw-r--r--fs/xfs/xfs_iomap.c13
-rw-r--r--fs/xfs/xfs_iops.c27
-rw-r--r--fs/xfs/xfs_itable.c33
-rw-r--r--fs/xfs/xfs_log.c22
-rw-r--r--fs/xfs/xfs_log.h5
-rw-r--r--fs/xfs/xfs_log_cil.c75
-rw-r--r--fs/xfs/xfs_log_recover.c114
-rw-r--r--fs/xfs/xfs_mount.c92
-rw-r--r--fs/xfs/xfs_mount.h4
-rw-r--r--fs/xfs/xfs_qm.c394
-rw-r--r--fs/xfs/xfs_qm.h97
-rw-r--r--fs/xfs/xfs_qm_bhv.c10
-rw-r--r--fs/xfs/xfs_qm_syscalls.c75
-rw-r--r--fs/xfs/xfs_quota.h104
-rw-r--r--fs/xfs/xfs_quotaops.c6
-rw-r--r--fs/xfs/xfs_sb.h6
-rw-r--r--fs/xfs/xfs_super.c39
-rw-r--r--fs/xfs/xfs_symlink.c61
-rw-r--r--fs/xfs/xfs_symlink.h2
-rw-r--r--fs/xfs/xfs_sysctl.c26
-rw-r--r--fs/xfs/xfs_trace.h5
-rw-r--r--fs/xfs/xfs_trans.c118
-rw-r--r--fs/xfs/xfs_trans.h16
-rw-r--r--fs/xfs/xfs_trans_buf.c34
-rw-r--r--fs/xfs/xfs_trans_dquot.c122
-rw-r--r--fs/xfs/xfs_trans_inode.c11
-rw-r--r--fs/xfs/xfs_vnodeops.c28
224 files changed, 7381 insertions, 4256 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 55abfd62654a..6489e1fc1afd 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -31,3 +31,16 @@ config 9P_FS_POSIX_ACL
If you don't know what Access Control Lists are, say N
endif
+
+
+config 9P_FS_SECURITY
+ bool "9P Security Labels"
+ depends on 9P_FS
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the 9P filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ab8c12780634..ff7be98f84f2 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -11,7 +11,9 @@ obj-$(CONFIG_9P_FS) := 9p.o
v9fs.o \
fid.o \
xattr.o \
- xattr_user.o
+ xattr_user.o \
+ xattr_trusted.o
9p-$(CONFIG_9P_FSCACHE) += cache.o
9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
+9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d86edc8d3fd0..25b018efb8ab 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1054,13 +1054,11 @@ static int
v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- int err;
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
struct p9_wstat *st;
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
- err = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
generic_fillattr(dentry->d_inode, stat);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index c45e016b190f..3c28cdfb8c47 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -167,9 +167,13 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
+ &v9fs_xattr_trusted_handler,
#ifdef CONFIG_9P_FS_POSIX_ACL
&v9fs_xattr_acl_access_handler,
&v9fs_xattr_acl_default_handler,
#endif
+#ifdef CONFIG_9P_FS_SECURITY
+ &v9fs_xattr_security_handler,
+#endif
NULL
};
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index eec348a3df71..d3e2ea3840be 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -20,6 +20,8 @@
extern const struct xattr_handler *v9fs_xattr_handlers[];
extern struct xattr_handler v9fs_xattr_user_handler;
+extern struct xattr_handler v9fs_xattr_trusted_handler;
+extern struct xattr_handler v9fs_xattr_security_handler;
extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler;
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
new file mode 100644
index 000000000000..cb247a142a6e
--- /dev/null
+++ b/fs/9p/xattr_security.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+ memcpy(full_name+prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+ kfree(full_name);
+ return retval;
+}
+
+static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+ memcpy(full_name + prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+ kfree(full_name);
+ return retval;
+}
+
+struct xattr_handler v9fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = v9fs_xattr_security_get,
+ .set = v9fs_xattr_security_set,
+};
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
new file mode 100644
index 000000000000..e30d33b8a3fb
--- /dev/null
+++ b/fs/9p/xattr_trusted.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+ memcpy(full_name+prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+ kfree(full_name);
+ return retval;
+}
+
+static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+ memcpy(full_name + prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+ kfree(full_name);
+ return retval;
+}
+
+struct xattr_handler v9fs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = v9fs_xattr_trusted_get,
+ .set = v9fs_xattr_trusted_set,
+};
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 13ddec92341c..3d9d3f5d5dda 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -109,7 +109,7 @@ cont:
spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
/* Already gone or negative dentry (under construction) - try next */
- if (q->d_count == 0 || !simple_positive(q)) {
+ if (!d_count(q) || !simple_positive(q)) {
spin_unlock(&q->d_lock);
next = q->d_u.d_child.next;
goto cont;
@@ -267,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
else
ino_count++;
- if (p->d_count > ino_count) {
+ if (d_count(p) > ino_count) {
top_ino->last_used = jiffies;
dput(p);
return 1;
@@ -409,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
if (!exp_leaves) {
/* Path walk currently on this dentry? */
ino_count = atomic_read(&ino->count) + 1;
- if (dentry->d_count > ino_count)
+ if (d_count(dentry) > ino_count)
goto next;
if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -423,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
} else {
/* Path walk currently on this dentry? */
ino_count = atomic_read(&ino->count) + 1;
- if (dentry->d_count > ino_count)
+ if (d_count(dentry) > ino_count)
goto next;
expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index ca8e55548d98..92ef341ba0cf 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
spin_lock(&active->d_lock);
/* Already gone? */
- if (active->d_count == 0)
+ if (!d_count(active))
goto next;
qstr = &active->d_name;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index bce87694f7b0..89dec7f789a4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
(current->mm->start_data = N_DATADDR(ex));
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8a0b0efda44..100edcc5e312 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -738,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
if (retval < 0) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bb43ce081d6e..c7bda5cd3da7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
struct backing_dev_info *old = inode->i_data.backing_dev_info;
+ bool wakeup_bdi = false;
if (unlikely(dst == old)) /* deadlock avoidance */
return;
bdi_lock_two(&old->wb, &dst->wb);
spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
- if (inode->i_state & I_DIRTY)
+ if (inode->i_state & I_DIRTY) {
+ if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
+ wakeup_bdi = true;
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+ }
spin_unlock(&inode->i_lock);
spin_unlock(&old->wb.list_lock);
spin_unlock(&dst->wb.list_lock);
+
+ if (wakeup_bdi)
+ bdi_wakeup_thread_delayed(dst);
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 290e347b6db3..eaf133384a8f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
* to a logical address
*/
static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
- int search_commit_root,
- u64 time_seq,
- struct __prelim_ref *ref,
- struct ulist *parents,
- const u64 *extent_item_pos)
+ struct btrfs_path *path, u64 time_seq,
+ struct __prelim_ref *ref,
+ struct ulist *parents,
+ const u64 *extent_item_pos)
{
- struct btrfs_path *path;
struct btrfs_root *root;
struct btrfs_key root_key;
struct extent_buffer *eb;
@@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int root_level;
int level = ref->level;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->search_commit_root = !!search_commit_root;
-
root_key.objectid = ref->root_id;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
@@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
time_seq, ref->wanted_disk_byte,
extent_item_pos);
out:
- btrfs_free_path(path);
+ path->lowest_level = 0;
+ btrfs_release_path(path);
return ret;
}
@@ -322,7 +316,7 @@ out:
* resolve all indirect backrefs from the list
*/
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
- int search_commit_root, u64 time_seq,
+ struct btrfs_path *path, u64 time_seq,
struct list_head *head,
const u64 *extent_item_pos)
{
@@ -349,9 +343,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
continue;
if (ref->count == 0)
continue;
- err = __resolve_indirect_ref(fs_info, search_commit_root,
- time_seq, ref, parents,
- extent_item_pos);
+ err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+ parents, extent_item_pos);
if (err == -ENOMEM)
goto out;
if (err)
@@ -604,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
int slot;
struct extent_buffer *leaf;
struct btrfs_key key;
+ struct btrfs_key found_key;
unsigned long ptr;
unsigned long end;
struct btrfs_extent_item *ei;
@@ -621,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
end = (unsigned long)ei + item_size;
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)ptr;
*info_level = btrfs_tree_block_level(leaf, info);
ptr += sizeof(struct btrfs_tree_block_info);
BUG_ON(ptr > end);
+ } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+ *info_level = found_key.offset;
} else {
BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
}
@@ -795,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int info_level = 0;
int ret;
- int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
struct list_head prefs_delayed;
struct list_head prefs;
struct __prelim_ref *ref;
@@ -804,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&prefs_delayed);
key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)-1;
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->search_commit_root = !!search_commit_root;
+ if (!trans)
+ path->search_commit_root = 1;
/*
* grab both a lock on the path and a lock on the delayed ref head.
@@ -825,7 +826,7 @@ again:
goto out;
BUG_ON(ret == 0);
- if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+ if (trans) {
/*
* look if there are updates for this ref queued and lock the
* head
@@ -869,7 +870,8 @@ again:
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid == bytenr &&
- key.type == BTRFS_EXTENT_ITEM_KEY) {
+ (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = __add_inline_refs(fs_info, path, bytenr,
&info_level, &prefs);
if (ret)
@@ -890,8 +892,8 @@ again:
__merge_refs(&prefs, 1);
- ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
- &prefs, extent_item_pos);
+ ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+ extent_item_pos);
if (ret)
goto out;
@@ -1283,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
{
int ret;
u64 flags;
+ u64 size = 0;
u32 item_size;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_key key;
- key.type = BTRFS_EXTENT_ITEM_KEY;
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
key.objectid = logical;
key.offset = (u64)-1;
@@ -1301,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
return ret;
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
- if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+ if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+ size = fs_info->extent_root->leafsize;
+ else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+ size = found_key->offset;
+
+ if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
+ found_key->type != BTRFS_METADATA_ITEM_KEY) ||
found_key->objectid > logical ||
- found_key->objectid + found_key->offset <= logical) {
+ found_key->objectid + size <= logical) {
pr_debug("logical %llu is not within any extent\n",
(unsigned long long)logical);
return -ENOENT;
@@ -1459,7 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
iterate_extent_inodes_t *iterate, void *ctx)
{
int ret;
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct ulist *refs = NULL;
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
@@ -1471,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
pr_debug("resolving all inodes for extent %llu\n",
extent_item_objectid);
- if (search_commit_root) {
- trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
- } else {
+ if (!search_commit_root) {
trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0f446d7ca2c0..8f2e76702932 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,8 +23,6 @@
#include "ulist.h"
#include "extent_io.h"
-#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
-
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
struct btrfs_root *fs_root;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 17dffe33e8d0..5bf4c39e2ad6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1089,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
- tree_mod_log_free_eb(root->fs_info, buf);
+ if (last_ref)
+ tree_mod_log_free_eb(root->fs_info, buf);
btrfs_free_tree_block(trans, root, buf, parent_start,
last_ref);
}
@@ -1161,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
* time_seq).
*/
static void
-__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
- struct tree_mod_elem *first_tm)
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ u64 time_seq, struct tree_mod_elem *first_tm)
{
u32 n;
struct rb_node *next;
@@ -1172,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
unsigned long p_size = sizeof(struct btrfs_key_ptr);
n = btrfs_header_nritems(eb);
+ tree_mod_log_read_lock(fs_info);
while (tm && tm->seq >= time_seq) {
/*
* all the operations are recorded with the operator used for
@@ -1226,6 +1228,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
if (tm->index != first_tm->index)
break;
}
+ tree_mod_log_read_unlock(fs_info);
btrfs_set_header_nritems(eb, n);
}
@@ -1274,7 +1277,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
extent_buffer_get(eb_rewin);
btrfs_tree_read_lock(eb_rewin);
- __tree_mod_log_rewind(eb_rewin, time_seq, tm);
+ __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
@@ -1350,7 +1353,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_set_header_generation(eb, old_generation);
}
if (tm)
- __tree_mod_log_rewind(eb, time_seq, tm);
+ __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
else
WARN_ON(btrfs_header_level(eb) != 0);
WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -2178,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root,
}
}
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
- struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+ struct btrfs_path *path, int level)
{
int slot;
int nritems;
@@ -2192,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
u64 gen;
u64 block1 = 0;
u64 block2 = 0;
- int ret = 0;
int blocksize;
parent = path->nodes[level + 1];
if (!parent)
- return 0;
+ return;
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
@@ -2224,28 +2222,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
block2 = 0;
free_extent_buffer(eb);
}
- if (block1 || block2) {
- ret = -EAGAIN;
-
- /* release the whole path */
- btrfs_release_path(path);
-
- /* read the blocks */
- if (block1)
- readahead_tree_block(root, block1, blocksize, 0);
- if (block2)
- readahead_tree_block(root, block2, blocksize, 0);
- if (block1) {
- eb = read_tree_block(root, block1, blocksize, 0);
- free_extent_buffer(eb);
- }
- if (block2) {
- eb = read_tree_block(root, block2, blocksize, 0);
- free_extent_buffer(eb);
- }
- }
- return ret;
+ if (block1)
+ readahead_tree_block(root, block1, blocksize, 0);
+ if (block2)
+ readahead_tree_block(root, block2, blocksize, 0);
}
@@ -2359,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
tmp = btrfs_find_tree_block(root, blocknr, blocksize);
if (tmp) {
/* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
- if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
- /*
- * we found an up to date block without
- * sleeping, return
- * right away
- */
- *eb_ret = tmp;
- return 0;
- }
- /* the pages were up to date, but we failed
- * the generation number check. Do a full
- * read for the generation number that is correct.
- * We must do this without dropping locks so
- * we can trust our generation number
- */
- free_extent_buffer(tmp);
- btrfs_set_path_blocking(p);
+ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+ *eb_ret = tmp;
+ return 0;
+ }
- /* now we're allowed to do a blocking uptodate check */
- tmp = read_tree_block(root, blocknr, blocksize, gen);
- if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
- *eb_ret = tmp;
- return 0;
- }
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EIO;
+ /* the pages were up to date, but we failed
+ * the generation number check. Do a full
+ * read for the generation number that is correct.
+ * We must do this without dropping locks so
+ * we can trust our generation number
+ */
+ btrfs_set_path_blocking(p);
+
+ /* now we're allowed to do a blocking uptodate check */
+ ret = btrfs_read_buffer(tmp, gen);
+ if (!ret) {
+ *eb_ret = tmp;
+ return 0;
}
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
}
/*
@@ -2448,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
goto again;
}
- sret = reada_for_balance(root, p, level);
- if (sret)
- goto again;
-
btrfs_set_path_blocking(p);
+ reada_for_balance(root, p, level);
sret = split_node(trans, root, p, level);
btrfs_clear_path_blocking(p, NULL, 0);
@@ -2472,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
goto again;
}
- sret = reada_for_balance(root, p, level);
- if (sret)
- goto again;
-
btrfs_set_path_blocking(p);
+ reada_for_balance(root, p, level);
sret = balance_level(trans, root, p, level);
btrfs_clear_path_blocking(p, NULL, 0);
@@ -3143,7 +3111,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
*/
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path, int level, int log_removal)
+ struct btrfs_path *path, int level)
{
u64 lower_gen;
struct extent_buffer *lower;
@@ -3194,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
- tree_mod_log_set_root_pointer(root, c, log_removal);
+ tree_mod_log_set_root_pointer(root, c, 0);
rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
@@ -3278,14 +3246,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
/*
* trying to split the root, lets make a new one
*
- * tree mod log: We pass 0 as log_removal parameter to
+ * tree mod log: We don't log_removal old root in
* insert_new_root, because that root buffer will be kept as a
* normal node. We are going to log removal of half of the
* elements below with tree_mod_log_eb_copy. We're holding a
* tree lock on the buffer, which is why we cannot race with
* other tree_mod_log users.
*/
- ret = insert_new_root(trans, root, path, level + 1, 0);
+ ret = insert_new_root(trans, root, path, level + 1);
if (ret)
return ret;
} else {
@@ -3986,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
return -EOVERFLOW;
/* first try to make some room by pushing left and right */
- if (data_size) {
+ if (data_size && path->nodes[1]) {
wret = push_leaf_right(trans, root, path, data_size,
data_size, 0, 0);
if (wret < 0)
@@ -4005,7 +3973,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
}
if (!path->nodes[1]) {
- ret = insert_new_root(trans, root, path, 1, 1);
+ ret = insert_new_root(trans, root, path, 1);
if (ret)
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d6dd49b51ba8..e795bf135e80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -961,8 +961,8 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
-#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7)
-#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8)
+#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
enum btrfs_raid_types {
@@ -1102,6 +1102,18 @@ struct btrfs_space_info {
account */
/*
+ * bytes_pinned is kept in line with what is actually pinned, as in
+ * we've called update_block_group and dropped the bytes_used counter
+ * and increased the bytes_pinned counter. However this means that
+ * bytes_pinned does not reflect the bytes that will be pinned once the
+ * delayed refs are flushed, so this counter is inc'ed everytime we call
+ * btrfs_free_extent so it is a realtime count of what will be freed
+ * once the transaction is committed. It will be zero'ed everytime the
+ * transaction commits.
+ */
+ struct percpu_counter total_bytes_pinned;
+
+ /*
* we bump reservation progress every time we decrement
* bytes_reserved. This way people waiting for reservations
* know something good has happened and they can check
@@ -1437,25 +1449,22 @@ struct btrfs_fs_info {
atomic_t open_ioctl_trans;
/*
- * this is used by the balancing code to wait for all the pending
- * ordered extents
+ * this is used to protect the following list -- ordered_roots.
*/
- spinlock_t ordered_extent_lock;
+ spinlock_t ordered_root_lock;
/*
- * all of the data=ordered extents pending writeback
+ * all fs/file tree roots in which there are data=ordered extents
+ * pending writeback are added into this list.
+ *
* these can span multiple transactions and basically include
* every dirty data page that isn't from nodatacow
*/
- struct list_head ordered_extents;
+ struct list_head ordered_roots;
- spinlock_t delalloc_lock;
- /*
- * all of the inodes that have delalloc bytes. It is possible for
- * this list to be empty even when there is still dirty data=ordered
- * extents waiting to finish IO.
- */
- struct list_head delalloc_inodes;
+ spinlock_t delalloc_root_lock;
+ /* all fs/file tree roots that have delalloc inodes. */
+ struct list_head delalloc_roots;
/*
* there is a pool of worker threads for checksumming during writes
@@ -1498,8 +1507,6 @@ struct btrfs_fs_info {
int do_barriers;
int closing;
int log_root_recovering;
- int enospc_unlink;
- int trans_no_join;
u64 total_pinned;
@@ -1594,6 +1601,12 @@ struct btrfs_fs_info {
struct rb_root qgroup_tree;
spinlock_t qgroup_lock;
+ /*
+ * used to avoid frequently calling ulist_alloc()/ulist_free()
+ * when doing qgroup accounting, it must be protected by qgroup_lock.
+ */
+ struct ulist *qgroup_ulist;
+
/* protect user change for quota operations */
struct mutex qgroup_ioctl_lock;
@@ -1607,6 +1620,8 @@ struct btrfs_fs_info {
struct mutex qgroup_rescan_lock; /* protects the progress item */
struct btrfs_key qgroup_rescan_progress;
struct btrfs_workers qgroup_rescan_workers;
+ struct completion qgroup_rescan_completion;
+ struct btrfs_work qgroup_rescan_work;
/* filesystem state */
unsigned long fs_state;
@@ -1739,6 +1754,31 @@ struct btrfs_root {
int force_cow;
spinlock_t root_item_lock;
+ atomic_t refs;
+
+ spinlock_t delalloc_lock;
+ /*
+ * all of the inodes that have delalloc bytes. It is possible for
+ * this list to be empty even when there is still dirty data=ordered
+ * extents waiting to finish IO.
+ */
+ struct list_head delalloc_inodes;
+ struct list_head delalloc_root;
+ u64 nr_delalloc_inodes;
+ /*
+ * this is used by the balancing code to wait for all the pending
+ * ordered extents
+ */
+ spinlock_t ordered_extent_lock;
+
+ /*
+ * all of the data=ordered extents pending writeback
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
+ struct list_head ordered_extents;
+ struct list_head ordered_root;
+ u64 nr_ordered_extents;
};
struct btrfs_ioctl_defrag_range_args {
@@ -3028,6 +3068,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
num_items;
}
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
@@ -3039,6 +3081,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+ struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr);
@@ -3155,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor);
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
@@ -3311,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
smp_mb();
return fs_info->closing;
}
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+ return (root->fs_info->sb->s_flags & MS_RDONLY ||
+ btrfs_fs_closing(root->fs_info));
+}
+
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
{
kfree(fs_info->balance_ctl);
@@ -3357,9 +3416,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
struct btrfs_root_item *item);
void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
- btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key);
int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
void btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
@@ -3493,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
size_t pg_offset, u64 start, u64 len,
int create);
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes);
/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3530,6 +3593,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+ int delay_iput);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -3814,6 +3879,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
int btrfs_quota_disable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index eb34438ddedb..375510913fe7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -535,20 +535,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
return next;
}
-static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
- u64 root_id)
-{
- struct btrfs_key root_key;
-
- if (root->objectid == root_id)
- return root;
-
- root_key.objectid = root_id;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
- return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
-}
-
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_delayed_item *item)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 65241f32d3f8..4253ad580e39 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace);
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
@@ -470,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_inodes(root, 0);
+ ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0292b3ead54..6b092a1c4e37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1192,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
+ root->nr_delalloc_inodes = 0;
+ root->nr_ordered_extents = 0;
root->name = NULL;
root->inode_tree = RB_ROOT;
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1200,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD(&root->delalloc_inodes);
+ INIT_LIST_HEAD(&root->delalloc_root);
+ INIT_LIST_HEAD(&root->ordered_extents);
+ INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
+ spin_lock_init(&root->delalloc_lock);
+ spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
spin_lock_init(&root->log_extents_lock[0]);
spin_lock_init(&root->log_extents_lock[1]);
@@ -1217,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
+ atomic_set(&root->refs, 1);
root->log_transid = 0;
root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
@@ -1235,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
spin_lock_init(&root->root_item_lock);
}
-static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
- struct btrfs_fs_info *fs_info,
- u64 objectid,
- struct btrfs_root *root)
-{
- int ret;
- u32 blocksize;
- u64 generation;
-
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, objectid);
- ret = btrfs_find_last_root(tree_root, objectid,
- &root->root_item, &root->root_key);
- if (ret > 0)
- return -ENOENT;
- else if (ret < 0)
- return ret;
-
- generation = btrfs_root_generation(&root->root_item);
- blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
- root->commit_root = NULL;
- root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- blocksize, generation);
- if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
- free_extent_buffer(root->node);
- root->node = NULL;
- return -EIO;
- }
- root->commit_root = btrfs_root_node(root);
- return 0;
-}
-
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1452,70 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
- struct btrfs_key *location)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
- struct extent_buffer *l;
u64 generation;
u32 blocksize;
- int ret = 0;
- int slot;
+ int ret;
- root = btrfs_alloc_root(fs_info);
- if (!root)
+ path = btrfs_alloc_path();
+ if (!path)
return ERR_PTR(-ENOMEM);
- if (location->offset == (u64)-1) {
- ret = find_and_setup_root(tree_root, fs_info,
- location->objectid, root);
- if (ret) {
- kfree(root);
- return ERR_PTR(ret);
- }
- goto out;
+
+ root = btrfs_alloc_root(fs_info);
+ if (!root) {
+ ret = -ENOMEM;
+ goto alloc_fail;
}
__setup_root(tree_root->nodesize, tree_root->leafsize,
tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, location->objectid);
+ root, fs_info, key->objectid);
- path = btrfs_alloc_path();
- if (!path) {
- kfree(root);
- return ERR_PTR(-ENOMEM);
- }
- ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
- if (ret == 0) {
- l = path->nodes[0];
- slot = path->slots[0];
- btrfs_read_root_item(l, slot, &root->root_item);
- memcpy(&root->root_key, location, sizeof(*location));
- }
- btrfs_free_path(path);
+ ret = btrfs_find_root(tree_root, key, path,
+ &root->root_item, &root->root_key);
if (ret) {
- kfree(root);
if (ret > 0)
ret = -ENOENT;
- return ERR_PTR(ret);
+ goto find_fail;
}
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
- if (!root->node || !extent_buffer_uptodate(root->node)) {
- ret = (!root->node) ? -ENOMEM : -EIO;
-
- free_extent_buffer(root->node);
- kfree(root);
- return ERR_PTR(ret);
+ if (!root->node) {
+ ret = -ENOMEM;
+ goto find_fail;
+ } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ ret = -EIO;
+ goto read_fail;
}
-
root->commit_root = btrfs_root_node(root);
out:
- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+ btrfs_free_path(path);
+ return root;
+
+read_fail:
+ free_extent_buffer(root->node);
+find_fail:
+ kfree(root);
+alloc_fail:
+ root = ERR_PTR(ret);
+ goto out;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+ struct btrfs_key *location)
+{
+ struct btrfs_root *root;
+
+ root = btrfs_read_tree_root(tree_root, location);
+ if (IS_ERR(root))
+ return root;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
root->ref_cows = 1;
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1523,6 +1502,66 @@ out:
return root;
}
+int btrfs_init_fs_root(struct btrfs_root *root)
+{
+ int ret;
+
+ root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+ root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+ GFP_NOFS);
+ if (!root->free_ino_pinned || !root->free_ino_ctl) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ btrfs_init_free_ino_ctl(root);
+ mutex_init(&root->fs_commit_mutex);
+ spin_lock_init(&root->cache_lock);
+ init_waitqueue_head(&root->cache_wait);
+
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ return 0;
+fail:
+ kfree(root->free_ino_ctl);
+ kfree(root->free_ino_pinned);
+ return ret;
+}
+
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
+{
+ struct btrfs_root *root;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_id);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ return root;
+}
+
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ return ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
+ if (ret == 0)
+ root->in_radix = 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
+
+ return ret;
+}
+
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location)
{
@@ -1543,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
return fs_info->quota_root ? fs_info->quota_root :
ERR_PTR(-ENOENT);
again:
- spin_lock(&fs_info->fs_roots_radix_lock);
- root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)location->objectid);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ root = btrfs_lookup_fs_root(fs_info, location->objectid);
if (root)
return root;
- root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+ root = btrfs_read_fs_root(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
- root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
- root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
- GFP_NOFS);
- if (!root->free_ino_pinned || !root->free_ino_ctl) {
- ret = -ENOMEM;
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ ret = -ENOENT;
goto fail;
}
- btrfs_init_free_ino_ctl(root);
- mutex_init(&root->fs_commit_mutex);
- spin_lock_init(&root->cache_lock);
- init_waitqueue_head(&root->cache_wait);
-
- ret = get_anon_bdev(&root->anon_dev);
+ ret = btrfs_init_fs_root(root);
if (ret)
goto fail;
- if (btrfs_root_refs(&root->root_item) == 0) {
- ret = -ENOENT;
- goto fail;
- }
-
ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
if (ret < 0)
goto fail;
if (ret == 0)
root->orphan_item_inserted = 1;
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
- if (ret)
- goto fail;
-
- spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_insert(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- root);
- if (ret == 0)
- root->in_radix = 1;
-
- spin_unlock(&fs_info->fs_roots_radix_lock);
- radix_tree_preload_end();
+ ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
if (ret == -EEXIST) {
free_fs_root(root);
@@ -1602,10 +1613,6 @@ again:
}
goto fail;
}
-
- ret = btrfs_find_dead_roots(fs_info->tree_root,
- root->root_key.objectid);
- WARN_ON(ret);
return root;
fail:
free_fs_root(root);
@@ -1677,21 +1684,37 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
struct btrfs_root *root = arg;
+ int again;
do {
- int again = 0;
-
- if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
- down_read_trylock(&root->fs_info->sb->s_umount)) {
- if (mutex_trylock(&root->fs_info->cleaner_mutex)) {
- btrfs_run_delayed_iputs(root);
- again = btrfs_clean_one_deleted_snapshot(root);
- mutex_unlock(&root->fs_info->cleaner_mutex);
- }
- btrfs_run_defrag_inodes(root->fs_info);
- up_read(&root->fs_info->sb->s_umount);
+ again = 0;
+
+ /* Make the cleaner go to sleep early. */
+ if (btrfs_need_cleaner_sleep(root))
+ goto sleep;
+
+ if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+ goto sleep;
+
+ /*
+ * Avoid the problem that we change the status of the fs
+ * during the above check and trylock.
+ */
+ if (btrfs_need_cleaner_sleep(root)) {
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ goto sleep;
}
+ btrfs_run_delayed_iputs(root);
+ again = btrfs_clean_one_deleted_snapshot(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ /*
+ * The defragger has dealt with the R/O remount and umount,
+ * needn't do anything special here.
+ */
+ btrfs_run_defrag_inodes(root->fs_info);
+sleep:
if (!try_to_freeze() && !again) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
@@ -1725,7 +1748,7 @@ static int transaction_kthread(void *arg)
}
now = get_seconds();
- if (!cur->blocked &&
+ if (cur->state < TRANS_STATE_BLOCKED &&
(now < cur->start_time || now - cur->start_time < 30)) {
spin_unlock(&root->fs_info->trans_lock);
delay = HZ * 5;
@@ -2035,11 +2058,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
list_del(&gang[0]->root_list);
if (gang[0]->in_radix) {
- btrfs_free_fs_root(fs_info, gang[0]);
+ btrfs_drop_and_free_fs_root(fs_info, gang[0]);
} else {
free_extent_buffer(gang[0]->node);
free_extent_buffer(gang[0]->commit_root);
- kfree(gang[0]);
+ btrfs_put_fs_root(gang[0]);
}
}
@@ -2050,7 +2073,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
if (!ret)
break;
for (i = 0; i < ret; i++)
- btrfs_free_fs_root(fs_info, gang[i]);
+ btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
}
@@ -2082,14 +2105,8 @@ int open_ctree(struct super_block *sb,
int backup_index = 0;
tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
- extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
- csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
- dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
- quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
-
- if (!tree_root || !extent_root || !csum_root ||
- !chunk_root || !dev_root || !quota_root) {
+ if (!tree_root || !chunk_root) {
err = -ENOMEM;
goto fail;
}
@@ -2132,9 +2149,9 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
- INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+ INIT_LIST_HEAD(&fs_info->delalloc_roots);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
- spin_lock_init(&fs_info->delalloc_lock);
+ spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
@@ -2170,7 +2187,6 @@ int open_ctree(struct super_block *sb,
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
- fs_info->trans_no_join = 0;
fs_info->free_chunk_space = 0;
fs_info->tree_mod_log = RB_ROOT;
@@ -2181,8 +2197,8 @@ int open_ctree(struct super_block *sb,
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
- INIT_LIST_HEAD(&fs_info->ordered_extents);
- spin_lock_init(&fs_info->ordered_extent_lock);
+ INIT_LIST_HEAD(&fs_info->ordered_roots);
+ spin_lock_init(&fs_info->ordered_root_lock);
fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
GFP_NOFS);
if (!fs_info->delayed_root) {
@@ -2275,6 +2291,7 @@ int open_ctree(struct super_block *sb,
fs_info->qgroup_seq = 1;
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
+ fs_info->qgroup_ulist = NULL;
mutex_init(&fs_info->qgroup_rescan_lock);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -2639,33 +2656,44 @@ retry_root_backup:
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_EXTENT_TREE_OBJECTID, extent_root);
- if (ret)
+ location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = 0;
+
+ extent_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(extent_root)) {
+ ret = PTR_ERR(extent_root);
goto recovery_tree_root;
+ }
extent_root->track_dirty = 1;
+ fs_info->extent_root = extent_root;
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_DEV_TREE_OBJECTID, dev_root);
- if (ret)
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
+ dev_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(dev_root)) {
+ ret = PTR_ERR(dev_root);
goto recovery_tree_root;
+ }
dev_root->track_dirty = 1;
+ fs_info->dev_root = dev_root;
+ btrfs_init_devices_late(fs_info);
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_CSUM_TREE_OBJECTID, csum_root);
- if (ret)
+ location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+ csum_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(csum_root)) {
+ ret = PTR_ERR(csum_root);
goto recovery_tree_root;
+ }
csum_root->track_dirty = 1;
+ fs_info->csum_root = csum_root;
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_QUOTA_TREE_OBJECTID, quota_root);
- if (ret) {
- kfree(quota_root);
- quota_root = fs_info->quota_root = NULL;
- } else {
+ location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+ quota_root = btrfs_read_tree_root(tree_root, &location);
+ if (!IS_ERR(quota_root)) {
quota_root->track_dirty = 1;
fs_info->quota_enabled = 1;
fs_info->pending_quota_state = 1;
+ fs_info->quota_root = quota_root;
}
fs_info->generation = generation;
@@ -2818,11 +2846,9 @@ retry_root_backup:
location.objectid = BTRFS_FS_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = (u64)-1;
+ location.offset = 0;
fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
- if (!fs_info->fs_root)
- goto fail_qgroup;
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
goto fail_qgroup;
@@ -2854,6 +2880,8 @@ retry_root_backup:
return ret;
}
+ btrfs_qgroup_rescan_resume(fs_info);
+
return 0;
fail_qgroup:
@@ -3259,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
BTRFS_BLOCK_GROUP_RAID10)) {
num_tolerated_disk_barrier_failures = 1;
} else if (flags &
- BTRFS_BLOCK_GROUP_RAID5) {
+ BTRFS_BLOCK_GROUP_RAID6) {
num_tolerated_disk_barrier_failures = 2;
}
}
@@ -3367,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
return ret;
}
-void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+/* Drop a fs root from the radix tree and free it. */
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
{
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3398,7 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
kfree(root->free_ino_ctl);
kfree(root->free_ino_pinned);
kfree(root->name);
- kfree(root);
+ btrfs_put_fs_root(root);
+}
+
+void btrfs_free_fs_root(struct btrfs_root *root)
+{
+ free_fs_root(root);
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3654,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
INIT_LIST_HEAD(&splice);
mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
list_splice_init(&t->ordered_operations, &splice);
while (!list_empty(&splice)) {
@@ -3662,14 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
ordered_operations);
list_del_init(&btrfs_inode->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
}
@@ -3677,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
/*
* This will just short circuit the ordered completion stuff which will
* make sure the ordered extent gets properly cleaned up.
*/
- list_for_each_entry(ordered, &root->fs_info->ordered_extents,
+ list_for_each_entry(ordered, &root->ordered_extents,
root_extent_list)
set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->ordered_extent_lock);
+}
+
+static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ list_del_init(&root->ordered_root);
+
+ btrfs_destroy_ordered_extents(root);
+
+ cond_resched_lock(&fs_info->ordered_root_lock);
+ }
+ spin_unlock(&fs_info->ordered_root_lock);
}
int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -3707,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
while ((node = rb_first(&delayed_refs->root)) != NULL) {
struct btrfs_delayed_ref_head *head = NULL;
+ bool pin_bytes = false;
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
atomic_set(&ref->refs, 1);
@@ -3727,8 +3784,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
}
if (head->must_insert_reserved)
- btrfs_pin_extent(root, ref->bytenr,
- ref->num_bytes, 1);
+ pin_bytes = true;
btrfs_free_delayed_extent_op(head->extent_op);
delayed_refs->num_heads--;
if (list_empty(&head->cluster))
@@ -3739,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
- if (head)
- mutex_unlock(&head->mutex);
spin_unlock(&delayed_refs->lock);
+ if (head) {
+ if (pin_bytes)
+ btrfs_pin_extent(root, ref->bytenr,
+ ref->num_bytes, 1);
+ mutex_unlock(&head->mutex);
+ }
btrfs_put_delayed_ref(ref);
cond_resched();
@@ -3778,24 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
INIT_LIST_HEAD(&splice);
- spin_lock(&root->fs_info->delalloc_lock);
- list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
- btrfs_inode = list_entry(splice.next, struct btrfs_inode,
- delalloc_inodes);
+ btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+ delalloc_inodes);
list_del_init(&btrfs_inode->delalloc_inodes);
clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&btrfs_inode->runtime_flags);
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
- spin_lock(&root->fs_info->delalloc_lock);
+ spin_lock(&root->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ list_del_init(&root->delalloc_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ btrfs_destroy_delalloc_inodes(root);
+ btrfs_put_fs_root(root);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ }
+ spin_unlock(&fs_info->delalloc_root_lock);
}
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3879,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
cur_trans->dirty_pages.dirty_bytes);
- /* FIXME: cleanup wait for commit */
- cur_trans->in_commit = 1;
- cur_trans->blocked = 1;
+ cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&root->fs_info->transaction_blocked_wait);
btrfs_evict_pending_snapshots(cur_trans);
- cur_trans->blocked = 0;
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
- cur_trans->commit_done = 1;
- wake_up(&cur_trans->commit_wait);
-
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
@@ -3900,6 +3980,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
btrfs_destroy_pinned_extent(root,
root->fs_info->pinned_extents);
+ cur_trans->state =TRANS_STATE_COMPLETED;
+ wake_up(&cur_trans->commit_wait);
+
/*
memset(cur_trans, 0, sizeof(*cur_trans));
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@ -3915,7 +3998,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
spin_lock(&root->fs_info->trans_lock);
list_splice_init(&root->fs_info->trans_list, &list);
- root->fs_info->trans_no_join = 1;
+ root->fs_info->running_transaction = NULL;
spin_unlock(&root->fs_info->trans_lock);
while (!list_empty(&list)) {
@@ -3923,37 +4006,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
btrfs_destroy_ordered_operations(t, root);
- btrfs_destroy_ordered_extents(root);
+ btrfs_destroy_all_ordered_extents(root->fs_info);
btrfs_destroy_delayed_refs(t, root);
- /* FIXME: cleanup wait for commit */
- t->in_commit = 1;
- t->blocked = 1;
+ /*
+ * FIXME: cleanup wait for commit
+ * We needn't acquire the lock here, because we are during
+ * the umount, there is no other task which will change it.
+ */
+ t->state = TRANS_STATE_COMMIT_START;
smp_mb();
if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
wake_up(&root->fs_info->transaction_blocked_wait);
btrfs_evict_pending_snapshots(t);
- t->blocked = 0;
+ t->state = TRANS_STATE_UNBLOCKED;
smp_mb();
if (waitqueue_active(&root->fs_info->transaction_wait))
wake_up(&root->fs_info->transaction_wait);
- t->commit_done = 1;
- smp_mb();
- if (waitqueue_active(&t->commit_wait))
- wake_up(&t->commit_wait);
-
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
- btrfs_destroy_delalloc_inodes(root);
-
- spin_lock(&root->fs_info->trans_lock);
- root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->trans_lock);
+ btrfs_destroy_all_delalloc_inodes(root->fs_info);
btrfs_destroy_marked_extents(root, &t->dirty_pages,
EXTENT_DIRTY);
@@ -3961,15 +4038,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
btrfs_destroy_pinned_extent(root,
root->fs_info->pinned_extents);
+ t->state = TRANS_STATE_COMPLETED;
+ smp_mb();
+ if (waitqueue_active(&t->commit_wait))
+ wake_up(&t->commit_wait);
+
atomic_set(&t->use_count, 0);
list_del_init(&t->list);
memset(t, 0, sizeof(*t));
kmem_cache_free(btrfs_transaction_cachep, t);
}
- spin_lock(&root->fs_info->trans_lock);
- root->fs_info->trans_no_join = 0;
- spin_unlock(&root->fs_info->trans_lock);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
return 0;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index be69ce1b07a2..b71acd6e1e5b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
- struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+ struct btrfs_key *location);
+int btrfs_init_fs_root(struct btrfs_root *root);
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_root *root);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
-void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_root *root);
+
+/*
+ * This function is used to grab the root, and avoid it is freed when we
+ * access it. But it doesn't ensure that the tree is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ * fs_info->subvol_srcu
+ */
+static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+{
+ if (atomic_inc_not_zero(&root->refs))
+ return root;
+ return NULL;
+}
+
+static inline void btrfs_put_fs_root(struct btrfs_root *root)
+{
+ if (atomic_dec_and_test(&root->refs))
+ kfree(root);
+}
+
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 81ee29eeb7ca..4b8691607373 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
goto fail;
}
- if (btrfs_root_refs(&root->root_item) == 0) {
- err = -ENOENT;
- goto fail;
- }
-
key.objectid = objectid;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df472ab1b5ac..1204c8ef6f32 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,6 +24,7 @@
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <linux/percpu_counter.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
@@ -2526,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
return 0;
}
+static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+{
+ u64 num_bytes;
+
+ num_bytes = heads * (sizeof(struct btrfs_extent_item) +
+ sizeof(struct btrfs_extent_inline_ref));
+ if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+ num_bytes += heads * sizeof(struct btrfs_tree_block_info);
+
+ /*
+ * We don't ever fill up leaves all the way so multiply by 2 just to be
+ * closer to what we're really going to want to ouse.
+ */
+ return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *global_rsv;
+ u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+ u64 num_bytes;
+ int ret = 0;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ num_heads = heads_to_leaves(root, num_heads);
+ if (num_heads > 1)
+ num_bytes += (num_heads - 1) * root->leafsize;
+ num_bytes <<= 1;
+ global_rsv = &root->fs_info->global_block_rsv;
+
+ /*
+ * If we can't allocate any more chunks lets make sure we have _lots_ of
+ * wiggle room since running delayed refs can create more delayed refs.
+ */
+ if (global_rsv->space_info->full)
+ num_bytes <<= 1;
+
+ spin_lock(&global_rsv->lock);
+ if (global_rsv->reserved <= num_bytes)
+ ret = 1;
+ spin_unlock(&global_rsv->lock);
+ return ret;
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2573,7 +2619,8 @@ progress:
old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
if (old) {
DEFINE_WAIT(__wait);
- if (delayed_refs->num_entries < 16348)
+ if (delayed_refs->flushing ||
+ !btrfs_should_throttle_delayed_refs(trans, root))
return 0;
prepare_to_wait(&delayed_refs->wait, &__wait,
@@ -2608,7 +2655,7 @@ again:
while (1) {
if (!(run_all || run_most) &&
- delayed_refs->num_heads_ready < 64)
+ !btrfs_should_throttle_delayed_refs(trans, root))
break;
/*
@@ -2629,6 +2676,7 @@ again:
spin_unlock(&delayed_refs->lock);
btrfs_abort_transaction(trans, root, ret);
atomic_dec(&delayed_refs->procs_running_refs);
+ wake_up(&delayed_refs->wait);
return ret;
}
@@ -3310,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
struct btrfs_space_info *found;
int i;
int factor;
+ int ret;
if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -3333,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
+ ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+ if (ret) {
+ kfree(found);
+ return ret;
+ }
+
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
@@ -3565,10 +3620,11 @@ alloc:
}
/*
- * If we have less pinned bytes than we want to allocate then
- * don't bother committing the transaction, it won't help us.
+ * If we don't have enough pinned space to deal with this
+ * allocation don't bother committing the transaction.
*/
- if (data_sinfo->bytes_pinned < bytes)
+ if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
+ bytes) < 0)
committed = 1;
spin_unlock(&data_sinfo->lock);
@@ -3577,6 +3633,7 @@ commit_trans:
if (!committed &&
!atomic_read(&root->fs_info->open_ioctl_trans)) {
committed = 1;
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -3609,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
+ WARN_ON(data_sinfo->bytes_may_use < bytes);
data_sinfo->bytes_may_use -= bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 0);
@@ -3886,12 +3944,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
unsigned long nr_pages)
{
struct super_block *sb = root->fs_info->sb;
- int started;
- /* If we can not start writeback, just sync all the delalloc file. */
- started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
- WB_REASON_FS_FREE_SPACE);
- if (!started) {
+ if (down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ } else {
/*
* We needn't worry the filesystem going from r/w to r/o though
* we don't acquire ->s_umount mutex, because the filesystem
@@ -3899,9 +3956,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
* the filesystem is readonly(all dirty pages are written to
* the disk).
*/
- btrfs_start_delalloc_inodes(root, 0);
+ btrfs_start_all_delalloc_inodes(root->fs_info, 0);
if (!current->journal_info)
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
}
}
@@ -3931,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
if (delalloc_bytes == 0) {
if (trans)
return;
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
return;
}
@@ -3959,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
@@ -3997,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
/* See if there is enough pinned space to make this reservation */
spin_lock(&space_info->lock);
- if (space_info->bytes_pinned >= bytes) {
+ if (percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes) >= 0) {
spin_unlock(&space_info->lock);
goto commit;
}
@@ -4012,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
spin_lock(&space_info->lock);
spin_lock(&delayed_rsv->lock);
- if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
+ if (percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes - delayed_rsv->size) >= 0) {
spin_unlock(&delayed_rsv->lock);
spin_unlock(&space_info->lock);
return -ENOSPC;
@@ -4297,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 min_bytes;
+
+ if (global_rsv->space_info != dest->space_info)
+ return -ENOSPC;
+
+ spin_lock(&global_rsv->lock);
+ min_bytes = div_factor(global_rsv->size, min_factor);
+ if (global_rsv->reserved < min_bytes + num_bytes) {
+ spin_unlock(&global_rsv->lock);
+ return -ENOSPC;
+ }
+ global_rsv->reserved -= num_bytes;
+ if (global_rsv->reserved < global_rsv->size)
+ global_rsv->full = 0;
+ spin_unlock(&global_rsv->lock);
+
+ block_rsv_add_bytes(dest, num_bytes, 1);
+ return 0;
+}
+
static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5030,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root,
int factor;
/* block accounting for super block */
- spin_lock(&info->delalloc_lock);
+ spin_lock(&info->delalloc_root_lock);
old_val = btrfs_super_bytes_used(info->super_copy);
if (alloc)
old_val += num_bytes;
else
old_val -= num_bytes;
btrfs_set_super_bytes_used(info->super_copy, old_val);
- spin_unlock(&info->delalloc_lock);
+ spin_unlock(&info->delalloc_root_lock);
while (total) {
cache = btrfs_lookup_block_group(info, bytenr);
@@ -5189,6 +5273,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
return ret;
}
+static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_caching_control *caching_ctl;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, start);
+ if (!block_group)
+ return -EINVAL;
+
+ cache_block_group(block_group, 0);
+ caching_ctl = get_caching_control(block_group);
+
+ if (!caching_ctl) {
+ /* Logic error */
+ BUG_ON(!block_group_cache_done(block_group));
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+ } else {
+ mutex_lock(&caching_ctl->mutex);
+
+ if (start >= caching_ctl->progress) {
+ ret = add_excluded_extent(root, start, num_bytes);
+ } else if (start + num_bytes <= caching_ctl->progress) {
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ } else {
+ num_bytes = caching_ctl->progress - start;
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ if (ret)
+ goto out_lock;
+
+ num_bytes = (start + num_bytes) -
+ caching_ctl->progress;
+ start = caching_ctl->progress;
+ ret = add_excluded_extent(root, start, num_bytes);
+ }
+out_lock:
+ mutex_unlock(&caching_ctl->mutex);
+ put_caching_control(caching_ctl);
+ }
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+int btrfs_exclude_logged_extents(struct btrfs_root *log,
+ struct extent_buffer *eb)
+{
+ struct btrfs_file_extent_item *item;
+ struct btrfs_key key;
+ int found_type;
+ int i;
+
+ if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
+ return 0;
+
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(eb, item);
+ if (found_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+ continue;
+ key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+ key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+ __exclude_logged_extent(log, key.objectid, key.offset);
+ }
+
+ return 0;
+}
+
/**
* btrfs_update_reserved_bytes - update the block_group and space info counters
* @cache: The cache we are manipulating
@@ -5251,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
+ struct btrfs_space_info *space_info;
down_write(&fs_info->extent_commit_sem);
@@ -5273,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
up_write(&fs_info->extent_commit_sem);
+ list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
+ percpu_counter_set(&space_info->total_bytes_pinned, 0);
+
update_global_block_rsv(fs_info);
}
@@ -5370,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
return 0;
}
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
+ u64 owner, u64 root_objectid)
+{
+ struct btrfs_space_info *space_info;
+ u64 flags;
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ } else {
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ }
+
+ space_info = __find_space_info(fs_info, flags);
+ BUG_ON(!space_info); /* Logic bug */
+ percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
+}
+
+
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -5590,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
}
+ add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
+ root_objectid);
} else {
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
@@ -5713,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 parent, int last_ref)
{
struct btrfs_block_group_cache *cache = NULL;
+ int pin = 1;
int ret;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5745,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_add_free_space(cache, buf->start, buf->len);
btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+ pin = 0;
}
out:
+ if (pin)
+ add_pinned_bytes(root->fs_info, buf->len,
+ btrfs_header_level(buf),
+ root->root_key.objectid);
+
/*
* Deleting the buffer, clear the corrupt flag since it doesn't matter
* anymore.
@@ -5763,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
+ add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
+
/*
* tree log blocks never actually go into the extent allocation
* tree, just update pinning info and exit early.
@@ -6560,52 +6754,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_block_group_cache *block_group;
- struct btrfs_caching_control *caching_ctl;
- u64 start = ins->objectid;
- u64 num_bytes = ins->offset;
-
- block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- cache_block_group(block_group, 0);
- caching_ctl = get_caching_control(block_group);
-
- if (!caching_ctl) {
- BUG_ON(!block_group_cache_done(block_group));
- ret = btrfs_remove_free_space(block_group, start, num_bytes);
- if (ret)
- goto out;
- } else {
- mutex_lock(&caching_ctl->mutex);
-
- if (start >= caching_ctl->progress) {
- ret = add_excluded_extent(root, start, num_bytes);
- } else if (start + num_bytes <= caching_ctl->progress) {
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- } else {
- num_bytes = caching_ctl->progress - start;
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- if (ret)
- goto out_lock;
- start = caching_ctl->progress;
- num_bytes = ins->objectid + ins->offset -
- caching_ctl->progress;
- ret = add_excluded_extent(root, start, num_bytes);
- }
-out_lock:
- mutex_unlock(&caching_ctl->mutex);
- put_caching_control(caching_ctl);
+ /*
+ * Mixed block groups will exclude before processing the log so we only
+ * need to do the exlude dance if this fs isn't mixed.
+ */
+ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
+ ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
if (ret)
- goto out;
+ return ret;
}
+ block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+ if (!block_group)
+ return -EINVAL;
+
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
RESERVE_ALLOC_NO_ACCOUNT);
BUG_ON(ret); /* logic error */
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
-out:
btrfs_put_block_group(block_group);
return ret;
}
@@ -7298,6 +7466,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int err = 0;
int ret;
int level;
+ bool root_dropped = false;
path = btrfs_alloc_path();
if (!path) {
@@ -7355,6 +7524,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
while (1) {
btrfs_tree_lock(path->nodes[level]);
btrfs_set_lock_blocking(path->nodes[level]);
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, root,
path->nodes[level]->start,
@@ -7370,6 +7540,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
break;
btrfs_tree_unlock(path->nodes[level]);
+ path->locks[level] = 0;
WARN_ON(wc->refs[level] != 1);
level--;
}
@@ -7384,11 +7555,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
- if (!for_reloc && btrfs_fs_closing(root->fs_info)) {
- pr_debug("btrfs: drop snapshot early exit\n");
- err = -EAGAIN;
- goto out_end_trans;
- }
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
@@ -7416,7 +7582,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
BUG_ON(wc->level == 0);
- if (btrfs_should_end_transaction(trans, tree_root)) {
+ if (btrfs_should_end_transaction(trans, tree_root) ||
+ (!for_reloc && btrfs_need_cleaner_sleep(root))) {
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
@@ -7427,6 +7594,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
btrfs_end_transaction_throttle(trans, tree_root);
+ if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+ pr_debug("btrfs: drop snapshot early exit\n");
+ err = -EAGAIN;
+ goto out_free;
+ }
+
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
@@ -7447,8 +7620,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
- NULL, NULL);
+ ret = btrfs_find_root(tree_root, &root->root_key, path,
+ NULL, NULL);
if (ret < 0) {
btrfs_abort_transaction(trans, tree_root, ret);
err = ret;
@@ -7465,18 +7638,28 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (root->in_radix) {
- btrfs_free_fs_root(tree_root->fs_info, root);
+ btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
} else {
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
- kfree(root);
+ btrfs_put_fs_root(root);
}
+ root_dropped = true;
out_end_trans:
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
btrfs_free_path(path);
out:
+ /*
+ * So if we need to stop dropping the snapshot for whatever reason we
+ * need to make sure to add it back to the dead root list so that we
+ * keep trying to do the work later. This also cleans up roots if we
+ * don't have it in the radix (like when we recover after a power fail
+ * or unmount) so we don't leak memory.
+ */
+ if (root_dropped == false)
+ btrfs_add_dead_root(root);
if (err)
btrfs_std_error(root->fs_info, err);
return err;
@@ -7782,6 +7965,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
struct btrfs_space_info *space_info;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
+ struct btrfs_trans_handle *trans;
u64 min_free;
u64 dev_min = 1;
u64 dev_nr = 0;
@@ -7868,6 +8052,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
do_div(min_free, dev_min);
}
+ /* We need to do this so that we can look at pending chunks */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 dev_offset;
@@ -7878,7 +8069,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
if (device->total_bytes > device->bytes_used + min_free &&
!device->is_tgtdev_for_dev_replace) {
- ret = find_free_dev_extent(device, min_free,
+ ret = find_free_dev_extent(trans, device, min_free,
&dev_offset, NULL);
if (!ret)
dev_nr++;
@@ -7890,6 +8081,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
}
}
mutex_unlock(&root->fs_info->chunk_mutex);
+ btrfs_end_transaction(trans, root);
out:
btrfs_put_block_group(block_group);
return ret;
@@ -8032,6 +8224,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
dump_space_info(space_info, 0, 0);
}
}
+ percpu_counter_destroy(&space_info->total_bytes_pinned);
list_del(&space_info->list);
kfree(space_info);
}
@@ -8254,6 +8447,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
sizeof(item));
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+ ret = btrfs_finish_chunk_alloc(trans, extent_root,
+ key.objectid, key.offset);
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
}
}
@@ -8591,8 +8788,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!block_group_cache_done(cache)) {
ret = cache_block_group(cache, 0);
- if (!ret)
- wait_block_group_cache_done(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ ret = wait_block_group_cache_done(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
}
ret = btrfs_trim_block_group(cache,
&group_trimmed,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6bca9472f313..583d98bd065e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -77,10 +77,29 @@ void btrfs_leak_debug_check(void)
kmem_cache_free(extent_buffer_cache, eb);
}
}
+
+#define btrfs_debug_check_extent_io_range(inode, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct inode *inode, u64 start, u64 end)
+{
+ u64 isize = i_size_read(inode);
+
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ printk_ratelimited(KERN_DEBUG
+ "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+ caller,
+ (unsigned long long)btrfs_ino(inode),
+ (unsigned long long)isize,
+ (unsigned long long)start,
+ (unsigned long long)end);
+ }
+}
#else
#define btrfs_leak_debug_add(new, head) do {} while (0)
#define btrfs_leak_debug_del(entry) do {} while (0)
#define btrfs_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
#define BUFFER_LRU_MAX 64
@@ -522,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int err;
int clear = 0;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
if (delete)
bits |= ~EXTENT_CTLBITS;
bits |= EXTENT_FIRST_DELALLOC;
@@ -677,6 +701,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *state;
struct rb_node *node;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
spin_lock(&tree->lock);
again:
while (1) {
@@ -769,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_start;
u64 last_end;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
@@ -989,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_start;
u64 last_end;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
@@ -2450,11 +2480,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct extent_state *cached = NULL;
struct extent_state *state;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct inode *inode = page->mapping->host;
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
"mirror=%lu\n", (u64)bio->bi_sector, err,
io_bio->mirror_num);
- tree = &BTRFS_I(page->mapping->host)->io_tree;
+ tree = &BTRFS_I(inode)->io_tree;
/* We always issue full-page reads, but if some block
* in a page fails to read, blk_update_request() will
@@ -2528,6 +2559,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
if (uptodate) {
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+
+ /* Zero out the end if this page straddles i_size */
+ offset = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index == end_index && offset)
+ zero_user_segment(page, offset, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 41fb81e7ec53..3b8c4e26e1da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
#define EXTENT_FIRST_DELALLOC (1 << 12)
#define EXTENT_NEED_WAIT (1 << 13)
#define EXTENT_DAMAGED (1 << 14)
+#define EXTENT_NORESERVE (1 << 15)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b193bf324a41..a7bfc9541803 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,8 +34,7 @@
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
- sizeof(struct btrfs_sector_sum) * \
- (r)->sectorsize - (r)->sectorsize)
+ sizeof(u32) * (r)->sectorsize)
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_csum_item *item;
LIST_HEAD(tmplist);
unsigned long offset;
@@ -368,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_csum_item);
while (start < csum_end) {
size = min_t(size_t, csum_end - start,
- MAX_ORDERED_SUM_BYTES(root));
+ MAX_ORDERED_SUM_BYTES(root));
sums = kzalloc(btrfs_ordered_sum_size(root, size),
- GFP_NOFS);
+ GFP_NOFS);
if (!sums) {
ret = -ENOMEM;
goto fail;
}
- sector_sum = sums->sums;
sums->bytenr = start;
- sums->len = size;
+ sums->len = (int)size;
offset = (start - key.offset) >>
root->fs_info->sb->s_blocksize_bits;
offset *= csum_size;
+ size >>= root->fs_info->sb->s_blocksize_bits;
- while (size > 0) {
- read_extent_buffer(path->nodes[0],
- &sector_sum->sum,
- ((unsigned long)item) +
- offset, csum_size);
- sector_sum->bytenr = start;
-
- size -= root->sectorsize;
- start += root->sectorsize;
- offset += csum_size;
- sector_sum++;
- }
+ read_extent_buffer(path->nodes[0],
+ sums->sums,
+ ((unsigned long)item) + offset,
+ csum_size * size);
+
+ start += root->sectorsize * size;
list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
@@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig)
{
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_ordered_extent *ordered;
char *data;
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
+ int index;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
u64 offset;
- u64 disk_bytenr;
WARN_ON(bio->bi_vcnt <= 0);
sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
if (!sums)
return -ENOMEM;
- sector_sum = sums->sums;
- disk_bytenr = (u64)bio->bi_sector << 9;
sums->len = bio->bi_size;
INIT_LIST_HEAD(&sums->list);
@@ -444,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
ordered = btrfs_lookup_ordered_extent(inode, offset);
BUG_ON(!ordered); /* Logic error */
- sums->bytenr = ordered->start;
+ sums->bytenr = (u64)bio->bi_sector << 9;
+ index = 0;
while (bio_index < bio->bi_vcnt) {
if (!contig)
@@ -463,28 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
GFP_NOFS);
BUG_ON(!sums); /* -ENOMEM */
- sector_sum = sums->sums;
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode, offset);
BUG_ON(!ordered); /* Logic error */
- sums->bytenr = ordered->start;
+ sums->bytenr = ((u64)bio->bi_sector << 9) +
+ total_bytes;
+ index = 0;
}
data = kmap_atomic(bvec->bv_page);
- sector_sum->sum = ~(u32)0;
- sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset,
- sector_sum->sum,
- bvec->bv_len);
+ sums->sums[index] = ~(u32)0;
+ sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
+ sums->sums[index],
+ bvec->bv_len);
kunmap_atomic(data);
- btrfs_csum_final(sector_sum->sum,
- (char *)&sector_sum->sum);
- sector_sum->bytenr = disk_bytenr;
+ btrfs_csum_final(sums->sums[index],
+ (char *)(sums->sums + index));
- sector_sum++;
bio_index++;
+ index++;
total_bytes += bvec->bv_len;
this_sum_bytes += bvec->bv_len;
- disk_bytenr += bvec->bv_len;
offset += bvec->bv_len;
bvec++;
}
@@ -672,62 +661,46 @@ out:
return ret;
}
-static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
- struct btrfs_sector_sum *sector_sum,
- u64 total_bytes, u64 sectorsize)
-{
- u64 tmp = sectorsize;
- u64 next_sector = sector_sum->bytenr;
- struct btrfs_sector_sum *next = sector_sum + 1;
-
- while ((tmp + total_bytes) < sums->len) {
- if (next_sector + sectorsize != next->bytenr)
- break;
- tmp += sectorsize;
- next_sector = next->bytenr;
- next++;
- }
- return tmp;
-}
-
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
{
- u64 bytenr;
- int ret;
struct btrfs_key file_key;
struct btrfs_key found_key;
- u64 next_offset;
- u64 total_bytes = 0;
- int found_next;
struct btrfs_path *path;
struct btrfs_csum_item *item;
struct btrfs_csum_item *item_end;
struct extent_buffer *leaf = NULL;
+ u64 next_offset;
+ u64 total_bytes = 0;
u64 csum_offset;
- struct btrfs_sector_sum *sector_sum;
+ u64 bytenr;
u32 nritems;
u32 ins_size;
+ int index = 0;
+ int found_next;
+ int ret;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-
- sector_sum = sums->sums;
again:
next_offset = (u64)-1;
found_next = 0;
+ bytenr = sums->bytenr + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = sector_sum->bytenr;
- bytenr = sector_sum->bytenr;
+ file_key.offset = bytenr;
btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
- item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
+ item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
- leaf = path->nodes[0];
ret = 0;
+ leaf = path->nodes[0];
+ item_end = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((char *)item_end +
+ btrfs_item_size_nr(leaf, path->slots[0]));
goto found;
}
ret = PTR_ERR(item);
@@ -807,8 +780,7 @@ again:
free_space = btrfs_leaf_free_space(root, leaf) -
sizeof(struct btrfs_item) - csum_size;
- tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
- root->sectorsize);
+ tmp = sums->len - total_bytes;
tmp >>= root->fs_info->sb->s_blocksize_bits;
WARN_ON(tmp < 1);
@@ -822,6 +794,7 @@ again:
diff *= csum_size;
btrfs_extend_item(root, path, diff);
+ ret = 0;
goto csum;
}
@@ -831,8 +804,7 @@ insert:
if (found_next) {
u64 tmp;
- tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
- root->sectorsize);
+ tmp = sums->len - total_bytes;
tmp >>= root->fs_info->sb->s_blocksize_bits;
tmp = min(tmp, (next_offset - file_key.offset) >>
root->fs_info->sb->s_blocksize_bits);
@@ -853,31 +825,25 @@ insert:
WARN_ON(1);
goto fail_unlock;
}
-csum:
leaf = path->nodes[0];
+csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
- ret = 0;
+ item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+ btrfs_item_size_nr(leaf, path->slots[0]));
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
found:
- item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
- item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
- btrfs_item_size_nr(leaf, path->slots[0]));
-next_sector:
-
- write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
-
- total_bytes += root->sectorsize;
- sector_sum++;
- if (total_bytes < sums->len) {
- item = (struct btrfs_csum_item *)((char *)item +
- csum_size);
- if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
- sector_sum->bytenr) {
- bytenr = sector_sum->bytenr;
- goto next_sector;
- }
- }
+ ins_size = (u32)(sums->len - total_bytes) >>
+ root->fs_info->sb->s_blocksize_bits;
+ ins_size *= csum_size;
+ ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
+ ins_size);
+ write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
+ ins_size);
+
+ ins_size /= csum_size;
+ total_bytes += ins_size * root->sectorsize;
+ index += ins_size;
btrfs_mark_buffer_dirty(path->nodes[0]);
if (total_bytes < sums->len) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 89da56a58b63..a005fe2c072a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(inode_root);
goto cleanup;
}
- if (btrfs_root_refs(&inode_root->root_item) == 0) {
- ret = -ENOENT;
- goto cleanup;
- }
key.objectid = defrag->ino;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -1317,6 +1313,56 @@ fail:
}
+static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_extent *ordered;
+ u64 lockstart, lockend;
+ u64 num_bytes;
+ int ret;
+
+ lockstart = round_down(pos, root->sectorsize);
+ lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+
+ while (1) {
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (!ordered) {
+ break;
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ return PTR_ERR(trans);
+ }
+
+ num_bytes = lockend - lockstart + 1;
+ ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
+ NULL);
+ btrfs_end_transaction(trans, root);
+ if (ret <= 0) {
+ ret = 0;
+ } else {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+ NULL, GFP_NOFS);
+ *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+ }
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+
+ return ret;
+}
+
static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct iov_iter *i,
loff_t pos)
@@ -1324,10 +1370,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
+ u64 release_bytes = 0;
unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
+ bool only_release_metadata = false;
bool force_page_uptodate = false;
nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1348,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
offset);
size_t num_pages = (write_bytes + offset +
PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1362,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
break;
}
- ret = btrfs_delalloc_reserve_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = btrfs_check_data_free_space(inode, reserve_bytes);
+ if (ret == -ENOSPC &&
+ (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC))) {
+ ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret > 0) {
+ only_release_metadata = true;
+ /*
+ * our prealloc extent may be smaller than
+ * write_bytes, so scale down.
+ */
+ num_pages = (write_bytes + offset +
+ PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = 0;
+ } else {
+ ret = -ENOSPC;
+ }
+ }
+
if (ret)
break;
+ ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+ if (ret) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode,
+ reserve_bytes);
+ break;
+ }
+
+ release_bytes = reserve_bytes;
+
/*
* This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the
@@ -1375,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
ret = prepare_pages(root, file, pages, num_pages,
pos, first_index, write_bytes,
force_page_uptodate);
- if (ret) {
- btrfs_delalloc_release_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
+ if (ret)
break;
- }
copied = btrfs_copy_from_user(pos, num_pages,
write_bytes, pages, i);
@@ -1409,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
* managed to copy.
*/
if (num_pages > dirty_pages) {
+ release_bytes = (num_pages - dirty_pages) <<
+ PAGE_CACHE_SHIFT;
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
- btrfs_delalloc_release_space(inode,
- (num_pages - dirty_pages) <<
- PAGE_CACHE_SHIFT);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode,
+ release_bytes);
+ else
+ btrfs_delalloc_release_space(inode,
+ release_bytes);
}
+ release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
if (copied > 0) {
ret = btrfs_dirty_pages(root, inode, pages,
dirty_pages, pos, copied,
NULL);
if (ret) {
- btrfs_delalloc_release_space(inode,
- dirty_pages << PAGE_CACHE_SHIFT);
btrfs_drop_pages(pages, num_pages);
break;
}
}
+ release_bytes = 0;
btrfs_drop_pages(pages, num_pages);
+ if (only_release_metadata && copied > 0) {
+ u64 lockstart = round_down(pos, root->sectorsize);
+ u64 lockend = lockstart +
+ (dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+ set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_NORESERVE, NULL,
+ NULL, GFP_NOFS);
+ only_release_metadata = false;
+ }
+
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1445,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
kfree(pages);
+ if (release_bytes) {
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, release_bytes);
+ else
+ btrfs_delalloc_release_space(inode, release_bytes);
+ }
+
return num_written ? num_written : ret;
}
@@ -2175,12 +2274,6 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out_reserve_fail;
}
- /*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
- */
- btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
-
mutex_lock(&inode->i_mutex);
ret = inode_newsize_ok(inode, alloc_end);
if (ret)
@@ -2191,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode,
alloc_start);
if (ret)
goto out;
+ } else {
+ /*
+ * If we are fallocating from the end of the file onward we
+ * need to zero out the end of the page if i_size lands in the
+ * middle of a page.
+ */
+ ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+ if (ret)
+ goto out;
}
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2750b5023526..b21a3cd667d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,7 +213,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
else
ret = 0;
spin_unlock(&rsv->lock);
- return 0;
+ return ret;
}
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
@@ -3150,6 +3150,8 @@ again:
return 0;
}
+#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
+
/*
* This test just does basic sanity checking, making sure we can add an exten
* entry and remove space from either end and the middle, and make sure we can
@@ -3159,63 +3161,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
{
int ret = 0;
- printk(KERN_ERR "Running extent only tests\n");
+ test_msg("Running extent only tests\n");
/* First just make sure we can remove an entire entry */
ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error adding initial extents %d\n", ret);
+ test_msg("Error adding initial extents %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error removing extent %d\n", ret);
+ test_msg("Error removing extent %d\n", ret);
return ret;
}
if (check_exists(cache, 0, 4 * 1024 * 1024)) {
- printk(KERN_ERR "Full remove left some lingering space\n");
+ test_msg("Full remove left some lingering space\n");
return -1;
}
/* Ok edge and middle cases now */
ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error adding half extent %d\n", ret);
+ test_msg("Error adding half extent %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error removing tail end %d\n", ret);
+ test_msg("Error removing tail end %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error removing front end %d\n", ret);
+ test_msg("Error removing front end %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
if (ret) {
- printk(KERN_ERR "Error removing middle piece %d\n", ret);
+ test_msg("Error removing middle piece %d\n", ret);
return ret;
}
if (check_exists(cache, 0, 1 * 1024 * 1024)) {
- printk(KERN_ERR "Still have space at the front\n");
+ test_msg("Still have space at the front\n");
return -1;
}
if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
- printk(KERN_ERR "Still have space in the middle\n");
+ test_msg("Still have space in the middle\n");
return -1;
}
if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
- printk(KERN_ERR "Still have space at the end\n");
+ test_msg("Still have space at the end\n");
return -1;
}
@@ -3230,34 +3232,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
u64 next_bitmap_offset;
int ret;
- printk(KERN_ERR "Running bitmap only tests\n");
+ test_msg("Running bitmap only tests\n");
ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret);
+ test_msg("Couldn't create a bitmap entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error removing bitmap full range %d\n", ret);
+ test_msg("Error removing bitmap full range %d\n", ret);
return ret;
}
if (check_exists(cache, 0, 4 * 1024 * 1024)) {
- printk(KERN_ERR "Left some space in bitmap\n");
+ test_msg("Left some space in bitmap\n");
return -1;
}
ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret);
+ test_msg("Couldn't add to our bitmap entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret);
+ test_msg("Couldn't remove middle chunk %d\n", ret);
return ret;
}
@@ -3271,21 +3273,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
ret = add_free_space_entry(cache, next_bitmap_offset -
(2 * 1024 * 1024), 4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add space that straddles two bitmaps"
- " %d\n", ret);
+ test_msg("Couldn't add space that straddles two bitmaps %d\n",
+ ret);
return ret;
}
ret = btrfs_remove_free_space(cache, next_bitmap_offset -
(1 * 1024 * 1024), 2 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+ test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
2 * 1024 * 1024)) {
- printk(KERN_ERR "Left some space when removing overlapping\n");
+ test_msg("Left some space when removing overlapping\n");
return -1;
}
@@ -3300,7 +3302,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
int ret;
- printk(KERN_ERR "Running bitmap and extent tests\n");
+ test_msg("Running bitmap and extent tests\n");
/*
* First let's do something simple, an extent at the same offset as the
@@ -3309,42 +3311,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
*/
ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret);
+ test_msg("Couldn't create bitmap entry %d\n", ret);
return ret;
}
ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
if (ret) {
- printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+ test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Couldn't remove extent entry %d\n", ret);
+ test_msg("Couldn't remove extent entry %d\n", ret);
return ret;
}
if (check_exists(cache, 0, 1 * 1024 * 1024)) {
- printk(KERN_ERR "Left remnants after our remove\n");
+ test_msg("Left remnants after our remove\n");
return -1;
}
/* Now to add back the extent entry and remove from the bitmap */
ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
if (ret) {
- printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret);
+ test_msg("Couldn't re-add extent entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret);
+ test_msg("Couldn't remove from bitmap %d\n", ret);
return ret;
}
if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
- printk(KERN_ERR "Left remnants in the bitmap\n");
+ test_msg("Left remnants in the bitmap\n");
return -1;
}
@@ -3354,19 +3356,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
*/
ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret);
+ test_msg("Couldn't add to a bitmap %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+ test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
- printk(KERN_ERR "Left over peices after removing "
- "overlapping\n");
+ test_msg("Left over peices after removing overlapping\n");
return -1;
}
@@ -3375,24 +3376,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
/* Now with the extent entry offset into the bitmap */
ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret);
+ test_msg("Couldn't add space to the bitmap %d\n", ret);
return ret;
}
ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
if (ret) {
- printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret);
+ test_msg("Couldn't add extent to the cache %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Problem removing overlapping space %d\n", ret);
+ test_msg("Problem removing overlapping space %d\n", ret);
return ret;
}
if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
- printk(KERN_ERR "Left something behind when removing space");
+ test_msg("Left something behind when removing space");
return -1;
}
@@ -3410,27 +3411,27 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add bitmap %d\n", ret);
+ test_msg("Couldn't add bitmap %d\n", ret);
return ret;
}
ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
5 * 1024 * 1024, 0);
if (ret) {
- printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+ test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
5 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Failed to free our space %d\n", ret);
+ test_msg("Failed to free our space %d\n", ret);
return ret;
}
if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
5 * 1024 * 1024)) {
- printk(KERN_ERR "Left stuff over\n");
+ test_msg("Left stuff over\n");
return -1;
}
@@ -3444,20 +3445,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
*/
ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret);
+ test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
}
ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
if (ret) {
- printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+ test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error removing bitmap and extent "
- "overlapping %d\n", ret);
+ test_msg("Error removing bitmap and extent overlapping %d\n", ret);
return ret;
}
@@ -3469,11 +3469,11 @@ void btrfs_test_free_space_cache(void)
{
struct btrfs_block_group_cache *cache;
- printk(KERN_ERR "Running btrfs free space cache tests\n");
+ test_msg("Running btrfs free space cache tests\n");
cache = init_test_block_group();
if (!cache) {
- printk(KERN_ERR "Couldn't run the tests\n");
+ test_msg("Couldn't run the tests\n");
return;
}
@@ -3487,6 +3487,9 @@ out:
__btrfs_remove_free_space_cache(cache->free_space_ctl);
kfree(cache->free_space_ctl);
kfree(cache);
- printk(KERN_ERR "Free space cache tests finished\n");
+ test_msg("Free space cache tests finished\n");
}
-#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+#undef test_msg
+#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+void btrfs_test_free_space_cache(void) {}
+#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8b7f19f44961..894116b71304 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -113,8 +113,6 @@ int btrfs_return_cluster_to_free_space(
int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_free_space_cache(void);
-#endif
#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4f9d16b70d3d..6d1b93c8aafb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -57,6 +58,7 @@
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
+#include "hash.h"
struct btrfs_iget_args {
u64 ino;
@@ -701,8 +703,12 @@ retry:
async_extent->nr_pages = 0;
async_extent->pages = NULL;
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
+ unlock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1);
goto retry;
+ }
goto out_free;
}
@@ -1529,6 +1535,46 @@ static void btrfs_merge_extent_hook(struct inode *inode,
spin_unlock(&BTRFS_I(inode)->lock);
}
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+ struct inode *inode)
+{
+ spin_lock(&root->delalloc_lock);
+ if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+ &root->delalloc_inodes);
+ set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ root->nr_delalloc_inodes++;
+ if (root->nr_delalloc_inodes == 1) {
+ spin_lock(&root->fs_info->delalloc_root_lock);
+ BUG_ON(!list_empty(&root->delalloc_root));
+ list_add_tail(&root->delalloc_root,
+ &root->fs_info->delalloc_roots);
+ spin_unlock(&root->fs_info->delalloc_root_lock);
+ }
+ }
+ spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+ struct inode *inode)
+{
+ spin_lock(&root->delalloc_lock);
+ if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ root->nr_delalloc_inodes--;
+ if (!root->nr_delalloc_inodes) {
+ spin_lock(&root->fs_info->delalloc_root_lock);
+ BUG_ON(list_empty(&root->delalloc_root));
+ list_del_init(&root->delalloc_root);
+ spin_unlock(&root->fs_info->delalloc_root_lock);
+ }
+ }
+ spin_unlock(&root->delalloc_lock);
+}
+
/*
* extent_io.c set_bit_hook, used to track delayed allocation
* bytes in this file, and to maintain the list of inodes that
@@ -1561,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
- set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
- }
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -1604,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
btrfs_delalloc_release_metadata(inode, len);
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
- && do_list)
+ && do_list && !(state->state & EXTENT_NORESERVE))
btrfs_free_reserved_data_space(inode, len);
__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
@@ -1613,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
BTRFS_I(inode)->delalloc_bytes -= len;
if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_del_init(&BTRFS_I(inode)->delalloc_inodes);
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
- }
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_del_delalloc_inode(root, inode);
spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -2263,11 +2294,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
return 0;
return PTR_ERR(root);
}
- if (btrfs_root_refs(&root->root_item) == 0) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- /* parse ENOENT to 0 */
- return 0;
- }
/* step 2: get inode */
key.objectid = backref->inum;
@@ -3215,13 +3241,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* 1 for the orphan item deletion. */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
+ iput(inode);
ret = PTR_ERR(trans);
goto out;
}
ret = btrfs_orphan_add(trans, inode);
btrfs_end_transaction(trans, root);
- if (ret)
+ if (ret) {
+ iput(inode);
goto out;
+ }
ret = btrfs_truncate(inode);
if (ret)
@@ -3274,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
{
u32 nritems = btrfs_header_nritems(leaf);
struct btrfs_key found_key;
+ static u64 xattr_access = 0;
+ static u64 xattr_default = 0;
int scanned = 0;
+ if (!xattr_access) {
+ xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+ strlen(POSIX_ACL_XATTR_ACCESS));
+ xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+ strlen(POSIX_ACL_XATTR_DEFAULT));
+ }
+
slot++;
while (slot < nritems) {
btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3285,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
return 0;
/* we found an xattr, assume we've got an acl */
- if (found_key.type == BTRFS_XATTR_ITEM_KEY)
- return 1;
+ if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (found_key.offset == xattr_access ||
+ found_key.offset == xattr_default)
+ return 1;
+ }
/*
* we found a key greater than an xattr key, there can't
@@ -3660,53 +3701,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
return ret;
}
-
-
-/* helper to check if there is any shared block in the path */
-static int check_path_shared(struct btrfs_root *root,
- struct btrfs_path *path)
-{
- struct extent_buffer *eb;
- int level;
- u64 refs = 1;
-
- for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
- int ret;
-
- if (!path->nodes[level])
- break;
- eb = path->nodes[level];
- if (!btrfs_block_can_be_shared(root, eb))
- continue;
- ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
- &refs, NULL);
- if (refs > 1)
- return 1;
- }
- return 0;
-}
/*
* helper to start transaction for unlink and rmdir.
*
- * unlink and rmdir are special in btrfs, they do not always free space.
- * so in enospc case, we should make sure they will free space before
- * allowing them to use the global metadata reservation.
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
*/
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
- struct dentry *dentry)
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct btrfs_path *path;
- struct btrfs_dir_item *di;
- struct inode *inode = dentry->d_inode;
- u64 index;
- int check_link = 1;
- int err = -ENOSPC;
int ret;
- u64 ino = btrfs_ino(inode);
- u64 dir_ino = btrfs_ino(dir);
/*
* 1 for the possible orphan item
@@ -3719,158 +3727,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
return trans;
- if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
- return ERR_PTR(-ENOSPC);
-
- /* check if there is someone else holds reference */
- if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
- return ERR_PTR(-ENOSPC);
-
- if (atomic_read(&inode->i_count) > 2)
- return ERR_PTR(-ENOSPC);
-
- if (xchg(&root->fs_info->enospc_unlink, 1))
- return ERR_PTR(-ENOSPC);
-
- path = btrfs_alloc_path();
- if (!path) {
- root->fs_info->enospc_unlink = 0;
- return ERR_PTR(-ENOMEM);
- }
+ if (PTR_ERR(trans) == -ENOSPC) {
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
- /* 1 for the orphan item */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- root->fs_info->enospc_unlink = 0;
- return trans;
- }
-
- path->skip_locking = 1;
- path->search_commit_root = 1;
-
- ret = btrfs_lookup_inode(trans, root, path,
- &BTRFS_I(dir)->location, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (ret == 0) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- check_link = 0;
- }
- btrfs_release_path(path);
-
- ret = btrfs_lookup_inode(trans, root, path,
- &BTRFS_I(inode)->location, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (ret == 0) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- check_link = 0;
- }
- btrfs_release_path(path);
-
- if (ret == 0 && S_ISREG(inode->i_mode)) {
- ret = btrfs_lookup_file_extent(trans, root, path,
- ino, (u64)-1, 0);
- if (ret < 0) {
- err = ret;
- goto out;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return trans;
+ ret = btrfs_cond_migrate_bytes(root->fs_info,
+ &root->fs_info->trans_block_rsv,
+ num_bytes, 5);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ERR_PTR(ret);
}
- BUG_ON(ret == 0); /* Corruption */
- if (check_path_shared(root, path))
- goto out;
- btrfs_release_path(path);
- }
-
- if (!check_link) {
- err = 0;
- goto out;
- }
-
- di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
- dentry->d_name.name, dentry->d_name.len, 0);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto out;
- }
- if (di) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- err = 0;
- goto out;
- }
- btrfs_release_path(path);
-
- ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
- dentry->d_name.len, ino, dir_ino, 0,
- &index);
- if (ret) {
- err = ret;
- goto out;
- }
-
- if (check_path_shared(root, path))
- goto out;
-
- btrfs_release_path(path);
-
- /*
- * This is a commit root search, if we can lookup inode item and other
- * relative items in the commit root, it means the transaction of
- * dir/file creation has been committed, and the dir index item that we
- * delay to insert has also been inserted into the commit root. So
- * we needn't worry about the delayed insertion of the dir index item
- * here.
- */
- di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
- dentry->d_name.name, dentry->d_name.len, 0);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto out;
- }
- BUG_ON(ret == -ENOENT);
- if (check_path_shared(root, path))
- goto out;
-
- err = 0;
-out:
- btrfs_free_path(path);
- /* Migrate the orphan reservation over */
- if (!err)
- err = btrfs_block_rsv_migrate(trans->block_rsv,
- &root->fs_info->global_block_rsv,
- trans->bytes_reserved);
-
- if (err) {
- btrfs_end_transaction(trans, root);
- root->fs_info->enospc_unlink = 0;
- return ERR_PTR(err);
- }
-
- trans->block_rsv = &root->fs_info->global_block_rsv;
- return trans;
-}
-
-static void __unlink_end_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
- btrfs_block_rsv_release(root, trans->block_rsv,
- trans->bytes_reserved);
trans->block_rsv = &root->fs_info->trans_block_rsv;
- BUG_ON(!root->fs_info->enospc_unlink);
- root->fs_info->enospc_unlink = 0;
+ trans->bytes_reserved = num_bytes;
}
- btrfs_end_transaction(trans, root);
+ return trans;
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3880,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int ret;
- trans = __unlink_start_trans(dir, dentry);
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -3898,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
}
out:
- __unlink_end_trans(trans, root);
+ btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
return ret;
}
@@ -3995,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
return -EPERM;
- trans = __unlink_start_trans(dir, dentry);
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -4017,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (!err)
btrfs_i_size_write(inode, 0);
out:
- __unlink_end_trans(trans, root);
+ btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
return err;
@@ -4395,6 +4268,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
u64 hole_size;
int err = 0;
+ /*
+ * If our size started in the middle of a page we need to zero out the
+ * rest of the page before we expand the i_size, otherwise we could
+ * expose stale data.
+ */
+ err = btrfs_truncate_page(inode, oldsize, 0, 0);
+ if (err)
+ return err;
+
if (size <= hole_start)
return 0;
@@ -4822,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
goto out;
}
- if (btrfs_root_refs(&new_root->root_item) == 0) {
- err = -ENOENT;
- goto out;
- }
-
*sub_root = new_root;
location->objectid = btrfs_root_dirid(&new_root->root_item);
location->type = BTRFS_INODE_ITEM_KEY;
@@ -5092,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (!(inode->i_sb->s_flags & MS_RDONLY))
ret = btrfs_orphan_cleanup(sub_root);
up_read(&root->fs_info->cleanup_work_sem);
- if (ret)
+ if (ret) {
+ iput(inode);
inode = ERR_PTR(ret);
+ }
}
return inode;
@@ -6501,10 +6380,10 @@ out:
* returns 1 when the nocow is safe, < 1 on error, 0 if the
* block must be cow'd
*/
-static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
- struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes)
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes)
{
struct btrfs_path *path;
int ret;
@@ -6518,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
u64 num_bytes;
int slot;
int found_type;
-
+ bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -6558,18 +6437,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
/* not a regular extent, must cow */
goto out;
}
+
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+ goto out;
+
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (disk_bytenr == 0)
+ goto out;
+
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out;
+
backref_offset = btrfs_file_extent_offset(leaf, fi);
- *orig_start = key.offset - backref_offset;
- *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
- *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ if (orig_start) {
+ *orig_start = key.offset - backref_offset;
+ *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ }
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_end < offset + *len) {
- /* extent doesn't include our full range, must cow */
- goto out;
- }
if (btrfs_extent_readonly(root, disk_bytenr))
goto out;
@@ -6813,8 +6702,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (IS_ERR(trans))
goto must_cow;
- if (can_nocow_odirect(trans, inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1) {
+ if (can_nocow_extent(trans, inode, start, &len, &orig_start,
+ &orig_block_len, &ram_bytes) == 1) {
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
em = create_pinned_em(inode, start, len,
@@ -7243,7 +7132,6 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_private *dip;
- struct bio_vec *bvec = dio_bio->bi_io_vec;
struct bio *io_bio;
int skip_sum;
int write = rw & REQ_WRITE;
@@ -7265,16 +7153,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
}
dip->private = dio_bio->bi_private;
- io_bio->bi_private = dio_bio->bi_private;
dip->inode = inode;
dip->logical_offset = file_offset;
-
- dip->bytes = 0;
- do {
- dip->bytes += bvec->bv_len;
- bvec++;
- } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
-
+ dip->bytes = dio_bio->bi_size;
dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
io_bio->bi_private = dip;
dip->errors = 0;
@@ -7373,8 +7254,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
atomic_inc(&inode->i_dio_count);
smp_mb__after_atomic_inc();
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * call btrfs_wait_ordered_range to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ count = iov_length(iov, nr_segs);
+ btrfs_wait_ordered_range(inode, offset, count);
+
if (rw & WRITE) {
- count = iov_length(iov, nr_segs);
/*
* If the write DIO is beyond the EOF, we need update
* the isize, but it is protected by i_mutex. So we can
@@ -7694,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
- int ret;
+ int ret = 0;
int err = 0;
struct btrfs_trans_handle *trans;
u64 mask = root->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
- ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
- if (ret)
- return ret;
-
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
@@ -7961,9 +7846,9 @@ void btrfs_destroy_inode(struct inode *inode)
*/
smp_mb();
if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
list_del_init(&BTRFS_I(inode)->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
}
if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@ -8333,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -8342,30 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
struct list_head splice;
int ret = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
- spin_lock(&root->fs_info->delalloc_lock);
- list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
binode = list_entry(splice.next, struct btrfs_inode,
delalloc_inodes);
- list_del_init(&binode->delalloc_inodes);
-
+ list_move_tail(&binode->delalloc_inodes,
+ &root->delalloc_inodes);
inode = igrab(&binode->vfs_inode);
if (!inode) {
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &binode->runtime_flags);
+ cond_resched_lock(&root->delalloc_lock);
continue;
}
-
- list_add_tail(&binode->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
if (unlikely(!work)) {
@@ -8377,16 +8255,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
&work->work);
cond_resched();
- spin_lock(&root->fs_info->delalloc_lock);
+ spin_lock(&root->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
}
+ return 0;
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&root->delalloc_lock);
+ list_splice_tail(&splice, &root->delalloc_inodes);
+ spin_unlock(&root->delalloc_lock);
+ }
+ return ret;
+}
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+ int ret;
- /* the filemap_flush will queue IO into the worker threads, but
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ ret = __start_delalloc_inodes(root, delay_iput);
+ /*
+ * the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
*/
@@ -8398,17 +8299,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&root->fs_info->async_submit_draining);
- return 0;
-out:
- list_for_each_entry_safe(work, next, &works, list) {
- list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
+ return ret;
+}
+
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+ int delay_iput)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+ int ret;
+
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->delalloc_root,
+ &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ ret = __start_delalloc_inodes(root, delay_iput);
+ btrfs_put_fs_root(root);
+ if (ret)
+ goto out;
+
+ spin_lock(&fs_info->delalloc_root_lock);
}
+ spin_unlock(&fs_info->delalloc_root_lock);
+ atomic_inc(&fs_info->async_submit_draining);
+ while (atomic_read(&fs_info->nr_async_submits) ||
+ atomic_read(&fs_info->async_delalloc_pages)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0 &&
+ atomic_read(&fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&fs_info->async_submit_draining);
+ return 0;
+out:
if (!list_empty_careful(&splice)) {
- spin_lock(&root->fs_info->delalloc_lock);
- list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_tail(&splice, &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
}
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cd7e96c73cb7..238a05545ee2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!root->ref_cows)
return -EINVAL;
+ ret = btrfs_start_delalloc_inodes(root, 0);
+ if (ret)
+ return ret;
+
+ btrfs_wait_ordered_extents(root, 0);
+
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
if (!pending_snapshot)
return -ENOMEM;
@@ -2354,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
- 1)) {
- pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- mnt_drop_write_file(file);
- return -EINVAL;
- }
-
- mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -2369,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = btrfs_rm_device(root, vol_args->name);
- kfree(vol_args);
-out:
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out;
+ }
+
+ mutex_lock(&root->fs_info->volume_mutex);
+ ret = btrfs_rm_device(root, vol_args->name);
mutex_unlock(&root->fs_info->volume_mutex);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+
+out:
+ kfree(vol_args);
mnt_drop_write_file(file);
return ret;
}
@@ -2480,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
+ int same_inode = 0;
/*
* TODO:
@@ -2516,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
ret = -EINVAL;
if (src == inode)
- goto out_fput;
+ same_inode = 1;
/* the src must be open for reading */
if (!(src_file.file->f_mode & FMODE_READ))
@@ -2547,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
}
path->reada = 2;
- if (inode < src) {
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+ if (!same_inode) {
+ if (inode < src) {
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ }
} else {
- mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock(&src->i_mutex);
}
/* determine range to clone */
@@ -2570,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
!IS_ALIGNED(destoff, bs))
goto out_unlock;
+ /* verify if ranges are overlapped within the same file */
+ if (same_inode) {
+ if (destoff + len > off && destoff < off + len)
+ goto out_unlock;
+ }
+
if (destoff > inode->i_size) {
ret = btrfs_cont_expand(inode, inode->i_size, destoff);
if (ret)
@@ -2846,7 +2863,8 @@ out:
unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
out_unlock:
mutex_unlock(&src->i_mutex);
- mutex_unlock(&inode->i_mutex);
+ if (!same_inode)
+ mutex_unlock(&inode->i_mutex);
vfree(buf);
btrfs_free_path(path);
out_fput:
@@ -2951,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
goto out;
}
- if (btrfs_root_refs(&new_root->root_item) == 0) {
- ret = -ENOENT;
- goto out;
- }
-
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -3719,9 +3732,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
break;
}
- if (copy_to_user(arg, sa, sizeof(*sa)))
- ret = -EFAULT;
-
err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
if (err && !ret)
ret = err;
@@ -3937,6 +3947,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
return ret;
}
+static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btrfs_qgroup_wait_for_completion(root->fs_info);
+}
+
static long btrfs_ioctl_set_received_subvol(struct file *file,
void __user *arg)
{
@@ -4179,6 +4199,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_quota_rescan(file, argp);
case BTRFS_IOC_QUOTA_RESCAN_STATUS:
return btrfs_ioctl_quota_rescan_status(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+ return btrfs_ioctl_quota_rescan_wait(file, argp);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(root, argp);
case BTRFS_IOC_GET_FSLABEL:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 743b86fa4fcb..f93151a98886 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
struct workspace {
void *mem;
- void *buf; /* where compressed data goes */
- void *cbuf; /* where decompressed data goes */
+ void *buf; /* where decompressed data goes */
+ void *cbuf; /* where compressed data goes */
struct list_head list;
};
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1ddd728541ee..81369827e514 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,7 @@
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
+#include "disk-io.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int dio, int compress_type)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
@@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
ordered_data_tree_panic(inode, -EEXIST, file_offset);
spin_unlock_irq(&tree->lock);
- spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
- &BTRFS_I(inode)->root->fs_info->ordered_extents);
- spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ &root->ordered_extents);
+ root->nr_ordered_extents++;
+ if (root->nr_ordered_extents == 1) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ BUG_ON(!list_empty(&root->ordered_root));
+ list_add_tail(&root->ordered_root,
+ &root->fs_info->ordered_roots);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
return 0;
}
@@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
spin_unlock_irq(&tree->lock);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
+ root->nr_ordered_extents--;
trace_btrfs_ordered_extent_remove(inode, entry);
@@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
list_del_init(&BTRFS_I(inode)->ordered_operations);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ if (!root->nr_ordered_extents) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ BUG_ON(list_empty(&root->ordered_root));
+ list_del_init(&root->ordered_root);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
wake_up(&entry->wait);
}
@@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
{
struct list_head splice, works;
- struct list_head *cur;
struct btrfs_ordered_extent *ordered, *next;
struct inode *inode;
@@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
INIT_LIST_HEAD(&works);
mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
- list_splice_init(&root->fs_info->ordered_extents, &splice);
+ spin_lock(&root->ordered_extent_lock);
+ list_splice_init(&root->ordered_extents, &splice);
while (!list_empty(&splice)) {
- cur = splice.next;
- ordered = list_entry(cur, struct btrfs_ordered_extent,
- root_extent_list);
- list_del_init(&ordered->root_extent_list);
- atomic_inc(&ordered->refs);
-
+ ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
+ root_extent_list);
+ list_move_tail(&ordered->root_extent_list,
+ &root->ordered_extents);
/*
* the inode may be getting freed (in sys_unlink path).
*/
inode = igrab(ordered->inode);
+ if (!inode) {
+ cond_resched_lock(&root->ordered_extent_lock);
+ continue;
+ }
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ atomic_inc(&ordered->refs);
+ spin_unlock(&root->ordered_extent_lock);
- if (inode) {
- ordered->flush_work.func = btrfs_run_ordered_extent_work;
- list_add_tail(&ordered->work_list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &ordered->flush_work);
- } else {
- btrfs_put_ordered_extent(ordered);
- }
+ ordered->flush_work.func = btrfs_run_ordered_extent_work;
+ list_add_tail(&ordered->work_list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &ordered->flush_work);
cond_resched();
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->ordered_extent_lock);
list_for_each_entry_safe(ordered, next, &works, work_list) {
list_del_init(&ordered->work_list);
@@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
mutex_unlock(&root->fs_info->ordered_operations_mutex);
}
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
+ int delay_iput)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->ordered_root,
+ &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
+
+ btrfs_wait_ordered_extents(root, delay_iput);
+ btrfs_put_fs_root(root);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ }
+ spin_unlock(&fs_info->ordered_root_lock);
+}
+
/*
* this is used during transaction commit to write all the inodes
* added to the ordered operation list. These files must be fully on
@@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&works);
mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
list_splice_init(&cur_trans->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
if (!wait)
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&cur_trans->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
work = btrfs_alloc_delalloc_work(inode, wait, 1);
if (!work) {
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations))
list_add_tail(&btrfs_inode->ordered_operations,
&splice);
list_splice_tail(&splice,
&cur_trans->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
ret = -ENOMEM;
goto out;
}
@@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
&work->work);
cond_resched();
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
@@ -989,7 +1032,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len)
{
struct btrfs_ordered_sum *ordered_sum;
- struct btrfs_sector_sum *sector_sums;
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
unsigned long num_sectors;
@@ -1007,18 +1049,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
i = (disk_bytenr - ordered_sum->bytenr) >>
inode->i_sb->s_blocksize_bits;
- sector_sums = ordered_sum->sums + i;
num_sectors = ordered_sum->len >>
inode->i_sb->s_blocksize_bits;
- for (; i < num_sectors; i++) {
- if (sector_sums[i].bytenr == disk_bytenr) {
- sum[index] = sector_sums[i].sum;
- index++;
- if (index == len)
- goto out;
- disk_bytenr += sectorsize;
- }
- }
+ num_sectors = min_t(int, len - index, num_sectors - i);
+ memcpy(sum + index, ordered_sum->sums + i,
+ num_sectors);
+
+ index += (int)num_sectors;
+ if (index == len)
+ goto out;
+ disk_bytenr += num_sectors * sectorsize;
}
}
out:
@@ -1055,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
if (last_mod < root->fs_info->last_trans_committed)
return;
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&cur_trans->ordered_operations);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
}
int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 58b0e3b0ebad..68844d59ee6f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
struct rb_node *last;
};
-/*
- * these are used to collect checksums done just before bios submission.
- * They are attached via a list into the ordered extent, and
- * checksum items are inserted into the tree after all the blocks in
- * the ordered extent are on disk
- */
-struct btrfs_sector_sum {
- /* bytenr on disk */
- u64 bytenr;
- u32 sum;
-};
-
struct btrfs_ordered_sum {
/* bytenr is the start of this extent on disk */
u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
/*
* this is the length in bytes covered by the sums array below.
*/
- unsigned long len;
+ int len;
struct list_head list;
- /* last field is a variable length array of btrfs_sector_sums */
- struct btrfs_sector_sum sums[];
+ /* last field is a variable length array of csums */
+ u32 sums[];
};
/*
@@ -149,11 +137,8 @@ struct btrfs_ordered_extent {
static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
unsigned long bytes)
{
- unsigned long num_sectors = (bytes + root->sectorsize - 1) /
- root->sectorsize;
- num_sectors++;
- return sizeof(struct btrfs_ordered_sum) +
- num_sectors * sizeof(struct btrfs_sector_sum);
+ int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
}
static inline void
@@ -204,6 +189,8 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
+ int delay_iput);
void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9d49c586995a..1280eff8af56 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -98,13 +98,10 @@ struct btrfs_qgroup_list {
struct btrfs_qgroup *member;
};
-struct qgroup_rescan {
- struct btrfs_work work;
- struct btrfs_fs_info *fs_info;
-};
-
-static void qgroup_rescan_start(struct btrfs_fs_info *fs_info,
- struct qgroup_rescan *qscan);
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags);
+static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -255,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
int slot;
int ret = 0;
u64 flags = 0;
+ u64 rescan_progress = 0;
if (!fs_info->quota_enabled)
return 0;
+ fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -306,20 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
}
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
ptr);
- fs_info->qgroup_rescan_progress.objectid =
- btrfs_qgroup_status_rescan(l, ptr);
- if (fs_info->qgroup_flags &
- BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- struct qgroup_rescan *qscan =
- kmalloc(sizeof(*qscan), GFP_NOFS);
- if (!qscan) {
- ret = -ENOMEM;
- goto out;
- }
- fs_info->qgroup_rescan_progress.type = 0;
- fs_info->qgroup_rescan_progress.offset = 0;
- qgroup_rescan_start(fs_info, qscan);
- }
+ rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -421,9 +412,18 @@ out:
if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
+ } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+ ret >= 0) {
+ ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
}
btrfs_free_path(path);
+ if (ret < 0) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ }
+
return ret < 0 ? ret : 0;
}
@@ -460,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
}
kfree(qgroup);
}
+ ulist_free(fs_info->qgroup_ulist);
}
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -819,6 +820,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
goto out;
}
+ fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/*
* initially create the quota tree
*/
@@ -916,6 +923,10 @@ out_free_root:
kfree(quota_root);
}
out:
+ if (ret) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ }
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
@@ -1355,7 +1366,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
u64 ref_root;
struct btrfs_qgroup *qgroup;
struct ulist *roots = NULL;
- struct ulist *tmp = NULL;
u64 seq;
int ret = 0;
int sgn;
@@ -1428,14 +1438,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
- mutex_lock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
- ret = 0;
- goto unlock;
- }
- }
quota_root = fs_info->quota_root;
if (!quota_root)
@@ -1448,39 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
/*
* step 1: for each old ref, visit all nodes once and inc refcnt
*/
- tmp = ulist_alloc(GFP_ATOMIC);
- if (!tmp) {
- ret = -ENOMEM;
- goto unlock;
- }
+ ulist_reinit(fs_info->qgroup_ulist);
seq = fs_info->qgroup_seq;
fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
- ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq);
+ ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
+ seq);
if (ret)
goto unlock;
/*
* step 2: walk from the new root
*/
- ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn,
- node->num_bytes, qgroup);
+ ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
+ seq, sgn, node->num_bytes, qgroup);
if (ret)
goto unlock;
/*
* step 3: walk again from old refs
*/
- ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn,
- node->num_bytes);
+ ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
+ seq, sgn, node->num_bytes);
if (ret)
goto unlock;
unlock:
spin_unlock(&fs_info->qgroup_lock);
- mutex_unlock(&fs_info->qgroup_rescan_lock);
ulist_free(roots);
- ulist_free(tmp);
return ret;
}
@@ -1527,9 +1525,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
if (!ret && start_rescan_worker) {
- ret = btrfs_qgroup_rescan(fs_info);
- if (ret)
- pr_err("btrfs: start rescan quota failed: %d\n", ret);
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (!ret) {
+ qgroup_rescan_zero_tracking(fs_info);
+ btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ }
ret = 0;
}
@@ -1720,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
- struct ulist *ulist = NULL;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -1743,17 +1743,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
* in a first step, we check all affected qgroups if any limits would
* be exceeded
*/
- ulist = ulist_alloc(GFP_ATOMIC);
- if (!ulist) {
- ret = -ENOMEM;
- goto out;
- }
- ret = ulist_add(ulist, qgroup->qgroupid,
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
(uintptr_t)qgroup, GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(ulist, &uiter))) {
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
@@ -1774,7 +1770,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
}
list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(ulist, glist->group->qgroupid,
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
(uintptr_t)glist->group, GFP_ATOMIC);
if (ret < 0)
goto out;
@@ -1785,7 +1782,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
* no limits exceeded, now record the reservation into all qgroups
*/
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(ulist, &uiter))) {
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1795,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
out:
spin_unlock(&fs_info->qgroup_lock);
- ulist_free(ulist);
-
return ret;
}
@@ -1805,7 +1800,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct ulist *ulist = NULL;
struct ulist_node *unode;
struct ulist_iterator uiter;
u64 ref_root = root->root_key.objectid;
@@ -1827,17 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
if (!qgroup)
goto out;
- ulist = ulist_alloc(GFP_ATOMIC);
- if (!ulist) {
- btrfs_std_error(fs_info, -ENOMEM);
- goto out;
- }
- ret = ulist_add(ulist, qgroup->qgroupid,
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
(uintptr_t)qgroup, GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(ulist, &uiter))) {
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
@@ -1846,7 +1836,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
qg->reserved -= num_bytes;
list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(ulist, glist->group->qgroupid,
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
(uintptr_t)glist->group, GFP_ATOMIC);
if (ret < 0)
goto out;
@@ -1855,7 +1846,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
out:
spin_unlock(&fs_info->qgroup_lock);
- ulist_free(ulist);
}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
@@ -1874,12 +1864,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
* returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
*/
static int
-qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path,
+qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
struct btrfs_trans_handle *trans, struct ulist *tmp,
struct extent_buffer *scratch_leaf)
{
struct btrfs_key found;
- struct btrfs_fs_info *fs_info = qscan->fs_info;
struct ulist *roots = NULL;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2007,11 +1996,10 @@ out:
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
{
- struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan,
- work);
+ struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
+ qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_fs_info *fs_info = qscan->fs_info;
struct ulist *tmp = NULL;
struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
@@ -2036,7 +2024,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
if (!fs_info->quota_enabled) {
err = -EINTR;
} else {
- err = qgroup_rescan_leaf(qscan, path, trans,
+ err = qgroup_rescan_leaf(fs_info, path, trans,
tmp, scratch_leaf);
}
if (err > 0)
@@ -2049,7 +2037,6 @@ out:
kfree(scratch_leaf);
ulist_free(tmp);
btrfs_free_path(path);
- kfree(qscan);
mutex_lock(&fs_info->qgroup_rescan_lock);
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -2068,47 +2055,74 @@ out:
} else {
pr_err("btrfs: qgroup scan failed with %d\n", err);
}
-}
-static void
-qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan)
-{
- memset(&qscan->work, 0, sizeof(qscan->work));
- qscan->work.func = btrfs_qgroup_rescan_worker;
- qscan->fs_info = fs_info;
-
- pr_info("btrfs: qgroup scan started\n");
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work);
+ complete_all(&fs_info->qgroup_rescan_completion);
}
-int
-btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+/*
+ * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
+ * memory required for the rescan context.
+ */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags)
{
int ret = 0;
- struct rb_node *n;
- struct btrfs_qgroup *qgroup;
- struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS);
- if (!qscan)
- return -ENOMEM;
+ if (!init_flags &&
+ (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
+ !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
+ ret = -EINVAL;
+ goto err;
+ }
mutex_lock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
- ret = -EINPROGRESS;
- else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
- ret = -EINVAL;
- if (ret) {
- spin_unlock(&fs_info->qgroup_lock);
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- kfree(qscan);
- return ret;
+
+ if (init_flags) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ ret = -EINPROGRESS;
+ else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+ ret = -EINVAL;
+
+ if (ret) {
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ goto err;
+ }
+
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
}
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ init_completion(&fs_info->qgroup_rescan_completion);
+
+ memset(&fs_info->qgroup_rescan_work, 0,
+ sizeof(fs_info->qgroup_rescan_work));
+ fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
+
+ if (ret) {
+err:
+ pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void
+qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup *qgroup;
+ spin_lock(&fs_info->qgroup_lock);
/* clear all current qgroup tracking information */
for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
@@ -2118,9 +2132,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup->excl_cmpr = 0;
}
spin_unlock(&fs_info->qgroup_lock);
- mutex_unlock(&fs_info->qgroup_rescan_lock);
+}
+
+int
+btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+ struct btrfs_trans_handle *trans;
- qgroup_rescan_start(fs_info, qscan);
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * We have set the rescan_progress to 0, which means no more
+ * delayed refs will be accounted by btrfs_qgroup_account_ref.
+ * However, btrfs_qgroup_account_ref may be right after its call
+ * to btrfs_find_all_roots, in which case it would still do the
+ * accounting.
+ * To solve this, we're committing the transaction, which will
+ * ensure we run all delayed refs and only after that, we are
+ * going to clear all tracking information for a clean start.
+ */
+
+ trans = btrfs_join_transaction(fs_info->fs_root);
+ if (IS_ERR(trans)) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+ if (ret) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return ret;
+ }
+
+ qgroup_rescan_zero_tracking(fs_info);
+
+ btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
return 0;
}
+
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+{
+ int running;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ spin_lock(&fs_info->qgroup_lock);
+ running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (running)
+ ret = wait_for_completion_interruptible(
+ &fs_info->qgroup_rescan_completion);
+
+ return ret;
+}
+
+/*
+ * this is only called from open_ctree where we're still single threaded, thus
+ * locking is omitted here.
+ */
+void
+btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4febca4fc2de..12096496cc99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1305,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
+ u64 last_snap = 0;
int ret;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1320,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(ret);
+ last_snap = btrfs_root_last_snapshot(&root->root_item);
btrfs_set_root_last_snapshot(&root->root_item,
trans->transid - 1);
} else {
@@ -1345,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
root_item->drop_level = 0;
+ /*
+ * abuse rtransid, it is safe because it is impossible to
+ * receive data into a relocation tree.
+ */
+ btrfs_set_root_rtransid(root_item, last_snap);
+ btrfs_set_root_otransid(root_item, trans->transid);
}
btrfs_tree_unlock(eb);
@@ -1355,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
BUG_ON(ret);
kfree(root_item);
- reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
- &root_key);
+ reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
BUG_ON(IS_ERR(reloc_root));
reloc_root->last_trans = trans->transid;
return reloc_root;
@@ -2273,8 +2280,12 @@ void free_reloc_roots(struct list_head *list)
static noinline_for_stack
int merge_reloc_roots(struct reloc_control *rc)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_root *reloc_root;
+ u64 last_snap;
+ u64 otransid;
+ u64 objectid;
LIST_HEAD(reloc_roots);
int found = 0;
int ret = 0;
@@ -2308,12 +2319,44 @@ again:
} else {
list_del_init(&reloc_root->root_list);
}
+
+ /*
+ * we keep the old last snapshod transid in rtranid when we
+ * created the relocation tree.
+ */
+ last_snap = btrfs_root_rtransid(&reloc_root->root_item);
+ otransid = btrfs_root_otransid(&reloc_root->root_item);
+ objectid = reloc_root->root_key.offset;
+
ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
if (ret < 0) {
if (list_empty(&reloc_root->root_list))
list_add_tail(&reloc_root->root_list,
&reloc_roots);
goto out;
+ } else if (!ret) {
+ /*
+ * recover the last snapshot tranid to avoid
+ * the space balance break NOCOW.
+ */
+ root = read_fs_root(rc->extent_root->fs_info,
+ objectid);
+ if (IS_ERR(root))
+ continue;
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ continue;
+
+ trans = btrfs_join_transaction(root);
+ BUG_ON(IS_ERR(trans));
+
+ /* Check if the fs/file tree was snapshoted or not. */
+ if (btrfs_root_last_snapshot(&root->root_item) ==
+ otransid - 1)
+ btrfs_set_root_last_snapshot(&root->root_item,
+ last_snap);
+
+ btrfs_end_transaction(trans, root);
}
}
@@ -3266,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
+ bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
+ SKINNY_METADATA);
if (tree_block_processed(bytenr, blocksize, rc))
return 0;
@@ -3276,10 +3321,15 @@ static int __add_tree_block(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-
+again:
key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = blocksize;
+ if (skinny) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = (u64)-1;
+ } else {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = blocksize;
+ }
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -3287,11 +3337,23 @@ static int __add_tree_block(struct reloc_control *rc,
if (ret < 0)
goto out;
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (ret > 0) {
- if (key.objectid == bytenr &&
- key.type == BTRFS_METADATA_ITEM_KEY)
- ret = 0;
+ if (ret > 0 && skinny) {
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ (key.type == BTRFS_METADATA_ITEM_KEY ||
+ (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == blocksize)))
+ ret = 0;
+ }
+
+ if (ret) {
+ skinny = false;
+ btrfs_release_path(path);
+ goto again;
+ }
}
BUG_ON(ret);
@@ -4160,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
(unsigned long long)rc->block_group->key.objectid,
(unsigned long long)rc->block_group->flags);
- ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+ ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
if (ret < 0) {
err = ret;
goto out;
}
- btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+ btrfs_wait_all_ordered_extents(fs_info, 0);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
@@ -4277,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.type != BTRFS_ROOT_ITEM_KEY)
break;
- reloc_root = btrfs_read_fs_root_no_radix(root, &key);
+ reloc_root = btrfs_read_fs_root(root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
@@ -4396,10 +4458,8 @@ out:
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_ordered_extent *ordered;
struct btrfs_root *root = BTRFS_I(inode)->root;
- size_t offset;
int ret;
u64 disk_bytenr;
LIST_HEAD(list);
@@ -4413,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
if (ret)
goto out;
+ disk_bytenr = ordered->start;
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
- sector_sum = sums->sums;
- sums->bytenr = ordered->start;
-
- offset = 0;
- while (offset < sums->len) {
- sector_sum->bytenr += ordered->start - disk_bytenr;
- sector_sum++;
- offset += root->sectorsize;
- }
+ sums->bytenr = disk_bytenr;
+ disk_bytenr += sums->len;
btrfs_add_ordered_sum(inode, ordered, sums);
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 5bf1ed57f178..ffb1036ef10d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot,
}
/*
- * lookup the root with the highest offset for a given objectid. The key we do
- * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
- * on error.
+ * btrfs_find_root - lookup the root by the key.
+ * root: the root of the root tree
+ * search_key: the key to search
+ * path: the path we search
+ * root_item: the root item of the tree we look for
+ * root_key: the reak key of the tree we look for
+ *
+ * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset
+ * of the search key, just lookup the root with the highest offset for a
+ * given objectid.
+ *
+ * If we find something return 0, otherwise > 0, < 0 on error.
*/
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
- struct btrfs_root_item *item, struct btrfs_key *key)
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key)
{
- struct btrfs_path *path;
- struct btrfs_key search_key;
struct btrfs_key found_key;
struct extent_buffer *l;
int ret;
int slot;
- search_key.objectid = objectid;
- search_key.type = BTRFS_ROOT_ITEM_KEY;
- search_key.offset = (u64)-1;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
- BUG_ON(ret == 0);
- if (path->slots[0] == 0) {
- ret = 1;
- goto out;
+ if (search_key->offset != -1ULL) { /* the search key is exact */
+ if (ret > 0)
+ goto out;
+ } else {
+ BUG_ON(ret == 0); /* Logical error */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ ret = 0;
}
+
l = path->nodes[0];
- slot = path->slots[0] - 1;
+ slot = path->slots[0];
+
btrfs_item_key_to_cpu(l, &found_key, slot);
- if (found_key.objectid != objectid ||
+ if (found_key.objectid != search_key->objectid ||
found_key.type != BTRFS_ROOT_ITEM_KEY) {
ret = 1;
goto out;
}
- if (item)
- btrfs_read_root_item(l, slot, item);
- if (key)
- memcpy(key, &found_key, sizeof(found_key));
- ret = 0;
+ if (root_item)
+ btrfs_read_root_item(l, slot, root_item);
+ if (root_key)
+ memcpy(root_key, &found_key, sizeof(found_key));
out:
- btrfs_free_path(path);
+ btrfs_release_path(path);
return ret;
}
@@ -212,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return btrfs_insert_item(trans, root, key, item, sizeof(*item));
}
-/*
- * at mount time we want to find all the old transaction snapshots that were in
- * the process of being deleted if we crashed. This is any root item with an
- * offset lower than the latest root. They need to be queued for deletion to
- * finish what was happening when we crashed.
- */
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
-{
- struct btrfs_root *dead_root;
- struct btrfs_root_item *ri;
- struct btrfs_key key;
- struct btrfs_key found_key;
- struct btrfs_path *path;
- int ret;
- u32 nritems;
- struct extent_buffer *leaf;
- int slot;
-
- key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
- key.offset = 0;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
-again:
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
- while (1) {
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- if (slot >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- }
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
- goto next;
-
- if (key.objectid < objectid)
- goto next;
-
- if (key.objectid > objectid)
- break;
-
- ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
- if (btrfs_disk_root_refs(leaf, ri) != 0)
- goto next;
-
- memcpy(&found_key, &key, sizeof(key));
- key.offset++;
- btrfs_release_path(path);
- dead_root =
- btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
- &found_key);
- if (IS_ERR(dead_root)) {
- ret = PTR_ERR(dead_root);
- goto err;
- }
-
- ret = btrfs_add_dead_root(dead_root);
- if (ret)
- goto err;
- goto again;
-next:
- slot++;
- path->slots[0]++;
- }
- ret = 0;
-err:
- btrfs_free_path(path);
- return ret;
-}
-
int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
{
struct extent_buffer *leaf;
@@ -301,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
struct btrfs_root *root;
int err = 0;
int ret;
+ bool can_recover = true;
+
+ if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
+ can_recover = false;
path = btrfs_alloc_path();
if (!path)
@@ -340,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid = key.offset;
key.offset++;
- root = btrfs_read_fs_root_no_name(tree_root->fs_info,
- &root_key);
- if (!IS_ERR(root))
+ root = btrfs_read_fs_root(tree_root, &root_key);
+ err = PTR_RET(root);
+ if (err && err != -ENOENT) {
+ break;
+ } else if (err == -ENOENT) {
+ struct btrfs_trans_handle *trans;
+
+ btrfs_release_path(path);
+
+ trans = btrfs_join_transaction(tree_root);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ btrfs_error(tree_root->fs_info, err,
+ "Failed to start trans to delete "
+ "orphan item");
+ break;
+ }
+ err = btrfs_del_orphan_item(trans, tree_root,
+ root_key.objectid);
+ btrfs_end_transaction(trans, tree_root);
+ if (err) {
+ btrfs_error(tree_root->fs_info, err,
+ "Failed to delete root orphan "
+ "item");
+ break;
+ }
continue;
+ }
- ret = PTR_ERR(root);
- if (ret != -ENOENT) {
- err = ret;
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ btrfs_add_dead_root(root);
+ continue;
+ }
+
+ err = btrfs_init_fs_root(root);
+ if (err) {
+ btrfs_free_fs_root(root);
break;
}
- ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
- if (ret) {
- err = ret;
+ root->orphan_item_inserted = 1;
+
+ err = btrfs_insert_fs_root(root->fs_info, root);
+ if (err) {
+ BUG_ON(err == -EEXIST);
+ btrfs_free_fs_root(root);
break;
}
}
@@ -368,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_path *path;
int ret;
- struct btrfs_root_item *ri;
- struct extent_buffer *leaf;
path = btrfs_alloc_path();
if (!path)
@@ -379,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
goto out;
BUG_ON(ret != 0);
- leaf = path->nodes[0];
- ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
ret = btrfs_del_item(trans, root, path);
out:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 79bd479317cb..64a157becbe5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
u8 *csum)
{
struct btrfs_ordered_sum *sum = NULL;
- int ret = 0;
- unsigned long i;
+ unsigned long index;
unsigned long num_sectors;
while (!list_empty(&sctx->csum_list)) {
@@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
if (!sum)
return 0;
+ index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
num_sectors = sum->len / sctx->sectorsize;
- for (i = 0; i < num_sectors; ++i) {
- if (sum->sums[i].bytenr == logical) {
- memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
- ret = 1;
- break;
- }
- }
- if (ret && i == num_sectors - 1) {
+ memcpy(csum, sum->sums + index, sctx->csum_size);
+ if (index == num_sectors - 1) {
list_del(&sum->list);
kfree(sum);
}
- return ret;
+ return 1;
}
/* scrub extent tries to collect up to 64 kB for each bio */
@@ -2501,10 +2495,11 @@ again:
ret = scrub_extent(sctx, extent_logical, extent_len,
extent_physical, extent_dev, flags,
generation, extent_mirror_num,
- extent_physical);
+ extent_logical - logical + physical);
if (ret)
goto out;
+ scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
logical += increment;
@@ -3204,16 +3199,18 @@ out:
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
{
- unsigned long index;
struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
- int ret = 0;
+ struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
struct btrfs_key key;
- struct inode *inode = NULL;
+ struct inode *inode;
+ struct page *page;
struct btrfs_root *local_root;
u64 physical_for_dev_replace;
u64 len;
- struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+ unsigned long index;
int srcu_index;
+ int ret;
+ int err;
key.objectid = root;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3227,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
return PTR_ERR(local_root);
}
+ if (btrfs_root_refs(&local_root->root_item) == 0) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ return -ENOENT;
+ }
+
key.type = BTRFS_INODE_ITEM_KEY;
key.objectid = inum;
key.offset = 0;
@@ -3235,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
if (IS_ERR(inode))
return PTR_ERR(inode);
+ /* Avoid truncate/dio/punch hole.. */
+ mutex_lock(&inode->i_mutex);
+ inode_dio_wait(inode);
+
+ ret = 0;
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
len = nocow_ctx->len;
while (len >= PAGE_CACHE_SIZE) {
- struct page *page = NULL;
- int ret_sub;
-
index = offset >> PAGE_CACHE_SHIFT;
-
+again:
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
if (!page) {
pr_err("find_or_create_page() failed\n");
ret = -ENOMEM;
- goto next_page;
+ goto out;
}
if (PageUptodate(page)) {
@@ -3255,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
goto next_page;
} else {
ClearPageError(page);
- ret_sub = extent_read_full_page(&BTRFS_I(inode)->
+ err = extent_read_full_page(&BTRFS_I(inode)->
io_tree,
page, btrfs_get_extent,
nocow_ctx->mirror_num);
- if (ret_sub) {
- ret = ret_sub;
+ if (err) {
+ ret = err;
goto next_page;
}
- wait_on_page_locked(page);
+
+ lock_page(page);
+ /*
+ * If the page has been remove from the page cache,
+ * the data on it is meaningless, because it may be
+ * old one, the new data may be written into the new
+ * page in the page cache.
+ */
+ if (page->mapping != inode->i_mapping) {
+ page_cache_release(page);
+ goto again;
+ }
if (!PageUptodate(page)) {
ret = -EIO;
goto next_page;
}
}
- ret_sub = write_page_nocow(nocow_ctx->sctx,
- physical_for_dev_replace, page);
- if (ret_sub) {
- ret = ret_sub;
- goto next_page;
- }
-
+ err = write_page_nocow(nocow_ctx->sctx,
+ physical_for_dev_replace, page);
+ if (err)
+ ret = err;
next_page:
- if (page) {
- unlock_page(page);
- put_page(page);
- }
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (ret)
+ break;
+
offset += PAGE_CACHE_SIZE;
physical_for_dev_replace += PAGE_CACHE_SIZE;
len -= PAGE_CACHE_SIZE;
}
-
- if (inode)
- iput(inode);
+out:
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
return ret;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ff40f1c00ce3..d3f3b43cae0b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
}
}
-static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
+static struct fs_path *fs_path_alloc(void)
{
struct fs_path *p;
@@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
return p;
}
-static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
+static struct fs_path *fs_path_alloc_reversed(void)
{
struct fs_path *p;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return NULL;
p->reversed = 1;
@@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
return p;
}
-static void fs_path_free(struct send_ctx *sctx, struct fs_path *p)
+static void fs_path_free(struct fs_path *p)
{
if (!p)
return;
@@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
*
* path must point to the INODE_REF or INODE_EXTREF when called.
*/
-static int iterate_inode_ref(struct send_ctx *sctx,
- struct btrfs_root *root, struct btrfs_path *path,
+static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *found_key, int resolve,
iterate_inode_ref_t iterate, void *ctx)
{
@@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
unsigned long elem_size;
unsigned long ptr;
- p = fs_path_alloc_reversed(sctx);
+ p = fs_path_alloc_reversed();
if (!p)
return -ENOMEM;
tmp_path = alloc_path_for_send();
if (!tmp_path) {
- fs_path_free(sctx, p);
+ fs_path_free(p);
return -ENOMEM;
}
@@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
out:
btrfs_free_path(tmp_path);
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
*
* path must point to the dir item when called.
*/
-static int iterate_dir_item(struct send_ctx *sctx,
- struct btrfs_root *root, struct btrfs_path *path,
+static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *found_key,
iterate_dir_item_t iterate, void *ctx)
{
@@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
* Retrieve the first path of an inode. If an inode has more then one
* ref/hardlink, this is ignored.
*/
-static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
+static int get_inode_path(struct btrfs_root *root,
u64 ino, struct fs_path *path)
{
int ret;
@@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
goto out;
}
- ret = iterate_inode_ref(sctx, root, p, &found_key, 1,
- __copy_first_ref, path);
+ ret = iterate_inode_ref(root, p, &found_key, 1,
+ __copy_first_ref, path);
if (ret < 0)
goto out;
ret = 0;
@@ -1314,8 +1312,7 @@ out:
return ret;
}
-static int read_symlink(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int read_symlink(struct btrfs_root *root,
u64 ino,
struct fs_path *dest)
{
@@ -1562,8 +1559,7 @@ out:
* Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
* generation of the parent dir and the name of the dir entry.
*/
-static int get_first_ref(struct send_ctx *sctx,
- struct btrfs_root *root, u64 ino,
+static int get_first_ref(struct btrfs_root *root, u64 ino,
u64 *dir, u64 *dir_gen, struct fs_path *name)
{
int ret;
@@ -1628,8 +1624,7 @@ out:
return ret;
}
-static int is_first_ref(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int is_first_ref(struct btrfs_root *root,
u64 ino, u64 dir,
const char *name, int name_len)
{
@@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
u64 tmp_dir;
u64 tmp_dir_gen;
- tmp_name = fs_path_alloc(sctx);
+ tmp_name = fs_path_alloc();
if (!tmp_name)
return -ENOMEM;
- ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
+ ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
if (ret < 0)
goto out;
@@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
ret = !memcmp(tmp_name->start, name, name_len);
out:
- fs_path_free(sctx, tmp_name);
+ fs_path_free(tmp_name);
return ret;
}
@@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
if (!sctx->parent_root)
goto out;
- name = fs_path_alloc(sctx);
+ name = fs_path_alloc();
if (!name)
return -ENOMEM;
- ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name);
+ ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
if (ret < 0)
goto out;
@@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
name->start, fs_path_len(name));
out:
- fs_path_free(sctx, name);
+ fs_path_free(name);
return ret;
}
@@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
* send_root or parent_root for ref lookup.
*/
if (ino < sctx->send_progress)
- ret = get_first_ref(sctx, sctx->send_root, ino,
- parent_ino, parent_gen, dest);
+ ret = get_first_ref(sctx->send_root, ino,
+ parent_ino, parent_gen, dest);
else
- ret = get_first_ref(sctx, sctx->parent_root, ino,
- parent_ino, parent_gen, dest);
+ ret = get_first_ref(sctx->parent_root, ino,
+ parent_ino, parent_gen, dest);
if (ret < 0)
goto out;
@@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
u64 parent_gen = 0;
int stop = 0;
- name = fs_path_alloc(sctx);
+ name = fs_path_alloc();
if (!name) {
ret = -ENOMEM;
goto out;
@@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
}
out:
- fs_path_free(sctx, name);
+ fs_path_free(name);
if (!ret)
fs_path_unreverse(dest);
return ret;
@@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
verbose_printk("btrfs: send_utimes %llu\n", ino);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
btrfs_free_path(path);
return ret;
}
@@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
verbose_printk("btrfs: send_create_inode %llu\n", ino);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
if (S_ISLNK(mode)) {
fs_path_reset(p);
- ret = read_symlink(sctx, sctx->send_root, ino, p);
+ ret = read_symlink(sctx->send_root, ino, p);
if (ret < 0)
goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
return 0;
}
-static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
+static void __free_recorded_refs(struct list_head *head)
{
struct recorded_ref *cur;
while (!list_empty(head)) {
cur = list_entry(head->next, struct recorded_ref, list);
- fs_path_free(sctx, cur->full_path);
+ fs_path_free(cur->full_path);
list_del(&cur->list);
kfree(cur);
}
@@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
static void free_recorded_refs(struct send_ctx *sctx)
{
- __free_recorded_refs(sctx, &sctx->new_refs);
- __free_recorded_refs(sctx, &sctx->deleted_refs);
+ __free_recorded_refs(&sctx->new_refs);
+ __free_recorded_refs(&sctx->deleted_refs);
}
/*
@@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
int ret;
struct fs_path *orphan;
- orphan = fs_path_alloc(sctx);
+ orphan = fs_path_alloc();
if (!orphan)
return -ENOMEM;
@@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
ret = send_rename(sctx, path, orphan);
out:
- fs_path_free(sctx, orphan);
+ fs_path_free(orphan);
return ret;
}
@@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
*/
BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
- valid_path = fs_path_alloc(sctx);
+ valid_path = fs_path_alloc();
if (!valid_path) {
ret = -ENOMEM;
goto out;
@@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
- ret = is_first_ref(sctx, sctx->parent_root,
- ow_inode, cur->dir, cur->name,
- cur->name_len);
+ ret = is_first_ref(sctx->parent_root,
+ ow_inode, cur->dir, cur->name,
+ cur->name_len);
if (ret < 0)
goto out;
if (ret) {
@@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
out:
free_recorded_refs(sctx);
ulist_free(check_dirs);
- fs_path_free(sctx, valid_path);
+ fs_path_free(valid_path);
return ret;
}
@@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
struct fs_path *p;
u64 gen;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
out:
if (ret)
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
struct fs_path *p;
u64 gen;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
out:
if (ret)
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, __record_new_ref, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, __record_new_ref, sctx);
if (ret < 0)
goto out;
ret = 0;
@@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, __record_deleted_ref, sctx);
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, __record_deleted_ref, sctx);
if (ret < 0)
goto out;
ret = 0;
@@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
return 0;
}
-static int find_iref(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int find_iref(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
u64 dir, struct fs_path *name)
@@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
ctx.name = name;
ctx.found_idx = -1;
- ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx);
+ ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
if (ret < 0)
return ret;
@@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
int ret;
struct send_ctx *sctx = ctx;
- ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
+ ret = find_iref(sctx->parent_root, sctx->right_path,
sctx->cmp_key, dir, name);
if (ret == -ENOENT)
ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
int ret;
struct send_ctx *sctx = ctx;
- ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
+ ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
dir, name);
if (ret == -ENOENT)
ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
{
int ret = 0;
- ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
sctx->cmp_key, 0, __record_changed_new_ref, sctx);
if (ret < 0)
goto out;
- ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
if (ret < 0)
goto out;
@@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
found_key.type != BTRFS_INODE_EXTREF_KEY))
break;
- ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
- sctx);
+ ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
btrfs_release_path(path);
if (ret < 0)
goto out;
@@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
struct fs_path *p;
posix_acl_xattr_header dummy_acl;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
struct send_ctx *sctx = ctx;
struct fs_path *p;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
ret = send_remove_xattr(sctx, p, name, name_len);
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
{
int ret = 0;
- ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
- sctx->cmp_key, __process_new_xattr, sctx);
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, __process_new_xattr, sctx);
return ret;
}
@@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
{
int ret;
- ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
- sctx->cmp_key, __process_deleted_xattr, sctx);
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, __process_deleted_xattr, sctx);
return ret;
}
@@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
strncmp(name, ctx->name, name_len) == 0) {
ctx->found_idx = num;
ctx->found_data_len = data_len;
- ctx->found_data = kmalloc(data_len, GFP_NOFS);
+ ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
if (!ctx->found_data)
return -ENOMEM;
- memcpy(ctx->found_data, data, data_len);
return 1;
}
return 0;
}
-static int find_xattr(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int find_xattr(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
const char *name, int name_len,
@@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
ctx.found_data = NULL;
ctx.found_data_len = 0;
- ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx);
+ ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
if (ret < 0)
return ret;
@@ -3480,9 +3471,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
char *found_data = NULL;
int found_data_len = 0;
- ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
- sctx->cmp_key, name, name_len, &found_data,
- &found_data_len);
+ ret = find_xattr(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, name, name_len, &found_data,
+ &found_data_len);
if (ret == -ENOENT) {
ret = __process_new_xattr(num, di_key, name, name_len, data,
data_len, type, ctx);
@@ -3508,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
int ret;
struct send_ctx *sctx = ctx;
- ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
- name, name_len, NULL, NULL);
+ ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ name, name_len, NULL, NULL);
if (ret == -ENOENT)
ret = __process_deleted_xattr(num, di_key, name, name_len, data,
data_len, type, ctx);
@@ -3523,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
{
int ret = 0;
- ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
sctx->cmp_key, __process_changed_new_xattr, sctx);
if (ret < 0)
goto out;
- ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
sctx->cmp_key, __process_changed_deleted_xattr, sctx);
out:
@@ -3572,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
goto out;
}
- ret = iterate_dir_item(sctx, root, path, &found_key,
- __process_new_xattr, sctx);
+ ret = iterate_dir_item(root, path, &found_key,
+ __process_new_xattr, sctx);
if (ret < 0)
goto out;
@@ -3598,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
int num_read = 0;
mm_segment_t old_fs;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3640,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
set_fs(old_fs);
if (ret < 0)
return ret;
@@ -3663,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
clone_root->root->objectid, clone_root->ino,
clone_root->offset);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3686,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
goto out;
ret = get_cur_path(sctx, clone_root->ino, gen, p);
} else {
- ret = get_inode_path(sctx, clone_root->root,
- clone_root->ino, p);
+ ret = get_inode_path(clone_root->root, clone_root->ino, p);
}
if (ret < 0)
goto out;
@@ -3704,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3717,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx,
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3737,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx,
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
send_root = BTRFS_I(file_inode(mnt_file))->root;
fs_info = send_root->fs_info;
+ /*
+ * This is done when we lookup the root, it should already be complete
+ * by the time we get here.
+ */
+ WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
+
+ /*
+ * If we just created this root we need to make sure that the orphan
+ * cleanup has been done and committed since we search the commit root,
+ * so check its commit root transid with our otransid and if they match
+ * commit the transaction to make sure everything is updated.
+ */
+ down_read(&send_root->fs_info->extent_commit_sem);
+ if (btrfs_header_generation(send_root->commit_root) ==
+ btrfs_root_otransid(&send_root->root_item)) {
+ struct btrfs_trans_handle *trans;
+
+ up_read(&send_root->fs_info->extent_commit_sem);
+
+ trans = btrfs_attach_transaction_barrier(send_root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ /* ENOENT means theres no transaction */
+ } else {
+ ret = btrfs_commit_transaction(trans, send_root);
+ if (ret)
+ goto out;
+ }
+ } else {
+ up_read(&send_root->fs_info->extent_commit_sem);
+ }
+
arg = memdup_user(arg_, sizeof(*arg));
if (IS_ERR(arg)) {
ret = PTR_ERR(arg);
@@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (!clone_root) {
- ret = -EINVAL;
- goto out;
- }
if (IS_ERR(clone_root)) {
ret = PTR_ERR(clone_root);
goto out;
@@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (!sctx->parent_root) {
- ret = -EINVAL;
+ if (IS_ERR(sctx->parent_root)) {
+ ret = PTR_ERR(sctx->parent_root);
goto out;
}
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f0857e092a3c..8eb6191d86da 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,6 @@
#include "print-tree.h"
#include "xattr.h"
#include "volumes.h"
-#include "version.h"
#include "export.h"
#include "compression.h"
#include "rcu-string.h"
@@ -266,6 +265,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
return;
}
ACCESS_ONCE(trans->transaction->aborted) = errno;
+ /* Wake up anybody who may be waiting on this transaction */
+ wake_up(&root->fs_info->transaction_wait);
+ wake_up(&root->fs_info->transaction_blocked_wait);
__btrfs_std_error(root->fs_info, function, line, errno, NULL);
}
/*
@@ -776,9 +778,6 @@ find_root:
if (IS_ERR(new_root))
return ERR_CAST(new_root);
- if (btrfs_root_refs(&new_root->root_item) == 0)
- return ERR_PTR(-ENOENT);
-
dir_id = btrfs_root_dirid(&new_root->root_item);
setup_root:
location.objectid = dir_id;
@@ -866,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_ordered_extents(root, 1);
+ btrfs_wait_all_ordered_extents(fs_info, 1);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1685,6 +1684,18 @@ static void btrfs_interface_exit(void)
printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
}
+static void btrfs_print_info(void)
+{
+ printk(KERN_INFO "Btrfs loaded"
+#ifdef CONFIG_BTRFS_DEBUG
+ ", debug=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ ", integrity-checker=on"
+#endif
+ "\n");
+}
+
static int __init init_btrfs_fs(void)
{
int err;
@@ -1733,11 +1744,9 @@ static int __init init_btrfs_fs(void)
btrfs_init_lockdep();
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ btrfs_print_info();
btrfs_test_free_space_cache();
-#endif
- printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
return 0;
unregister_ioctl:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0544587d74f4..d58cce77fc6c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,12 +34,43 @@
#define BTRFS_ROOT_TRANS_TAG 0
+static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+ [TRANS_STATE_RUNNING] = 0U,
+ [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
+ __TRANS_START),
+ [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH),
+ [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN),
+ [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK),
+ [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK),
+};
+
static void put_transaction(struct btrfs_transaction *transaction)
{
WARN_ON(atomic_read(&transaction->use_count) == 0);
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(transaction->delayed_refs.root.rb_node);
+ while (!list_empty(&transaction->pending_chunks)) {
+ struct extent_map *em;
+
+ em = list_first_entry(&transaction->pending_chunks,
+ struct extent_map, list);
+ list_del_init(&em->list);
+ free_extent_map(em);
+ }
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
@@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
root->commit_root = btrfs_root_node(root);
}
-static inline int can_join_transaction(struct btrfs_transaction *trans,
- int type)
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_inc(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_dec(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
- return !(trans->in_commit &&
- type != TRANS_JOIN &&
- type != TRANS_JOIN_NOLOCK);
+ return atomic_read(&trans->num_extwriters);
}
/*
* either allocate a new transaction or hop into the existing one
*/
-static noinline int join_transaction(struct btrfs_root *root, int type)
+static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
{
struct btrfs_transaction *cur_trans;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -74,32 +122,19 @@ loop:
return -EROFS;
}
- if (fs_info->trans_no_join) {
- /*
- * If we are JOIN_NOLOCK we're already committing a current
- * transaction, we just need a handle to deal with something
- * when committing the transaction, such as inode cache and
- * space cache. It is a special case.
- */
- if (type != TRANS_JOIN_NOLOCK) {
- spin_unlock(&fs_info->trans_lock);
- return -EBUSY;
- }
- }
-
cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (cur_trans->aborted) {
spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
- if (!can_join_transaction(cur_trans, type)) {
+ if (btrfs_blocked_trans_types[cur_trans->state] & type) {
spin_unlock(&fs_info->trans_lock);
return -EBUSY;
}
atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
- cur_trans->num_joined++;
+ extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
return 0;
}
@@ -112,6 +147,12 @@ loop:
if (type == TRANS_ATTACH)
return -ENOENT;
+ /*
+ * JOIN_NOLOCK only happens during the transaction commit, so
+ * it is impossible that ->running_transaction is NULL
+ */
+ BUG_ON(type == TRANS_JOIN_NOLOCK);
+
cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
@@ -120,7 +161,7 @@ loop:
if (fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
- * to redo the trans_no_join checks above
+ * to redo the checks above
*/
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
goto loop;
@@ -131,17 +172,15 @@ loop:
}
atomic_set(&cur_trans->num_writers, 1);
- cur_trans->num_joined = 0;
+ extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
- cur_trans->in_commit = 0;
- cur_trans->blocked = 0;
+ cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
- cur_trans->commit_done = 0;
cur_trans->start_time = get_seconds();
cur_trans->delayed_refs.root = RB_ROOT;
@@ -164,7 +203,6 @@ loop:
"creating a fresh transaction\n");
atomic64_set(&fs_info->tree_mod_seq, 0);
- spin_lock_init(&cur_trans->commit_lock);
spin_lock_init(&cur_trans->delayed_refs.lock);
atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
@@ -172,6 +210,7 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->ordered_operations);
+ INIT_LIST_HEAD(&cur_trans->pending_chunks);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
}
+static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+{
+ return (trans->state >= TRANS_STATE_BLOCKED &&
+ trans->state < TRANS_STATE_UNBLOCKED &&
+ !trans->aborted);
+}
+
/* wait for commit against the current transaction to become unblocked
* when this is done, it is safe to start a new transaction, but the current
* transaction might not be fully on disk.
@@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
spin_lock(&root->fs_info->trans_lock);
cur_trans = root->fs_info->running_transaction;
- if (cur_trans && cur_trans->blocked) {
+ if (cur_trans && is_transaction_blocked(cur_trans)) {
atomic_inc(&cur_trans->use_count);
spin_unlock(&root->fs_info->trans_lock);
wait_event(root->fs_info->transaction_wait,
- !cur_trans->blocked);
+ cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+ cur_trans->aborted);
put_transaction(cur_trans);
} else {
spin_unlock(&root->fs_info->trans_lock);
@@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
}
static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, int type,
+start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_trans_handle *h;
@@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
return ERR_PTR(-EROFS);
if (current->journal_info) {
- WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+ WARN_ON(type & TRANS_EXTWRITERS);
h = current->journal_info;
h->use_count++;
WARN_ON(h->use_count > 2);
@@ -366,7 +413,7 @@ again:
* If we are ATTACH, it means we just want to catch the current
* transaction and commit it, so we needn't do sb_start_intwrite().
*/
- if (type < TRANS_JOIN_NOLOCK)
+ if (type & __TRANS_FREEZABLE)
sb_start_intwrite(root->fs_info->sb);
if (may_wait_transaction(root, type))
@@ -408,7 +455,8 @@ again:
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
- if (cur_trans->blocked && may_wait_transaction(root, type)) {
+ if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+ may_wait_transaction(root, type)) {
btrfs_commit_transaction(h, root);
goto again;
}
@@ -429,7 +477,7 @@ got_it:
return h;
join_fail:
- if (type < TRANS_JOIN_NOLOCK)
+ if (type & __TRANS_FREEZABLE)
sb_end_intwrite(root->fs_info->sb);
kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
@@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
}
/*
- * btrfs_attach_transaction() - catch the running transaction
+ * btrfs_attach_transaction_barrier() - catch the running transaction
*
* It is similar to the above function, the differentia is this one
* will wait for all the inactive transactions until they fully
@@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
static noinline void wait_for_commit(struct btrfs_root *root,
struct btrfs_transaction *commit)
{
- wait_event(commit->commit_wait, commit->commit_done);
+ wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
}
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
spin_lock(&root->fs_info->trans_lock);
list_for_each_entry_reverse(t, &root->fs_info->trans_list,
list) {
- if (t->in_commit) {
- if (t->commit_done)
+ if (t->state >= TRANS_STATE_COMMIT_START) {
+ if (t->state == TRANS_STATE_COMPLETED)
break;
cur_trans = t;
atomic_inc(&cur_trans->use_count);
@@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
static int should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- int ret;
+ if (root->fs_info->global_block_rsv.space_info->full &&
+ btrfs_should_throttle_delayed_refs(trans, root))
+ return 1;
- ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
- return ret ? 1 : 0;
+ return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
}
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
int err;
smp_mb();
- if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+ if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+ cur_trans->delayed_refs.flushing)
return 1;
updates = trans->delayed_ref_updates;
@@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *info = root->fs_info;
- int count = 0;
+ unsigned long cur = trans->delayed_ref_updates;
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
@@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- while (count < 1) {
- unsigned long cur = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (btrfs_should_throttle_delayed_refs(trans, root)) {
+ cur = max_t(unsigned long, cur, 1);
trans->delayed_ref_updates = 0;
- if (cur &&
- trans->transaction->delayed_refs.num_heads_ready > 64) {
- trans->delayed_ref_updates = 0;
- btrfs_run_delayed_refs(trans, root, cur);
- } else {
- break;
- }
- count++;
+ btrfs_run_delayed_refs(trans, root, cur);
}
btrfs_trans_release_metadata(trans, root);
@@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_create_pending_block_groups(trans, root);
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
- should_end_transaction(trans, root)) {
- trans->transaction->blocked = 1;
- smp_wmb();
+ should_end_transaction(trans, root) &&
+ ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
+ spin_lock(&info->trans_lock);
+ if (cur_trans->state == TRANS_STATE_RUNNING)
+ cur_trans->state = TRANS_STATE_BLOCKED;
+ spin_unlock(&info->trans_lock);
}
- if (lock && cur_trans->blocked && !cur_trans->in_commit) {
+ if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
if (throttle) {
/*
* We may race with somebody else here so end up having
@@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
}
- if (trans->type < TRANS_JOIN_NOLOCK)
+ if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(root->fs_info->sb);
WARN_ON(cur_trans != info->running_transaction);
WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
atomic_dec(&cur_trans->num_writers);
+ extwriter_counter_dec(cur_trans, trans->type);
smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
@@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
- struct blk_plug plug;
- blk_start_plug(&plug);
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
}
if (err)
werr = err;
- blk_finish_plug(&plug);
return werr;
}
@@ -797,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
{
int ret;
int ret2;
+ struct blk_plug plug;
+ blk_start_plug(&plug);
ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+ blk_finish_plug(&plug);
ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
if (ret)
@@ -1318,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
+ struct btrfs_transaction *trans;
int ret = 0;
+
spin_lock(&info->trans_lock);
- if (info->running_transaction)
- ret = info->running_transaction->in_commit;
+ trans = info->running_transaction;
+ if (trans)
+ ret = (trans->state >= TRANS_STATE_COMMIT_START);
spin_unlock(&info->trans_lock);
return ret;
}
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
+ struct btrfs_transaction *trans;
int ret = 0;
+
spin_lock(&info->trans_lock);
- if (info->running_transaction)
- ret = info->running_transaction->blocked;
+ trans = info->running_transaction;
+ if (trans)
+ ret = is_transaction_blocked(trans);
spin_unlock(&info->trans_lock);
return ret;
}
@@ -1343,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
static void wait_current_trans_commit_start(struct btrfs_root *root,
struct btrfs_transaction *trans)
{
- wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
+ wait_event(root->fs_info->transaction_blocked_wait,
+ trans->state >= TRANS_STATE_COMMIT_START ||
+ trans->aborted);
}
/*
@@ -1354,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
struct btrfs_transaction *trans)
{
wait_event(root->fs_info->transaction_wait,
- trans->commit_done || (trans->in_commit && !trans->blocked));
+ trans->state >= TRANS_STATE_UNBLOCKED ||
+ trans->aborted);
}
/*
@@ -1450,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->trans_lock);
- if (list_empty(&cur_trans->list)) {
- spin_unlock(&root->fs_info->trans_lock);
- btrfs_end_transaction(trans, root);
- return;
- }
+ /*
+ * If the transaction is removed from the list, it means this
+ * transaction has been committed successfully, so it is impossible
+ * to call the cleanup function.
+ */
+ BUG_ON(list_empty(&cur_trans->list));
list_del_init(&cur_trans->list);
if (cur_trans == root->fs_info->running_transaction) {
- root->fs_info->trans_no_join = 1;
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&root->fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
spin_lock(&root->fs_info->trans_lock);
- root->fs_info->running_transaction = NULL;
}
spin_unlock(&root->fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, root);
+ spin_lock(&root->fs_info->trans_lock);
+ if (cur_trans == root->fs_info->running_transaction)
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->trans_lock);
+
put_transaction(cur_trans);
put_transaction(cur_trans);
@@ -1481,33 +1543,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
current->journal_info = NULL;
kmem_cache_free(btrfs_trans_handle_cachep, trans);
-
- spin_lock(&root->fs_info->trans_lock);
- root->fs_info->trans_no_join = 0;
- spin_unlock(&root->fs_info->trans_lock);
}
static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
- int snap_pending = 0;
int ret;
- if (!flush_on_commit) {
- spin_lock(&root->fs_info->trans_lock);
- if (!list_empty(&trans->transaction->pending_snapshots))
- snap_pending = 1;
- spin_unlock(&root->fs_info->trans_lock);
- }
-
- if (flush_on_commit || snap_pending) {
- ret = btrfs_start_delalloc_inodes(root, 1);
- if (ret)
- return ret;
- btrfs_wait_ordered_extents(root, 1);
- }
-
ret = btrfs_run_delayed_items(trans, root);
if (ret)
return ret;
@@ -1531,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
return ret;
}
-/*
- * btrfs_transaction state sequence:
- * in_commit = 0, blocked = 0 (initial)
- * in_commit = 1, blocked = 1
- * blocked = 0
- * commit_done = 1
- */
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ return btrfs_start_all_delalloc_inodes(fs_info, 1);
+ return 0;
+}
+
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ btrfs_wait_all_ordered_extents(fs_info, 1);
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- unsigned long joined = 0;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
- DEFINE_WAIT(wait);
int ret;
- int should_grow = 0;
- unsigned long now = get_seconds();
ret = btrfs_run_ordered_operations(trans, root, 0);
if (ret) {
@@ -1586,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* start sending their work down.
*/
cur_trans->delayed_refs.flushing = 1;
+ smp_wmb();
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
@@ -1596,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- spin_lock(&cur_trans->commit_lock);
- if (cur_trans->in_commit) {
- spin_unlock(&cur_trans->commit_lock);
+ spin_lock(&root->fs_info->trans_lock);
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
ret = btrfs_end_transaction(trans, root);
@@ -1609,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- trans->transaction->in_commit = 1;
- trans->transaction->blocked = 1;
- spin_unlock(&cur_trans->commit_lock);
+ cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&root->fs_info->transaction_blocked_wait);
- spin_lock(&root->fs_info->trans_lock);
if (cur_trans->list.prev != &root->fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
- if (!prev_trans->commit_done) {
+ if (prev_trans->state != TRANS_STATE_COMPLETED) {
atomic_inc(&prev_trans->use_count);
spin_unlock(&root->fs_info->trans_lock);
@@ -1632,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
spin_unlock(&root->fs_info->trans_lock);
}
- if (!btrfs_test_opt(root, SSD) &&
- (now < cur_trans->start_time || now - cur_trans->start_time < 1))
- should_grow = 1;
-
- do {
- joined = cur_trans->num_joined;
-
- WARN_ON(cur_trans != trans->transaction);
-
- ret = btrfs_flush_all_pending_stuffs(trans, root);
- if (ret)
- goto cleanup_transaction;
+ extwriter_counter_dec(cur_trans, trans->type);
- prepare_to_wait(&cur_trans->writer_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ ret = btrfs_start_delalloc_flush(root->fs_info);
+ if (ret)
+ goto cleanup_transaction;
- if (atomic_read(&cur_trans->num_writers) > 1)
- schedule_timeout(MAX_SCHEDULE_TIMEOUT);
- else if (should_grow)
- schedule_timeout(1);
+ ret = btrfs_flush_all_pending_stuffs(trans, root);
+ if (ret)
+ goto cleanup_transaction;
- finish_wait(&cur_trans->writer_wait, &wait);
- } while (atomic_read(&cur_trans->num_writers) > 1 ||
- (should_grow && cur_trans->num_joined != joined));
+ wait_event(cur_trans->writer_wait,
+ extwriter_counter_read(cur_trans) == 0);
+ /* some pending stuffs might be added after the previous flush. */
ret = btrfs_flush_all_pending_stuffs(trans, root);
if (ret)
goto cleanup_transaction;
+ btrfs_wait_delalloc_flush(root->fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
* commit the transaction. We could have started a join before setting
- * no_join so make sure to wait for num_writers to == 1 again.
+ * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
*/
spin_lock(&root->fs_info->trans_lock);
- root->fs_info->trans_no_join = 1;
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&root->fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
@@ -1794,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
sizeof(*root->fs_info->super_copy));
- trans->transaction->blocked = 0;
spin_lock(&root->fs_info->trans_lock);
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
- root->fs_info->trans_no_join = 0;
spin_unlock(&root->fs_info->trans_lock);
mutex_unlock(&root->fs_info->reloc_mutex);
@@ -1825,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
- cur_trans->commit_done = 1;
-
root->fs_info->last_trans_committed = cur_trans->transid;
-
+ /*
+ * We needn't acquire the lock here because there is no other task
+ * which can change it.
+ */
+ cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
spin_lock(&root->fs_info->trans_lock);
@@ -1838,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
put_transaction(cur_trans);
put_transaction(cur_trans);
- if (trans->type < TRANS_JOIN_NOLOCK)
+ if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(root->fs_info->sb);
trace_btrfs_transaction_commit(root);
@@ -1885,11 +1918,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
- if (fs_info->sb->s_flags & MS_RDONLY) {
- pr_debug("btrfs: cleaner called for RO fs!\n");
- return 0;
- }
-
spin_lock(&fs_info->trans_lock);
if (list_empty(&fs_info->dead_roots)) {
spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 24c97335a59f..005b0375d18c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -22,21 +22,33 @@
#include "delayed-ref.h"
#include "ctree.h"
+enum btrfs_trans_state {
+ TRANS_STATE_RUNNING = 0,
+ TRANS_STATE_BLOCKED = 1,
+ TRANS_STATE_COMMIT_START = 2,
+ TRANS_STATE_COMMIT_DOING = 3,
+ TRANS_STATE_UNBLOCKED = 4,
+ TRANS_STATE_COMPLETED = 5,
+ TRANS_STATE_MAX = 6,
+};
+
struct btrfs_transaction {
u64 transid;
/*
+ * total external writers(USERSPACE/START/ATTACH) in this
+ * transaction, it must be zero before the transaction is
+ * being committed
+ */
+ atomic_t num_extwriters;
+ /*
* total writers in this transaction, it must be zero before the
* transaction can end
*/
atomic_t num_writers;
atomic_t use_count;
- unsigned long num_joined;
-
- spinlock_t commit_lock;
- int in_commit;
- int commit_done;
- int blocked;
+ /* Be protected by fs_info->trans_lock when we want to change it. */
+ enum btrfs_trans_state state;
struct list_head list;
struct extent_io_tree dirty_pages;
unsigned long start_time;
@@ -44,17 +56,27 @@ struct btrfs_transaction {
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
struct list_head ordered_operations;
+ struct list_head pending_chunks;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
-enum btrfs_trans_type {
- TRANS_START,
- TRANS_JOIN,
- TRANS_USERSPACE,
- TRANS_JOIN_NOLOCK,
- TRANS_ATTACH,
-};
+#define __TRANS_FREEZABLE (1U << 0)
+
+#define __TRANS_USERSPACE (1U << 8)
+#define __TRANS_START (1U << 9)
+#define __TRANS_ATTACH (1U << 10)
+#define __TRANS_JOIN (1U << 11)
+#define __TRANS_JOIN_NOLOCK (1U << 12)
+
+#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
+#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
+#define TRANS_ATTACH (__TRANS_ATTACH)
+#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
+#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
+
+#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
+ __TRANS_ATTACH)
struct btrfs_trans_handle {
u64 transid;
@@ -70,7 +92,7 @@ struct btrfs_trans_handle {
short aborted;
short adding_csums;
bool allocating_chunk;
- enum btrfs_trans_type type;
+ unsigned int type;
/*
* this root is only needed to validate that the root passed to
* start_transaction is the same as the one passed to end_transaction.
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c276ac9a0ec3..2c6791493637 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "transaction.h"
@@ -279,11 +280,23 @@ static int process_one_buffer(struct btrfs_root *log,
{
int ret = 0;
+ /*
+ * If this fs is mixed then we need to be able to process the leaves to
+ * pin down any logged extents, so we have to read the block.
+ */
+ if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
+ ret = btrfs_read_buffer(eb, gen);
+ if (ret)
+ return ret;
+ }
+
if (wc->pin)
ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
eb->start, eb->len);
if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
+ if (wc->pin && btrfs_header_level(eb) == 0)
+ ret = btrfs_exclude_logged_extents(log, eb);
if (wc->write)
btrfs_write_tree_block(eb);
if (wc->wait)
@@ -2016,13 +2029,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
eb, i, &key);
if (ret)
break;
- } else if (key.type == BTRFS_INODE_REF_KEY) {
- ret = add_inode_ref(wc->trans, root, log, path,
- eb, i, &key);
- if (ret && ret != -ENOENT)
- break;
- ret = 0;
- } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ } else if (key.type == BTRFS_INODE_REF_KEY ||
+ key.type == BTRFS_INODE_EXTREF_KEY) {
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
if (ret && ret != -ENOENT)
@@ -2358,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
unsigned long log_transid = 0;
+ struct blk_plug plug;
mutex_lock(&root->log_mutex);
log_transid = root->log_transid;
@@ -2401,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
*/
+ blk_start_plug(&plug);
ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
if (ret) {
+ blk_finish_plug(&plug);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
@@ -2437,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (ret) {
+ blk_finish_plug(&plug);
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, root, ret);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2452,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
+ blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
@@ -2474,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* check the full commit flag again
*/
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2481,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
- ret = btrfs_write_and_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ ret = btrfs_write_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ blk_finish_plug(&plug);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
@@ -2491,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_NEW | EXTENT_DIRTY);
btrfs_wait_logged_extents(log, log_transid);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -4016,8 +4034,7 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_fs_root_no_radix(log_root_tree,
- &found_key);
+ log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
btrfs_error(fs_info, ret,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 7b417e20efe2..b0a523b2c60e 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 new_alloced = ulist->nodes_alloced + 128;
struct ulist_node *new_nodes;
void *old = NULL;
+ int i;
+
+ for (i = 0; i < ulist->nnodes; i++)
+ rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
/*
* if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
ulist->nodes = new_nodes;
ulist->nodes_alloced = new_alloced;
+
+ /*
+ * krealloc actually uses memcpy, which does not copy rb_node
+ * pointers, so we have to do it ourselves. Otherwise we may
+ * be bitten by crashes.
+ */
+ for (i = 0; i < ulist->nnodes; i++) {
+ ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
+ if (ret < 0)
+ return ret;
+ }
}
ulist->nodes[ulist->nnodes].val = val;
ulist->nodes[ulist->nnodes].aux = aux;
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef2..000000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __BTRFS_VERSION_H
-#define __BTRFS_VERSION_H
-#define BTRFS_BUILD_VERSION "Btrfs"
-#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bffb9174afb..78b871753cb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -982,6 +982,35 @@ out:
return ret;
}
+static int contains_pending_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 *start, u64 len)
+{
+ struct extent_map *em;
+ int ret = 0;
+
+ list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+ struct map_lookup *map;
+ int i;
+
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev != device)
+ continue;
+ if (map->stripes[i].physical >= *start + len ||
+ map->stripes[i].physical + em->orig_block_len <=
+ *start)
+ continue;
+ *start = map->stripes[i].physical +
+ em->orig_block_len;
+ ret = 1;
+ }
+ }
+
+ return ret;
+}
+
+
/*
* find_free_dev_extent - find free space in the specified device
* @device: the device which we search the free space in
@@ -1002,7 +1031,8 @@ out:
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
struct btrfs_key key;
@@ -1026,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
*/
search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
max_hole_start = search_start;
max_hole_size = 0;
hole_size = 0;
if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
ret = -ENOSPC;
- goto error;
+ goto out;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto error;
- }
path->reada = 2;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
key.objectid = device->devid;
key.offset = search_start;
@@ -1081,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
if (key.offset > search_start) {
hole_size = key.offset - search_start;
+ /*
+ * Have to check before we set max_hole_start, otherwise
+ * we could end up sending back this offset anyway.
+ */
+ if (contains_pending_extent(trans, device,
+ &search_start,
+ hole_size))
+ hole_size = 0;
+
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
@@ -1124,6 +1164,11 @@ next:
max_hole_size = hole_size;
}
+ if (contains_pending_extent(trans, device, &search_start, hole_size)) {
+ btrfs_release_path(path);
+ goto again;
+ }
+
/* See above. */
if (hole_size < num_bytes)
ret = -ENOSPC;
@@ -1132,7 +1177,6 @@ next:
out:
btrfs_free_path(path);
-error:
*start = max_hole_start;
if (len)
*len = max_hole_size;
@@ -1244,47 +1288,22 @@ out:
return ret;
}
-static noinline int find_next_chunk(struct btrfs_root *root,
- u64 objectid, u64 *offset)
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
- int ret;
- struct btrfs_key key;
- struct btrfs_chunk *chunk;
- struct btrfs_key found_key;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = objectid;
- key.offset = (u64)-1;
- key.type = BTRFS_CHUNK_ITEM_KEY;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto error;
-
- BUG_ON(ret == 0); /* Corruption */
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct rb_node *n;
+ u64 ret = 0;
- ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
- if (ret) {
- *offset = 0;
- } else {
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
- if (found_key.objectid != objectid)
- *offset = 0;
- else {
- chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_chunk);
- *offset = found_key.offset +
- btrfs_chunk_length(path->nodes[0], chunk);
- }
+ em_tree = &fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ n = rb_last(&em_tree->map);
+ if (n) {
+ em = rb_entry(n, struct extent_map, rb_node);
+ ret = em->start + em->len;
}
- ret = 0;
-error:
- btrfs_free_path(path);
+ read_unlock(&em_tree->lock);
+
return ret;
}
@@ -1462,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
- printk(KERN_ERR "btrfs: unable to go below four devices "
- "on raid10\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
- printk(KERN_ERR "btrfs: unable to go below two "
- "devices on raid1\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
root->fs_info->fs_devices->rw_devices <= 2) {
- printk(KERN_ERR "btrfs: unable to go below two "
- "devices on raid5\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
root->fs_info->fs_devices->rw_devices <= 3) {
- printk(KERN_ERR "btrfs: unable to go below three "
- "devices on raid6\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
goto out;
}
@@ -1512,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
bh = NULL;
disk_super = NULL;
if (!device) {
- printk(KERN_ERR "btrfs: no missing devices found to "
- "remove\n");
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
goto out;
}
} else {
@@ -1535,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
if (device->is_tgtdev_for_dev_replace) {
- pr_err("btrfs: unable to remove the dev_replace target dev\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto error_brelse;
}
if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
- printk(KERN_ERR "btrfs: unable to remove the only writeable "
- "device\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto error_brelse;
}
@@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
}
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
- if (IS_ERR(tsk))
- return PTR_ERR(tsk);
-
- return 0;
+ return PTR_RET(tsk);
}
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3681,10 +3685,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
}
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct map_lookup **map_ret,
- u64 *num_bytes_out, u64 *stripe_size_out,
- u64 start, u64 type)
+ struct btrfs_root *extent_root, u64 start,
+ u64 type)
{
struct btrfs_fs_info *info = extent_root->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3791,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (total_avail == 0)
continue;
- ret = find_free_dev_extent(device,
+ ret = find_free_dev_extent(trans, device,
max_stripe_size * dev_stripes,
&dev_offset, &max_avail);
if (ret && ret != -ENOSPC)
@@ -3903,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
map->type = type;
map->sub_stripes = sub_stripes;
- *map_ret = map;
num_bytes = stripe_size * data_stripes;
- *stripe_size_out = stripe_size;
- *num_bytes_out = num_bytes;
-
trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
em = alloc_extent_map();
@@ -3921,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em->len = num_bytes;
em->block_start = 0;
em->block_len = em->len;
+ em->orig_block_len = stripe_size;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
+ if (!ret) {
+ list_add_tail(&em->list, &trans->transaction->pending_chunks);
+ atomic_inc(&em->refs);
+ }
write_unlock(&em_tree->lock);
if (ret) {
free_extent_map(em);
goto error;
}
- for (i = 0; i < map->num_stripes; ++i) {
- struct btrfs_device *device;
- u64 dev_offset;
-
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
-
- ret = btrfs_alloc_dev_extent(trans, device,
- info->chunk_root->root_key.objectid,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- start, dev_offset, stripe_size);
- if (ret)
- goto error_dev_extent;
- }
-
ret = btrfs_make_block_group(trans, extent_root, 0, type,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, num_bytes);
- if (ret) {
- i = map->num_stripes - 1;
- goto error_dev_extent;
- }
+ if (ret)
+ goto error_del_extent;
free_extent_map(em);
check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -3960,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
kfree(devices_info);
return 0;
-error_dev_extent:
- for (; i >= 0; i--) {
- struct btrfs_device *device;
- int err;
-
- device = map->stripes[i].dev;
- err = btrfs_free_dev_extent(trans, device, start);
- if (err) {
- btrfs_abort_transaction(trans, extent_root, err);
- break;
- }
- }
+error_del_extent:
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
@@ -3986,33 +3961,68 @@ error:
return ret;
}
-static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
- struct map_lookup *map, u64 chunk_offset,
- u64 chunk_size, u64 stripe_size)
+ u64 chunk_offset, u64 chunk_size)
{
- u64 dev_offset;
struct btrfs_key key;
struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- size_t item_size = btrfs_chunk_item_size(map->num_stripes);
- int index = 0;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ size_t item_size;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i = 0;
int ret;
+ em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(extent_root->fs_info, "unable to find logical "
+ "%Lu len %Lu", chunk_offset, chunk_size);
+ return -EINVAL;
+ }
+
+ if (em->start != chunk_offset || em->len != chunk_size) {
+ btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+ " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+ chunk_size, em->start, em->len);
+ free_extent_map(em);
+ return -EINVAL;
+ }
+
+ map = (struct map_lookup *)em->bdev;
+ item_size = btrfs_chunk_item_size(map->num_stripes);
+ stripe_size = em->orig_block_len;
+
chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk)
- return -ENOMEM;
+ if (!chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
- index = 0;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
device->bytes_used += stripe_size;
ret = btrfs_update_device(trans, device);
if (ret)
- goto out_free;
- index++;
+ goto out;
+ ret = btrfs_alloc_dev_extent(trans, device,
+ chunk_root->root_key.objectid,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ chunk_offset, dev_offset,
+ stripe_size);
+ if (ret)
+ goto out;
}
spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -4020,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
map->num_stripes);
spin_unlock(&extent_root->fs_info->free_chunk_lock);
- index = 0;
stripe = &chunk->stripe;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
- dev_offset = map->stripes[index].physical;
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
- index++;
}
btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -4048,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
key.offset = chunk_offset;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
-
if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
/*
* TODO: Cleanup of inserted chunk root in case of
@@ -4058,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
item_size);
}
-out_free:
+out:
kfree(chunk);
+ free_extent_map(em);
return ret;
}
@@ -4074,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 type)
{
u64 chunk_offset;
- u64 chunk_size;
- u64 stripe_size;
- struct map_lookup *map;
- struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
- int ret;
-
- ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- &chunk_offset);
- if (ret)
- return ret;
- ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
- &stripe_size, chunk_offset, type);
- if (ret)
- return ret;
-
- ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
- chunk_size, stripe_size);
- if (ret)
- return ret;
- return 0;
+ chunk_offset = find_next_chunk(extent_root->fs_info);
+ return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
}
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4103,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
{
u64 chunk_offset;
u64 sys_chunk_offset;
- u64 chunk_size;
- u64 sys_chunk_size;
- u64 stripe_size;
- u64 sys_stripe_size;
u64 alloc_profile;
- struct map_lookup *map;
- struct map_lookup *sys_map;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
- ret = find_next_chunk(fs_info->chunk_root,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
- if (ret)
- return ret;
-
+ chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
- ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
- &stripe_size, chunk_offset, alloc_profile);
+ ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+ alloc_profile);
if (ret)
return ret;
- sys_chunk_offset = chunk_offset + chunk_size;
-
+ sys_chunk_offset = find_next_chunk(root->fs_info);
alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
- ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
- &sys_chunk_size, &sys_stripe_size,
- sys_chunk_offset, alloc_profile);
+ ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+ alloc_profile);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
- }
-
- /*
- * Modifying chunk tree needs allocating new blocks from both
- * system block group and metadata block group. So we only can
- * do operations require modifying the chunk tree after both
- * block groups were created.
- */
- ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
- chunk_size, stripe_size);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
- }
-
- ret = __finish_chunk_alloc(trans, extent_root, sys_map,
- sys_chunk_offset, sys_chunk_size,
- sys_stripe_size);
if (ret)
btrfs_abort_transaction(trans, root, ret);
-
out:
-
return ret;
}
@@ -4435,9 +4390,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
map = (struct map_lookup *)em->bdev;
offset = logical - em->start;
- if (mirror_num > map->num_stripes)
- mirror_num = 0;
-
stripe_len = map->stripe_len;
stripe_nr = offset;
/*
@@ -5367,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
return NULL;
list_add(&device->dev_list,
&fs_devices->devices);
- device->dev_root = root->fs_info->dev_root;
device->devid = devid;
device->work.func = pending_bios_fn;
device->fs_devices = fs_devices;
@@ -5593,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
}
fill_device_from_item(leaf, dev_item, device);
- device->dev_root = root->fs_info->dev_root;
device->in_fs_metadata = 1;
if (device->writeable && !device->is_tgtdev_for_dev_replace) {
device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5751,6 +5701,17 @@ error:
return ret;
}
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->dev_root = fs_info->dev_root;
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
+
static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
int i;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f6247e2a47f7..86705583480d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -316,11 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_root *root,
struct btrfs_ioctl_get_dev_stats *stats);
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
@@ -336,6 +338,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
struct btrfs_mapping_tree *map_tree,
u64 logical);
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ u64 chunk_offset, u64 chunk_size);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 38b5c1bc6776..5318a3b704f6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -439,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_inode_info *ci;
struct ceph_fs_client *fsc;
struct ceph_osd_client *osdc;
- loff_t page_off = page_offset(page);
- int len = PAGE_CACHE_SIZE;
- loff_t i_size;
- int err = 0;
struct ceph_snap_context *snapc, *oldest;
- u64 snap_size = 0;
+ loff_t page_off = page_offset(page);
long writeback_stat;
+ u64 truncate_size, snap_size = 0;
+ u32 truncate_seq;
+ int err = 0, len = PAGE_CACHE_SIZE;
dout("writepage %p idx %lu\n", page, page->index);
@@ -475,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
}
ceph_put_snap_context(oldest);
+ spin_lock(&ci->i_ceph_lock);
+ truncate_seq = ci->i_truncate_seq;
+ truncate_size = ci->i_truncate_size;
+ if (!snap_size)
+ snap_size = i_size_read(inode);
+ spin_unlock(&ci->i_ceph_lock);
+
/* is this a partial page at end of file? */
- if (snap_size)
- i_size = snap_size;
- else
- i_size = i_size_read(inode);
- if (i_size < page_off + len)
- len = i_size - page_off;
+ if (page_off >= snap_size) {
+ dout("%p page eof %llu\n", page, snap_size);
+ goto out;
+ }
+ if (snap_size < page_off + len)
+ len = snap_size - page_off;
dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
inode, page, page->index, page_off, len, snapc);
@@ -495,7 +501,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
page_off, len,
- ci->i_truncate_seq, ci->i_truncate_size,
+ truncate_seq, truncate_size,
&inode->i_mtime, &page, 1);
if (err < 0) {
dout("writepage setting page/mapping error %d %p\n", err, page);
@@ -632,25 +638,6 @@ static void writepages_finish(struct ceph_osd_request *req,
ceph_osdc_put_request(req);
}
-static struct ceph_osd_request *
-ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len,
- struct ceph_snap_context *snapc, int num_ops)
-{
- struct ceph_fs_client *fsc;
- struct ceph_inode_info *ci;
- struct ceph_vino vino;
-
- fsc = ceph_inode_to_client(inode);
- ci = ceph_inode(inode);
- vino = ceph_vino(inode);
- /* BUG_ON(vino.snap != CEPH_NOSNAP); */
-
- return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, offset, len, num_ops, CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK,
- snapc, ci->i_truncate_seq, ci->i_truncate_size, true);
-}
-
/*
* initiate async writeback
*/
@@ -659,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_vino vino = ceph_vino(inode);
pgoff_t index, start, end;
int range_whole = 0;
int should_loop = 1;
@@ -671,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping,
unsigned wsize = 1 << inode->i_blkbits;
struct ceph_osd_request *req = NULL;
int do_sync;
- u64 snap_size;
+ u64 truncate_size, snap_size;
+ u32 truncate_seq;
/*
* Include a 'sync' in the OSD request if this is a data
* integrity write (e.g., O_SYNC write or fsync()), or if our
* cap is being revoked.
*/
- do_sync = wbc->sync_mode == WB_SYNC_ALL;
- if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+ if ((wbc->sync_mode == WB_SYNC_ALL) ||
+ ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
do_sync = 1;
dout("writepages_start %p dosync=%d (mode=%s)\n",
inode, do_sync,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- fsc = ceph_inode_to_client(inode);
if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
pr_warning("writepage_start %p on forced umount\n", inode);
return -EIO; /* we're in a forced umount, don't write! */
@@ -729,6 +717,14 @@ retry:
snap_size = i_size_read(inode);
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
snapc, snapc->seq, snapc->num_snaps);
+
+ spin_lock(&ci->i_ceph_lock);
+ truncate_seq = ci->i_truncate_seq;
+ truncate_size = ci->i_truncate_size;
+ if (!snap_size)
+ snap_size = i_size_read(inode);
+ spin_unlock(&ci->i_ceph_lock);
+
if (last_snapc && snapc != last_snapc) {
/* if we switched to a newer snapc, restart our scan at the
* start of the original file range. */
@@ -740,7 +736,6 @@ retry:
while (!done && index <= end) {
int num_ops = do_sync ? 2 : 1;
- struct ceph_vino vino;
unsigned i;
int first;
pgoff_t next;
@@ -834,17 +829,18 @@ get_more_pages:
* that it will use.
*/
if (locked_pages == 0) {
- size_t size;
-
BUG_ON(pages);
-
/* prepare async write request */
offset = (u64)page_offset(page);
len = wsize;
- req = ceph_writepages_osd_request(inode,
- offset, &len, snapc,
- num_ops);
-
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, num_ops,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, truncate_seq,
+ truncate_size, true);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
unlock_page(page);
@@ -855,8 +851,8 @@ get_more_pages:
req->r_inode = inode;
max_pages = calc_pages_for(0, (u64)len);
- size = max_pages * sizeof (*pages);
- pages = kmalloc(size, GFP_NOFS);
+ pages = kmalloc(max_pages * sizeof (*pages),
+ GFP_NOFS);
if (!pages) {
pool = fsc->wb_pagevec_pool;
pages = mempool_alloc(pool, GFP_NOFS);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index da0f9b8a3bcb..25442b40c25a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
spin_unlock(&mdsc->caps_list_lock);
}
-int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need)
{
int i;
@@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
int have;
int alloc = 0;
LIST_HEAD(newcaps);
- int ret = 0;
dout("reserve caps ctx=%p need=%d\n", ctx, need);
@@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
for (i = have; i < need; i++) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
- if (!cap) {
- ret = -ENOMEM;
- goto out_alloc_count;
- }
+ if (!cap)
+ break;
list_add(&cap->caps_item, &newcaps);
alloc++;
}
- BUG_ON(have + alloc != need);
+ /* we didn't manage to reserve as much as we needed */
+ if (have + alloc != need)
+ pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have + alloc);
spin_lock(&mdsc->caps_list_lock);
mdsc->caps_total_count += alloc;
@@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
ctx, mdsc->caps_total_count, mdsc->caps_use_count,
mdsc->caps_reserve_count, mdsc->caps_avail_count);
- return 0;
-
-out_alloc_count:
- /* we didn't manage to reserve as much as we needed */
- pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have);
- return ret;
}
int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -612,9 +605,11 @@ retry:
__cap_delay_requeue(mdsc, ci);
}
- if (flags & CEPH_CAP_FLAG_AUTH)
- ci->i_auth_cap = cap;
- else if (ci->i_auth_cap == cap) {
+ if (flags & CEPH_CAP_FLAG_AUTH) {
+ if (ci->i_auth_cap == NULL ||
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+ ci->i_auth_cap = cap;
+ } else if (ci->i_auth_cap == cap) {
ci->i_auth_cap = NULL;
spin_lock(&mdsc->cap_dirty_lock);
if (!list_empty(&ci->i_dirty_item)) {
@@ -695,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (implemented)
*implemented |= cap->implemented;
}
+ /*
+ * exclude caps issued by non-auth MDS, but are been revoking
+ * by the auth MDS. The non-auth MDS should be revoking/exporting
+ * these caps, but the message is delayed.
+ */
+ if (ci->i_auth_cap) {
+ cap = ci->i_auth_cap;
+ have &= ~cap->implemented | cap->issued;
+ }
return have;
}
@@ -802,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/*
* Return true if mask caps are currently being revoked by an MDS.
*/
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask)
{
- struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
struct rb_node *p;
- int ret = 0;
- spin_lock(&ci->i_ceph_lock);
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (__cap_is_valid(cap) &&
- (cap->implemented & ~cap->issued & mask)) {
- ret = 1;
- break;
- }
+ if (cap != ocap && __cap_is_valid(cap) &&
+ (cap->implemented & ~cap->issued & mask))
+ return 1;
}
+ return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_caps_revoking_other(ci, NULL, mask);
spin_unlock(&ci->i_ceph_lock);
dout("ceph_caps_revoking %p %s = %d\n", inode,
ceph_cap_string(mask), ret);
@@ -1980,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+
__ceph_flush_snaps(ci, &session, 1);
+
if (ci->i_flushing_caps) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &cap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
@@ -2055,7 +2072,11 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
- __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR));
+ if (!(need & CEPH_CAP_FILE_WR))
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ if (!(need & CEPH_CAP_FILE_WR))
+ mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
}
@@ -2473,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
} else {
dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
ceph_cap_string(newcaps));
+ /* non-auth MDS is revoking the newly grant caps ? */
+ if (cap == ci->i_auth_cap &&
+ __ceph_caps_revoking_other(ci, cap, newcaps))
+ check_caps = 2;
+
cap->issued = newcaps;
cap->implemented |= newcaps; /* add bits only, to
* avoid stepping on a
@@ -3042,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
(cap->issued & unless) == 0)) {
if ((cap->issued & drop) &&
(cap->issued & unless) == 0) {
- dout("encode_inode_release %p cap %p %s -> "
- "%s\n", inode, cap,
+ int wanted = __ceph_caps_wanted(ci);
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+ wanted |= cap->mds_wanted;
+ dout("encode_inode_release %p cap %p "
+ "%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop));
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
+
cap->issued &= ~drop;
cap->implemented &= ~drop;
- if (ci->i_ceph_flags & CEPH_I_NODELAY) {
- int wanted = __ceph_caps_wanted(ci);
- dout(" wanted %s -> %s (act %s)\n",
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(cap->mds_wanted &
- ~wanted),
- ceph_cap_string(wanted));
- cap->mds_wanted &= wanted;
- }
+ cap->mds_wanted = wanted;
} else {
dout("encode_inode_release %p cap %p %s"
" (force)\n", inode, cap,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 16c989d3e23c..2ddf061c1c4a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -716,7 +716,6 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
hold_mutex = true;
@@ -809,7 +808,6 @@ retry_snap:
out:
if (hold_mutex)
mutex_unlock(&inode->i_mutex);
- sb_end_write(inode->i_sb);
current->backing_dev_info = NULL;
return written ? written : err;
@@ -824,7 +822,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
int ret;
mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode, false);
+ __ceph_do_pending_vmtruncate(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index be0f7e20d62e..f3a2abf28a77 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -903,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
} else if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
"inode %p ino %llx.%llx\n",
- dn, dn->d_count,
- realdn, realdn->d_count,
+ dn, d_count(dn),
+ realdn, d_count(realdn),
realdn->d_inode, ceph_vinop(realdn->d_inode));
dput(dn);
dn = realdn;
@@ -1465,7 +1465,9 @@ static void ceph_vmtruncate_work(struct work_struct *work)
struct inode *inode = &ci->vfs_inode;
dout("vmtruncate_work %p\n", inode);
- __ceph_do_pending_vmtruncate(inode, true);
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ mutex_unlock(&inode->i_mutex);
iput(inode);
}
@@ -1492,7 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
* Make sure any pending truncation is applied before doing anything
* that may depend on it.
*/
-void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock)
+void __ceph_do_pending_vmtruncate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
u64 to;
@@ -1525,11 +1527,7 @@ retry:
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
- if (needlock)
- mutex_lock(&inode->i_mutex);
truncate_inode_pages(inode->i_mapping, to);
- if (needlock)
- mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
if (to == ci->i_truncate_size) {
@@ -1588,7 +1586,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- __ceph_do_pending_vmtruncate(inode, false);
+ __ceph_do_pending_vmtruncate(inode);
err = inode_change_ok(inode, attr);
if (err != 0)
@@ -1770,7 +1768,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
- __ceph_do_pending_vmtruncate(inode, false);
+ __ceph_do_pending_vmtruncate(inode);
return err;
out:
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 690f73f42425..ae6d14e82b0f 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
}
/**
- * Must be called with BKL already held. Fills in the passed
+ * Must be called with lock_flocks() already held. Fills in the passed
* counter variables, so you can prepare pagelist metadata before calling
* ceph_encode_locks.
*/
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 74fd2898b2ab..187bf214444d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1391,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
num = le32_to_cpu(head->num);
dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
session->s_num_cap_releases += num;
/* requeue completed messages */
@@ -1553,7 +1554,7 @@ retry:
*base = ceph_ino(temp->d_inode);
*plen = len;
dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, dentry->d_count, *base, len, path);
+ dentry, d_count(dentry), *base, len, path);
return path;
}
@@ -2454,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
+ cap->mseq = 0; /* and migrate_seq */
if (recon_state->flock) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -3040,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
fsc->mdsc = mdsc;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
- if (mdsc->mdsmap == NULL)
+ if (mdsc->mdsmap == NULL) {
+ kfree(mdsc);
return -ENOMEM;
+ }
init_completion(&mdsc->safe_umount_waiters);
init_waitqueue_head(&mdsc->session_close_wq);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 9278dec9e940..132b64eeecd4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -92,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u32 num_export_targets;
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
+ struct ceph_mds_info *info;
ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
global_id = ceph_decode_64(p);
@@ -126,24 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
- if (mds >= 0 && mds < m->m_max_mds && state > 0) {
- m->m_info[mds].global_id = global_id;
- m->m_info[mds].state = state;
- m->m_info[mds].addr = addr;
- m->m_info[mds].laggy =
- (laggy_since.tv_sec != 0 ||
- laggy_since.tv_nsec != 0);
- m->m_info[mds].num_export_targets = num_export_targets;
- if (num_export_targets) {
- m->m_info[mds].export_targets =
- kcalloc(num_export_targets, sizeof(u32),
- GFP_NOFS);
- for (j = 0; j < num_export_targets; j++)
- m->m_info[mds].export_targets[j] =
- ceph_decode_32(&pexport_targets);
- } else {
- m->m_info[mds].export_targets = NULL;
- }
+
+ if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+ continue;
+
+ info = &m->m_info[mds];
+ info->global_id = global_id;
+ info->state = state;
+ info->addr = addr;
+ info->laggy = (laggy_since.tv_sec != 0 ||
+ laggy_since.tv_nsec != 0);
+ info->num_export_targets = num_export_targets;
+ if (num_export_targets) {
+ info->export_targets = kcalloc(num_export_targets,
+ sizeof(u32), GFP_NOFS);
+ if (info->export_targets == NULL)
+ goto badmem;
+ for (j = 0; j < num_export_targets; j++)
+ info->export_targets[j] =
+ ceph_decode_32(&pexport_targets);
+ } else {
+ info->export_targets = NULL;
}
}
@@ -170,7 +174,7 @@ bad:
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
ceph_mdsmap_destroy(m);
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(err);
}
void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7d377c9a5e35..6627b26a800c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
}
err = -EINVAL;
dev_name_end--; /* back up to ':' separator */
- if (*dev_name_end != ':') {
+ if (dev_name_end < dev_name || *dev_name_end != ':') {
pr_err("device name is missing path (no : separator in %s)\n",
dev_name);
goto out;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7ccfdb4aea2e..cbded572345e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -534,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
@@ -692,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
extern int ceph_inode_holds_cap(struct inode *inode, int mask);
extern int ceph_inode_set_size(struct inode *inode, loff_t size);
-extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
extern void ceph_queue_vmtruncate(struct inode *inode);
extern void ceph_queue_invalidate(struct inode *inode);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9b6b2b6dd164..be661d8f532a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!ceph_is_valid_xattr(name))
return -ENODATA;
- spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
err = vxattr->getxattr_cb(ci, value, size);
- goto out;
+ return err;
}
+ spin_lock(&ci->i_ceph_lock);
+ dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto get_xattr;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 3d8bf941d126..45e57cc38200 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,7 +1,7 @@
/*
* fs/cifs/cifsencrypt.c
*
- * Copyright (C) International Business Machines Corp., 2005,2006
+ * Copyright (C) International Business Machines Corp., 2005,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -31,6 +31,36 @@
#include <linux/random.h>
#include <linux/highmem.h>
+static int
+cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
+{
+ int rc;
+ unsigned int size;
+
+ if (server->secmech.sdescmd5 != NULL)
+ return 0; /* already allocated */
+
+ server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
+ if (IS_ERR(server->secmech.md5)) {
+ cifs_dbg(VFS, "could not allocate crypto md5\n");
+ return PTR_ERR(server->secmech.md5);
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.md5);
+ server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdescmd5) {
+ rc = -ENOMEM;
+ crypto_free_shash(server->secmech.md5);
+ server->secmech.md5 = NULL;
+ return rc;
+ }
+ server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
+ server->secmech.sdescmd5->shash.flags = 0x0;
+
+ return 0;
+}
+
/*
* Calculate and return the CIFS signature based on the mac key and SMB PDU.
* The 16 byte signature must be allocated by the caller. Note we only use the
@@ -50,8 +80,11 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
return -EINVAL;
if (!server->secmech.sdescmd5) {
- cifs_dbg(VFS, "%s: Can't generate signature\n", __func__);
- return -1;
+ rc = cifs_crypto_shash_md5_allocate(server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__);
+ return -1;
+ }
}
rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
@@ -556,6 +589,33 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
return rc;
}
+static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server)
+{
+ unsigned int size;
+
+ /* check if already allocated */
+ if (server->secmech.sdeschmacmd5)
+ return 0;
+
+ server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+ if (IS_ERR(server->secmech.hmacmd5)) {
+ cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
+ return PTR_ERR(server->secmech.hmacmd5);
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.hmacmd5);
+ server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdeschmacmd5) {
+ crypto_free_shash(server->secmech.hmacmd5);
+ server->secmech.hmacmd5 = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
+ server->secmech.sdeschmacmd5->shash.flags = 0x0;
+
+ return 0;
+}
int
setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
@@ -606,6 +666,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
memcpy(ses->auth_key.response + baselen, tiblob, tilen);
+ rc = crypto_hmacmd5_alloc(ses->server);
+ if (rc) {
+ cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc);
+ goto setup_ntlmv2_rsp_ret;
+ }
+
/* calculate ntlmv2_hash */
rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
if (rc) {
@@ -705,123 +771,32 @@ calc_seckey(struct cifs_ses *ses)
void
cifs_crypto_shash_release(struct TCP_Server_Info *server)
{
- if (server->secmech.cmacaes)
+ if (server->secmech.cmacaes) {
crypto_free_shash(server->secmech.cmacaes);
+ server->secmech.cmacaes = NULL;
+ }
- if (server->secmech.hmacsha256)
+ if (server->secmech.hmacsha256) {
crypto_free_shash(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
+ }
- if (server->secmech.md5)
+ if (server->secmech.md5) {
crypto_free_shash(server->secmech.md5);
+ server->secmech.md5 = NULL;
+ }
- if (server->secmech.hmacmd5)
+ if (server->secmech.hmacmd5) {
crypto_free_shash(server->secmech.hmacmd5);
+ server->secmech.hmacmd5 = NULL;
+ }
kfree(server->secmech.sdesccmacaes);
-
+ server->secmech.sdesccmacaes = NULL;
kfree(server->secmech.sdeschmacsha256);
-
+ server->secmech.sdeschmacsha256 = NULL;
kfree(server->secmech.sdeschmacmd5);
-
+ server->secmech.sdeschmacmd5 = NULL;
kfree(server->secmech.sdescmd5);
-}
-
-int
-cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
-{
- int rc;
- unsigned int size;
-
- server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
- if (IS_ERR(server->secmech.hmacmd5)) {
- cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
- return PTR_ERR(server->secmech.hmacmd5);
- }
-
- server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
- if (IS_ERR(server->secmech.md5)) {
- cifs_dbg(VFS, "could not allocate crypto md5\n");
- rc = PTR_ERR(server->secmech.md5);
- goto crypto_allocate_md5_fail;
- }
-
- server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
- if (IS_ERR(server->secmech.hmacsha256)) {
- cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
- rc = PTR_ERR(server->secmech.hmacsha256);
- goto crypto_allocate_hmacsha256_fail;
- }
-
- server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
- if (IS_ERR(server->secmech.cmacaes)) {
- cifs_dbg(VFS, "could not allocate crypto cmac-aes");
- rc = PTR_ERR(server->secmech.cmacaes);
- goto crypto_allocate_cmacaes_fail;
- }
-
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.hmacmd5);
- server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdeschmacmd5) {
- rc = -ENOMEM;
- goto crypto_allocate_hmacmd5_sdesc_fail;
- }
- server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
- server->secmech.sdeschmacmd5->shash.flags = 0x0;
-
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.md5);
- server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdescmd5) {
- rc = -ENOMEM;
- goto crypto_allocate_md5_sdesc_fail;
- }
- server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
- server->secmech.sdescmd5->shash.flags = 0x0;
-
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.hmacsha256);
- server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdeschmacsha256) {
- rc = -ENOMEM;
- goto crypto_allocate_hmacsha256_sdesc_fail;
- }
- server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
- server->secmech.sdeschmacsha256->shash.flags = 0x0;
-
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.cmacaes);
- server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdesccmacaes) {
- cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
- rc = -ENOMEM;
- goto crypto_allocate_cmacaes_sdesc_fail;
- }
- server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
- server->secmech.sdesccmacaes->shash.flags = 0x0;
-
- return 0;
-
-crypto_allocate_cmacaes_sdesc_fail:
- kfree(server->secmech.sdeschmacsha256);
-
-crypto_allocate_hmacsha256_sdesc_fail:
- kfree(server->secmech.sdescmd5);
-
-crypto_allocate_md5_sdesc_fail:
- kfree(server->secmech.sdeschmacmd5);
-
-crypto_allocate_hmacmd5_sdesc_fail:
- crypto_free_shash(server->secmech.cmacaes);
-
-crypto_allocate_cmacaes_fail:
- crypto_free_shash(server->secmech.hmacsha256);
-
-crypto_allocate_hmacsha256_fail:
- crypto_free_shash(server->secmech.md5);
-
-crypto_allocate_md5_fail:
- crypto_free_shash(server->secmech.hmacmd5);
-
- return rc;
+ server->secmech.sdescmd5 = NULL;
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e66b08882548..1fdc37041057 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -194,6 +194,7 @@ struct cifs_writedata;
struct cifs_io_parms;
struct cifs_search_info;
struct cifsInodeInfo;
+struct cifs_open_parms;
struct smb_version_operations {
int (*send_cancel)(struct TCP_Server_Info *, void *,
@@ -307,9 +308,8 @@ struct smb_version_operations {
const char *, const char *,
struct cifs_sb_info *);
/* open a file for non-posix mounts */
- int (*open)(const unsigned int, struct cifs_tcon *, const char *, int,
- int, int, struct cifs_fid *, __u32 *, FILE_ALL_INFO *,
- struct cifs_sb_info *);
+ int (*open)(const unsigned int, struct cifs_open_parms *,
+ __u32 *, FILE_ALL_INFO *);
/* set fid protocol-specific info */
void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
/* close a file */
@@ -912,6 +912,17 @@ struct cifs_search_info {
bool smallBuf:1; /* so we know which buf_release function to call */
};
+struct cifs_open_parms {
+ struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb;
+ int disposition;
+ int desired_access;
+ int create_options;
+ const char *path;
+ struct cifs_fid *fid;
+ bool reconnect:1;
+};
+
struct cifs_fid {
__u16 netfid;
#ifdef CONFIG_CIFS_SMB2
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index c8ff018fae68..f7e584d047e2 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -433,7 +433,6 @@ extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
const struct nls_table *);
extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
-extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
extern int calc_seckey(struct cifs_ses *);
extern void generate_smb3signingkey(struct TCP_Server_Info *);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index afcb8a1a33b7..fa68813396b5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2108,12 +2108,6 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
goto out_err;
}
- rc = cifs_crypto_shash_allocate(tcp_ses);
- if (rc) {
- cifs_dbg(VFS, "could not setup hash structures rc %d\n", rc);
- goto out_err;
- }
-
tcp_ses->ops = volume_info->ops;
tcp_ses->vals = volume_info->vals;
cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5175aebf6737..d62ce0d48141 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -204,6 +204,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
struct inode *newinode = NULL;
int disposition;
struct TCP_Server_Info *server = tcon->ses->server;
+ struct cifs_open_parms oparms;
*oplock = 0;
if (tcon->ses->server->oplocks)
@@ -319,9 +320,16 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
- rc = server->ops->open(xid, tcon, full_path, disposition,
- desired_access, create_options, fid, oplock,
- buf, cifs_sb);
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = desired_access;
+ oparms.create_options = create_options;
+ oparms.disposition = disposition;
+ oparms.path = full_path;
+ oparms.fid = fid;
+ oparms.reconnect = false;
+
+ rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc) {
cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
goto out;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 91d8629e69a2..1e57f36ea1b2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -183,6 +183,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
int create_options = CREATE_NOT_DIR;
FILE_ALL_INFO *buf;
struct TCP_Server_Info *server = tcon->ses->server;
+ struct cifs_open_parms oparms;
if (!server->ops->open)
return -ENOSYS;
@@ -224,9 +225,16 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
- rc = server->ops->open(xid, tcon, full_path, disposition,
- desired_access, create_options, fid, oplock, buf,
- cifs_sb);
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = desired_access;
+ oparms.create_options = create_options;
+ oparms.disposition = disposition;
+ oparms.path = full_path;
+ oparms.fid = fid;
+ oparms.reconnect = false;
+
+ rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc)
goto out;
@@ -553,11 +561,10 @@ cifs_relock_file(struct cifsFileInfo *cfile)
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
- /* we are going to update can_cache_brlcks here - need a write access */
- down_write(&cinode->lock_sem);
+ down_read(&cinode->lock_sem);
if (cinode->can_cache_brlcks) {
- /* can cache locks - no need to push them */
- up_write(&cinode->lock_sem);
+ /* can cache locks - no need to relock */
+ up_read(&cinode->lock_sem);
return rc;
}
@@ -568,7 +575,7 @@ cifs_relock_file(struct cifsFileInfo *cfile)
else
rc = tcon->ses->server->ops->push_mand_locks(cfile);
- up_write(&cinode->lock_sem);
+ up_read(&cinode->lock_sem);
return rc;
}
@@ -587,7 +594,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
int desired_access;
int disposition = FILE_OPEN;
int create_options = CREATE_NOT_DIR;
- struct cifs_fid fid;
+ struct cifs_open_parms oparms;
xid = get_xid();
mutex_lock(&cfile->fh_mutex);
@@ -637,7 +644,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
rc = cifs_posix_open(full_path, NULL, inode->i_sb,
cifs_sb->mnt_file_mode /* ignored */,
- oflags, &oplock, &fid.netfid, xid);
+ oflags, &oplock, &cfile->fid.netfid, xid);
if (rc == 0) {
cifs_dbg(FYI, "posix reopen succeeded\n");
goto reopen_success;
@@ -654,7 +661,16 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
create_options |= CREATE_OPEN_BACKUP_INTENT;
if (server->ops->get_lease_key)
- server->ops->get_lease_key(inode, &fid);
+ server->ops->get_lease_key(inode, &cfile->fid);
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = desired_access;
+ oparms.create_options = create_options;
+ oparms.disposition = disposition;
+ oparms.path = full_path;
+ oparms.fid = &cfile->fid;
+ oparms.reconnect = true;
/*
* Can not refresh inode by passing in file_info buf to be returned by
@@ -663,9 +679,14 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
* version of file size can be stale. If we knew for sure that inode was
* not dirty locally we could do this.
*/
- rc = server->ops->open(xid, tcon, full_path, disposition,
- desired_access, create_options, &fid, &oplock,
- NULL, cifs_sb);
+ rc = server->ops->open(xid, &oparms, &oplock, NULL);
+ if (rc == -ENOENT && oparms.reconnect == false) {
+ /* durable handle timeout is expired - open the file again */
+ rc = server->ops->open(xid, &oparms, &oplock, NULL);
+ /* indicate that we need to relock the file */
+ oparms.reconnect = true;
+ }
+
if (rc) {
mutex_unlock(&cfile->fh_mutex);
cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc);
@@ -696,8 +717,9 @@ reopen_success:
* to the server to get the new inode info.
*/
- server->ops->set_fid(cfile, &fid, oplock);
- cifs_relock_file(cfile);
+ server->ops->set_fid(cfile, &cfile->fid, oplock);
+ if (oparms.reconnect)
+ cifs_relock_file(cfile);
reopen_error_exit:
kfree(full_path);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 20efd81266c6..449b6cf09b09 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -558,6 +558,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
fattr->cf_mode &= ~(S_IWUGO);
fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+ if (fattr->cf_nlink < 1) {
+ cifs_dbg(1, "replacing bogus file nlink value %u\n",
+ fattr->cf_nlink);
+ fattr->cf_nlink = 1;
+ }
}
fattr->cf_uid = cifs_sb->mnt_uid;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index e813f04511d8..6457690731a2 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -674,20 +674,23 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
}
static int
-cifs_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
- int disposition, int desired_access, int create_options,
- struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
- struct cifs_sb_info *cifs_sb)
-{
- if (!(tcon->ses->capabilities & CAP_NT_SMBS))
- return SMBLegacyOpen(xid, tcon, path, disposition,
- desired_access, create_options,
- &fid->netfid, oplock, buf,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+ __u32 *oplock, FILE_ALL_INFO *buf)
+{
+ if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
+ return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
+ oparms->disposition,
+ oparms->desired_access,
+ oparms->create_options,
+ &oparms->fid->netfid, oplock, buf,
+ oparms->cifs_sb->local_nls,
+ oparms->cifs_sb->mnt_cifs_flags
& CIFS_MOUNT_MAP_SPECIAL_CHR);
- return CIFSSMBOpen(xid, tcon, path, disposition, desired_access,
- create_options, &fid->netfid, oplock, buf,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ return CIFSSMBOpen(xid, oparms->tcon, oparms->path,
+ oparms->disposition, oparms->desired_access,
+ oparms->create_options, &oparms->fid->netfid, oplock,
+ buf, oparms->cifs_sb->local_nls,
+ oparms->cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
}
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 5da1b55a2258..04a81a4142c3 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -40,7 +40,8 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
oplock &= 0xFF;
if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
return;
- if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
+ if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE ||
+ oplock == SMB2_OPLOCK_LEVEL_BATCH) {
cinode->clientCanCacheAll = true;
cinode->clientCanCacheRead = true;
cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
@@ -57,17 +58,16 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
}
int
-smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
- int disposition, int desired_access, int create_options,
- struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
- struct cifs_sb_info *cifs_sb)
+smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+ __u32 *oplock, FILE_ALL_INFO *buf)
{
int rc;
__le16 *smb2_path;
struct smb2_file_all_info *smb2_data = NULL;
__u8 smb2_oplock[17];
+ struct cifs_fid *fid = oparms->fid;
- smb2_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL) {
rc = -ENOMEM;
goto out;
@@ -80,21 +80,19 @@ smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
goto out;
}
- desired_access |= FILE_READ_ATTRIBUTES;
- *smb2_oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+ oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ *smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
- if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
+ if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE);
- rc = SMB2_open(xid, tcon, smb2_path, &fid->persistent_fid,
- &fid->volatile_fid, desired_access, disposition,
- 0, 0, smb2_oplock, smb2_data);
+ rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data);
if (rc)
goto out;
if (buf) {
/* open response does not have IndexNumber field - get it */
- rc = SMB2_get_srv_num(xid, tcon, fid->persistent_fid,
+ rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
fid->volatile_fid,
&smb2_data->IndexNumber);
if (rc) {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index fff6dfba6204..c6ec1633309a 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -41,21 +41,26 @@ static int
smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
__u32 desired_access, __u32 create_disposition,
- __u32 file_attributes, __u32 create_options,
- void *data, int command)
+ __u32 create_options, void *data, int command)
{
int rc, tmprc = 0;
- u64 persistent_fid, volatile_fid;
__le16 *utf16_path;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
- rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
- desired_access, create_disposition, file_attributes,
- create_options, &oplock, NULL);
+ oparms.tcon = tcon;
+ oparms.desired_access = desired_access;
+ oparms.disposition = create_disposition;
+ oparms.create_options = create_options;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
if (rc) {
kfree(utf16_path);
return rc;
@@ -65,8 +70,8 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
case SMB2_OP_DELETE:
break;
case SMB2_OP_QUERY_INFO:
- tmprc = SMB2_query_info(xid, tcon, persistent_fid,
- volatile_fid,
+ tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid,
(struct smb2_file_all_info *)data);
break;
case SMB2_OP_MKDIR:
@@ -76,19 +81,21 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
*/
break;
case SMB2_OP_RENAME:
- tmprc = SMB2_rename(xid, tcon, persistent_fid, volatile_fid,
- (__le16 *)data);
+ tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, (__le16 *)data);
break;
case SMB2_OP_HARDLINK:
- tmprc = SMB2_set_hardlink(xid, tcon, persistent_fid,
- volatile_fid, (__le16 *)data);
+ tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, (__le16 *)data);
break;
case SMB2_OP_SET_EOF:
- tmprc = SMB2_set_eof(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, (__le64 *)data);
+ tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, current->tgid,
+ (__le64 *)data);
break;
case SMB2_OP_SET_INFO:
- tmprc = SMB2_set_info(xid, tcon, persistent_fid, volatile_fid,
+ tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid,
(FILE_BASIC_INFO *)data);
break;
default:
@@ -96,7 +103,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
break;
}
- rc = SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+ rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
if (tmprc)
rc = tmprc;
kfree(utf16_path);
@@ -129,8 +136,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
return -ENOMEM;
rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0,
- smb2_data, SMB2_OP_QUERY_INFO);
+ FILE_READ_ATTRIBUTES, FILE_OPEN, 0, smb2_data,
+ SMB2_OP_QUERY_INFO);
if (rc)
goto out;
@@ -145,7 +152,7 @@ smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
return smb2_open_op_close(xid, tcon, cifs_sb, name,
- FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0,
+ FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
}
@@ -164,7 +171,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
data.Attributes = cpu_to_le32(dosattrs);
tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
- FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0,
+ FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
@@ -175,7 +182,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- 0, CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
+ CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
NULL, SMB2_OP_DELETE);
}
@@ -184,7 +191,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- 0, CREATE_DELETE_ON_CLOSE, NULL,
+ CREATE_DELETE_ON_CLOSE, NULL,
SMB2_OP_DELETE);
}
@@ -203,7 +210,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
}
rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
- FILE_OPEN, 0, 0, smb2_to_name, command);
+ FILE_OPEN, 0, smb2_to_name, command);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -234,7 +241,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
{
__le64 eof = cpu_to_le64(size);
return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_WRITE_DATA, FILE_OPEN, 0, 0, &eof,
+ FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
SMB2_OP_SET_EOF);
}
@@ -250,7 +257,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
- FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, 0, buf,
+ FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
SMB2_OP_SET_INFO);
cifs_put_tlink(tlink);
return rc;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 6d15cab95b99..f259e6cc8357 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -213,22 +213,29 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path)
{
int rc;
- __u64 persistent_fid, volatile_fid;
__le16 *utf16_path;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
- rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
- FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
if (rc) {
kfree(utf16_path);
return rc;
}
- rc = SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+ rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
kfree(utf16_path);
return rc;
}
@@ -443,15 +450,20 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
__le16 *utf16_path;
int rc;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
- __u64 persistent_fid, volatile_fid;
+ struct cifs_open_parms oparms;
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
- rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
- FILE_READ_ATTRIBUTES | FILE_READ_DATA, FILE_OPEN, 0, 0,
- &oplock, NULL);
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
kfree(utf16_path);
if (rc) {
cifs_dbg(VFS, "open dir failed\n");
@@ -460,14 +472,12 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
srch_inf->entries_in_buffer = 0;
srch_inf->index_of_last_entry = 0;
- fid->persistent_fid = persistent_fid;
- fid->volatile_fid = volatile_fid;
- rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0,
- srch_inf);
+ rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
+ fid->volatile_fid, 0, srch_inf);
if (rc) {
cifs_dbg(VFS, "query directory failed\n");
- SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+ SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
}
return rc;
}
@@ -528,17 +538,25 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
struct kstatfs *buf)
{
int rc;
- u64 persistent_fid, volatile_fid;
__le16 srch_path = 0; /* Null - open root of share */
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
- rc = SMB2_open(xid, tcon, &srch_path, &persistent_fid, &volatile_fid,
- FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL);
if (rc)
return rc;
buf->f_type = SMB2_MAGIC_NUMBER;
- rc = SMB2_QFS_info(xid, tcon, persistent_fid, volatile_fid, buf);
- SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+ rc = SMB2_QFS_info(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ buf);
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
return rc;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2b312e4eeaa6..abc9c2809b51 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -847,29 +847,76 @@ create_lease_buf(u8 *lease_key, u8 oplock)
return buf;
}
+static struct create_durable *
+create_durable_buf(void)
+{
+ struct create_durable *buf;
+
+ buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable, Data));
+ buf->ccontext.DataLength = cpu_to_le32(16);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = 'n';
+ buf->Name[3] = 'Q';
+ return buf;
+}
+
+static struct create_durable *
+create_reconnect_durable_buf(struct cifs_fid *fid)
+{
+ struct create_durable *buf;
+
+ buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable, Data));
+ buf->ccontext.DataLength = cpu_to_le32(16);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ buf->Data.Fid.PersistentFileId = fid->persistent_fid;
+ buf->Data.Fid.VolatileFileId = fid->volatile_fid;
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = 'n';
+ buf->Name[3] = 'C';
+ return buf;
+}
+
static __u8
parse_lease_state(struct smb2_create_rsp *rsp)
{
char *data_offset;
struct create_lease *lc;
bool found = false;
+ unsigned int next = 0;
+ char *name;
- data_offset = (char *)rsp;
- data_offset += 4 + le32_to_cpu(rsp->CreateContextsOffset);
+ data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
lc = (struct create_lease *)data_offset;
do {
- char *name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
+ lc = (struct create_lease *)((char *)lc + next);
+ name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
if (le16_to_cpu(lc->ccontext.NameLength) != 4 ||
strncmp(name, "RqLs", 4)) {
- lc = (struct create_lease *)((char *)lc
- + le32_to_cpu(lc->ccontext.Next));
+ next = le32_to_cpu(lc->ccontext.Next);
continue;
}
if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
return SMB2_OPLOCK_LEVEL_NOCHANGE;
found = true;
break;
- } while (le32_to_cpu(lc->ccontext.Next) != 0);
+ } while (next != 0);
if (!found)
return 0;
@@ -877,23 +924,74 @@ parse_lease_state(struct smb2_create_rsp *rsp)
return smb2_map_lease_to_oplock(lc->lcontext.LeaseState);
}
+static int
+add_lease_context(struct kvec *iov, unsigned int *num_iovec, __u8 *oplock)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ iov[num].iov_base = create_lease_buf(oplock+1, *oplock);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_lease);
+ req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset = cpu_to_le32(
+ sizeof(struct smb2_create_req) - 4 +
+ iov[num - 1].iov_len);
+ req->CreateContextsLength = cpu_to_le32(
+ le32_to_cpu(req->CreateContextsLength) +
+ sizeof(struct create_lease));
+ inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
+ *num_iovec = num + 1;
+ return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ if (oparms->reconnect) {
+ iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
+ /* indicate that we don't need to relock the file */
+ oparms->reconnect = false;
+ } else
+ iov[num].iov_base = create_durable_buf();
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ req->CreateContextsLength =
+ cpu_to_le32(le32_to_cpu(req->CreateContextsLength) +
+ sizeof(struct create_durable));
+ inc_rfc1001_len(&req->hdr, sizeof(struct create_durable));
+ *num_iovec = num + 1;
+ return 0;
+}
+
int
-SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
- u64 *persistent_fid, u64 *volatile_fid, __u32 desired_access,
- __u32 create_disposition, __u32 file_attributes, __u32 create_options,
+SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
__u8 *oplock, struct smb2_file_all_info *buf)
{
struct smb2_create_req *req;
struct smb2_create_rsp *rsp;
struct TCP_Server_Info *server;
+ struct cifs_tcon *tcon = oparms->tcon;
struct cifs_ses *ses = tcon->ses;
- struct kvec iov[3];
+ struct kvec iov[4];
int resp_buftype;
int uni_path_len;
__le16 *copy_path = NULL;
int copy_size;
int rc = 0;
- int num_iovecs = 2;
+ unsigned int num_iovecs = 2;
+ __u32 file_attributes = 0;
cifs_dbg(FYI, "create/open\n");
@@ -906,55 +1004,47 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
if (rc)
return rc;
+ if (oparms->create_options & CREATE_OPTION_READONLY)
+ file_attributes |= ATTR_READONLY;
+
req->ImpersonationLevel = IL_IMPERSONATION;
- req->DesiredAccess = cpu_to_le32(desired_access);
+ req->DesiredAccess = cpu_to_le32(oparms->desired_access);
/* File attributes ignored on open (used in create though) */
req->FileAttributes = cpu_to_le32(file_attributes);
req->ShareAccess = FILE_SHARE_ALL_LE;
- req->CreateDisposition = cpu_to_le32(create_disposition);
- req->CreateOptions = cpu_to_le32(create_options);
+ req->CreateDisposition = cpu_to_le32(oparms->disposition);
+ req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
- req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)
- - 8 /* pad */ - 4 /* do not count rfc1001 len field */);
+ /* do not count rfc1001 len field */
+ req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4);
iov[0].iov_base = (char *)req;
/* 4 for rfc1002 length field */
iov[0].iov_len = get_rfc1002_length(req) + 4;
/* MUST set path len (NameLength) to 0 opening root of share */
- if (uni_path_len >= 4) {
- req->NameLength = cpu_to_le16(uni_path_len - 2);
- /* -1 since last byte is buf[0] which is sent below (path) */
- iov[0].iov_len--;
- if (uni_path_len % 8 != 0) {
- copy_size = uni_path_len / 8 * 8;
- if (copy_size < uni_path_len)
- copy_size += 8;
-
- copy_path = kzalloc(copy_size, GFP_KERNEL);
- if (!copy_path)
- return -ENOMEM;
- memcpy((char *)copy_path, (const char *)path,
- uni_path_len);
- uni_path_len = copy_size;
- path = copy_path;
- }
-
- iov[1].iov_len = uni_path_len;
- iov[1].iov_base = path;
- /*
- * -1 since last byte is buf[0] which was counted in
- * smb2_buf_len.
- */
- inc_rfc1001_len(req, uni_path_len - 1);
- } else {
- iov[0].iov_len += 7;
- req->hdr.smb2_buf_length = cpu_to_be32(be32_to_cpu(
- req->hdr.smb2_buf_length) + 8 - 1);
- num_iovecs = 1;
- req->NameLength = 0;
+ req->NameLength = cpu_to_le16(uni_path_len - 2);
+ /* -1 since last byte is buf[0] which is sent below (path) */
+ iov[0].iov_len--;
+ if (uni_path_len % 8 != 0) {
+ copy_size = uni_path_len / 8 * 8;
+ if (copy_size < uni_path_len)
+ copy_size += 8;
+
+ copy_path = kzalloc(copy_size, GFP_KERNEL);
+ if (!copy_path)
+ return -ENOMEM;
+ memcpy((char *)copy_path, (const char *)path,
+ uni_path_len);
+ uni_path_len = copy_size;
+ path = copy_path;
}
+ iov[1].iov_len = uni_path_len;
+ iov[1].iov_base = path;
+ /* -1 since last byte is buf[0] which was counted in smb2_buf_len */
+ inc_rfc1001_len(req, uni_path_len - 1);
+
if (!server->oplocks)
*oplock = SMB2_OPLOCK_LEVEL_NONE;
@@ -962,21 +1052,29 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
*oplock == SMB2_OPLOCK_LEVEL_NONE)
req->RequestedOplockLevel = *oplock;
else {
- iov[num_iovecs].iov_base = create_lease_buf(oplock+1, *oplock);
- if (iov[num_iovecs].iov_base == NULL) {
+ rc = add_lease_context(iov, &num_iovecs, oplock);
+ if (rc) {
cifs_small_buf_release(req);
kfree(copy_path);
- return -ENOMEM;
+ return rc;
+ }
+ }
+
+ if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
+ /* need to set Next field of lease context if we request it */
+ if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
+ struct create_context *ccontext =
+ (struct create_context *)iov[num_iovecs-1].iov_base;
+ ccontext->Next =
+ cpu_to_le32(sizeof(struct create_lease));
+ }
+ rc = add_durable_context(iov, &num_iovecs, oparms);
+ if (rc) {
+ cifs_small_buf_release(req);
+ kfree(copy_path);
+ kfree(iov[num_iovecs-1].iov_base);
+ return rc;
}
- iov[num_iovecs].iov_len = sizeof(struct create_lease);
- req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
- req->CreateContextsOffset = cpu_to_le32(
- sizeof(struct smb2_create_req) - 4 - 8 +
- iov[num_iovecs-1].iov_len);
- req->CreateContextsLength = cpu_to_le32(
- sizeof(struct create_lease));
- inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
- num_iovecs++;
}
rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
@@ -987,8 +1085,8 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
goto creat_exit;
}
- *persistent_fid = rsp->PersistentFileId;
- *volatile_fid = rsp->VolatileFileId;
+ oparms->fid->persistent_fid = rsp->PersistentFileId;
+ oparms->fid->volatile_fid = rsp->VolatileFileId;
if (buf) {
memcpy(buf, &rsp->CreationTime, 32);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f31043b26bd3..36b0d37ea69b 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -428,7 +428,7 @@ struct smb2_create_req {
__le16 NameLength;
__le32 CreateContextsOffset;
__le32 CreateContextsLength;
- __u8 Buffer[8];
+ __u8 Buffer[0];
} __packed;
struct smb2_create_rsp {
@@ -485,6 +485,18 @@ struct create_lease {
struct lease_context lcontext;
} __packed;
+struct create_durable {
+ struct create_context ccontext;
+ __u8 Name[8];
+ union {
+ __u8 Reserved[16];
+ struct {
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ } Fid;
+ } Data;
+} __packed;
+
/* this goes in the ioctl buffer when doing a copychunk request */
struct copychunk_ioctl {
char SourceKey[24];
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index d4e1eb807457..1a5ecbed40ed 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -84,11 +84,9 @@ extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
const char *from_name, const char *to_name,
struct cifs_sb_info *cifs_sb);
-extern int smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon,
- const char *full_path, int disposition,
- int desired_access, int create_options,
- struct cifs_fid *fid, __u32 *oplock,
- FILE_ALL_INFO *buf, struct cifs_sb_info *cifs_sb);
+extern int smb2_open_file(const unsigned int xid,
+ struct cifs_open_parms *oparms,
+ __u32 *oplock, FILE_ALL_INFO *buf);
extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
extern int smb2_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
@@ -106,11 +104,9 @@ extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses,
const char *tree, struct cifs_tcon *tcon,
const struct nls_table *);
extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
-extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon,
- __le16 *path, u64 *persistent_fid, u64 *volatile_fid,
- __u32 desired_access, __u32 create_disposition,
- __u32 file_attributes, __u32 create_options,
- __u8 *oplock, struct smb2_file_all_info *buf);
+extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
+ __le16 *path, __u8 *oplock,
+ struct smb2_file_all_info *buf);
extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
bool is_fsctl, char *in_data, u32 indatalen,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 09b4fbaadeb6..301b191270b9 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -39,6 +39,77 @@
#include "smb2status.h"
#include "smb2glob.h"
+static int
+smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+ unsigned int size;
+
+ if (server->secmech.sdeschmacsha256 != NULL)
+ return 0; /* already allocated */
+
+ server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
+ if (IS_ERR(server->secmech.hmacsha256)) {
+ cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
+ return PTR_ERR(server->secmech.hmacsha256);
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.hmacsha256);
+ server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdeschmacsha256) {
+ crypto_free_shash(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
+ server->secmech.sdeschmacsha256->shash.flags = 0x0;
+
+ return 0;
+}
+
+static int
+smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+ unsigned int size;
+ int rc;
+
+ if (server->secmech.sdesccmacaes != NULL)
+ return 0; /* already allocated */
+
+ rc = smb2_crypto_shash_allocate(server);
+ if (rc)
+ return rc;
+
+ server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
+ if (IS_ERR(server->secmech.cmacaes)) {
+ cifs_dbg(VFS, "could not allocate crypto cmac-aes");
+ kfree(server->secmech.sdeschmacsha256);
+ server->secmech.sdeschmacsha256 = NULL;
+ crypto_free_shash(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
+ return PTR_ERR(server->secmech.cmacaes);
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.cmacaes);
+ server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdesccmacaes) {
+ cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
+ kfree(server->secmech.sdeschmacsha256);
+ server->secmech.sdeschmacsha256 = NULL;
+ crypto_free_shash(server->secmech.hmacsha256);
+ crypto_free_shash(server->secmech.cmacaes);
+ server->secmech.hmacsha256 = NULL;
+ server->secmech.cmacaes = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
+ server->secmech.sdesccmacaes->shash.flags = 0x0;
+
+ return 0;
+}
+
+
int
smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
@@ -52,6 +123,12 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+ rc = smb2_crypto_shash_allocate(server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: shah256 alloc failed\n", __func__);
+ return rc;
+ }
+
rc = crypto_shash_setkey(server->secmech.hmacsha256,
server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
@@ -61,7 +138,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init sha256", __func__);
return rc;
}
@@ -129,6 +206,12 @@ generate_smb3signingkey(struct TCP_Server_Info *server)
memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
+ rc = smb3_crypto_shash_allocate(server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
+ goto smb3signkey_ret;
+ }
+
rc = crypto_shash_setkey(server->secmech.hmacsha256,
server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
@@ -210,6 +293,11 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return rc;
}
+ /*
+ * we already allocate sdesccmacaes when we init smb3 signing key,
+ * so unlike smb2 case we do not have to check here if secmech are
+ * initialized
+ */
rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash);
if (rc) {
cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 14a14808320c..190effc6a6fa 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -526,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
if (cii->c_flags & C_FLUSH)
coda_flag_inode_children(inode, C_FLUSH);
- if (de->d_count > 1)
+ if (d_count(de) > 1)
/* pretend it's valid, but don't change the flags */
goto out;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 64e5323cbbb0..277bd1be21fd 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d)
if (d->d_inode)
simple_rmdir(parent->d_inode,d);
- pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
+ pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
dput(parent);
}
@@ -660,19 +660,15 @@ static int create_default_group(struct config_group *parent_group,
struct config_group *group)
{
int ret;
- struct qstr name;
struct configfs_dirent *sd;
/* We trust the caller holds a reference to parent */
struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
if (!group->cg_item.ci_name)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
- name.name = group->cg_item.ci_name;
- name.len = strlen(name.name);
- name.hash = full_name_hash(name.name, name.len);
ret = -ENOMEM;
- child = d_alloc(parent, &name);
+ child = d_alloc_name(parent, group->cg_item.ci_name);
if (child) {
d_add(child, NULL);
@@ -1650,7 +1646,6 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
{
int err;
struct config_group *group = &subsys->su_group;
- struct qstr name;
struct dentry *dentry;
struct dentry *root;
struct configfs_dirent *sd;
@@ -1667,12 +1662,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);
- name.name = group->cg_item.ci_name;
- name.len = strlen(name.name);
- name.hash = full_name_hash(name.name, name.len);
-
err = -ENOMEM;
- dentry = d_alloc(root, &name);
+ dentry = d_alloc_name(root, group->cg_item.ci_name);
if (dentry) {
d_add(dentry, NULL);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cfa109a4d5a2..d10757635b9c 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -37,16 +37,8 @@
#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
-static int
-ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv);
-static int
-ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv);
+#define DECRYPT 0
+#define ENCRYPT 1
/**
* ecryptfs_to_hex
@@ -336,19 +328,20 @@ static void extent_crypt_complete(struct crypto_async_request *req, int rc)
}
/**
- * encrypt_scatterlist
+ * crypt_scatterlist
* @crypt_stat: Pointer to the crypt_stat struct to initialize.
- * @dest_sg: Destination of encrypted data
- * @src_sg: Data to be encrypted
- * @size: Length of data to be encrypted
- * @iv: iv to use during encryption
+ * @dst_sg: Destination of the data after performing the crypto operation
+ * @src_sg: Data to be encrypted or decrypted
+ * @size: Length of data
+ * @iv: IV to use
+ * @op: ENCRYPT or DECRYPT to indicate the desired operation
*
- * Returns the number of bytes encrypted; negative value on error
+ * Returns the number of bytes encrypted or decrypted; negative value on error
*/
-static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
- struct scatterlist *dest_sg,
- struct scatterlist *src_sg, int size,
- unsigned char *iv)
+static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
+ struct scatterlist *dst_sg,
+ struct scatterlist *src_sg, int size,
+ unsigned char *iv, int op)
{
struct ablkcipher_request *req = NULL;
struct extent_crypt_result ecr;
@@ -391,9 +384,9 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
crypt_stat->flags |= ECRYPTFS_KEY_SET;
}
mutex_unlock(&crypt_stat->cs_tfm_mutex);
- ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size);
- ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv);
- rc = crypto_ablkcipher_encrypt(req);
+ ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
+ rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
+ crypto_ablkcipher_decrypt(req);
if (rc == -EINPROGRESS || rc == -EBUSY) {
struct extent_crypt_result *ecr = req->base.data;
@@ -407,41 +400,43 @@ out:
}
/**
- * ecryptfs_lower_offset_for_extent
+ * lower_offset_for_page
*
* Convert an eCryptfs page index into a lower byte offset
*/
-static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
- struct ecryptfs_crypt_stat *crypt_stat)
+static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
+ struct page *page)
{
- (*offset) = ecryptfs_lower_header_size(crypt_stat)
- + (crypt_stat->extent_size * extent_num);
+ return ecryptfs_lower_header_size(crypt_stat) +
+ (page->index << PAGE_CACHE_SHIFT);
}
/**
- * ecryptfs_encrypt_extent
- * @enc_extent_page: Allocated page into which to encrypt the data in
- * @page
+ * crypt_extent
* @crypt_stat: crypt_stat containing cryptographic context for the
* encryption operation
- * @page: Page containing plaintext data extent to encrypt
+ * @dst_page: The page to write the result into
+ * @src_page: The page to read from
* @extent_offset: Page extent offset for use in generating IV
+ * @op: ENCRYPT or DECRYPT to indicate the desired operation
*
- * Encrypts one extent of data.
+ * Encrypts or decrypts one extent of data.
*
* Return zero on success; non-zero otherwise
*/
-static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
- struct ecryptfs_crypt_stat *crypt_stat,
- struct page *page,
- unsigned long extent_offset)
+static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
+ struct page *dst_page,
+ struct page *src_page,
+ unsigned long extent_offset, int op)
{
+ pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
loff_t extent_base;
char extent_iv[ECRYPTFS_MAX_IV_BYTES];
+ struct scatterlist src_sg, dst_sg;
+ size_t extent_size = crypt_stat->extent_size;
int rc;
- extent_base = (((loff_t)page->index)
- * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
+ extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
(extent_base + extent_offset));
if (rc) {
@@ -450,15 +445,21 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
(unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
- rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0,
- page, (extent_offset
- * crypt_stat->extent_size),
- crypt_stat->extent_size, extent_iv);
+
+ sg_init_table(&src_sg, 1);
+ sg_init_table(&dst_sg, 1);
+
+ sg_set_page(&src_sg, src_page, extent_size,
+ extent_offset * extent_size);
+ sg_set_page(&dst_sg, dst_page, extent_size,
+ extent_offset * extent_size);
+
+ rc = crypt_scatterlist(crypt_stat, &dst_sg, &src_sg, extent_size,
+ extent_iv, op);
if (rc < 0) {
- printk(KERN_ERR "%s: Error attempting to encrypt page with "
- "page->index = [%ld], extent_offset = [%ld]; "
- "rc = [%d]\n", __func__, page->index, extent_offset,
- rc);
+ printk(KERN_ERR "%s: Error attempting to crypt page with "
+ "page_index = [%ld], extent_offset = [%ld]; "
+ "rc = [%d]\n", __func__, page_index, extent_offset, rc);
goto out;
}
rc = 0;
@@ -489,6 +490,7 @@ int ecryptfs_encrypt_page(struct page *page)
char *enc_extent_virt;
struct page *enc_extent_page = NULL;
loff_t extent_offset;
+ loff_t lower_offset;
int rc = 0;
ecryptfs_inode = page->mapping->host;
@@ -502,75 +504,35 @@ int ecryptfs_encrypt_page(struct page *page)
"encrypted extent\n");
goto out;
}
- enc_extent_virt = kmap(enc_extent_page);
+
for (extent_offset = 0;
extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- loff_t offset;
-
- rc = ecryptfs_encrypt_extent(enc_extent_page, crypt_stat, page,
- extent_offset);
+ rc = crypt_extent(crypt_stat, enc_extent_page, page,
+ extent_offset, ENCRYPT);
if (rc) {
printk(KERN_ERR "%s: Error encrypting extent; "
"rc = [%d]\n", __func__, rc);
goto out;
}
- ecryptfs_lower_offset_for_extent(
- &offset, ((((loff_t)page->index)
- * (PAGE_CACHE_SIZE
- / crypt_stat->extent_size))
- + extent_offset), crypt_stat);
- rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
- offset, crypt_stat->extent_size);
- if (rc < 0) {
- ecryptfs_printk(KERN_ERR, "Error attempting "
- "to write lower page; rc = [%d]"
- "\n", rc);
- goto out;
- }
- }
- rc = 0;
-out:
- if (enc_extent_page) {
- kunmap(enc_extent_page);
- __free_page(enc_extent_page);
}
- return rc;
-}
-static int ecryptfs_decrypt_extent(struct page *page,
- struct ecryptfs_crypt_stat *crypt_stat,
- struct page *enc_extent_page,
- unsigned long extent_offset)
-{
- loff_t extent_base;
- char extent_iv[ECRYPTFS_MAX_IV_BYTES];
- int rc;
-
- extent_base = (((loff_t)page->index)
- * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
- rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
- (extent_base + extent_offset));
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
- "extent [0x%.16llx]; rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- goto out;
- }
- rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
- (extent_offset
- * crypt_stat->extent_size),
- enc_extent_page, 0,
- crypt_stat->extent_size, extent_iv);
+ lower_offset = lower_offset_for_page(crypt_stat, page);
+ enc_extent_virt = kmap(enc_extent_page);
+ rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
+ PAGE_CACHE_SIZE);
+ kunmap(enc_extent_page);
if (rc < 0) {
- printk(KERN_ERR "%s: Error attempting to decrypt to page with "
- "page->index = [%ld], extent_offset = [%ld]; "
- "rc = [%d]\n", __func__, page->index, extent_offset,
- rc);
+ ecryptfs_printk(KERN_ERR,
+ "Error attempting to write lower page; rc = [%d]\n",
+ rc);
goto out;
}
rc = 0;
out:
+ if (enc_extent_page) {
+ __free_page(enc_extent_page);
+ }
return rc;
}
@@ -594,43 +556,33 @@ int ecryptfs_decrypt_page(struct page *page)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
- char *enc_extent_virt;
- struct page *enc_extent_page = NULL;
+ char *page_virt;
unsigned long extent_offset;
+ loff_t lower_offset;
int rc = 0;
ecryptfs_inode = page->mapping->host;
crypt_stat =
&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
- enc_extent_page = alloc_page(GFP_USER);
- if (!enc_extent_page) {
- rc = -ENOMEM;
- ecryptfs_printk(KERN_ERR, "Error allocating memory for "
- "encrypted extent\n");
+
+ lower_offset = lower_offset_for_page(crypt_stat, page);
+ page_virt = kmap(page);
+ rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
+ ecryptfs_inode);
+ kunmap(page);
+ if (rc < 0) {
+ ecryptfs_printk(KERN_ERR,
+ "Error attempting to read lower page; rc = [%d]\n",
+ rc);
goto out;
}
- enc_extent_virt = kmap(enc_extent_page);
+
for (extent_offset = 0;
extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- loff_t offset;
-
- ecryptfs_lower_offset_for_extent(
- &offset, ((page->index * (PAGE_CACHE_SIZE
- / crypt_stat->extent_size))
- + extent_offset), crypt_stat);
- rc = ecryptfs_read_lower(enc_extent_virt, offset,
- crypt_stat->extent_size,
- ecryptfs_inode);
- if (rc < 0) {
- ecryptfs_printk(KERN_ERR, "Error attempting "
- "to read lower page; rc = [%d]"
- "\n", rc);
- goto out;
- }
- rc = ecryptfs_decrypt_extent(page, crypt_stat, enc_extent_page,
- extent_offset);
+ rc = crypt_extent(crypt_stat, page, page,
+ extent_offset, DECRYPT);
if (rc) {
printk(KERN_ERR "%s: Error encrypting extent; "
"rc = [%d]\n", __func__, rc);
@@ -638,140 +590,7 @@ int ecryptfs_decrypt_page(struct page *page)
}
}
out:
- if (enc_extent_page) {
- kunmap(enc_extent_page);
- __free_page(enc_extent_page);
- }
- return rc;
-}
-
-/**
- * decrypt_scatterlist
- * @crypt_stat: Cryptographic context
- * @dest_sg: The destination scatterlist to decrypt into
- * @src_sg: The source scatterlist to decrypt from
- * @size: The number of bytes to decrypt
- * @iv: The initialization vector to use for the decryption
- *
- * Returns the number of bytes decrypted; negative value on error
- */
-static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
- struct scatterlist *dest_sg,
- struct scatterlist *src_sg, int size,
- unsigned char *iv)
-{
- struct ablkcipher_request *req = NULL;
- struct extent_crypt_result ecr;
- int rc = 0;
-
- BUG_ON(!crypt_stat || !crypt_stat->tfm
- || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
- crypt_stat->key_size);
- ecryptfs_dump_hex(crypt_stat->key,
- crypt_stat->key_size);
- }
-
- init_completion(&ecr.completion);
-
- mutex_lock(&crypt_stat->cs_tfm_mutex);
- req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
- if (!req) {
- mutex_unlock(&crypt_stat->cs_tfm_mutex);
- rc = -ENOMEM;
- goto out;
- }
-
- ablkcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- extent_crypt_complete, &ecr);
- /* Consider doing this once, when the file is opened */
- if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
- rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
- crypt_stat->key_size);
- if (rc) {
- ecryptfs_printk(KERN_ERR,
- "Error setting key; rc = [%d]\n",
- rc);
- mutex_unlock(&crypt_stat->cs_tfm_mutex);
- rc = -EINVAL;
- goto out;
- }
- crypt_stat->flags |= ECRYPTFS_KEY_SET;
- }
- mutex_unlock(&crypt_stat->cs_tfm_mutex);
- ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
- ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv);
- rc = crypto_ablkcipher_decrypt(req);
- if (rc == -EINPROGRESS || rc == -EBUSY) {
- struct extent_crypt_result *ecr = req->base.data;
-
- wait_for_completion(&ecr->completion);
- rc = ecr->rc;
- INIT_COMPLETION(ecr->completion);
- }
-out:
- ablkcipher_request_free(req);
return rc;
-
-}
-
-/**
- * ecryptfs_encrypt_page_offset
- * @crypt_stat: The cryptographic context
- * @dst_page: The page to encrypt into
- * @dst_offset: The offset in the page to encrypt into
- * @src_page: The page to encrypt from
- * @src_offset: The offset in the page to encrypt from
- * @size: The number of bytes to encrypt
- * @iv: The initialization vector to use for the encryption
- *
- * Returns the number of bytes encrypted
- */
-static int
-ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv)
-{
- struct scatterlist src_sg, dst_sg;
-
- sg_init_table(&src_sg, 1);
- sg_init_table(&dst_sg, 1);
-
- sg_set_page(&src_sg, src_page, size, src_offset);
- sg_set_page(&dst_sg, dst_page, size, dst_offset);
- return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
-}
-
-/**
- * ecryptfs_decrypt_page_offset
- * @crypt_stat: The cryptographic context
- * @dst_page: The page to decrypt into
- * @dst_offset: The offset in the page to decrypt into
- * @src_page: The page to decrypt from
- * @src_offset: The offset in the page to decrypt from
- * @size: The number of bytes to decrypt
- * @iv: The initialization vector to use for the decryption
- *
- * Returns the number of bytes decrypted
- */
-static int
-ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv)
-{
- struct scatterlist src_sg, dst_sg;
-
- sg_init_table(&src_sg, 1);
- sg_set_page(&src_sg, src_page, size, src_offset);
-
- sg_init_table(&dst_sg, 1);
- sg_set_page(&dst_sg, dst_page, size, dst_offset);
-
- return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
}
#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 24f1105fda3a..992cf95830b5 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -49,7 +49,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
unsigned long nr_segs, loff_t pos)
{
ssize_t rc;
- struct path lower;
+ struct path *path;
struct file *file = iocb->ki_filp;
rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
@@ -60,9 +60,8 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
if (-EIOCBQUEUED == rc)
rc = wait_on_sync_kiocb(iocb);
if (rc >= 0) {
- lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry);
- lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry);
- touch_atime(&lower);
+ path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
+ touch_atime(path);
}
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a2f2bb2c256d..67e9b6339691 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
- BUG_ON(!lower_dentry->d_count);
+ BUG_ON(!d_count(lower_dentry));
ecryptfs_set_dentry_private(dentry, dentry_info);
ecryptfs_set_dentry_lower(dentry, lower_dentry);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e924cf45aad9..eb1c5979ecaf 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -120,16 +120,15 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
struct file **lower_file)
{
const struct cred *cred = current_cred();
- struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+ struct path *path = ecryptfs_dentry_to_lower_path(dentry);
int rc;
- rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt,
+ rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
cred);
if (rc) {
printk(KERN_ERR "Error opening lower file "
"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
- "rc = [%d]\n", lower_dentry, lower_mnt, rc);
+ "rc = [%d]\n", path->dentry, path->mnt, rc);
(*lower_file) = NULL;
}
return rc;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 49ff8ea08f1c..e57380e5f6bd 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -247,14 +247,13 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
goto unlock;
}
msg_size = (sizeof(*msg) + msg->data_len);
- msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
+ msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL);
if (!msg_ctx->msg) {
rc = -ENOMEM;
printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
"GFP_KERNEL memory\n", __func__, msg_size);
goto unlock;
}
- memcpy(msg_ctx->msg, msg, msg_size);
msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
wake_up_process(msg_ctx->task);
rc = 0;
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 7e787fb90293..07ab49745e31 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -155,20 +155,8 @@ static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
return 0;
};
-/*
- * Handle negative dentry.
- */
-static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- if (dentry->d_name.len > NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
- d_add(dentry, NULL);
- return NULL;
-}
-
const struct inode_operations efivarfs_dir_inode_operations = {
- .lookup = efivarfs_lookup,
+ .lookup = simple_lookup,
.unlink = efivarfs_unlink,
.create = efivarfs_create,
};
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index b31dbd4c46ad..1cb9c7e10c6f 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_ext3_sync_file_enter(file, datasync);
- if (inode->i_sb->s_flags & MS_RDONLY)
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated state */
+ smp_rmb();
+ if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+ return -EROFS;
return 0;
-
+ }
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret)
goto out;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 998ea111e537..1194b1f0f839 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1780,11 +1780,11 @@ retry:
inode->i_op = &ext3_file_inode_operations;
inode->i_fop = &ext3_file_operations;
ext3_set_aops(inode);
+ d_tmpfile(dentry, inode);
err = ext3_orphan_add(handle, inode);
if (err)
goto err_drop_inode;
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
unlock_new_inode(inode);
}
ext3_journal_stop(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6356665a74bb..c47f14750722 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -174,6 +174,11 @@ static void ext3_handle_error(struct super_block *sb)
if (test_opt (sb, ERRORS_RO)) {
ext3_msg(sb, KERN_CRIT,
"error: remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_state will be visible
+ * before ->s_flags update.
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
}
ext3_commit_super(sb, es, 1);
@@ -291,8 +296,14 @@ void ext3_abort(struct super_block *sb, const char *function,
ext3_msg(sb, KERN_CRIT,
"error: remounting filesystem read-only");
EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
- sb->s_flags |= MS_RDONLY;
set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+ /*
+ * Make sure updated value of ->s_mount_state will be visible
+ * before ->s_flags update.
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
+
if (EXT3_SB(sb)->s_journal)
journal_abort(EXT3_SB(sb)->s_journal, -EIO);
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 58339393fa6e..ddd715e42a5c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -38,8 +38,8 @@ ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_group_t group;
if (test_opt2(sb, STD_GROUP_SIZE))
- group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- block) >>
+ group = (block -
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
else
ext4_get_group_no_and_offset(sb, block, &group, NULL);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7097b0f680e6..a61873808f76 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2835,6 +2835,9 @@ again:
err = -EIO;
break;
}
+ /* Yield here to deal with large extent trees.
+ * Should be a no-op if we did IO above. */
+ cond_resched();
if (WARN_ON(i + 1 > depth)) {
err = -EIO;
break;
@@ -4261,8 +4264,8 @@ got_allocated_blocks:
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
ext4_discard_preallocations(inode);
- ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
- ext4_ext_get_actual_len(&newex), fb_flags);
+ ext4_free_blocks(handle, inode, NULL, newblock,
+ EXT4_C2B(sbi, allocated_clusters), fb_flags);
goto out2;
}
@@ -4382,8 +4385,9 @@ out2:
}
out3:
- trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
-
+ trace_ext4_ext_map_blocks_exit(inode, flags, map,
+ err ? err : allocated);
+ ext4_es_lru_add(inode);
return err ? err : allocated;
}
@@ -4405,9 +4409,20 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
+retry:
err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block);
+ if (err == ENOMEM) {
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ if (err) {
+ ext4_std_error(inode->i_sb, err);
+ return;
+ }
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+ ext4_std_error(inode->i_sb, err);
}
static void ext4_falloc_update_inode(struct inode *inode,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ee018d5f397e..91cb110da1b4 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -148,6 +148,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end);
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
int nr_to_scan);
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei);
int __init ext4_init_es(void)
{
@@ -439,7 +441,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
*/
if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
if (in_range(es->es_lblk, ee_block, ee_len)) {
- pr_warn("ES insert assertation failed for "
+ pr_warn("ES insert assertion failed for "
"inode: %lu we can find an extent "
"at block [%d/%d/%llu/%c], but we "
"want to add an delayed/hole extent "
@@ -458,7 +460,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
*/
if (es->es_lblk < ee_block ||
ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"ex_status [%d/%d/%llu/%c] != "
"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
ee_block, ee_len, ee_start,
@@ -468,7 +470,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
}
if (ee_status ^ es_status) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"ex_status [%d/%d/%llu/%c] != "
"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
ee_block, ee_len, ee_start,
@@ -481,7 +483,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
* that we don't want to add an written/unwritten extent.
*/
if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"can't find an extent at block %d but we want "
"to add an written/unwritten extent "
"[%d/%d/%llu/%llx]\n", inode->i_ino,
@@ -519,7 +521,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
* We want to add a delayed/hole extent but this
* block has been allocated.
*/
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"We can find blocks but we want to add a "
"delayed/hole extent [%d/%d/%llu/%llx]\n",
inode->i_ino, es->es_lblk, es->es_len,
@@ -527,13 +529,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
return;
} else if (ext4_es_is_written(es)) {
if (retval != es->es_len) {
- pr_warn("ES insert assertation failed for "
+ pr_warn("ES insert assertion failed for "
"inode: %lu retval %d != es_len %d\n",
inode->i_ino, retval, es->es_len);
return;
}
if (map.m_pblk != ext4_es_pblock(es)) {
- pr_warn("ES insert assertation failed for "
+ pr_warn("ES insert assertion failed for "
"inode: %lu m_pblk %llu != "
"es_pblk %llu\n",
inode->i_ino, map.m_pblk,
@@ -549,7 +551,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
}
} else if (retval == 0) {
if (ext4_es_is_written(es)) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"We can't find the block but we want to add "
"an written extent [%d/%d/%llu/%llx]\n",
inode->i_ino, es->es_lblk, es->es_len,
@@ -632,10 +634,8 @@ out:
}
/*
- * ext4_es_insert_extent() adds a space to a extent status tree.
- *
- * ext4_es_insert_extent is called by ext4_da_write_begin and
- * ext4_es_remove_extent.
+ * ext4_es_insert_extent() adds information to an inode's extent
+ * status tree.
*
* Return 0 on success, error code on failure.
*/
@@ -667,7 +667,13 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
err = __es_remove_extent(inode, lblk, end);
if (err != 0)
goto error;
+retry:
err = __es_insert_extent(inode, &newes);
+ if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+ EXT4_I(inode)))
+ goto retry;
+ if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+ err = 0;
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -746,8 +752,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status orig_es;
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
- int err = 0;
+ int err;
+retry:
+ err = 0;
es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
@@ -782,6 +790,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (err) {
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
+ if ((err == -ENOMEM) &&
+ __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+ EXT4_I(inode)))
+ goto retry;
goto out;
}
} else {
@@ -891,22 +903,14 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
return -1;
}
-static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei)
{
- struct ext4_sb_info *sbi = container_of(shrink,
- struct ext4_sb_info, s_es_shrinker);
struct ext4_inode_info *ei;
struct list_head *cur, *tmp;
LIST_HEAD(skiped);
- int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk = 0;
- ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
- trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
-
- if (!nr_to_scan)
- return ret;
-
spin_lock(&sbi->s_es_lru_lock);
/*
@@ -935,7 +939,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
continue;
}
- if (ei->i_es_lru_nr == 0)
+ if (ei->i_es_lru_nr == 0 || ei == locked_ei)
continue;
write_lock(&ei->i_es_lock);
@@ -954,6 +958,27 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
list_splice_tail(&skiped, &sbi->s_es_lru);
spin_unlock(&sbi->s_es_lru_lock);
+ if (locked_ei && nr_shrunk == 0)
+ nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
+
+ return nr_shrunk;
+}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct ext4_sb_info *sbi = container_of(shrink,
+ struct ext4_sb_info, s_es_shrinker);
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk;
+
+ ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+ if (!nr_to_scan)
+ return ret;
+
+ nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
return ret;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0188e65e1f58..ba33c67d6e48 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -465,7 +465,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
if (es_map->m_lblk != map->m_lblk ||
es_map->m_flags != map->m_flags ||
es_map->m_pblk != map->m_pblk) {
- printk("ES cache assertation failed for inode: %lu "
+ printk("ES cache assertion failed for inode: %lu "
"es_cached ex [%d/%d/%llu/%x] != "
"found ex [%d/%d/%llu/%x] retval %d flags %x\n",
inode->i_ino, es_map->m_lblk, es_map->m_len,
@@ -514,10 +514,9 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
- ext4_es_lru_add(inode);
-
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ ext4_es_lru_add(inode);
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -558,7 +557,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
#ifdef ES_AGGRESSIVE_TEST
if (retval != map->m_len) {
- printk("ES len assertation failed for inode: %lu "
+ printk("ES len assertion failed for inode: %lu "
"retval %d != map->m_len %d "
"in %s (lookup)\n", inode->i_ino, retval,
map->m_len, __func__);
@@ -659,7 +658,7 @@ found:
#ifdef ES_AGGRESSIVE_TEST
if (retval != map->m_len) {
- printk("ES len assertation failed for inode: %lu "
+ printk("ES len assertion failed for inode: %lu "
"retval %d != map->m_len %d "
"in %s (allocation)\n", inode->i_ino, retval,
map->m_len, __func__);
@@ -1529,11 +1528,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
"logical block %lu\n", inode->i_ino, map->m_len,
(unsigned long) map->m_lblk);
- ext4_es_lru_add(inode);
-
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, &es)) {
-
+ ext4_es_lru_add(inode);
if (ext4_es_is_hole(&es)) {
retval = 0;
down_read((&EXT4_I(inode)->i_data_sem));
@@ -1642,7 +1639,7 @@ add_delayed:
#ifdef ES_AGGRESSIVE_TEST
if (retval != map->m_len) {
- printk("ES len assertation failed for inode: %lu "
+ printk("ES len assertion failed for inode: %lu "
"retval %d != map->m_len %d "
"in %s (lookup)\n", inode->i_ino, retval,
map->m_len, __func__);
@@ -2163,7 +2160,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
mpd->io_submit.io_end->offset =
((loff_t)map->m_lblk) << inode->i_blkbits;
- while (map->m_len) {
+ do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
struct super_block *sb = inode->i_sb;
@@ -2201,7 +2198,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
err = mpage_map_and_submit_buffers(mpd);
if (err < 0)
return err;
- }
+ } while (map->m_len);
/* Update on-disk size after IO is submitted */
disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a9ff5e5137ca..4bbbf13bd743 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4740,11 +4740,16 @@ do_more:
* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed
*/
+ retry:
new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
if (!new_entry) {
- ext4_mb_unload_buddy(&e4b);
- err = -ENOMEM;
- goto error_return;
+ /*
+ * We use a retry loop because
+ * ext4_free_blocks() is not allowed to fail.
+ */
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
}
new_entry->efd_start_cluster = bit;
new_entry->efd_group = block_group;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 234b834d5a97..35f55a0dbc4b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2316,11 +2316,11 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
+ d_tmpfile(dentry, inode);
err = ext4_orphan_add(handle, inode);
if (err)
goto err_drop_inode;
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
unlock_new_inode(inode);
}
if (handle)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 48786cdb5e6c..6625d210fb45 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -25,6 +25,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/ratelimit.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -55,7 +56,7 @@ void ext4_exit_pageio(void)
static void buffer_io_error(struct buffer_head *bh)
{
char b[BDEVNAME_SIZE];
- printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+ printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
bdevname(bh->b_bdev, b),
(unsigned long long)bh->b_blocknr);
}
@@ -308,6 +309,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
return io_end;
}
+/* BIO completion function for page writeback */
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
@@ -318,18 +320,6 @@ static void ext4_end_bio(struct bio *bio, int error)
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- /*
- * Link bio into list hanging from io_end. We have to do it
- * atomically as bio completions can be racing against each
- * other.
- */
- bio->bi_private = xchg(&io_end->bio, bio);
- } else {
- ext4_finish_bio(bio);
- bio_put(bio);
- }
-
if (error) {
struct inode *inode = io_end->inode;
@@ -341,7 +331,24 @@ static void ext4_end_bio(struct bio *bio, int error)
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
}
- ext4_put_io_end_defer(io_end);
+
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ /*
+ * Link bio into list hanging from io_end. We have to do it
+ * atomically as bio completions can be racing against each
+ * other.
+ */
+ bio->bi_private = xchg(&io_end->bio, bio);
+ ext4_put_io_end_defer(io_end);
+ } else {
+ /*
+ * Drop io_end reference early. Inode can get freed once
+ * we finish the bio.
+ */
+ ext4_put_io_end_defer(io_end);
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
}
void ext4_io_submit(struct ext4_io_submit *io)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 85b3dd60169b..bca26f34edf4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1702,12 +1702,6 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
if (sbi->s_qf_names[GRPQUOTA])
seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
-
- if (test_opt(sb, USRQUOTA))
- seq_puts(seq, ",usrquota");
-
- if (test_opt(sb, GRPQUOTA))
- seq_puts(seq, ",grpquota");
#endif
}
@@ -3624,10 +3618,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
- /* Do we have standard group size of blocksize * 8 blocks ? */
- if (sbi->s_blocks_per_group == blocksize << 3)
- set_opt2(sb, STD_GROUP_SIZE);
-
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3697,6 +3687,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ /* Do we have standard group size of clustersize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == clustersize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
+
/*
* Test whether we have more sectors than will fit in sector_t,
* and whether the max offset is addressable by the page cache.
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 9d1cd423450d..62f0d5977c64 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -610,13 +610,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
unsigned long npages = dir_blocks(inode);
- unsigned int bit_pos = 0, start_bit_pos = 0;
+ unsigned int bit_pos = 0;
struct f2fs_dentry_block *dentry_blk = NULL;
struct f2fs_dir_entry *de = NULL;
struct page *dentry_page = NULL;
unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
unsigned char d_type = DT_UNKNOWN;
- int slots;
bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
@@ -625,7 +624,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
if (IS_ERR(dentry_page))
continue;
- start_bit_pos = bit_pos;
dentry_blk = kmap(dentry_page);
while (bit_pos < NR_DENTRY_IN_BLOCK) {
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
@@ -634,19 +632,19 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
if (bit_pos >= NR_DENTRY_IN_BLOCK)
break;
- ctx->pos += bit_pos - start_bit_pos;
de = &dentry_blk->dentry[bit_pos];
if (de->file_type < F2FS_FT_MAX)
d_type = f2fs_filetype_table[de->file_type];
else
d_type = DT_UNKNOWN;
if (!dir_emit(ctx,
- dentry_blk->filename[bit_pos],
- le16_to_cpu(de->name_len),
- le32_to_cpu(de->ino), d_type))
- goto success;
- slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
- bit_pos += slots;
+ dentry_blk->filename[bit_pos],
+ le16_to_cpu(de->name_len),
+ le32_to_cpu(de->ino), d_type))
+ goto stop;
+
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
}
bit_pos = 0;
ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
@@ -654,7 +652,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
f2fs_put_page(dentry_page, 1);
dentry_page = NULL;
}
-success:
+stop:
if (dentry_page && !IS_ERR(dentry_page)) {
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 21664fcf3616..4241e6f39e86 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -86,6 +86,7 @@ struct msdos_sb_info {
const void *dir_ops; /* Opaque; default directory operations */
int dir_per_block; /* dir entries per block */
int dir_per_block_bits; /* log2(dir_per_block) */
+ unsigned int vol_id; /*volume ID*/
int fatent_shift;
struct fatent_operations *fatent_ops;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e50ddb..9b104f543056 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -114,6 +114,12 @@ out:
return err;
}
+static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ return put_user(sbi->vol_id, user_attr);
+}
+
long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return fat_ioctl_get_attributes(inode, user_attr);
case FAT_IOCTL_SET_ATTRIBUTES:
return fat_ioctl_set_attributes(filp, user_attr);
+ case FAT_IOCTL_GET_VOLUME_ID:
+ return fat_ioctl_get_volume_id(inode, user_attr);
default:
return -ENOTTY; /* Inappropriate ioctl for device */
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5d4513cb1b3c..11b51bb55b42 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
brelse(fsinfo_bh);
}
+ /* interpret volume ID as a little endian 32 bit integer */
+ if (sbi->fat_bits == 32)
+ sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
+ ((u32)b->fat32.vol_id[1] << 8) |
+ ((u32)b->fat32.vol_id[2] << 16) |
+ ((u32)b->fat32.vol_id[3] << 24));
+ else /* fat 16 or 12 */
+ sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
+ ((u32)b->fat16.vol_id[1] << 8) |
+ ((u32)b->fat16.vol_id[2] << 16) |
+ ((u32)b->fat16.vol_id[3] << 24));
+
sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
diff --git a/fs/file_table.c b/fs/file_table.c
index 08e719b884ca..b44e4c559786 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -265,18 +265,15 @@ static void __fput(struct file *file)
mntput(mnt);
}
-static DEFINE_SPINLOCK(delayed_fput_lock);
-static LIST_HEAD(delayed_fput_list);
+static LLIST_HEAD(delayed_fput_list);
static void delayed_fput(struct work_struct *unused)
{
- LIST_HEAD(head);
- spin_lock_irq(&delayed_fput_lock);
- list_splice_init(&delayed_fput_list, &head);
- spin_unlock_irq(&delayed_fput_lock);
- while (!list_empty(&head)) {
- struct file *f = list_first_entry(&head, struct file, f_u.fu_list);
- list_del_init(&f->f_u.fu_list);
- __fput(f);
+ struct llist_node *node = llist_del_all(&delayed_fput_list);
+ struct llist_node *next;
+
+ for (; node; node = next) {
+ next = llist_next(node);
+ __fput(llist_entry(node, struct file, f_u.fu_llist));
}
}
@@ -306,18 +303,22 @@ void fput(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
- unsigned long flags;
file_sb_list_del(file);
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
init_task_work(&file->f_u.fu_rcuhead, ____fput);
if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
return;
+ /*
+ * After this task has run exit_task_work(),
+ * task_work_add() will fail. free_ipc_ns()->
+ * shm_destroy() can do this. Fall through to delayed
+ * fput to avoid leaking *file.
+ */
}
- spin_lock_irqsave(&delayed_fput_lock, flags);
- list_add(&file->f_u.fu_list, &delayed_fput_list);
- schedule_work(&delayed_fput_work);
- spin_unlock_irqrestore(&delayed_fput_lock, flags);
+
+ if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
+ schedule_work(&delayed_fput_work);
}
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a85ac4e33436..68851ff2fd41 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -963,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
/*
* Retrieve work items and do the writeback they describe
*/
-long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+static long wb_do_writeback(struct bdi_writeback *wb)
{
struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
@@ -971,12 +971,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
set_bit(BDI_writeback_running, &wb->bdi->state);
while ((work = get_next_work_item(bdi)) != NULL) {
- /*
- * Override sync mode, in case we must wait for completion
- * because this thread is exiting now.
- */
- if (force_wait)
- work->sync_mode = WB_SYNC_ALL;
trace_writeback_exec(bdi, work);
@@ -1025,7 +1019,7 @@ void bdi_writeback_workfn(struct work_struct *work)
* rescuer as work_list needs to be drained.
*/
do {
- pages_written = wb_do_writeback(wb, 0);
+ pages_written = wb_do_writeback(wb);
trace_writeback_pages_written(pages_written);
} while (!list_empty(&bdi->work_list));
} else {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0eda52738ec4..72a5d5b04494 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1223,30 +1223,46 @@ static int fuse_direntplus_link(struct file *file,
if (name.name[1] == '.' && name.len == 2)
return 0;
}
+
+ if (invalid_nodeid(o->nodeid))
+ return -EIO;
+ if (!fuse_valid_type(o->attr.mode))
+ return -EIO;
+
fc = get_fuse_conn(dir);
name.hash = full_name_hash(name.name, name.len);
dentry = d_lookup(parent, &name);
- if (dentry && dentry->d_inode) {
+ if (dentry) {
inode = dentry->d_inode;
- if (get_node_id(inode) == o->nodeid) {
+ if (!inode) {
+ d_drop(dentry);
+ } else if (get_node_id(inode) != o->nodeid ||
+ ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+ err = d_invalidate(dentry);
+ if (err)
+ goto out;
+ } else if (is_bad_inode(inode)) {
+ err = -EIO;
+ goto out;
+ } else {
struct fuse_inode *fi;
fi = get_fuse_inode(inode);
spin_lock(&fc->lock);
fi->nlookup++;
spin_unlock(&fc->lock);
+ fuse_change_attributes(inode, &o->attr,
+ entry_attr_timeout(o),
+ attr_version);
+
/*
* The other branch to 'found' comes via fuse_iget()
* which bumps nlookup inside
*/
goto found;
}
- err = d_invalidate(dentry);
- if (err)
- goto out;
dput(dentry);
- dentry = NULL;
}
dentry = d_alloc(parent, &name);
@@ -1259,25 +1275,30 @@ static int fuse_direntplus_link(struct file *file,
if (!inode)
goto out;
- alias = d_materialise_unique(dentry, inode);
- err = PTR_ERR(alias);
- if (IS_ERR(alias))
- goto out;
+ if (S_ISDIR(inode->i_mode)) {
+ mutex_lock(&fc->inst_mutex);
+ alias = fuse_d_add_directory(dentry, inode);
+ mutex_unlock(&fc->inst_mutex);
+ err = PTR_ERR(alias);
+ if (IS_ERR(alias)) {
+ iput(inode);
+ goto out;
+ }
+ } else {
+ alias = d_splice_alias(inode, dentry);
+ }
+
if (alias) {
dput(dentry);
dentry = alias;
}
found:
- fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o),
- attr_version);
-
fuse_change_entry_timeout(dentry, o);
err = 0;
out:
- if (dentry)
- dput(dentry);
+ dput(dentry);
return err;
}
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9a55f53be5ff..370d7b6c5942 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -346,8 +346,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
(unsigned long long) blkno,
(unsigned long long) nblocks);
- jfs_error(ip->i_sb,
- "dbFree: block to be freed is outside the map");
+ jfs_error(ip->i_sb, "block to be freed is outside the map\n");
return -EIO;
}
@@ -384,7 +383,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
/* free the blocks. */
if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
- jfs_error(ip->i_sb, "dbFree: error in block map\n");
+ jfs_error(ip->i_sb, "error in block map\n");
release_metapage(mp);
IREAD_UNLOCK(ipbmap);
return (rc);
@@ -441,8 +440,7 @@ dbUpdatePMap(struct inode *ipbmap,
printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
(unsigned long long) blkno,
(unsigned long long) nblocks);
- jfs_error(ipbmap->i_sb,
- "dbUpdatePMap: blocks are outside the map");
+ jfs_error(ipbmap->i_sb, "blocks are outside the map\n");
return -EIO;
}
@@ -726,7 +724,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
/* the hint should be within the map */
if (hint >= mapSize) {
- jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map");
+ jfs_error(ip->i_sb, "the hint is outside the map\n");
return -EIO;
}
@@ -1057,8 +1055,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
bmp = sbi->bmap;
if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
IREAD_UNLOCK(ipbmap);
- jfs_error(ip->i_sb,
- "dbExtend: the block is outside the filesystem");
+ jfs_error(ip->i_sb, "the block is outside the filesystem\n");
return -EIO;
}
@@ -1134,8 +1131,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
u32 mask;
if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocNext: Corrupt dmap page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
return -EIO;
}
@@ -1265,8 +1261,7 @@ dbAllocNear(struct bmap * bmp,
s8 *leaf;
if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocNear: Corrupt dmap page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
return -EIO;
}
@@ -1381,8 +1376,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
*/
if (l2nb > bmp->db_agl2size) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: allocation request is larger than the "
- "allocation group size");
+ "allocation request is larger than the allocation group size\n");
return -EIO;
}
@@ -1417,7 +1411,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
(unsigned long long) blkno,
(unsigned long long) nblocks);
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: dbAllocCtl failed in free AG");
+ "dbAllocCtl failed in free AG\n");
}
return (rc);
}
@@ -1433,8 +1427,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
budmin = dcp->budmin;
if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: Corrupt dmapctl page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
release_metapage(mp);
return -EIO;
}
@@ -1475,7 +1468,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
}
if (n == 4) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: failed descending stree");
+ "failed descending stree\n");
release_metapage(mp);
return -EIO;
}
@@ -1515,8 +1508,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
&blkno))) {
if (rc == -ENOSPC) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: control page "
- "inconsistent");
+ "control page inconsistent\n");
return -EIO;
}
return (rc);
@@ -1528,7 +1520,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
if (rc == -ENOSPC) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: unable to allocate blocks");
+ "unable to allocate blocks\n");
rc = -EIO;
}
return (rc);
@@ -1587,8 +1579,7 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
*/
rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
if (rc == -ENOSPC) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAny: unable to allocate blocks");
+ jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n");
return -EIO;
}
return (rc);
@@ -1652,8 +1643,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS);
if (totrim == NULL) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbDiscardAG: no memory for trim array");
+ jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n");
IWRITE_UNLOCK(ipbmap);
return 0;
}
@@ -1682,8 +1672,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
nblocks = 1 << l2nb;
} else {
/* Trim any already allocated blocks */
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbDiscardAG: -EIO");
+ jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
break;
}
@@ -1761,7 +1750,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbFindCtl: Corrupt dmapctl page");
+ "Corrupt dmapctl page\n");
release_metapage(mp);
return -EIO;
}
@@ -1782,7 +1771,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
if (rc) {
if (lev != level) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbFindCtl: dmap inconsistent");
+ "dmap inconsistent\n");
return -EIO;
}
return -ENOSPC;
@@ -1906,7 +1895,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
if (dp->tree.stree[ROOT] != L2BPERDMAP) {
release_metapage(mp);
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocCtl: the dmap is not all free");
+ "the dmap is not all free\n");
rc = -EIO;
goto backout;
}
@@ -1953,7 +1942,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
* to indicate that we have leaked blocks.
*/
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocCtl: I/O Error: Block Leakage.");
+ "I/O Error: Block Leakage\n");
continue;
}
dp = (struct dmap *) mp->data;
@@ -1965,8 +1954,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
* to indicate that we have leaked blocks.
*/
release_metapage(mp);
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocCtl: Block Leakage.");
+ jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n");
continue;
}
@@ -2263,8 +2251,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
for (; nwords > 0; nwords -= nw) {
if (leaf[word] < BUDMIN) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocBits: leaf page "
- "corrupt");
+ "leaf page corrupt\n");
break;
}
@@ -2536,8 +2523,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
dcp = (struct dmapctl *) mp->data;
if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAdjCtl: Corrupt dmapctl page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
release_metapage(mp);
return -EIO;
}
@@ -2638,8 +2624,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
assert(level == bmp->db_maxlevel);
if (bmp->db_maxfreebud != oldroot) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAdjCtl: the maximum free buddy is "
- "not the old root");
+ "the maximum free buddy is not the old root\n");
}
bmp->db_maxfreebud = dcp->stree[ROOT];
}
@@ -3481,7 +3466,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
p = BMAPBLKNO + nbperpage; /* L2 page */
l2mp = read_metapage(ipbmap, p, PSIZE, 0);
if (!l2mp) {
- jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read");
+ jfs_error(ipbmap->i_sb, "L2 page could not be read\n");
return -EIO;
}
l2dcp = (struct dmapctl *) l2mp->data;
@@ -3646,8 +3631,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
}
} /* for each L1 in a L2 */
- jfs_error(ipbmap->i_sb,
- "dbExtendFS: function has not returned as expected");
+ jfs_error(ipbmap->i_sb, "function has not returned as expected\n");
errout:
if (l0mp)
release_metapage(l0mp);
@@ -3717,7 +3701,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
}
if (bmp->db_agpref >= bmp->db_numag) {
jfs_error(ipbmap->i_sb,
- "cannot find ag with average freespace");
+ "cannot find ag with average freespace\n");
}
}
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 9f4ed13d9f15..8743ba9c6742 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -124,21 +124,21 @@ struct dtsplit {
#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
/* get page buffer for specified block address */
-#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
-{\
- BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\
- if (!(RC))\
- {\
- if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\
- ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\
- {\
- BT_PUTPAGE(MP);\
- jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\
- MP = NULL;\
- RC = -EIO;\
- }\
- }\
-}
+#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
+do { \
+ BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \
+ if (!(RC)) { \
+ if (((P)->header.nextindex > \
+ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
+ ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \
+ BT_PUTPAGE(MP); \
+ jfs_error((IP)->i_sb, \
+ "DT_GETPAGE: dtree page corrupt\n"); \
+ MP = NULL; \
+ RC = -EIO; \
+ } \
+ } \
+} while (0)
/* for consistency */
#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -776,7 +776,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
/* Something's corrupted, mark filesystem dirty so
* chkdsk will fix it.
*/
- jfs_error(sb, "stack overrun in dtSearch!");
+ jfs_error(sb, "stack overrun!\n");
BT_STACK_DUMP(btstack);
rc = -EIO;
goto out;
@@ -3247,8 +3247,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
/* Sanity Check */
if (d_namleft == 0) {
jfs_error(ip->i_sb,
- "JFS:Dtree error: ino = "
- "%ld, bn=%Ld, index = %d",
+ "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
(long)ip->i_ino,
(long long)bn,
i);
@@ -3368,7 +3367,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
*/
if (BT_STACK_FULL(btstack)) {
DT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "dtReadFirst: btstack overrun");
+ jfs_error(ip->i_sb, "btstack overrun\n");
BT_STACK_DUMP(btstack);
return -EIO;
}
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index e5fe8506ed16..2ae7d59ab10a 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -388,7 +388,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
if ((rc == 0) && xlen) {
if (xlen != nbperpage) {
- jfs_error(ip->i_sb, "extHint: corrupt xtree");
+ jfs_error(ip->i_sb, "corrupt xtree\n");
rc = -EIO;
}
XADaddress(xp, xaddr);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f7e042b63ddb..f321986e73d2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -386,7 +386,7 @@ int diRead(struct inode *ip)
dp += rel_inode;
if (ip->i_ino != le32_to_cpu(dp->di_number)) {
- jfs_error(ip->i_sb, "diRead: i_ino != di_number");
+ jfs_error(ip->i_sb, "i_ino != di_number\n");
rc = -EIO;
} else if (le32_to_cpu(dp->di_nlink) == 0)
rc = -ESTALE;
@@ -625,7 +625,7 @@ int diWrite(tid_t tid, struct inode *ip)
if (!addressPXD(&(jfs_ip->ixpxd)) ||
(lengthPXD(&(jfs_ip->ixpxd)) !=
JFS_IP(ipimap)->i_imap->im_nbperiext)) {
- jfs_error(ip->i_sb, "diWrite: ixpxd invalid");
+ jfs_error(ip->i_sb, "ixpxd invalid\n");
return -EIO;
}
@@ -893,8 +893,7 @@ int diFree(struct inode *ip)
if (iagno >= imap->im_nextiag) {
print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
imap, 32, 0);
- jfs_error(ip->i_sb,
- "diFree: inum = %d, iagno = %d, nextiag = %d",
+ jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n",
(uint) inum, iagno, imap->im_nextiag);
return -EIO;
}
@@ -930,15 +929,14 @@ int diFree(struct inode *ip)
mask = HIGHORDER >> bitno;
if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
- jfs_error(ip->i_sb,
- "diFree: wmap shows inode already free");
+ jfs_error(ip->i_sb, "wmap shows inode already free\n");
}
if (!addressPXD(&iagp->inoext[extno])) {
release_metapage(mp);
IREAD_UNLOCK(ipimap);
AG_UNLOCK(imap, agno);
- jfs_error(ip->i_sb, "diFree: invalid inoext");
+ jfs_error(ip->i_sb, "invalid inoext\n");
return -EIO;
}
@@ -950,7 +948,7 @@ int diFree(struct inode *ip)
release_metapage(mp);
IREAD_UNLOCK(ipimap);
AG_UNLOCK(imap, agno);
- jfs_error(ip->i_sb, "diFree: numfree > numinos");
+ jfs_error(ip->i_sb, "numfree > numinos\n");
return -EIO;
}
/*
@@ -1199,7 +1197,7 @@ int diFree(struct inode *ip)
* for the inode being freed.
*/
if (iagp->pmap[extno] != 0) {
- jfs_error(ip->i_sb, "diFree: the pmap does not show inode free");
+ jfs_error(ip->i_sb, "the pmap does not show inode free\n");
}
iagp->wmap[extno] = 0;
PXDlength(&iagp->inoext[extno], 0);
@@ -1518,8 +1516,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
release_metapage(mp);
AG_UNLOCK(imap, agno);
jfs_error(ip->i_sb,
- "diAlloc: can't find free bit "
- "in wmap");
+ "can't find free bit in wmap\n");
return -EIO;
}
@@ -1660,7 +1657,7 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
numinos = imap->im_agctl[agno].numinos;
if (numfree > numinos) {
- jfs_error(ip->i_sb, "diAllocAG: numfree > numinos");
+ jfs_error(ip->i_sb, "numfree > numinos\n");
return -EIO;
}
@@ -1811,8 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
if (!iagp->nfreeinos) {
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
- jfs_error(ip->i_sb,
- "diAllocIno: nfreeinos = 0, but iag on freelist");
+ jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n");
return -EIO;
}
@@ -1824,7 +1820,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
jfs_error(ip->i_sb,
- "diAllocIno: free inode not found in summary map");
+ "free inode not found in summary map\n");
return -EIO;
}
@@ -1839,7 +1835,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
if (rem >= EXTSPERSUM) {
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
- jfs_error(ip->i_sb, "diAllocIno: no free extent found");
+ jfs_error(ip->i_sb, "no free extent found\n");
return -EIO;
}
extno = (sword << L2EXTSPERSUM) + rem;
@@ -1850,7 +1846,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
if (rem >= INOSPEREXT) {
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
- jfs_error(ip->i_sb, "diAllocIno: free inode not found");
+ jfs_error(ip->i_sb, "free inode not found\n");
return -EIO;
}
@@ -1936,7 +1932,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
if ((rc = diIAGRead(imap, iagno, &mp))) {
IREAD_UNLOCK(imap->im_ipimap);
- jfs_error(ip->i_sb, "diAllocExt: error reading iag");
+ jfs_error(ip->i_sb, "error reading iag\n");
return rc;
}
iagp = (struct iag *) mp->data;
@@ -1948,8 +1944,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
if (sword >= SMAPSZ) {
release_metapage(mp);
IREAD_UNLOCK(imap->im_ipimap);
- jfs_error(ip->i_sb,
- "diAllocExt: free ext summary map not found");
+ jfs_error(ip->i_sb, "free ext summary map not found\n");
return -EIO;
}
if (~iagp->extsmap[sword])
@@ -1962,7 +1957,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
if (rem >= EXTSPERSUM) {
release_metapage(mp);
IREAD_UNLOCK(imap->im_ipimap);
- jfs_error(ip->i_sb, "diAllocExt: free extent not found");
+ jfs_error(ip->i_sb, "free extent not found\n");
return -EIO;
}
extno = (sword << L2EXTSPERSUM) + rem;
@@ -2081,8 +2076,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
if (bmp)
release_metapage(bmp);
- jfs_error(imap->im_ipimap->i_sb,
- "diAllocBit: iag inconsistent");
+ jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n");
return -EIO;
}
@@ -2189,7 +2183,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
/* better have free extents.
*/
if (!iagp->nfreeexts) {
- jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents");
+ jfs_error(imap->im_ipimap->i_sb, "no free extents\n");
return -EIO;
}
@@ -2261,7 +2255,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
}
if (ciagp == NULL) {
jfs_error(imap->im_ipimap->i_sb,
- "diNewExt: ciagp == NULL");
+ "ciagp == NULL\n");
rc = -EIO;
goto error_out;
}
@@ -2498,7 +2492,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
IWRITE_UNLOCK(ipimap);
IAGFREE_UNLOCK(imap);
jfs_error(imap->im_ipimap->i_sb,
- "diNewIAG: ipimap->i_size is wrong");
+ "ipimap->i_size is wrong\n");
return -EIO;
}
@@ -2758,8 +2752,7 @@ diUpdatePMap(struct inode *ipimap,
iagno = INOTOIAG(inum);
/* make sure that the iag is contained within the map */
if (iagno >= imap->im_nextiag) {
- jfs_error(ipimap->i_sb,
- "diUpdatePMap: the iag is outside the map");
+ jfs_error(ipimap->i_sb, "the iag is outside the map\n");
return -EIO;
}
/* read the iag */
@@ -2788,13 +2781,13 @@ diUpdatePMap(struct inode *ipimap,
*/
if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
jfs_error(ipimap->i_sb,
- "diUpdatePMap: inode %ld not marked as "
- "allocated in wmap!", inum);
+ "inode %ld not marked as allocated in wmap!\n",
+ inum);
}
if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
jfs_error(ipimap->i_sb,
- "diUpdatePMap: inode %ld not marked as "
- "allocated in pmap!", inum);
+ "inode %ld not marked as allocated in pmap!\n",
+ inum);
}
/* update the bitmap for the extent of the freed inode */
iagp->pmap[extno] &= cpu_to_le32(~mask);
@@ -2809,15 +2802,13 @@ diUpdatePMap(struct inode *ipimap,
if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
release_metapage(mp);
jfs_error(ipimap->i_sb,
- "diUpdatePMap: the inode is not allocated in "
- "the working map");
+ "the inode is not allocated in the working map\n");
return -EIO;
}
if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
release_metapage(mp);
jfs_error(ipimap->i_sb,
- "diUpdatePMap: the inode is not free in the "
- "persistent map");
+ "the inode is not free in the persistent map\n");
return -EIO;
}
/* update the bitmap for the extent of the allocated inode */
@@ -2909,8 +2900,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
iagp = (struct iag *) bp->data;
if (le32_to_cpu(iagp->iagnum) != i) {
release_metapage(bp);
- jfs_error(ipimap->i_sb,
- "diExtendFs: unexpected value of iagnum");
+ jfs_error(ipimap->i_sb, "unexpected value of iagnum\n");
return -EIO;
}
@@ -2986,8 +2976,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
if (xnuminos != atomic_read(&imap->im_numinos) ||
xnumfree != atomic_read(&imap->im_numfree)) {
- jfs_error(ipimap->i_sb,
- "diExtendFs: numinos or numfree incorrect");
+ jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n");
return -EIO;
}
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 9e3aaff11f89..d165cde0c68d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -647,7 +647,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
if (mp) {
if (mp->logical_size != size) {
jfs_error(inode->i_sb,
- "__get_metapage: mp->logical_size != size");
+ "get_mp->logical_size != size\n");
jfs_err("logical_size = %d, size = %d",
mp->logical_size, size);
dump_stack();
@@ -658,8 +658,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
if (test_bit(META_discard, &mp->flag)) {
if (!new) {
jfs_error(inode->i_sb,
- "__get_metapage: using a "
- "discarded metapage");
+ "using a discarded metapage\n");
discard_metapage(mp);
goto unlock;
}
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
index 884fc21ab8ee..04847b8d3070 100644
--- a/fs/jfs/jfs_superblock.h
+++ b/fs/jfs/jfs_superblock.h
@@ -108,6 +108,7 @@ struct jfs_superblock {
extern int readSuper(struct super_block *, struct buffer_head **);
extern int updateSuper(struct super_block *, uint);
+__printf(2, 3)
extern void jfs_error(struct super_block *, const char *, ...);
extern int jfs_mount(struct super_block *);
extern int jfs_mount_rw(struct super_block *, int);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 5fcc02eaa64c..564c4f279ac6 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2684,7 +2684,7 @@ void txAbort(tid_t tid, int dirty)
* mark filesystem dirty
*/
if (dirty)
- jfs_error(tblk->sb, "txAbort");
+ jfs_error(tblk->sb, "\n");
return;
}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 6c50871e6220..5ad7748860ce 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -64,22 +64,23 @@
/* get page buffer for specified block address */
/* ToDo: Replace this ugly macro with a function */
-#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
-{\
- BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\
- if (!(RC))\
- {\
- if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\
- (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\
- (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\
- {\
- jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\
- BT_PUTPAGE(MP);\
- MP = NULL;\
- RC = -EIO;\
- }\
- }\
-}
+#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
+do { \
+ BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \
+ if (!(RC)) { \
+ if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
+ (le16_to_cpu((P)->header.nextindex) > \
+ le16_to_cpu((P)->header.maxentry)) || \
+ (le16_to_cpu((P)->header.maxentry) > \
+ (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
+ jfs_error((IP)->i_sb, \
+ "XT_GETPAGE: xtree page corrupt\n"); \
+ BT_PUTPAGE(MP); \
+ MP = NULL; \
+ RC = -EIO; \
+ } \
+ } \
+} while (0)
/* for consistency */
#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -499,7 +500,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
/* push (bn, index) of the parent page/entry */
if (BT_STACK_FULL(btstack)) {
- jfs_error(ip->i_sb, "stack overrun in xtSearch!");
+ jfs_error(ip->i_sb, "stack overrun!\n");
XT_PUTPAGE(mp);
return -EIO;
}
@@ -1385,7 +1386,7 @@ int xtExtend(tid_t tid, /* transaction id */
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent");
+ jfs_error(ip->i_sb, "xtSearch did not find extent\n");
return -EIO;
}
@@ -1393,7 +1394,7 @@ int xtExtend(tid_t tid, /* transaction id */
xad = &p->xad[index];
if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtExtend: extension is not contiguous");
+ jfs_error(ip->i_sb, "extension is not contiguous\n");
return -EIO;
}
@@ -1552,7 +1553,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtTailgate: couldn't find extent");
+ jfs_error(ip->i_sb, "couldn't find extent\n");
return -EIO;
}
@@ -1560,8 +1561,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
nextindex = le16_to_cpu(p->header.nextindex);
if (index != nextindex - 1) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb,
- "xtTailgate: the entry found is not the last entry");
+ jfs_error(ip->i_sb, "the entry found is not the last entry\n");
return -EIO;
}
@@ -1734,7 +1734,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtUpdate: Could not find extent");
+ jfs_error(ip->i_sb, "Could not find extent\n");
return -EIO;
}
@@ -1758,7 +1758,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
(nxoff + nxlen > xoff + xlen)) {
XT_PUTPAGE(mp);
jfs_error(ip->i_sb,
- "xtUpdate: nXAD in not completely contained within XAD");
+ "nXAD in not completely contained within XAD\n");
return -EIO;
}
@@ -1907,7 +1907,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
if (xoff >= nxoff) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff");
+ jfs_error(ip->i_sb, "xoff >= nxoff\n");
return -EIO;
}
/* #endif _JFS_WIP_COALESCE */
@@ -2048,14 +2048,13 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtUpdate: xtSearch failed");
+ jfs_error(ip->i_sb, "xtSearch failed\n");
return -EIO;
}
if (index0 != index) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb,
- "xtUpdate: unexpected value of index");
+ jfs_error(ip->i_sb, "unexpected value of index\n");
return -EIO;
}
}
@@ -3650,7 +3649,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
getChild:
/* save current parent entry for the child page */
if (BT_STACK_FULL(&btstack)) {
- jfs_error(ip->i_sb, "stack overrun in xtTruncate!");
+ jfs_error(ip->i_sb, "stack overrun!\n");
XT_PUTPAGE(mp);
return -EIO;
}
@@ -3751,8 +3750,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb,
- "xtTruncate_pmap: did not find extent");
+ jfs_error(ip->i_sb, "did not find extent\n");
return -EIO;
}
} else {
@@ -3851,7 +3849,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
getChild:
/* save current parent entry for the child page */
if (BT_STACK_FULL(&btstack)) {
- jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!");
+ jfs_error(ip->i_sb, "stack overrun!\n");
XT_PUTPAGE(mp);
return -EIO;
}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 8b19027291d6..aa8a3370631b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1176,7 +1176,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!S_ISDIR(old_ip->i_mode) && new_ip)
IWRITE_UNLOCK(new_ip);
jfs_error(new_ip->i_sb,
- "jfs_rename: new_ip->i_nlink != 0");
+ "new_ip->i_nlink != 0\n");
return -EIO;
}
tblk = tid_to_tblock(tid);
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 8d0c1c7c0820..90b3bc21e9b0 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
goto resume;
error_out:
- jfs_error(sb, "jfs_extendfs");
+ jfs_error(sb, "\n");
resume:
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 788e0a9c1fb0..6669aa2042c3 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -92,16 +92,20 @@ static void jfs_handle_error(struct super_block *sb)
/* nothing is done for continue beyond marking the superblock dirty */
}
-void jfs_error(struct super_block *sb, const char * function, ...)
+void jfs_error(struct super_block *sb, const char *fmt, ...)
{
- static char error_buf[256];
+ struct va_format vaf;
va_list args;
- va_start(args, function);
- vsnprintf(error_buf, sizeof(error_buf), function, args);
- va_end(args);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
- pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf);
+ pr_err("ERROR: (device %s): %pf: %pV\n",
+ sb->s_id, __builtin_return_address(0), &vaf);
+
+ va_end(args);
jfs_handle_error(sb);
}
@@ -617,7 +621,7 @@ static int jfs_freeze(struct super_block *sb)
txQuiesce(sb);
rc = lmLogShutdown(log);
if (rc) {
- jfs_error(sb, "jfs_freeze: lmLogShutdown failed");
+ jfs_error(sb, "lmLogShutdown failed\n");
/* let operations fail rather than hang */
txResume(sb);
@@ -646,12 +650,12 @@ static int jfs_unfreeze(struct super_block *sb)
if (!(sb->s_flags & MS_RDONLY)) {
rc = updateSuper(sb, FM_MOUNT);
if (rc) {
- jfs_error(sb, "jfs_unfreeze: updateSuper failed");
+ jfs_error(sb, "updateSuper failed\n");
goto out;
}
rc = lmLogInit(log);
if (rc)
- jfs_error(sb, "jfs_unfreeze: lmLogInit failed");
+ jfs_error(sb, "lmLogInit failed\n");
out:
txResume(sb);
}
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 42d67f9757bf..d3472f4cd530 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -382,7 +382,7 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
nbytes = sizeDXD(&ji->ea);
if (!nbytes) {
- jfs_error(sb, "ea_read: nbytes is 0");
+ jfs_error(sb, "nbytes is 0\n");
return -EIO;
}
@@ -482,7 +482,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
current_blocks = 0;
} else {
if (!(ji->ea.flag & DXD_EXTENT)) {
- jfs_error(sb, "ea_get: invalid ea.flag)");
+ jfs_error(sb, "invalid ea.flag\n");
return -EIO;
}
current_blocks = (ea_size + sb->s_blocksize - 1) >>
@@ -1089,8 +1089,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
}
#ifdef CONFIG_JFS_SECURITY
-int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
const struct xattr *xattr;
tid_t *tid = fs_info;
diff --git a/fs/libfs.c b/fs/libfs.c
index c3a0837fb861..3a3a9b53bf5a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -61,7 +61,8 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- d_set_d_op(dentry, &simple_dentry_operations);
+ if (!dentry->d_sb->s_d_op)
+ d_set_d_op(dentry, &simple_dentry_operations);
d_add(dentry, NULL);
return NULL;
}
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 067778b0ccc9..e066a3902973 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -951,6 +951,7 @@ nlmsvc_retry_blocked(void)
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
struct nlm_block *block;
+ spin_lock(&nlm_blocked_lock);
while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
@@ -960,6 +961,7 @@ nlmsvc_retry_blocked(void)
timeout = block->b_when - jiffies;
break;
}
+ spin_unlock(&nlm_blocked_lock);
dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
block, block->b_when);
@@ -969,7 +971,9 @@ nlmsvc_retry_blocked(void)
retry_deferred_block(block);
} else
nlmsvc_grant_blocked(block);
+ spin_lock(&nlm_blocked_lock);
}
+ spin_unlock(&nlm_blocked_lock);
return timeout;
}
diff --git a/fs/locks.c b/fs/locks.c
index 04e2c1fdb157..b27a3005d78d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -127,6 +127,8 @@
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
+#include <linux/percpu.h>
+#include <linux/lglock.h>
#include <asm/uaccess.h>
@@ -155,11 +157,13 @@ int lease_break_time = 45;
for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
/*
- * The global file_lock_list is only used for displaying /proc/locks. Protected
- * by the file_lock_lock.
+ * The global file_lock_list is only used for displaying /proc/locks, so we
+ * keep a list on each CPU, with each list protected by its own spinlock via
+ * the file_lock_lglock. Note that alterations to the list also require that
+ * the relevant i_lock is held.
*/
-static HLIST_HEAD(file_lock_list);
-static DEFINE_SPINLOCK(file_lock_lock);
+DEFINE_STATIC_LGLOCK(file_lock_lglock);
+static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
/*
* The blocked_hash is used to find POSIX lock loops for deadlock detection.
@@ -506,20 +510,30 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
return fl1->fl_owner == fl2->fl_owner;
}
+/* Must be called with the i_lock held! */
static inline void
locks_insert_global_locks(struct file_lock *fl)
{
- spin_lock(&file_lock_lock);
- hlist_add_head(&fl->fl_link, &file_lock_list);
- spin_unlock(&file_lock_lock);
+ lg_local_lock(&file_lock_lglock);
+ fl->fl_link_cpu = smp_processor_id();
+ hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
+ lg_local_unlock(&file_lock_lglock);
}
+/* Must be called with the i_lock held! */
static inline void
locks_delete_global_locks(struct file_lock *fl)
{
- spin_lock(&file_lock_lock);
+ /*
+ * Avoid taking lock if already unhashed. This is safe since this check
+ * is done while holding the i_lock, and new insertions into the list
+ * also require that it be held.
+ */
+ if (hlist_unhashed(&fl->fl_link))
+ return;
+ lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
hlist_del_init(&fl->fl_link);
- spin_unlock(&file_lock_lock);
+ lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
}
static unsigned long
@@ -1454,7 +1468,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
goto out;
if ((arg == F_WRLCK)
- && ((dentry->d_count > 1)
+ && ((d_count(dentry) > 1)
|| (atomic_read(&inode->i_count) > 1)))
goto out;
@@ -2243,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+struct locks_iterator {
+ int li_cpu;
+ loff_t li_pos;
+};
+
static void lock_get_status(struct seq_file *f, struct file_lock *fl,
loff_t id, char *pfx)
{
@@ -2316,39 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
static int locks_show(struct seq_file *f, void *v)
{
+ struct locks_iterator *iter = f->private;
struct file_lock *fl, *bfl;
fl = hlist_entry(v, struct file_lock, fl_link);
- lock_get_status(f, fl, *((loff_t *)f->private), "");
+ lock_get_status(f, fl, iter->li_pos, "");
list_for_each_entry(bfl, &fl->fl_block, fl_block)
- lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
+ lock_get_status(f, bfl, iter->li_pos, " ->");
return 0;
}
static void *locks_start(struct seq_file *f, loff_t *pos)
{
- loff_t *p = f->private;
+ struct locks_iterator *iter = f->private;
- spin_lock(&file_lock_lock);
+ iter->li_pos = *pos + 1;
+ lg_global_lock(&file_lock_lglock);
spin_lock(&blocked_lock_lock);
- *p = (*pos + 1);
- return seq_hlist_start(&file_lock_list, *pos);
+ return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
}
static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
{
- loff_t *p = f->private;
- ++*p;
- return seq_hlist_next(v, &file_lock_list, pos);
+ struct locks_iterator *iter = f->private;
+
+ ++iter->li_pos;
+ return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
}
static void locks_stop(struct seq_file *f, void *v)
{
spin_unlock(&blocked_lock_lock);
- spin_unlock(&file_lock_lock);
+ lg_global_unlock(&file_lock_lglock);
}
static const struct seq_operations locks_seq_operations = {
@@ -2360,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = {
static int locks_open(struct inode *inode, struct file *filp)
{
- return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
+ return seq_open_private(filp, &locks_seq_operations,
+ sizeof(struct locks_iterator));
}
static const struct file_operations proc_locks_operations = {
@@ -2460,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write);
static int __init filelock_init(void)
{
+ int i;
+
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+ lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+
+ for_each_possible_cpu(i)
+ INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+
return 0;
}
diff --git a/fs/namei.c b/fs/namei.c
index b2beee7a733f..8b61d103a8a7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2977,7 +2977,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
file->f_flags = op->open_flag;
- if (unlikely(file->f_flags & O_TMPFILE)) {
+ if (unlikely(file->f_flags & __O_TMPFILE)) {
error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
goto out;
}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0765ad12c382..4659da67e7f6 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
switch (optval) {
case 'u':
data->uid = make_kuid(current_user_ns(), optint);
- if (!uid_valid(data->uid))
+ if (!uid_valid(data->uid)) {
+ ret = -EINVAL;
goto err;
+ }
break;
case 'g':
data->gid = make_kgid(current_user_ns(), optint);
- if (!gid_valid(data->gid))
+ if (!gid_valid(data->gid)) {
+ ret = -EINVAL;
goto err;
+ }
break;
case 'o':
data->mounted_uid = make_kuid(current_user_ns(), optint);
- if (!uid_valid(data->mounted_uid))
+ if (!uid_valid(data->mounted_uid)) {
+ ret = -EINVAL;
goto err;
+ }
break;
case 'm':
data->file_mode = optint;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 13ca196385f5..b5e80b0af315 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -104,6 +104,15 @@ config NFS_V4_1
If unsure, say N.
+config NFS_V4_2
+ bool "NFS client support for NFSv4.2"
+ depends on NFS_V4_1
+ help
+ This option enables support for minor version 2 of the NFSv4 protocol
+ in the kernel's NFS client.
+
+ If unsure, say N.
+
config PNFS_FILE_LAYOUT
tristate
depends on NFS_V4_1
@@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
If the NFS client is unchanged from the upstream kernel, this
option should be set to the default "kernel.org".
+config NFS_V4_SECURITY_LABEL
+ bool
+ depends on NFS_V4_2 && SECURITY
+ default y
+
config ROOT_NFS
bool "Root file system on NFS"
depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index cce2c057bd2d..e0bb048e9576 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
direct.o pagelist.o read.o symlink.o unlink.o \
- write.o namespace.o mount_clnt.o \
- dns_resolve.o cache_lib.o
+ write.o namespace.o mount_clnt.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
obj-$(CONFIG_NFS_V4) += nfsv4.o
nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
- nfs4namespace.o nfs4getroot.o nfs4client.o
+ nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o
+nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 434b93ec0970..e242bbf72972 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
dev->pgbase = 0;
dev->pglen = PAGE_SIZE * max_pages;
dev->mincount = 0;
+ dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
- rc = nfs4_proc_getdeviceinfo(server, dev);
+ rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
dprintk("%s getdevice info returns %d\n", __func__, rc);
if (rc) {
rv = ERR_PTR(rc);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index da6a43d19aa3..67cd73213168 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -281,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
ret = nfs4_callback_up_net(serv, net);
break;
case 1:
+ case 2:
ret = nfs41_callback_up_net(serv, net);
break;
default:
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index efd54f0a4c46..84326e9fb47a 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -32,6 +32,8 @@ enum nfs4_callback_opnum {
OP_CB_WANTS_CANCELLED = 12,
OP_CB_NOTIFY_LOCK = 13,
OP_CB_NOTIFY_DEVICEID = 14,
+/* Callback operations new to NFSv4.2 */
+ OP_CB_OFFLOAD = 15,
OP_CB_ILLEGAL = 10044,
};
@@ -39,6 +41,7 @@ struct cb_process_state {
__be32 drc_status;
struct nfs_client *clp;
u32 slotid;
+ u32 minorversion;
struct net *net;
};
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 0bc27684ebfa..e6ebc4c38c81 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -406,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
int i;
__be32 status = htonl(NFS4ERR_BADSESSION);
- clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid);
+ clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
+ &args->csa_sessionid, cps->minorversion);
if (clp == NULL)
goto out;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a35582c9d444..f4ccfe6521ec 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
hdr->minorversion = ntohl(*p++);
- /* Check minor version is zero or one. */
- if (hdr->minorversion <= 1) {
- hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
+ /* Check for minor version support */
+ if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
+ hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
} else {
pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
"illegal minor version %u!\n",
@@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps)
}
#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ __be32 status = preprocess_nfs41_op(nop, op_nr, op);
+ if (status != htonl(NFS4ERR_OP_ILLEGAL))
+ return status;
+
+ if (op_nr == OP_CB_OFFLOAD)
+ return htonl(NFS4ERR_NOTSUPP);
+ return htonl(NFS4ERR_OP_ILLEGAL);
+}
+#else /* CONFIG_NFS_V4_2 */
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
static __be32
preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
{
@@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
return htonl(NFS_OK);
}
-static __be32 process_op(uint32_t minorversion, int nop,
- struct svc_rqst *rqstp,
+static __be32 process_op(int nop, struct svc_rqst *rqstp,
struct xdr_stream *xdr_in, void *argp,
struct xdr_stream *xdr_out, void *resp,
struct cb_process_state *cps)
@@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop,
return status;
dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
- __func__, minorversion, nop, op_nr);
+ __func__, cps->minorversion, nop, op_nr);
+
+ switch (cps->minorversion) {
+ case 0:
+ status = preprocess_nfs4_op(op_nr, &op);
+ break;
+ case 1:
+ status = preprocess_nfs41_op(nop, op_nr, &op);
+ break;
+ case 2:
+ status = preprocess_nfs42_op(nop, op_nr, &op);
+ break;
+ default:
+ status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+ }
- status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
- preprocess_nfs4_op(op_nr, &op);
if (status == htonl(NFS4ERR_OP_ILLEGAL))
op_nr = OP_CB_ILLEGAL;
if (status)
@@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
return rpc_drop_reply;
}
+ cps.minorversion = hdr_arg.minorversion;
hdr_res.taglen = hdr_arg.taglen;
hdr_res.tag = hdr_arg.tag;
if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
return rpc_system_err;
while (status == 0 && nops != hdr_arg.nops) {
- status = process_op(hdr_arg.minorversion, nops, rqstp,
- &xdr_in, argp, &xdr_out, resp, &cps);
+ status = process_op(nops, rqstp, &xdr_in,
+ argp, &xdr_out, resp, &cps);
nops++;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c513b0cc835f..340b1eff0267 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -753,8 +753,6 @@ static int nfs_init_server(struct nfs_server *server,
data->timeo, data->retrans);
if (data->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
- if (server->options & NFS_OPTION_MIGRATION)
- set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -1076,7 +1074,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
}
if (!(fattr->valid & NFS_ATTR_FATTR)) {
- error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr);
+ error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);
if (error < 0) {
dprintk("nfs_create_server: getattr error = %d\n", -error);
goto error;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d7ed697133f0..e474ca2b2bfe 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -437,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
struct dentry *alias;
struct inode *dir = parent->d_inode;
struct inode *inode;
+ int status;
if (filename.name[0] == '.') {
if (filename.len == 1)
@@ -449,7 +450,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
dentry = d_lookup(parent, &filename);
if (dentry != NULL) {
if (nfs_same_file(dentry, entry)) {
- nfs_refresh_inode(dentry->d_inode, entry->fattr);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
+ if (!status)
+ nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
goto out;
} else {
if (d_invalidate(dentry) != 0)
@@ -462,7 +466,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
if (dentry == NULL)
return;
- inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
if (IS_ERR(inode))
goto out;
@@ -587,10 +591,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
if (entry.fh == NULL || entry.fattr == NULL)
goto out;
+ entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+ if (IS_ERR(entry.label)) {
+ status = PTR_ERR(entry.label);
+ goto out;
+ }
+
array = nfs_readdir_get_array(page);
if (IS_ERR(array)) {
status = PTR_ERR(array);
- goto out;
+ goto out_label_free;
}
memset(array, 0, sizeof(struct nfs_cache_array));
array->eof_index = -1;
@@ -616,6 +626,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
nfs_readdir_free_large_page(pages_ptr, pages, array_size);
out_release_array:
nfs_readdir_release_array(page);
+out_label_free:
+ nfs4_label_free(entry.label);
out:
nfs_free_fattr(entry.fattr);
nfs_free_fhandle(entry.fh);
@@ -806,7 +818,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
struct nfs_open_dir_context *dir_ctx = file->private_data;
- int res;
+ int res = 0;
dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -828,7 +840,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
nfs_block_sillyrename(dentry);
- res = nfs_revalidate_mapping(inode, file->f_mapping);
+ if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
+ res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
goto out;
@@ -1040,6 +1053,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
+ struct nfs4_label *label = NULL;
int error;
if (flags & LOOKUP_RCU)
@@ -1082,7 +1096,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (fhandle == NULL || fattr == NULL)
goto out_error;
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+ if (IS_ERR(label))
+ goto out_error;
+
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
if (error)
goto out_bad;
if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1090,8 +1108,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if ((error = nfs_refresh_inode(inode, fattr)) != 0)
goto out_bad;
+ nfs_setsecurity(inode, fattr, label);
+
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
+
out_set_verifier:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
@@ -1108,6 +1130,7 @@ out_zap_parent:
out_bad:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
nfs_mark_for_revalidate(dir);
if (inode && S_ISDIR(inode->i_mode)) {
/* Purge readdir caches. */
@@ -1128,6 +1151,7 @@ out_zap_parent:
out_error:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
__func__, dentry->d_parent->d_name.name,
@@ -1256,6 +1280,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
struct inode *inode = NULL;
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
+ struct nfs4_label *label = NULL;
int error;
dfprintk(VFS, "NFS: lookup(%s/%s)\n",
@@ -1282,17 +1307,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
if (fhandle == NULL || fattr == NULL)
goto out;
+ label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
+ if (IS_ERR(label))
+ goto out;
+
parent = dentry->d_parent;
/* Protect against concurrent sillydeletes */
nfs_block_sillyrename(parent);
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
if (error == -ENOENT)
goto no_entry;
if (error < 0) {
res = ERR_PTR(error);
goto out_unblock_sillyrename;
}
- inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
res = ERR_CAST(inode);
if (IS_ERR(res))
goto out_unblock_sillyrename;
@@ -1310,6 +1339,7 @@ no_entry:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_unblock_sillyrename:
nfs_unblock_sillyrename(parent);
+ nfs4_label_free(label);
out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
@@ -1357,18 +1387,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
{
int err;
- if (ctx->dentry != dentry) {
- dput(ctx->dentry);
- ctx->dentry = dget(dentry);
- }
-
- /* If the open_intent is for execute, we have an extra check to make */
- if (ctx->mode & FMODE_EXEC) {
- err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
- if (err < 0)
- goto out;
- }
-
err = finish_open(file, dentry, do_open, opened);
if (err)
goto out;
@@ -1427,13 +1445,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
nfs_block_sillyrename(dentry->d_parent);
inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
- d_drop(dentry);
+ nfs_unblock_sillyrename(dentry->d_parent);
if (IS_ERR(inode)) {
- nfs_unblock_sillyrename(dentry->d_parent);
put_nfs_open_context(ctx);
err = PTR_ERR(inode);
switch (err) {
case -ENOENT:
+ d_drop(dentry);
d_add(dentry, NULL);
break;
case -EISDIR:
@@ -1449,16 +1467,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
}
goto out;
}
- res = d_add_unique(dentry, inode);
- if (res != NULL)
- dentry = res;
-
- nfs_unblock_sillyrename(dentry->d_parent);
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-
- err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
- dput(res);
+ err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
out:
return err;
@@ -1528,7 +1538,8 @@ no_open:
* Code common to create, mkdir, and mknod.
*/
int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
struct dentry *parent = dget_parent(dentry);
struct inode *dir = parent->d_inode;
@@ -1541,18 +1552,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
if (dentry->d_inode)
goto out;
if (fhandle->size == 0) {
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
if (error)
goto out_error;
}
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
if (!(fattr->valid & NFS_ATTR_FATTR)) {
struct nfs_server *server = NFS_SB(dentry->d_sb);
- error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
+ error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);
if (error < 0)
goto out_error;
}
- inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
error = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_error;
@@ -1721,7 +1732,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
dir->i_ino, dentry->d_name.name);
spin_lock(&dentry->d_lock);
- if (dentry->d_count > 1) {
+ if (d_count(dentry) > 1) {
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
write_inode_now(dentry->d_inode, 0);
@@ -1866,7 +1877,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
- new_dentry->d_count);
+ d_count(new_dentry));
/*
* For non-directories, check whether the target is busy and if so,
@@ -1884,7 +1895,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
rehash = new_dentry;
}
- if (new_dentry->d_count > 2) {
+ if (d_count(new_dentry) > 2) {
int err;
/* copy the target dentry's name */
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 945527092295..fc0f95ec7358 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -29,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
kfree(ip_addr);
return ret;
}
-EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
#else
@@ -351,7 +350,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
ret = -ESRCH;
return ret;
}
-EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
static struct cache_detail nfs_dns_resolve_template = {
.owner = THIS_MODULE,
@@ -396,6 +394,21 @@ void nfs_dns_resolver_cache_destroy(struct net *net)
cache_destroy_net(nn->nfs_dns_resolve, net);
}
+static int nfs4_dns_net_init(struct net *net)
+{
+ return nfs_dns_resolver_cache_init(net);
+}
+
+static void nfs4_dns_net_exit(struct net *net)
+{
+ nfs_dns_resolver_cache_destroy(net);
+}
+
+static struct pernet_operations nfs4_dns_resolver_ops = {
+ .init = nfs4_dns_net_init,
+ .exit = nfs4_dns_net_exit,
+};
+
static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
void *ptr)
{
@@ -432,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = {
int nfs_dns_resolver_init(void)
{
- return rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+ int err;
+
+ err = register_pernet_subsys(&nfs4_dns_resolver_ops);
+ if (err < 0)
+ goto out;
+ err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+ if (err < 0)
+ goto out1;
+ return 0;
+out1:
+ unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+out:
+ return err;
}
void nfs_dns_resolver_destroy(void)
{
rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
+ unregister_pernet_subsys(&nfs4_dns_resolver_ops);
}
#endif
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 44efaa8c5f78..66984a9aafaa 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
goto out;
}
- inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
+ inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
if (IS_ERR(inode)) {
dprintk("nfs_get_root: get root inode failed\n");
ret = ERR_CAST(inode);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index c516da5873fd..c2c4163d5683 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -262,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
return desclen;
}
-static ssize_t nfs_idmap_request_key(struct key_type *key_type,
- const char *name, size_t namelen,
- const char *type, void *data,
- size_t data_size, struct idmap *idmap)
+static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
+ const char *type, struct idmap *idmap)
{
- const struct cred *saved_cred;
- struct key *rkey;
char *desc;
- struct user_key_payload *payload;
+ struct key *rkey;
ssize_t ret;
ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
if (ret <= 0)
- goto out;
+ return ERR_PTR(ret);
+
+ rkey = request_key(&key_type_id_resolver, desc, "");
+ if (IS_ERR(rkey)) {
+ mutex_lock(&idmap->idmap_mutex);
+ rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
+ desc, "", 0, idmap);
+ mutex_unlock(&idmap->idmap_mutex);
+ }
+
+ kfree(desc);
+ return rkey;
+}
+
+static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
+ const char *type, void *data,
+ size_t data_size, struct idmap *idmap)
+{
+ const struct cred *saved_cred;
+ struct key *rkey;
+ struct user_key_payload *payload;
+ ssize_t ret;
saved_cred = override_creds(id_resolver_cache);
- if (idmap)
- rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
- else
- rkey = request_key(&key_type_id_resolver, desc, "");
+ rkey = nfs_idmap_request_key(name, namelen, type, idmap);
revert_creds(saved_cred);
- kfree(desc);
if (IS_ERR(rkey)) {
ret = PTR_ERR(rkey);
goto out;
@@ -316,23 +329,6 @@ out:
return ret;
}
-static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
- const char *type, void *data,
- size_t data_size, struct idmap *idmap)
-{
- ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
- name, namelen, type, data,
- data_size, NULL);
- if (ret < 0) {
- mutex_lock(&idmap->idmap_mutex);
- ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
- name, namelen, type, data,
- data_size, idmap);
- mutex_unlock(&idmap->idmap_mutex);
- }
- return ret;
-}
-
/* ID -> Name */
static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
size_t buflen, struct idmap *idmap)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ce727047ee87..af6e806044d7 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,7 +48,6 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
-#include "dns_resolve.h"
#include "pnfs.h"
#include "nfs.h"
#include "netns.h"
@@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode)
memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
nfs_fscache_invalidate(inode);
- } else {
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
- }
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_LABEL
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_PAGECACHE;
+ } else
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_LABEL
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_PAGECACHE;
}
void nfs_zap_caches(struct inode *inode)
@@ -257,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque)
return 0;
}
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
+{
+ int error;
+
+ if (label == NULL)
+ return;
+
+ if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
+ return;
+
+ if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
+ return;
+
+ if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
+ error = security_inode_notifysecctx(inode, label->label,
+ label->len);
+ if (error)
+ printk(KERN_ERR "%s() %s %d "
+ "security_inode_notifysecctx() %d\n",
+ __func__,
+ (char *)label->label,
+ label->len, error);
+ }
+}
+
+struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
+{
+ struct nfs4_label *label = NULL;
+ int minor_version = server->nfs_client->cl_minorversion;
+
+ if (minor_version < 2)
+ return label;
+
+ if (!(server->caps & NFS_CAP_SECURITY_LABEL))
+ return label;
+
+ label = kzalloc(sizeof(struct nfs4_label), flags);
+ if (label == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ label->label = kzalloc(NFS4_MAXLABELLEN, flags);
+ if (label->label == NULL) {
+ kfree(label);
+ return ERR_PTR(-ENOMEM);
+ }
+ label->len = NFS4_MAXLABELLEN;
+
+ return label;
+}
+EXPORT_SYMBOL_GPL(nfs4_label_alloc);
+#else
+void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
+{
+}
+#endif
+EXPORT_SYMBOL_GPL(nfs_setsecurity);
+
/*
* This is our front-end to iget that looks up inodes by file handle
* instead of inode number.
*/
struct inode *
-nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs_find_desc desc = {
.fh = fh,
@@ -384,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
}
+
+ nfs_setsecurity(inode, fattr, label);
+
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
nfsi->access_cache = RB_ROOT;
@@ -393,6 +463,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
unlock_new_inode(inode);
} else
nfs_refresh_inode(inode, fattr);
+ nfs_setsecurity(inode, fattr, label);
dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode),
@@ -449,7 +520,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
NFS_PROTO(inode)->return_delegation(inode);
error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
if (error == 0)
- nfs_refresh_inode(inode, fattr);
+ error = nfs_refresh_inode(inode, fattr);
nfs_free_fattr(fattr);
out:
return error;
@@ -713,16 +784,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
* Ensure that mmap has a recent RPC credential for use when writing out
* shared pages
*/
-void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = ctx->dentry->d_inode;
struct nfs_inode *nfsi = NFS_I(inode);
- filp->private_data = get_nfs_open_context(ctx);
spin_lock(&inode->i_lock);
list_add(&ctx->list, &nfsi->open_files);
spin_unlock(&inode->i_lock);
}
+EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
+
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+{
+ filp->private_data = get_nfs_open_context(ctx);
+ if (list_empty(&ctx->list))
+ nfs_inode_attach_open_context(ctx);
+}
EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
/*
@@ -748,10 +826,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
static void nfs_file_clear_open_context(struct file *filp)
{
- struct inode *inode = file_inode(filp);
struct nfs_open_context *ctx = nfs_file_open_context(filp);
if (ctx) {
+ struct inode *inode = ctx->dentry->d_inode;
+
filp->private_data = NULL;
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -790,6 +869,7 @@ int
__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
int status = -ESTALE;
+ struct nfs4_label *label = NULL;
struct nfs_fattr *fattr = NULL;
struct nfs_inode *nfsi = NFS_I(inode);
@@ -807,7 +887,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
goto out;
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
- status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
+
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(label)) {
+ status = PTR_ERR(label);
+ goto out;
+ }
+
+ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
if (status != 0) {
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
inode->i_sb->s_id,
@@ -817,7 +904,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
if (!S_ISDIR(inode->i_mode))
set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
}
- goto out;
+ goto err_out;
}
status = nfs_refresh_inode(inode, fattr);
@@ -825,7 +912,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode), status);
- goto out;
+ goto err_out;
}
if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
@@ -835,7 +922,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
inode->i_sb->s_id,
(long long)NFS_FILEID(inode));
- out:
+err_out:
+ nfs4_label_free(label);
+out:
nfs_free_fattr(fattr);
return status;
}
@@ -847,7 +936,7 @@ int nfs_attribute_timeout(struct inode *inode)
return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
}
-static int nfs_attribute_cache_expired(struct inode *inode)
+int nfs_attribute_cache_expired(struct inode *inode)
{
if (nfs_have_delegated_attributes(inode))
return 0;
@@ -863,7 +952,8 @@ static int nfs_attribute_cache_expired(struct inode *inode)
*/
int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
- if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
+ if (!(NFS_I(inode)->cache_validity &
+ (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
&& !nfs_attribute_cache_expired(inode))
return NFS_STALE(inode) ? -ESTALE : 0;
return __nfs_revalidate_inode(server, inode);
@@ -1243,6 +1333,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
spin_lock(&inode->i_lock);
status = nfs_post_op_update_inode_locked(inode, fattr);
spin_unlock(&inode->i_lock);
+
return status;
}
EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
@@ -1483,7 +1574,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_blocks = fattr->du.nfs2.blocks;
/* Update attrtimeo value if we're out of the unstable period */
- if (invalid & NFS_INO_INVALID_ATTR) {
+ if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
@@ -1496,6 +1587,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
}
invalid &= ~NFS_INO_INVALID_ATTR;
+ invalid &= ~NFS_INO_INVALID_LABEL;
/* Don't invalidate the data if we were to blame */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
|| S_ISLNK(inode->i_mode)))
@@ -1638,12 +1730,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
{
nfs_clients_init(net);
- return nfs_dns_resolver_cache_init(net);
+ return 0;
}
static void nfs_net_exit(struct net *net)
{
- nfs_dns_resolver_cache_destroy(net);
nfs_cleanup_cb_ident_idr(net);
}
@@ -1661,10 +1752,6 @@ static int __init init_nfs_fs(void)
{
int err;
- err = nfs_dns_resolver_init();
- if (err < 0)
- goto out10;;
-
err = register_pernet_subsys(&nfs_net_ops);
if (err < 0)
goto out9;
@@ -1730,8 +1817,6 @@ out7:
out8:
unregister_pernet_subsys(&nfs_net_ops);
out9:
- nfs_dns_resolver_destroy();
-out10:
return err;
}
@@ -1744,7 +1829,6 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_nfspagecache();
nfs_fscache_unregister();
unregister_pernet_subsys(&nfs_net_ops);
- nfs_dns_resolver_destroy();
#ifdef CONFIG_PROC_FS
rpc_proc_unregister(&init_net, "nfs");
#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 91e59a39fc08..3c8373f90ab3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *);
extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
extern struct nfs_client *
nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
- struct nfs4_sessionid *);
+ struct nfs4_sessionid *, u32);
extern struct nfs_server *nfs_create_server(struct nfs_mount_info *,
struct nfs_subversion *);
extern struct nfs_server *nfs4_create_server(
@@ -255,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *,
#ifdef CONFIG_NFS_V4_1
extern const u32 nfs41_maxread_overhead;
extern const u32 nfs41_maxwrite_overhead;
+extern const u32 nfs41_maxgetdevinfo_overhead;
#endif
/* nfs4proc.c */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 91a6faf811ac..99a45283b9ee 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -139,7 +139,10 @@ struct mnt_fhstatus {
* nfs_mount - Obtain an NFS file handle for the given host and path
* @info: pointer to mount request arguments
*
- * Uses default timeout parameters specified by underlying transport.
+ * Uses default timeout parameters specified by underlying transport. On
+ * successful return, the auth_flavs list and auth_flav_len will be populated
+ * with the list from the server or a faked-up list if the server didn't
+ * provide one.
*/
int nfs_mount(struct nfs_mount_request *info)
{
@@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info)
dprintk("NFS: MNT request succeeded\n");
status = 0;
+ /*
+ * If the server didn't provide a flavor list, allow the
+ * client to try any flavor.
+ */
+ if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
+ dprintk("NFS: Faking up auth_flavs list\n");
+ info->auth_flavs[0] = RPC_AUTH_NULL;
+ *info->auth_flav_len = 1;
+ }
out:
return status;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index fc8dc20fdeb9..348b535cd786 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
struct dentry *parent = dget_parent(dentry);
/* Look it up again to get its attributes */
- err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr);
+ err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL);
dput(parent);
if (err != 0)
return ERR_PTR(err);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce90eb4775c2..f5c84c3efbca 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
*/
static int
nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct rpc_message msg = {
.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR],
@@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
static int
nfs3_proc_lookup(struct inode *dir, struct qstr *name,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
struct nfs3_diropargs arg = {
.fh = NFS_FH(dir),
@@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_
status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
nfs_post_op_update_inode(dir, data->res.dir_attr);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
return status;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a1dd768d0a35..ee81e354bce7 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -194,7 +194,7 @@ struct nfs4_state_recovery_ops {
int (*recover_lock)(struct nfs4_state *, struct file_lock *);
int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
- int (*reclaim_complete)(struct nfs_client *);
+ int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
struct rpc_cred *);
};
@@ -303,10 +303,10 @@ is_ds_client(struct nfs_client *clp)
extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
extern const u32 nfs4_fattr_bitmap[3];
-extern const u32 nfs4_statfs_bitmap[2];
-extern const u32 nfs4_pathconf_bitmap[2];
+extern const u32 nfs4_statfs_bitmap[3];
+extern const u32 nfs4_pathconf_bitmap[3];
extern const u32 nfs4_fsinfo_bitmap[3];
-extern const u32 nfs4_fs_locations_bitmap[2];
+extern const u32 nfs4_fs_locations_bitmap[3];
void nfs4_free_client(struct nfs_client *);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 4cbad5d6b276..90dce91dd5b5 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
if (err)
goto error;
+ if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
+ err = -EINVAL;
+ goto error;
+ }
+
spin_lock_init(&clp->cl_lock);
INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -562,14 +567,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
*/
struct nfs_client *
nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
- struct nfs4_sessionid *sid)
+ struct nfs4_sessionid *sid, u32 minorversion)
{
struct nfs_client *clp;
struct nfs_net *nn = net_generic(net, nfs_net_id);
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
- if (nfs4_cb_match_client(addr, clp, 1) == false)
+ if (nfs4_cb_match_client(addr, clp, minorversion) == false)
continue;
if (!nfs4_has_session(clp))
@@ -592,7 +597,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
struct nfs_client *
nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
- struct nfs4_sessionid *sid)
+ struct nfs4_sessionid *sid, u32 minorversion)
{
return NULL;
}
@@ -626,6 +631,8 @@ static int nfs4_set_client(struct nfs_server *server,
if (server->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ if (server->options & NFS_OPTION_MIGRATION)
+ set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
@@ -730,7 +737,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
return -ENOMEM;
/* We must ensure the session is initialised first */
- error = nfs4_init_session(server);
+ error = nfs4_init_session(server->nfs_client);
if (error < 0)
goto out;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 13e6bb3e3fe5..e5b804dd944c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
goto out_drop;
}
}
- iput(inode);
if (inode != dentry->d_inode)
goto out_drop;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 22d10623f5ee..17ed87ef9de8 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -643,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
NFS_SERVER(lo->plh_inode)->nfs_client, id);
if (d == NULL) {
- dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags);
+ dsaddr = filelayout_get_device_info(lo->plh_inode, id,
+ lo->plh_lc_cred, gfp_flags);
if (dsaddr == NULL)
goto out;
} else
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 235ff952d3c8..cebd20e7e923 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -150,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
+filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
+ struct rpc_cred *cred, gfp_t gfp_flags);
#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 661a0f611215..95604f64cab8 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -668,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
* of available devices, and return it.
*/
struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags)
+filelayout_get_device_info(struct inode *inode,
+ struct nfs4_deviceid *dev_id,
+ struct rpc_cred *cred,
+ gfp_t gfp_flags)
{
struct pnfs_device *pdev = NULL;
u32 max_resp_sz;
@@ -708,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf
pdev->pgbase = 0;
pdev->pglen = max_resp_sz;
pdev->mincount = 0;
+ pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
- rc = nfs4_proc_getdeviceinfo(server, pdev);
+ rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
dprintk("%s getdevice info returns %d\n", __func__, rc);
if (rc)
goto out_free;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 28241a42f363..cf11799297c4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
-static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *);
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state);
+ struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel);
#ifdef CONFIG_NFS_V4_1
-static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *);
-static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *);
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
+ struct rpc_cred *);
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
+ struct rpc_cred *);
#endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, struct nfs4_label *label)
+{
+ int err;
+
+ if (label == NULL)
+ return NULL;
+
+ if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
+ return NULL;
+
+ if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
+ return NULL;
+
+ err = security_dentry_init_security(dentry, sattr->ia_mode,
+ &dentry->d_name, (void **)&label->label, &label->len);
+ if (err == 0)
+ return label;
+
+ return NULL;
+}
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{
+ if (label)
+ security_release_secctx(label->label, label->len);
+}
+static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{
+ if (label)
+ return server->attr_bitmask;
+
+ return server->attr_bitmask_nl;
+}
+#else
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, struct nfs4_label *l)
+{ return NULL; }
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{ return; }
+static inline u32 *
+nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{ return server->attr_bitmask; }
+#endif
+
/* Prevent leaks of NFSv4 errors into userland */
static int nfs4_map_errors(int err)
{
@@ -134,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = {
| FATTR4_WORD1_SPACE_USED
| FATTR4_WORD1_TIME_ACCESS
| FATTR4_WORD1_TIME_METADATA
- | FATTR4_WORD1_TIME_MODIFY
+ | FATTR4_WORD1_TIME_MODIFY,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ FATTR4_WORD2_SECURITY_LABEL
+#endif
};
static const u32 nfs4_pnfs_open_bitmap[3] = {
@@ -161,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = {
| FATTR4_WORD0_FILEID,
};
-const u32 nfs4_statfs_bitmap[2] = {
+const u32 nfs4_statfs_bitmap[3] = {
FATTR4_WORD0_FILES_AVAIL
| FATTR4_WORD0_FILES_FREE
| FATTR4_WORD0_FILES_TOTAL,
@@ -170,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = {
| FATTR4_WORD1_SPACE_TOTAL
};
-const u32 nfs4_pathconf_bitmap[2] = {
+const u32 nfs4_pathconf_bitmap[3] = {
FATTR4_WORD0_MAXLINK
| FATTR4_WORD0_MAXNAME,
0
@@ -185,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
FATTR4_WORD2_LAYOUT_BLKSIZE
};
-const u32 nfs4_fs_locations_bitmap[2] = {
+const u32 nfs4_fs_locations_bitmap[3] = {
FATTR4_WORD0_TYPE
| FATTR4_WORD0_CHANGE
| FATTR4_WORD0_SIZE
@@ -201,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = {
| FATTR4_WORD1_TIME_ACCESS
| FATTR4_WORD1_TIME_METADATA
| FATTR4_WORD1_TIME_MODIFY
- | FATTR4_WORD1_MOUNTED_ON_FILEID
+ | FATTR4_WORD1_MOUNTED_ON_FILEID,
};
static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -762,6 +818,7 @@ struct nfs4_opendata {
struct nfs4_string owner_name;
struct nfs4_string group_name;
struct nfs_fattr f_attr;
+ struct nfs4_label *f_label;
struct dentry *dir;
struct dentry *dentry;
struct nfs4_state_owner *owner;
@@ -807,6 +864,7 @@ nfs4_map_atomic_open_claim(struct nfs_server *server,
static void nfs4_init_opendata_res(struct nfs4_opendata *p)
{
p->o_res.f_attr = &p->f_attr;
+ p->o_res.f_label = p->f_label;
p->o_res.seqid = p->o_arg.seqid;
p->c_res.seqid = p->c_arg.seqid;
p->o_res.server = p->o_arg.server;
@@ -818,6 +876,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
struct nfs4_state_owner *sp, fmode_t fmode, int flags,
const struct iattr *attrs,
+ struct nfs4_label *label,
enum open_claim_type4 claim,
gfp_t gfp_mask)
{
@@ -829,9 +888,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p = kzalloc(sizeof(*p), gfp_mask);
if (p == NULL)
goto err;
+
+ p->f_label = nfs4_label_alloc(server, gfp_mask);
+ if (IS_ERR(p->f_label))
+ goto err_free_p;
+
p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
if (p->o_arg.seqid == NULL)
- goto err_free;
+ goto err_free_label;
nfs_sb_active(dentry->d_sb);
p->dentry = dget(dentry);
p->dir = parent;
@@ -852,8 +916,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
p->o_arg.name = &dentry->d_name;
p->o_arg.server = server;
- p->o_arg.bitmask = server->attr_bitmask;
+ p->o_arg.bitmask = nfs4_bitmask(server, label);
p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
+ p->o_arg.label = label;
p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
switch (p->o_arg.claim) {
case NFS4_OPEN_CLAIM_NULL:
@@ -884,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
nfs4_init_opendata_res(p);
kref_init(&p->kref);
return p;
-err_free:
+
+err_free_label:
+ nfs4_label_free(p->f_label);
+err_free_p:
kfree(p);
err:
dput(parent);
@@ -901,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref)
if (p->state != NULL)
nfs4_put_open_state(p->state);
nfs4_put_state_owner(p->owner);
+
+ nfs4_label_free(p->f_label);
+
dput(p->dir);
dput(p->dentry);
nfs_sb_deactive(sb);
@@ -1179,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
if (ret)
goto err;
+ nfs_setsecurity(inode, &data->f_attr, data->f_label);
+
if (data->o_res.delegation_type != 0)
nfs4_opendata_check_deleg(data, state);
update_open_stateid(state, &data->o_res.stateid, NULL,
@@ -1205,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
ret = -EAGAIN;
if (!(data->f_attr.valid & NFS_ATTR_FATTR))
goto err;
- inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr);
+ inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
ret = PTR_ERR(inode);
if (IS_ERR(inode))
goto err;
@@ -1258,7 +1331,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
struct nfs4_opendata *opendata;
opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
- NULL, claim, GFP_NOFS);
+ NULL, NULL, claim, GFP_NOFS);
if (opendata == NULL)
return ERR_PTR(-ENOMEM);
opendata->state = state;
@@ -1784,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
return status;
}
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
- _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr);
+ _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
return 0;
}
@@ -1855,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
nfs4_stateid *stateid = &state->stateid;
- int status;
+ struct nfs_delegation *delegation;
+ struct rpc_cred *cred = NULL;
+ int status = -NFS4ERR_BAD_STATEID;
/* If a state reset has been done, test_stateid is unneeded */
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
return;
- status = nfs41_test_stateid(server, stateid);
+ /* Get the delegation credential for use by test/free_stateid */
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+ if (delegation != NULL &&
+ nfs4_stateid_match(&delegation->stateid, stateid)) {
+ cred = get_rpccred(delegation->cred);
+ rcu_read_unlock();
+ status = nfs41_test_stateid(server, stateid, cred);
+ } else
+ rcu_read_unlock();
+
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid);
+ nfs41_free_stateid(server, stateid, cred);
nfs_remove_bad_delegation(state->inode);
write_seqlock(&state->seqlock);
@@ -1874,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
write_sequnlock(&state->seqlock);
clear_bit(NFS_DELEGATED_STATE, &state->flags);
}
+
+ if (cred != NULL)
+ put_rpccred(cred);
}
/**
@@ -1888,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
nfs4_stateid *stateid = &state->open_stateid;
+ struct rpc_cred *cred = state->owner->so_cred;
int status;
/* If a state reset has been done, test_stateid is unneeded */
@@ -1896,12 +1985,12 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
(test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
return -NFS4ERR_BAD_STATEID;
- status = nfs41_test_stateid(server, stateid);
+ status = nfs41_test_stateid(server, stateid, cred);
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid);
+ nfs41_free_stateid(server, stateid, cred);
clear_bit(NFS_O_RDONLY_STATE, &state->flags);
clear_bit(NFS_O_WRONLY_STATE, &state->flags);
@@ -1942,10 +2031,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
fmode_t fmode,
int flags,
- struct nfs4_state **res)
+ struct nfs_open_context *ctx)
{
struct nfs4_state_owner *sp = opendata->owner;
struct nfs_server *server = sp->so_server;
+ struct dentry *dentry;
struct nfs4_state *state;
unsigned int seq;
int ret;
@@ -1963,13 +2053,31 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
if (server->caps & NFS_CAP_POSIX_LOCK)
set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+ dentry = opendata->dentry;
+ if (dentry->d_inode == NULL) {
+ /* FIXME: Is this d_drop() ever needed? */
+ d_drop(dentry);
+ dentry = d_add_unique(dentry, igrab(state->inode));
+ if (dentry == NULL) {
+ dentry = opendata->dentry;
+ } else if (dentry != ctx->dentry) {
+ dput(ctx->dentry);
+ ctx->dentry = dget(dentry);
+ }
+ nfs_set_verifier(dentry,
+ nfs_save_change_attribute(opendata->dir->d_inode));
+ }
+
ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
if (ret != 0)
goto out;
- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
- nfs4_schedule_stateid_recovery(server, state);
- *res = state;
+ ctx->state = state;
+ if (dentry->d_inode == state->inode) {
+ nfs_inode_attach_open_context(ctx);
+ if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ nfs4_schedule_stateid_recovery(server, state);
+ }
out:
return ret;
}
@@ -1978,19 +2086,21 @@ out:
* Returns a referenced nfs4_state
*/
static int _nfs4_do_open(struct inode *dir,
- struct dentry *dentry,
- fmode_t fmode,
+ struct nfs_open_context *ctx,
int flags,
struct iattr *sattr,
- struct rpc_cred *cred,
- struct nfs4_state **res,
- struct nfs4_threshold **ctx_th)
+ struct nfs4_label *label)
{
struct nfs4_state_owner *sp;
struct nfs4_state *state = NULL;
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_opendata *opendata;
+ struct dentry *dentry = ctx->dentry;
+ struct rpc_cred *cred = ctx->cred;
+ struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
+ fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+ struct nfs4_label *olabel = NULL;
int status;
/* Protect against reboot recovery conflicts */
@@ -2009,22 +2119,31 @@ static int _nfs4_do_open(struct inode *dir,
if (dentry->d_inode)
claim = NFS4_OPEN_CLAIM_FH;
opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
- claim, GFP_KERNEL);
+ label, claim, GFP_KERNEL);
if (opendata == NULL)
goto err_put_state_owner;
+ if (label) {
+ olabel = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(olabel)) {
+ status = PTR_ERR(olabel);
+ goto err_opendata_put;
+ }
+ }
+
if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
if (!opendata->f_attr.mdsthreshold)
- goto err_opendata_put;
+ goto err_free_label;
opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
}
if (dentry->d_inode != NULL)
opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
- status = _nfs4_open_and_get_state(opendata, fmode, flags, &state);
+ status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
if (status != 0)
- goto err_opendata_put;
+ goto err_free_label;
+ state = ctx->state;
if ((opendata->o_arg.open_flags & O_EXCL) &&
(opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
@@ -2033,10 +2152,12 @@ static int _nfs4_do_open(struct inode *dir,
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
opendata->o_res.f_attr, sattr,
- state);
- if (status == 0)
+ state, label, olabel);
+ if (status == 0) {
nfs_setattr_update_inode(state->inode, sattr);
- nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+ nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+ nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+ }
}
if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
@@ -2045,38 +2166,37 @@ static int _nfs4_do_open(struct inode *dir,
kfree(opendata->f_attr.mdsthreshold);
opendata->f_attr.mdsthreshold = NULL;
+ nfs4_label_free(olabel);
+
nfs4_opendata_put(opendata);
nfs4_put_state_owner(sp);
- *res = state;
return 0;
+err_free_label:
+ nfs4_label_free(olabel);
err_opendata_put:
kfree(opendata->f_attr.mdsthreshold);
nfs4_opendata_put(opendata);
err_put_state_owner:
nfs4_put_state_owner(sp);
out_err:
- *res = NULL;
return status;
}
static struct nfs4_state *nfs4_do_open(struct inode *dir,
- struct dentry *dentry,
- fmode_t fmode,
+ struct nfs_open_context *ctx,
int flags,
struct iattr *sattr,
- struct rpc_cred *cred,
- struct nfs4_threshold **ctx_th)
+ struct nfs4_label *label)
{
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = { };
struct nfs4_state *res;
int status;
- fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
do {
- status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
- &res, ctx_th);
+ status = _nfs4_do_open(dir, ctx, flags, sattr, label);
+ res = ctx->state;
if (status == 0)
break;
/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2122,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state)
+ struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_setattrargs arg = {
@@ -2130,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
.iap = sattr,
.server = server,
.bitmask = server->attr_bitmask,
+ .label = ilabel,
};
struct nfs_setattrres res = {
.fattr = fattr,
+ .label = olabel,
.server = server,
};
struct rpc_message msg = {
@@ -2146,6 +2269,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
bool truncate;
int status;
+ arg.bitmask = nfs4_bitmask(server, ilabel);
+ if (ilabel)
+ arg.bitmask = nfs4_bitmask(server, olabel);
+
nfs_fattr_init(fattr);
/* Servers should only apply open mode checks for file size changes */
@@ -2172,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state)
+ struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_exception exception = {
@@ -2181,7 +2309,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
};
int err;
do {
- err = _nfs4_do_setattr(inode, cred, fattr, sattr, state);
+ err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2426,14 +2554,18 @@ static struct inode *
nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
{
struct nfs4_state *state;
+ struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+
+ label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
/* Protect against concurrent sillydeletes */
- state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr,
- ctx->cred, &ctx->mdsthreshold);
+ state = nfs4_do_open(dir, ctx, open_flags, attr, label);
+
+ nfs4_label_release_security(label);
+
if (IS_ERR(state))
return ERR_CAST(state);
- ctx->state = state;
- return igrab(state->inode);
+ return state->inode;
}
static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2489,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->caps |= NFS_CAP_CTIME;
if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
server->caps |= NFS_CAP_MTIME;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+ server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
+ memcpy(server->attr_bitmask_nl, res.attr_bitmask,
+ sizeof(server->attr_bitmask));
+ if (server->caps & NFS_CAP_SECURITY_LABEL) {
+ server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ }
memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -2515,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_fsinfo *info)
{
+ u32 bitmask[3];
struct nfs4_lookup_root_arg args = {
- .bitmask = nfs4_fattr_bitmap,
+ .bitmask = bitmask,
};
struct nfs4_lookup_res res = {
.server = server,
@@ -2529,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_resp = &res,
};
+ bitmask[0] = nfs4_fattr_bitmap[0];
+ bitmask[1] = nfs4_fattr_bitmap[1];
+ /*
+ * Process the label in the upcoming getfattr
+ */
+ bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
+
nfs_fattr_init(info->fattr);
return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
}
@@ -2648,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
{
int error;
struct nfs_fattr *fattr = info->fattr;
+ struct nfs4_label *label = NULL;
error = nfs4_server_capabilities(server, mntfh);
if (error < 0) {
@@ -2655,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
return error;
}
- error = nfs4_proc_getattr(server, mntfh, fattr);
+ label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(label))
+ return PTR_ERR(label);
+
+ error = nfs4_proc_getattr(server, mntfh, fattr, label);
if (error < 0) {
dprintk("nfs4_get_root: getattr error = %d\n", -error);
- return error;
+ goto err_free_label;
}
if (fattr->valid & NFS_ATTR_FATTR_FSID &&
!nfs_fsid_equal(&server->fsid, &fattr->fsid))
memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+err_free_label:
+ nfs4_label_free(label);
+
return error;
}
@@ -2711,7 +2869,8 @@ out:
return status;
}
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs4_getattr_arg args = {
.fh = fhandle,
@@ -2719,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
};
struct nfs4_getattr_res res = {
.fattr = fattr,
+ .label = label,
.server = server,
};
struct rpc_message msg = {
@@ -2726,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_argp = &args,
.rpc_resp = &res,
};
-
+
+ args.bitmask = nfs4_bitmask(server, label);
+
nfs_fattr_init(fattr);
return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
}
-static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs4_proc_getattr(server, fhandle, fattr),
+ _nfs4_proc_getattr(server, fhandle, fattr, label),
&exception);
} while (exception.retry);
return err;
@@ -2767,6 +2930,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct inode *inode = dentry->d_inode;
struct rpc_cred *cred = NULL;
struct nfs4_state *state = NULL;
+ struct nfs4_label *label = NULL;
int status;
if (pnfs_ld_layoutret_on_setattr(inode))
@@ -2793,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
}
}
- status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
- if (status == 0)
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(label))
+ return PTR_ERR(label);
+
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
+ if (status == 0) {
nfs_setattr_update_inode(inode, sattr);
+ nfs_setsecurity(inode, fattr, label);
+ }
+ nfs4_label_free(label);
return status;
}
static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
const struct qstr *name, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs_server *server = NFS_SERVER(dir);
int status;
@@ -2813,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
struct nfs4_lookup_res res = {
.server = server,
.fattr = fattr,
+ .label = label,
.fh = fhandle,
};
struct rpc_message msg = {
@@ -2821,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
.rpc_resp = &res,
};
+ args.bitmask = nfs4_bitmask(server, label);
+
nfs_fattr_init(fattr);
dprintk("NFS call lookup %s\n", name->name);
@@ -2839,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
struct qstr *name, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs4_exception exception = { };
struct rpc_clnt *client = *clnt;
int err;
do {
- err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr);
+ err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
switch (err) {
case -NFS4ERR_BADNAME:
err = -ENOENT;
@@ -2879,12 +3053,13 @@ out:
}
static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
int status;
struct rpc_clnt *client = NFS_CLIENT(dir);
- status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+ status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label);
if (client != NFS_CLIENT(dir)) {
rpc_shutdown_client(client);
nfs_fixup_secinfo_attributes(fattr);
@@ -2899,7 +3074,7 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
int status;
struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir));
- status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+ status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
if (status < 0) {
rpc_shutdown_client(client);
return ERR_PTR(status);
@@ -2924,7 +3099,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
.rpc_cred = entry->cred,
};
int mode = entry->mask;
- int status;
+ int status = 0;
/*
* Determine which access bits we want to ask for...
@@ -3029,6 +3204,7 @@ static int
nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags)
{
+ struct nfs4_label l, *ilabel = NULL;
struct nfs_open_context *ctx;
struct nfs4_state *state;
int status = 0;
@@ -3037,19 +3213,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
if (IS_ERR(ctx))
return PTR_ERR(ctx);
+ ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
+
sattr->ia_mode &= ~current_umask();
- state = nfs4_do_open(dir, dentry, ctx->mode,
- flags, sattr, ctx->cred,
- &ctx->mdsthreshold);
- d_drop(dentry);
+ state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
if (IS_ERR(state)) {
status = PTR_ERR(state);
goto out;
}
- d_add(dentry, igrab(state->inode));
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- ctx->state = state;
out:
+ nfs4_label_release_security(ilabel);
put_nfs_open_context(ctx);
return status;
}
@@ -3098,6 +3271,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
res->server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+
+ nfs_fattr_init(res->dir_attr);
}
static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -3173,7 +3348,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
.rpc_resp = &res,
};
int status = -ENOMEM;
-
+
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
update_changeattr(old_dir, &res.old_cinfo);
@@ -3207,6 +3382,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
};
struct nfs4_link_res res = {
.server = server,
+ .label = NULL,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -3219,11 +3395,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
if (res.fattr == NULL)
goto out;
+ res.label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(res.label)) {
+ status = PTR_ERR(res.label);
+ goto out;
+ }
+ arg.bitmask = nfs4_bitmask(server, res.label);
+
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
update_changeattr(dir, &res.cinfo);
- nfs_post_op_update_inode(inode, res.fattr);
+ status = nfs_post_op_update_inode(inode, res.fattr);
+ if (!status)
+ nfs_setsecurity(inode, res.fattr, res.label);
}
+
+
+ nfs4_label_free(res.label);
+
out:
nfs_free_fattr(res.fattr);
return status;
@@ -3247,6 +3436,7 @@ struct nfs4_createdata {
struct nfs4_create_res res;
struct nfs_fh fh;
struct nfs_fattr fattr;
+ struct nfs4_label *label;
};
static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3258,6 +3448,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
if (data != NULL) {
struct nfs_server *server = NFS_SERVER(dir);
+ data->label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(data->label))
+ goto out_free;
+
data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
data->msg.rpc_argp = &data->arg;
data->msg.rpc_resp = &data->res;
@@ -3266,13 +3460,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
data->arg.name = name;
data->arg.attrs = sattr;
data->arg.ftype = ftype;
- data->arg.bitmask = server->attr_bitmask;
+ data->arg.bitmask = nfs4_bitmask(server, data->label);
data->res.server = server;
data->res.fh = &data->fh;
data->res.fattr = &data->fattr;
+ data->res.label = data->label;
nfs_fattr_init(data->res.fattr);
}
return data;
+out_free:
+ kfree(data);
+ return NULL;
}
static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
@@ -3281,18 +3479,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
&data->arg.seq_args, &data->res.seq_res, 1);
if (status == 0) {
update_changeattr(dir, &data->res.dir_cinfo);
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
}
return status;
}
static void nfs4_free_createdata(struct nfs4_createdata *data)
{
+ nfs4_label_free(data->label);
kfree(data);
}
static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
- struct page *page, unsigned int len, struct iattr *sattr)
+ struct page *page, unsigned int len, struct iattr *sattr,
+ struct nfs4_label *label)
{
struct nfs4_createdata *data;
int status = -ENAMETOOLONG;
@@ -3308,6 +3508,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
data->arg.u.symlink.pages = &page;
data->arg.u.symlink.len = len;
+ data->arg.label = label;
status = nfs4_do_create(dir, dentry, data);
@@ -3320,18 +3521,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
struct page *page, unsigned int len, struct iattr *sattr)
{
struct nfs4_exception exception = { };
+ struct nfs4_label l, *label = NULL;
int err;
+
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
do {
err = nfs4_handle_exception(NFS_SERVER(dir),
_nfs4_proc_symlink(dir, dentry, page,
- len, sattr),
+ len, sattr, label),
&exception);
} while (exception.retry);
+
+ nfs4_label_release_security(label);
return err;
}
static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr)
+ struct iattr *sattr, struct nfs4_label *label)
{
struct nfs4_createdata *data;
int status = -ENOMEM;
@@ -3340,6 +3547,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
if (data == NULL)
goto out;
+ data->arg.label = label;
status = nfs4_do_create(dir, dentry, data);
nfs4_free_createdata(data);
@@ -3351,14 +3559,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct iattr *sattr)
{
struct nfs4_exception exception = { };
+ struct nfs4_label l, *label = NULL;
int err;
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
sattr->ia_mode &= ~current_umask();
do {
err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_mkdir(dir, dentry, sattr),
+ _nfs4_proc_mkdir(dir, dentry, sattr, label),
&exception);
} while (exception.retry);
+ nfs4_label_release_security(label);
+
return err;
}
@@ -3416,7 +3629,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
}
static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr, dev_t rdev)
+ struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
{
struct nfs4_createdata *data;
int mode = sattr->ia_mode;
@@ -3441,7 +3654,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
status = -EINVAL;
goto out_free;
}
-
+
+ data->arg.label = label;
status = nfs4_do_create(dir, dentry, data);
out_free:
nfs4_free_createdata(data);
@@ -3453,14 +3667,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct iattr *sattr, dev_t rdev)
{
struct nfs4_exception exception = { };
+ struct nfs4_label l, *label = NULL;
int err;
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
sattr->ia_mode &= ~current_umask();
do {
err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_mknod(dir, dentry, sattr, rdev),
+ _nfs4_proc_mknod(dir, dentry, sattr, label, rdev),
&exception);
} while (exception.retry);
+
+ nfs4_label_release_security(label);
+
return err;
}
@@ -4187,6 +4407,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
return err;
}
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static int _nfs4_get_security_label(struct inode *inode, void *buf,
+ size_t buflen)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_fattr fattr;
+ struct nfs4_label label = {0, 0, buflen, buf};
+
+ u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+ struct nfs4_getattr_arg args = {
+ .fh = NFS_FH(inode),
+ .bitmask = bitmask,
+ };
+ struct nfs4_getattr_res res = {
+ .fattr = &fattr,
+ .label = &label,
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int ret;
+
+ nfs_fattr_init(&fattr);
+
+ ret = rpc_call_sync(server->client, &msg, 0);
+ if (ret)
+ return ret;
+ if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
+ return -ENOENT;
+ if (buflen < label.len)
+ return -ERANGE;
+ return 0;
+}
+
+static int nfs4_get_security_label(struct inode *inode, void *buf,
+ size_t buflen)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+ return -EOPNOTSUPP;
+
+ do {
+ err = nfs4_handle_exception(NFS_SERVER(inode),
+ _nfs4_get_security_label(inode, buf, buflen),
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int _nfs4_do_set_security_label(struct inode *inode,
+ struct nfs4_label *ilabel,
+ struct nfs_fattr *fattr,
+ struct nfs4_label *olabel)
+{
+
+ struct iattr sattr = {0};
+ struct nfs_server *server = NFS_SERVER(inode);
+ const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+ struct nfs_setattrargs args = {
+ .fh = NFS_FH(inode),
+ .iap = &sattr,
+ .server = server,
+ .bitmask = bitmask,
+ .label = ilabel,
+ };
+ struct nfs_setattrres res = {
+ .fattr = fattr,
+ .label = olabel,
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int status;
+
+ nfs4_stateid_copy(&args.stateid, &zero_stateid);
+
+ status = rpc_call_sync(server->client, &msg, 0);
+ if (status)
+ dprintk("%s failed: %d\n", __func__, status);
+
+ return status;
+}
+
+static int nfs4_do_set_security_label(struct inode *inode,
+ struct nfs4_label *ilabel,
+ struct nfs_fattr *fattr,
+ struct nfs4_label *olabel)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = nfs4_handle_exception(NFS_SERVER(inode),
+ _nfs4_do_set_security_label(inode, ilabel,
+ fattr, olabel),
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int
+nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+{
+ struct nfs4_label ilabel, *olabel = NULL;
+ struct nfs_fattr fattr;
+ struct rpc_cred *cred;
+ struct inode *inode = dentry->d_inode;
+ int status;
+
+ if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+ return -EOPNOTSUPP;
+
+ nfs_fattr_init(&fattr);
+
+ ilabel.pi = 0;
+ ilabel.lfs = 0;
+ ilabel.label = (char *)buf;
+ ilabel.len = buflen;
+
+ cred = rpc_lookup_cred();
+ if (IS_ERR(cred))
+ return PTR_ERR(cred);
+
+ olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(olabel)) {
+ status = -PTR_ERR(olabel);
+ goto out;
+ }
+
+ status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+ if (status == 0)
+ nfs_setsecurity(inode, &fattr, olabel);
+
+ nfs4_label_free(olabel);
+out:
+ put_rpccred(cred);
+ return status;
+}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+
static int
nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
{
@@ -4345,7 +4714,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
/* cb_client4 */
rcu_read_lock();
setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
- sizeof(setclientid.sc_netid),
+ sizeof(setclientid.sc_netid), "%s",
rpc_peeraddr2str(clp->cl_rpcclient,
RPC_DISPLAY_NETID));
rcu_read_unlock();
@@ -5056,13 +5425,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
list_for_each_entry(lsp, &state->lock_states, ls_locks) {
if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
- status = nfs41_test_stateid(server, &lsp->ls_stateid);
+ struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+ status = nfs41_test_stateid(server,
+ &lsp->ls_stateid,
+ cred);
if (status != NFS_OK) {
/* Free the stateid unless the server
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
nfs41_free_stateid(server,
- &lsp->ls_stateid);
+ &lsp->ls_stateid,
+ cred);
clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
ret = status;
}
@@ -5295,6 +5669,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
return len;
}
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline int nfs4_server_supports_labels(struct nfs_server *server)
+{
+ return server->caps & NFS_CAP_SECURITY_LABEL;
+}
+
+static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
+ const void *buf, size_t buflen,
+ int flags, int type)
+{
+ if (security_ismaclabel(key))
+ return nfs4_set_security_label(dentry, buf, buflen);
+
+ return -EOPNOTSUPP;
+}
+
+static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
+ void *buf, size_t buflen, int type)
+{
+ if (security_ismaclabel(key))
+ return nfs4_get_security_label(dentry->d_inode, buf, buflen);
+ return -EOPNOTSUPP;
+}
+
+static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
+ size_t list_len, const char *name,
+ size_t name_len, int type)
+{
+ size_t len = 0;
+
+ if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) {
+ len = security_inode_listsecurity(dentry->d_inode, NULL, 0);
+ if (list && len <= list_len)
+ security_inode_listsecurity(dentry->d_inode, list, len);
+ }
+ return len;
+}
+
+static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = nfs4_xattr_list_nfs4_label,
+ .get = nfs4_xattr_get_nfs4_label,
+ .set = nfs4_xattr_set_nfs4_label,
+};
+#endif
+
+
/*
* nfs_fhget will use either the mounted_on_fileid or the fileid
*/
@@ -5318,7 +5739,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
struct page *page)
{
struct nfs_server *server = NFS_SERVER(dir);
- u32 bitmask[2] = {
+ u32 bitmask[3] = {
[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
};
struct nfs4_fs_locations_arg args = {
@@ -5505,7 +5926,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
struct nfs41_exchange_id_args args = {
.verifier = &verifier,
.client = clp,
- .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
+ .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+ EXCHGID4_FLAG_BIND_PRINC_STATEID,
};
struct nfs41_exchange_id_res res = {
0
@@ -5762,17 +6184,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
*/
static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
{
- struct nfs4_session *session = args->client->cl_session;
- unsigned int mxrqst_sz = session->fc_target_max_rqst_sz,
- mxresp_sz = session->fc_target_max_resp_sz;
+ unsigned int max_rqst_sz, max_resp_sz;
+
+ max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
+ max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
- if (mxrqst_sz == 0)
- mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
- if (mxresp_sz == 0)
- mxresp_sz = NFS_MAX_FILE_IO_SIZE;
/* Fore channel attributes */
- args->fc_attrs.max_rqst_sz = mxrqst_sz;
- args->fc_attrs.max_resp_sz = mxresp_sz;
+ args->fc_attrs.max_rqst_sz = max_rqst_sz;
+ args->fc_attrs.max_resp_sz = max_resp_sz;
args->fc_attrs.max_ops = NFS4_MAX_OPS;
args->fc_attrs.max_reqs = max_session_slots;
@@ -6159,12 +6578,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
/*
* Issue a global reclaim complete.
*/
-static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
+static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
+ struct rpc_cred *cred)
{
struct nfs4_reclaim_complete_data *calldata;
struct rpc_task *task;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
+ .rpc_cred = cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = clp->cl_rpcclient,
@@ -6348,6 +6769,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
.rpc_argp = &lgp->args,
.rpc_resp = &lgp->res,
+ .rpc_cred = lgp->cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = server->client,
@@ -6451,6 +6873,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
.rpc_argp = &lrp->args,
.rpc_resp = &lrp->res,
+ .rpc_cred = lrp->cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = lrp->clp->cl_rpcclient,
@@ -6520,7 +6943,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server,
EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
static int
-_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+_nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ struct pnfs_device *pdev,
+ struct rpc_cred *cred)
{
struct nfs4_getdeviceinfo_args args = {
.pdev = pdev,
@@ -6532,6 +6957,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
.rpc_argp = &args,
.rpc_resp = &res,
+ .rpc_cred = cred,
};
int status;
@@ -6542,14 +6968,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
return status;
}
-int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ struct pnfs_device *pdev,
+ struct rpc_cred *cred)
{
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs4_proc_getdeviceinfo(server, pdev),
+ _nfs4_proc_getdeviceinfo(server, pdev, cred),
&exception);
} while (exception.retry);
return err;
@@ -6733,7 +7161,9 @@ out:
return err;
}
-static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int _nfs41_test_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
{
int status;
struct nfs41_test_stateid_args args = {
@@ -6744,6 +7174,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
.rpc_argp = &args,
.rpc_resp = &res,
+ .rpc_cred = cred,
};
dprintk("NFS call test_stateid %p\n", stateid);
@@ -6764,17 +7195,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
*
* @server: server / transport on which to perform the operation
* @stateid: state ID to test
+ * @cred: credential
*
* Returns NFS_OK if the server recognizes that "stateid" is valid.
* Otherwise a negative NFS4ERR value is returned if the operation
* failed or the state ID is not currently valid.
*/
-static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int nfs41_test_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
{
struct nfs4_exception exception = { };
int err;
do {
- err = _nfs41_test_stateid(server, stateid);
+ err = _nfs41_test_stateid(server, stateid, cred);
if (err != -NFS4ERR_DELAY)
break;
nfs4_handle_exception(server, err, &exception);
@@ -6823,10 +7257,12 @@ const struct rpc_call_ops nfs41_free_stateid_ops = {
static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
+ struct rpc_cred *cred,
bool privileged)
{
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
+ .rpc_cred = cred,
};
struct rpc_task_setup task_setup = {
.rpc_client = server->client,
@@ -6859,16 +7295,19 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
*
* @server: server / transport on which to perform the operation
* @stateid: state ID to release
+ * @cred: credential
*
* Returns NFS_OK if the server freed "stateid". Otherwise a
* negative NFS4ERR value is returned.
*/
-static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int nfs41_free_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
{
struct rpc_task *task;
int ret;
- task = _nfs41_free_stateid(server, stateid, true);
+ task = _nfs41_free_stateid(server, stateid, cred, true);
if (IS_ERR(task))
return PTR_ERR(task);
ret = rpc_wait_for_completion_task(task);
@@ -6881,8 +7320,9 @@ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
struct rpc_task *task;
+ struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
- task = _nfs41_free_stateid(server, &lsp->ls_stateid, false);
+ task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
nfs4_free_lock_state(server, lsp);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -7004,11 +7444,33 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
};
#endif
+#if defined(CONFIG_NFS_V4_2)
+static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
+ .minor_version = 2,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_CHANGE_ATTR
+ | NFS_CAP_POSIX_LOCK
+ | NFS_CAP_STATEID_NFSV41
+ | NFS_CAP_ATOMIC_OPEN_V1,
+ .call_sync = nfs4_call_sync_sequence,
+ .match_stateid = nfs41_match_stateid,
+ .find_root_sec = nfs41_find_root_sec,
+ .free_lock_state = nfs41_free_lock_state,
+ .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+ .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+ .state_renewal_ops = &nfs41_state_renewal_ops,
+};
+#endif
+
const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
[0] = &nfs_v4_0_minor_ops,
#if defined(CONFIG_NFS_V4_1)
[1] = &nfs_v4_1_minor_ops,
#endif
+#if defined(CONFIG_NFS_V4_2)
+ [2] = &nfs_v4_2_minor_ops,
+#endif
};
const struct inode_operations nfs4_dir_inode_operations = {
@@ -7108,6 +7570,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
const struct xattr_handler *nfs4_xattr_handlers[] = {
&nfs4_xattr_nfs4_acl_handler,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ &nfs4_xattr_nfs4_label_handler,
+#endif
NULL
};
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index c4e225e4a9af..36e21cb29d65 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp)
return 0;
}
-int nfs4_init_session(struct nfs_server *server)
+int nfs4_init_session(struct nfs_client *clp)
{
- struct nfs_client *clp = server->nfs_client;
- struct nfs4_session *session;
- unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
- unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
-
if (!nfs4_has_session(clp))
return 0;
- if (server->rsize != 0)
- target_max_resp_sz = server->rsize;
- target_max_resp_sz += nfs41_maxread_overhead;
-
- if (server->wsize != 0)
- target_max_rqst_sz = server->wsize;
- target_max_rqst_sz += nfs41_maxwrite_overhead;
-
- session = clp->cl_session;
- spin_lock(&clp->cl_lock);
- if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
- /* Initialise targets and channel attributes */
- session->fc_target_max_rqst_sz = target_max_rqst_sz;
- session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
- session->fc_target_max_resp_sz = target_max_resp_sz;
- session->fc_attrs.max_resp_sz = target_max_resp_sz;
- } else {
- /* Just adjust the targets */
- if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
- session->fc_target_max_rqst_sz = target_max_rqst_sz;
- set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
- }
- if (target_max_resp_sz > session->fc_target_max_resp_sz) {
- session->fc_target_max_resp_sz = target_max_resp_sz;
- set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
- }
- }
- spin_unlock(&clp->cl_lock);
-
- if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
- nfs4_schedule_lease_recovery(clp);
-
+ clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
return nfs41_check_session_ready(clp);
}
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index ff7d9f0f8a65..3a153d82b90c 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -66,9 +66,6 @@ struct nfs4_session {
struct nfs4_channel_attrs bc_attrs;
struct nfs4_slot_table bc_slot_table;
struct nfs_client *clp;
- /* Create session arguments */
- unsigned int fc_target_max_rqst_sz;
- unsigned int fc_target_max_resp_sz;
};
enum nfs4_session_state {
@@ -89,7 +86,7 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
extern void nfs4_destroy_session(struct nfs4_session *session);
-extern int nfs4_init_session(struct nfs_server *server);
+extern int nfs4_init_session(struct nfs_client *clp);
extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
@@ -122,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
#else /* defined(CONFIG_NFS_V4_1) */
-static inline int nfs4_init_session(struct nfs_server *server)
+static inline int nfs4_init_session(struct nfs_client *clp)
{
return 0;
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 55418811a55a..e22862f13564 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -228,19 +228,8 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
return status;
}
-/*
- * Back channel returns NFS4ERR_DELAY for new requests when
- * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
- * is ended.
- */
-static void nfs4_end_drain_session(struct nfs_client *clp)
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
{
- struct nfs4_session *ses = clp->cl_session;
- struct nfs4_slot_table *tbl;
-
- if (ses == NULL)
- return;
- tbl = &ses->fc_slot_table;
if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
spin_lock(&tbl->slot_tbl_lock);
nfs41_wake_slot_table(tbl);
@@ -248,6 +237,16 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
}
}
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+ struct nfs4_session *ses = clp->cl_session;
+
+ if (ses != NULL) {
+ nfs4_end_drain_slot_table(&ses->bc_slot_table);
+ nfs4_end_drain_slot_table(&ses->fc_slot_table);
+ }
+}
+
/*
* Signal state manager thread if session fore channel is drained
*/
@@ -1563,11 +1562,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
}
static void nfs4_reclaim_complete(struct nfs_client *clp,
- const struct nfs4_state_recovery_ops *ops)
+ const struct nfs4_state_recovery_ops *ops,
+ struct rpc_cred *cred)
{
/* Notify the server we're done reclaiming our state */
if (ops->reclaim_complete)
- (void)ops->reclaim_complete(clp);
+ (void)ops->reclaim_complete(clp, cred);
}
static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1612,9 +1612,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
{
+ const struct nfs4_state_recovery_ops *ops;
+ struct rpc_cred *cred;
+
if (!nfs4_state_clear_reclaim_reboot(clp))
return;
- nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
+ ops = clp->cl_mvops->reboot_recovery_ops;
+ cred = ops->get_clid_cred(clp);
+ nfs4_reclaim_complete(clp, ops, cred);
+ put_rpccred(cred);
}
static void nfs_delegation_clear_all(struct nfs_client *clp)
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index a5e1a3026d48..5dbe2d269210 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -9,6 +9,7 @@
#include "delegation.h"
#include "internal.h"
#include "nfs4_fs.h"
+#include "dns_resolve.h"
#include "pnfs.h"
#include "nfs.h"
@@ -331,18 +332,24 @@ static int __init init_nfs_v4(void)
{
int err;
- err = nfs_idmap_init();
+ err = nfs_dns_resolver_init();
if (err)
goto out;
- err = nfs4_register_sysctl();
+ err = nfs_idmap_init();
if (err)
goto out1;
+ err = nfs4_register_sysctl();
+ if (err)
+ goto out2;
+
register_nfs_version(&nfs_v4);
return 0;
-out1:
+out2:
nfs_idmap_quit();
+out1:
+ nfs_dns_resolver_destroy();
out:
return err;
}
@@ -352,6 +359,7 @@ static void __exit exit_nfs_v4(void)
unregister_nfs_version(&nfs_v4);
nfs4_unregister_sysctl();
nfs_idmap_quit();
+ nfs_dns_resolver_destroy();
}
MODULE_LICENSE("GPL");
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4be8d135ed61..3850b018815f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int);
#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
+#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
+#define encode_readdir_space 24
+#define encode_readdir_bitmask_sz 3
+#else
+#define nfs4_label_maxsz 0
+#define encode_readdir_space 20
+#define encode_readdir_bitmask_sz 2
+#endif
/* We support only one layout type per file system */
#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
/* This is based on getfattr, which uses the most attributes: */
#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
3 + 3 + 3 + nfs4_owner_maxsz + \
- nfs4_group_maxsz + decode_mdsthreshold_maxsz))
+ nfs4_group_maxsz + nfs4_label_maxsz + \
+ decode_mdsthreshold_maxsz))
#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
nfs4_fattr_value_maxsz)
#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int);
1 + 2 + 1 + \
nfs4_owner_maxsz + \
nfs4_group_maxsz + \
+ nfs4_label_maxsz + \
4 + 4)
#define encode_savefh_maxsz (op_encode_hdr_maxsz)
#define decode_savefh_maxsz (op_decode_hdr_maxsz)
@@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int);
encode_stateid_maxsz + 3)
#define decode_read_maxsz (op_decode_hdr_maxsz + 2)
#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
- 2 + encode_verifier_maxsz + 5)
+ 2 + encode_verifier_maxsz + 5 + \
+ nfs4_label_maxsz)
#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
- decode_verifier_maxsz)
+ decode_verifier_maxsz + \
+ nfs4_label_maxsz + nfs4_fattr_maxsz)
#define encode_readlink_maxsz (op_encode_hdr_maxsz)
#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
#define encode_write_maxsz (op_encode_hdr_maxsz + \
@@ -853,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
decode_sequence_maxsz +
decode_putfh_maxsz) *
XDR_UNIT);
+
+const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz) *
+ XDR_UNIT);
+EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
#endif /* CONFIG_NFS_V4_1 */
static const umode_t nfs_type2fmt[] = {
@@ -968,7 +988,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
}
-static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
+ const struct nfs4_label *label,
+ const struct nfs_server *server)
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
@@ -977,17 +999,19 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
__be32 *p;
__be32 *q;
int len;
+ uint32_t bmval_len = 2;
uint32_t bmval0 = 0;
uint32_t bmval1 = 0;
+ uint32_t bmval2 = 0;
/*
* We reserve enough space to write the entire attribute buffer at once.
* In the worst-case, this would be
- * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
- * = 36 bytes, plus any contribution from variable-length fields
+ * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+ * = 40 bytes, plus any contribution from variable-length fields
* such as owner/group.
*/
- len = 16;
+ len = 8;
/* Sigh */
if (iap->ia_valid & ATTR_SIZE)
@@ -1025,15 +1049,22 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
len += 16;
else if (iap->ia_valid & ATTR_MTIME)
len += 4;
+ if (label) {
+ len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
+ bmval_len = 3;
+ }
+
+ len += bmval_len << 2;
p = reserve_space(xdr, len);
/*
* We write the bitmap length now, but leave the bitmap and the attribute
* buffer length to be backfilled at the end of this routine.
*/
- *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(bmval_len);
q = p;
- p += 3;
+ /* Skip bitmap entries + attrlen */
+ p += bmval_len + 1;
if (iap->ia_valid & ATTR_SIZE) {
bmval0 |= FATTR4_WORD0_SIZE;
@@ -1071,6 +1102,13 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
+ if (label) {
+ bmval2 |= FATTR4_WORD2_SECURITY_LABEL;
+ *p++ = cpu_to_be32(label->lfs);
+ *p++ = cpu_to_be32(label->pi);
+ *p++ = cpu_to_be32(label->len);
+ p = xdr_encode_opaque_fixed(p, label->label, label->len);
+ }
/*
* Now we backfill the bitmap and the attribute buffer length.
@@ -1080,9 +1118,11 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
len, ((char *)p - (char *)q) + 4);
BUG();
}
- len = (char *)p - (char *)q - 12;
*q++ = htonl(bmval0);
*q++ = htonl(bmval1);
+ if (bmval_len == 3)
+ *q++ = htonl(bmval2);
+ len = (char *)p - (char *)(q + 1);
*q = htonl(len);
/* out: */
@@ -1136,7 +1176,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- encode_attrs(xdr, create->attrs, create->server);
+ encode_attrs(xdr, create->attrs, create->label, create->server);
}
static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1188,8 +1228,10 @@ encode_getattr_three(struct xdr_stream *xdr,
static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
- bitmask[1] & nfs4_fattr_bitmap[1], hdr);
+ encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+ bitmask[1] & nfs4_fattr_bitmap[1],
+ bitmask[2] & nfs4_fattr_bitmap[2],
+ hdr);
}
static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
@@ -1367,11 +1409,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
switch(arg->createmode) {
case NFS4_CREATE_UNCHECKED:
*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
break;
case NFS4_CREATE_GUARDED:
*p = cpu_to_be32(NFS4_CREATE_GUARDED);
- encode_attrs(xdr, arg->u.attrs, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
break;
case NFS4_CREATE_EXCLUSIVE:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1381,7 +1423,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
encode_nfs4_verifier(xdr, &arg->u.verifier);
dummy.ia_valid = 0;
- encode_attrs(xdr, &dummy, arg->server);
+ encode_attrs(xdr, &dummy, arg->label, arg->server);
}
}
@@ -1532,7 +1574,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
{
- uint32_t attrs[2] = {
+ uint32_t attrs[3] = {
FATTR4_WORD0_RDATTR_ERROR,
FATTR4_WORD1_MOUNTED_ON_FILEID,
};
@@ -1555,20 +1597,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
encode_uint64(xdr, readdir->cookie);
encode_nfs4_verifier(xdr, &readdir->verifier);
- p = reserve_space(xdr, 20);
+ p = reserve_space(xdr, encode_readdir_space);
*p++ = cpu_to_be32(dircount);
*p++ = cpu_to_be32(readdir->count);
- *p++ = cpu_to_be32(2);
-
+ *p++ = cpu_to_be32(encode_readdir_bitmask_sz);
*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
- *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+ *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+ if (encode_readdir_bitmask_sz > 2) {
+ if (hdr->minorversion > 1)
+ attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
+ p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
+ }
memcpy(verf, readdir->verifier.data, sizeof(verf));
- dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
+
+ dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
__func__,
(unsigned long long)readdir->cookie,
verf[0], verf[1],
attrs[0] & readdir->bitmask[0],
- attrs[1] & readdir->bitmask[1]);
+ attrs[1] & readdir->bitmask[1],
+ attrs[2] & readdir->bitmask[2]);
}
static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1627,7 +1675,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
encode_nfs4_stateid(xdr, &arg->stateid);
- encode_attrs(xdr, arg->iap, server);
+ encode_attrs(xdr, arg->iap, arg->label, server);
}
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -1889,7 +1937,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(args->pdev->layout_type);
- *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
+ *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
*p++ = cpu_to_be32(0); /* bitmap length 0 */
}
@@ -4038,6 +4086,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
return status;
}
+static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs4_label *label)
+{
+ uint32_t pi = 0;
+ uint32_t lfs = 0;
+ __u32 len;
+ __be32 *p;
+ int status = 0;
+
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
+ return -EIO;
+ if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ lfs = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ pi = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
+ if (len < NFS4_MAXLABELLEN) {
+ if (label) {
+ memcpy(label->label, p, len);
+ label->len = len;
+ label->pi = pi;
+ label->lfs = lfs;
+ status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
+ }
+ bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ } else
+ printk(KERN_WARNING "%s: label too long (%u)!\n",
+ __func__, len);
+ }
+ if (label && label->label)
+ dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
+ (char *)label->label, label->len, label->pi, label->lfs);
+ return status;
+
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
{
int status = 0;
@@ -4380,7 +4478,7 @@ out_overflow:
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
struct nfs_fattr *fattr, struct nfs_fh *fh,
- struct nfs4_fs_locations *fs_loc,
+ struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
const struct nfs_server *server)
{
int status;
@@ -4488,6 +4586,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
if (status < 0)
goto xdr_error;
+ if (label) {
+ status = decode_attr_security_label(xdr, bitmap, label);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+ }
+
xdr_error:
dprintk("%s: xdr returned %d\n", __func__, -status);
return status;
@@ -4495,7 +4600,7 @@ xdr_error:
static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
- const struct nfs_server *server)
+ struct nfs4_label *label, const struct nfs_server *server)
{
unsigned int savep;
uint32_t attrlen,
@@ -4514,7 +4619,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
if (status < 0)
goto xdr_error;
- status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
+ status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
+ label, server);
if (status < 0)
goto xdr_error;
@@ -4524,10 +4630,16 @@ xdr_error:
return status;
}
+static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ struct nfs4_label *label, const struct nfs_server *server)
+{
+ return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
+}
+
static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
const struct nfs_server *server)
{
- return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
+ return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
}
/*
@@ -5919,7 +6031,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server);
+ status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -5945,7 +6057,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
goto out;
status = decode_getfh(xdr, res->fh);
if (status == 0)
- status = decode_getfattr(xdr, res->fattr, res->server);
+ status = decode_getfattr_label(xdr, res->fattr,
+ res->label, res->server);
out:
return status;
}
@@ -6036,7 +6149,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server);
+ decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6065,7 +6178,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server);
+ decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6097,7 +6210,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server);
+ status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6230,7 +6343,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
goto out;
if (res->access_request)
decode_access(xdr, &res->access_supported, &res->access_result);
- decode_getfattr(xdr, res->f_attr, res->server);
+ decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
out:
return status;
}
@@ -6307,7 +6420,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
status = decode_setattr(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server);
+ decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6696,7 +6809,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
xdr_enter_page(xdr, PAGE_SIZE);
status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
NULL, res->fs_locations,
- res->fs_locations->server);
+ NULL, res->fs_locations->server);
out:
return status;
}
@@ -7109,7 +7222,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
goto out_overflow;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
- NULL, entry->server) < 0)
+ NULL, entry->label, entry->server) < 0)
goto out_overflow;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index a9ebd817278b..e4f9cbfec67b 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
pd.pgbase = 0;
pd.pglen = PAGE_SIZE;
pd.mincount = 0;
+ pd.maxcount = PAGE_SIZE;
- err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
+ pnfslay->plh_lc_cred);
dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
if (err)
goto err_out;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c5bd758e5637..3a3a79d6bf15 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);
-static inline u64
+static u64
end_offset(u64 start, u64 len)
{
u64 end;
@@ -376,9 +376,9 @@ end_offset(u64 start, u64 len)
* start2 end2
* [----------------)
*/
-static inline int
-lo_seg_contained(struct pnfs_layout_range *l1,
- struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
u64 start1 = l1->offset;
u64 end1 = end_offset(start1, l1->length);
@@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1,
* start2 end2
* [----------------)
*/
-static inline int
-lo_seg_intersecting(struct pnfs_layout_range *l1,
- struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
u64 start1 = l1->offset;
u64 end1 = end_offset(start1, l1->length);
@@ -409,12 +409,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1,
}
static bool
-should_free_lseg(struct pnfs_layout_range *lseg_range,
- struct pnfs_layout_range *recall_range)
+should_free_lseg(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *recall_range)
{
return (recall_range->iomode == IOMODE_ANY ||
lseg_range->iomode == recall_range->iomode) &&
- lo_seg_intersecting(lseg_range, recall_range);
+ pnfs_lseg_range_intersecting(lseg_range, recall_range);
}
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
@@ -766,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
lgp->args.inode = ino;
lgp->args.ctx = get_nfs_open_context(ctx);
lgp->gfp_flags = gfp_flags;
+ lgp->cred = lo->plh_lc_cred;
/* Synchronously retrieve layout information from server and
* store in lseg.
@@ -860,6 +861,7 @@ _pnfs_return_layout(struct inode *ino)
lrp->args.inode = ino;
lrp->args.layout = lo;
lrp->clp = NFS_SERVER(ino)->nfs_client;
+ lrp->cred = lo->plh_lc_cred;
status = nfs4_proc_layoutreturn(lrp);
out:
@@ -984,8 +986,8 @@ out:
* are seen first.
*/
static s64
-cmp_layout(struct pnfs_layout_range *l1,
- struct pnfs_layout_range *l2)
+pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
s64 d;
@@ -1012,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin\n", __func__);
list_for_each_entry(lp, &lo->plh_segs, pls_list) {
- if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
+ if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
@@ -1050,7 +1052,7 @@ alloc_init_layout_hdr(struct inode *ino,
INIT_LIST_HEAD(&lo->plh_segs);
INIT_LIST_HEAD(&lo->plh_bulk_destroy);
lo->plh_inode = ino;
- lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
+ lo->plh_lc_cred = get_rpccred(ctx->cred);
return lo;
}
@@ -1091,21 +1093,21 @@ out_existing:
* READ READ true
* READ RW true
*/
-static int
-is_matching_lseg(struct pnfs_layout_range *ls_range,
- struct pnfs_layout_range *range)
+static bool
+pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
+ const struct pnfs_layout_range *range)
{
struct pnfs_layout_range range1;
if ((range->iomode == IOMODE_RW &&
ls_range->iomode != IOMODE_RW) ||
- !lo_seg_intersecting(ls_range, range))
+ !pnfs_lseg_range_intersecting(ls_range, range))
return 0;
/* range1 covers only the first byte in the range */
range1 = *range;
range1.length = 1;
- return lo_seg_contained(ls_range, &range1);
+ return pnfs_lseg_range_contained(ls_range, &range1);
}
/*
@@ -1121,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
- is_matching_lseg(&lseg->pls_range, range)) {
+ pnfs_lseg_range_match(&lseg->pls_range, range)) {
ret = pnfs_get_lseg(lseg);
break;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f5f8a470a647..a4f41810a7f4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -149,9 +149,10 @@ struct pnfs_device {
struct nfs4_deviceid dev_id;
unsigned int layout_type;
unsigned int mincount;
+ unsigned int maxcount; /* gdia_maxcount */
struct page **pages;
unsigned int pgbase;
- unsigned int pglen;
+ unsigned int pglen; /* reply buffer length */
};
#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
const struct nfs_fh *fh,
struct pnfs_devicelist *devlist);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
- struct pnfs_device *dev);
+ struct pnfs_device *dev,
+ struct rpc_cred *cred);
extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fc8de9016acf..c041c41f7a52 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
*/
static int
nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
@@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
static int
nfs_proc_lookup(struct inode *dir, struct qstr *name,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
struct nfs_diropargs arg = {
.fh = NFS_FH(dir),
@@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
nfs_free_createdata(data);
out:
dprintk("NFS reply create: %d\n", status);
@@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
}
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
nfs_free_createdata(data);
out:
dprintk("NFS reply mknod: %d\n", status);
@@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
* should fill in the data with a LOOKUP call on the wire.
*/
if (status == 0)
- status = nfs_instantiate(dentry, fh, fattr);
+ status = nfs_instantiate(dentry, fh, fattr, NULL);
out_free:
nfs_free_fattr(fattr);
@@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
nfs_free_createdata(data);
out:
dprintk("NFS reply mkdir: %d\n", status);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2d7525fbcf25..71fdc0dfa0d2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = {
enum {
Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
- Opt_vers_4_1,
+ Opt_vers_4_1, Opt_vers_4_2,
Opt_vers_err
};
@@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = {
{ Opt_vers_4, "4" },
{ Opt_vers_4_0, "4.0" },
{ Opt_vers_4_1, "4.1" },
+ { Opt_vers_4_2, "4.2" },
{ Opt_vers_err, NULL }
};
@@ -832,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
seq_printf(m, "\n\tnfsv4:\t");
seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+ seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
show_sessions(m, nfss);
show_pnfs(m, nfss);
@@ -1097,6 +1099,10 @@ static int nfs_parse_version_string(char *string,
mnt->version = 4;
mnt->minorversion = 1;
break;
+ case Opt_vers_4_2:
+ mnt->version = 4;
+ mnt->minorversion = 2;
+ break;
default:
return 0;
}
@@ -1608,29 +1614,13 @@ out_security_failure:
}
/*
- * Select a security flavor for this mount. The selected flavor
- * is planted in args->auth_flavors[0].
- *
- * Returns 0 on success, -EACCES on failure.
+ * Ensure that the specified authtype in args->auth_flavors[0] is supported by
+ * the server. Returns 0 if it's ok, and -EACCES if not.
*/
-static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
- struct nfs_mount_request *request)
+static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
+ rpc_authflavor_t *server_authlist, unsigned int count)
{
- unsigned int i, count = *(request->auth_flav_len);
- rpc_authflavor_t flavor;
-
- /*
- * The NFSv2 MNT operation does not return a flavor list.
- */
- if (args->mount_server.version != NFS_MNT3_VERSION)
- goto out_default;
-
- /*
- * Certain releases of Linux's mountd return an empty
- * flavor list in some cases.
- */
- if (count == 0)
- goto out_default;
+ unsigned int i;
/*
* If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1640,60 +1630,19 @@ static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
* means that the server will ignore the rpc creds, so any flavor
* can be used.
*/
- if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
- for (i = 0; i < count; i++) {
- if (args->auth_flavors[0] == request->auth_flavs[i] ||
- request->auth_flavs[i] == RPC_AUTH_NULL)
- goto out;
- }
- dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n",
- args->auth_flavors[0]);
- goto out_err;
- }
-
- /*
- * RFC 2623, section 2.7 suggests we SHOULD prefer the
- * flavor listed first. However, some servers list
- * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
- */
for (i = 0; i < count; i++) {
- struct rpcsec_gss_info info;
-
- flavor = request->auth_flavs[i];
- switch (flavor) {
- case RPC_AUTH_UNIX:
- goto out_set;
- case RPC_AUTH_NULL:
- continue;
- default:
- if (rpcauth_get_gssinfo(flavor, &info) == 0)
- goto out_set;
- }
+ if (args->auth_flavors[0] == server_authlist[i] ||
+ server_authlist[i] == RPC_AUTH_NULL)
+ goto out;
}
- /*
- * As a last chance, see if the server list contains AUTH_NULL -
- * if it does, use the default flavor.
- */
- for (i = 0; i < count; i++) {
- if (request->auth_flavs[i] == RPC_AUTH_NULL)
- goto out_default;
- }
-
- dfprintk(MOUNT, "NFS: no auth flavors in common with server\n");
- goto out_err;
+ dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
+ args->auth_flavors[0]);
+ return -EACCES;
-out_default:
- /* use default if flavor not already set */
- flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ?
- RPC_AUTH_UNIX : args->auth_flavors[0];
-out_set:
- args->auth_flavors[0] = flavor;
out:
- dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]);
+ dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
return 0;
-out_err:
- return -EACCES;
}
/*
@@ -1701,10 +1650,10 @@ out_err:
* corresponding to the provided path.
*/
static int nfs_request_mount(struct nfs_parsed_mount_data *args,
- struct nfs_fh *root_fh)
+ struct nfs_fh *root_fh,
+ rpc_authflavor_t *server_authlist,
+ unsigned int *server_authlist_len)
{
- rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
- unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
struct nfs_mount_request request = {
.sap = (struct sockaddr *)
&args->mount_server.address,
@@ -1712,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
.protocol = args->mount_server.protocol,
.fh = root_fh,
.noresvport = args->flags & NFS_MOUNT_NORESVPORT,
- .auth_flav_len = &server_authlist_len,
+ .auth_flav_len = server_authlist_len,
.auth_flavs = server_authlist,
.net = args->net,
};
@@ -1756,24 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
return status;
}
- return nfs_select_flavor(args, &request);
+ return 0;
}
-struct dentry *nfs_try_mount(int flags, const char *dev_name,
- struct nfs_mount_info *mount_info,
- struct nfs_subversion *nfs_mod)
+static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info,
+ struct nfs_subversion *nfs_mod)
{
int status;
- struct nfs_server *server;
+ unsigned int i;
+ bool tried_auth_unix = false;
+ bool auth_null_in_list = false;
+ struct nfs_server *server = ERR_PTR(-EACCES);
+ struct nfs_parsed_mount_data *args = mount_info->parsed;
+ rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
+ unsigned int authlist_len = ARRAY_SIZE(authlist);
+
+ status = nfs_request_mount(args, mount_info->mntfh, authlist,
+ &authlist_len);
+ if (status)
+ return ERR_PTR(status);
- if (mount_info->parsed->need_mount) {
- status = nfs_request_mount(mount_info->parsed, mount_info->mntfh);
+ /*
+ * Was a sec= authflavor specified in the options? First, verify
+ * whether the server supports it, and then just try to use it if so.
+ */
+ if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
+ status = nfs_verify_authflavor(args, authlist, authlist_len);
+ dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
if (status)
return ERR_PTR(status);
+ return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+ }
+
+ /*
+ * No sec= option was provided. RFC 2623, section 2.7 suggests we
+ * SHOULD prefer the flavor listed first. However, some servers list
+ * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
+ */
+ for (i = 0; i < authlist_len; ++i) {
+ rpc_authflavor_t flavor;
+ struct rpcsec_gss_info info;
+
+ flavor = authlist[i];
+ switch (flavor) {
+ case RPC_AUTH_UNIX:
+ tried_auth_unix = true;
+ break;
+ case RPC_AUTH_NULL:
+ auth_null_in_list = true;
+ continue;
+ default:
+ if (rpcauth_get_gssinfo(flavor, &info) != 0)
+ continue;
+ /* Fallthrough */
+ }
+ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
+ args->auth_flavors[0] = flavor;
+ server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+ if (!IS_ERR(server))
+ return server;
}
- /* Get a volume representation */
- server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+ /*
+ * Nothing we tried so far worked. At this point, give up if we've
+ * already tried AUTH_UNIX or if the server's list doesn't contain
+ * AUTH_NULL
+ */
+ if (tried_auth_unix || !auth_null_in_list)
+ return server;
+
+ /* Last chance! Try AUTH_UNIX */
+ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
+ return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+}
+
+struct dentry *nfs_try_mount(int flags, const char *dev_name,
+ struct nfs_mount_info *mount_info,
+ struct nfs_subversion *nfs_mod)
+{
+ struct nfs_server *server;
+
+ if (mount_info->parsed->need_mount)
+ server = nfs_try_mount_request(mount_info, nfs_mod);
+ else
+ server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+
if (IS_ERR(server))
return ERR_CAST(server);
@@ -2412,7 +2429,21 @@ static int nfs_bdi_register(struct nfs_server *server)
int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
struct nfs_mount_info *mount_info)
{
- return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts);
+ int error;
+ unsigned long kflags = 0, kflags_out = 0;
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+ kflags |= SECURITY_LSM_NATIVE_LABELS;
+
+ error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
+ kflags, &kflags_out);
+ if (error)
+ goto err;
+
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+ !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+ NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
+err:
+ return error;
}
EXPORT_SYMBOL_GPL(nfs_set_sb_security);
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1f1f38f0c5d5..60395ad3a2e4 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- dentry->d_count);
+ d_count(dentry));
nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
/*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a2c7c28049d5..f1bdb7254776 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -888,6 +888,28 @@ out:
return PageUptodate(page) != 0;
}
+/* If we know the page is up to date, and we're not using byte range locks (or
+ * if we have the whole file locked for writing), it may be more efficient to
+ * extend the write to cover the entire page in order to avoid fragmentation
+ * inefficiencies.
+ *
+ * If the file is opened for synchronous writes or if we have a write delegation
+ * from the server then we can just skip the rest of the checks.
+ */
+static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
+{
+ if (file->f_flags & O_DSYNC)
+ return 0;
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+ return 1;
+ if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL ||
+ (inode->i_flock->fl_start == 0 &&
+ inode->i_flock->fl_end == OFFSET_MAX &&
+ inode->i_flock->fl_type != F_RDLCK)))
+ return 1;
+ return 0;
+}
+
/*
* Update and possibly write a cached page of an NFS file.
*
@@ -908,14 +930,7 @@ int nfs_updatepage(struct file *file, struct page *page,
file->f_path.dentry->d_name.name, count,
(long long)(page_file_offset(page) + offset));
- /* If we're not using byte range locks, and we know the page
- * is up to date, it may be more efficient to extend the write
- * to cover the entire page in order to avoid fragmentation
- * inefficiencies.
- */
- if (nfs_write_pageuptodate(page, inode) &&
- inode->i_flock == NULL &&
- !(file->f_flags & O_DSYNC)) {
+ if (nfs_can_extend_write(file, page, inode)) {
count = max(count + offset, nfs_page_length(page));
offset = 0;
}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 430b6872806f..dc8f1ef665ce 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -81,6 +81,22 @@ config NFSD_V4
If unsure, say N.
+config NFSD_V4_SECURITY_LABEL
+ bool "Provide Security Label support for NFSv4 server"
+ depends on NFSD_V4 && SECURITY
+ help
+
+ Say Y here if you want enable fine-grained security label attribute
+ support for NFS version 4. Security labels allow security modules like
+ SELinux and Smack to label files to facilitate enforcement of their policies.
+ Without this an NFSv4 mount will have the same label on each file.
+
+ If you do not wish to enable fine-grained security labels SELinux or
+ Smack policies on NFSv4 files, say N.
+
+ WARNING: there is still a chance of backwards-incompatible protocol changes.
+ For now we recommend "Y" only for developers and testers."
+
config NFSD_FAULT_INJECTION
bool "NFS server manual fault injection"
depends on NFSD_V4 && DEBUG_KERNEL
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 27d74a294515..0d4c410e4589 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -42,6 +42,36 @@
#include "current_stateid.h"
#include "netns.h"
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#include <linux/security.h>
+
+static inline void
+nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+{
+ struct inode *inode = resfh->fh_dentry->d_inode;
+ int status;
+
+ mutex_lock(&inode->i_mutex);
+ status = security_inode_setsecctx(resfh->fh_dentry,
+ label->data, label->len);
+ mutex_unlock(&inode->i_mutex);
+
+ if (status)
+ /*
+ * XXX: We should really fail the whole open, but we may
+ * already have created a new file, so it may be too
+ * late. For now this seems the least of evils:
+ */
+ bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+ return;
+}
+#else
+static inline void
+nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+{ }
+#endif
+
#define NFSDDBG_FACILITY NFSDDBG_PROC
static u32 nfsd_attrmask[] = {
@@ -239,6 +269,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
(u32 *)open->op_verf.data,
&open->op_truncate, &open->op_created);
+ if (!status && open->op_label.len)
+ nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval);
+
/*
* Following rfc 3530 14.2.16, use the returned bitmask
* to indicate which attributes we used to store the
@@ -263,7 +296,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
accmode = NFSD_MAY_NOP;
- if (open->op_created)
+ if (open->op_created ||
+ open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
accmode |= NFSD_MAY_OWNER_OVERRIDE;
status = do_open_permission(rqstp, resfh, open, accmode);
set_change_info(&open->op_cinfo, current_fh);
@@ -637,6 +671,9 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
+ if (create->cr_label.len)
+ nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
+
if (create->cr_acl != NULL)
do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
create->cr_bmval);
@@ -916,6 +953,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
setattr->sa_acl);
if (status)
goto out;
+ if (setattr->sa_label.len)
+ status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
+ &setattr->sa_label);
+ if (status)
+ goto out;
status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
0, (time_t)0);
out:
@@ -1251,7 +1293,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
* According to RFC3010, this takes precedence over all other errors.
*/
status = nfserr_minor_vers_mismatch;
- if (args->minorversion > nfsd_supported_minorversion)
+ if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0)
goto out;
status = nfs41_check_op_ordering(args);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f17051838b41..280acef6f0dc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -97,19 +97,20 @@ nfs4_lock_state(void)
static void free_session(struct nfsd4_session *);
-void nfsd4_put_session(struct nfsd4_session *ses)
+static bool is_session_dead(struct nfsd4_session *ses)
{
- atomic_dec(&ses->se_ref);
+ return ses->se_flags & NFS4_SESSION_DEAD;
}
-static bool is_session_dead(struct nfsd4_session *ses)
+void nfsd4_put_session(struct nfsd4_session *ses)
{
- return ses->se_flags & NFS4_SESSION_DEAD;
+ if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
+ free_session(ses);
}
-static __be32 mark_session_dead_locked(struct nfsd4_session *ses)
+static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
{
- if (atomic_read(&ses->se_ref))
+ if (atomic_read(&ses->se_ref) > ref_held_by_me)
return nfserr_jukebox;
ses->se_flags |= NFS4_SESSION_DEAD;
return nfs_ok;
@@ -364,19 +365,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
}
static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type)
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
{
struct nfs4_delegation *dp;
struct nfs4_file *fp = stp->st_file;
dprintk("NFSD alloc_init_deleg\n");
- /*
- * Major work on the lease subsystem (for example, to support
- * calbacks on stat) will be required before we can support
- * write delegations properly.
- */
- if (type != NFS4_OPEN_DELEGATE_READ)
- return NULL;
if (fp->fi_had_conflict)
return NULL;
if (num_delegations > max_delegations)
@@ -397,7 +391,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
INIT_LIST_HEAD(&dp->dl_recall_lru);
get_nfs4_file(fp);
dp->dl_file = fp;
- dp->dl_type = type;
+ dp->dl_type = NFS4_OPEN_DELEGATE_READ;
fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
dp->dl_time = 0;
atomic_set(&dp->dl_count, 1);
@@ -1188,6 +1182,9 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
target->cr_gid = source->cr_gid;
target->cr_group_info = source->cr_group_info;
get_group_info(target->cr_group_info);
+ target->cr_gss_mech = source->cr_gss_mech;
+ if (source->cr_gss_mech)
+ gss_mech_get(source->cr_gss_mech);
return 0;
}
@@ -1262,6 +1259,31 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
}
+static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
+{
+ struct svc_cred *cr = &rqstp->rq_cred;
+ u32 service;
+
+ service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor);
+ return service == RPC_GSS_SVC_INTEGRITY ||
+ service == RPC_GSS_SVC_PRIVACY;
+}
+
+static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
+{
+ struct svc_cred *cr = &rqstp->rq_cred;
+
+ if (!cl->cl_mach_cred)
+ return true;
+ if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech)
+ return false;
+ if (!svc_rqst_integrity_protected(rqstp))
+ return false;
+ if (!cr->cr_principal)
+ return false;
+ return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
+}
+
static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
{
static u32 current_clientid = 1;
@@ -1639,16 +1661,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
return nfserr_inval;
- /* Currently only support SP4_NONE */
switch (exid->spa_how) {
+ case SP4_MACH_CRED:
+ if (!svc_rqst_integrity_protected(rqstp))
+ return nfserr_inval;
case SP4_NONE:
break;
default: /* checked by xdr code */
WARN_ON_ONCE(1);
case SP4_SSV:
return nfserr_encr_alg_unsupp;
- case SP4_MACH_CRED:
- return nfserr_serverfault; /* no excuse :-/ */
}
/* Cases below refer to rfc 5661 section 18.35.4: */
@@ -1663,6 +1685,10 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
status = nfserr_inval;
goto out;
}
+ if (!mach_creds_match(conf, rqstp)) {
+ status = nfserr_wrong_cred;
+ goto out;
+ }
if (!creds_match) { /* case 9 */
status = nfserr_perm;
goto out;
@@ -1709,7 +1735,8 @@ out_new:
status = nfserr_jukebox;
goto out;
}
- new->cl_minorversion = 1;
+ new->cl_minorversion = cstate->minorversion;
+ new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
gen_clid(new, nn);
add_to_unconfirmed(new);
@@ -1839,6 +1866,24 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
return nfs_ok;
}
+static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs)
+{
+ switch (cbs->flavor) {
+ case RPC_AUTH_NULL:
+ case RPC_AUTH_UNIX:
+ return nfs_ok;
+ default:
+ /*
+ * GSS case: the spec doesn't allow us to return this
+ * error. But it also doesn't allow us not to support
+ * GSS.
+ * I'd rather this fail hard than return some error the
+ * client might think it can already handle:
+ */
+ return nfserr_encr_alg_unsupp;
+ }
+}
+
__be32
nfsd4_create_session(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
@@ -1854,6 +1899,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
return nfserr_inval;
+ status = nfsd4_check_cb_sec(&cr_ses->cb_sec);
+ if (status)
+ return status;
status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
if (status)
return status;
@@ -1874,6 +1922,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
WARN_ON_ONCE(conf && unconf);
if (conf) {
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(conf, rqstp))
+ goto out_free_conn;
cs_slot = &conf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status == nfserr_replay_cache) {
@@ -1890,6 +1941,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
status = nfserr_clid_inuse;
goto out_free_conn;
}
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(unconf, rqstp))
+ goto out_free_conn;
cs_slot = &unconf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status) {
@@ -1957,7 +2011,11 @@ __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state
{
struct nfsd4_session *session = cstate->session;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 status;
+ status = nfsd4_check_cb_sec(&bc->bc_cb_sec);
+ if (status)
+ return status;
spin_lock(&nn->client_lock);
session->se_cb_prog = bc->bc_cb_program;
session->se_cb_sec = bc->bc_cb_sec;
@@ -1986,6 +2044,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
status = nfserr_badsession;
if (!session)
goto out;
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(session->se_client, rqstp))
+ goto out;
status = nfsd4_map_bcts_dir(&bcts->dir);
if (status)
goto out;
@@ -2014,6 +2075,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
{
struct nfsd4_session *ses;
__be32 status;
+ int ref_held_by_me = 0;
struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
nfs4_lock_state();
@@ -2021,6 +2083,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
if (!nfsd4_last_compound_op(r))
goto out;
+ ref_held_by_me++;
}
dump_sessionid(__func__, &sessionid->sessionid);
spin_lock(&nn->client_lock);
@@ -2028,17 +2091,22 @@ nfsd4_destroy_session(struct svc_rqst *r,
status = nfserr_badsession;
if (!ses)
goto out_client_lock;
- status = mark_session_dead_locked(ses);
- if (status)
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(ses->se_client, r))
goto out_client_lock;
+ nfsd4_get_session_locked(ses);
+ status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
+ if (status)
+ goto out_put_session;
unhash_session(ses);
spin_unlock(&nn->client_lock);
nfsd4_probe_callback_sync(ses->se_client);
spin_lock(&nn->client_lock);
- free_session(ses);
status = nfs_ok;
+out_put_session:
+ nfsd4_put_session(ses);
out_client_lock:
spin_unlock(&nn->client_lock);
out:
@@ -2058,26 +2126,31 @@ static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_s
return NULL;
}
-static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
+static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
{
struct nfs4_client *clp = ses->se_client;
struct nfsd4_conn *c;
+ __be32 status = nfs_ok;
int ret;
spin_lock(&clp->cl_lock);
c = __nfsd4_find_conn(new->cn_xprt, ses);
- if (c) {
- spin_unlock(&clp->cl_lock);
- free_conn(new);
- return;
- }
+ if (c)
+ goto out_free;
+ status = nfserr_conn_not_bound_to_session;
+ if (clp->cl_mach_cred)
+ goto out_free;
__nfsd4_hash_conn(new, ses);
spin_unlock(&clp->cl_lock);
ret = nfsd4_register_conn(new);
if (ret)
/* oops; xprt is already down: */
nfsd4_conn_lost(&new->cn_xpt_user);
- return;
+ return nfs_ok;
+out_free:
+ spin_unlock(&clp->cl_lock);
+ free_conn(new);
+ return status;
}
static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
@@ -2169,8 +2242,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (status)
goto out_put_session;
- nfsd4_sequence_check_conn(conn, session);
+ status = nfsd4_sequence_check_conn(conn, session);
conn = NULL;
+ if (status)
+ goto out_put_session;
/* Success! bump slot seqid */
slot->sl_seqid = seq->seqid;
@@ -2232,7 +2307,10 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
status = nfserr_stale_clientid;
goto out;
}
-
+ if (!mach_creds_match(clp, rqstp)) {
+ status = nfserr_wrong_cred;
+ goto out;
+ }
expire_client(clp);
out:
nfs4_unlock_state();
@@ -2940,13 +3018,13 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
return fl;
}
-static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
+static int nfs4_setlease(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_file;
struct file_lock *fl;
int status;
- fl = nfs4_alloc_init_lease(dp, flag);
+ fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
if (!fl)
return -ENOMEM;
fl->fl_file = find_readable_file(fp);
@@ -2964,12 +3042,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
return 0;
}
-static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
+static int nfs4_set_delegation(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_file;
if (!fp->fi_lease)
- return nfs4_setlease(dp, flag);
+ return nfs4_setlease(dp);
spin_lock(&recall_lock);
if (fp->fi_had_conflict) {
spin_unlock(&recall_lock);
@@ -3005,6 +3083,9 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
/*
* Attempt to hand out a delegation.
+ *
+ * Note we don't support write delegations, and won't until the vfs has
+ * proper support for them.
*/
static void
nfs4_open_delegation(struct net *net, struct svc_fh *fh,
@@ -3013,39 +3094,45 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
struct nfs4_delegation *dp;
struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
int cb_up;
- int status = 0, flag = 0;
+ int status = 0;
cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
- flag = NFS4_OPEN_DELEGATE_NONE;
open->op_recall = 0;
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_PREVIOUS:
if (!cb_up)
open->op_recall = 1;
- flag = open->op_delegate_type;
- if (flag == NFS4_OPEN_DELEGATE_NONE)
- goto out;
+ if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ)
+ goto out_no_deleg;
break;
case NFS4_OPEN_CLAIM_NULL:
- /* Let's not give out any delegations till everyone's
- * had the chance to reclaim theirs.... */
+ /*
+ * Let's not give out any delegations till everyone's
+ * had the chance to reclaim theirs....
+ */
if (locks_in_grace(net))
- goto out;
+ goto out_no_deleg;
if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
- goto out;
+ goto out_no_deleg;
+ /*
+ * Also, if the file was opened for write or
+ * create, there's a good chance the client's
+ * about to write to it, resulting in an
+ * immediate recall (since we don't support
+ * write delegations):
+ */
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
- flag = NFS4_OPEN_DELEGATE_WRITE;
- else
- flag = NFS4_OPEN_DELEGATE_READ;
+ goto out_no_deleg;
+ if (open->op_create == NFS4_OPEN_CREATE)
+ goto out_no_deleg;
break;
default:
- goto out;
+ goto out_no_deleg;
}
-
- dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
+ dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh);
if (dp == NULL)
goto out_no_deleg;
- status = nfs4_set_delegation(dp, flag);
+ status = nfs4_set_delegation(dp);
if (status)
goto out_free;
@@ -3053,24 +3140,23 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
STATEID_VAL(&dp->dl_stid.sc_stateid));
-out:
- open->op_delegate_type = flag;
- if (flag == NFS4_OPEN_DELEGATE_NONE) {
- if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
- open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
- dprintk("NFSD: WARNING: refusing delegation reclaim\n");
-
- /* 4.1 client asking for a delegation? */
- if (open->op_deleg_want)
- nfsd4_open_deleg_none_ext(open, status);
- }
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
return;
out_free:
unhash_stid(&dp->dl_stid);
nfs4_put_delegation(dp);
out_no_deleg:
- flag = NFS4_OPEN_DELEGATE_NONE;
- goto out;
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
+ if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
+ open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
+ dprintk("NFSD: WARNING: refusing delegation reclaim\n");
+ open->op_recall = 1;
+ }
+
+ /* 4.1 client asking for a delegation? */
+ if (open->op_deleg_want)
+ nfsd4_open_deleg_none_ext(open, status);
+ return;
}
static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
@@ -3427,7 +3513,7 @@ grace_disallows_io(struct net *net, struct inode *inode)
/* Returns true iff a is later than b: */
static bool stateid_generation_after(stateid_t *a, stateid_t *b)
{
- return (s32)a->si_generation - (s32)b->si_generation > 0;
+ return (s32)(a->si_generation - b->si_generation) > 0;
}
static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
@@ -4435,7 +4521,6 @@ __be32
nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_locku *locku)
{
- struct nfs4_lockowner *lo;
struct nfs4_ol_stateid *stp;
struct file *filp = NULL;
struct file_lock *file_lock = NULL;
@@ -4468,10 +4553,9 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfserr_jukebox;
goto out;
}
- lo = lockowner(stp->st_stateowner);
locks_init_lock(file_lock);
file_lock->fl_type = F_UNLCK;
- file_lock->fl_owner = (fl_owner_t)lo;
+ file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
file_lock->fl_pid = current->tgid;
file_lock->fl_file = filp;
file_lock->fl_flags = FL_POSIX;
@@ -4490,11 +4574,6 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
- if (nfsd4_has_session(cstate) && !check_for_locks(stp->st_file, lo)) {
- WARN_ON_ONCE(cstate->replay_owner);
- release_lockowner(lo);
- }
-
out:
nfsd4_bump_seqid(cstate, status);
if (!cstate->replay_owner)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6cd86e0fe450..0c0f3ea90de5 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -55,6 +55,11 @@
#include "cache.h"
#include "netns.h"
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#include <linux/security.h>
+#endif
+
+
#define NFSDDBG_FACILITY NFSDDBG_XDR
/*
@@ -134,6 +139,19 @@ xdr_error: \
} \
} while (0)
+static void next_decode_page(struct nfsd4_compoundargs *argp)
+{
+ argp->pagelist++;
+ argp->p = page_address(argp->pagelist[0]);
+ if (argp->pagelen < PAGE_SIZE) {
+ argp->end = argp->p + (argp->pagelen>>2);
+ argp->pagelen = 0;
+ } else {
+ argp->end = argp->p + (PAGE_SIZE>>2);
+ argp->pagelen -= PAGE_SIZE;
+ }
+}
+
static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
{
/* We want more bytes than seem to be available.
@@ -161,16 +179,7 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
* guarantee p points to at least nbytes bytes.
*/
memcpy(p, argp->p, avail);
- /* step to next page */
- argp->p = page_address(argp->pagelist[0]);
- argp->pagelist++;
- if (argp->pagelen < PAGE_SIZE) {
- argp->end = argp->p + (argp->pagelen>>2);
- argp->pagelen = 0;
- } else {
- argp->end = argp->p + (PAGE_SIZE>>2);
- argp->pagelen -= PAGE_SIZE;
- }
+ next_decode_page(argp);
memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
argp->p += XDR_QUADLEN(nbytes - avail);
return p;
@@ -242,7 +251,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
static __be32
nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
- struct iattr *iattr, struct nfs4_acl **acl)
+ struct iattr *iattr, struct nfs4_acl **acl,
+ struct xdr_netobj *label)
{
int expected_len, len = 0;
u32 dummy32;
@@ -380,6 +390,32 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
goto xdr_error;
}
}
+
+ label->len = 0;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+ READ_BUF(4);
+ len += 4;
+ READ32(dummy32); /* lfs: we don't use it */
+ READ_BUF(4);
+ len += 4;
+ READ32(dummy32); /* pi: we don't use it either */
+ READ_BUF(4);
+ len += 4;
+ READ32(dummy32);
+ READ_BUF(dummy32);
+ if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN)
+ return nfserr_badlabel;
+ len += (XDR_QUADLEN(dummy32) << 2);
+ READMEM(buf, dummy32);
+ label->data = kzalloc(dummy32 + 1, GFP_KERNEL);
+ if (!label->data)
+ return nfserr_jukebox;
+ defer_free(argp, kfree, label->data);
+ memcpy(label->data, buf, dummy32);
+ }
+#endif
+
if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
|| bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
|| bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
@@ -428,7 +464,11 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
/* callback_sec_params4 */
READ_BUF(4);
READ32(nr_secflavs);
- cbs->flavor = (u32)(-1);
+ if (nr_secflavs)
+ cbs->flavor = (u32)(-1);
+ else
+ /* Is this legal? Be generous, take it to mean AUTH_NONE: */
+ cbs->flavor = 0;
for (i = 0; i < nr_secflavs; ++i) {
READ_BUF(4);
READ32(dummy);
@@ -576,7 +616,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
return status;
status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
- &create->cr_acl);
+ &create->cr_acl, &create->cr_label);
if (status)
goto out;
@@ -827,7 +867,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
case NFS4_CREATE_UNCHECKED:
case NFS4_CREATE_GUARDED:
status = nfsd4_decode_fattr(argp, open->op_bmval,
- &open->op_iattr, &open->op_acl);
+ &open->op_iattr, &open->op_acl, &open->op_label);
if (status)
goto out;
break;
@@ -841,7 +881,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ_BUF(NFS4_VERIFIER_SIZE);
COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
status = nfsd4_decode_fattr(argp, open->op_bmval,
- &open->op_iattr, &open->op_acl);
+ &open->op_iattr, &open->op_acl, &open->op_label);
if (status)
goto out;
break;
@@ -1063,7 +1103,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
if (status)
return status;
return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
- &setattr->sa_acl);
+ &setattr->sa_acl, &setattr->sa_label);
}
static __be32
@@ -1567,6 +1607,7 @@ struct nfsd4_minorversion_ops {
static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
[0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
[1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
+ [2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
};
static __be32
@@ -1953,6 +1994,36 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
FATTR4_WORD0_RDATTR_ERROR)
#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+static inline __be32
+nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+{
+ __be32 *p = *pp;
+
+ if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4))
+ return nfserr_resource;
+
+ /*
+ * For now we use a 0 here to indicate the null translation; in
+ * the future we may place a call to translation code here.
+ */
+ if ((*buflen -= 8) < 0)
+ return nfserr_resource;
+
+ WRITE32(0); /* lfs */
+ WRITE32(0); /* pi */
+ p = xdr_encode_opaque(p, context, len);
+ *buflen -= (XDR_QUADLEN(len) << 2) + 4;
+
+ *pp = p;
+ return 0;
+}
+#else
+static inline __be32
+nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+{ return 0; }
+#endif
+
static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
{
/* As per referral draft: */
@@ -2012,6 +2083,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
int err;
int aclsupport = 0;
struct nfs4_acl *acl = NULL;
+ void *context = NULL;
+ int contextlen;
+ bool contextsupport = false;
struct nfsd4_compoundres *resp = rqstp->rq_resp;
u32 minorversion = resp->cstate.minorversion;
struct path path = {
@@ -2065,6 +2139,21 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
}
}
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+ bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ err = security_inode_getsecctx(dentry->d_inode,
+ &context, &contextlen);
+ contextsupport = (err == 0);
+ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ if (err == -EOPNOTSUPP)
+ bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+ else if (err)
+ goto out_nfserr;
+ }
+ }
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
+
if (bmval2) {
if ((buflen -= 16) < 0)
goto out_resource;
@@ -2093,6 +2182,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (!aclsupport)
word0 &= ~FATTR4_WORD0_ACL;
+ if (!contextsupport)
+ word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
if (!word2) {
if ((buflen -= 12) < 0)
goto out_resource;
@@ -2400,6 +2491,12 @@ out_acl:
get_parent_attributes(exp, &stat);
WRITE64(stat.ino);
}
+ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ status = nfsd4_encode_security_label(rqstp, context,
+ contextlen, &p, &buflen);
+ if (status)
+ goto out;
+ }
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
WRITE32(3);
WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
@@ -2412,6 +2509,10 @@ out_acl:
status = nfs_ok;
out:
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if (context)
+ security_release_secctx(context, contextlen);
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
kfree(acl);
if (fhp == &tempfh)
fh_put(&tempfh);
@@ -3176,16 +3277,18 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
{
__be32 *p;
- RESERVE_SPACE(12);
+ RESERVE_SPACE(16);
if (nfserr) {
- WRITE32(2);
+ WRITE32(3);
+ WRITE32(0);
WRITE32(0);
WRITE32(0);
}
else {
- WRITE32(2);
+ WRITE32(3);
WRITE32(setattr->sa_bmval[0]);
WRITE32(setattr->sa_bmval[1]);
+ WRITE32(setattr->sa_bmval[2]);
}
ADJUST_ARGS();
return nfserr;
@@ -3226,6 +3329,14 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
return nfserr;
}
+static const u32 nfs4_minimal_spo_must_enforce[2] = {
+ [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+ 1 << (OP_EXCHANGE_ID - 32) |
+ 1 << (OP_CREATE_SESSION - 32) |
+ 1 << (OP_DESTROY_SESSION - 32) |
+ 1 << (OP_DESTROY_CLIENTID - 32)
+};
+
static __be32
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_exchange_id *exid)
@@ -3264,6 +3375,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
/* state_protect4_r. Currently only support SP4_NONE */
BUG_ON(exid->spa_how != SP4_NONE);
WRITE32(exid->spa_how);
+ switch (exid->spa_how) {
+ case SP4_NONE:
+ break;
+ case SP4_MACH_CRED:
+ /* spo_must_enforce bitmap: */
+ WRITE32(2);
+ WRITE32(nfs4_minimal_spo_must_enforce[0]);
+ WRITE32(nfs4_minimal_spo_must_enforce[1]);
+ /* empty spo_must_allow bitmap: */
+ WRITE32(0);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
/* The server_owner struct */
WRITE64(minor_id); /* Minor id */
@@ -3635,13 +3760,17 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
BUG_ON(iov->iov_len > PAGE_SIZE);
if (nfsd4_has_session(cs)) {
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfs4_client *clp = cs->session->se_client;
if (cs->status != nfserr_replay_cache) {
nfsd4_store_cache_entry(resp);
cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
}
/* Renew the clientid on success and on replay */
- put_client_renew(cs->session->se_client);
+ spin_lock(&nn->client_lock);
nfsd4_put_session(cs->session);
+ spin_unlock(&nn->client_lock);
+ put_client_renew(clp);
}
return 1;
}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 07a473fd49bc..30f34ab02137 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,7 +24,7 @@
/*
* nfsd version
*/
-#define NFSD_SUPPORTED_MINOR_VERSION 1
+#define NFSD_SUPPORTED_MINOR_VERSION 2
/*
* Maximum blocksizes supported by daemon under various circumstances.
*/
@@ -53,7 +53,6 @@ struct readdir_cd {
extern struct svc_program nfsd_program;
extern struct svc_version nfsd_version2, nfsd_version3,
nfsd_version4;
-extern u32 nfsd_supported_minorversion;
extern struct mutex nfsd_mutex;
extern spinlock_t nfsd_drc_lock;
extern unsigned long nfsd_drc_max_mem;
@@ -243,6 +242,12 @@ void nfsd_lockd_shutdown(void);
#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG)
#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT)
#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
+#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
+#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
+#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
+#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
+#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS)
+#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL)
/* error codes for internal use */
/* if a request fails due to kmalloc failure, it gets dropped.
@@ -322,6 +327,13 @@ void nfsd_lockd_shutdown(void);
#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
+#else
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0
+#endif
+
static inline u32 nfsd_suppattrs0(u32 minorversion)
{
return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
@@ -336,8 +348,11 @@ static inline u32 nfsd_suppattrs1(u32 minorversion)
static inline u32 nfsd_suppattrs2(u32 minorversion)
{
- return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2
- : NFSD4_SUPPORTED_ATTRS_WORD2;
+ switch (minorversion) {
+ default: return NFSD4_2_SUPPORTED_ATTRS_WORD2;
+ case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2;
+ case 0: return NFSD4_SUPPORTED_ATTRS_WORD2;
+ }
}
/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
@@ -350,7 +365,11 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
#define NFSD_WRITEABLE_ATTRS_WORD1 \
(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL
+#else
#define NFSD_WRITEABLE_ATTRS_WORD2 0
+#endif
#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
NFSD_WRITEABLE_ATTRS_WORD0
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 262df5ccbf59..760c85a6f534 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -116,7 +116,10 @@ struct svc_program nfsd_program = {
};
-u32 nfsd_supported_minorversion;
+static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
+ [0] = 1,
+ [1] = 1,
+};
int nfsd_vers(int vers, enum vers_op change)
{
@@ -151,15 +154,13 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
return -1;
switch(change) {
case NFSD_SET:
- nfsd_supported_minorversion = minorversion;
+ nfsd_supported_minorversions[minorversion] = true;
break;
case NFSD_CLEAR:
- if (minorversion == 0)
- return -1;
- nfsd_supported_minorversion = minorversion - 1;
+ nfsd_supported_minorversions[minorversion] = false;
break;
case NFSD_TEST:
- return minorversion <= nfsd_supported_minorversion;
+ return nfsd_supported_minorversions[minorversion];
case NFSD_AVAIL:
return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 274e2a114e05..424d8f5f2317 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -246,6 +246,7 @@ struct nfs4_client {
nfs4_verifier cl_verifier; /* generated by client */
time_t cl_time; /* time of last lease renewal */
struct sockaddr_storage cl_addr; /* client ipaddress */
+ bool cl_mach_cred; /* SP4_MACH_CRED in force */
struct svc_cred cl_cred; /* setclientid principal */
clientid_t cl_clientid; /* generated by server */
nfs4_verifier cl_confirm; /* generated by server */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a6bc8a7423db..8ff6a0019b0b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -28,6 +28,7 @@
#include <asm/uaccess.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
+#include <linux/security.h>
#ifdef CONFIG_NFSD_V3
#include "xdr3.h"
@@ -621,6 +622,33 @@ int nfsd4_is_junction(struct dentry *dentry)
return 0;
return 1;
}
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct xdr_netobj *label)
+{
+ __be32 error;
+ int host_error;
+ struct dentry *dentry;
+
+ error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
+ if (error)
+ return error;
+
+ dentry = fhp->fh_dentry;
+
+ mutex_lock(&dentry->d_inode->i_mutex);
+ host_error = security_inode_setsecctx(dentry, label->data, label->len);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ return nfserrno(host_error);
+}
+#else
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct xdr_netobj *label)
+{
+ return nfserr_notsupp;
+}
+#endif
+
#endif /* defined(CONFIG_NFSD_V4) */
#ifdef CONFIG_NFSD_V3
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 5b5894159f22..a4be2e389670 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -39,7 +39,6 @@
typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
/* nfsd/vfs.c */
-int fh_lock_parent(struct svc_fh *, struct dentry *);
int nfsd_racache_init(int);
void nfsd_racache_shutdown(void);
int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
@@ -56,6 +55,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *);
__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
struct nfs4_acl *);
int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
+ struct xdr_netobj *);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
@@ -92,17 +93,13 @@ __be32 nfsd_remove(struct svc_rqst *,
struct svc_fh *, char *, int);
__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
char *name, int len);
-int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
- unsigned long size);
__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
loff_t *, struct readdir_cd *, filldir_t);
__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
struct kstatfs *, int access);
-int nfsd_notify_change(struct inode *, struct iattr *);
__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
struct dentry *, int);
-int nfsd_sync_dir(struct dentry *dp);
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 3b271d2092b6..b3ed6446ed8e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -40,6 +40,7 @@
#include "state.h"
#include "nfsd.h"
+#define NFSD4_MAX_SEC_LABEL_LEN 2048
#define NFSD4_MAX_TAGLEN 128
#define XDR_LEN(n) (((n) + 3) & ~3)
@@ -118,6 +119,7 @@ struct nfsd4_create {
struct iattr cr_iattr; /* request */
struct nfsd4_change_info cr_cinfo; /* response */
struct nfs4_acl *cr_acl;
+ struct xdr_netobj cr_label;
};
#define cr_linklen u.link.namelen
#define cr_linkname u.link.name
@@ -246,6 +248,7 @@ struct nfsd4_open {
struct nfs4_file *op_file; /* used during processing */
struct nfs4_ol_stateid *op_stp; /* used during processing */
struct nfs4_acl *op_acl;
+ struct xdr_netobj op_label;
};
#define op_iattr iattr
@@ -330,6 +333,7 @@ struct nfsd4_setattr {
u32 sa_bmval[3]; /* request */
struct iattr sa_iattr; /* request */
struct nfs4_acl *sa_acl;
+ struct xdr_netobj sa_label;
};
struct nfsd4_setclientid {
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1427de5ebf4d..af3ba0478cdf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -996,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
static int nilfs_tree_was_touched(struct dentry *root_dentry)
{
- return root_dentry->d_count > 1;
+ return d_count(root_dentry) > 1;
}
/**
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 2bfe6dc413a0..1fedd5f7ccc4 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1;
static struct kmem_cache *dnotify_struct_cache __read_mostly;
static struct kmem_cache *dnotify_mark_cache __read_mostly;
static struct fsnotify_group *dnotify_group __read_mostly;
-static DEFINE_MUTEX(dnotify_mark_mutex);
/*
* dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
- mutex_lock(&dnotify_mark_mutex);
+ mutex_lock(&dnotify_group->mark_mutex);
spin_lock(&fsn_mark->lock);
prev = &dn_mark->dn;
@@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
spin_unlock(&fsn_mark->lock);
- /* nothing else could have found us thanks to the dnotify_mark_mutex */
+ /* nothing else could have found us thanks to the dnotify_groups
+ mark_mutex */
if (dn_mark->dn == NULL)
- fsnotify_destroy_mark(fsn_mark, dnotify_group);
+ fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
- mutex_unlock(&dnotify_mark_mutex);
+ mutex_unlock(&dnotify_group->mark_mutex);
fsnotify_put_mark(fsn_mark);
}
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
new_dn_mark->dn = NULL;
/* this is needed to prevent the fcntl/close race described below */
- mutex_lock(&dnotify_mark_mutex);
+ mutex_lock(&dnotify_group->mark_mutex);
/* add the new_fsn_mark or find an old one. */
fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
@@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
spin_lock(&fsn_mark->lock);
} else {
- fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
+ fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
+ NULL, 0);
spin_lock(&new_fsn_mark->lock);
fsn_mark = new_fsn_mark;
dn_mark = new_dn_mark;
@@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
/* if (f != filp) means that we lost a race and another task/thread
* actually closed the fd we are still playing with before we grabbed
- * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the
- * only time we clean up the marks we need to get our mark off
- * the list. */
+ * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the
+ * fd is the only time we clean up the marks we need to get our mark
+ * off the list. */
if (f != filp) {
/* if we added ourselves, shoot ourselves, it's possible that
* the flush actually did shoot this fsn_mark. That's fine too
@@ -385,9 +386,9 @@ out:
spin_unlock(&fsn_mark->lock);
if (destroy)
- fsnotify_destroy_mark(fsn_mark, dnotify_group);
+ fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
- mutex_unlock(&dnotify_mark_mutex);
+ mutex_unlock(&dnotify_group->mark_mutex);
fsnotify_put_mark(fsn_mark);
out_err:
if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 1ea52f7c031f..e44cb6427df3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -122,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
metadata->event_len = FAN_EVENT_METADATA_LEN;
metadata->metadata_len = FAN_EVENT_METADATA_LEN;
metadata->vers = FANOTIFY_METADATA_VERSION;
+ metadata->reserved = 0;
metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
metadata->pid = pid_vnr(event->tgid);
if (unlikely(event->mask & FAN_Q_OVERFLOW))
@@ -523,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
__u32 removed;
int destroy_mark;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
- if (!fsn_mark)
+ if (!fsn_mark) {
+ mutex_unlock(&group->mark_mutex);
return -ENOENT;
+ }
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark(fsn_mark, group);
+ fsnotify_destroy_mark_locked(fsn_mark, group);
+ mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -547,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
__u32 removed;
int destroy_mark;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_inode_mark(group, inode);
- if (!fsn_mark)
+ if (!fsn_mark) {
+ mutex_unlock(&group->mark_mutex);
return -ENOENT;
+ }
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark(fsn_mark, group);
+ fsnotify_destroy_mark_locked(fsn_mark, group);
+ mutex_unlock(&group->mark_mutex);
+
/* matches the fsnotify_find_inode_mark() */
fsnotify_put_mark(fsn_mark);
if (removed & inode->i_fsnotify_mask)
@@ -590,35 +600,55 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
return mask & ~oldmask;
}
+static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
+ struct inode *inode,
+ struct vfsmount *mnt)
+{
+ struct fsnotify_mark *mark;
+ int ret;
+
+ if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+ return ERR_PTR(-ENOSPC);
+
+ mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+ if (!mark)
+ return ERR_PTR(-ENOMEM);
+
+ fsnotify_init_mark(mark, fanotify_free_mark);
+ ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+ if (ret) {
+ fsnotify_put_mark(mark);
+ return ERR_PTR(ret);
+ }
+
+ return mark;
+}
+
+
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
struct vfsmount *mnt, __u32 mask,
unsigned int flags)
{
struct fsnotify_mark *fsn_mark;
__u32 added;
- int ret = 0;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
if (!fsn_mark) {
- if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
- return -ENOSPC;
-
- fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
- if (!fsn_mark)
- return -ENOMEM;
-
- fsnotify_init_mark(fsn_mark, fanotify_free_mark);
- ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
- if (ret)
- goto err;
+ fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
+ if (IS_ERR(fsn_mark)) {
+ mutex_unlock(&group->mark_mutex);
+ return PTR_ERR(fsn_mark);
+ }
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+ mutex_unlock(&group->mark_mutex);
if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
fsnotify_recalc_vfsmount_mask(mnt);
-err:
+
fsnotify_put_mark(fsn_mark);
- return ret;
+ return 0;
}
static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -627,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
{
struct fsnotify_mark *fsn_mark;
__u32 added;
- int ret = 0;
pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -641,27 +670,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
(atomic_read(&inode->i_writecount) > 0))
return 0;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_inode_mark(group, inode);
if (!fsn_mark) {
- if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
- return -ENOSPC;
-
- fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
- if (!fsn_mark)
- return -ENOMEM;
-
- fsnotify_init_mark(fsn_mark, fanotify_free_mark);
- ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
- if (ret)
- goto err;
+ fsn_mark = fanotify_add_new_mark(group, inode, NULL);
+ if (IS_ERR(fsn_mark)) {
+ mutex_unlock(&group->mark_mutex);
+ return PTR_ERR(fsn_mark);
+ }
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+ mutex_unlock(&group->mark_mutex);
if (added & ~inode->i_fsnotify_mask)
fsnotify_recalc_inode_mask(inode);
-err:
+
fsnotify_put_mark(fsn_mark);
- return ret;
+ return 0;
}
/* fanotify syscalls */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 959815c1e017..60f954a891ab 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -636,7 +636,8 @@ static int inotify_new_watch(struct fsnotify_group *group,
goto out_err;
/* we are on the idr, now get on the inode */
- ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
+ ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
+ NULL, 0);
if (ret) {
/* we failed to get on the inode, get off the idr */
inotify_remove_from_idr(group, tmp_i_mark);
@@ -660,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
{
int ret = 0;
-retry:
+ mutex_lock(&group->mark_mutex);
/* try to update and existing watch with the new arg */
ret = inotify_update_existing_watch(group, inode, arg);
/* no mark present, try to add a new one */
if (ret == -ENOENT)
ret = inotify_new_watch(group, inode, arg);
- /*
- * inotify_new_watch could race with another thread which did an
- * inotify_new_watch between the update_existing and the add watch
- * here, go back and try to update an existing mark again.
- */
- if (ret == -EEXIST)
- goto retry;
+ mutex_unlock(&group->mark_mutex);
return ret;
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc6b49bf7360..923fe4a5f503 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -20,28 +20,29 @@
* fsnotify inode mark locking/lifetime/and refcnting
*
* REFCNT:
- * The mark->refcnt tells how many "things" in the kernel currently are
- * referencing this object. The object typically will live inside the kernel
- * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
- * which can find this object holding the appropriete locks, can take a reference
- * and the object itself is guaranteed to survive until the reference is dropped.
+ * The group->recnt and mark->refcnt tell how many "things" in the kernel
+ * currently are referencing the objects. Both kind of objects typically will
+ * live inside the kernel with a refcnt of 2, one for its creation and one for
+ * the reference a group and a mark hold to each other.
+ * If you are holding the appropriate locks, you can take a reference and the
+ * object itself is guaranteed to survive until the reference is dropped.
*
* LOCKING:
- * There are 3 spinlocks involved with fsnotify inode marks and they MUST
- * be taken in order as follows:
+ * There are 3 locks involved with fsnotify inode marks and they MUST be taken
+ * in order as follows:
*
+ * group->mark_mutex
* mark->lock
- * group->mark_lock
* inode->i_lock
*
- * mark->lock protects 2 things, mark->group and mark->inode. You must hold
- * that lock to dereference either of these things (they could be NULL even with
- * the lock)
- *
- * group->mark_lock protects the marks_list anchored inside a given group
- * and each mark is hooked via the g_list. It also sorta protects the
- * free_g_list, which when used is anchored by a private list on the stack of the
- * task which held the group->mark_lock.
+ * group->mark_mutex protects the marks_list anchored inside a given group and
+ * each mark is hooked via the g_list. It also protects the groups private
+ * data (i.e group limits).
+
+ * mark->lock protects the marks attributes like its masks and flags.
+ * Furthermore it protects the access to a reference of the group that the mark
+ * is assigned to as well as the access to a reference of the inode/vfsmount
+ * that is being watched by the mark.
*
* inode->i_lock protects the i_fsnotify_marks list anchored inside a
* given inode and each mark is hooked via the i_list. (and sorta the
@@ -64,18 +65,11 @@
* inode. We take i_lock and walk the i_fsnotify_marks safely. For each
* mark on the list we take a reference (so the mark can't disappear under us).
* We remove that mark form the inode's list of marks and we add this mark to a
- * private list anchored on the stack using i_free_list; At this point we no
- * longer fear anything finding the mark using the inode's list of marks.
- *
- * We can safely and locklessly run the private list on the stack of everything
- * we just unattached from the original inode. For each mark on the private list
- * we grab the mark-> and can thus dereference mark->group and mark->inode. If
- * we see the group and inode are not NULL we take those locks. Now holding all
- * 3 locks we can completely remove the mark from other tasks finding it in the
- * future. Remember, 10 things might already be referencing this mark, but they
- * better be holding a ref. We drop our reference we took before we unhooked it
- * from the inode. When the ref hits 0 we can free the mark.
- *
+ * private list anchored on the stack using i_free_list; we walk i_free_list
+ * and before we destroy the mark we make sure that we dont race with a
+ * concurrent destroy_group by getting a ref to the marks group and taking the
+ * groups mutex.
+
* Very similarly for freeing by group, except we use free_g_list.
*
* This has the very interesting property of being able to run concurrently with
diff --git a/fs/open.c b/fs/open.c
index fca72c4d3f17..d53e29895082 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -840,10 +840,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
if (flags & __O_SYNC)
flags |= O_DSYNC;
- if (flags & O_TMPFILE) {
- if (!(flags & O_CREAT))
+ if (flags & __O_TMPFILE) {
+ if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
return -EINVAL;
acc_mode = MAY_OPEN | ACC_MODE(flags);
+ if (!(acc_mode & MAY_WRITE))
+ return -EINVAL;
} else if (flags & O_PATH) {
/*
* If we have O_PATH in the open flag. Then we
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 28503172f2e4..a1a16eb97c7b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -223,7 +223,7 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
* regions in the 1st kernel pointed to by PT_LOAD entries) into
* virtually contiguous user-space in ELF layout.
*/
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) && !defined(CONFIG_S390)
static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
{
size_t size = vma->vm_end - vma->vm_start;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3e64169ef527..fbad622841f9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2585,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write,
return proc_dointvec(table, write, buffer, lenp, ppos);
}
-static ctl_table fs_dqstats_table[] = {
+static struct ctl_table fs_dqstats_table[] = {
{
.procname = "lookups",
.data = &dqstats.stat[DQST_LOOKUPS],
@@ -2654,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = {
{ },
};
-static ctl_table fs_table[] = {
+static struct ctl_table fs_table[] = {
{
.procname = "quota",
.mode = 0555,
@@ -2663,7 +2663,7 @@ static ctl_table fs_table[] = {
{ },
};
-static ctl_table sys_table[] = {
+static struct ctl_table sys_table[] = {
{
.procname = "fs",
.mode = 0555,
diff --git a/fs/select.c b/fs/select.c
index 6b14dc7df3a4..35d4adc749d9 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -28,6 +28,7 @@
#include <linux/hrtimer.h>
#include <linux/sched/rt.h>
#include <linux/freezer.h>
+#include <net/busy_poll.h>
#include <asm/uaccess.h>
@@ -386,9 +387,10 @@ get_max:
#define POLLEX_SET (POLLPRI)
static inline void wait_key_set(poll_table *wait, unsigned long in,
- unsigned long out, unsigned long bit)
+ unsigned long out, unsigned long bit,
+ unsigned int ll_flag)
{
- wait->_key = POLLEX_SET;
+ wait->_key = POLLEX_SET | ll_flag;
if (in & bit)
wait->_key |= POLLIN_SET;
if (out & bit)
@@ -402,6 +404,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
poll_table *wait;
int retval, i, timed_out = 0;
unsigned long slack = 0;
+ unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+ unsigned long busy_end = 0;
rcu_read_lock();
retval = max_select_fd(n, fds);
@@ -424,6 +428,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
retval = 0;
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+ bool can_busy_loop = false;
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -451,7 +456,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
f_op = f.file->f_op;
mask = DEFAULT_POLLMASK;
if (f_op && f_op->poll) {
- wait_key_set(wait, in, out, bit);
+ wait_key_set(wait, in, out,
+ bit, busy_flag);
mask = (*f_op->poll)(f.file, wait);
}
fdput(f);
@@ -470,6 +476,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
retval++;
wait->_qproc = NULL;
}
+ /* got something, stop busy polling */
+ if (retval) {
+ can_busy_loop = false;
+ busy_flag = 0;
+
+ /*
+ * only remember a returned
+ * POLL_BUSY_LOOP if we asked for it
+ */
+ } else if (busy_flag & mask)
+ can_busy_loop = true;
+
}
}
if (res_in)
@@ -488,6 +506,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
break;
}
+ /* only if found POLL_BUSY_LOOP sockets && not out of time */
+ if (can_busy_loop && !need_resched()) {
+ if (!busy_end) {
+ busy_end = busy_loop_end_time();
+ continue;
+ }
+ if (!busy_loop_timeout(busy_end))
+ continue;
+ }
+ busy_flag = 0;
+
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
@@ -719,7 +748,9 @@ struct poll_list {
* pwait poll_table will be used by the fd-provided poll handler for waiting,
* if pwait->_qproc is non-NULL.
*/
-static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
+ bool *can_busy_poll,
+ unsigned int busy_flag)
{
unsigned int mask;
int fd;
@@ -733,7 +764,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
mask = DEFAULT_POLLMASK;
if (f.file->f_op && f.file->f_op->poll) {
pwait->_key = pollfd->events|POLLERR|POLLHUP;
+ pwait->_key |= busy_flag;
mask = f.file->f_op->poll(f.file, pwait);
+ if (mask & busy_flag)
+ *can_busy_poll = true;
}
/* Mask out unneeded events. */
mask &= pollfd->events | POLLERR | POLLHUP;
@@ -752,6 +786,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
unsigned long slack = 0;
+ unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+ unsigned long busy_end = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -764,6 +800,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
for (;;) {
struct poll_list *walk;
+ bool can_busy_loop = false;
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
@@ -778,9 +815,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
* this. They'll get immediately deregistered
* when we break out and return.
*/
- if (do_pollfd(pfd, pt)) {
+ if (do_pollfd(pfd, pt, &can_busy_loop,
+ busy_flag)) {
count++;
pt->_qproc = NULL;
+ /* found something, stop busy polling */
+ busy_flag = 0;
+ can_busy_loop = false;
}
}
}
@@ -797,6 +838,17 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
if (count || timed_out)
break;
+ /* only if found POLL_BUSY_LOOP sockets && not out of time */
+ if (can_busy_loop && !need_resched()) {
+ if (!busy_end) {
+ busy_end = busy_loop_end_time();
+ continue;
+ }
+ if (!busy_loop_timeout(busy_end))
+ continue;
+ }
+ busy_flag = 0;
+
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 774c1eb7f1c9..3135c2525c76 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -921,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v,
return rcu_dereference(node->next);
}
EXPORT_SYMBOL(seq_hlist_next_rcu);
+
+/**
+ * seq_hlist_start_precpu - start an iteration of a percpu hlist array
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu: pointer to cpu "cursor"
+ * @pos: start position of sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *
+seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
+{
+ struct hlist_node *node;
+
+ for_each_possible_cpu(*cpu) {
+ hlist_for_each(node, per_cpu_ptr(head, *cpu)) {
+ if (pos-- == 0)
+ return node;
+ }
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_percpu);
+
+/**
+ * seq_hlist_next_percpu - move to the next position of the percpu hlist array
+ * @v: pointer to current hlist_node
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu: pointer to cpu "cursor"
+ * @pos: start position of sequence
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *
+seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
+ int *cpu, loff_t *pos)
+{
+ struct hlist_node *node = v;
+
+ ++*pos;
+
+ if (node->next)
+ return node->next;
+
+ for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids;
+ *cpu = cpumask_next(*cpu, cpu_possible_mask)) {
+ struct hlist_head *bucket = per_cpu_ptr(head, *cpu);
+
+ if (!hlist_empty(bucket))
+ return bucket->first;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_next_percpu);
diff --git a/fs/super.c b/fs/super.c
index 7465d4364208..68307c029228 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -336,19 +336,19 @@ EXPORT_SYMBOL(deactivate_super);
* and want to turn it into a full-blown active reference. grab_super()
* is called with sb_lock held and drops it. Returns 1 in case of
* success, 0 if we had failed (superblock contents was already dead or
- * dying when grab_super() had been called).
+ * dying when grab_super() had been called). Note that this is only
+ * called for superblocks not in rundown mode (== ones still on ->fs_supers
+ * of their type), so increment of ->s_count is OK here.
*/
static int grab_super(struct super_block *s) __releases(sb_lock)
{
- if (atomic_inc_not_zero(&s->s_active)) {
- spin_unlock(&sb_lock);
- return 1;
- }
- /* it's going away */
s->s_count++;
spin_unlock(&sb_lock);
- /* wait for it to die */
down_write(&s->s_umount);
+ if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) {
+ put_super(s);
+ return 1;
+ }
up_write(&s->s_umount);
put_super(s);
return 0;
@@ -463,11 +463,6 @@ retry:
destroy_super(s);
s = NULL;
}
- down_write(&old->s_umount);
- if (unlikely(!(old->s_flags & MS_BORN))) {
- deactivate_locked_super(old);
- goto retry;
- }
return old;
}
}
@@ -660,10 +655,10 @@ restart:
if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_bdev == bdev) {
- if (grab_super(sb)) /* drops sb_lock */
- return sb;
- else
+ if (!grab_super(sb))
goto restart;
+ up_write(&sb->s_umount);
+ return sb;
}
}
spin_unlock(&sb_lock);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index aec3d5c98c94..09a1a25cd145 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -20,38 +20,64 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
const struct attribute_group *grp)
{
struct attribute *const* attr;
- int i;
+ struct bin_attribute *const* bin_attr;
- for (i = 0, attr = grp->attrs; *attr; i++, attr++)
- sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+ if (grp->attrs)
+ for (attr = grp->attrs; *attr; attr++)
+ sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+ if (grp->bin_attrs)
+ for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
+ sysfs_remove_bin_file(kobj, *bin_attr);
}
static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
const struct attribute_group *grp, int update)
{
struct attribute *const* attr;
+ struct bin_attribute *const* bin_attr;
int error = 0, i;
- for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
- umode_t mode = 0;
+ if (grp->attrs) {
+ for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
+ umode_t mode = 0;
+
+ /*
+ * In update mode, we're changing the permissions or
+ * visibility. Do this by first removing then
+ * re-adding (if required) the file.
+ */
+ if (update)
+ sysfs_hash_and_remove(dir_sd, NULL,
+ (*attr)->name);
+ if (grp->is_visible) {
+ mode = grp->is_visible(kobj, *attr, i);
+ if (!mode)
+ continue;
+ }
+ error = sysfs_add_file_mode(dir_sd, *attr,
+ SYSFS_KOBJ_ATTR,
+ (*attr)->mode | mode);
+ if (unlikely(error))
+ break;
+ }
+ if (error) {
+ remove_files(dir_sd, kobj, grp);
+ goto exit;
+ }
+ }
- /* in update mode, we're changing the permissions or
- * visibility. Do this by first removing then
- * re-adding (if required) the file */
- if (update)
- sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
- if (grp->is_visible) {
- mode = grp->is_visible(kobj, *attr, i);
- if (!mode)
- continue;
+ if (grp->bin_attrs) {
+ for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+ if (update)
+ sysfs_remove_bin_file(kobj, *bin_attr);
+ error = sysfs_create_bin_file(kobj, *bin_attr);
+ if (error)
+ break;
}
- error = sysfs_add_file_mode(dir_sd, *attr, SYSFS_KOBJ_ATTR,
- (*attr)->mode | mode);
- if (unlikely(error))
- break;
+ if (error)
+ remove_files(dir_sd, kobj, grp);
}
- if (error)
- remove_files(dir_sd, kobj, grp);
+exit:
return error;
}
@@ -67,8 +93,8 @@ static int internal_create_group(struct kobject *kobj, int update,
/* Updates may happen before the object has been instantiated */
if (unlikely(update && !kobj->sd))
return -EINVAL;
- if (!grp->attrs) {
- WARN(1, "sysfs: attrs not set by subsystem for group: %s/%s\n",
+ if (!grp->attrs && !grp->bin_attrs) {
+ WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
kobj->name, grp->name ? "" : grp->name);
return -EINVAL;
}
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 32b644f03690..929312180dd0 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -8,6 +8,7 @@
*
*/
+#include <linux/alarmtimer.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
@@ -26,7 +27,10 @@
#include <linux/rcupdate.h>
struct timerfd_ctx {
- struct hrtimer tmr;
+ union {
+ struct hrtimer tmr;
+ struct alarm alarm;
+ } t;
ktime_t tintv;
ktime_t moffs;
wait_queue_head_t wqh;
@@ -41,14 +45,19 @@ struct timerfd_ctx {
static LIST_HEAD(cancel_list);
static DEFINE_SPINLOCK(cancel_lock);
+static inline bool isalarm(struct timerfd_ctx *ctx)
+{
+ return ctx->clockid == CLOCK_REALTIME_ALARM ||
+ ctx->clockid == CLOCK_BOOTTIME_ALARM;
+}
+
/*
* This gets called when the timer event triggers. We set the "expired"
* flag, but we do not re-arm the timer (in case it's necessary,
* tintv.tv64 != 0) until the timer is accessed.
*/
-static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+static void timerfd_triggered(struct timerfd_ctx *ctx)
{
- struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
unsigned long flags;
spin_lock_irqsave(&ctx->wqh.lock, flags);
@@ -56,10 +65,25 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
ctx->ticks++;
wake_up_locked(&ctx->wqh);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+}
+static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+{
+ struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx,
+ t.tmr);
+ timerfd_triggered(ctx);
return HRTIMER_NORESTART;
}
+static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
+ ktime_t now)
+{
+ struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
+ t.alarm);
+ timerfd_triggered(ctx);
+ return ALARMTIMER_NORESTART;
+}
+
/*
* Called when the clock was set to cancel the timers in the cancel
* list. This will wake up processes waiting on these timers. The
@@ -107,8 +131,9 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
{
- if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
- (flags & TFD_TIMER_CANCEL_ON_SET)) {
+ if ((ctx->clockid == CLOCK_REALTIME ||
+ ctx->clockid == CLOCK_REALTIME_ALARM) &&
+ (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
if (!ctx->might_cancel) {
ctx->might_cancel = true;
spin_lock(&cancel_lock);
@@ -124,7 +149,11 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
{
ktime_t remaining;
- remaining = hrtimer_expires_remaining(&ctx->tmr);
+ if (isalarm(ctx))
+ remaining = alarm_expires_remaining(&ctx->t.alarm);
+ else
+ remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+
return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
}
@@ -142,11 +171,28 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
ctx->expired = 0;
ctx->ticks = 0;
ctx->tintv = timespec_to_ktime(ktmr->it_interval);
- hrtimer_init(&ctx->tmr, clockid, htmode);
- hrtimer_set_expires(&ctx->tmr, texp);
- ctx->tmr.function = timerfd_tmrproc;
+
+ if (isalarm(ctx)) {
+ alarm_init(&ctx->t.alarm,
+ ctx->clockid == CLOCK_REALTIME_ALARM ?
+ ALARM_REALTIME : ALARM_BOOTTIME,
+ timerfd_alarmproc);
+ } else {
+ hrtimer_init(&ctx->t.tmr, clockid, htmode);
+ hrtimer_set_expires(&ctx->t.tmr, texp);
+ ctx->t.tmr.function = timerfd_tmrproc;
+ }
+
if (texp.tv64 != 0) {
- hrtimer_start(&ctx->tmr, texp, htmode);
+ if (isalarm(ctx)) {
+ if (flags & TFD_TIMER_ABSTIME)
+ alarm_start(&ctx->t.alarm, texp);
+ else
+ alarm_start_relative(&ctx->t.alarm, texp);
+ } else {
+ hrtimer_start(&ctx->t.tmr, texp, htmode);
+ }
+
if (timerfd_canceled(ctx))
return -ECANCELED;
}
@@ -158,7 +204,11 @@ static int timerfd_release(struct inode *inode, struct file *file)
struct timerfd_ctx *ctx = file->private_data;
timerfd_remove_cancel(ctx);
- hrtimer_cancel(&ctx->tmr);
+
+ if (isalarm(ctx))
+ alarm_cancel(&ctx->t.alarm);
+ else
+ hrtimer_cancel(&ctx->t.tmr);
kfree_rcu(ctx, rcu);
return 0;
}
@@ -215,9 +265,15 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
* callback to avoid DoS attacks specifying a very
* short timer period.
*/
- ticks += hrtimer_forward_now(&ctx->tmr,
- ctx->tintv) - 1;
- hrtimer_restart(&ctx->tmr);
+ if (isalarm(ctx)) {
+ ticks += alarm_forward_now(
+ &ctx->t.alarm, ctx->tintv) - 1;
+ alarm_restart(&ctx->t.alarm);
+ } else {
+ ticks += hrtimer_forward_now(&ctx->t.tmr,
+ ctx->tintv) - 1;
+ hrtimer_restart(&ctx->t.tmr);
+ }
}
ctx->expired = 0;
ctx->ticks = 0;
@@ -259,7 +315,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
if ((flags & ~TFD_CREATE_FLAGS) ||
(clockid != CLOCK_MONOTONIC &&
- clockid != CLOCK_REALTIME))
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_REALTIME_ALARM &&
+ clockid != CLOCK_BOOTTIME_ALARM))
return -EINVAL;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -268,7 +326,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
init_waitqueue_head(&ctx->wqh);
ctx->clockid = clockid;
- hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
+
+ if (isalarm(ctx))
+ alarm_init(&ctx->t.alarm,
+ ctx->clockid == CLOCK_REALTIME_ALARM ?
+ ALARM_REALTIME : ALARM_BOOTTIME,
+ timerfd_alarmproc);
+ else
+ hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
+
ctx->moffs = ktime_get_monotonic_offset();
ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
@@ -305,8 +371,14 @@ static int do_timerfd_settime(int ufd, int flags,
*/
for (;;) {
spin_lock_irq(&ctx->wqh.lock);
- if (hrtimer_try_to_cancel(&ctx->tmr) >= 0)
- break;
+
+ if (isalarm(ctx)) {
+ if (alarm_try_to_cancel(&ctx->t.alarm) >= 0)
+ break;
+ } else {
+ if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0)
+ break;
+ }
spin_unlock_irq(&ctx->wqh.lock);
cpu_relax();
}
@@ -317,8 +389,12 @@ static int do_timerfd_settime(int ufd, int flags,
* We do not update "ticks" and "expired" since the timer will be
* re-programmed again in the following timerfd_setup() call.
*/
- if (ctx->expired && ctx->tintv.tv64)
- hrtimer_forward_now(&ctx->tmr, ctx->tintv);
+ if (ctx->expired && ctx->tintv.tv64) {
+ if (isalarm(ctx))
+ alarm_forward_now(&ctx->t.alarm, ctx->tintv);
+ else
+ hrtimer_forward_now(&ctx->t.tmr, ctx->tintv);
+ }
old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
old->it_interval = ktime_to_timespec(ctx->tintv);
@@ -345,9 +421,18 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t)
spin_lock_irq(&ctx->wqh.lock);
if (ctx->expired && ctx->tintv.tv64) {
ctx->expired = 0;
- ctx->ticks +=
- hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1;
- hrtimer_restart(&ctx->tmr);
+
+ if (isalarm(ctx)) {
+ ctx->ticks +=
+ alarm_forward_now(
+ &ctx->t.alarm, ctx->tintv) - 1;
+ alarm_restart(&ctx->t.alarm);
+ } else {
+ ctx->ticks +=
+ hrtimer_forward_now(&ctx->t.tmr, ctx->tintv)
+ - 1;
+ hrtimer_restart(&ctx->t.tmr);
+ }
}
t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
t->it_interval = ktime_to_timespec(ctx->tintv);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 6313b69b6644..4a4508023a3c 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -71,6 +71,7 @@ xfs-y += xfs_alloc.o \
xfs_dir2_sf.o \
xfs_ialloc.o \
xfs_ialloc_btree.o \
+ xfs_icreate_item.o \
xfs_inode.o \
xfs_log_recover.o \
xfs_mount.o \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 5673bcfda2f0..71596e57283a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -175,6 +175,7 @@ xfs_alloc_compute_diff(
xfs_agblock_t wantbno, /* target starting block */
xfs_extlen_t wantlen, /* target length */
xfs_extlen_t alignment, /* target alignment */
+ char userdata, /* are we allocating data? */
xfs_agblock_t freebno, /* freespace's starting block */
xfs_extlen_t freelen, /* freespace's length */
xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -189,7 +190,14 @@ xfs_alloc_compute_diff(
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
wantend = wantbno + wantlen;
- if (freebno >= wantbno) {
+ /*
+ * We want to allocate from the start of a free extent if it is past
+ * the desired block or if we are allocating user data and the free
+ * extent is before desired block. The second case is there to allow
+ * for contiguous allocation from the remaining free space if the file
+ * grows in the short term.
+ */
+ if (freebno >= wantbno || (userdata && freeend < wantend)) {
if ((newbno1 = roundup(freebno, alignment)) >= freeend)
newbno1 = NULLAGBLOCK;
} else if (freeend >= wantend && alignment > 1) {
@@ -805,7 +813,8 @@ xfs_alloc_find_best_extent(
xfs_alloc_fix_len(args);
sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, *sbnoa,
+ args->alignment,
+ args->userdata, *sbnoa,
*slena, &new);
/*
@@ -976,7 +985,8 @@ restart:
if (args->len < blen)
continue;
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbnoa, ltlena, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
if (ltnew != NULLAGBLOCK &&
(args->len > blen || ltdiff < bdiff)) {
bdiff = ltdiff;
@@ -1128,7 +1138,8 @@ restart:
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbnoa, ltlena, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_lt, &bno_cur_gt,
@@ -1144,7 +1155,8 @@ restart:
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, gtbnoa, gtlena, &gtnew);
+ args->alignment, args->userdata, gtbnoa,
+ gtlena, &gtnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_gt, &bno_cur_lt,
@@ -1203,7 +1215,7 @@ restart:
}
rlen = args->len;
(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
- ltbnoa, ltlena, &ltnew);
+ args->userdata, ltbnoa, ltlena, &ltnew);
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 31d3cd129269..b800fbcafc7f 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -690,6 +690,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
sf = (xfs_attr_shortform_t *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+
bp = NULL;
error = xfs_da_grow_inode(args, &blkno);
if (error) {
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 89042848f9ec..05c698ccb238 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1161,6 +1161,24 @@ xfs_bmap_extents_to_btree(
* since the file data needs to get logged so things will stay consistent.
* (The bmap-level manipulations are ok, though).
*/
+void
+xfs_bmap_local_to_extents_empty(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_bytes == 0);
+ ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+
+ xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
+ ifp->if_flags &= ~XFS_IFINLINE;
+ ifp->if_flags |= XFS_IFEXTENTS;
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+}
+
+
STATIC int /* error */
xfs_bmap_local_to_extents(
xfs_trans_t *tp, /* transaction pointer */
@@ -1174,9 +1192,12 @@ xfs_bmap_local_to_extents(
struct xfs_inode *ip,
struct xfs_ifork *ifp))
{
- int error; /* error return value */
+ int error = 0;
int flags; /* logging flags returned */
xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_alloc_arg_t args; /* allocation arguments */
+ xfs_buf_t *bp; /* buffer for extent block */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
/*
* We don't want to deal with the case of keeping inode data inline yet.
@@ -1185,68 +1206,65 @@ xfs_bmap_local_to_extents(
ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+
+ if (!ifp->if_bytes) {
+ xfs_bmap_local_to_extents_empty(ip, whichfork);
+ flags = XFS_ILOG_CORE;
+ goto done;
+ }
+
flags = 0;
error = 0;
- if (ifp->if_bytes) {
- xfs_alloc_arg_t args; /* allocation arguments */
- xfs_buf_t *bp; /* buffer for extent block */
- xfs_bmbt_rec_host_t *ep;/* extent record pointer */
-
- ASSERT((ifp->if_flags &
- (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
- memset(&args, 0, sizeof(args));
- args.tp = tp;
- args.mp = ip->i_mount;
- args.firstblock = *firstblock;
- /*
- * Allocate a block. We know we need only one, since the
- * file currently fits in an inode.
- */
- if (*firstblock == NULLFSBLOCK) {
- args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
- args.type = XFS_ALLOCTYPE_START_BNO;
- } else {
- args.fsbno = *firstblock;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- }
- args.total = total;
- args.minlen = args.maxlen = args.prod = 1;
- error = xfs_alloc_vextent(&args);
- if (error)
- goto done;
-
- /* Can't fail, the space was reserved. */
- ASSERT(args.fsbno != NULLFSBLOCK);
- ASSERT(args.len == 1);
- *firstblock = args.fsbno;
- bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
-
- /* initialise the block and copy the data */
- init_fn(tp, bp, ip, ifp);
-
- /* account for the change in fork size and log everything */
- xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
- xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
- xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
- xfs_iext_add(ifp, 0, 1);
- ep = xfs_iext_get_ext(ifp, 0);
- xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
- trace_xfs_bmap_post_update(ip, 0,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
- _THIS_IP_);
- XFS_IFORK_NEXT_SET(ip, whichfork, 1);
- ip->i_d.di_nblocks = 1;
- xfs_trans_mod_dquot_byino(tp, ip,
- XFS_TRANS_DQ_BCOUNT, 1L);
- flags |= xfs_ilog_fext(whichfork);
+ ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
+ XFS_IFINLINE);
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = ip->i_mount;
+ args.firstblock = *firstblock;
+ /*
+ * Allocate a block. We know we need only one, since the
+ * file currently fits in an inode.
+ */
+ if (*firstblock == NULLFSBLOCK) {
+ args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+ args.type = XFS_ALLOCTYPE_START_BNO;
} else {
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
- xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
+ args.fsbno = *firstblock;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
}
- ifp->if_flags &= ~XFS_IFINLINE;
- ifp->if_flags |= XFS_IFEXTENTS;
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ args.total = total;
+ args.minlen = args.maxlen = args.prod = 1;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto done;
+
+ /* Can't fail, the space was reserved. */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(args.len == 1);
+ *firstblock = args.fsbno;
+ bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+
+ /* initialise the block and copy the data */
+ init_fn(tp, bp, ip, ifp);
+
+ /* account for the change in fork size and log everything */
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+ xfs_bmap_local_to_extents_empty(ip, whichfork);
flags |= XFS_ILOG_CORE;
+
+ xfs_iext_add(ifp, 0, 1);
+ ep = xfs_iext_get_ext(ifp, 0);
+ xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+ trace_xfs_bmap_post_update(ip, 0,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+ _THIS_IP_);
+ XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+ ip->i_d.di_nblocks = 1;
+ xfs_trans_mod_dquot_byino(tp, ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+ flags |= xfs_ilog_fext(whichfork);
+
done:
*logflagsp = flags;
return error;
@@ -1323,25 +1341,6 @@ xfs_bmap_add_attrfork_extents(
}
/*
- * Block initialisation function for local to extent format conversion.
- *
- * This shouldn't actually be called by anyone, so make sure debug kernels cause
- * a noticable failure.
- */
-STATIC void
-xfs_bmap_local_to_extents_init_fn(
- struct xfs_trans *tp,
- struct xfs_buf *bp,
- struct xfs_inode *ip,
- struct xfs_ifork *ifp)
-{
- ASSERT(0);
- bp->b_ops = &xfs_bmbt_buf_ops;
- memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
-}
-
-/*
* Called from xfs_bmap_add_attrfork to handle local format files. Each
* different data fork content type needs a different callout to do the
* conversion. Some are basic and only require special block initialisation
@@ -1381,9 +1380,9 @@ xfs_bmap_add_attrfork_local(
flags, XFS_DATA_FORK,
xfs_symlink_local_to_remote);
- return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
- XFS_DATA_FORK,
- xfs_bmap_local_to_extents_init_fn);
+ /* should only be called for types that support local format data */
+ ASSERT(0);
+ return EFSCORRUPTED;
}
/*
@@ -4907,20 +4906,19 @@ xfs_bmapi_write(
orig_mval = mval;
orig_nmap = *nmap;
#endif
+ whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
ASSERT(!(flags & XFS_BMAPI_IGSTATE));
ASSERT(tp != NULL);
ASSERT(len > 0);
-
- whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
return XFS_ERROR(EFSCORRUPTED);
@@ -4933,37 +4931,6 @@ xfs_bmapi_write(
XFS_STATS_INC(xs_blk_mapw);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- /*
- * XXX (dgc): This assumes we are only called for inodes that
- * contain content neutral data in local format. Anything that
- * contains caller-specific data in local format that needs
- * transformation to move to a block format needs to do the
- * conversion to extent format itself.
- *
- * Directory data forks and attribute forks handle this
- * themselves, but with the addition of metadata verifiers every
- * data fork in local format now contains caller specific data
- * and as such conversion through this function is likely to be
- * broken.
- *
- * The only likely user of this branch is for remote symlinks,
- * but we cannot overwrite the data fork contents of the symlink
- * (EEXIST occurs higher up the stack) and so it will never go
- * from local format to extent format here. Hence I don't think
- * this branch is ever executed intentionally and we should
- * consider removing it and asserting that xfs_bmapi_write()
- * cannot be called directly on local format forks. i.e. callers
- * are completely responsible for local to extent format
- * conversion, not xfs_bmapi_write().
- */
- error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
- &bma.logflags, whichfork,
- xfs_bmap_local_to_extents_init_fn);
- if (error)
- goto error0;
- }
-
if (*firstblock == NULLFSBLOCK) {
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 5f469c3516eb..1cf1292d29b7 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -172,6 +172,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
#endif
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
struct xfs_bmap_free *flist, struct xfs_mount *mp);
void xfs_bmap_cancel(struct xfs_bmap_free *flist);
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 70c43d9f72c1..1b726d626941 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -196,6 +196,8 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
#define XFS_BMDR_SPACE_CALC(nrecs) \
(int)(sizeof(xfs_bmdr_block_t) + \
((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+ (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
/*
* Maximum number of bmap btree levels.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4ec431777048..bfc4e0c26fd3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -140,6 +140,16 @@ xfs_buf_item_size(
ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+ if (bip->bli_flags & XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it.
+ * It is not being included in the transaction
+ * commit, so no vectors are used at all.
+ */
+ trace_xfs_buf_item_size_ordered(bip);
+ return XFS_LOG_VEC_ORDERED;
+ }
+
/*
* the vector count is based on the number of buffer vectors we have
* dirty bits in. This will only be greater than one when we have a
@@ -212,6 +222,7 @@ xfs_buf_item_format_segment(
goto out;
}
+
/*
* Fill in an iovec for each set of contiguous chunks.
*/
@@ -299,18 +310,36 @@ xfs_buf_item_format(
/*
* If it is an inode buffer, transfer the in-memory state to the
- * format flags and clear the in-memory state. We do not transfer
+ * format flags and clear the in-memory state.
+ *
+ * For buffer based inode allocation, we do not transfer
* this state if the inode buffer allocation has not yet been committed
* to the log as setting the XFS_BLI_INODE_BUF flag will prevent
* correct replay of the inode allocation.
+ *
+ * For icreate item based inode allocation, the buffers aren't written
+ * to the journal during allocation, and hence we should always tag the
+ * buffer as an inode buffer so that the correct unlinked list replay
+ * occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+ if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+ !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
}
+ if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+ XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it. It is not being
+ * included in the transaction commit, so don't format it.
+ */
+ trace_xfs_buf_item_format_ordered(bip);
+ return;
+ }
+
for (i = 0; i < bip->bli_format_count; i++) {
vecp = xfs_buf_item_format_segment(bip, vecp, offset,
&bip->bli_formats[i]);
@@ -340,6 +369,7 @@ xfs_buf_item_pin(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_ORDERED) ||
(bip->bli_flags & XFS_BLI_STALE));
trace_xfs_buf_item_pin(bip);
@@ -512,8 +542,9 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- int aborted, clean, i;
- uint hold;
+ bool clean;
+ bool aborted;
+ int flags;
/* Clear the buffer's association with this transaction. */
bp->b_transp = NULL;
@@ -524,23 +555,21 @@ xfs_buf_item_unlock(
* (cancelled) buffers at unpin time, but we'll never go through the
* pin/unpin cycle if we abort inside commit.
*/
- aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
-
+ aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
/*
- * Before possibly freeing the buf item, determine if we should
- * release the buffer at the end of this routine.
+ * Before possibly freeing the buf item, copy the per-transaction state
+ * so we can reference it safely later after clearing it from the
+ * buffer log item.
*/
- hold = bip->bli_flags & XFS_BLI_HOLD;
-
- /* Clear the per transaction state. */
- bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
+ flags = bip->bli_flags;
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
/*
* If the buf item is marked stale, then don't do anything. We'll
* unlock the buffer and free the buf item when the buffer is unpinned
* for the last time.
*/
- if (bip->bli_flags & XFS_BLI_STALE) {
+ if (flags & XFS_BLI_STALE) {
trace_xfs_buf_item_unlock_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
if (!aborted) {
@@ -557,13 +586,19 @@ xfs_buf_item_unlock(
* be the only reference to the buf item, so we free it anyway
* regardless of whether it is dirty or not. A dirty abort implies a
* shutdown, anyway.
+ *
+ * Ordered buffers are dirty but may have no recorded changes, so ensure
+ * we only release clean items here.
*/
- clean = 1;
- for (i = 0; i < bip->bli_format_count; i++) {
- if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
- bip->bli_formats[i].blf_map_size)) {
- clean = 0;
- break;
+ clean = (flags & XFS_BLI_DIRTY) ? false : true;
+ if (clean) {
+ int i;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+ bip->bli_formats[i].blf_map_size)) {
+ clean = false;
+ break;
+ }
}
}
if (clean)
@@ -576,7 +611,7 @@ xfs_buf_item_unlock(
} else
atomic_dec(&bip->bli_refcount);
- if (!hold)
+ if (!(flags & XFS_BLI_HOLD))
xfs_buf_relse(bp);
}
@@ -842,12 +877,6 @@ xfs_buf_item_log(
struct xfs_buf *bp = bip->bli_buf;
/*
- * Mark the item as having some dirty data for
- * quick reference in xfs_buf_item_dirty.
- */
- bip->bli_flags |= XFS_BLI_DIRTY;
-
- /*
* walk each buffer segment and mark them dirty appropriately.
*/
start = 0;
@@ -873,7 +902,7 @@ xfs_buf_item_log(
/*
- * Return 1 if the buffer has some data that has been logged (at any
+ * Return 1 if the buffer has been logged or ordered in a transaction (at any
* point, not just the current transaction) and 0 if not.
*/
uint
@@ -907,11 +936,11 @@ void
xfs_buf_item_relse(
xfs_buf_t *bp)
{
- xfs_buf_log_item_t *bip;
+ xfs_buf_log_item_t *bip = bp->b_fspriv;
trace_xfs_buf_item_relse(bp, _RET_IP_);
+ ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
- bip = bp->b_fspriv;
bp->b_fspriv = bip->bli_item.li_bio_list;
if (bp->b_fspriv == NULL)
bp->b_iodone = NULL;
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 2573d2a75fc8..0f1c247dc680 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -120,6 +120,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
#define XFS_BLI_INODE_ALLOC_BUF 0x10
#define XFS_BLI_STALE_INODE 0x20
#define XFS_BLI_INODE_BUF 0x40
+#define XFS_BLI_ORDERED 0x80
#define XFS_BLI_FLAGS \
{ XFS_BLI_HOLD, "HOLD" }, \
@@ -128,7 +129,8 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
{ XFS_BLI_LOGGED, "LOGGED" }, \
{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
{ XFS_BLI_STALE_INODE, "STALE_INODE" }, \
- { XFS_BLI_INODE_BUF, "INODE_BUF" }
+ { XFS_BLI_INODE_BUF, "INODE_BUF" }, \
+ { XFS_BLI_ORDERED, "ORDERED" }
#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index c407e1ccff43..e36445ceaf80 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,6 +24,9 @@
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
@@ -182,7 +185,7 @@ xfs_swap_extents_check_format(
*/
if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
if (XFS_IFORK_BOFF(ip) &&
- tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+ XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
return EINVAL;
if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
@@ -192,9 +195,8 @@ xfs_swap_extents_check_format(
/* Reciprocal target->temp btree format checks */
if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
if (XFS_IFORK_BOFF(tip) &&
- ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+ XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
return EINVAL;
-
if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return EINVAL;
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index f7a0e95d197a..07d735a80a0f 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -132,9 +132,6 @@ typedef enum xfs_dinode_fmt {
#define XFS_LITINO(mp, version) \
((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
-#define XFS_BROOT_SIZE_ADJ(ip) \
- (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t))
-
/*
* Inode data & attribute fork sizes, per inode.
*/
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 09aea0247d96..5e7fbd72cf52 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -29,6 +29,7 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
#include "xfs_buf_item.h"
#include "xfs_dir2.h"
#include "xfs_dir2_format.h"
@@ -1164,13 +1165,15 @@ xfs_dir2_sf_to_block(
__be16 *tagp; /* end of data entry */
xfs_trans_t *tp; /* transaction pointer */
struct xfs_name name;
+ struct xfs_ifork *ifp;
trace_xfs_dir2_sf_to_block(args);
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ ASSERT(ifp->if_flags & XFS_IFINLINE);
/*
* Bomb out if the shortform directory is way too short.
*/
@@ -1179,22 +1182,23 @@ xfs_dir2_sf_to_block(
return XFS_ERROR(EIO);
}
- oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
+ ASSERT(ifp->if_bytes == dp->i_d.di_size);
+ ASSERT(ifp->if_u1.if_data != NULL);
ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+ ASSERT(dp->i_d.di_nextents == 0);
/*
* Copy the directory into a temporary buffer.
* Then pitch the incore inode data so we can make extents.
*/
- sfp = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
- memcpy(sfp, oldsfp, dp->i_df.if_bytes);
+ sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+ memcpy(sfp, oldsfp, ifp->if_bytes);
- xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
dp->i_d.di_size = 0;
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
/*
* Add block 0 to the inode.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e0cc1243a8aa..2aed25cae04d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1108,6 +1108,7 @@ xfs_dir2_leaf_readbuf(
struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp = *bpp;
struct xfs_bmbt_irec *map = mip->map;
+ struct blk_plug plug;
int error = 0;
int length;
int i;
@@ -1236,6 +1237,7 @@ xfs_dir2_leaf_readbuf(
/*
* Do we need more readahead?
*/
+ blk_start_plug(&plug);
for (mip->ra_index = mip->ra_offset = i = 0;
mip->ra_want > mip->ra_current && i < mip->map_blocks;
i += mp->m_dirblkfsbs) {
@@ -1287,6 +1289,7 @@ xfs_dir2_leaf_readbuf(
}
}
}
+ blk_finish_plug(&plug);
out:
*bpp = bp;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 044e97a33c8d..0adf27ecf3f1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -570,13 +570,13 @@ xfs_qm_dqtobp(
xfs_buf_t **O_bpp,
uint flags)
{
- xfs_bmbt_irec_t map;
- int nmaps = 1, error;
- xfs_buf_t *bp;
- xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
- xfs_mount_t *mp = dqp->q_mount;
- xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
- xfs_trans_t *tp = (tpp ? *tpp : NULL);
+ struct xfs_bmbt_irec map;
+ int nmaps = 1, error;
+ struct xfs_buf *bp;
+ struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
+ struct xfs_mount *mp = dqp->q_mount;
+ xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
+ struct xfs_trans *tp = (tpp ? *tpp : NULL);
dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
@@ -804,7 +804,7 @@ xfs_qm_dqget(
xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
int error;
@@ -936,6 +936,7 @@ xfs_qm_dqput_final(
{
struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
trace_xfs_dqput_free(dqp);
@@ -949,21 +950,29 @@ xfs_qm_dqput_final(
/*
* If we just added a udquot to the freelist, then we want to release
- * the gdquot reference that it (probably) has. Otherwise it'll keep
- * the gdquot from getting reclaimed.
+ * the gdquot/pdquot reference that it (probably) has. Otherwise it'll
+ * keep the gdquot/pdquot from getting reclaimed.
*/
gdqp = dqp->q_gdquot;
if (gdqp) {
xfs_dqlock(gdqp);
dqp->q_gdquot = NULL;
}
+
+ pdqp = dqp->q_pdquot;
+ if (pdqp) {
+ xfs_dqlock(pdqp);
+ dqp->q_pdquot = NULL;
+ }
xfs_dqunlock(dqp);
/*
- * If we had a group quota hint, release it now.
+ * If we had a group/project quota hint, release it now.
*/
if (gdqp)
xfs_qm_dqput(gdqp);
+ if (pdqp)
+ xfs_qm_dqput(pdqp);
}
/*
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 4f0ebfc43cc9..55abbca2883d 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -53,6 +53,7 @@ typedef struct xfs_dquot {
xfs_fileoff_t q_fileoffset; /* offset in quotas file */
struct xfs_dquot*q_gdquot; /* group dquot, hint only */
+ struct xfs_dquot*q_pdquot; /* project dquot, hint only */
xfs_disk_dquot_t q_core; /* actual usage & quotas */
xfs_dq_logitem_t q_logitem; /* dquot log item */
xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
@@ -118,8 +119,9 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
case XFS_DQ_USER:
return XFS_IS_UQUOTA_ON(mp);
case XFS_DQ_GROUP:
+ return XFS_IS_GQUOTA_ON(mp);
case XFS_DQ_PROJ:
- return XFS_IS_OQUOTA_ON(mp);
+ return XFS_IS_PQUOTA_ON(mp);
default:
return 0;
}
@@ -131,8 +133,9 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
case XFS_DQ_USER:
return ip->i_udquot;
case XFS_DQ_GROUP:
- case XFS_DQ_PROJ:
return ip->i_gdquot;
+ case XFS_DQ_PROJ:
+ return ip->i_pdquot;
default:
return NULL;
}
@@ -143,10 +146,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
-#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
-#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
- XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
- XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
uint, struct xfs_dquot **);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3c3644ea825b..614eb0cc3608 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -176,7 +176,7 @@ xfs_growfs_data_private(
if (!bp)
return EIO;
if (bp->b_error) {
- int error = bp->b_error;
+ error = bp->b_error;
xfs_buf_relse(bp);
return error;
}
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c8f5ae1debf2..7a0c17d7ec09 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
#include "xfs_bmap.h"
#include "xfs_cksum.h"
#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
/*
@@ -150,12 +151,16 @@ xfs_check_agi_freecount(
#endif
/*
- * Initialise a new set of inodes.
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
*/
-STATIC int
+int
xfs_ialloc_inode_init(
struct xfs_mount *mp,
struct xfs_trans *tp,
+ struct list_head *buffer_list,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_agblock_t length,
@@ -208,6 +213,18 @@ xfs_ialloc_inode_init(
version = 3;
ino = XFS_AGINO_TO_INO(mp, agno,
XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+
+ /*
+ * log the initialisation that is about to take place as an
+ * logical operation. This means the transaction does not
+ * need to log the physical changes to the inode buffers as log
+ * recovery will know what initialisation is actually needed.
+ * Hence we only need to log the buffers as "ordered" buffers so
+ * they track in the AIL as if they were physically logged.
+ */
+ if (tp)
+ xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp),
+ mp->m_sb.sb_inodesize, length, gen);
} else if (xfs_sb_version_hasnlink(&mp->m_sb))
version = 2;
else
@@ -223,13 +240,8 @@ xfs_ialloc_inode_init(
XBF_UNMAPPED);
if (!fbuf)
return ENOMEM;
- /*
- * Initialize all inodes in this buffer and then log them.
- *
- * XXX: It would be much better if we had just one transaction
- * to log a whole cluster of inodes instead of all the
- * individual transactions causing a lot of log traffic.
- */
+
+ /* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
for (i = 0; i < ninodes; i++) {
@@ -247,18 +259,39 @@ xfs_ialloc_inode_init(
ino++;
uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
xfs_dinode_calc_crc(mp, free);
- } else {
+ } else if (tp) {
/* just log the inode core */
xfs_trans_log_buf(tp, fbuf, ioffset,
ioffset + isize - 1);
}
}
- if (version == 3) {
- /* need to log the entire buffer */
- xfs_trans_log_buf(tp, fbuf, 0,
- BBTOB(fbuf->b_length) - 1);
+
+ if (tp) {
+ /*
+ * Mark the buffer as an inode allocation buffer so it
+ * sticks in AIL at the point of this allocation
+ * transaction. This ensures the they are on disk before
+ * the tail of the log can be moved past this
+ * transaction (i.e. by preventing relogging from moving
+ * it forward in the log).
+ */
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ if (version == 3) {
+ /*
+ * Mark the buffer as ordered so that they are
+ * not physically logged in the transaction but
+ * still tracked in the AIL as part of the
+ * transaction and pin the log appropriately.
+ */
+ xfs_trans_ordered_buf(tp, fbuf);
+ xfs_trans_log_buf(tp, fbuf, 0,
+ BBTOB(fbuf->b_length) - 1);
+ }
+ } else {
+ fbuf->b_flags |= XBF_DONE;
+ xfs_buf_delwri_queue(fbuf, buffer_list);
+ xfs_buf_relse(fbuf);
}
- xfs_trans_inode_alloc_buf(tp, fbuf);
}
return 0;
}
@@ -303,7 +336,7 @@ xfs_ialloc_ag_alloc(
* First try to allocate inodes contiguous with the last-allocated
* chunk of inodes. If the filesystem is striped, this will fill
* an entire stripe unit with inodes.
- */
+ */
agi = XFS_BUF_TO_AGI(agbp);
newino = be32_to_cpu(agi->agi_newino);
agno = be32_to_cpu(agi->agi_seqno);
@@ -402,7 +435,7 @@ xfs_ialloc_ag_alloc(
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
args.len, prandom_u32());
if (error)
@@ -615,8 +648,7 @@ xfs_ialloc_get_rec(
struct xfs_btree_cur *cur,
xfs_agino_t agino,
xfs_inobt_rec_incore_t *rec,
- int *done,
- int left)
+ int *done)
{
int error;
int i;
@@ -724,12 +756,12 @@ xfs_dialloc_ag(
pag->pagl_leftrec != NULLAGINO &&
pag->pagl_rightrec != NULLAGINO) {
error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
- &trec, &doneleft, 1);
+ &trec, &doneleft);
if (error)
goto error1;
error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
- &rec, &doneright, 0);
+ &rec, &doneright);
if (error)
goto error1;
} else {
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index c8da3df271e6..68c07320f096 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
xfs_inobt_rec_incore_t *rec, int *stat);
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct list_head *buffer_list,
+ xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_agblock_t length, unsigned int gen);
+
extern const struct xfs_buf_ops xfs_agi_buf_ops;
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e3e927..3f90e1ceb8d6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -335,7 +335,9 @@ xfs_iget_cache_miss(
iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
iflags |= XFS_IDONTCACHE;
- ip->i_udquot = ip->i_gdquot = NULL;
+ ip->i_udquot = NULL;
+ ip->i_gdquot = NULL;
+ ip->i_pdquot = NULL;
xfs_iflags_set(ip, iflags);
/* insert the new inode */
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index e0f138c70a2f..a01afbb3909a 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
void xfs_eofblocks_worker(struct work_struct *);
-int xfs_sync_inode_grab(struct xfs_inode *ip);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
int flags, void *args),
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 000000000000..7716a4e7375e
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2008-2010, 2013 Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_error.h"
+#include "xfs_icreate_item.h"
+
+kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+
+static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_icreate_item, ic_item);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We only need one iovec for the icreate log structure.
+ */
+STATIC uint
+xfs_icreate_item_size(
+ struct xfs_log_item *lip)
+{
+ return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode create log item.
+ */
+STATIC void
+xfs_icreate_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *log_vector)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ log_vector->i_addr = (xfs_caddr_t)&icp->ic_format;
+ log_vector->i_len = sizeof(struct xfs_icreate_log);
+ log_vector->i_type = XLOG_REG_TYPE_ICREATE;
+}
+
+
+/* Pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+
+/* pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+STATIC void
+xfs_icreate_item_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+ kmem_zone_free(xfs_icreate_zone, icp);
+ return;
+}
+
+/*
+ * Because we have ordered buffers being tracked in the AIL for the inode
+ * creation, we don't need the create item after this. Hence we can free
+ * the log item and return -1 to tell the caller we're done with the item.
+ */
+STATIC xfs_lsn_t
+xfs_icreate_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ kmem_zone_free(xfs_icreate_zone, icp);
+ return (xfs_lsn_t)-1;
+}
+
+/* item can never get into the AIL */
+STATIC uint
+xfs_icreate_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ ASSERT(0);
+ return XFS_ITEM_SUCCESS;
+}
+
+/* Ordered buffers do the dependency tracking here, so this does nothing. */
+STATIC void
+xfs_icreate_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static struct xfs_item_ops xfs_icreate_item_ops = {
+ .iop_size = xfs_icreate_item_size,
+ .iop_format = xfs_icreate_item_format,
+ .iop_pin = xfs_icreate_item_pin,
+ .iop_unpin = xfs_icreate_item_unpin,
+ .iop_push = xfs_icreate_item_push,
+ .iop_unlock = xfs_icreate_item_unlock,
+ .iop_committed = xfs_icreate_item_committed,
+ .iop_committing = xfs_icreate_item_committing,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the icreate item.
+ */
+void
+xfs_icreate_log(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ unsigned int count,
+ unsigned int inode_size,
+ xfs_agblock_t length,
+ unsigned int generation)
+{
+ struct xfs_icreate_item *icp;
+
+ icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+
+ xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
+ &xfs_icreate_item_ops);
+
+ icp->ic_format.icl_type = XFS_LI_ICREATE;
+ icp->ic_format.icl_size = 1; /* single vector */
+ icp->ic_format.icl_ag = cpu_to_be32(agno);
+ icp->ic_format.icl_agbno = cpu_to_be32(agbno);
+ icp->ic_format.icl_count = cpu_to_be32(count);
+ icp->ic_format.icl_isize = cpu_to_be32(inode_size);
+ icp->ic_format.icl_length = cpu_to_be32(length);
+ icp->ic_format.icl_gen = cpu_to_be32(generation);
+
+ xfs_trans_add_item(tp, &icp->ic_item);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 000000000000..88ba8aa0bc41
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2008-2010, Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef XFS_ICREATE_ITEM_H
+#define XFS_ICREATE_ITEM_H 1
+
+/*
+ * on disk log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ */
+struct xfs_icreate_log {
+ __uint16_t icl_type; /* type of log format structure */
+ __uint16_t icl_size; /* size of log format structure */
+ __be32 icl_ag; /* ag being allocated in */
+ __be32 icl_agbno; /* start block of inode range */
+ __be32 icl_count; /* number of inodes to initialise */
+ __be32 icl_isize; /* size of inodes */
+ __be32 icl_length; /* length of extent to initialise */
+ __be32 icl_gen; /* inode generation number to use */
+};
+
+/* in memory log item structure */
+struct xfs_icreate_item {
+ struct xfs_log_item ic_item;
+ struct xfs_icreate_log ic_format;
+};
+
+extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+
+void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, unsigned int count,
+ unsigned int inode_size, xfs_agblock_t length,
+ unsigned int generation);
+
+#endif /* XFS_ICREATE_ITEM_H */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7f7be5f98f52..b78481f99d9d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1028,6 +1028,11 @@ xfs_dinode_calc_crc(
/*
* Read the disk inode attributes into the in-core inode structure.
+ *
+ * If we are initialising a new inode and we are not utilising the
+ * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core
+ * with a random generation number. If we are keeping inodes around, we need to
+ * read the inode cluster to get the existing generation number off disk.
*/
int
xfs_iread(
@@ -1047,6 +1052,22 @@ xfs_iread(
if (error)
return error;
+ /* shortcut IO on inode allocation if possible */
+ if ((iget_flags & XFS_IGET_CREATE) &&
+ !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ /* initialise the on-disk inode core */
+ memset(&ip->i_d, 0, sizeof(ip->i_d));
+ ip->i_d.di_magic = XFS_DINODE_MAGIC;
+ ip->i_d.di_gen = prandom_u32();
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ ip->i_d.di_version = 3;
+ ip->i_d.di_ino = ip->i_ino;
+ uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+ } else
+ ip->i_d.di_version = 2;
+ return 0;
+ }
+
/*
* Get pointers to the on-disk inode and the buffer containing it.
*/
@@ -1133,17 +1154,16 @@ xfs_iread(
xfs_buf_set_ref(bp, XFS_INO_REF);
/*
- * Use xfs_trans_brelse() to release the buffer containing the
- * on-disk inode, because it was acquired with xfs_trans_read_buf()
- * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal
+ * Use xfs_trans_brelse() to release the buffer containing the on-disk
+ * inode, because it was acquired with xfs_trans_read_buf() in
+ * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
* brelse(). If we're within a transaction, then xfs_trans_brelse()
* will only release the buffer if it is not dirty within the
* transaction. It will be OK to release the buffer in this case,
- * because inodes on disk are never destroyed and we will be
- * locking the new in-core inode before putting it in the hash
- * table where other processes can find it. Thus we don't have
- * to worry about the inode being changed just because we released
- * the buffer.
+ * because inodes on disk are never destroyed and we will be locking the
+ * new in-core inode before putting it in the cache where other
+ * processes can find it. Thus we don't have to worry about the inode
+ * being changed just because we released the buffer.
*/
out_brelse:
xfs_trans_brelse(tp, bp);
@@ -2028,8 +2048,6 @@ xfs_ifree(
int error;
int delete;
xfs_ino_t first_ino;
- xfs_dinode_t *dip;
- xfs_buf_t *ibp;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(ip->i_d.di_nlink == 0);
@@ -2042,14 +2060,13 @@ xfs_ifree(
* Pull the on-disk inode from the AGI unlinked list.
*/
error = xfs_iunlink_remove(tp, ip);
- if (error != 0) {
+ if (error)
return error;
- }
error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
- if (error != 0) {
+ if (error)
return error;
- }
+
ip->i_d.di_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
@@ -2061,31 +2078,10 @@ xfs_ifree(
* by reincarnations of this inode.
*/
ip->i_d.di_gen++;
-
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error)
- return error;
-
- /*
- * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
- * from picking up this inode when it is reclaimed (its incore state
- * initialzed but not flushed to disk yet). The in-core di_mode is
- * already cleared and a corresponding transaction logged.
- * The hack here just synchronizes the in-core to on-disk
- * di_mode value in advance before the actual inode sync to disk.
- * This is OK because the inode is already unlinked and would never
- * change its di_mode again for this inode generation.
- * This is a temporary hack that would require a proper fix
- * in the future.
- */
- dip->di_mode = 0;
-
- if (delete) {
+ if (delete)
error = xfs_ifree_cluster(ip, tp, first_ino);
- }
return error;
}
@@ -2160,8 +2156,8 @@ xfs_iroot_realloc(
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
(int)new_size);
ifp->if_broot_bytes = (int)new_size;
- ASSERT(ifp->if_broot_bytes <=
- XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip));
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ XFS_IFORK_SIZE(ip, whichfork));
memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
return;
}
@@ -2214,8 +2210,9 @@ xfs_iroot_realloc(
kmem_free(ifp->if_broot);
ifp->if_broot = new_broot;
ifp->if_broot_bytes = (int)new_size;
- ASSERT(ifp->if_broot_bytes <=
- XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip));
+ if (ifp->if_broot)
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ XFS_IFORK_SIZE(ip, whichfork));
return;
}
@@ -2526,9 +2523,8 @@ xfs_iflush_fork(
if ((iip->ili_fields & brootflag[whichfork]) &&
(ifp->if_broot_bytes > 0)) {
ASSERT(ifp->if_broot != NULL);
- ASSERT(ifp->if_broot_bytes <=
- (XFS_IFORK_SIZE(ip, whichfork) +
- XFS_BROOT_SIZE_ADJ(ip)));
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ XFS_IFORK_SIZE(ip, whichfork));
xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
(xfs_bmdr_block_t *)cp,
XFS_DFORK_SIZE(dip, mp, whichfork));
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 91129794aaec..b55fd347ab5b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -250,6 +250,7 @@ typedef struct xfs_inode {
struct xfs_mount *i_mount; /* fs mount struct ptr */
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
+ struct xfs_dquot *i_pdquot; /* project dquot */
/* Inode location stuff */
xfs_ino_t i_ino; /* inode number (agno/agino)*/
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5e999680094a..6e2bca5d44d6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -248,7 +248,7 @@ xfs_open_by_handle(
goto out_dput;
}
- fd = get_unused_fd();
+ fd = get_unused_fd_flags(0);
if (fd < 0) {
error = fd;
goto out_dput;
@@ -928,7 +928,7 @@ xfs_ioctl_setattr(
struct xfs_trans *tp;
unsigned int lock_flags = 0;
struct xfs_dquot *udqp = NULL;
- struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
int code;
@@ -957,7 +957,7 @@ xfs_ioctl_setattr(
if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
ip->i_d.di_gid, fa->fsx_projid,
- XFS_QMOPT_PQUOTA, &udqp, &gdqp);
+ XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
if (code)
return code;
}
@@ -994,8 +994,8 @@ xfs_ioctl_setattr(
XFS_IS_PQUOTA_ON(mp) &&
xfs_get_projid(ip) != fa->fsx_projid) {
ASSERT(tp);
- code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- capable(CAP_FOWNER) ?
+ code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
+ pdqp, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (code) /* out of quota */
goto error_return;
@@ -1113,7 +1113,7 @@ xfs_ioctl_setattr(
if (xfs_get_projid(ip) != fa->fsx_projid) {
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
olddquot = xfs_qm_vop_chown(tp, ip,
- &ip->i_gdquot, gdqp);
+ &ip->i_pdquot, pdqp);
}
xfs_set_projid(ip, fa->fsx_projid);
@@ -1160,13 +1160,13 @@ xfs_ioctl_setattr(
*/
xfs_qm_dqrele(olddquot);
xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
return code;
error_return:
xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
xfs_trans_cancel(tp, 0);
if (lock_flags)
xfs_iunlock(ip, lock_flags);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8f8aaee7f379..6a7096422295 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -284,6 +284,15 @@ xfs_iomap_eof_want_preallocate(
return 0;
/*
+ * If the file is smaller than the minimum prealloc and we are using
+ * dynamic preallocation, don't do any preallocation at all as it is
+ * likely this is the only write to the file that is going to be done.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+ XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
+ return 0;
+
+ /*
* If there are any real blocks past eof, then don't
* do any speculative allocation.
*/
@@ -345,6 +354,10 @@ xfs_iomap_eof_prealloc_initial_size(
if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
return 0;
+ /* If the file is small, then use the minimum prealloc */
+ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
+ return 0;
+
/*
* As we write multiple pages, the offset will always align to the
* start of a page and hence point to a hole at EOF. i.e. if the size is
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca9ecaa81112..96dda62d497b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -467,9 +467,6 @@ xfs_setattr_mode(
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
- mode &= ~S_ISGID;
-
ip->i_d.di_mode &= S_IFMT;
ip->i_d.di_mode |= mode & ~S_IFMT;
@@ -495,15 +492,18 @@ xfs_setattr_nonsize(
trace_xfs_setattr(ip);
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
+ /* If acls are being inherited, we already have this checked */
+ if (!(flags & XFS_ATTR_NOACL)) {
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return XFS_ERROR(EROFS);
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
- error = -inode_change_ok(inode, iattr);
- if (error)
- return XFS_ERROR(error);
+ error = -inode_change_ok(inode, iattr);
+ if (error)
+ return XFS_ERROR(error);
+ }
ASSERT((mask & ATTR_SIZE) == 0);
@@ -539,7 +539,7 @@ xfs_setattr_nonsize(
ASSERT(udqp == NULL);
ASSERT(gdqp == NULL);
error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
- qflags, &udqp, &gdqp);
+ qflags, &udqp, &gdqp, NULL);
if (error)
return error;
}
@@ -575,7 +575,7 @@ xfs_setattr_nonsize(
(XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
ASSERT(tp);
error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- capable(CAP_FOWNER) ?
+ NULL, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (error) /* out of quota */
goto out_trans_cancel;
@@ -987,7 +987,8 @@ xfs_fiemap_format(
if (bmv->bmv_oflags & BMV_OF_PREALLOC)
fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
- fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
+ fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
physical = 0; /* no block yet */
}
if (bmv->bmv_oflags & BMV_OF_LAST)
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2ea7d402188d..b93e14b86754 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -43,7 +43,7 @@ xfs_internal_inum(
{
return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
(xfs_sb_version_hasquota(&mp->m_sb) &&
- (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
+ xfs_is_quota_inode(&mp->m_sb, ino)));
}
/*
@@ -221,7 +221,6 @@ xfs_bulkstat(
char __user *ubufp; /* pointer into user's buffer */
int ubelem; /* spaces used in user's buffer */
int ubused; /* bytes used by formatter */
- xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
/*
* Get the last inode value, see if there's nothing to do.
@@ -263,7 +262,6 @@ xfs_bulkstat(
rval = 0;
while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
cond_resched();
- bp = NULL;
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
if (error) {
/*
@@ -383,11 +381,13 @@ xfs_bulkstat(
* Also start read-ahead now for this chunk.
*/
if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+ struct blk_plug plug;
/*
* Loop over all clusters in the next chunk.
* Do a readahead if there are any allocated
* inodes in that cluster.
*/
+ blk_start_plug(&plug);
agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
for (chunkidx = 0;
chunkidx < XFS_INODES_PER_CHUNK;
@@ -399,6 +399,7 @@ xfs_bulkstat(
agbno, nbcluster,
&xfs_inode_buf_ops);
}
+ blk_finish_plug(&plug);
irbp->ir_startino = r.ir_startino;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
@@ -433,27 +434,7 @@ xfs_bulkstat(
irbp->ir_freecount < XFS_INODES_PER_CHUNK;
chunkidx++, clustidx++, agino++) {
ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
- /*
- * Recompute agbno if this is the
- * first inode of the cluster.
- *
- * Careful with clustidx. There can be
- * multiple clusters per chunk, a single
- * cluster per chunk or a cluster that has
- * inodes represented from several different
- * chunks (if blocksize is large).
- *
- * Because of this, the starting clustidx is
- * initialized to zero in this loop but must
- * later be reset after reading in the cluster
- * buffer.
- */
- if ((chunkidx & (nicluster - 1)) == 0) {
- agbno = XFS_AGINO_TO_AGBNO(mp,
- irbp->ir_startino) +
- ((chunkidx & nimask) >>
- mp->m_sb.sb_inopblog);
- }
+
ino = XFS_AGINO_TO_INO(mp, agno, agino);
/*
* Skip if this inode is free.
@@ -499,10 +480,6 @@ xfs_bulkstat(
cond_resched();
}
-
- if (bp)
- xfs_buf_relse(bp);
-
/*
* Set up for the next loop iteration.
*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b345a7c85153..d852a2b3e1fd 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length(
headers++;
for (lv = log_vector; lv; lv = lv->lv_next) {
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
+ continue;
+
headers += lv->lv_niovecs;
for (i = 0; i < lv->lv_niovecs; i++) {
@@ -2216,7 +2220,7 @@ xlog_write(
index = 0;
lv = log_vector;
vecp = lv->lv_iovecp;
- while (lv && index < lv->lv_niovecs) {
+ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2236,13 +2240,22 @@ xlog_write(
* This loop writes out as many regions as can fit in the amount
* of space which was allocated by xlog_state_get_iclog_space().
*/
- while (lv && index < lv->lv_niovecs) {
- struct xfs_log_iovec *reg = &vecp[index];
+ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
+ struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
int start_rec_copy;
int copy_len;
int copy_off;
+ bool ordered = false;
+
+ /* ordered log vectors have no regions to write */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+ ASSERT(lv->lv_niovecs == 0);
+ ordered = true;
+ goto next_lv;
+ }
+ reg = &vecp[index];
ASSERT(reg->i_len % sizeof(__int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
@@ -2302,12 +2315,13 @@ xlog_write(
break;
if (++index == lv->lv_niovecs) {
+next_lv:
lv = lv->lv_next;
index = 0;
if (lv)
vecp = lv->lv_iovecp;
}
- if (record_cnt == 0) {
+ if (record_cnt == 0 && ordered == false) {
if (!lv)
return 0;
break;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 5caee96059df..fb630e496c12 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -88,7 +88,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
#define XLOG_REG_TYPE_UNMOUNT 17
#define XLOG_REG_TYPE_COMMIT 18
#define XLOG_REG_TYPE_TRANSHDR 19
-#define XLOG_REG_TYPE_MAX 19
+#define XLOG_REG_TYPE_ICREATE 20
+#define XLOG_REG_TYPE_MAX 20
typedef struct xfs_log_iovec {
void *i_addr; /* beginning address of region */
@@ -105,6 +106,8 @@ struct xfs_log_vec {
int lv_buf_len; /* size of formatted buffer */
};
+#define XFS_LOG_VEC_ORDERED (-1)
+
/*
* Structure used to pass callback function and the function's argument
* to the log manager.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d0833b54e55d..02b9cf3f8252 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs(
int index;
int len = 0;
uint niovecs;
+ bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
if (!(lidp->lid_flags & XFS_LID_DIRTY))
@@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs(
if (!niovecs)
continue;
+ /*
+ * Ordered items need to be tracked but we do not wish to write
+ * them. We need a logvec to track the object, but we do not
+ * need an iovec or buffer to be allocated for copying data.
+ */
+ if (niovecs == XFS_LOG_VEC_ORDERED) {
+ ordered = true;
+ niovecs = 0;
+ }
+
new_lv = kmem_zalloc(sizeof(*new_lv) +
niovecs * sizeof(struct xfs_log_iovec),
KM_SLEEP|KM_NOFS);
+ new_lv->lv_item = lidp->lid_item;
+ new_lv->lv_niovecs = niovecs;
+ if (ordered) {
+ /* track as an ordered logvec */
+ new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ goto next;
+ }
+
/* The allocated iovec region lies beyond the log vector. */
new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
- new_lv->lv_niovecs = niovecs;
- new_lv->lv_item = lidp->lid_item;
/* build the vector array and calculate it's length */
IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
@@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs(
}
ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
+next:
if (!ret_lv)
ret_lv = new_lv;
else
@@ -191,8 +209,18 @@ xfs_cil_prepare_item(
if (old) {
/* existing lv on log item, space used is a delta */
- ASSERT(!list_empty(&lv->lv_item->li_cil));
- ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+ ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
+ old->lv_buf_len == XFS_LOG_VEC_ORDERED);
+
+ /*
+ * If the new item is ordered, keep the old one that is already
+ * tracking dirty or ordered regions
+ */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+ ASSERT(!lv->lv_buf);
+ kmem_free(lv);
+ return;
+ }
*len += lv->lv_buf_len - old->lv_buf_len;
*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
@@ -201,10 +229,11 @@ xfs_cil_prepare_item(
} else {
/* new lv, must pin the log item */
ASSERT(!lv->lv_item->li_lv);
- ASSERT(list_empty(&lv->lv_item->li_cil));
- *len += lv->lv_buf_len;
- *diff_iovecs += lv->lv_niovecs;
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+ *len += lv->lv_buf_len;
+ *diff_iovecs += lv->lv_niovecs;
+ }
IOP_PIN(lv->lv_item);
}
@@ -259,18 +288,24 @@ xlog_cil_insert_items(
* We can do this safely because the context can't checkpoint until we
* are done so it doesn't matter exactly how we update the CIL.
*/
- for (lv = log_vector; lv; lv = lv->lv_next)
- xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
-
- /* account for space used by new iovec headers */
- len += diff_iovecs * sizeof(xlog_op_header_t);
-
spin_lock(&cil->xc_cil_lock);
+ for (lv = log_vector; lv; ) {
+ struct xfs_log_vec *next = lv->lv_next;
- /* move the items to the tail of the CIL */
- for (lv = log_vector; lv; lv = lv->lv_next)
+ ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
+ lv->lv_next = NULL;
+
+ /*
+ * xfs_cil_prepare_item() may free the lv, so move the item on
+ * the CIL first.
+ */
list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+ xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+ lv = next;
+ }
+ /* account for space used by new iovec headers */
+ len += diff_iovecs * sizeof(xlog_op_header_t);
ctx->nvecs += diff_iovecs;
/*
@@ -381,9 +416,7 @@ xlog_cil_push(
struct xfs_cil_ctx *new_ctx;
struct xlog_in_core *commit_iclog;
struct xlog_ticket *tic;
- int num_lv;
int num_iovecs;
- int len;
int error = 0;
struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr;
@@ -428,12 +461,9 @@ xlog_cil_push(
* side which is currently locked out by the flush lock.
*/
lv = NULL;
- num_lv = 0;
num_iovecs = 0;
- len = 0;
while (!list_empty(&cil->xc_cil)) {
struct xfs_log_item *item;
- int i;
item = list_first_entry(&cil->xc_cil,
struct xfs_log_item, li_cil);
@@ -444,11 +474,7 @@ xlog_cil_push(
lv->lv_next = item->li_lv;
lv = item->li_lv;
item->li_lv = NULL;
-
- num_lv++;
num_iovecs += lv->lv_niovecs;
- for (i = 0; i < lv->lv_niovecs; i++)
- len += lv->lv_iovecp[i].i_len;
}
/*
@@ -701,6 +727,7 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = log->l_cilp->xc_ctx->sequence;
+ /* xlog_cil_insert_items() destroys log_vector list */
xlog_cil_insert_items(log, log_vector, tp->t_ticket);
/* check we didn't blow the reservation */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7cf5e4eafe28..6fcc910a50b9 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -45,6 +45,7 @@
#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_icreate_item.h"
/* Need all the magic numbers and buffer ops structures from these headers */
#include "xfs_symlink.h"
@@ -1617,7 +1618,10 @@ xlog_recover_add_to_trans(
* form the cancelled buffer table. Hence they have tobe done last.
*
* 3. Inode allocation buffers must be replayed before inode items that
- * read the buffer and replay changes into it.
+ * read the buffer and replay changes into it. For filesystems using the
+ * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
+ * treated the same as inode allocation buffers as they create and
+ * initialise the buffers directly.
*
* 4. Inode unlink buffers must be replayed after inode items are replayed.
* This ensures that inodes are completely flushed to the inode buffer
@@ -1632,10 +1636,17 @@ xlog_recover_add_to_trans(
* from all the other buffers and move them to last.
*
* Hence, 4 lists, in order from head to tail:
- * - buffer_list for all buffers except cancelled/inode unlink buffers
- * - item_list for all non-buffer items
- * - inode_buffer_list for inode unlink buffers
- * - cancel_list for the cancelled buffers
+ * - buffer_list for all buffers except cancelled/inode unlink buffers
+ * - item_list for all non-buffer items
+ * - inode_buffer_list for inode unlink buffers
+ * - cancel_list for the cancelled buffers
+ *
+ * Note that we add objects to the tail of the lists so that first-to-last
+ * ordering is preserved within the lists. Adding objects to the head of the
+ * list means when we traverse from the head we walk them in last-to-first
+ * order. For cancelled buffers and inode unlink buffers this doesn't matter,
+ * but for all other items there may be specific ordering that we need to
+ * preserve.
*/
STATIC int
xlog_recover_reorder_trans(
@@ -1655,6 +1666,9 @@ xlog_recover_reorder_trans(
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
switch (ITEM_TYPE(item)) {
+ case XFS_LI_ICREATE:
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_BUF:
if (buf_f->blf_flags & XFS_BLF_CANCEL) {
trace_xfs_log_recover_item_reorder_head(log,
@@ -2982,6 +2996,93 @@ xlog_recover_efd_pass2(
}
/*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log. It's purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be intialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_do_icreate_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ xlog_recover_item_t *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_icreate_log *icl;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ unsigned int count;
+ unsigned int isize;
+ xfs_agblock_t length;
+
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ if (icl->icl_type != XFS_LI_ICREATE) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+ return EINVAL;
+ }
+
+ if (icl->icl_size != 1) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+ return EINVAL;
+ }
+
+ agno = be32_to_cpu(icl->icl_ag);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+ return EINVAL;
+ }
+ agbno = be32_to_cpu(icl->icl_agbno);
+ if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+ return EINVAL;
+ }
+ isize = be32_to_cpu(icl->icl_isize);
+ if (isize != mp->m_sb.sb_inodesize) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+ return EINVAL;
+ }
+ count = be32_to_cpu(icl->icl_count);
+ if (!count) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+ return EINVAL;
+ }
+ length = be32_to_cpu(icl->icl_length);
+ if (!length || length >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+ return EINVAL;
+ }
+
+ /* existing allocation is fixed value */
+ ASSERT(count == XFS_IALLOC_INODES(mp));
+ ASSERT(length == XFS_IALLOC_BLOCKS(mp));
+ if (count != XFS_IALLOC_INODES(mp) ||
+ length != XFS_IALLOC_BLOCKS(mp)) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+ return EINVAL;
+ }
+
+ /*
+ * Inode buffers can be freed. Do not replay the inode initialisation as
+ * we could be overwriting something written after this inode buffer was
+ * cancelled.
+ *
+ * XXX: we need to iterate all buffers and only init those that are not
+ * cancelled. I think that a more fine grained factoring of
+ * xfs_ialloc_inode_init may be appropriate here to enable this to be
+ * done easily.
+ */
+ if (xlog_check_buffer_cancelled(log,
+ XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+ return 0;
+
+ xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
+ be32_to_cpu(icl->icl_gen));
+ return 0;
+}
+
+/*
* Free up any resources allocated by the transaction
*
* Remember that EFIs, EFDs, and IUNLINKs are handled later.
@@ -3023,6 +3124,7 @@ xlog_recover_commit_pass1(
case XFS_LI_EFI:
case XFS_LI_EFD:
case XFS_LI_DQUOT:
+ case XFS_LI_ICREATE:
/* nothing to do in pass 1 */
return 0;
default:
@@ -3053,6 +3155,8 @@ xlog_recover_commit_pass2(
return xlog_recover_efd_pass2(log, item);
case XFS_LI_DQUOT:
return xlog_recover_dquot_pass2(log, buffer_list, item);
+ case XFS_LI_ICREATE:
+ return xlog_recover_do_icreate_pass2(log, buffer_list, item);
case XFS_LI_QUOTAOFF:
/* nothing to do in pass2 */
return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e8e310c05097..2b0ba3581656 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -336,6 +336,14 @@ xfs_mount_validate_sb(
return XFS_ERROR(EWRONGFS);
}
+ if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
+ (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+ XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
+ xfs_notice(mp,
+"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
/*
* Version 5 superblock feature mask validation. Reject combinations the
* kernel cannot support up front before checking anything else. For
@@ -561,6 +569,18 @@ out_unwind:
return error;
}
+static void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+ if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+ if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+ sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+}
+
void
xfs_sb_from_disk(
struct xfs_sb *to,
@@ -622,6 +642,35 @@ xfs_sb_from_disk(
to->sb_lsn = be64_to_cpu(from->sb_lsn);
}
+static inline void
+xfs_sb_quota_to_disk(
+ xfs_dsb_t *to,
+ xfs_sb_t *from,
+ __int64_t *fields)
+{
+ __uint16_t qflags = from->sb_qflags;
+
+ if (*fields & XFS_SB_QFLAGS) {
+ /*
+ * The in-core version of sb_qflags do not have
+ * XFS_OQUOTA_* flags, whereas the on-disk version
+ * does. So, convert incore XFS_{PG}QUOTA_* flags
+ * to on-disk XFS_OQUOTA_* flags.
+ */
+ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+ XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+ if (from->sb_qflags &
+ (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+ qflags |= XFS_OQUOTA_ENFD;
+ if (from->sb_qflags &
+ (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+ qflags |= XFS_OQUOTA_CHKD;
+ to->sb_qflags = cpu_to_be16(qflags);
+ *fields &= ~XFS_SB_QFLAGS;
+ }
+}
+
/*
* Copy in core superblock to ondisk one.
*
@@ -643,6 +692,7 @@ xfs_sb_to_disk(
if (!fields)
return;
+ xfs_sb_quota_to_disk(to, from, &fields);
while (fields) {
f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
first = xfs_sb_info[f].offset;
@@ -835,6 +885,7 @@ reread:
*/
xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+ xfs_sb_quota_from_disk(&mp->m_sb);
/*
* We must be able to do sector-sized and sector-aligned IO.
*/
@@ -987,42 +1038,27 @@ xfs_update_alignment(xfs_mount_t *mp)
*/
if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
(BBTOB(mp->m_swidth) & mp->m_blockmask)) {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "(sunit/swidth vs. blocksize)");
- return XFS_ERROR(EINVAL);
- }
- mp->m_dalign = mp->m_swidth = 0;
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. blocksize(%d)",
+ sbp->sb_blocksize);
+ return XFS_ERROR(EINVAL);
} else {
/*
* Convert the stripe unit and width to FSBs.
*/
mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "(sunit/swidth vs. ag size)");
- return XFS_ERROR(EINVAL);
- }
xfs_warn(mp,
- "stripe alignment turned off: sunit(%d)/swidth(%d) "
- "incompatible with agsize(%d)",
- mp->m_dalign, mp->m_swidth,
- sbp->sb_agblocks);
-
- mp->m_dalign = 0;
- mp->m_swidth = 0;
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ sbp->sb_agblocks);
+ return XFS_ERROR(EINVAL);
} else if (mp->m_dalign) {
mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
} else {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "sunit(%d) less than bsize(%d)",
- mp->m_dalign,
- mp->m_blockmask +1);
- return XFS_ERROR(EINVAL);
- }
- mp->m_swidth = 0;
+ xfs_warn(mp,
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, sbp->sb_blocksize);
+ return XFS_ERROR(EINVAL);
}
}
@@ -1039,6 +1075,10 @@ xfs_update_alignment(xfs_mount_t *mp)
sbp->sb_width = mp->m_swidth;
mp->m_update_flags |= XFS_SB_WIDTH;
}
+ } else {
+ xfs_warn(mp,
+ "cannot change alignment: superblock does not support data alignment");
+ return XFS_ERROR(EINVAL);
}
} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
xfs_sb_version_hasdalign(&mp->m_sb)) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b004cecdfb04..4e374d4a9189 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -192,8 +192,6 @@ typedef struct xfs_mount {
xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
uint m_chsize; /* size of next field */
- struct xfs_chash *m_chash; /* fs private inode per-cluster
- * hash table */
atomic_t m_active_trans; /* number trans frozen */
#ifdef HAVE_PERCPU_SB
xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -229,8 +227,6 @@ typedef struct xfs_mount {
operations, typically for
disk errors in metadata */
#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
-#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
- user */
#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
allocations */
#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b75c9bb6e71e..d320794d03ce 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -70,7 +70,7 @@ xfs_qm_dquot_walk(
void *data)
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
uint32_t next_index;
int last_error = 0;
int skipped;
@@ -137,6 +137,7 @@ xfs_qm_dqpurge(
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
xfs_dqlock(dqp);
if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
@@ -145,8 +146,7 @@ xfs_qm_dqpurge(
}
/*
- * If this quota has a group hint attached, prepare for releasing it
- * now.
+ * If this quota has a hint attached, prepare for releasing it now.
*/
gdqp = dqp->q_gdquot;
if (gdqp) {
@@ -154,6 +154,12 @@ xfs_qm_dqpurge(
dqp->q_gdquot = NULL;
}
+ pdqp = dqp->q_pdquot;
+ if (pdqp) {
+ xfs_dqlock(pdqp);
+ dqp->q_pdquot = NULL;
+ }
+
dqp->dq_flags |= XFS_DQ_FREEING;
xfs_dqflock(dqp);
@@ -189,7 +195,7 @@ xfs_qm_dqpurge(
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
- radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
be32_to_cpu(dqp->q_core.d_id));
qi->qi_dquots--;
@@ -208,6 +214,8 @@ xfs_qm_dqpurge(
if (gdqp)
xfs_qm_dqput(gdqp);
+ if (pdqp)
+ xfs_qm_dqput(pdqp);
return 0;
}
@@ -299,8 +307,10 @@ xfs_qm_mount_quotas(
*/
if (!XFS_IS_UQUOTA_ON(mp))
mp->m_qflags &= ~XFS_UQUOTA_CHKD;
- if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
- mp->m_qflags &= ~XFS_OQUOTA_CHKD;
+ if (!XFS_IS_GQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+ if (!XFS_IS_PQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_PQUOTA_CHKD;
write_changes:
/*
@@ -362,6 +372,10 @@ xfs_qm_unmount_quotas(
IRELE(mp->m_quotainfo->qi_gquotaip);
mp->m_quotainfo->qi_gquotaip = NULL;
}
+ if (mp->m_quotainfo->qi_pquotaip) {
+ IRELE(mp->m_quotainfo->qi_pquotaip);
+ mp->m_quotainfo->qi_pquotaip = NULL;
+ }
}
}
@@ -408,7 +422,10 @@ xfs_qm_dqattach_one(
* be reclaimed as long as we have a ref from inode and we
* hold the ilock.
*/
- dqp = udqhint->q_gdquot;
+ if (type == XFS_DQ_GROUP)
+ dqp = udqhint->q_gdquot;
+ else
+ dqp = udqhint->q_pdquot;
if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
ASSERT(*IO_idqpp == NULL);
@@ -451,28 +468,42 @@ xfs_qm_dqattach_one(
/*
- * Given a udquot and gdquot, attach a ptr to the group dquot in the
- * udquot as a hint for future lookups.
+ * Given a udquot and group/project type, attach the group/project
+ * dquot pointer to the udquot as a hint for future lookups.
*/
STATIC void
-xfs_qm_dqattach_grouphint(
- xfs_dquot_t *udq,
- xfs_dquot_t *gdq)
+xfs_qm_dqattach_hint(
+ struct xfs_inode *ip,
+ int type)
{
- xfs_dquot_t *tmp;
+ struct xfs_dquot **dqhintp;
+ struct xfs_dquot *dqp;
+ struct xfs_dquot *udq = ip->i_udquot;
+
+ ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
xfs_dqlock(udq);
- tmp = udq->q_gdquot;
- if (tmp) {
- if (tmp == gdq)
+ if (type == XFS_DQ_GROUP) {
+ dqp = ip->i_gdquot;
+ dqhintp = &udq->q_gdquot;
+ } else {
+ dqp = ip->i_pdquot;
+ dqhintp = &udq->q_pdquot;
+ }
+
+ if (*dqhintp) {
+ struct xfs_dquot *tmp;
+
+ if (*dqhintp == dqp)
goto done;
- udq->q_gdquot = NULL;
+ tmp = *dqhintp;
+ *dqhintp = NULL;
xfs_qm_dqrele(tmp);
}
- udq->q_gdquot = xfs_qm_dqhold(gdq);
+ *dqhintp = xfs_qm_dqhold(dqp);
done:
xfs_dqunlock(udq);
}
@@ -489,8 +520,7 @@ xfs_qm_need_dqattach(
return false;
if (!XFS_NOT_DQATTACHED(mp, ip))
return false;
- if (ip->i_ino == mp->m_sb.sb_uquotino ||
- ip->i_ino == mp->m_sb.sb_gquotino)
+ if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return false;
return true;
}
@@ -526,12 +556,8 @@ xfs_qm_dqattach_locked(
}
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (XFS_IS_OQUOTA_ON(mp)) {
- error = XFS_IS_GQUOTA_ON(mp) ?
- xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
- flags & XFS_QMOPT_DQALLOC,
- ip->i_udquot, &ip->i_gdquot) :
- xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+ if (XFS_IS_GQUOTA_ON(mp)) {
+ error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
flags & XFS_QMOPT_DQALLOC,
ip->i_udquot, &ip->i_gdquot);
/*
@@ -543,14 +569,28 @@ xfs_qm_dqattach_locked(
nquotas++;
}
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (XFS_IS_PQUOTA_ON(mp)) {
+ error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+ flags & XFS_QMOPT_DQALLOC,
+ ip->i_udquot, &ip->i_pdquot);
+ /*
+ * Don't worry about the udquot that we may have
+ * attached above. It'll get detached, if not already.
+ */
+ if (error)
+ goto done;
+ nquotas++;
+ }
+
/*
- * Attach this group quota to the user quota as a hint.
+ * Attach this group/project quota to the user quota as a hint.
* This WON'T, in general, result in a thrash.
*/
- if (nquotas == 2) {
+ if (nquotas > 1 && ip->i_udquot) {
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_udquot);
- ASSERT(ip->i_gdquot);
+ ASSERT(ip->i_gdquot || !XFS_IS_GQUOTA_ON(mp));
+ ASSERT(ip->i_pdquot || !XFS_IS_PQUOTA_ON(mp));
/*
* We do not have i_udquot locked at this point, but this check
@@ -559,7 +599,10 @@ xfs_qm_dqattach_locked(
* succeed in general.
*/
if (ip->i_udquot->q_gdquot != ip->i_gdquot)
- xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
+ xfs_qm_dqattach_hint(ip, XFS_DQ_GROUP);
+
+ if (ip->i_udquot->q_pdquot != ip->i_pdquot)
+ xfs_qm_dqattach_hint(ip, XFS_DQ_PROJ);
}
done:
@@ -567,8 +610,10 @@ xfs_qm_dqattach_locked(
if (!error) {
if (XFS_IS_UQUOTA_ON(mp))
ASSERT(ip->i_udquot);
- if (XFS_IS_OQUOTA_ON(mp))
+ if (XFS_IS_GQUOTA_ON(mp))
ASSERT(ip->i_gdquot);
+ if (XFS_IS_PQUOTA_ON(mp))
+ ASSERT(ip->i_pdquot);
}
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
#endif
@@ -601,13 +646,12 @@ void
xfs_qm_dqdetach(
xfs_inode_t *ip)
{
- if (!(ip->i_udquot || ip->i_gdquot))
+ if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot))
return;
trace_xfs_dquot_dqdetach(ip);
- ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
- ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
+ ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
if (ip->i_udquot) {
xfs_qm_dqrele(ip->i_udquot);
ip->i_udquot = NULL;
@@ -616,6 +660,10 @@ xfs_qm_dqdetach(
xfs_qm_dqrele(ip->i_gdquot);
ip->i_gdquot = NULL;
}
+ if (ip->i_pdquot) {
+ xfs_qm_dqrele(ip->i_pdquot);
+ ip->i_pdquot = NULL;
+ }
}
int
@@ -660,6 +708,7 @@ xfs_qm_init_quotainfo(
INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
mutex_init(&qinf->qi_tree_lock);
INIT_LIST_HEAD(&qinf->qi_lru_list);
@@ -761,6 +810,10 @@ xfs_qm_destroy_quotainfo(
IRELE(qi->qi_gquotaip);
qi->qi_gquotaip = NULL;
}
+ if (qi->qi_pquotaip) {
+ IRELE(qi->qi_pquotaip);
+ qi->qi_pquotaip = NULL;
+ }
mutex_destroy(&qi->qi_quotaofflock);
kmem_free(qi);
mp->m_quotainfo = NULL;
@@ -1152,7 +1205,7 @@ xfs_qm_dqusage_adjust(
* rootino must have its resources accounted for, not so with the quota
* inodes.
*/
- if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+ if (xfs_is_quota_inode(&mp->m_sb, ino)) {
*res = BULKSTAT_RV_NOTHING;
return XFS_ERROR(EINVAL);
}
@@ -1262,19 +1315,21 @@ int
xfs_qm_quotacheck(
xfs_mount_t *mp)
{
- int done, count, error, error2;
- xfs_ino_t lastino;
- size_t structsz;
- xfs_inode_t *uip, *gip;
- uint flags;
- LIST_HEAD (buffer_list);
+ int done, count, error, error2;
+ xfs_ino_t lastino;
+ size_t structsz;
+ uint flags;
+ LIST_HEAD (buffer_list);
+ struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
+ struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
+ struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip;
count = INT_MAX;
structsz = 1;
lastino = 0;
flags = 0;
- ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
+ ASSERT(uip || gip || pip);
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1284,7 +1339,6 @@ xfs_qm_quotacheck(
* their counters to zero. We need a clean slate.
* We don't log our changes till later.
*/
- uip = mp->m_quotainfo->qi_uquotaip;
if (uip) {
error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
&buffer_list);
@@ -1293,14 +1347,20 @@ xfs_qm_quotacheck(
flags |= XFS_UQUOTA_CHKD;
}
- gip = mp->m_quotainfo->qi_gquotaip;
if (gip) {
- error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
+ error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA,
&buffer_list);
if (error)
goto error_return;
- flags |= XFS_OQUOTA_CHKD;
+ flags |= XFS_GQUOTA_CHKD;
+ }
+
+ if (pip) {
+ error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA,
+ &buffer_list);
+ if (error)
+ goto error_return;
+ flags |= XFS_PQUOTA_CHKD;
}
do {
@@ -1395,15 +1455,14 @@ STATIC int
xfs_qm_init_quotainos(
xfs_mount_t *mp)
{
- xfs_inode_t *uip, *gip;
- int error;
- __int64_t sbflags;
- uint flags;
+ struct xfs_inode *uip = NULL;
+ struct xfs_inode *gip = NULL;
+ struct xfs_inode *pip = NULL;
+ int error;
+ __int64_t sbflags = 0;
+ uint flags = 0;
ASSERT(mp->m_quotainfo);
- uip = gip = NULL;
- sbflags = 0;
- flags = 0;
/*
* Get the uquota and gquota inodes
@@ -1412,19 +1471,27 @@ xfs_qm_init_quotainos(
if (XFS_IS_UQUOTA_ON(mp) &&
mp->m_sb.sb_uquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_uquotino > 0);
- if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip)))
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+ 0, 0, &uip);
+ if (error)
return XFS_ERROR(error);
}
- if (XFS_IS_OQUOTA_ON(mp) &&
+ if (XFS_IS_GQUOTA_ON(mp) &&
mp->m_sb.sb_gquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_gquotino > 0);
- if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip))) {
- if (uip)
- IRELE(uip);
- return XFS_ERROR(error);
- }
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+ 0, 0, &gip);
+ if (error)
+ goto error_rele;
+ }
+ /* XXX: Use gquotino for now */
+ if (XFS_IS_PQUOTA_ON(mp) &&
+ mp->m_sb.sb_gquotino != NULLFSINO) {
+ ASSERT(mp->m_sb.sb_gquotino > 0);
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+ 0, 0, &pip);
+ if (error)
+ goto error_rele;
}
} else {
flags |= XFS_QMOPT_SBVERSION;
@@ -1433,36 +1500,52 @@ xfs_qm_init_quotainos(
}
/*
- * Create the two inodes, if they don't exist already. The changes
+ * Create the three inodes, if they don't exist already. The changes
* made above will get added to a transaction and logged in one of
* the qino_alloc calls below. If the device is readonly,
* temporarily switch to read-write to do this.
*/
if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
- if ((error = xfs_qm_qino_alloc(mp, &uip,
+ error = xfs_qm_qino_alloc(mp, &uip,
sbflags | XFS_SB_UQUOTINO,
- flags | XFS_QMOPT_UQUOTA)))
- return XFS_ERROR(error);
+ flags | XFS_QMOPT_UQUOTA);
+ if (error)
+ goto error_rele;
flags &= ~XFS_QMOPT_SBVERSION;
}
- if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) {
- flags |= (XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+ if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
error = xfs_qm_qino_alloc(mp, &gip,
- sbflags | XFS_SB_GQUOTINO, flags);
- if (error) {
- if (uip)
- IRELE(uip);
+ sbflags | XFS_SB_GQUOTINO,
+ flags | XFS_QMOPT_GQUOTA);
+ if (error)
+ goto error_rele;
- return XFS_ERROR(error);
- }
+ flags &= ~XFS_QMOPT_SBVERSION;
+ }
+ if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
+ /* XXX: Use XFS_SB_GQUOTINO for now */
+ error = xfs_qm_qino_alloc(mp, &pip,
+ sbflags | XFS_SB_GQUOTINO,
+ flags | XFS_QMOPT_PQUOTA);
+ if (error)
+ goto error_rele;
}
mp->m_quotainfo->qi_uquotaip = uip;
mp->m_quotainfo->qi_gquotaip = gip;
+ mp->m_quotainfo->qi_pquotaip = pip;
return 0;
+
+error_rele:
+ if (uip)
+ IRELE(uip);
+ if (gip)
+ IRELE(gip);
+ if (pip)
+ IRELE(pip);
+ return XFS_ERROR(error);
}
STATIC void
@@ -1473,7 +1556,7 @@ xfs_qm_dqfree_one(
struct xfs_quotainfo *qi = mp->m_quotainfo;
mutex_lock(&qi->qi_tree_lock);
- radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
be32_to_cpu(dqp->q_core.d_id));
qi->qi_dquots--;
@@ -1656,10 +1739,13 @@ xfs_qm_vop_dqalloc(
prid_t prid,
uint flags,
struct xfs_dquot **O_udqpp,
- struct xfs_dquot **O_gdqpp)
+ struct xfs_dquot **O_gdqpp,
+ struct xfs_dquot **O_pdqpp)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_dquot *uq, *gq;
+ struct xfs_dquot *uq = NULL;
+ struct xfs_dquot *gq = NULL;
+ struct xfs_dquot *pq = NULL;
int error;
uint lockflags;
@@ -1684,7 +1770,6 @@ xfs_qm_vop_dqalloc(
}
}
- uq = gq = NULL;
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
if (ip->i_d.di_uid != uid) {
/*
@@ -1697,11 +1782,12 @@ xfs_qm_vop_dqalloc(
* holding ilock.
*/
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
XFS_DQ_USER,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &uq))) {
+ &uq);
+ if (error) {
ASSERT(error != ENOENT);
return error;
}
@@ -1723,15 +1809,14 @@ xfs_qm_vop_dqalloc(
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
if (ip->i_d.di_gid != gid) {
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
XFS_DQ_GROUP,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &gq))) {
- if (uq)
- xfs_qm_dqrele(uq);
+ &gq);
+ if (error) {
ASSERT(error != ENOENT);
- return error;
+ goto error_rele;
}
xfs_dqunlock(gq);
lockflags = XFS_ILOCK_SHARED;
@@ -1740,25 +1825,25 @@ xfs_qm_vop_dqalloc(
ASSERT(ip->i_gdquot);
gq = xfs_qm_dqhold(ip->i_gdquot);
}
- } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
+ }
+ if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
if (xfs_get_projid(ip) != prid) {
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
XFS_DQ_PROJ,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &gq))) {
- if (uq)
- xfs_qm_dqrele(uq);
+ &pq);
+ if (error) {
ASSERT(error != ENOENT);
- return (error);
+ goto error_rele;
}
- xfs_dqunlock(gq);
+ xfs_dqunlock(pq);
lockflags = XFS_ILOCK_SHARED;
xfs_ilock(ip, lockflags);
} else {
- ASSERT(ip->i_gdquot);
- gq = xfs_qm_dqhold(ip->i_gdquot);
+ ASSERT(ip->i_pdquot);
+ pq = xfs_qm_dqhold(ip->i_pdquot);
}
}
if (uq)
@@ -1773,7 +1858,18 @@ xfs_qm_vop_dqalloc(
*O_gdqpp = gq;
else if (gq)
xfs_qm_dqrele(gq);
+ if (O_pdqpp)
+ *O_pdqpp = pq;
+ else if (pq)
+ xfs_qm_dqrele(pq);
return 0;
+
+error_rele:
+ if (gq)
+ xfs_qm_dqrele(gq);
+ if (uq)
+ xfs_qm_dqrele(uq);
+ return error;
}
/*
@@ -1821,29 +1917,34 @@ xfs_qm_vop_chown(
*/
int
xfs_qm_vop_chown_reserve(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dquot_t *udqp,
- xfs_dquot_t *gdqp,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ uint flags)
{
- xfs_mount_t *mp = ip->i_mount;
- uint delblks, blkflags, prjflags = 0;
- xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
- int error;
+ struct xfs_mount *mp = ip->i_mount;
+ uint delblks, blkflags, prjflags = 0;
+ struct xfs_dquot *udq_unres = NULL;
+ struct xfs_dquot *gdq_unres = NULL;
+ struct xfs_dquot *pdq_unres = NULL;
+ struct xfs_dquot *udq_delblks = NULL;
+ struct xfs_dquot *gdq_delblks = NULL;
+ struct xfs_dquot *pdq_delblks = NULL;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
delblks = ip->i_delayed_blks;
- delblksudq = delblksgdq = unresudq = unresgdq = NULL;
blkflags = XFS_IS_REALTIME_INODE(ip) ?
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
if (XFS_IS_UQUOTA_ON(mp) && udqp &&
ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
- delblksudq = udqp;
+ udq_delblks = udqp;
/*
* If there are delayed allocation blocks, then we have to
* unreserve those from the old dquot, and add them to the
@@ -1851,29 +1952,34 @@ xfs_qm_vop_chown_reserve(
*/
if (delblks) {
ASSERT(ip->i_udquot);
- unresudq = ip->i_udquot;
+ udq_unres = ip->i_udquot;
}
}
- if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
- if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
- xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
- prjflags = XFS_QMOPT_ENOSPC;
-
- if (prjflags ||
- (XFS_IS_GQUOTA_ON(ip->i_mount) &&
- ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
- delblksgdq = gdqp;
- if (delblks) {
- ASSERT(ip->i_gdquot);
- unresgdq = ip->i_gdquot;
- }
+ if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
+ ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
+ gdq_delblks = gdqp;
+ if (delblks) {
+ ASSERT(ip->i_gdquot);
+ gdq_unres = ip->i_gdquot;
}
}
- if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
- flags | blkflags | prjflags)))
- return (error);
+ if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
+ xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
+ prjflags = XFS_QMOPT_ENOSPC;
+ pdq_delblks = pdqp;
+ if (delblks) {
+ ASSERT(ip->i_pdquot);
+ pdq_unres = ip->i_pdquot;
+ }
+ }
+
+ error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
+ udq_delblks, gdq_delblks, pdq_delblks,
+ ip->i_d.di_nblocks, 1,
+ flags | blkflags | prjflags);
+ if (error)
+ return error;
/*
* Do the delayed blks reservations/unreservations now. Since, these
@@ -1885,15 +1991,17 @@ xfs_qm_vop_chown_reserve(
/*
* Do the reservations first. Unreservation can't fail.
*/
- ASSERT(delblksudq || delblksgdq);
- ASSERT(unresudq || unresgdq);
- if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
- flags | blkflags | prjflags)))
- return (error);
+ ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
+ ASSERT(udq_unres || gdq_unres || pdq_unres);
+ error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+ udq_delblks, gdq_delblks, pdq_delblks,
+ (xfs_qcnt_t)delblks, 0,
+ flags | blkflags | prjflags);
+ if (error)
+ return error;
xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
- blkflags);
+ udq_unres, gdq_unres, pdq_unres,
+ -((xfs_qcnt_t)delblks), 0, blkflags);
}
return (0);
@@ -1932,7 +2040,8 @@ xfs_qm_vop_create_dqattach(
struct xfs_trans *tp,
struct xfs_inode *ip,
struct xfs_dquot *udqp,
- struct xfs_dquot *gdqp)
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -1952,13 +2061,18 @@ xfs_qm_vop_create_dqattach(
}
if (gdqp) {
ASSERT(ip->i_gdquot == NULL);
- ASSERT(XFS_IS_OQUOTA_ON(mp));
- ASSERT((XFS_IS_GQUOTA_ON(mp) ?
- ip->i_d.di_gid : xfs_get_projid(ip)) ==
- be32_to_cpu(gdqp->q_core.d_id));
-
+ ASSERT(XFS_IS_GQUOTA_ON(mp));
+ ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
ip->i_gdquot = xfs_qm_dqhold(gdqp);
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
+ if (pdqp) {
+ ASSERT(ip->i_pdquot == NULL);
+ ASSERT(XFS_IS_PQUOTA_ON(mp));
+ ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
+
+ ip->i_pdquot = xfs_qm_dqhold(pdqp);
+ xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
+ }
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5d16a6e6900f..579d6a02a5b6 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -44,9 +44,11 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
typedef struct xfs_quotainfo {
struct radix_tree_root qi_uquota_tree;
struct radix_tree_root qi_gquota_tree;
+ struct radix_tree_root qi_pquota_tree;
struct mutex qi_tree_lock;
- xfs_inode_t *qi_uquotaip; /* user quota inode */
- xfs_inode_t *qi_gquotaip; /* group quota inode */
+ struct xfs_inode *qi_uquotaip; /* user quota inode */
+ struct xfs_inode *qi_gquotaip; /* group quota inode */
+ struct xfs_inode *qi_pquotaip; /* project quota inode */
struct list_head qi_lru_list;
struct mutex qi_lru_lock;
int qi_lru_count;
@@ -69,30 +71,66 @@ typedef struct xfs_quotainfo {
struct shrinker qi_shrinker;
} xfs_quotainfo_t;
-#define XFS_DQUOT_TREE(qi, type) \
- ((type & XFS_DQ_USER) ? \
- &((qi)->qi_uquota_tree) : \
- &((qi)->qi_gquota_tree))
+static inline struct radix_tree_root *
+xfs_dquot_tree(
+ struct xfs_quotainfo *qi,
+ int type)
+{
+ switch (type) {
+ case XFS_DQ_USER:
+ return &qi->qi_uquota_tree;
+ case XFS_DQ_GROUP:
+ return &qi->qi_gquota_tree;
+ case XFS_DQ_PROJ:
+ return &qi->qi_pquota_tree;
+ default:
+ ASSERT(0);
+ }
+ return NULL;
+}
+static inline struct xfs_inode *
+xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
+{
+ switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
+ case XFS_DQ_USER:
+ return dqp->q_mount->m_quotainfo->qi_uquotaip;
+ case XFS_DQ_GROUP:
+ return dqp->q_mount->m_quotainfo->qi_gquotaip;
+ case XFS_DQ_PROJ:
+ return dqp->q_mount->m_quotainfo->qi_pquotaip;
+ default:
+ ASSERT(0);
+ }
+ return NULL;
+}
extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
unsigned int nbblks);
-extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
-extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
- xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
-extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
-extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
+extern void xfs_trans_mod_dquot(struct xfs_trans *,
+ struct xfs_dquot *, uint, long);
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+ struct xfs_mount *, struct xfs_dquot *,
+ struct xfs_dquot *, struct xfs_dquot *,
+ long, long, uint);
+extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
+extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
/*
- * We keep the usr and grp dquots separately so that locking will be easier
- * to do at commit time. All transactions that we know of at this point
+ * We keep the usr, grp, and prj dquots separately so that locking will be
+ * easier to do at commit time. All transactions that we know of at this point
* affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
*/
+enum {
+ XFS_QM_TRANS_USR = 0,
+ XFS_QM_TRANS_GRP,
+ XFS_QM_TRANS_PRJ,
+ XFS_QM_TRANS_DQTYPES
+};
#define XFS_QM_TRANS_MAXDQS 2
-typedef struct xfs_dquot_acct {
- xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
- xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
-} xfs_dquot_acct_t;
+struct xfs_dquot_acct {
+ struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
+};
/*
* Users are allowed to have a usage exceeding their softlimit for
@@ -106,22 +144,23 @@ typedef struct xfs_dquot_acct {
#define XFS_QM_IWARNLIMIT 5
#define XFS_QM_RTBWARNLIMIT 5
-extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int xfs_qm_quotacheck(xfs_mount_t *);
-extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
+extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
+extern int xfs_qm_quotacheck(struct xfs_mount *);
+extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
/* dquot stuff */
-extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint);
-extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
+extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
+extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */
-extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
+extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
+extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
+ uint, struct fs_disk_quota *);
extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
-extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+ struct fs_disk_quota *);
+extern int xfs_qm_scall_getqstat(struct xfs_mount *,
+ struct fs_quota_stat *);
+extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
+extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2d02eac1c9a8..437a52d91f6d 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -112,16 +112,16 @@ xfs_qm_newmount(
if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
(!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
- (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
- (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) ||
(gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
- (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
+ (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) ||
+ (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
+ (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) &&
xfs_dev_is_read_only(mp, "changing quota state")) {
xfs_warn(mp, "please mount with%s%s%s%s.",
(!quotaondisk ? "out quota" : ""),
(uquotaondisk ? " usrquota" : ""),
- (pquotaondisk ? " prjquota" : ""),
- (gquotaondisk ? " grpquota" : ""));
+ (gquotaondisk ? " grpquota" : ""),
+ (pquotaondisk ? " prjquota" : ""));
return XFS_ERROR(EPERM);
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 6cdf6ffc36a1..e4f8b2d6f38b 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -117,11 +117,12 @@ xfs_qm_scall_quotaoff(
}
if (flags & XFS_GQUOTA_ACCT) {
dqtype |= XFS_QMOPT_GQUOTA;
- flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+ flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
inactivate_flags |= XFS_GQUOTA_ACTIVE;
- } else if (flags & XFS_PQUOTA_ACCT) {
+ }
+ if (flags & XFS_PQUOTA_ACCT) {
dqtype |= XFS_QMOPT_PQUOTA;
- flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+ flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
inactivate_flags |= XFS_PQUOTA_ACTIVE;
}
@@ -198,10 +199,9 @@ xfs_qm_scall_quotaoff(
}
/*
- * If quotas is completely disabled, close shop.
+ * If all quotas are completely turned off, close shop.
*/
- if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
- ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
+ if (mp->m_qflags == 0) {
mutex_unlock(&q->qi_quotaofflock);
xfs_qm_destroy_quotainfo(mp);
return (0);
@@ -214,10 +214,14 @@ xfs_qm_scall_quotaoff(
IRELE(q->qi_uquotaip);
q->qi_uquotaip = NULL;
}
- if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
+ if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) {
IRELE(q->qi_gquotaip);
q->qi_gquotaip = NULL;
}
+ if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) {
+ IRELE(q->qi_pquotaip);
+ q->qi_pquotaip = NULL;
+ }
out_unlock:
mutex_unlock(&q->qi_quotaofflock);
@@ -335,14 +339,14 @@ xfs_qm_scall_quotaon(
* quota acct on ondisk without m_qflags' knowing.
*/
if (((flags & XFS_UQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
- (flags & XFS_UQUOTA_ENFD))
- ||
+ (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+ (flags & XFS_UQUOTA_ENFD)) ||
+ ((flags & XFS_GQUOTA_ACCT) == 0 &&
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+ (flags & XFS_GQUOTA_ENFD)) ||
((flags & XFS_PQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
- (flags & XFS_GQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
- (flags & XFS_OQUOTA_ENFD))) {
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+ (flags & XFS_PQUOTA_ENFD))) {
xfs_debug(mp,
"%s: Can't enforce without acct, flags=%x sbflags=%x\n",
__func__, flags, mp->m_sb.sb_qflags);
@@ -407,11 +411,11 @@ xfs_qm_scall_getqstat(
struct fs_quota_stat *out)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_inode *uip, *gip;
- bool tempuqip, tempgqip;
+ struct xfs_inode *uip = NULL;
+ struct xfs_inode *gip = NULL;
+ bool tempuqip = false;
+ bool tempgqip = false;
- uip = gip = NULL;
- tempuqip = tempgqip = false;
memset(out, 0, sizeof(fs_quota_stat_t));
out->qs_version = FS_QSTAT_VERSION;
@@ -776,9 +780,12 @@ xfs_qm_scall_getquota(
* gets turned off. No need to confuse the user level code,
* so return zeroes in that case.
*/
- if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
- (!XFS_IS_OQUOTA_ENFORCED(mp) &&
- (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+ if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_USER) ||
+ (!XFS_IS_GQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_GROUP) ||
+ (!XFS_IS_PQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_PROJ)) {
dst->d_btimer = 0;
dst->d_itimer = 0;
dst->d_rtbtimer = 0;
@@ -786,8 +793,8 @@ xfs_qm_scall_getquota(
#ifdef DEBUG
if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
- (XFS_IS_OQUOTA_ENFORCED(mp) &&
- (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
+ (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
+ (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
dst->d_id != 0) {
if ((dst->d_bcount > dst->d_blk_softlimit) &&
(dst->d_blk_softlimit > 0)) {
@@ -833,16 +840,16 @@ xfs_qm_export_flags(
uflags = 0;
if (flags & XFS_UQUOTA_ACCT)
uflags |= FS_QUOTA_UDQ_ACCT;
- if (flags & XFS_PQUOTA_ACCT)
- uflags |= FS_QUOTA_PDQ_ACCT;
if (flags & XFS_GQUOTA_ACCT)
uflags |= FS_QUOTA_GDQ_ACCT;
+ if (flags & XFS_PQUOTA_ACCT)
+ uflags |= FS_QUOTA_PDQ_ACCT;
if (flags & XFS_UQUOTA_ENFD)
uflags |= FS_QUOTA_UDQ_ENFD;
- if (flags & (XFS_OQUOTA_ENFD)) {
- uflags |= (flags & XFS_GQUOTA_ACCT) ?
- FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
- }
+ if (flags & XFS_GQUOTA_ENFD)
+ uflags |= FS_QUOTA_GDQ_ENFD;
+ if (flags & XFS_PQUOTA_ENFD)
+ uflags |= FS_QUOTA_PDQ_ENFD;
return (uflags);
}
@@ -856,9 +863,11 @@ xfs_dqrele_inode(
{
/* skip quota inodes */
if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
- ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
+ ip == ip->i_mount->m_quotainfo->qi_gquotaip ||
+ ip == ip->i_mount->m_quotainfo->qi_pquotaip) {
ASSERT(ip->i_udquot == NULL);
ASSERT(ip->i_gdquot == NULL);
+ ASSERT(ip->i_pdquot == NULL);
return 0;
}
@@ -867,10 +876,14 @@ xfs_dqrele_inode(
xfs_qm_dqrele(ip->i_udquot);
ip->i_udquot = NULL;
}
- if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+ if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
xfs_qm_dqrele(ip->i_gdquot);
ip->i_gdquot = NULL;
}
+ if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
+ xfs_qm_dqrele(ip->i_pdquot);
+ ip->i_pdquot = NULL;
+ }
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c38068f26c55..b14f42c714b6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -108,11 +108,28 @@ typedef struct xfs_dqblk {
{ XFS_DQ_FREEING, "FREEING" }
/*
- * In the worst case, when both user and group quotas are on,
- * we can have a max of three dquots changing in a single transaction.
+ * We have the possibility of all three quota types being active at once, and
+ * hence free space modification requires modification of all three current
+ * dquots in a single transaction. For this case we need to have a reservation
+ * of at least 3 dquots.
+ *
+ * However, a chmod operation can change both UID and GID in a single
+ * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
+ * modified. Hence for this case we need to reserve space for at least 4 dquots.
+ *
+ * And in the worst case, there's a rename operation that can be modifying up to
+ * 4 inodes with dquots attached to them. In reality, the only inodes that can
+ * have their dquots modified are the source and destination directory inodes
+ * due to directory name creation and removal. That can require space allocation
+ * and/or freeing on both directory inodes, and hence all three dquots on each
+ * inode can be modified. And if the directories are world writeable, all the
+ * dquots can be unique and so 6 dquots can be modified....
+ *
+ * And, of course, we also need to take into account the dquot log format item
+ * used to describe each dquot.
*/
-#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3)
-
+#define XFS_DQUOT_LOGRES(mp) \
+ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
/*
* These are the structures used to lay out dquots and quotaoff
@@ -161,30 +178,42 @@ typedef struct xfs_qoff_logformat {
#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
+
+/*
* Quota Accounting/Enforcement flags
*/
#define XFS_ALL_QUOTA_ACCT \
(XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
+#define XFS_ALL_QUOTA_ENFD \
+ (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD \
+ (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
-#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
/*
* Incore only flags for quotaoff - these bits get cleared when quota(s)
* are in the process of getting turned off. These flags are in m_qflags but
* never in sb_qflags.
*/
-#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */
+#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
#define XFS_ALL_QUOTA_ACTIVE \
- (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
+ (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
/*
* Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
@@ -259,33 +288,24 @@ typedef struct xfs_qoff_logformat {
* we didn't have the inode locked, the appropriate dquot(s) will be
* attached atomically.
*/
-#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\
- (ip)->i_udquot == NULL) || \
- (XFS_IS_OQUOTA_ON(mp) && \
- (ip)->i_gdquot == NULL))
+#define XFS_NOT_DQATTACHED(mp, ip) \
+ ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \
+ (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
+ (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
#define XFS_QM_NEED_QUOTACHECK(mp) \
((XFS_IS_UQUOTA_ON(mp) && \
(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
(XFS_IS_GQUOTA_ON(mp) && \
- ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
- (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
(XFS_IS_PQUOTA_ON(mp) && \
- ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
- (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
-
-#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
-
-#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\
- XFS_GQUOTA_ACCT)
+ XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+ XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+ XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+ XFS_PQUOTA_CHKD)
/*
@@ -318,17 +338,18 @@ extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
struct xfs_inode *, long, long, uint);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
struct xfs_mount *, struct xfs_dquot *,
- struct xfs_dquot *, long, long, uint);
+ struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
- struct xfs_dquot **, struct xfs_dquot **);
+ struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **);
extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
- struct xfs_dquot *, struct xfs_dquot *);
+ struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
- struct xfs_dquot *, struct xfs_dquot *, uint);
+ struct xfs_dquot *, struct xfs_dquot *,
+ struct xfs_dquot *, uint);
extern int xfs_qm_dqattach(struct xfs_inode *, uint);
extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
extern void xfs_qm_dqdetach(struct xfs_inode *);
@@ -342,10 +363,12 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
#else
static inline int
xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
- uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
+ uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp,
+ struct xfs_dquot **pdqp)
{
*udqp = NULL;
*gdqp = NULL;
+ *pdqp = NULL;
return 0;
}
#define xfs_trans_dup_dqinfo(tp, tp2)
@@ -360,14 +383,15 @@ static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
}
static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
struct xfs_mount *mp, struct xfs_dquot *udqp,
- struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
+ long nblks, long nions, uint flags)
{
return 0;
}
-#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
+#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
#define xfs_qm_vop_rename_dqattach(it) (0)
#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
-#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0)
+#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
#define xfs_qm_dqattach(ip, fl) (0)
#define xfs_qm_dqattach_locked(ip, fl) (0)
#define xfs_qm_dqdetach(ip)
@@ -381,8 +405,8 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
-#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \
- xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
+#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \
+ xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
f | XFS_QMOPT_RES_REGBLKS)
extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 71926d630527..20e30f93b0c7 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -75,8 +75,10 @@ xfs_fs_set_xstate(
flags |= XFS_GQUOTA_ACCT;
if (uflags & FS_QUOTA_UDQ_ENFD)
flags |= XFS_UQUOTA_ENFD;
- if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
- flags |= XFS_OQUOTA_ENFD;
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ flags |= XFS_GQUOTA_ENFD;
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ flags |= XFS_PQUOTA_ENFD;
switch (op) {
case Q_XQUOTAON:
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 2de58a85833c..78f9e70b80c7 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -618,6 +618,12 @@ xfs_sb_has_incompat_log_feature(
return (sbp->sb_features_log_incompat & feature) != 0;
}
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+ return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino);
+}
+
/*
* end of superblock version macros
*/
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3033ba5e9762..1d68ffcdeaa7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -51,6 +51,7 @@
#include "xfs_inode_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_icreate_item.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -359,17 +360,17 @@ xfs_parseargs(
} else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
!strcmp(this_char, MNTOPT_PRJQUOTA)) {
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
- XFS_OQUOTA_ENFD);
+ XFS_PQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+ mp->m_qflags &= ~XFS_PQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
!strcmp(this_char, MNTOPT_GRPQUOTA)) {
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
- XFS_OQUOTA_ENFD);
+ XFS_GQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+ mp->m_qflags &= ~XFS_GQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
xfs_warn(mp,
"delaylog is the default now, option is deprecated.");
@@ -439,20 +440,15 @@ xfs_parseargs(
}
done:
- if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+ if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
/*
* At this point the superblock has not been read
* in, therefore we do not know the block size.
* Before the mount call ends we will convert
* these to FSBs.
*/
- if (dsunit) {
- mp->m_dalign = dsunit;
- mp->m_flags |= XFS_MOUNT_RETERR;
- }
-
- if (dswidth)
- mp->m_swidth = dswidth;
+ mp->m_dalign = dsunit;
+ mp->m_swidth = dswidth;
}
if (mp->m_logbufs != -1 &&
@@ -563,12 +559,12 @@ xfs_showargs(
/* Either project or group quotas can be active, not both */
if (mp->m_qflags & XFS_PQUOTA_ACCT) {
- if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ if (mp->m_qflags & XFS_PQUOTA_ENFD)
seq_puts(m, "," MNTOPT_PRJQUOTA);
else
seq_puts(m, "," MNTOPT_PQUOTANOENF);
} else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
- if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ if (mp->m_qflags & XFS_GQUOTA_ENFD)
seq_puts(m, "," MNTOPT_GRPQUOTA);
else
seq_puts(m, "," MNTOPT_GQUOTANOENF);
@@ -1136,8 +1132,8 @@ xfs_fs_statfs(
spin_unlock(&mp->m_sb_lock);
if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
- (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
+ ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
+ (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
xfs_qm_statvfs(ip, statp);
return 0;
}
@@ -1481,6 +1477,10 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
+ /* version 5 superblocks support inode version counters. */
+ if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+ sb->s_flags |= MS_I_VERSION;
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1655,9 +1655,15 @@ xfs_init_zones(void)
KM_ZONE_SPREAD, NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
+ xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
+ "xfs_icr");
+ if (!xfs_icreate_zone)
+ goto out_destroy_ili_zone;
return 0;
+ out_destroy_ili_zone:
+ kmem_zone_destroy(xfs_ili_zone);
out_destroy_inode_zone:
kmem_zone_destroy(xfs_inode_zone);
out_destroy_efi_zone:
@@ -1696,6 +1702,7 @@ xfs_destroy_zones(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_zone_destroy(xfs_icreate_zone);
kmem_zone_destroy(xfs_ili_zone);
kmem_zone_destroy(xfs_inode_zone);
kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 195a403e1522..f4895b662fcb 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -358,7 +358,9 @@ xfs_symlink(
int n;
xfs_buf_t *bp;
prid_t prid;
- struct xfs_dquot *udqp, *gdqp;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
uint resblks;
*ipp = NULL;
@@ -385,7 +387,7 @@ xfs_symlink(
* Make sure that we have allocated dquot(s) on disk.
*/
error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp);
if (error)
goto std_return;
@@ -426,7 +428,8 @@ xfs_symlink(
/*
* Reserve disk quota : blocks and inode.
*/
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+ pdqp, resblks, 1, 0);
if (error)
goto error_return;
@@ -464,7 +467,7 @@ xfs_symlink(
/*
* Also attach the dquot(s) to it, if applicable.
*/
- xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
if (resblks)
resblks -= XFS_IALLOC_SPACE_RES(mp);
@@ -562,6 +565,7 @@ xfs_symlink(
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
*ipp = ip;
return 0;
@@ -575,6 +579,7 @@ xfs_symlink(
xfs_trans_cancel(tp, cancel_flags);
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -585,7 +590,7 @@ xfs_symlink(
/*
* Free a symlink that has blocks associated with it.
*/
-int
+STATIC int
xfs_inactive_symlink_rmt(
xfs_inode_t *ip,
xfs_trans_t **tpp)
@@ -606,7 +611,7 @@ xfs_inactive_symlink_rmt(
tp = *tpp;
mp = ip->i_mount;
- ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
+ ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
/*
* We're freeing a symlink that has some
* blocks allocated to it. Free the
@@ -720,3 +725,47 @@ xfs_inactive_symlink_rmt(
error0:
return error;
}
+
+/*
+ * xfs_inactive_symlink - free a symlink
+ */
+int
+xfs_inactive_symlink(
+ struct xfs_inode *ip,
+ struct xfs_trans **tp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int pathlen;
+
+ trace_xfs_inactive_symlink(ip);
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ /*
+ * Zero length symlinks _can_ exist.
+ */
+ pathlen = (int)ip->i_d.di_size;
+ if (!pathlen)
+ return 0;
+
+ if (pathlen < 0 || pathlen > MAXPATHLEN) {
+ xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
+ __func__, (unsigned long long)ip->i_ino, pathlen);
+ ASSERT(0);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (ip->i_df.if_flags & XFS_IFINLINE) {
+ if (ip->i_df.if_bytes > 0)
+ xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
+ XFS_DATA_FORK);
+ ASSERT(ip->i_df.if_bytes == 0);
+ return 0;
+ }
+
+ /* remove the remote symlink */
+ return xfs_inactive_symlink_rmt(ip, tp);
+}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index b39398d2097c..374394880c01 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -60,7 +60,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, umode_t mode, struct xfs_inode **ipp);
int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_inactive_symlink_rmt(struct xfs_inode *ip, struct xfs_trans **tpp);
+int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
#endif /* __KERNEL__ */
#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 2801b5ce6cdb..1743b9f8e23d 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header;
#ifdef CONFIG_PROC_FS
STATIC int
xfs_stats_clear_proc_handler(
- ctl_table *ctl,
- int write,
- void __user *buffer,
- size_t *lenp,
- loff_t *ppos)
+ struct ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
{
int c, ret, *valp = ctl->data;
__uint32_t vn_active;
@@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler(
STATIC int
xfs_panic_mask_proc_handler(
- ctl_table *ctl,
- int write,
- void __user *buffer,
- size_t *lenp,
- loff_t *ppos)
+ struct ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
{
int ret, *valp = ctl->data;
@@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
-static ctl_table xfs_table[] = {
+static struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
@@ -227,7 +227,7 @@ static ctl_table xfs_table[] = {
{}
};
-static ctl_table xfs_dir_table[] = {
+static struct ctl_table xfs_dir_table[] = {
{
.procname = "xfs",
.mode = 0555,
@@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = {
{}
};
-static ctl_table xfs_root_table[] = {
+static struct ctl_table xfs_root_table[] = {
{
.procname = "fs",
.mode = 0555,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a04701de6bbd..47910e638c18 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \
TP_PROTO(struct xfs_buf_log_item *bip), \
TP_ARGS(bip))
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
@@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
DECLARE_EVENT_CLASS(xfs_lock_class,
TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
@@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss);
DEFINE_INODE_EVENT(xfs_getattr);
DEFINE_INODE_EVENT(xfs_setattr);
DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_inactive_symlink);
DEFINE_INODE_EVENT(xfs_alloc_file_space);
DEFINE_INODE_EVENT(xfs_free_file_space);
DEFINE_INODE_EVENT(xfs_readdir);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 2fd7c1ff1d21..35a229981354 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -234,71 +234,93 @@ xfs_calc_remove_reservation(
}
/*
- * For symlink we can modify:
+ * For create, break it in to the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+
+/*
+ * For create we can modify:
* the parent directory inode: inode size
* the new inode: inode size
- * the inode btree entry: 1 block
+ * the inode btree entry: block size
+ * the superblock for the nlink flag: sector size
* the directory btree: (max depth + v2) * dir block size
* the directory inode's bmap btree: (max depth + v2) * block size
- * the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ (uint)XFS_FSB_TO_B(mp, 1) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * For create we can allocate some inodes giving:
* the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the superblock for the nlink flag: sector size
* the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
* the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
*/
STATIC uint
-xfs_calc_symlink_reservation(
+xfs_calc_create_resv_alloc(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ mp->m_sb.sb_sectsize +
+ xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
- xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(1, 1024)),
- (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(mp->m_in_maxlevels,
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
- XFS_FSB_TO_B(mp, 1))));
+ MAX(xfs_calc_create_resv_alloc(mp),
+ xfs_calc_create_resv_modify(mp));
}
/*
- * For create we can modify:
- * the parent directory inode: inode size
- * the new inode: inode size
- * the inode btree entry: block size
- * the superblock for the nlink flag: sector size
- * the directory btree: (max depth + v2) * dir block size
- * the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
+ * For icreate we can allocate some inodes giving:
* the agi and agf of the ag getting the new inodes: 2 * sectorsize
* the superblock for the nlink flag: sector size
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
* the inode btree: max depth * blocksize
* the allocation btrees: 2 trees * (max depth - 1) * block size
*/
STATIC uint
-xfs_calc_create_reservation(
+xfs_calc_icreate_resv_alloc(
struct xfs_mount *mp)
{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ mp->m_sb.sb_sectsize +
+ xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
return XFS_DQUOT_LOGRES(mp) +
- MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
- xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
- (uint)XFS_FSB_TO_B(mp, 1) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
- mp->m_sb.sb_sectsize +
- xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(mp->m_in_maxlevels,
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
- XFS_FSB_TO_B(mp, 1))));
+ MAX(xfs_calc_icreate_resv_alloc(mp),
+ xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+ struct xfs_mount *mp)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return xfs_calc_icreate_reservation(mp);
+ return __xfs_calc_create_reservation(mp);
+
}
/*
@@ -311,6 +333,20 @@ xfs_calc_mkdir_reservation(
return xfs_calc_create_reservation(mp);
}
+
+/*
+ * Making a new symplink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_create_reservation(mp) +
+ xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
/*
* In freeing an inode we can modify:
* the inode being freed: inode size
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a44dba5b2cdb..2b4946393e30 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -48,6 +48,7 @@ typedef struct xfs_trans_header {
#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
#define XFS_LI_DQUOT 0x123d
#define XFS_LI_QUOTAOFF 0x123e
+#define XFS_LI_ICREATE 0x123f
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -107,7 +108,8 @@ typedef struct xfs_trans_header {
#define XFS_TRANS_SWAPEXT 40
#define XFS_TRANS_SB_COUNT 41
#define XFS_TRANS_CHECKPOINT 42
-#define XFS_TRANS_TYPE_MAX 42
+#define XFS_TRANS_ICREATE 43
+#define XFS_TRANS_TYPE_MAX 43
/* new transaction types need to be reflected in xfs_logprint(8) */
#define XFS_TRANS_TYPES \
@@ -210,23 +212,18 @@ struct xfs_log_item_desc {
/*
* Per-extent log reservation for the allocation btree changes
* involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
+ * 2 trees * (2 blocks/level * max depth - 1)
*/
-#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
- ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
/*
* Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block) * dblock size
- * bmap btree: (levels + 2) * max depth * block size
+ * dir blocks: (1 btree block per level + data block + free block)
+ * bmap btree: (levels + 2) * max depth
* v2 directory blocks can be fragmented below the dirblksize down to the fsb
* size, so account for that in the DAENTER macros.
*/
-#define XFS_DIROP_LOG_RES(mp) \
- (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
- (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
#define XFS_DIROP_LOG_COUNT(mp) \
(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
@@ -503,6 +500,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 73a5fa457e16..aa5a04b844d6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -397,7 +397,6 @@ shutdown_abort:
return XFS_ERROR(EIO);
}
-
/*
* Release the buffer bp which was previously acquired with one of the
* xfs_trans_... buffer allocation routines if the buffer has not
@@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
tp->t_flags |= XFS_TRANS_DIRTY;
bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
- bip->bli_flags |= XFS_BLI_LOGGED;
- xfs_buf_item_log(bip, first, last);
+
+ /*
+ * If we have an ordered buffer we are not logging any dirty range but
+ * it still needs to be marked dirty and that it has been logged.
+ */
+ bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+ if (!(bip->bli_flags & XFS_BLI_ORDERED))
+ xfs_buf_item_log(bip, first, last);
}
@@ -757,6 +762,29 @@ xfs_trans_inode_alloc_buf(
}
/*
+ * Mark the buffer as ordered for this transaction. This means
+ * that the contents of the buffer are not recorded in the transaction
+ * but it is tracked in the AIL as though it was. This allows us
+ * to record logical changes in transactions rather than the physical
+ * changes we make to the buffer without changing writeback ordering
+ * constraints of metadata buffers.
+ */
+void
+xfs_trans_ordered_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_ORDERED;
+ trace_xfs_buf_item_ordered(bip);
+}
+
+/*
* Set the type of the buffer for log recovery so that it can correctly identify
* and hence attach the correct buffer ops to the buffer after replay.
*/
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index fec75d023703..61407a847b86 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo(
return;
xfs_trans_alloc_dqinfo(ntp);
- oqa = otp->t_dqinfo->dqa_usrdquots;
- nqa = ntp->t_dqinfo->dqa_usrdquots;
/*
* Because the quota blk reservation is carried forward,
@@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo(
if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
- for (j = 0; j < 2; j++) {
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ oqa = otp->t_dqinfo->dqs[j];
+ nqa = ntp->t_dqinfo->dqs[j];
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (oqa[i].qt_dquot == NULL)
break;
@@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo(
oq->qt_ino_res = oq->qt_ino_res_used;
}
- oqa = otp->t_dqinfo->dqa_grpdquots;
- nqa = ntp->t_dqinfo->dqa_grpdquots;
}
}
@@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino(
if (!XFS_IS_QUOTA_RUNNING(mp) ||
!XFS_IS_QUOTA_ON(mp) ||
- ip->i_ino == mp->m_sb.sb_uquotino ||
- ip->i_ino == mp->m_sb.sb_gquotino)
+ xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return;
if (tp->t_dqinfo == NULL)
@@ -166,20 +163,28 @@ xfs_trans_mod_dquot_byino(
if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
- if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot)
+ if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+ if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
+ (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
}
-STATIC xfs_dqtrx_t *
+STATIC struct xfs_dqtrx *
xfs_trans_get_dqtrx(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
{
- int i;
- xfs_dqtrx_t *qa;
-
- qa = XFS_QM_ISUDQ(dqp) ?
- tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+ int i;
+ struct xfs_dqtrx *qa;
+
+ if (XFS_QM_ISUDQ(dqp))
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
+ else if (XFS_QM_ISGDQ(dqp))
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
+ else if (XFS_QM_ISPDQ(dqp))
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
+ else
+ return NULL;
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (qa[i].qt_dquot == NULL ||
@@ -292,11 +297,10 @@ xfs_trans_mod_dquot(
/*
- * Given an array of dqtrx structures, lock all the dquots associated
- * and join them to the transaction, provided they have been modified.
- * We know that the highest number of dquots (of one type - usr OR grp),
- * involved in a transaction is 2 and that both usr and grp combined - 3.
- * So, we don't attempt to make this very generic.
+ * Given an array of dqtrx structures, lock all the dquots associated and join
+ * them to the transaction, provided they have been modified. We know that the
+ * highest number of dquots of one type - usr, grp OR prj - involved in a
+ * transaction is 2 so we don't need to make this very generic.
*/
STATIC void
xfs_trans_dqlockedjoin(
@@ -339,12 +343,10 @@ xfs_trans_apply_dquot_deltas(
return;
ASSERT(tp->t_dqinfo);
- qa = tp->t_dqinfo->dqa_usrdquots;
- for (j = 0; j < 2; j++) {
- if (qa[0].qt_dquot == NULL) {
- qa = tp->t_dqinfo->dqa_grpdquots;
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ qa = tp->t_dqinfo->dqs[j];
+ if (qa[0].qt_dquot == NULL)
continue;
- }
/*
* Lock all of the dquots and join them to the transaction.
@@ -495,10 +497,6 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_res_rtbcount >=
be64_to_cpu(dqp->q_core.d_rtbcount));
}
- /*
- * Do the group quotas next
- */
- qa = tp->t_dqinfo->dqa_grpdquots;
}
}
@@ -521,9 +519,9 @@ xfs_trans_unreserve_and_mod_dquots(
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
- qa = tp->t_dqinfo->dqa_usrdquots;
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ qa = tp->t_dqinfo->dqs[j];
- for (j = 0; j < 2; j++) {
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
qtrx = &qa[i];
/*
@@ -565,7 +563,6 @@ xfs_trans_unreserve_and_mod_dquots(
xfs_dqunlock(dqp);
}
- qa = tp->t_dqinfo->dqa_grpdquots;
}
}
@@ -640,8 +637,8 @@ xfs_trans_dqresv(
if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
dqp->q_core.d_id &&
((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
- (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
- (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
+ (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
+ (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
if (nblks > 0) {
/*
* dquot is locked already. See if we'd go over the
@@ -736,8 +733,8 @@ error_return:
/*
* Given dquot(s), make disk block and/or inode reservations against them.
- * The fact that this does the reservation against both the usr and
- * grp/prj quotas is important, because this follows a both-or-nothing
+ * The fact that this does the reservation against user, group and
+ * project quotas is important, because this follows a all-or-nothing
* approach.
*
* flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
@@ -748,15 +745,16 @@ error_return:
*/
int
xfs_trans_reserve_quota_bydquots(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dquot_t *udqp,
- xfs_dquot_t *gdqp,
- long nblks,
- long ninos,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ long nblks,
+ long ninos,
+ uint flags)
{
- int resvd = 0, error;
+ int error;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return 0;
@@ -771,28 +769,34 @@ xfs_trans_reserve_quota_bydquots(
(flags & ~XFS_QMOPT_ENOSPC));
if (error)
return error;
- resvd = 1;
}
if (gdqp) {
error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
- if (error) {
- /*
- * can't do it, so backout previous reservation
- */
- if (resvd) {
- flags |= XFS_QMOPT_FORCE_RES;
- xfs_trans_dqresv(tp, mp, udqp,
- -nblks, -ninos, flags);
- }
- return error;
- }
+ if (error)
+ goto unwind_usr;
+ }
+
+ if (pdqp) {
+ error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags);
+ if (error)
+ goto unwind_grp;
}
/*
* Didn't change anything critical, so, no need to log
*/
return 0;
+
+unwind_grp:
+ flags |= XFS_QMOPT_FORCE_RES;
+ if (gdqp)
+ xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags);
+unwind_usr:
+ flags |= XFS_QMOPT_FORCE_RES;
+ if (udqp)
+ xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+ return error;
}
@@ -816,8 +820,7 @@ xfs_trans_reserve_quota_nblks(
if (XFS_IS_PQUOTA_ON(mp))
flags |= XFS_QMOPT_ENOSPC;
- ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
- ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+ ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
@@ -830,6 +833,7 @@ xfs_trans_reserve_quota_nblks(
*/
return xfs_trans_reserve_quota_bydquots(tp, mp,
ip->i_udquot, ip->i_gdquot,
+ ip->i_pdquot,
nblks, ninos, flags);
}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ac6d567704db..53dfe46f3680 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -112,6 +112,17 @@ xfs_trans_log_inode(
ASSERT(ip->i_itemp != NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ /*
+ * First time we log the inode in a transaction, bump the inode change
+ * counter if it is configured for this to occur.
+ */
+ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+ IS_I_VERSION(VFS_I(ip))) {
+ inode_inc_iversion(VFS_I(ip));
+ ip->i_d.di_changecount = VFS_I(ip)->i_version;
+ flags |= XFS_ILOG_CORE;
+ }
+
tp->t_flags |= XFS_TRANS_DIRTY;
ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0176bb21f09a..dc730ac272be 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -322,18 +322,9 @@ xfs_inactive(
xfs_trans_ijoin(tp, ip, 0);
if (S_ISLNK(ip->i_d.di_mode)) {
- /*
- * Zero length symlinks _can_ exist.
- */
- if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
- error = xfs_inactive_symlink_rmt(ip, &tp);
- if (error)
- goto out_cancel;
- } else if (ip->i_df.if_bytes > 0) {
- xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
- XFS_DATA_FORK);
- ASSERT(ip->i_df.if_bytes == 0);
- }
+ error = xfs_inactive_symlink(ip, &tp);
+ if (error)
+ goto out_cancel;
} else if (truncate) {
ip->i_d.di_size = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -498,6 +489,7 @@ xfs_create(
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
uint resblks;
uint log_res;
uint log_count;
@@ -516,7 +508,8 @@ xfs_create(
* Make sure that we have allocated dquot(s) on disk.
*/
error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
if (error)
return error;
@@ -568,7 +561,8 @@ xfs_create(
/*
* Reserve disk quota and the inode.
*/
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+ pdqp, resblks, 1, 0);
if (error)
goto out_trans_cancel;
@@ -632,7 +626,7 @@ xfs_create(
* These ids of the inode couldn't have changed since the new
* inode has been locked ever since it was created.
*/
- xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
@@ -644,6 +638,7 @@ xfs_create(
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
*ipp = ip;
return 0;
@@ -665,6 +660,7 @@ xfs_create(
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -1577,7 +1573,7 @@ xfs_free_file_space(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota(tp, mp,
- ip->i_udquot, ip->i_gdquot,
+ ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
resblks, 0, XFS_QMOPT_RES_REGBLKS);
if (error)
goto error1;