257 files changed, 16881 insertions, 6990 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index ff0e81980207..32883589ee54 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -189,6 +189,8 @@ config OCFS2_FS
 	select CONFIGFS_FS
 	select JBD2
 	select CRC32
+	select QUOTA
+	select QUOTA_TREE
 	help
 	  OCFS2 is a general purpose extent based shared disk cluster file
 	  system with many similarities to ext3. It supports 64 bit inode
@@ -258,15 +260,14 @@ config OCFS2_DEBUG_FS
 	  this option for debugging only as it is likely to decrease
 	  performance of the filesystem.
 
-config OCFS2_COMPAT_JBD
-	bool "Use JBD for compatibility"
+config OCFS2_FS_POSIX_ACL
+	bool "OCFS2 POSIX Access Control Lists"
 	depends on OCFS2_FS
+	select FS_POSIX_ACL
 	default n
-	select JBD
 	help
-	  The ocfs2 filesystem now uses JBD2 for its journalling.  JBD2
-	  is backwards compatible with JBD.  It is safe to say N here.
-	  However, if you really want to use the original JBD, say Y here.
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
 
 endif # BLOCK
 
@@ -303,6 +304,10 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 
+# Generic support for tree structured quota files. Seleted when needed.
+config QUOTA_TREE
+	 tristate
+
 config QFMT_V1
 	tristate "Old quota format support"
 	depends on QUOTA
@@ -314,6 +319,7 @@ config QFMT_V1
 config QFMT_V2
 	tristate "Quota format v2 support"
 	depends on QUOTA
+	select QUOTA_TREE
 	help
 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
 	  need this functionality say Y here.
@@ -715,7 +721,20 @@ config CONFIGFS_FS
 
 endmenu
 
-menu "Miscellaneous filesystems"
+menuconfig MISC_FILESYSTEMS
+	bool "Miscellaneous filesystems"
+	default y
+	---help---
+	  Say Y here to get to see options for various miscellaneous
+	  filesystems, such as filesystems that came from other
+	  operating systems.
+
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled; if unsure, say Y here.
+
+if MISC_FILESYSTEMS
 
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
@@ -1085,7 +1104,7 @@ config UFS_DEBUG
 	  Y here.  This will result in _many_ additional debugging messages to be
 	  written to the system log.
 
-endmenu
+endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
 	bool "Network File Systems"
diff --git a/fs/Makefile b/fs/Makefile
index e6f423d1d228..c830611550d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 obj-$(CONFIG_QUOTA)		+= dquot.o
 obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
+obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o
 
 obj-$(CONFIG_PROC_FS)		+= proc/
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 415d9c67ac16..3c4ec7d864c4 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 		goto bad_inode;
 #else
 		inode->i_mode |= S_IFDIR;
-		inode->i_op = NULL;
-		inode->i_fop = NULL;
+		/* ... and leave ->i_op and ->i_fop pointing to empty */
 		break;
 #endif
 	case ST_LINKFILE:
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c773680d5c60..e1734f2d6e26 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
 	inode->i_nlink = 2;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blocks = 0;
 
 	if (ino == AUTOFS_ROOT_INO) {
 		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 		inode->i_op = &autofs_root_inode_operations;
 		inode->i_fop = &autofs_root_operations;
-		inode->i_uid = inode->i_gid = 0; /* Changed in read_super */
 		goto done;
 	} 
 	
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index e0f16da00e54..a76803108d06 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -25,8 +25,6 @@
 #define AUTOFS_DEV_IOCTL_IOC_FIRST	(AUTOFS_DEV_IOCTL_VERSION)
 #define AUTOFS_DEV_IOCTL_IOC_COUNT	(AUTOFS_IOC_COUNT - 11)
 
-#define AUTOFS_TYPE_TRIGGER	(AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
-
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/time.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 63b7c7afe8df..025e105bffea 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
 
 /*
  * Check sanity of parameter control fields and if a path is present
- * check that it has a "/" and is terminated.
+ * check that it is terminated and contains at least one "/".
  */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
@@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 	}
 
 	if (param->size > sizeof(*param)) {
-		err = check_name(param->path);
+		err = invalid_str(param->path,
+				 (void *) ((size_t) param + param->size));
 		if (err) {
-			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
-				    cmd);
+			AUTOFS_WARN(
+			  "path string terminator missing for cmd(0x%08x)",
+			  cmd);
 			goto out;
 		}
 
-		err = invalid_str(param->path,
-				 (void *) ((size_t) param + param->size));
+		err = check_name(param->path);
 		if (err) {
 			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
 				    cmd);
@@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp,
 				     struct autofs_sb_info *sbi,
 				     struct autofs_dev_ioctl *param)
 {
-	param->arg1 = sbi->version;
+	param->protover.version = sbi->version;
 	return 0;
 }
 
@@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
 					struct autofs_sb_info *sbi,
 					struct autofs_dev_ioctl *param)
 {
-	param->arg1 = sbi->sub_version;
+	param->protosubver.sub_version = sbi->sub_version;
 	return 0;
 }
 
@@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
 	int err, fd;
 
 	/* param->path has already been checked */
-	if (!param->arg1)
+	if (!param->openmount.devid)
 		return -EINVAL;
 
 	param->ioctlfd = -1;
 
 	path = param->path;
-	devid = param->arg1;
+	devid = param->openmount.devid;
 
 	err = 0;
 	fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
 {
 	autofs_wqt_t token;
 
-	token = (autofs_wqt_t) param->arg1;
+	token = (autofs_wqt_t) param->ready.token;
 	return autofs4_wait_release(sbi, token, 0);
 }
 
@@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp,
 	autofs_wqt_t token;
 	int status;
 
-	token = (autofs_wqt_t) param->arg1;
-	status = param->arg2 ? param->arg2 : -ENOENT;
+	token = (autofs_wqt_t) param->fail.token;
+	status = param->fail.status ? param->fail.status : -ENOENT;
 	return autofs4_wait_release(sbi, token, status);
 }
 
@@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 	int pipefd;
 	int err = 0;
 
-	if (param->arg1 == -1)
+	if (param->setpipefd.pipefd == -1)
 		return -EINVAL;
 
-	pipefd = param->arg1;
+	pipefd = param->setpipefd.pipefd;
 
 	mutex_lock(&sbi->wq_mutex);
 	if (!sbi->catatonic) {
@@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
 {
 	unsigned long timeout;
 
-	timeout = param->arg1;
-	param->arg1 = sbi->exp_timeout / HZ;
+	timeout = param->timeout.timeout;
+	param->timeout.timeout = sbi->exp_timeout / HZ;
 	sbi->exp_timeout = timeout * HZ;
 	return 0;
 }
@@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	path = param->path;
 	devid = sbi->sb->s_dev;
 
-	param->arg1 = param->arg2 = -1;
+	param->requester.uid = param->requester.gid = -1;
 
 	/* Get nameidata of the parent directory */
 	err = path_lookup(path, LOOKUP_PARENT, &nd);
@@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		err = 0;
 		autofs4_expire_wait(nd.path.dentry);
 		spin_lock(&sbi->fs_lock);
-		param->arg1 = ino->uid;
-		param->arg2 = ino->gid;
+		param->requester.uid = ino->uid;
+		param->requester.gid = ino->gid;
 		spin_unlock(&sbi->fs_lock);
 	}
 
@@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 	int err = -EAGAIN;
 	int how;
 
-	how = param->arg1;
+	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
-	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (autofs_type_trigger(sbi->type))
 		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
 	else
 		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
@@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
 				      struct autofs_sb_info *sbi,
 				      struct autofs_dev_ioctl *param)
 {
-	param->arg1 = 0;
+	param->askumount.may_umount = 0;
 	if (may_umount(fp->f_path.mnt))
-		param->arg1 = 1;
+		param->askumount.may_umount = 1;
 	return 0;
 }
 
@@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	struct nameidata nd;
 	const char *path;
 	unsigned int type;
+	unsigned int devid, magic;
 	int err = -ENOENT;
 
 	if (param->size <= sizeof(*param)) {
@@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	}
 
 	path = param->path;
-	type = param->arg1;
+	type = param->ismountpoint.in.type;
 
-	param->arg1 = 0;
-	param->arg2 = 0;
+	param->ismountpoint.out.devid = devid = 0;
+	param->ismountpoint.out.magic = magic = 0;
 
 	if (!fp || param->ioctlfd == -1) {
-		if (type == AUTOFS_TYPE_ANY) {
+		if (autofs_type_any(type)) {
 			struct super_block *sb;
 
 			err = path_lookup(path, LOOKUP_FOLLOW, &nd);
@@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 				goto out;
 
 			sb = nd.path.dentry->d_sb;
-			param->arg1 = new_encode_dev(sb->s_dev);
+			devid = new_encode_dev(sb->s_dev);
 		} else {
 			struct autofs_info *ino;
 
@@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 				goto out_release;
 
 			ino = autofs4_dentry_ino(nd.path.dentry);
-			param->arg1 = autofs4_get_dev(ino->sbi);
+			devid = autofs4_get_dev(ino->sbi);
 		}
 
 		err = 0;
 		if (nd.path.dentry->d_inode &&
 		    nd.path.mnt->mnt_root == nd.path.dentry) {
 			err = 1;
-			param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic;
+			magic = nd.path.dentry->d_inode->i_sb->s_magic;
 		}
 	} else {
-		dev_t devid = new_encode_dev(sbi->sb->s_dev);
+		dev_t dev = autofs4_get_dev(sbi);
 
 		err = path_lookup(path, LOOKUP_PARENT, &nd);
 		if (err)
 			goto out;
 
-		err = autofs_dev_ioctl_find_super(&nd, devid);
+		err = autofs_dev_ioctl_find_super(&nd, dev);
 		if (err)
 			goto out_release;
 
-		param->arg1 = autofs4_get_dev(sbi);
+		devid = dev;
 
 		err = have_submounts(nd.path.dentry);
 
 		if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
 			if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
 				struct inode *inode = nd.path.dentry->d_inode;
-				param->arg2 = inode->i_sb->s_magic;
+				magic = inode->i_sb->s_magic;
 			}
 		}
 	}
 
+	param->ismountpoint.out.devid = devid;
+	param->ismountpoint.out.magic = magic;
+
 out_release:
 	path_put(&nd.path);
 out:
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4b6fb3f628c0..e3bd50776f9e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
 		/* This is an autofs submount, we can't expire it */
-		if (sbi->type == AUTOFS_TYPE_INDIRECT)
+		if (autofs_type_indirect(sbi->type))
 			goto done;
 
 		/*
@@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	if (arg && get_user(do_now, arg))
 		return -EFAULT;
 
-	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (autofs_type_trigger(sbi->type))
 		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
 	else
 		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7b19802cfef4..716e12b627b2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, ",minproto=%d", sbi->min_proto);
 	seq_printf(m, ",maxproto=%d", sbi->max_proto);
 
-	if (sbi->type & AUTOFS_TYPE_OFFSET)
+	if (autofs_type_offset(sbi->type))
 		seq_printf(m, ",offset");
-	else if (sbi->type & AUTOFS_TYPE_DIRECT)
+	else if (autofs_type_direct(sbi->type))
 		seq_printf(m, ",direct");
 	else
 		seq_printf(m, ",indirect");
@@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 			*maxproto = option;
 			break;
 		case Opt_indirect:
-			*type = AUTOFS_TYPE_INDIRECT;
+			set_autofs_type_indirect(type);
 			break;
 		case Opt_direct:
-			*type = AUTOFS_TYPE_DIRECT;
+			set_autofs_type_direct(type);
 			break;
 		case Opt_offset:
-			*type = AUTOFS_TYPE_OFFSET;
+			set_autofs_type_offset(type);
 			break;
 		default:
 			return 1;
@@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->sb = s;
 	sbi->version = 0;
 	sbi->sub_version = 0;
-	sbi->type = AUTOFS_TYPE_INDIRECT;
+	set_autofs_type_indirect(&sbi->type);
 	sbi->min_proto = 0;
 	sbi->max_proto = 0;
 	mutex_init(&sbi->wq_mutex);
@@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	}
 
 	root_inode->i_fop = &autofs4_root_operations;
-	root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ?
+	root_inode->i_op = autofs_type_trigger(sbi->type) ?
 			&autofs4_direct_root_inode_operations :
 			&autofs4_indirect_root_inode_operations;
 
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 	if (sb->s_root) {
 		inode->i_uid = sb->s_root->d_inode->i_uid;
 		inode->i_gid = sb->s_root->d_inode->i_gid;
-	} else {
-		inode->i_uid = 0;
-		inode->i_gid = 0;
 	}
-	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 	if (S_ISDIR(inf->mode)) {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e02cc8ae5eb3..eeb246845909 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		 * is very similar for indirect mounts except only dentrys
 		 * in the root of the autofs file system may be negative.
 		 */
-		if (sbi->type & AUTOFS_TYPE_TRIGGER)
+		if (autofs_type_trigger(sbi->type))
 			return -ENOENT;
 		else if (!IS_ROOT(dentry->d_parent))
 			return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		return -ENOMEM;
 
 	/* If this is a direct mount request create a dummy name */
-	if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
 		qstr.len = sprintf(name, "%p", dentry);
 	else {
 		qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 				type = autofs_ptype_expire_multi;
 		} else {
 			if (notify == NFY_MOUNT)
-				type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+				type = autofs_type_trigger(sbi->type) ?
 					autofs_ptype_missing_direct :
 					 autofs_ptype_missing_indirect;
 			else
-				type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+				type = autofs_type_trigger(sbi->type) ?
 					autofs_ptype_expire_direct :
 					autofs_ptype_expire_indirect;
 		}
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 0ed57b5ee012..cc4062d12ca2 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s)
 {
 	struct bfs_sb_info *info = BFS_SB(s);
 
+	if (!info)
+		return;
+
 	brelse(info->si_sbh);
 	mutex_destroy(&info->bfs_lock);
 	kfree(info->si_imap);
@@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	unsigned i, imap_len;
 	struct bfs_sb_info *info;
 	long ret = -EINVAL;
+	unsigned long i_sblock, i_eblock, i_eoff, s_size;
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
@@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 
 	s->s_magic = BFS_MAGIC;
 	info->si_sbh = bh;
+
+	if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
+		printf("Superblock is corrupted\n");
+		goto out;
+	}
+
 	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
 					sizeof(struct bfs_inode)
 					+ BFS_ROOT_INO - 1;
@@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 			- le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
 	info->si_freei = 0;
 	info->si_lf_eblk = 0;
+
+	/* can we read the last block? */
+	bh = sb_bread(s, info->si_blocks - 1);
+	if (!bh) {
+		printf("Last block not available: %lu\n", info->si_blocks - 1);
+		iput(inode);
+		ret = -EIO;
+		kfree(info->si_imap);
+		goto out;
+	}
+	brelse(bh);
+
 	bh = NULL;
 	for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
 		struct bfs_inode *di;
@@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 
 		di = (struct bfs_inode *)bh->b_data + off;
 
+		/* test if filesystem is not corrupted */
+
+		i_eoff = le32_to_cpu(di->i_eoffset);
+		i_sblock = le32_to_cpu(di->i_sblock);
+		i_eblock = le32_to_cpu(di->i_eblock);
+		s_size = le32_to_cpu(bfs_sb->s_end);
+
+		if (i_sblock > info->si_blocks ||
+			i_eblock > info->si_blocks ||
+			i_sblock > i_eblock ||
+			i_eoff > s_size ||
+			i_sblock * BFS_BSIZE > i_eoff) {
+
+			printf("Inode 0x%08x corrupted\n", i);
+
+			brelse(bh);
+			s->s_root = NULL;
+			kfree(info->si_imap);
+			kfree(info);
+			s->s_fs_info = NULL;
+			return -EIO;
+		}
+
 		if (!di->i_ino) {
 			info->si_freei++;
 			continue;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af7677..e3ff2b9e602f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	elf_addr_t __user *sp;
 	elf_addr_t __user *u_platform;
 	elf_addr_t __user *u_base_platform;
+	elf_addr_t __user *u_rand_bytes;
 	const char *k_platform = ELF_PLATFORM;
 	const char *k_base_platform = ELF_BASE_PLATFORM;
+	unsigned char k_rand_bytes[16];
 	int items;
 	elf_addr_t *elf_info;
 	int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 			return -EFAULT;
 	}
 
+	/*
+	 * Generate 16 random bytes for userspace PRNG seeding.
+	 */
+	get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+	u_rand_bytes = (elf_addr_t __user *)
+		       STACK_ALLOC(p, sizeof(k_rand_bytes));
+	if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
+		return -EFAULT;
+
 	/* Create the ELF interpreter info */
 	elf_info = (elf_addr_t *)current->mm->saved_auxv;
 	/* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	NEW_AUX_ENT(AT_GID, cred->gid);
 	NEW_AUX_ENT(AT_EGID, cred->egid);
  	NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+	NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
 	NEW_AUX_ENT(AT_EXECFN, bprm->exec);
 	if (k_platform) {
 		NEW_AUX_ENT(AT_PLATFORM,
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f2744ab4e5b3..c4e83537ead7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 
 	if (inode) {
 		inode->i_mode = mode;
-		inode->i_uid = 0;
-		inode->i_gid = 0;
-		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime =
 			current_fs_time(inode->i_sb);
 	}
@@ -652,7 +649,7 @@ static const struct file_operations bm_register_operations = {
 static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
-	char *s = enabled ? "enabled" : "disabled";
+	char *s = enabled ? "enabled\n" : "disabled\n";
 
 	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
diff --git a/fs/bio.c b/fs/bio.c
index 711cee103602..062299acbccd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	int i, ret;
 	int nr_pages = 0;
 	unsigned int len = 0;
+	unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
 
 	for (i = 0; i < iov_count; i++) {
 		unsigned long uaddr;
@@ -814,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	bio->bi_rw |= (!write_to_vm << BIO_RW);
 
 	ret = 0;
-	i = 0;
+
+	if (map_data) {
+		nr_pages = 1 << map_data->page_order;
+		i = map_data->offset / PAGE_SIZE;
+	}
 	while (len) {
-		unsigned int bytes;
+		unsigned int bytes = PAGE_SIZE;
 
-		if (map_data)
-			bytes = 1U << (PAGE_SHIFT + map_data->page_order);
-		else
-			bytes = PAGE_SIZE;
+		bytes -= offset;
 
 		if (bytes > len)
 			bytes = len;
 
 		if (map_data) {
-			if (i == map_data->nr_entries) {
+			if (i == map_data->nr_entries * nr_pages) {
 				ret = -ENOMEM;
 				break;
 			}
-			page = map_data->pages[i++];
-		} else
+
+			page = map_data->pages[i / nr_pages];
+			page += (i % nr_pages);
+
+			i++;
+		} else {
 			page = alloc_page(q->bounce_gfp | gfp_mask);
-		if (!page) {
-			ret = -ENOMEM;
-			break;
+			if (!page) {
+				ret = -ENOMEM;
+				break;
+			}
 		}
 
-		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
+		if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
 			break;
 
 		len -= bytes;
+		offset = 0;
 	}
 
 	if (ret)
@@ -851,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	/*
 	 * success
 	 */
-	if (!write_to_vm) {
+	if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
 		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
 		if (ret)
 			goto cleanup;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 349a26c10001..ac7031f12ea5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1005,6 +1005,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	}
 
 	lock_kernel();
+ restart:
 
 	ret = -ENXIO;
 	disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1025,6 +1026,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev, mode);
+				if (ret == -ERESTARTSYS) {
+					/* Lost a race with 'disk' being
+					 * deleted, try again.
+					 * See md.c
+					 */
+					disk_put_part(bdev->bd_part);
+					bdev->bd_part = NULL;
+					module_put(disk->fops->owner);
+					put_disk(disk);
+					bdev->bd_disk = NULL;
+					mutex_unlock(&bdev->bd_mutex);
+					goto restart;
+				}
 				if (ret)
 					goto out_clear;
 			}
@@ -1220,6 +1234,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
+/*
+ * Try to release a page associated with block device when the system
+ * is under memory pressure.
+ */
+static int blkdev_releasepage(struct page *page, gfp_t wait)
+{
+	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
+
+	if (super && super->s_op->bdev_try_to_free_page)
+		return super->s_op->bdev_try_to_free_page(super, page, wait);
+
+	return try_to_free_buffers(page);
+}
+
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.writepage	= blkdev_writepage,
@@ -1227,6 +1255,7 @@ static const struct address_space_operations def_blk_aops = {
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
 	.writepages	= generic_writepages,
+	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
 };
 
@@ -1262,7 +1291,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
 
 /**
  * lookup_bdev  - lookup a struct block_device by name
- * @path:	special file representing the block device
+ * @pathname:	special file representing the block device
  *
  * Get a reference to the blockdevice at @pathname in the current
  * namespace if possible and return it.  Return ERR_PTR(error)
diff --git a/fs/buffer.c b/fs/buffer.c
index a13f09b696f7..c26da785938a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2022,7 +2022,6 @@ int block_write_begin(struct file *file, struct address_space *mapping,
 			if (pos + len > inode->i_size)
 				vmtruncate(inode, inode->i_size);
 		}
-		goto out;
 	}
 
 out:
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 700697a72618..38f71222a552 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 	cd->major = major;
 	cd->baseminor = baseminor;
 	cd->minorct = minorct;
-	strncpy(cd->name,name, 64);
+	strlcpy(cd->name, name, sizeof(cd->name));
 
 	i = major_to_index(major);
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f247da9f4edc..5ab9896fdcb2 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1641,7 +1641,7 @@ do_expand:
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
 out_truncate:
-	if (inode->i_op && inode->i_op->truncate)
+	if (inode->i_op->truncate)
 		inode->i_op->truncate(inode);
 	return 0;
 out_sig:
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 466303db2df6..6a347fbc998a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 {
 	struct file *host_file;
-	struct dentry *host_dentry;
-	struct inode *host_inode, *coda_inode = coda_dentry->d_inode;
+	struct inode *coda_inode = coda_dentry->d_inode;
 	struct coda_file_info *cfi;
 	int err = 0;
 
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (host_file->f_op && host_file->f_op->fsync) {
-		host_dentry = host_file->f_path.dentry;
-		host_inode = host_dentry->d_inode;
-		mutex_lock(&host_inode->i_mutex);
-		err = host_file->f_op->fsync(host_file, host_dentry, datasync);
-		mutex_unlock(&host_inode->i_mutex);
-	}
-
+	err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
 	if ( !err && !datasync ) {
 		lock_kernel();
 		err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c6465..43c96ce29614 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
 
 #include "coda_int.h"
 
+#ifdef CONFIG_SYSCTL
 static struct ctl_table_header *fs_table_header;
+#endif
 
 static ctl_table coda_table[] = {
 	{
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
 	{}
 };
 
+#ifdef CONFIG_SYSCTL
 static ctl_table fs_table[] = {
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
 	},
 	{}
 };
-
+#endif
 
 void coda_sysctl_init(void)
 {
diff --git a/fs/compat.c b/fs/compat.c
index d1ece79b6411..30f2faa22f5c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
 	ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
 
 out:
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
 	fput(file);
 	return ret;
 }
@@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
 	ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
 
 out:
+	if (ret > 0)
+		add_wchar(current, ret);
+	inc_syscw(current);
 	fput(file);
 	return ret;
 }
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4803ccc94480..5d349d38e056 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
 {
 	inode->i_mode = mode;
-	inode->i_uid = 0;
-	inode->i_gid = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 }
 
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
 	struct inode * inode = new_inode(configfs_sb);
 	if (inode) {
-		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &configfs_aops;
 		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
 		inode->i_op = &configfs_inode_operations;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f40423eb1a14..a07338d2d140 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
 			inode->i_op = &page_symlink_inode_operations;
 			inode->i_data.a_ops = &cramfs_aops;
 		} else {
-			inode->i_size = 0;
-			inode->i_blocks = 0;
 			init_special_inode(inode, inode->i_mode,
 				old_decode_dev(cramfs_inode->size));
 		}
diff --git a/fs/dcache.c b/fs/dcache.c
index e88c23b85a32..4547f66884a0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1567,10 +1567,6 @@ void d_rehash(struct dentry * entry)
 	spin_unlock(&dcache_lock);
 }
 
-#define do_switch(x,y) do { \
-	__typeof__ (x) __tmp = x; \
-	x = y; y = __tmp; } while (0)
-
 /*
  * When switching names, the actual string doesn't strictly have to
  * be preserved in the target - because we're dropping the target
@@ -1589,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
 			/*
 			 * Both external: swap the pointers
 			 */
-			do_switch(target->d_name.name, dentry->d_name.name);
+			swap(target->d_name.name, dentry->d_name.name);
 		} else {
 			/*
 			 * dentry:internal, target:external.  Steal target's
@@ -1620,7 +1616,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
 			return;
 		}
 	}
-	do_switch(dentry->d_name.len, target->d_name.len);
+	swap(dentry->d_name.len, target->d_name.len);
 }
 
 /*
@@ -1680,7 +1676,7 @@ already_unhashed:
 
 	/* Switch the names.. */
 	switch_names(dentry, target);
-	do_switch(dentry->d_name.hash, target->d_name.hash);
+	swap(dentry->d_name.hash, target->d_name.hash);
 
 	/* ... and switch the parents */
 	if (IS_ROOT(dentry)) {
@@ -1688,7 +1684,7 @@ already_unhashed:
 		target->d_parent = target;
 		INIT_LIST_HEAD(&target->d_u.d_child);
 	} else {
-		do_switch(dentry->d_parent, target->d_parent);
+		swap(dentry->d_parent, target->d_parent);
 
 		/* And add them back to the (new) parent lists */
 		list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
@@ -1789,7 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	struct dentry *dparent, *aparent;
 
 	switch_names(dentry, anon);
-	do_switch(dentry->d_name.hash, anon->d_name.hash);
+	swap(dentry->d_name.hash, anon->d_name.hash);
 
 	dparent = dentry->d_parent;
 	aparent = anon->d_parent;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 159a5efd6a8a..33a90120f6ad 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x32);
 
+
+static int debugfs_size_t_set(void *data, u64 val)
+{
+	*(size_t *)data = val;
+	return 0;
+}
+static int debugfs_size_t_get(void *data, u64 *val)
+{
+	*val = *(size_t *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
+			"%llu\n");	/* %llu and %zu are more or less the same */
+
+/**
+ * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+				     struct dentry *parent, size_t *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_size_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_size_t);
+
+
 static ssize_t read_file_bool(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos)
 {
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dbe2169cf36..81ae9ea3c6e1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 
 	if (inode) {
 		inode->i_mode = mode;
-		inode->i_uid = 0;
-		inode->i_gid = 0;
-		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
 		default:
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index fff96e152c0c..5f3231b9633f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -189,8 +189,6 @@ static int mknod_ptmx(struct super_block *sb)
 	}
 
 	inode->i_ino = 2;
-	inode->i_uid = inode->i_gid = 0;
-	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 
 	mode = S_IFCHR|opts->ptmxmode;
@@ -300,8 +298,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 		goto free_fsi;
 	inode->i_ino = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blocks = 0;
-	inode->i_uid = inode->i_gid = 0;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index af0558dbe8b7..b6d43908ff7a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
 				nr_segs, blkbits, get_block, end_io, dio);
 
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
+	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
+	 * it's own meaner.
+	 */
+	if (unlikely(retval < 0 && (rw & WRITE))) {
+		loff_t isize = i_size_read(inode);
+
+		if (end > isize && dio_lock_type == DIO_LOCKING)
+			vmtruncate(inode, isize);
+	}
+
 	if (rw == READ && dio_lock_type == DIO_LOCKING)
 		release_i_mutex = 0;
 
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 8bf31e3fbf01..dc2ad6008b2d 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
 	spin_unlock(&ast_queue_lock);
 }
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type)
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 {
 	if (lkb->lkb_flags & DLM_IFL_USER) {
-		dlm_user_add_ast(lkb, type);
+		dlm_user_add_ast(lkb, type, bastmode);
 		return;
 	}
 
@@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type)
 		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
 	}
 	lkb->lkb_ast_type |= type;
+	if (bastmode)
+		lkb->lkb_bastmode = bastmode;
 	spin_unlock(&ast_queue_lock);
 
 	set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,50 +61,40 @@ static void process_asts(void)
 	struct dlm_lkb *lkb;
 	void (*cast) (void *astparam);
 	void (*bast) (void *astparam, int mode);
-	int type = 0, found, bmode;
-
-	for (;;) {
-		found = 0;
-		spin_lock(&ast_queue_lock);
-		list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
-			r = lkb->lkb_resource;
-			ls = r->res_ls;
-
-			if (dlm_locking_stopped(ls))
-				continue;
-
-			list_del(&lkb->lkb_astqueue);
-			type = lkb->lkb_ast_type;
-			lkb->lkb_ast_type = 0;
-			found = 1;
-			break;
-		}
-		spin_unlock(&ast_queue_lock);
+	int type = 0, bastmode;
+
+repeat:
+	spin_lock(&ast_queue_lock);
+	list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
+		r = lkb->lkb_resource;
+		ls = r->res_ls;
+
+		if (dlm_locking_stopped(ls))
+			continue;
 
-		if (!found)
-			break;
+		list_del(&lkb->lkb_astqueue);
+		type = lkb->lkb_ast_type;
+		lkb->lkb_ast_type = 0;
+		bastmode = lkb->lkb_bastmode;
 
+		spin_unlock(&ast_queue_lock);
 		cast = lkb->lkb_astfn;
 		bast = lkb->lkb_bastfn;
-		bmode = lkb->lkb_bastmode;
 
 		if ((type & AST_COMP) && cast)
 			cast(lkb->lkb_astparam);
 
-		/* FIXME: Is it safe to look at lkb_grmode here
-		   without doing a lock_rsb() ?
-		   Look at other checks in v1 to avoid basts. */
-
 		if ((type & AST_BAST) && bast)
-			if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
-				bast(lkb->lkb_astparam, bmode);
+			bast(lkb->lkb_astparam, bastmode);
 
 		/* this removes the reference added by dlm_add_ast
 		   and may result in the lkb being freed */
 		dlm_put_lkb(lkb);
 
-		schedule();
+		cond_resched();
+		goto repeat;
 	}
+	spin_unlock(&ast_queue_lock);
 }
 
 static inline int no_asts(void)
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 6ee276c74c52..1b5fc5f428fd 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
 #ifndef __ASTD_DOT_H__
 #define __ASTD_DOT_H__
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
 void dlm_del_ast(struct dlm_lkb *lkb);
 
 void dlm_astd_wake(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fc24f4507a3..2f107d1a6a45 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -27,7 +27,7 @@ static struct dentry *dlm_root;
 
 struct rsb_iter {
 	int entry;
-	int locks;
+	int format;
 	int header;
 	struct dlm_ls *ls;
 	struct list_head *next;
@@ -60,8 +60,8 @@ static char *print_lockmode(int mode)
 	}
 }
 
-static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
-				struct dlm_rsb *res)
+static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       struct dlm_rsb *res)
 {
 	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
 
@@ -83,7 +83,7 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
 	seq_printf(s, "\n");
 }
 
-static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+static int print_format1(struct dlm_rsb *res, struct seq_file *s)
 {
 	struct dlm_lkb *lkb;
 	int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
@@ -134,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
 	/* Print the locks attached to this resource */
 	seq_printf(s, "Granted Queue\n");
 	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
-		print_resource_lock(s, lkb, res);
+		print_format1_lock(s, lkb, res);
 
 	seq_printf(s, "Conversion Queue\n");
 	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
-		print_resource_lock(s, lkb, res);
+		print_format1_lock(s, lkb, res);
 
 	seq_printf(s, "Waiting Queue\n");
 	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
-		print_resource_lock(s, lkb, res);
+		print_format1_lock(s, lkb, res);
 
 	if (list_empty(&res->res_lookup))
 		goto out;
@@ -160,23 +160,24 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
 	return 0;
 }
 
-static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r)
+static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       struct dlm_rsb *r)
 {
-	unsigned int waiting = 0;
-	uint64_t xid = 0;
+	u64 xid = 0;
+	u64 us;
 
 	if (lkb->lkb_flags & DLM_IFL_USER) {
 		if (lkb->lkb_ua)
 			xid = lkb->lkb_ua->xid;
 	}
 
-	if (lkb->lkb_timestamp)
-		waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp);
+	/* microseconds since lkb was added to current queue */
+	us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp));
 
-	/* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms
+	/* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
 	   r_nodeid r_len r_name */
 
-	seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n",
+	seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
 		   lkb->lkb_id,
 		   lkb->lkb_nodeid,
 		   lkb->lkb_remid,
@@ -187,26 +188,114 @@ static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *
 		   lkb->lkb_status,
 		   lkb->lkb_grmode,
 		   lkb->lkb_rqmode,
-		   waiting,
+		   (unsigned long long)us,
 		   r->res_nodeid,
 		   r->res_length,
 		   r->res_name);
 }
 
-static int print_locks(struct dlm_rsb *r, struct seq_file *s)
+static int print_format2(struct dlm_rsb *r, struct seq_file *s)
 {
 	struct dlm_lkb *lkb;
 
 	lock_rsb(r);
 
 	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
-		print_lock(s, lkb, r);
+		print_format2_lock(s, lkb, r);
 
 	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
-		print_lock(s, lkb, r);
+		print_format2_lock(s, lkb, r);
 
 	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
-		print_lock(s, lkb, r);
+		print_format2_lock(s, lkb, r);
+
+	unlock_rsb(r);
+	return 0;
+}
+
+static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       int rsb_lookup)
+{
+	u64 xid = 0;
+
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		if (lkb->lkb_ua)
+			xid = lkb->lkb_ua->xid;
+	}
+
+	seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
+		   lkb->lkb_id,
+		   lkb->lkb_nodeid,
+		   lkb->lkb_remid,
+		   lkb->lkb_ownpid,
+		   (unsigned long long)xid,
+		   lkb->lkb_exflags,
+		   lkb->lkb_flags,
+		   lkb->lkb_status,
+		   lkb->lkb_grmode,
+		   lkb->lkb_rqmode,
+		   lkb->lkb_highbast,
+		   rsb_lookup,
+		   lkb->lkb_wait_type,
+		   lkb->lkb_lvbseq,
+		   (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
+		   (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
+}
+
+static int print_format3(struct dlm_rsb *r, struct seq_file *s)
+{
+	struct dlm_lkb *lkb;
+	int i, lvblen = r->res_ls->ls_lvblen;
+	int print_name = 1;
+
+	lock_rsb(r);
+
+	seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
+		   r,
+		   r->res_nodeid,
+		   r->res_first_lkid,
+		   r->res_flags,
+		   !list_empty(&r->res_root_list),
+		   !list_empty(&r->res_recover_list),
+		   r->res_recover_locks_count,
+		   r->res_length);
+
+	for (i = 0; i < r->res_length; i++) {
+		if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
+			print_name = 0;
+	}
+
+	seq_printf(s, "%s", print_name ? "str " : "hex");
+
+	for (i = 0; i < r->res_length; i++) {
+		if (print_name)
+			seq_printf(s, "%c", r->res_name[i]);
+		else
+			seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
+	}
+	seq_printf(s, "\n");
+
+	if (!r->res_lvbptr)
+		goto do_locks;
+
+	seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen);
+
+	for (i = 0; i < lvblen; i++)
+		seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
+	seq_printf(s, "\n");
+
+ do_locks:
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
+		print_format3_lock(s, lkb, 0);
+
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
+		print_format3_lock(s, lkb, 0);
+
+	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
+		print_format3_lock(s, lkb, 0);
+
+	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
+		print_format3_lock(s, lkb, 1);
 
 	unlock_rsb(r);
 	return 0;
@@ -231,7 +320,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
 				break;
 			}
 			read_unlock(&ls->ls_rsbtbl[i].lock);
-                }
+		}
 		ri->entry = i;
 
 		if (ri->entry >= ls->ls_rsbtbl_size)
@@ -248,7 +337,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
 			read_unlock(&ls->ls_rsbtbl[i].lock);
 			dlm_put_rsb(old);
 			goto top;
-                }
+		}
 		ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
 		dlm_hold_rsb(ri->rsb);
 		read_unlock(&ls->ls_rsbtbl[i].lock);
@@ -274,6 +363,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
 	ri->ls = ls;
 	ri->entry = 0;
 	ri->next = NULL;
+	ri->format = 1;
 
 	if (rsb_iter_next(ri)) {
 		rsb_iter_free(ri);
@@ -325,16 +415,26 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
 {
 	struct rsb_iter *ri = iter_ptr;
 
-	if (ri->locks) {
+	switch (ri->format) {
+	case 1:
+		print_format1(ri->rsb, file);
+		break;
+	case 2:
 		if (ri->header) {
-			seq_printf(file, "id nodeid remid pid xid exflags flags "
-					 "sts grmode rqmode time_ms r_nodeid "
-					 "r_len r_name\n");
+			seq_printf(file, "id nodeid remid pid xid exflags "
+					 "flags sts grmode rqmode time_ms "
+					 "r_nodeid r_len r_name\n");
 			ri->header = 0;
 		}
-		print_locks(ri->rsb, file);
-	} else {
-		print_resource(ri->rsb, file);
+		print_format2(ri->rsb, file);
+		break;
+	case 3:
+		if (ri->header) {
+			seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
+			ri->header = 0;
+		}
+		print_format3(ri->rsb, file);
+		break;
 	}
 
 	return 0;
@@ -385,7 +485,7 @@ static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos)
 	ri->ls = ls;
 	ri->entry = 0;
 	ri->next = NULL;
-	ri->locks = 1;
+	ri->format = 2;
 
 	if (*pos == 0)
 		ri->header = 1;
@@ -448,6 +548,84 @@ static const struct file_operations locks_fops = {
 };
 
 /*
+ * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks
+ * This can replace both formats 1 and 2 eventually.
+ */
+
+static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos)
+{
+	struct rsb_iter *ri;
+
+	ri = kzalloc(sizeof *ri, GFP_KERNEL);
+	if (!ri)
+		return NULL;
+
+	ri->ls = ls;
+	ri->entry = 0;
+	ri->next = NULL;
+	ri->format = 3;
+
+	if (*pos == 0)
+		ri->header = 1;
+
+	if (rsb_iter_next(ri)) {
+		rsb_iter_free(ri);
+		return NULL;
+	}
+
+	return ri;
+}
+
+static void *all_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct rsb_iter *ri;
+	loff_t n = *pos;
+
+	ri = all_iter_init(file->private, pos);
+	if (!ri)
+		return NULL;
+
+	while (n--) {
+		if (rsb_iter_next(ri)) {
+			rsb_iter_free(ri);
+			return NULL;
+		}
+	}
+
+	return ri;
+}
+
+static struct seq_operations all_seq_ops = {
+	.start = all_seq_start,
+	.next  = rsb_seq_next,
+	.stop  = rsb_seq_stop,
+	.show  = rsb_seq_show,
+};
+
+static int all_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &all_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->i_private;
+
+	return 0;
+}
+
+static const struct file_operations all_fops = {
+	.owner   = THIS_MODULE,
+	.open    = all_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+/*
  * dump lkb's on the ls_waiters list
  */
 
@@ -489,30 +667,33 @@ static const struct file_operations waiters_fops = {
 	.read    = waiters_read
 };
 
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+	if (ls->ls_debug_rsb_dentry)
+		debugfs_remove(ls->ls_debug_rsb_dentry);
+	if (ls->ls_debug_waiters_dentry)
+		debugfs_remove(ls->ls_debug_waiters_dentry);
+	if (ls->ls_debug_locks_dentry)
+		debugfs_remove(ls->ls_debug_locks_dentry);
+	if (ls->ls_debug_all_dentry)
+		debugfs_remove(ls->ls_debug_all_dentry);
+}
+
 int dlm_create_debug_file(struct dlm_ls *ls)
 {
 	char name[DLM_LOCKSPACE_LEN+8];
 
+	/* format 1 */
+
 	ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
 						      S_IFREG | S_IRUGO,
 						      dlm_root,
 						      ls,
 						      &rsb_fops);
 	if (!ls->ls_debug_rsb_dentry)
-		return -ENOMEM;
+		goto fail;
 
-	memset(name, 0, sizeof(name));
-	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
-
-	ls->ls_debug_waiters_dentry = debugfs_create_file(name,
-							  S_IFREG | S_IRUGO,
-							  dlm_root,
-							  ls,
-							  &waiters_fops);
-	if (!ls->ls_debug_waiters_dentry) {
-		debugfs_remove(ls->ls_debug_rsb_dentry);
-		return -ENOMEM;
-	}
+	/* format 2 */
 
 	memset(name, 0, sizeof(name));
 	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
@@ -522,23 +703,38 @@ int dlm_create_debug_file(struct dlm_ls *ls)
 							dlm_root,
 							ls,
 							&locks_fops);
-	if (!ls->ls_debug_locks_dentry) {
-		debugfs_remove(ls->ls_debug_waiters_dentry);
-		debugfs_remove(ls->ls_debug_rsb_dentry);
-		return -ENOMEM;
-	}
+	if (!ls->ls_debug_locks_dentry)
+		goto fail;
+
+	/* format 3 */
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name);
+
+	ls->ls_debug_all_dentry = debugfs_create_file(name,
+						      S_IFREG | S_IRUGO,
+						      dlm_root,
+						      ls,
+						      &all_fops);
+	if (!ls->ls_debug_all_dentry)
+		goto fail;
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
+
+	ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+							  S_IFREG | S_IRUGO,
+							  dlm_root,
+							  ls,
+							  &waiters_fops);
+	if (!ls->ls_debug_waiters_dentry)
+		goto fail;
 
 	return 0;
-}
 
-void dlm_delete_debug_file(struct dlm_ls *ls)
-{
-	if (ls->ls_debug_rsb_dentry)
-		debugfs_remove(ls->ls_debug_rsb_dentry);
-	if (ls->ls_debug_waiters_dentry)
-		debugfs_remove(ls->ls_debug_waiters_dentry);
-	if (ls->ls_debug_locks_dentry)
-		debugfs_remove(ls->ls_debug_locks_dentry);
+ fail:
+	dlm_delete_debug_file(ls);
+	return -ENOMEM;
 }
 
 int __init dlm_register_debugfs(void)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 85defeb64df4..92969f879a17 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
 	struct list_head *list;
 	struct dlm_rsb *r;
 	int offset = 0, dir_nodeid;
-	uint16_t be_namelen;
+	__be16 be_namelen;
 
 	down_read(&ls->ls_root_sem);
 
@@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
 
 		if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
 			/* Write end-of-block record */
-			be_namelen = 0;
-			memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
-			offset += sizeof(uint16_t);
+			be_namelen = cpu_to_be16(0);
+			memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+			offset += sizeof(__be16);
 			goto out;
 		}
 
 		be_namelen = cpu_to_be16(r->res_length);
-		memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
-		offset += sizeof(uint16_t);
+		memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+		offset += sizeof(__be16);
 		memcpy(outbuf + offset, r->res_name, r->res_length);
 		offset += r->res_length;
 	}
@@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
 
 	if ((list == &ls->ls_root_list) &&
 	    (offset + sizeof(uint16_t) <= outlen)) {
-		be_namelen = 0xFFFF;
-		memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
-		offset += sizeof(uint16_t);
+		be_namelen = cpu_to_be16(0xFFFF);
+		memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+		offset += sizeof(__be16);
 	}
 
  out:
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 868e4c9ef127..ef2f1e353966 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -245,7 +245,8 @@ struct dlm_lkb {
 	struct list_head	lkb_astqueue;	/* need ast to be sent */
 	struct list_head	lkb_ownqueue;	/* list of locks for a process */
 	struct list_head	lkb_time_list;
-	unsigned long		lkb_timestamp;
+	ktime_t			lkb_time_bast;	/* for debugging */
+	ktime_t			lkb_timestamp;
 	unsigned long		lkb_timeout_cs;
 
 	char			*lkb_lvbptr;
@@ -481,6 +482,7 @@ struct dlm_ls {
 	struct dentry		*ls_debug_rsb_dentry; /* debugfs */
 	struct dentry		*ls_debug_waiters_dentry; /* debugfs */
 	struct dentry		*ls_debug_locks_dentry; /* debugfs */
+	struct dentry		*ls_debug_all_dentry; /* debugfs */
 
 	wait_queue_head_t	ls_uevent_wait;	/* user part of join/leave */
 	int			ls_uevent_result;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 724ddac91538..6cfe65bbf4a2 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 	lkb->lkb_lksb->sb_status = rv;
 	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
 
-	dlm_add_ast(lkb, AST_COMP);
+	dlm_add_ast(lkb, AST_COMP, 0);
 }
 
 static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -318,12 +318,12 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
 
 static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 {
+	lkb->lkb_time_bast = ktime_get();
+
 	if (is_master_copy(lkb))
 		send_bast(r, lkb, rqmode);
-	else {
-		lkb->lkb_bastmode = rqmode;
-		dlm_add_ast(lkb, AST_BAST);
-	}
+	else
+		dlm_add_ast(lkb, AST_BAST, rqmode);
 }
 
 /*
@@ -744,6 +744,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
 
 	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
 
+	lkb->lkb_timestamp = ktime_get();
+
 	lkb->lkb_status = status;
 
 	switch (status) {
@@ -1013,10 +1015,8 @@ static void add_timeout(struct dlm_lkb *lkb)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 
-	if (is_master_copy(lkb)) {
-		lkb->lkb_timestamp = jiffies;
+	if (is_master_copy(lkb))
 		return;
-	}
 
 	if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
 	    !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
@@ -1031,7 +1031,6 @@ static void add_timeout(struct dlm_lkb *lkb)
 	DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
 	mutex_lock(&ls->ls_timeout_mutex);
 	hold_lkb(lkb);
-	lkb->lkb_timestamp = jiffies;
 	list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
 	mutex_unlock(&ls->ls_timeout_mutex);
 }
@@ -1059,6 +1058,7 @@ void dlm_scan_timeout(struct dlm_ls *ls)
 	struct dlm_rsb *r;
 	struct dlm_lkb *lkb;
 	int do_cancel, do_warn;
+	s64 wait_us;
 
 	for (;;) {
 		if (dlm_locking_stopped(ls))
@@ -1069,14 +1069,15 @@ void dlm_scan_timeout(struct dlm_ls *ls)
 		mutex_lock(&ls->ls_timeout_mutex);
 		list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
 
+			wait_us = ktime_to_us(ktime_sub(ktime_get(),
+					      		lkb->lkb_timestamp));
+
 			if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
-			    time_after_eq(jiffies, lkb->lkb_timestamp +
-					  lkb->lkb_timeout_cs * HZ/100))
+			    wait_us >= (lkb->lkb_timeout_cs * 10000))
 				do_cancel = 1;
 
 			if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
-			    time_after_eq(jiffies, lkb->lkb_timestamp +
-				   	   dlm_config.ci_timewarn_cs * HZ/100))
+			    wait_us >= dlm_config.ci_timewarn_cs * 10000)
 				do_warn = 1;
 
 			if (!do_cancel && !do_warn)
@@ -1122,12 +1123,12 @@ void dlm_scan_timeout(struct dlm_ls *ls)
 void dlm_adjust_timeouts(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
-	long adj = jiffies - ls->ls_recover_begin;
+	u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
 
 	ls->ls_recover_begin = 0;
 	mutex_lock(&ls->ls_timeout_mutex);
 	list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
-		lkb->lkb_timestamp += adj;
+		lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
 	mutex_unlock(&ls->ls_timeout_mutex);
 }
 
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3962262f991a..103a5ebd1371 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con)
 	con->sock->sk->sk_write_space = lowcomms_write_space;
 	con->sock->sk->sk_state_change = lowcomms_state_change;
 	con->sock->sk->sk_user_data = con;
+	con->sock->sk->sk_allocation = GFP_NOFS;
 	return 0;
 }
 
@@ -823,7 +824,6 @@ static void sctp_init_assoc(struct connection *con)
 	len = e->len;
 	offset = e->offset;
 	spin_unlock(&con->writequeue_lock);
-	kmap(e->page);
 
 	/* Send the first block off the write queue */
 	iov[0].iov_base = page_address(e->page)+offset;
@@ -854,7 +854,6 @@ static void sctp_init_assoc(struct connection *con)
 
 		if (e->len == 0 && e->users == 0) {
 			list_del(&e->list);
-			kunmap(e->page);
 			free_entry(e);
 		}
 		spin_unlock(&con->writequeue_lock);
@@ -1203,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
 
 	if (e) {
 	got_one:
-		if (users == 0)
-			kmap(e->page);
 		*ppc = page_address(e->page) + offset;
 		return e;
 	}
@@ -1233,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh)
 	if (users)
 		goto out;
 	e->len = e->end - e->offset;
-	kunmap(e->page);
 	spin_unlock(&con->writequeue_lock);
 
 	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
@@ -1272,7 +1268,6 @@ static void send_to_sock(struct connection *con)
 		offset = e->offset;
 		BUG_ON(len == 0 && e->users == 0);
 		spin_unlock(&con->writequeue_lock);
-		kmap(e->page);
 
 		ret = 0;
 		if (len) {
@@ -1294,7 +1289,6 @@ static void send_to_sock(struct connection *con)
 
 		if (e->len == 0 && e->users == 0) {
 			list_del(&e->list);
-			kunmap(e->page);
 			free_entry(e);
 			continue;
 		}
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 54c14c6d06cb..c1775b84ebab 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
 {
 	char *p;
 
-	p = kzalloc(ls->ls_lvblen, GFP_KERNEL);
+	p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
 	return p;
 }
 
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
 
 	DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
 
-	r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL);
+	r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
 	return r;
 }
 
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
 
-	lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL);
+	lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
 	return lkb;
 }
 
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 07ac709f3ed7..f3396c622aec 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
 		   ordinary messages). */
 
 		if (msglen > sizeof(__tmp) && p == &__tmp.p) {
-			p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
+			p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
 			if (p == NULL)
 				return ret;
 		}
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index aa2a5775a027..ccc9d62c462d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
 	data->status = lkb->lkb_status;
 	data->grmode = lkb->lkb_grmode;
 	data->rqmode = lkb->lkb_rqmode;
-	data->timestamp = lkb->lkb_timestamp;
 	if (lkb->lkb_ua)
 		data->xid = lkb->lkb_ua->xid;
 	if (r) {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b3832c67194a..065149e84f42 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
 /* we could possibly check if the cancel of an orphan has resulted in the lkb
    being removed and then remove that lkb from the orphans list and free it */
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 {
 	struct dlm_ls *ls;
 	struct dlm_user_args *ua;
@@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 
 	ast_type = lkb->lkb_ast_type;
 	lkb->lkb_ast_type |= type;
+	if (bastmode)
+		lkb->lkb_bastmode = bastmode;
 
 	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 35eb6a13d616..1c9686492286 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,7 @@
 #ifndef __USER_DOT_H__
 #define __USER_DOT_H__
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
 int dlm_user_init(void);
 void dlm_user_exit(void);
 int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/dquot.c b/fs/dquot.c
index c237ccc8581c..48c0571f831d 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash;
 
 struct dqstats dqstats;
 
-static void dqput(struct dquot *dquot);
-
 static inline unsigned int
 hashfn(const struct super_block *sb, unsigned int id, int type)
 {
@@ -415,6 +413,17 @@ out_dqlock:
 	return ret;
 }
 
+void dquot_destroy(struct dquot *dquot)
+{
+	kmem_cache_free(dquot_cachep, dquot);
+}
+EXPORT_SYMBOL(dquot_destroy);
+
+static inline void do_destroy_dquot(struct dquot *dquot)
+{
+	dquot->dq_sb->dq_op->destroy_dquot(dquot);
+}
+
 /* Invalidate all dquots on the list. Note that this function is called after
  * quota is disabled and pointers from inodes removed so there cannot be new
  * quota users. There can still be some users of quotas due to inodes being
@@ -463,9 +472,44 @@ restart:
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
-		kmem_cache_free(dquot_cachep, dquot);
+		do_destroy_dquot(dquot);
+	}
+	spin_unlock(&dq_list_lock);
+}
+
+/* Call callback for every active dquot on given filesystem */
+int dquot_scan_active(struct super_block *sb,
+		      int (*fn)(struct dquot *dquot, unsigned long priv),
+		      unsigned long priv)
+{
+	struct dquot *dquot, *old_dquot = NULL;
+	int ret = 0;
+
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	spin_lock(&dq_list_lock);
+	list_for_each_entry(dquot, &inuse_list, dq_inuse) {
+		if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+			continue;
+		if (dquot->dq_sb != sb)
+			continue;
+		/* Now we have active dquot so we can just increase use count */
+		atomic_inc(&dquot->dq_count);
+		dqstats.lookups++;
+		spin_unlock(&dq_list_lock);
+		dqput(old_dquot);
+		old_dquot = dquot;
+		ret = fn(dquot, priv);
+		if (ret < 0)
+			goto out;
+		spin_lock(&dq_list_lock);
+		/* We are safe to continue now because our dquot could not
+		 * be moved out of the inuse list while we hold the reference */
 	}
 	spin_unlock(&dq_list_lock);
+out:
+	dqput(old_dquot);
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	return ret;
 }
 
 int vfs_quota_sync(struct super_block *sb, int type)
@@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
-		if (!sb_has_quota_enabled(sb, cnt))
+		if (!sb_has_quota_active(sb, cnt))
 			continue;
 		spin_lock(&dq_list_lock);
 		dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
 	}
 
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt)
-			&& info_dirty(&dqopt->info[cnt]))
+		if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
+		    && info_dirty(&dqopt->info[cnt]))
 			sb->dq_op->write_info(sb, cnt);
 	spin_lock(&dq_list_lock);
 	dqstats.syncs++;
@@ -527,7 +571,7 @@ static void prune_dqcache(int count)
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
-		kmem_cache_free(dquot_cachep, dquot);
+		do_destroy_dquot(dquot);
 		count--;
 		head = free_dquots.prev;
 	}
@@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = {
  * NOTE: If you change this function please check whether dqput_blocks() works right...
  * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
-static void dqput(struct dquot *dquot)
+void dqput(struct dquot *dquot)
 {
 	int ret;
 
@@ -584,7 +628,7 @@ we_slept:
 		/* We have more than one user... nothing to do */
 		atomic_dec(&dquot->dq_count);
 		/* Releasing dquot during quotaoff phase? */
-		if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) &&
+		if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
 		    atomic_read(&dquot->dq_count) == 1)
 			wake_up(&dquot->dq_wait_unused);
 		spin_unlock(&dq_list_lock);
@@ -625,11 +669,17 @@ we_slept:
 	spin_unlock(&dq_list_lock);
 }
 
+struct dquot *dquot_alloc(struct super_block *sb, int type)
+{
+	return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+}
+EXPORT_SYMBOL(dquot_alloc);
+
 static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 {
 	struct dquot *dquot;
 
-	dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+	dquot = sb->dq_op->alloc_dquot(sb, type);
 	if(!dquot)
 		return NODQUOT;
 
@@ -647,15 +697,33 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 }
 
 /*
+ * Check whether dquot is in memory.
+ * MUST be called with either dqptr_sem or dqonoff_mutex held
+ */
+int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
+{
+	unsigned int hashent = hashfn(sb, id, type);
+	int ret = 0;
+
+        if (!sb_has_quota_active(sb, type))
+		return 0;
+	spin_lock(&dq_list_lock);
+	if (find_dquot(hashent, sb, id, type) != NODQUOT)
+		ret = 1;
+	spin_unlock(&dq_list_lock);
+	return ret;
+}
+
+/*
  * Get reference to dquot
  * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
-static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
+struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
 	unsigned int hashent = hashfn(sb, id, type);
 	struct dquot *dquot, *empty = NODQUOT;
 
-        if (!sb_has_quota_enabled(sb, type))
+        if (!sb_has_quota_active(sb, type))
 		return NODQUOT;
 we_slept:
 	spin_lock(&dq_list_lock);
@@ -682,7 +750,7 @@ we_slept:
 		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
 		if (empty)
-			kmem_cache_free(dquot_cachep, empty);
+			do_destroy_dquot(empty);
 	}
 	/* Wait for dq_lock - after this we know that either dquot_release() is already
 	 * finished or it will be canceled due to dq_count > 1 test */
@@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
 	}
 }
 
-static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
 {
 	dquot->dq_dqb.dqb_curinodes += number;
 }
@@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
 	dquot->dq_dqb.dqb_curspace += number;
 }
 
-static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 {
-	if (dquot->dq_dqb.dqb_curinodes > number)
+	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+	    dquot->dq_dqb.dqb_curinodes >= number)
 		dquot->dq_dqb.dqb_curinodes -= number;
 	else
 		dquot->dq_dqb.dqb_curinodes = 0;
@@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
 
 static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
 {
-	if (dquot->dq_dqb.dqb_curspace > number)
+	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+	    dquot->dq_dqb.dqb_curspace >= number)
 		dquot->dq_dqb.dqb_curspace -= number;
 	else
 		dquot->dq_dqb.dqb_curspace = 0;
-	if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+	if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
 		dquot->dq_dqb.dqb_btime = (time_t) 0;
 	clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 }
@@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot)
 }
 
 /* needs dq_data_lock */
-static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
+static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 {
 	*warntype = QUOTA_NL_NOWARN;
-	if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 
 	if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
 static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
 {
 	*warntype = QUOTA_NL_NOWARN;
-	if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 
 	if (dquot->dq_dqb.dqb_bhardlimit &&
-	   toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit &&
+	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
 			*warntype = QUOTA_NL_BHARDWARN;
@@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	}
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
-	   toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
 	    dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
@@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	}
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
-	   toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
 	    dquot->dq_dqb.dqb_btime == 0) {
 		if (!prealloc) {
 			*warntype = QUOTA_NL_BSOFTWARN;
@@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	return QUOTA_OK;
 }
 
-static int info_idq_free(struct dquot *dquot, ulong inodes)
+static int info_idq_free(struct dquot *dquot, qsize_t inodes)
 {
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
-	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
+	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
+	    !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
 		return QUOTA_NL_NOWARN;
 
 	if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
 static int info_bdq_free(struct dquot *dquot, qsize_t space)
 {
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
-	    toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+	    dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
 		return QUOTA_NL_NOWARN;
 
-	if (toqb(dquot->dq_dqb.dqb_curspace - space) <=
-	    dquot->dq_dqb.dqb_bsoftlimit)
+	if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
 		return QUOTA_NL_BSOFTBELOW;
-	if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit &&
-	    toqb(dquot->dq_dqb.dqb_curspace - space) <
-						dquot->dq_dqb.dqb_bhardlimit)
+	if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
+	    dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
 		return QUOTA_NL_BHARDBELOW;
 	return QUOTA_NL_NOWARN;
 }
@@ -1166,17 +1237,23 @@ out_err:
  * 	Release all quotas referenced by inode
  *	Transaction must be started at an entry
  */
-int dquot_drop(struct inode *inode)
+int dquot_drop_locked(struct inode *inode)
 {
 	int cnt;
 
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (inode->i_dquot[cnt] != NODQUOT) {
 			dqput(inode->i_dquot[cnt]);
 			inode->i_dquot[cnt] = NODQUOT;
 		}
 	}
+	return 0;
+}
+
+int dquot_drop(struct inode *inode)
+{
+	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	dquot_drop_locked(inode);
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return 0;
 }
@@ -1264,7 +1341,7 @@ warn_put_all:
 /*
  * This operation can block, but only after everything is updated
  */
-int dquot_alloc_inode(const struct inode *inode, unsigned long number)
+int dquot_alloc_inode(const struct inode *inode, qsize_t number)
 {
 	int cnt, ret = NO_QUOTA;
 	char warntype[MAXQUOTAS];
@@ -1349,7 +1426,7 @@ out_sub:
 /*
  * This operation can block, but only after everything is updated
  */
-int dquot_free_inode(const struct inode *inode, unsigned long number)
+int dquot_free_inode(const struct inode *inode, qsize_t number)
 {
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
@@ -1495,7 +1572,7 @@ warn_put_all:
 /* Wrapper for transferring ownership of an inode */
 int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
 {
-	if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) {
+	if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
 		vfs_dq_init(inode);
 		if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
 			return 1;
@@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = {
 	.acquire_dquot	= dquot_acquire,
 	.release_dquot	= dquot_release,
 	.mark_dirty	= dquot_mark_dquot_dirty,
-	.write_info	= dquot_commit_info
+	.write_info	= dquot_commit_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 
-static inline void set_enable_flags(struct quota_info *dqopt, int type)
-{
-	switch (type) {
-		case USRQUOTA:
-			dqopt->flags |= DQUOT_USR_ENABLED;
-			dqopt->flags &= ~DQUOT_USR_SUSPENDED;
-			break;
-		case GRPQUOTA:
-			dqopt->flags |= DQUOT_GRP_ENABLED;
-			dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
-			break;
-	}
-}
-
-static inline void reset_enable_flags(struct quota_info *dqopt, int type,
-				      int remount)
-{
-	switch (type) {
-		case USRQUOTA:
-			dqopt->flags &= ~DQUOT_USR_ENABLED;
-			if (remount)
-				dqopt->flags |= DQUOT_USR_SUSPENDED;
-			else
-				dqopt->flags &= ~DQUOT_USR_SUSPENDED;
-			break;
-		case GRPQUOTA:
-			dqopt->flags &= ~DQUOT_GRP_ENABLED;
-			if (remount)
-				dqopt->flags |= DQUOT_GRP_SUSPENDED;
-			else
-				dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
-			break;
-	}
-}
-
-
 /*
  * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
  */
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 {
 	int cnt, ret = 0;
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct inode *toputinode[MAXQUOTAS];
 
+	/* Cannot turn off usage accounting without turning off limits, or
+	 * suspend quotas and simultaneously turn quotas off. */
+	if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
+	    || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
+	    DQUOT_USAGE_ENABLED)))
+		return -EINVAL;
+
 	/* We need to serialize quota_off() for device */
 	mutex_lock(&dqopt->dqonoff_mutex);
 
@@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 	 * sometimes we are called when fill_super() failed and calling
 	 * sync_fs() in such cases does no good.
 	 */
-	if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) {
+	if (!sb_any_quota_loaded(sb)) {
 		mutex_unlock(&dqopt->dqonoff_mutex);
 		return 0;
 	}
@@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 		toputinode[cnt] = NULL;
 		if (type != -1 && cnt != type)
 			continue;
-		/* If we keep inodes of quota files after remount and quotaoff
-		 * is called, drop kept inodes. */
-		if (!remount && sb_has_quota_suspended(sb, cnt)) {
-			iput(dqopt->files[cnt]);
-			dqopt->files[cnt] = NULL;
-			reset_enable_flags(dqopt, cnt, 0);
+		if (!sb_has_quota_loaded(sb, cnt))
 			continue;
+
+		if (flags & DQUOT_SUSPENDED) {
+			dqopt->flags |=
+				dquot_state_flag(DQUOT_SUSPENDED, cnt);
+		} else {
+			dqopt->flags &= ~dquot_state_flag(flags, cnt);
+			/* Turning off suspended quotas? */
+			if (!sb_has_quota_loaded(sb, cnt) &&
+			    sb_has_quota_suspended(sb, cnt)) {
+				dqopt->flags &=	~dquot_state_flag(
+							DQUOT_SUSPENDED, cnt);
+				iput(dqopt->files[cnt]);
+				dqopt->files[cnt] = NULL;
+				continue;
+			}
 		}
-		if (!sb_has_quota_enabled(sb, cnt))
+
+		/* We still have to keep quota loaded? */
+		if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
 			continue;
-		reset_enable_flags(dqopt, cnt, remount);
 
 		/* Note: these are blocking operations */
 		drop_dquot_ref(sb, cnt);
@@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 		put_quota_format(dqopt->info[cnt].dqi_format);
 
 		toputinode[cnt] = dqopt->files[cnt];
-		if (!remount)
+		if (!sb_has_quota_loaded(sb, cnt))
 			dqopt->files[cnt] = NULL;
 		dqopt->info[cnt].dqi_flags = 0;
 		dqopt->info[cnt].dqi_igrace = 0;
@@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 		dqopt->ops[cnt] = NULL;
 	}
 	mutex_unlock(&dqopt->dqonoff_mutex);
+
+	/* Skip syncing and setting flags if quota files are hidden */
+	if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+		goto put_inodes;
+
 	/* Sync the superblock so that buffers with quota data are written to
 	 * disk (and so userspace sees correct data afterwards). */
 	if (sb->s_op->sync_fs)
@@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 			mutex_lock(&dqopt->dqonoff_mutex);
 			/* If quota was reenabled in the meantime, we have
 			 * nothing to do */
-			if (!sb_has_quota_enabled(sb, cnt)) {
+			if (!sb_has_quota_loaded(sb, cnt)) {
 				mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
 				toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
 				  S_NOATIME | S_NOQUOTA);
@@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 				mark_inode_dirty(toputinode[cnt]);
 			}
 			mutex_unlock(&dqopt->dqonoff_mutex);
+		}
+	if (sb->s_bdev)
+		invalidate_bdev(sb->s_bdev);
+put_inodes:
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (toputinode[cnt]) {
 			/* On remount RO, we keep the inode pointer so that we
-			 * can reenable quota on the subsequent remount RW.
-			 * But we have better not keep inode pointer when there
-			 * is pending delete on the quota file... */
-			if (!remount)
+			 * can reenable quota on the subsequent remount RW. We
+			 * have to check 'flags' variable and not use sb_has_
+			 * function because another quotaon / quotaoff could
+			 * change global state before we got here. We refuse
+			 * to suspend quotas when there is pending delete on
+			 * the quota file... */
+			if (!(flags & DQUOT_SUSPENDED))
 				iput(toputinode[cnt]);
 			else if (!toputinode[cnt]->i_nlink)
 				ret = -EBUSY;
 		}
-	if (sb->s_bdev)
-		invalidate_bdev(sb->s_bdev);
 	return ret;
 }
 
+int vfs_quota_off(struct super_block *sb, int type, int remount)
+{
+	return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
+				 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+}
+
 /*
  *	Turn quotas on on a device
  */
 
-/* Helper function when we already have the inode */
-static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
+/*
+ * Helper function to turn quotas on when we already have the inode of
+ * quota file and no quota information is loaded.
+ */
+static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+	unsigned int flags)
 {
 	struct quota_format_type *fmt = find_quota_format(format_id);
 	struct super_block *sb = inode->i_sb;
@@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 		error = -EINVAL;
 		goto out_fmt;
 	}
+	/* Usage always has to be set... */
+	if (!(flags & DQUOT_USAGE_ENABLED)) {
+		error = -EINVAL;
+		goto out_fmt;
+	}
 
-	/* As we bypass the pagecache we must now flush the inode so that
-	 * we see all the changes from userspace... */
-	write_inode_now(inode, 1);
-	/* And now flush the block cache so that kernel sees the changes */
-	invalidate_bdev(sb->s_bdev);
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+		/* As we bypass the pagecache we must now flush the inode so
+		 * that we see all the changes from userspace... */
+		write_inode_now(inode, 1);
+		/* And now flush the block cache so that kernel sees the
+		 * changes */
+		invalidate_bdev(sb->s_bdev);
+	}
 	mutex_lock(&inode->i_mutex);
 	mutex_lock(&dqopt->dqonoff_mutex);
-	if (sb_has_quota_enabled(sb, type) ||
-			sb_has_quota_suspended(sb, type)) {
+	if (sb_has_quota_loaded(sb, type)) {
 		error = -EBUSY;
 		goto out_lock;
 	}
-	/* We don't want quota and atime on quota files (deadlocks possible)
-	 * Also nobody should write to the file - we use special IO operations
-	 * which ignore the immutable bit. */
-	down_write(&dqopt->dqptr_sem);
-	oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
-	inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
-	up_write(&dqopt->dqptr_sem);
-	sb->dq_op->drop(inode);
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+		/* We don't want quota and atime on quota files (deadlocks
+		 * possible) Also nobody should write to the file - we use
+		 * special IO operations which ignore the immutable bit. */
+		down_write(&dqopt->dqptr_sem);
+		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
+		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+		up_write(&dqopt->dqptr_sem);
+		sb->dq_op->drop(inode);
+	}
 
 	error = -EIO;
 	dqopt->files[type] = igrab(inode);
@@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 	}
 	mutex_unlock(&dqopt->dqio_mutex);
 	mutex_unlock(&inode->i_mutex);
-	set_enable_flags(dqopt, type);
+	dqopt->flags |= dquot_state_flag(flags, type);
 
 	add_dquot_ref(sb, type);
 	mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct inode *inode;
 	int ret;
+	unsigned int flags;
 
 	mutex_lock(&dqopt->dqonoff_mutex);
 	if (!sb_has_quota_suspended(sb, type)) {
 		mutex_unlock(&dqopt->dqonoff_mutex);
 		return 0;
 	}
-	BUG_ON(sb_has_quota_enabled(sb, type));
-
 	inode = dqopt->files[type];
 	dqopt->files[type] = NULL;
-	reset_enable_flags(dqopt, type, 0);
+	flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+						DQUOT_LIMITS_ENABLED, type);
+	dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 
-	ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id);
+	flags = dquot_generic_flag(flags, type);
+	ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
+				   flags);
 	iput(inode);
 
 	return ret;
@@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
 	if (path->mnt->mnt_sb != sb)
 		error = -EXDEV;
 	else
-		error = vfs_quota_on_inode(path->dentry->d_inode, type,
-					   format_id);
+		error = vfs_load_quota_inode(path->dentry->d_inode, type,
+					     format_id, DQUOT_USAGE_ENABLED |
+					     DQUOT_LIMITS_ENABLED);
 	return error;
 }
 
-/* Actual function called from quotactl() */
 int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 		 int remount)
 {
@@ -1823,6 +1919,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 }
 
 /*
+ * More powerful function for turning on quotas allowing setting
+ * of individual quota flags
+ */
+int vfs_quota_enable(struct inode *inode, int type, int format_id,
+		unsigned int flags)
+{
+	int ret = 0;
+	struct super_block *sb = inode->i_sb;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	/* Just unsuspend quotas? */
+	if (flags & DQUOT_SUSPENDED)
+		return vfs_quota_on_remount(sb, type);
+	if (!flags)
+		return 0;
+	/* Just updating flags needed? */
+	if (sb_has_quota_loaded(sb, type)) {
+		mutex_lock(&dqopt->dqonoff_mutex);
+		/* Now do a reliable test... */
+		if (!sb_has_quota_loaded(sb, type)) {
+			mutex_unlock(&dqopt->dqonoff_mutex);
+			goto load_quota;
+		}
+		if (flags & DQUOT_USAGE_ENABLED &&
+		    sb_has_quota_usage_enabled(sb, type)) {
+			ret = -EBUSY;
+			goto out_lock;
+		}
+		if (flags & DQUOT_LIMITS_ENABLED &&
+		    sb_has_quota_limits_enabled(sb, type)) {
+			ret = -EBUSY;
+			goto out_lock;
+		}
+		sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
+out_lock:
+		mutex_unlock(&dqopt->dqonoff_mutex);
+		return ret;
+	}
+
+load_quota:
+	return vfs_load_quota_inode(inode, type, format_id, flags);
+}
+
+/*
  * This function is used when filesystem needs to initialize quotas
  * during mount time.
  */
@@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
 
 	error = security_quota_on(dentry);
 	if (!error)
-		error = vfs_quota_on_inode(dentry->d_inode, type, format_id);
+		error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
+				DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 out:
 	dput(dentry);
@@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
 	return ret;
 }
 
+static inline qsize_t qbtos(qsize_t blocks)
+{
+	return blocks << QIF_DQBLKSIZE_BITS;
+}
+
+static inline qsize_t stoqb(qsize_t space)
+{
+	return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+}
+
 /* Generic routine for getting common part of quota structure */
 static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 {
 	struct mem_dqblk *dm = &dquot->dq_dqb;
 
 	spin_lock(&dq_data_lock);
-	di->dqb_bhardlimit = dm->dqb_bhardlimit;
-	di->dqb_bsoftlimit = dm->dqb_bsoftlimit;
+	di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
+	di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
 	di->dqb_curspace = dm->dqb_curspace;
 	di->dqb_ihardlimit = dm->dqb_ihardlimit;
 	di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1918,28 +2069,38 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	if (di->dqb_valid & QIF_SPACE) {
 		dm->dqb_curspace = di->dqb_curspace;
 		check_blim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_BLIMITS) {
-		dm->dqb_bsoftlimit = di->dqb_bsoftlimit;
-		dm->dqb_bhardlimit = di->dqb_bhardlimit;
+		dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
+		dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
 		check_blim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_INODES) {
 		dm->dqb_curinodes = di->dqb_curinodes;
 		check_ilim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ILIMITS) {
 		dm->dqb_isoftlimit = di->dqb_isoftlimit;
 		dm->dqb_ihardlimit = di->dqb_ihardlimit;
 		check_ilim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
 	}
-	if (di->dqb_valid & QIF_BTIME)
+	if (di->dqb_valid & QIF_BTIME) {
 		dm->dqb_btime = di->dqb_btime;
-	if (di->dqb_valid & QIF_ITIME)
+		check_blim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+	}
+	if (di->dqb_valid & QIF_ITIME) {
 		dm->dqb_itime = di->dqb_itime;
+		check_ilim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+	}
 
 	if (check_blim) {
-		if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) {
+		if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
 			dm->dqb_btime = 0;
 			clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 		}
@@ -1970,12 +2131,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 	int rc;
 
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!(dquot = dqget(sb, id, type))) {
-		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-		return -ESRCH;
+	dquot = dqget(sb, id, type);
+	if (!dquot) {
+		rc = -ESRCH;
+		goto out;
 	}
 	rc = do_set_dqblk(dquot, di);
 	dqput(dquot);
+out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return rc;
 }
@@ -1986,7 +2149,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	struct mem_dqinfo *mi;
   
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!sb_has_quota_enabled(sb, type)) {
+	if (!sb_has_quota_active(sb, type)) {
 		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 		return -ESRCH;
 	}
@@ -2005,11 +2168,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
+	int err = 0;
 
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!sb_has_quota_enabled(sb, type)) {
-		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-		return -ESRCH;
+	if (!sb_has_quota_active(sb, type)) {
+		err = -ESRCH;
+		goto out;
 	}
 	mi = sb_dqopt(sb)->info + type;
 	spin_lock(&dq_data_lock);
@@ -2023,8 +2187,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	mark_info_dirty(sb, type);
 	/* Force write to disk */
 	sb->dq_op->write_info(sb, type);
+out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-	return 0;
+	return err;
 }
 
 struct quotactl_ops vfs_quotactl_ops = {
@@ -2186,10 +2351,13 @@ EXPORT_SYMBOL(register_quota_format);
 EXPORT_SYMBOL(unregister_quota_format);
 EXPORT_SYMBOL(dqstats);
 EXPORT_SYMBOL(dq_data_lock);
+EXPORT_SYMBOL(vfs_quota_enable);
 EXPORT_SYMBOL(vfs_quota_on);
 EXPORT_SYMBOL(vfs_quota_on_path);
 EXPORT_SYMBOL(vfs_quota_on_mount);
+EXPORT_SYMBOL(vfs_quota_disable);
 EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_scan_active);
 EXPORT_SYMBOL(vfs_quota_sync);
 EXPORT_SYMBOL(vfs_get_dqinfo);
 EXPORT_SYMBOL(vfs_set_dqinfo);
@@ -2202,7 +2370,11 @@ EXPORT_SYMBOL(dquot_release);
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 EXPORT_SYMBOL(dquot_initialize);
 EXPORT_SYMBOL(dquot_drop);
+EXPORT_SYMBOL(dquot_drop_locked);
 EXPORT_SYMBOL(vfs_dq_drop);
+EXPORT_SYMBOL(dqget);
+EXPORT_SYMBOL(dqput);
+EXPORT_SYMBOL(dquot_is_cached);
 EXPORT_SYMBOL(dquot_alloc_space);
 EXPORT_SYMBOL(dquot_alloc_inode);
 EXPORT_SYMBOL(dquot_free_space);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 6046239465a1..c01e043670e2 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -175,8 +175,8 @@ out:
  *
  * Returns zero on success; non-zero on error.
  */
-static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
-			      loff_t offset)
+int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+		       loff_t offset)
 {
 	int rc = 0;
 	char dst[MD5_DIGEST_SIZE];
@@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags(
 		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
 	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
 		crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED;
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+		crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES;
+		if (mount_crypt_stat->flags
+		    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)
+			crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK;
+		else if (mount_crypt_stat->flags
+			 & ECRYPTFS_GLOBAL_ENCFN_USE_FEK)
+			crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK;
+	}
 }
 
 static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
@@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem {
 static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
 	{0x00000001, ECRYPTFS_ENABLE_HMAC},
 	{0x00000002, ECRYPTFS_ENCRYPTED},
-	{0x00000004, ECRYPTFS_METADATA_IN_XATTR}
+	{0x00000004, ECRYPTFS_METADATA_IN_XATTR},
+	{0x00000008, ECRYPTFS_ENCRYPT_FILENAMES}
 };
 
 /**
@@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = {
 
 /**
  * ecryptfs_code_for_cipher_string
- * @crypt_stat: The cryptographic context
+ * @cipher_name: The string alias for the cipher
+ * @key_bytes: Length of key in bytes; used for AES code selection
  *
  * Returns zero on no match, or the cipher code on match
  */
-u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
+u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes)
 {
 	int i;
 	u8 code = 0;
 	struct ecryptfs_cipher_code_str_map_elem *map =
 		ecryptfs_cipher_code_str_map;
 
-	if (strcmp(crypt_stat->cipher, "aes") == 0) {
-		switch (crypt_stat->key_size) {
+	if (strcmp(cipher_name, "aes") == 0) {
+		switch (key_bytes) {
 		case 16:
 			code = RFC2440_CIPHER_AES_128;
 			break;
@@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
 		}
 	} else {
 		for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
-			if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){
+			if (strcmp(cipher_name, map[i].cipher_str) == 0) {
 				code = map[i].cipher_code;
 				break;
 			}
@@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data,
 		&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
 	int rc;
 
+	if (crypt_stat->extent_size == 0)
+		crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
 	rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
 				 ecryptfs_inode);
 	if (rc) {
@@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data,
 	}
 	if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
 		rc = -EINVAL;
-		ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n");
 	}
 out:
 	return rc;
@@ -1628,95 +1640,95 @@ out:
 }
 
 /**
- * ecryptfs_encode_filename - converts a plaintext file name to cipher text
- * @crypt_stat: The crypt_stat struct associated with the file anem to encode
- * @name: The plaintext name
- * @length: The length of the plaintext
- * @encoded_name: The encypted name
+ * ecryptfs_encrypt_filename - encrypt filename
  *
- * Encrypts and encodes a filename into something that constitutes a
- * valid filename for a filesystem, with printable characters.
+ * CBC-encrypts the filename. We do not want to encrypt the same
+ * filename with the same key and IV, which may happen with hard
+ * links, so we prepend random bits to each filename.
  *
- * We assume that we have a properly initialized crypto context,
- * pointed to by crypt_stat->tfm.
- *
- * TODO: Implement filename decoding and decryption here, in place of
- * memcpy. We are keeping the framework around for now to (1)
- * facilitate testing of the components needed to implement filename
- * encryption and (2) to provide a code base from which other
- * developers in the community can easily implement this feature.
- *
- * Returns the length of encoded filename; negative if error
+ * Returns zero on success; non-zero otherwise
  */
-int
-ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
-			 const char *name, int length, char **encoded_name)
+static int
+ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
+			  struct ecryptfs_crypt_stat *crypt_stat,
+			  struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
 {
-	int error = 0;
+	int rc = 0;
 
-	(*encoded_name) = kmalloc(length + 2, GFP_KERNEL);
-	if (!(*encoded_name)) {
-		error = -ENOMEM;
+	filename->encrypted_filename = NULL;
+	filename->encrypted_filename_size = 0;
+	if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+	    || (mount_crypt_stat && (mount_crypt_stat->flags
+				     & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+		size_t packet_size;
+		size_t remaining_bytes;
+
+		rc = ecryptfs_write_tag_70_packet(
+			NULL, NULL,
+			&filename->encrypted_filename_size,
+			mount_crypt_stat, NULL,
+			filename->filename_size);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to get packet "
+			       "size for tag 72; rc = [%d]\n", __func__,
+			       rc);
+			filename->encrypted_filename_size = 0;
+			goto out;
+		}
+		filename->encrypted_filename =
+			kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
+		if (!filename->encrypted_filename) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kmalloc [%zd] bytes\n", __func__,
+			       filename->encrypted_filename_size);
+			rc = -ENOMEM;
+			goto out;
+		}
+		remaining_bytes = filename->encrypted_filename_size;
+		rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename,
+						  &remaining_bytes,
+						  &packet_size,
+						  mount_crypt_stat,
+						  filename->filename,
+						  filename->filename_size);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to generate "
+			       "tag 70 packet; rc = [%d]\n", __func__,
+			       rc);
+			kfree(filename->encrypted_filename);
+			filename->encrypted_filename = NULL;
+			filename->encrypted_filename_size = 0;
+			goto out;
+		}
+		filename->encrypted_filename_size = packet_size;
+	} else {
+		printk(KERN_ERR "%s: No support for requested filename "
+		       "encryption method in this release\n", __func__);
+		rc = -ENOTSUPP;
 		goto out;
 	}
-	/* TODO: Filename encryption is a scheduled feature for a
-	 * future version of eCryptfs. This function is here only for
-	 * the purpose of providing a framework for other developers
-	 * to easily implement filename encryption. Hint: Replace this
-	 * memcpy() with a call to encrypt and encode the
-	 * filename, the set the length accordingly. */
-	memcpy((void *)(*encoded_name), (void *)name, length);
-	(*encoded_name)[length] = '\0';
-	error = length + 1;
 out:
-	return error;
+	return rc;
 }
 
-/**
- * ecryptfs_decode_filename - converts the cipher text name to plaintext
- * @crypt_stat: The crypt_stat struct associated with the file
- * @name: The filename in cipher text
- * @length: The length of the cipher text name
- * @decrypted_name: The plaintext name
- *
- * Decodes and decrypts the filename.
- *
- * We assume that we have a properly initialized crypto context,
- * pointed to by crypt_stat->tfm.
- *
- * TODO: Implement filename decoding and decryption here, in place of
- * memcpy. We are keeping the framework around for now to (1)
- * facilitate testing of the components needed to implement filename
- * encryption and (2) to provide a code base from which other
- * developers in the community can easily implement this feature.
- *
- * Returns the length of decoded filename; negative if error
- */
-int
-ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
-			 const char *name, int length, char **decrypted_name)
+static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
+				  const char *name, size_t name_size)
 {
-	int error = 0;
+	int rc = 0;
 
-	(*decrypted_name) = kmalloc(length + 2, GFP_KERNEL);
-	if (!(*decrypted_name)) {
-		error = -ENOMEM;
+	(*copied_name) = kmalloc((name_size + 2), GFP_KERNEL);
+	if (!(*copied_name)) {
+		rc = -ENOMEM;
 		goto out;
 	}
-	/* TODO: Filename encryption is a scheduled feature for a
-	 * future version of eCryptfs. This function is here only for
-	 * the purpose of providing a framework for other developers
-	 * to easily implement filename encryption. Hint: Replace this
-	 * memcpy() with a call to decode and decrypt the
-	 * filename, the set the length accordingly. */
-	memcpy((void *)(*decrypted_name), (void *)name, length);
-	(*decrypted_name)[length + 1] = '\0';	/* Only for convenience
+	memcpy((void *)(*copied_name), (void *)name, name_size);
+	(*copied_name)[(name_size)] = '\0';	/* Only for convenience
 						 * in printing out the
 						 * string in debug
 						 * messages */
-	error = length;
+	(*copied_name_size) = (name_size + 1);
 out:
-	return error;
+	return rc;
 }
 
 /**
@@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
 	*key_tfm = NULL;
 	if (*key_size > ECRYPTFS_MAX_KEY_BYTES) {
 		rc = -EINVAL;
-		printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum "
+		printk(KERN_ERR "Requested key size is [%zd] bytes; maximum "
 		      "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES);
 		goto out;
 	}
@@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
 	get_random_bytes(dummy_key, *key_size);
 	rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
 	if (rc) {
-		printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
+		printk(KERN_ERR "Error attempting to set key of size [%zd] for "
 		       "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
 		rc = -EINVAL;
 		goto out;
@@ -1910,3 +1922,341 @@ out:
 	mutex_unlock(&key_tfm_list_mutex);
 	return rc;
 }
+
+/* 64 characters forming a 6-bit target field */
+static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
+						 "EFGHIJKLMNOPQRST"
+						 "UVWXYZabcdefghij"
+						 "klmnopqrstuvwxyz");
+
+/* We could either offset on every reverse map or just pad some 0x00's
+ * at the front here */
+static const unsigned char filename_rev_map[] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */
+	0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */
+	0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */
+	0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */
+	0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */
+	0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */
+	0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */
+	0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
+	0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
+	0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
+	0x3D, 0x3E, 0x3F
+};
+
+/**
+ * ecryptfs_encode_for_filename
+ * @dst: Destination location for encoded filename
+ * @dst_size: Size of the encoded filename in bytes
+ * @src: Source location for the filename to encode
+ * @src_size: Size of the source in bytes
+ */
+void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
+				  unsigned char *src, size_t src_size)
+{
+	size_t num_blocks;
+	size_t block_num = 0;
+	size_t dst_offset = 0;
+	unsigned char last_block[3];
+
+	if (src_size == 0) {
+		(*dst_size) = 0;
+		goto out;
+	}
+	num_blocks = (src_size / 3);
+	if ((src_size % 3) == 0) {
+		memcpy(last_block, (&src[src_size - 3]), 3);
+	} else {
+		num_blocks++;
+		last_block[2] = 0x00;
+		switch (src_size % 3) {
+		case 1:
+			last_block[0] = src[src_size - 1];
+			last_block[1] = 0x00;
+			break;
+		case 2:
+			last_block[0] = src[src_size - 2];
+			last_block[1] = src[src_size - 1];
+		}
+	}
+	(*dst_size) = (num_blocks * 4);
+	if (!dst)
+		goto out;
+	while (block_num < num_blocks) {
+		unsigned char *src_block;
+		unsigned char dst_block[4];
+
+		if (block_num == (num_blocks - 1))
+			src_block = last_block;
+		else
+			src_block = &src[block_num * 3];
+		dst_block[0] = ((src_block[0] >> 2) & 0x3F);
+		dst_block[1] = (((src_block[0] << 4) & 0x30)
+				| ((src_block[1] >> 4) & 0x0F));
+		dst_block[2] = (((src_block[1] << 2) & 0x3C)
+				| ((src_block[2] >> 6) & 0x03));
+		dst_block[3] = (src_block[2] & 0x3F);
+		dst[dst_offset++] = portable_filename_chars[dst_block[0]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[1]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[2]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[3]];
+		block_num++;
+	}
+out:
+	return;
+}
+
+/**
+ * ecryptfs_decode_from_filename
+ * @dst: If NULL, this function only sets @dst_size and returns. If
+ *       non-NULL, this function decodes the encoded octets in @src
+ *       into the memory that @dst points to.
+ * @dst_size: Set to the size of the decoded string.
+ * @src: The encoded set of octets to decode.
+ * @src_size: The size of the encoded set of octets to decode.
+ */
+static void
+ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
+			      const unsigned char *src, size_t src_size)
+{
+	u8 current_bit_offset = 0;
+	size_t src_byte_offset = 0;
+	size_t dst_byte_offset = 0;
+
+	if (dst == NULL) {
+		/* Not exact; conservatively long. Every block of 4
+		 * encoded characters decodes into a block of 3
+		 * decoded characters. This segment of code provides
+		 * the caller with the maximum amount of allocated
+		 * space that @dst will need to point to in a
+		 * subsequent call. */
+		(*dst_size) = (((src_size + 1) * 3) / 4);
+		goto out;
+	}
+	while (src_byte_offset < src_size) {
+		unsigned char src_byte =
+				filename_rev_map[(int)src[src_byte_offset]];
+
+		switch (current_bit_offset) {
+		case 0:
+			dst[dst_byte_offset] = (src_byte << 2);
+			current_bit_offset = 6;
+			break;
+		case 6:
+			dst[dst_byte_offset++] |= (src_byte >> 4);
+			dst[dst_byte_offset] = ((src_byte & 0xF)
+						 << 4);
+			current_bit_offset = 4;
+			break;
+		case 4:
+			dst[dst_byte_offset++] |= (src_byte >> 2);
+			dst[dst_byte_offset] = (src_byte << 6);
+			current_bit_offset = 2;
+			break;
+		case 2:
+			dst[dst_byte_offset++] |= (src_byte);
+			dst[dst_byte_offset] = 0;
+			current_bit_offset = 0;
+			break;
+		}
+		src_byte_offset++;
+	}
+	(*dst_size) = dst_byte_offset;
+out:
+	return;
+}
+
+/**
+ * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
+ * @crypt_stat: The crypt_stat struct associated with the file anem to encode
+ * @name: The plaintext name
+ * @length: The length of the plaintext
+ * @encoded_name: The encypted name
+ *
+ * Encrypts and encodes a filename into something that constitutes a
+ * valid filename for a filesystem, with printable characters.
+ *
+ * We assume that we have a properly initialized crypto context,
+ * pointed to by crypt_stat->tfm.
+ *
+ * Returns zero on success; non-zero on otherwise
+ */
+int ecryptfs_encrypt_and_encode_filename(
+	char **encoded_name,
+	size_t *encoded_name_size,
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+	const char *name, size_t name_size)
+{
+	size_t encoded_name_no_prefix_size;
+	int rc = 0;
+
+	(*encoded_name) = NULL;
+	(*encoded_name_size) = 0;
+	if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+	    || (mount_crypt_stat && (mount_crypt_stat->flags
+				     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
+		struct ecryptfs_filename *filename;
+
+		filename = kzalloc(sizeof(*filename), GFP_KERNEL);
+		if (!filename) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kzalloc [%zd] bytes\n", __func__,
+			       sizeof(*filename));
+			rc = -ENOMEM;
+			goto out;
+		}
+		filename->filename = (char *)name;
+		filename->filename_size = name_size;
+		rc = ecryptfs_encrypt_filename(filename, crypt_stat,
+					       mount_crypt_stat);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to encrypt "
+			       "filename; rc = [%d]\n", __func__, rc);
+			kfree(filename);
+			goto out;
+		}
+		ecryptfs_encode_for_filename(
+			NULL, &encoded_name_no_prefix_size,
+			filename->encrypted_filename,
+			filename->encrypted_filename_size);
+		if ((crypt_stat && (crypt_stat->flags
+				    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+		    || (mount_crypt_stat
+			&& (mount_crypt_stat->flags
+			    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
+			(*encoded_name_size) =
+				(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+		else
+			(*encoded_name_size) =
+				(ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+		(*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
+		if (!(*encoded_name)) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kzalloc [%zd] bytes\n", __func__,
+			       (*encoded_name_size));
+			rc = -ENOMEM;
+			kfree(filename->encrypted_filename);
+			kfree(filename);
+			goto out;
+		}
+		if ((crypt_stat && (crypt_stat->flags
+				    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+		    || (mount_crypt_stat
+			&& (mount_crypt_stat->flags
+			    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+			memcpy((*encoded_name),
+			       ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+			       ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
+			ecryptfs_encode_for_filename(
+			    ((*encoded_name)
+			     + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE),
+			    &encoded_name_no_prefix_size,
+			    filename->encrypted_filename,
+			    filename->encrypted_filename_size);
+			(*encoded_name_size) =
+				(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+			(*encoded_name)[(*encoded_name_size)] = '\0';
+			(*encoded_name_size)++;
+		} else {
+			rc = -ENOTSUPP;
+		}
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to encode "
+			       "encrypted filename; rc = [%d]\n", __func__,
+			       rc);
+			kfree((*encoded_name));
+			(*encoded_name) = NULL;
+			(*encoded_name_size) = 0;
+		}
+		kfree(filename->encrypted_filename);
+		kfree(filename);
+	} else {
+		rc = ecryptfs_copy_filename(encoded_name,
+					    encoded_name_size,
+					    name, name_size);
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
+ * @plaintext_name: The plaintext name
+ * @plaintext_name_size: The plaintext name size
+ * @ecryptfs_dir_dentry: eCryptfs directory dentry
+ * @name: The filename in cipher text
+ * @name_size: The cipher text name size
+ *
+ * Decrypts and decodes the filename.
+ *
+ * Returns zero on error; non-zero otherwise
+ */
+int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
+					 size_t *plaintext_name_size,
+					 struct dentry *ecryptfs_dir_dentry,
+					 const char *name, size_t name_size)
+{
+	char *decoded_name;
+	size_t decoded_name_size;
+	size_t packet_size;
+	int rc = 0;
+
+	if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
+	    && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+			ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
+		struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+			&ecryptfs_superblock_to_private(
+				ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
+		const char *orig_name = name;
+		size_t orig_name_size = name_size;
+
+		name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+		name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+		ecryptfs_decode_from_filename(NULL, &decoded_name_size,
+					      name, name_size);
+		decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
+		if (!decoded_name) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kmalloc [%zd] bytes\n", __func__,
+			       decoded_name_size);
+			rc = -ENOMEM;
+			goto out;
+		}
+		ecryptfs_decode_from_filename(decoded_name, &decoded_name_size,
+					      name, name_size);
+		rc = ecryptfs_parse_tag_70_packet(plaintext_name,
+						  plaintext_name_size,
+						  &packet_size,
+						  mount_crypt_stat,
+						  decoded_name,
+						  decoded_name_size);
+		if (rc) {
+			printk(KERN_INFO "%s: Could not parse tag 70 packet "
+			       "from filename; copying through filename "
+			       "as-is\n", __func__);
+			rc = ecryptfs_copy_filename(plaintext_name,
+						    plaintext_name_size,
+						    orig_name, orig_name_size);
+			goto out_free;
+		}
+	} else {
+		rc = ecryptfs_copy_filename(plaintext_name,
+					    plaintext_name_size,
+					    name, name_size);
+		goto out;
+	}
+out_free:
+	kfree(decoded_name);
+out:
+	return rc;
+}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a75026d35d16..c11fc95714ab 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -51,12 +51,16 @@
 #define ECRYPTFS_VERSIONING_XATTR                 0x00000010
 #define ECRYPTFS_VERSIONING_MULTKEY               0x00000020
 #define ECRYPTFS_VERSIONING_DEVMISC               0x00000040
+#define ECRYPTFS_VERSIONING_HMAC                  0x00000080
+#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION   0x00000100
+#define ECRYPTFS_VERSIONING_GCM                   0x00000200
 #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
 				  | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
 				  | ECRYPTFS_VERSIONING_PUBKEY \
 				  | ECRYPTFS_VERSIONING_XATTR \
 				  | ECRYPTFS_VERSIONING_MULTKEY \
-				  | ECRYPTFS_VERSIONING_DEVMISC)
+				  | ECRYPTFS_VERSIONING_DEVMISC \
+				  | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
 #define ECRYPTFS_MAX_PASSWORD_LENGTH 64
 #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
 #define ECRYPTFS_SALT_SIZE 8
@@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key)
 #define ECRYPTFS_DEFAULT_CIPHER "aes"
 #define ECRYPTFS_DEFAULT_KEY_BYTES 16
 #define ECRYPTFS_DEFAULT_HASH "md5"
+#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
 #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
 #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key)
 #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41
 #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42
 #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43
+#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename
+					  * as dentry name */
+#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in
+					  * metadata */
+#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as
+					  * dentry name */
+#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
+					  * metadata */
+/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
+ * ECRYPTFS_MAX_IV_BYTES */
+#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
+#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
 #define MD5_DIGEST_SIZE 16
+#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
+#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
+#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
+#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
+#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24
+#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32)
 
 struct ecryptfs_key_sig {
 	struct list_head crypt_stat_list;
 	char keysig[ECRYPTFS_SIG_SIZE_HEX];
 };
 
+struct ecryptfs_filename {
+	struct list_head crypt_stat_list;
+#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001
+	u32 flags;
+	u32 seq_no;
+	char *filename;
+	char *encrypted_filename;
+	size_t filename_size;
+	size_t encrypted_filename_size;
+	char fnek_sig[ECRYPTFS_SIG_SIZE_HEX];
+	char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1];
+};
+
 /**
  * This is the primary struct associated with each encrypted file.
  *
  * TODO: cache align/pack?
  */
 struct ecryptfs_crypt_stat {
-#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
-#define ECRYPTFS_POLICY_APPLIED     0x00000002
-#define ECRYPTFS_NEW_FILE           0x00000004
-#define ECRYPTFS_ENCRYPTED          0x00000008
-#define ECRYPTFS_SECURITY_WARNING   0x00000010
-#define ECRYPTFS_ENABLE_HMAC        0x00000020
-#define ECRYPTFS_ENCRYPT_IV_PAGES   0x00000040
-#define ECRYPTFS_KEY_VALID          0x00000080
-#define ECRYPTFS_METADATA_IN_XATTR  0x00000100
-#define ECRYPTFS_VIEW_AS_ENCRYPTED  0x00000200
-#define ECRYPTFS_KEY_SET            0x00000400
+#define ECRYPTFS_STRUCT_INITIALIZED   0x00000001
+#define ECRYPTFS_POLICY_APPLIED       0x00000002
+#define ECRYPTFS_NEW_FILE             0x00000004
+#define ECRYPTFS_ENCRYPTED            0x00000008
+#define ECRYPTFS_SECURITY_WARNING     0x00000010
+#define ECRYPTFS_ENABLE_HMAC          0x00000020
+#define ECRYPTFS_ENCRYPT_IV_PAGES     0x00000040
+#define ECRYPTFS_KEY_VALID            0x00000080
+#define ECRYPTFS_METADATA_IN_XATTR    0x00000100
+#define ECRYPTFS_VIEW_AS_ENCRYPTED    0x00000200
+#define ECRYPTFS_KEY_SET              0x00000400
+#define ECRYPTFS_ENCRYPT_FILENAMES    0x00000800
+#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
+#define ECRYPTFS_ENCFN_USE_FEK        0x00002000
 	u32 flags;
 	unsigned int file_version;
 	size_t iv_bytes;
@@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat {
 #define ECRYPTFS_XATTR_METADATA_ENABLED        0x00000002
 #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED        0x00000004
 #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED  0x00000008
+#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES      0x00000010
+#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK   0x00000020
+#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK          0x00000040
 	u32 flags;
 	struct list_head global_auth_tok_list;
 	struct mutex global_auth_tok_list_mutex;
 	size_t num_global_auth_toks;
 	size_t global_default_cipher_key_size;
+	size_t global_default_fn_cipher_key_bytes;
 	unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
 						 + 1];
+	unsigned char global_default_fn_cipher_name[
+		ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
+	char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
 };
 
 /* superblock private data. */
@@ -571,13 +617,22 @@ struct ecryptfs_open_req {
 int ecryptfs_interpose(struct dentry *hidden_dentry,
 		       struct dentry *this_dentry, struct super_block *sb,
 		       u32 flags);
+int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
+					struct dentry *lower_dentry,
+					struct ecryptfs_crypt_stat *crypt_stat,
+					struct inode *ecryptfs_dir_inode,
+					struct nameidata *ecryptfs_nd);
+int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
+					 size_t *decrypted_name_size,
+					 struct dentry *ecryptfs_dentry,
+					 const char *name, size_t name_size);
 int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
-int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
-			     const char *name, int length,
-			     char **decrypted_name);
-int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
-			     const char *name, int length,
-			     char **encoded_name);
+int ecryptfs_encrypt_and_encode_filename(
+	char **encoded_name,
+	size_t *encoded_name_size,
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+	const char *name, size_t name_size);
 struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
 void ecryptfs_dump_hex(char *data, int bytes);
 int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
@@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data,
 					     struct inode *ecryptfs_inode);
 int ecryptfs_read_and_validate_xattr_region(char *page_virt,
 					    struct dentry *ecryptfs_dentry);
-u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat);
+u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
 int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
 void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
 int ecryptfs_generate_key_packet_set(char *dest_base,
@@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file,
 			     struct vfsmount *lower_mnt,
 			     const struct cred *cred);
 int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
+int
+ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *filename, size_t filename_size);
+int
+ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *data, size_t max_packet_size);
+int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+		       loff_t offset);
 
 #endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index eb3dc4c7ac06..9e944057001b 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback {
 
 /* Inspired by generic filldir in fs/readdir.c */
 static int
-ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
-		 u64 ino, unsigned int d_type)
+ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
+		 loff_t offset, u64 ino, unsigned int d_type)
 {
-	struct ecryptfs_crypt_stat *crypt_stat;
 	struct ecryptfs_getdents_callback *buf =
 	    (struct ecryptfs_getdents_callback *)dirent;
+	size_t name_size;
+	char *name;
 	int rc;
-	int decoded_length;
-	char *decoded_name;
 
-	crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
 	buf->filldir_called++;
-	decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen,
-						  &decoded_name);
-	if (decoded_length < 0) {
-		rc = decoded_length;
+	rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
+						  buf->dentry, lower_name,
+						  lower_namelen);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to decode and decrypt "
+		       "filename [%s]; rc = [%d]\n", __func__, lower_name,
+		       rc);
 		goto out;
 	}
-	rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset,
-			  ino, d_type);
-	kfree(decoded_name);
+	rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
+	kfree(name);
 	if (rc >= 0)
 		buf->entries_written++;
 out:
@@ -106,8 +106,8 @@ out:
 
 /**
  * ecryptfs_readdir
- * @file: The ecryptfs file struct
- * @dirent: Directory entry
+ * @file: The eCryptfs directory file
+ * @dirent: Directory entry handle
  * @filldir: The filldir callback function
  */
 static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 static int
 ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-	struct file *lower_file = ecryptfs_file_to_lower(file);
-	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	struct inode *lower_inode = lower_dentry->d_inode;
-	int rc = -EINVAL;
-
-	if (lower_inode->i_fop->fsync) {
-		mutex_lock(&lower_inode->i_mutex);
-		rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
-					       datasync);
-		mutex_unlock(&lower_inode->i_mutex);
-	}
-	return rc;
+	return vfs_fsync(ecryptfs_file_to_lower(file),
+			 ecryptfs_dentry_to_lower(dentry),
+			 datasync);
 }
 
 static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5e78fc179886..5697899a168d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir)
 /**
  * ecryptfs_create_underlying_file
  * @lower_dir_inode: inode of the parent in the lower fs of the new file
- * @lower_dentry: New file's dentry in the lower fs
- * @ecryptfs_dentry: New file's dentry in ecryptfs
+ * @dentry: New file's dentry
  * @mode: The mode of the new file
  * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
  *
@@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
 {
 	int rc;
 
-	/* ecryptfs_do_create() calls ecryptfs_interpose(), which opens
-	 * the crypt_stat->lower_file (persistent file) */
+	/* ecryptfs_do_create() calls ecryptfs_interpose() */
 	rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
 	if (unlikely(rc)) {
 		ecryptfs_printk(KERN_WARNING, "Failed to create file in"
@@ -244,141 +242,91 @@ out:
 }
 
 /**
- * ecryptfs_lookup
- * @dir: inode
- * @dentry: The dentry
- * @nd: nameidata, may be NULL
- *
- * Find a file on disk. If the file does not exist, then we'll add it to the
- * dentry cache and continue on to read it from the disk.
+ * ecryptfs_lookup_and_interpose_lower - Perform a lookup
  */
-static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
-				      struct nameidata *nd)
+int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
+					struct dentry *lower_dentry,
+					struct ecryptfs_crypt_stat *crypt_stat,
+					struct inode *ecryptfs_dir_inode,
+					struct nameidata *ecryptfs_nd)
 {
-	int rc = 0;
 	struct dentry *lower_dir_dentry;
-	struct dentry *lower_dentry;
 	struct vfsmount *lower_mnt;
-	char *encoded_name;
-	int encoded_namelen;
-	struct ecryptfs_crypt_stat *crypt_stat = NULL;
+	struct inode *lower_inode;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
 	char *page_virt = NULL;
-	struct inode *lower_inode;
 	u64 file_size;
+	int rc = 0;
 
-	lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
-	dentry->d_op = &ecryptfs_dops;
-	if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, "."))
-	    || (dentry->d_name.len == 2
-		&& !strcmp(dentry->d_name.name, ".."))) {
-		d_drop(dentry);
-		goto out;
-	}
-	encoded_namelen = ecryptfs_encode_filename(crypt_stat,
-						   dentry->d_name.name,
-						   dentry->d_name.len,
-						   &encoded_name);
-	if (encoded_namelen < 0) {
-		rc = encoded_namelen;
-		d_drop(dentry);
-		goto out;
-	}
-	ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
-			"= [%d]\n", encoded_name, encoded_namelen);
-	lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
-				      encoded_namelen - 1);
-	kfree(encoded_name);
-	if (IS_ERR(lower_dentry)) {
-		ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
-		rc = PTR_ERR(lower_dentry);
-		d_drop(dentry);
-		goto out;
-	}
-	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
-	ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
-       		"d_name.name = [%s]\n", lower_dentry,
-		lower_dentry->d_name.name);
+	lower_dir_dentry = lower_dentry->d_parent;
+	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
+				   ecryptfs_dentry->d_parent));
 	lower_inode = lower_dentry->d_inode;
-	fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode);
+	fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
 	BUG_ON(!atomic_read(&lower_dentry->d_count));
-	ecryptfs_set_dentry_private(dentry,
+	ecryptfs_set_dentry_private(ecryptfs_dentry,
 				    kmem_cache_alloc(ecryptfs_dentry_info_cache,
 						     GFP_KERNEL));
-	if (!ecryptfs_dentry_to_private(dentry)) {
+	if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
 		rc = -ENOMEM;
-		ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting "
-				"to allocate ecryptfs_dentry_info struct\n");
+		printk(KERN_ERR "%s: Out of memory whilst attempting "
+		       "to allocate ecryptfs_dentry_info struct\n",
+			__func__);
 		goto out_dput;
 	}
-	ecryptfs_set_dentry_lower(dentry, lower_dentry);
-	ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
+	ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
+	ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
 	if (!lower_dentry->d_inode) {
 		/* We want to add because we couldn't find in lower */
-		d_add(dentry, NULL);
+		d_add(ecryptfs_dentry, NULL);
 		goto out;
 	}
-	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb,
-				ECRYPTFS_INTERPOSE_FLAG_D_ADD);
+	rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
+				ecryptfs_dir_inode->i_sb, 1);
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error interposing\n");
+		printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
+		       __func__, rc);
 		goto out;
 	}
-	if (S_ISDIR(lower_inode->i_mode)) {
-		ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
+	if (S_ISDIR(lower_inode->i_mode))
 		goto out;
-	}
-	if (S_ISLNK(lower_inode->i_mode)) {
-		ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
+	if (S_ISLNK(lower_inode->i_mode))
 		goto out;
-	}
-	if (special_file(lower_inode->i_mode)) {
-		ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
+	if (special_file(lower_inode->i_mode))
 		goto out;
-	}
-	if (!nd) {
-		ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
-				"as we *think* we are about to unlink\n");
+	if (!ecryptfs_nd)
 		goto out;
-	}
 	/* Released in this function */
-	page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2,
-				      GFP_USER);
+	page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
 	if (!page_virt) {
+		printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
+		       __func__);
 		rc = -ENOMEM;
-		ecryptfs_printk(KERN_ERR,
-				"Cannot ecryptfs_kmalloc a page\n");
 		goto out;
 	}
-	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
-	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
-		ecryptfs_set_default_sizes(crypt_stat);
-	if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
-		rc = ecryptfs_init_persistent_file(dentry);
+	if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
+		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
 		if (rc) {
 			printk(KERN_ERR "%s: Error attempting to initialize "
 			       "the persistent file for the dentry with name "
 			       "[%s]; rc = [%d]\n", __func__,
-			       dentry->d_name.name, rc);
-			goto out;
+			       ecryptfs_dentry->d_name.name, rc);
+			goto out_free_kmem;
 		}
 	}
 	rc = ecryptfs_read_and_validate_header_region(page_virt,
-						      dentry->d_inode);
+						      ecryptfs_dentry->d_inode);
 	if (rc) {
-		rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry);
+		rc = ecryptfs_read_and_validate_xattr_region(page_virt,
+							     ecryptfs_dentry);
 		if (rc) {
-			printk(KERN_DEBUG "Valid metadata not found in header "
-			       "region or xattr region; treating file as "
-			       "unencrypted\n");
 			rc = 0;
-			kmem_cache_free(ecryptfs_header_cache_2, page_virt);
-			goto out;
+			goto out_free_kmem;
 		}
 		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
 	}
 	mount_crypt_stat = &ecryptfs_superblock_to_private(
-		dentry->d_sb)->mount_crypt_stat;
+		ecryptfs_dentry->d_sb)->mount_crypt_stat;
 	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
 		if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
 			file_size = (crypt_stat->num_header_bytes_at_front
@@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
 	} else {
 		file_size = get_unaligned_be64(page_virt);
 	}
-	i_size_write(dentry->d_inode, (loff_t)file_size);
+	i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size);
+out_free_kmem:
 	kmem_cache_free(ecryptfs_header_cache_2, page_virt);
 	goto out;
-
 out_dput:
 	dput(lower_dentry);
-	d_drop(dentry);
+	d_drop(ecryptfs_dentry);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_lookup
+ * @ecryptfs_dir_inode: The eCryptfs directory inode
+ * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
+ * @ecryptfs_nd: nameidata; may be NULL
+ *
+ * Find a file on disk. If the file does not exist, then we'll add it to the
+ * dentry cache and continue on to read it from the disk.
+ */
+static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
+				      struct dentry *ecryptfs_dentry,
+				      struct nameidata *ecryptfs_nd)
+{
+	char *encrypted_and_encoded_name = NULL;
+	size_t encrypted_and_encoded_name_size;
+	struct ecryptfs_crypt_stat *crypt_stat = NULL;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
+	struct ecryptfs_inode_info *inode_info;
+	struct dentry *lower_dir_dentry, *lower_dentry;
+	int rc = 0;
+
+	ecryptfs_dentry->d_op = &ecryptfs_dops;
+	if ((ecryptfs_dentry->d_name.len == 1
+	     && !strcmp(ecryptfs_dentry->d_name.name, "."))
+	    || (ecryptfs_dentry->d_name.len == 2
+		&& !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
+		goto out_d_drop;
+	}
+	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
+	lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
+				      lower_dir_dentry,
+				      ecryptfs_dentry->d_name.len);
+	if (IS_ERR(lower_dentry)) {
+		rc = PTR_ERR(lower_dentry);
+		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
+		       "lower_dentry = [%s]\n", __func__, rc,
+		       ecryptfs_dentry->d_name.name);
+		goto out_d_drop;
+	}
+	if (lower_dentry->d_inode)
+		goto lookup_and_interpose;
+	inode_info =  ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+	if (inode_info) {
+		crypt_stat = &inode_info->crypt_stat;
+		/* TODO: lock for crypt_stat comparison */
+		if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+			ecryptfs_set_default_sizes(crypt_stat);
+	}
+	if (crypt_stat)
+		mount_crypt_stat = crypt_stat->mount_crypt_stat;
+	else
+		mount_crypt_stat = &ecryptfs_superblock_to_private(
+			ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+	    && !(mount_crypt_stat && (mount_crypt_stat->flags
+				     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
+		goto lookup_and_interpose;
+	dput(lower_dentry);
+	rc = ecryptfs_encrypt_and_encode_filename(
+		&encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
+		crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name,
+		ecryptfs_dentry->d_name.len);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to encrypt and encode "
+		       "filename; rc = [%d]\n", __func__, rc);
+		goto out_d_drop;
+	}
+	lower_dentry = lookup_one_len(encrypted_and_encoded_name,
+				      lower_dir_dentry,
+				      encrypted_and_encoded_name_size - 1);
+	if (IS_ERR(lower_dentry)) {
+		rc = PTR_ERR(lower_dentry);
+		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
+		       "lower_dentry = [%s]\n", __func__, rc,
+		       encrypted_and_encoded_name);
+		goto out_d_drop;
+	}
+lookup_and_interpose:
+	rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
+						 crypt_stat, ecryptfs_dir_inode,
+						 ecryptfs_nd);
+	goto out;
+out_d_drop:
+	d_drop(ecryptfs_dentry);
 out:
+	kfree(encrypted_and_encoded_name);
 	return ERR_PTR(rc);
 }
 
@@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 	struct dentry *lower_dentry;
 	struct dentry *lower_dir_dentry;
 	char *encoded_symname;
-	int encoded_symlen;
-	struct ecryptfs_crypt_stat *crypt_stat = NULL;
+	size_t encoded_symlen;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	dget(lower_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname,
-						  strlen(symname),
-						  &encoded_symname);
-	if (encoded_symlen < 0) {
-		rc = encoded_symlen;
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+		dir->i_sb)->mount_crypt_stat;
+	rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
+						  &encoded_symlen,
+						  NULL,
+						  mount_crypt_stat, symname,
+						  strlen(symname));
+	if (rc)
 		goto out_lock;
-	}
 	rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
 			 encoded_symname);
 	kfree(encoded_symname);
@@ -602,53 +641,54 @@ out_lock:
 }
 
 static int
-ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
+ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 {
-	int rc;
-	struct dentry *lower_dentry;
-	char *decoded_name;
 	char *lower_buf;
-	mm_segment_t old_fs;
+	struct dentry *lower_dentry;
 	struct ecryptfs_crypt_stat *crypt_stat;
+	char *plaintext_name;
+	size_t plaintext_name_size;
+	mm_segment_t old_fs;
+	int rc;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	if (!lower_dentry->d_inode->i_op ||
-	    !lower_dentry->d_inode->i_op->readlink) {
+	if (!lower_dentry->d_inode->i_op->readlink) {
 		rc = -EINVAL;
 		goto out;
 	}
+	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
 	/* Released in this function */
 	lower_buf = kmalloc(bufsiz, GFP_KERNEL);
 	if (lower_buf == NULL) {
-		ecryptfs_printk(KERN_ERR, "Out of memory\n");
+		printk(KERN_ERR "%s: Out of memory whilst attempting to "
+		       "kmalloc [%d] bytes\n", __func__, bufsiz);
 		rc = -ENOMEM;
 		goto out;
 	}
 	old_fs = get_fs();
 	set_fs(get_ds());
-	ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
-			"lower_dentry->d_name.name = [%s]\n",
-			lower_dentry->d_name.name);
 	rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
 						   (char __user *)lower_buf,
 						   bufsiz);
 	set_fs(old_fs);
 	if (rc >= 0) {
-		crypt_stat = NULL;
-		rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc,
-					      &decoded_name);
-		if (rc == -ENOMEM)
+		rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
+							  &plaintext_name_size,
+							  dentry, lower_buf,
+							  rc);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to decode and "
+			       "decrypt filename; rc = [%d]\n", __func__,
+				rc);
 			goto out_free_lower_buf;
-		if (rc > 0) {
-			ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
-					"to userspace: [%*s]\n", rc,
-					decoded_name);
-			if (copy_to_user(buf, decoded_name, rc))
-				rc = -EFAULT;
 		}
-		kfree(decoded_name);
-		fsstack_copy_attr_atime(dentry->d_inode,
-					lower_dentry->d_inode);
+		rc = copy_to_user(buf, plaintext_name, plaintext_name_size);
+		if (rc)
+			rc = -EFAULT;
+		else
+			rc = plaintext_name_size;
+		kfree(plaintext_name);
+		fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
 	}
 out_free_lower_buf:
 	kfree(lower_buf);
@@ -670,8 +710,6 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	}
 	old_fs = get_fs();
 	set_fs(get_ds());
-	ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
-			"dentry->d_name.name = [%s]\n", dentry->d_name.name);
 	rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
 	set_fs(old_fs);
 	if (rc < 0)
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 0d713b691941..ff539420cc6f 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
 	/* verify that everything through the encrypted FEK size is present */
 	if (message_len < 4) {
 		rc = -EIO;
-		printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable "
+		printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable "
 		       "message length is [%d]\n", __func__, message_len, 4);
 		goto out;
 	}
@@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
 	i += data_len;
 	if (message_len < (i + key_rec->enc_key_size)) {
 		rc = -EIO;
-		printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n",
+		printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n",
 		       __func__, message_len, (i + key_rec->enc_key_size));
 		goto out;
 	}
 	if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
 		rc = -EIO;
-		printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than "
+		printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than "
 		       "the maximum key size [%d]\n", __func__,
 		       key_rec->enc_key_size,
 		       ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES);
@@ -403,6 +403,580 @@ out:
 }
 
 static int
+ecryptfs_find_global_auth_tok_for_sig(
+	struct ecryptfs_global_auth_tok **global_auth_tok,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
+{
+	struct ecryptfs_global_auth_tok *walker;
+	int rc = 0;
+
+	(*global_auth_tok) = NULL;
+	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	list_for_each_entry(walker,
+			    &mount_crypt_stat->global_auth_tok_list,
+			    mount_crypt_stat_list) {
+		if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
+			(*global_auth_tok) = walker;
+			goto out;
+		}
+	}
+	rc = -EINVAL;
+out:
+	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	return rc;
+}
+
+/**
+ * ecryptfs_find_auth_tok_for_sig
+ * @auth_tok: Set to the matching auth_tok; NULL if not found
+ * @crypt_stat: inode crypt_stat crypto context
+ * @sig: Sig of auth_tok to find
+ *
+ * For now, this function simply looks at the registered auth_tok's
+ * linked off the mount_crypt_stat, so all the auth_toks that can be
+ * used must be registered at mount time. This function could
+ * potentially try a lot harder to find auth_tok's (e.g., by calling
+ * out to ecryptfsd to dynamically retrieve an auth_tok object) so
+ * that static registration of auth_tok's will no longer be necessary.
+ *
+ * Returns zero on no error; non-zero on error
+ */
+static int
+ecryptfs_find_auth_tok_for_sig(
+	struct ecryptfs_auth_tok **auth_tok,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+	char *sig)
+{
+	struct ecryptfs_global_auth_tok *global_auth_tok;
+	int rc = 0;
+
+	(*auth_tok) = NULL;
+	if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
+						  mount_crypt_stat, sig)) {
+		struct key *auth_tok_key;
+
+		rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
+						       sig);
+	} else
+		(*auth_tok) = global_auth_tok->global_auth_tok;
+	return rc;
+}
+
+/**
+ * write_tag_70_packet can gobble a lot of stack space. We stuff most
+ * of the function's parameters in a kmalloc'd struct to help reduce
+ * eCryptfs' overall stack usage.
+ */
+struct ecryptfs_write_tag_70_packet_silly_stack {
+	u8 cipher_code;
+	size_t max_packet_size;
+	size_t packet_size_len;
+	size_t block_aligned_filename_size;
+	size_t block_size;
+	size_t i;
+	size_t j;
+	size_t num_rand_bytes;
+	struct mutex *tfm_mutex;
+	char *block_aligned_filename;
+	struct ecryptfs_auth_tok *auth_tok;
+	struct scatterlist src_sg;
+	struct scatterlist dst_sg;
+	struct blkcipher_desc desc;
+	char iv[ECRYPTFS_MAX_IV_BYTES];
+	char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
+	char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
+	struct hash_desc hash_desc;
+	struct scatterlist hash_sg;
+};
+
+/**
+ * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
+ * @filename: NULL-terminated filename string
+ *
+ * This is the simplest mechanism for achieving filename encryption in
+ * eCryptfs. It encrypts the given filename with the mount-wide
+ * filename encryption key (FNEK) and stores it in a packet to @dest,
+ * which the callee will encode and write directly into the dentry
+ * name.
+ */
+int
+ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *filename, size_t filename_size)
+{
+	struct ecryptfs_write_tag_70_packet_silly_stack *s;
+	int rc = 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
+		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+		goto out;
+	}
+	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	(*packet_size) = 0;
+	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
+		&s->desc.tfm,
+		&s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
+	if (unlikely(rc)) {
+		printk(KERN_ERR "Internal error whilst attempting to get "
+		       "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+		       mount_crypt_stat->global_default_fn_cipher_name, rc);
+		goto out;
+	}
+	mutex_lock(s->tfm_mutex);
+	s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
+	/* Plus one for the \0 separator between the random prefix
+	 * and the plaintext filename */
+	s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
+	s->block_aligned_filename_size = (s->num_rand_bytes + filename_size);
+	if ((s->block_aligned_filename_size % s->block_size) != 0) {
+		s->num_rand_bytes += (s->block_size
+				      - (s->block_aligned_filename_size
+					 % s->block_size));
+		s->block_aligned_filename_size = (s->num_rand_bytes
+						  + filename_size);
+	}
+	/* Octet 0: Tag 70 identifier
+	 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
+	 *              and block-aligned encrypted filename size)
+	 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
+	 * Octet N2-N3: Cipher identifier (1 octet)
+	 * Octets N3-N4: Block-aligned encrypted filename
+	 *  - Consists of a minimum number of random characters, a \0
+	 *    separator, and then the filename */
+	s->max_packet_size = (1                   /* Tag 70 identifier */
+			      + 3                 /* Max Tag 70 packet size */
+			      + ECRYPTFS_SIG_SIZE /* FNEK sig */
+			      + 1                 /* Cipher identifier */
+			      + s->block_aligned_filename_size);
+	if (dest == NULL) {
+		(*packet_size) = s->max_packet_size;
+		goto out_unlock;
+	}
+	if (s->max_packet_size > (*remaining_bytes)) {
+		printk(KERN_WARNING "%s: Require [%zd] bytes to write; only "
+		       "[%zd] available\n", __func__, s->max_packet_size,
+		       (*remaining_bytes));
+		rc = -EINVAL;
+		goto out_unlock;
+	}
+	s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
+					    GFP_KERNEL);
+	if (!s->block_aligned_filename) {
+		printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+		       "kzalloc [%zd] bytes\n", __func__,
+		       s->block_aligned_filename_size);
+		rc = -ENOMEM;
+		goto out_unlock;
+	}
+	s->i = 0;
+	dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
+	rc = ecryptfs_write_packet_length(&dest[s->i],
+					  (ECRYPTFS_SIG_SIZE
+					   + 1 /* Cipher code */
+					   + s->block_aligned_filename_size),
+					  &s->packet_size_len);
+	if (rc) {
+		printk(KERN_ERR "%s: Error generating tag 70 packet "
+		       "header; cannot generate packet length; rc = [%d]\n",
+		       __func__, rc);
+		goto out_free_unlock;
+	}
+	s->i += s->packet_size_len;
+	ecryptfs_from_hex(&dest[s->i],
+			  mount_crypt_stat->global_default_fnek_sig,
+			  ECRYPTFS_SIG_SIZE);
+	s->i += ECRYPTFS_SIG_SIZE;
+	s->cipher_code = ecryptfs_code_for_cipher_string(
+		mount_crypt_stat->global_default_fn_cipher_name,
+		mount_crypt_stat->global_default_fn_cipher_key_bytes);
+	if (s->cipher_code == 0) {
+		printk(KERN_WARNING "%s: Unable to generate code for "
+		       "cipher [%s] with key bytes [%zd]\n", __func__,
+		       mount_crypt_stat->global_default_fn_cipher_name,
+		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		rc = -EINVAL;
+		goto out_free_unlock;
+	}
+	dest[s->i++] = s->cipher_code;
+	rc = ecryptfs_find_auth_tok_for_sig(
+		&s->auth_tok, mount_crypt_stat,
+		mount_crypt_stat->global_default_fnek_sig);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to find auth tok for "
+		       "fnek sig [%s]; rc = [%d]\n", __func__,
+		       mount_crypt_stat->global_default_fnek_sig, rc);
+		goto out_free_unlock;
+	}
+	/* TODO: Support other key modules than passphrase for
+	 * filename encryption */
+	BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+	sg_init_one(
+		&s->hash_sg,
+		(u8 *)s->auth_tok->token.password.session_key_encryption_key,
+		s->auth_tok->token.password.session_key_encryption_key_bytes);
+	s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
+					     CRYPTO_ALG_ASYNC);
+	if (IS_ERR(s->hash_desc.tfm)) {
+			rc = PTR_ERR(s->hash_desc.tfm);
+			printk(KERN_ERR "%s: Error attempting to "
+			       "allocate hash crypto context; rc = [%d]\n",
+			       __func__, rc);
+			goto out_free_unlock;
+	}
+	rc = crypto_hash_init(&s->hash_desc);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error initializing crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out_release_free_unlock;
+	}
+	rc = crypto_hash_update(
+		&s->hash_desc, &s->hash_sg,
+		s->auth_tok->token.password.session_key_encryption_key_bytes);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error updating crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out_release_free_unlock;
+	}
+	rc = crypto_hash_final(&s->hash_desc, s->hash);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error finalizing crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out_release_free_unlock;
+	}
+	for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
+		s->block_aligned_filename[s->j] =
+			s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
+		if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
+		    == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
+			sg_init_one(&s->hash_sg, (u8 *)s->hash,
+				    ECRYPTFS_TAG_70_DIGEST_SIZE);
+			rc = crypto_hash_init(&s->hash_desc);
+			if (rc) {
+				printk(KERN_ERR
+				       "%s: Error initializing crypto hash; "
+				       "rc = [%d]\n", __func__, rc);
+				goto out_release_free_unlock;
+			}
+			rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
+						ECRYPTFS_TAG_70_DIGEST_SIZE);
+			if (rc) {
+				printk(KERN_ERR
+				       "%s: Error updating crypto hash; "
+				       "rc = [%d]\n", __func__, rc);
+				goto out_release_free_unlock;
+			}
+			rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
+			if (rc) {
+				printk(KERN_ERR
+				       "%s: Error finalizing crypto hash; "
+				       "rc = [%d]\n", __func__, rc);
+				goto out_release_free_unlock;
+			}
+			memcpy(s->hash, s->tmp_hash,
+			       ECRYPTFS_TAG_70_DIGEST_SIZE);
+		}
+		if (s->block_aligned_filename[s->j] == '\0')
+			s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
+	}
+	memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
+	       filename_size);
+	rc = virt_to_scatterlist(s->block_aligned_filename,
+				 s->block_aligned_filename_size, &s->src_sg, 1);
+	if (rc != 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert filename memory to scatterlist; "
+		       "expected rc = 1; got rc = [%d]. "
+		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+		       s->block_aligned_filename_size);
+		goto out_release_free_unlock;
+	}
+	rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
+				 &s->dst_sg, 1);
+	if (rc != 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert encrypted filename memory to scatterlist; "
+		       "expected rc = 1; got rc = [%d]. "
+		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+		       s->block_aligned_filename_size);
+		goto out_release_free_unlock;
+	}
+	/* The characters in the first block effectively do the job
+	 * of the IV here, so we just use 0's for the IV. Note the
+	 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
+	 * >= ECRYPTFS_MAX_IV_BYTES. */
+	memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
+	s->desc.info = s->iv;
+	rc = crypto_blkcipher_setkey(
+		s->desc.tfm,
+		s->auth_tok->token.password.session_key_encryption_key,
+		mount_crypt_stat->global_default_fn_cipher_key_bytes);
+	if (rc < 0) {
+		printk(KERN_ERR "%s: Error setting key for crypto context; "
+		       "rc = [%d]. s->auth_tok->token.password.session_key_"
+		       "encryption_key = [0x%p]; mount_crypt_stat->"
+		       "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
+		       rc,
+		       s->auth_tok->token.password.session_key_encryption_key,
+		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		goto out_release_free_unlock;
+	}
+	rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+					 s->block_aligned_filename_size);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to encrypt filename; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out_release_free_unlock;
+	}
+	s->i += s->block_aligned_filename_size;
+	(*packet_size) = s->i;
+	(*remaining_bytes) -= (*packet_size);
+out_release_free_unlock:
+	crypto_free_hash(s->hash_desc.tfm);
+out_free_unlock:
+	memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
+	kfree(s->block_aligned_filename);
+out_unlock:
+	mutex_unlock(s->tfm_mutex);
+out:
+	kfree(s);
+	return rc;
+}
+
+struct ecryptfs_parse_tag_70_packet_silly_stack {
+	u8 cipher_code;
+	size_t max_packet_size;
+	size_t packet_size_len;
+	size_t parsed_tag_70_packet_size;
+	size_t block_aligned_filename_size;
+	size_t block_size;
+	size_t i;
+	struct mutex *tfm_mutex;
+	char *decrypted_filename;
+	struct ecryptfs_auth_tok *auth_tok;
+	struct scatterlist src_sg;
+	struct scatterlist dst_sg;
+	struct blkcipher_desc desc;
+	char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
+	char iv[ECRYPTFS_MAX_IV_BYTES];
+	char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
+};
+
+/**
+ * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
+ * @filename: This function kmalloc's the memory for the filename
+ * @filename_size: This function sets this to the amount of memory
+ *                 kmalloc'd for the filename
+ * @packet_size: This function sets this to the the number of octets
+ *               in the packet parsed
+ * @mount_crypt_stat: The mount-wide cryptographic context
+ * @data: The memory location containing the start of the tag 70
+ *        packet
+ * @max_packet_size: The maximum legal size of the packet to be parsed
+ *                   from @data
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int
+ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *data, size_t max_packet_size)
+{
+	struct ecryptfs_parse_tag_70_packet_silly_stack *s;
+	int rc = 0;
+
+	(*packet_size) = 0;
+	(*filename_size) = 0;
+	(*filename) = NULL;
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
+		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+		goto out;
+	}
+	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
+		printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
+		       "at least [%d]\n", __func__, max_packet_size,
+			(1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
+		rc = -EINVAL;
+		goto out;
+	}
+	/* Octet 0: Tag 70 identifier
+	 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
+	 *              and block-aligned encrypted filename size)
+	 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
+	 * Octet N2-N3: Cipher identifier (1 octet)
+	 * Octets N3-N4: Block-aligned encrypted filename
+	 *  - Consists of a minimum number of random numbers, a \0
+	 *    separator, and then the filename */
+	if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) {
+		printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be "
+		       "tag [0x%.2x]\n", __func__,
+		       data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE);
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = ecryptfs_parse_packet_length(&data[(*packet_size)],
+					  &s->parsed_tag_70_packet_size,
+					  &s->packet_size_len);
+	if (rc) {
+		printk(KERN_WARNING "%s: Error parsing packet length; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out;
+	}
+	s->block_aligned_filename_size = (s->parsed_tag_70_packet_size
+					  - ECRYPTFS_SIG_SIZE - 1);
+	if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size)
+	    > max_packet_size) {
+		printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet "
+		       "size is [%zd]\n", __func__, max_packet_size,
+		       (1 + s->packet_size_len + 1
+			+ s->block_aligned_filename_size));
+		rc = -EINVAL;
+		goto out;
+	}
+	(*packet_size) += s->packet_size_len;
+	ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)],
+			ECRYPTFS_SIG_SIZE);
+	s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+	(*packet_size) += ECRYPTFS_SIG_SIZE;
+	s->cipher_code = data[(*packet_size)++];
+	rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code);
+	if (rc) {
+		printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n",
+		       __func__, s->cipher_code);
+		goto out;
+	}
+	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
+							&s->tfm_mutex,
+							s->cipher_string);
+	if (unlikely(rc)) {
+		printk(KERN_ERR "Internal error whilst attempting to get "
+		       "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+		       s->cipher_string, rc);
+		goto out;
+	}
+	mutex_lock(s->tfm_mutex);
+	rc = virt_to_scatterlist(&data[(*packet_size)],
+				 s->block_aligned_filename_size, &s->src_sg, 1);
+	if (rc != 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert encrypted filename memory to scatterlist; "
+		       "expected rc = 1; got rc = [%d]. "
+		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+		       s->block_aligned_filename_size);
+		goto out_unlock;
+	}
+	(*packet_size) += s->block_aligned_filename_size;
+	s->decrypted_filename = kmalloc(s->block_aligned_filename_size,
+					GFP_KERNEL);
+	if (!s->decrypted_filename) {
+		printk(KERN_ERR "%s: Out of memory whilst attempting to "
+		       "kmalloc [%zd] bytes\n", __func__,
+		       s->block_aligned_filename_size);
+		rc = -ENOMEM;
+		goto out_unlock;
+	}
+	rc = virt_to_scatterlist(s->decrypted_filename,
+				 s->block_aligned_filename_size, &s->dst_sg, 1);
+	if (rc != 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert decrypted filename memory to scatterlist; "
+		       "expected rc = 1; got rc = [%d]. "
+		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+		       s->block_aligned_filename_size);
+		goto out_free_unlock;
+	}
+	/* The characters in the first block effectively do the job of
+	 * the IV here, so we just use 0's for the IV. Note the
+	 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
+	 * >= ECRYPTFS_MAX_IV_BYTES. */
+	memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
+	s->desc.info = s->iv;
+	rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
+					    s->fnek_sig_hex);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to find auth tok for "
+		       "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
+		       rc);
+		goto out_free_unlock;
+	}
+	/* TODO: Support other key modules than passphrase for
+	 * filename encryption */
+	BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+	rc = crypto_blkcipher_setkey(
+		s->desc.tfm,
+		s->auth_tok->token.password.session_key_encryption_key,
+		mount_crypt_stat->global_default_fn_cipher_key_bytes);
+	if (rc < 0) {
+		printk(KERN_ERR "%s: Error setting key for crypto context; "
+		       "rc = [%d]. s->auth_tok->token.password.session_key_"
+		       "encryption_key = [0x%p]; mount_crypt_stat->"
+		       "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
+		       rc,
+		       s->auth_tok->token.password.session_key_encryption_key,
+		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		goto out_free_unlock;
+	}
+	rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+					 s->block_aligned_filename_size);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to decrypt filename; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out_free_unlock;
+	}
+	s->i = 0;
+	while (s->decrypted_filename[s->i] != '\0'
+	       && s->i < s->block_aligned_filename_size)
+		s->i++;
+	if (s->i == s->block_aligned_filename_size) {
+		printk(KERN_WARNING "%s: Invalid tag 70 packet; could not "
+		       "find valid separator between random characters and "
+		       "the filename\n", __func__);
+		rc = -EINVAL;
+		goto out_free_unlock;
+	}
+	s->i++;
+	(*filename_size) = (s->block_aligned_filename_size - s->i);
+	if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) {
+		printk(KERN_WARNING "%s: Filename size is [%zd], which is "
+		       "invalid\n", __func__, (*filename_size));
+		rc = -EINVAL;
+		goto out_free_unlock;
+	}
+	(*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
+	if (!(*filename)) {
+		printk(KERN_ERR "%s: Out of memory whilst attempting to "
+		       "kmalloc [%zd] bytes\n", __func__,
+		       ((*filename_size) + 1));
+		rc = -ENOMEM;
+		goto out_free_unlock;
+	}
+	memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size));
+	(*filename)[(*filename_size)] = '\0';
+out_free_unlock:
+	kfree(s->decrypted_filename);
+out_unlock:
+	mutex_unlock(s->tfm_mutex);
+out:
+	if (rc) {
+		(*packet_size) = 0;
+		(*filename_size) = 0;
+		(*filename) = NULL;
+	}
+	kfree(s);
+	return rc;
+}
+
+static int
 ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok)
 {
 	int rc = 0;
@@ -897,30 +1471,6 @@ out:
 	return rc;
 }
 
-static int
-ecryptfs_find_global_auth_tok_for_sig(
-	struct ecryptfs_global_auth_tok **global_auth_tok,
-	struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
-{
-	struct ecryptfs_global_auth_tok *walker;
-	int rc = 0;
-
-	(*global_auth_tok) = NULL;
-	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
-	list_for_each_entry(walker,
-			    &mount_crypt_stat->global_auth_tok_list,
-			    mount_crypt_stat_list) {
-		if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
-			(*global_auth_tok) = walker;
-			goto out;
-		}
-	}
-	rc = -EINVAL;
-out:
-	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
-	return rc;
-}
-
 /**
  * ecryptfs_verify_version
  * @version: The version number to confirm
@@ -990,43 +1540,6 @@ out:
 }
 
 /**
- * ecryptfs_find_auth_tok_for_sig
- * @auth_tok: Set to the matching auth_tok; NULL if not found
- * @crypt_stat: inode crypt_stat crypto context
- * @sig: Sig of auth_tok to find
- *
- * For now, this function simply looks at the registered auth_tok's
- * linked off the mount_crypt_stat, so all the auth_toks that can be
- * used must be registered at mount time. This function could
- * potentially try a lot harder to find auth_tok's (e.g., by calling
- * out to ecryptfsd to dynamically retrieve an auth_tok object) so
- * that static registration of auth_tok's will no longer be necessary.
- *
- * Returns zero on no error; non-zero on error
- */
-static int
-ecryptfs_find_auth_tok_for_sig(
-	struct ecryptfs_auth_tok **auth_tok,
-	struct ecryptfs_crypt_stat *crypt_stat, char *sig)
-{
-	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
-		crypt_stat->mount_crypt_stat;
-	struct ecryptfs_global_auth_tok *global_auth_tok;
-	int rc = 0;
-
-	(*auth_tok) = NULL;
-	if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
-						  mount_crypt_stat, sig)) {
-		struct key *auth_tok_key;
-
-		rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
-						       sig);
-	} else
-		(*auth_tok) = global_auth_tok->global_auth_tok;
-	return rc;
-}
-
-/**
  * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok.
  * @auth_tok: The passphrase authentication token to use to encrypt the FEK
  * @crypt_stat: The cryptographic context
@@ -1256,7 +1769,8 @@ find_next_matching_auth_tok:
 			rc = -EINVAL;
 			goto out_wipe_list;
 		}
-		ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat,
+		ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
+					       crypt_stat->mount_crypt_stat,
 					       candidate_auth_tok_sig);
 		if (matching_auth_tok) {
 			found_auth_tok = 1;
@@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
 	int rc;
 
 	rc = write_tag_66_packet(auth_tok->token.private_key.signature,
-				 ecryptfs_code_for_cipher_string(crypt_stat),
+				 ecryptfs_code_for_cipher_string(
+					 crypt_stat->cipher,
+					 crypt_stat->key_size),
 				 crypt_stat, &payload, &payload_len);
 	if (rc) {
 		ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
@@ -1696,7 +2212,8 @@ encrypted_session_key_set:
 	dest[(*packet_size)++] = 0x04; /* version 4 */
 	/* TODO: Break from RFC2440 so that arbitrary ciphers can be
 	 * specified with strings */
-	cipher_code = ecryptfs_code_for_cipher_string(crypt_stat);
+	cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher,
+						      crypt_stat->key_size);
 	if (cipher_code == 0) {
 		ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
 				"cipher [%s]\n", crypt_stat->cipher);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fd630713c5c7..789cf2e1be1e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
        ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
        ecryptfs_opt_ecryptfs_key_bytes,
        ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
-       ecryptfs_opt_encrypted_view, ecryptfs_opt_err };
+       ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
+       ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
+       ecryptfs_opt_err };
 
 static const match_table_t tokens = {
 	{ecryptfs_opt_sig, "sig=%s"},
@@ -217,6 +219,9 @@ static const match_table_t tokens = {
 	{ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
 	{ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"},
 	{ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"},
+	{ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
+	{ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
+	{ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
 	{ecryptfs_opt_err, NULL}
 };
 
@@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 	int rc = 0;
 	int sig_set = 0;
 	int cipher_name_set = 0;
+	int fn_cipher_name_set = 0;
 	int cipher_key_bytes;
 	int cipher_key_bytes_set = 0;
+	int fn_cipher_key_bytes;
+	int fn_cipher_key_bytes_set = 0;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
 		&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
 	substring_t args[MAX_OPT_ARGS];
@@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 	char *sig_src;
 	char *cipher_name_dst;
 	char *cipher_name_src;
+	char *fn_cipher_name_dst;
+	char *fn_cipher_name_src;
+	char *fnek_dst;
+	char *fnek_src;
 	char *cipher_key_bytes_src;
+	char *fn_cipher_key_bytes_src;
 
 	if (!options) {
 		rc = -EINVAL;
@@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 				global_default_cipher_name;
 			strncpy(cipher_name_dst, cipher_name_src,
 				ECRYPTFS_MAX_CIPHER_NAME_SIZE);
-			ecryptfs_printk(KERN_DEBUG,
-					"The mount_crypt_stat "
-					"global_default_cipher_name set to: "
-					"[%s]\n", cipher_name_dst);
+			cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
 			cipher_name_set = 1;
 			break;
 		case ecryptfs_opt_ecryptfs_key_bytes:
@@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 						   &cipher_key_bytes_src, 0);
 			mount_crypt_stat->global_default_cipher_key_size =
 				cipher_key_bytes;
-			ecryptfs_printk(KERN_DEBUG,
-					"The mount_crypt_stat "
-					"global_default_cipher_key_size "
-					"set to: [%d]\n", mount_crypt_stat->
-					global_default_cipher_key_size);
 			cipher_key_bytes_set = 1;
 			break;
 		case ecryptfs_opt_passthrough:
@@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 			mount_crypt_stat->flags |=
 				ECRYPTFS_ENCRYPTED_VIEW_ENABLED;
 			break;
+		case ecryptfs_opt_fnek_sig:
+			fnek_src = args[0].from;
+			fnek_dst =
+				mount_crypt_stat->global_default_fnek_sig;
+			strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX);
+			mount_crypt_stat->global_default_fnek_sig[
+				ECRYPTFS_SIG_SIZE_HEX] = '\0';
+			rc = ecryptfs_add_global_auth_tok(
+				mount_crypt_stat,
+				mount_crypt_stat->global_default_fnek_sig);
+			if (rc) {
+				printk(KERN_ERR "Error attempting to register "
+				       "global fnek sig [%s]; rc = [%d]\n",
+				       mount_crypt_stat->global_default_fnek_sig,
+				       rc);
+				goto out;
+			}
+			mount_crypt_stat->flags |=
+				(ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES
+				 | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK);
+			break;
+		case ecryptfs_opt_fn_cipher:
+			fn_cipher_name_src = args[0].from;
+			fn_cipher_name_dst =
+				mount_crypt_stat->global_default_fn_cipher_name;
+			strncpy(fn_cipher_name_dst, fn_cipher_name_src,
+				ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+			mount_crypt_stat->global_default_fn_cipher_name[
+				ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
+			fn_cipher_name_set = 1;
+			break;
+		case ecryptfs_opt_fn_cipher_key_bytes:
+			fn_cipher_key_bytes_src = args[0].from;
+			fn_cipher_key_bytes =
+				(int)simple_strtol(fn_cipher_key_bytes_src,
+						   &fn_cipher_key_bytes_src, 0);
+			mount_crypt_stat->global_default_fn_cipher_key_bytes =
+				fn_cipher_key_bytes;
+			fn_cipher_key_bytes_set = 1;
+			break;
 		case ecryptfs_opt_err:
 		default:
-			ecryptfs_printk(KERN_WARNING,
-					"eCryptfs: unrecognized option '%s'\n",
-					p);
+			printk(KERN_WARNING
+			       "%s: eCryptfs: unrecognized option [%s]\n",
+			       __func__, p);
 		}
 	}
 	if (!sig_set) {
@@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 		int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
 
 		BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
-
 		strcpy(mount_crypt_stat->global_default_cipher_name,
 		       ECRYPTFS_DEFAULT_CIPHER);
 	}
-	if (!cipher_key_bytes_set) {
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !fn_cipher_name_set)
+		strcpy(mount_crypt_stat->global_default_fn_cipher_name,
+		       mount_crypt_stat->global_default_cipher_name);
+	if (!cipher_key_bytes_set)
 		mount_crypt_stat->global_default_cipher_key_size = 0;
-	}
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !fn_cipher_key_bytes_set)
+		mount_crypt_stat->global_default_fn_cipher_key_bytes =
+			mount_crypt_stat->global_default_cipher_key_size;
 	mutex_lock(&key_tfm_list_mutex);
 	if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
-				 NULL))
+				 NULL)) {
 		rc = ecryptfs_add_new_key_tfm(
 			NULL, mount_crypt_stat->global_default_cipher_name,
 			mount_crypt_stat->global_default_cipher_key_size);
-	mutex_unlock(&key_tfm_list_mutex);
-	if (rc) {
-		printk(KERN_ERR "Error attempting to initialize cipher with "
-		       "name = [%s] and key size = [%td]; rc = [%d]\n",
-		       mount_crypt_stat->global_default_cipher_name,
-		       mount_crypt_stat->global_default_cipher_key_size, rc);
-		rc = -EINVAL;
-		goto out;
+		if (rc) {
+			printk(KERN_ERR "Error attempting to initialize "
+			       "cipher with name = [%s] and key size = [%td]; "
+			       "rc = [%d]\n",
+			       mount_crypt_stat->global_default_cipher_name,
+			       mount_crypt_stat->global_default_cipher_key_size,
+			       rc);
+			rc = -EINVAL;
+			mutex_unlock(&key_tfm_list_mutex);
+			goto out;
+		}
 	}
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !ecryptfs_tfm_exists(
+		    mount_crypt_stat->global_default_fn_cipher_name, NULL)) {
+		rc = ecryptfs_add_new_key_tfm(
+			NULL, mount_crypt_stat->global_default_fn_cipher_name,
+			mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		if (rc) {
+			printk(KERN_ERR "Error attempting to initialize "
+			       "cipher with name = [%s] and key size = [%td]; "
+			       "rc = [%d]\n",
+			       mount_crypt_stat->global_default_fn_cipher_name,
+			       mount_crypt_stat->global_default_fn_cipher_key_bytes,
+			       rc);
+			rc = -EINVAL;
+			mutex_unlock(&key_tfm_list_mutex);
+			goto out;
+		}
+	}
+	mutex_unlock(&key_tfm_list_mutex);
 	rc = ecryptfs_init_global_auth_toks(mount_crypt_stat);
-	if (rc) {
+	if (rc)
 		printk(KERN_WARNING "One or more global auth toks could not "
 		       "properly register; rc = [%d]\n", rc);
-	}
 out:
 	return rc;
 }
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 6913f727624d..96ef51489e01 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
 	(*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL);
 	if (!(*daemon)) {
 		rc = -ENOMEM;
-		printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of "
+		printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
 		       "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
 		goto out;
 	}
@@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
 	msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
 	if (!msg_ctx->msg) {
 		rc = -ENOMEM;
-		printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of "
+		printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
 		       "GFP_KERNEL memory\n", __func__, msg_size);
 		goto unlock;
 	}
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index efd95a0ed1ea..a67fea655f49 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
 		if (!msg_ctx->msg) {
 			rc = -ENOMEM;
 			printk(KERN_ERR "%s: Out of memory whilst attempting "
-			       "to kmalloc(%Zd, GFP_KERNEL)\n", __func__,
+			       "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
 			       (sizeof(*msg_ctx->msg) + data_size));
 			goto out_unlock;
 		}
@@ -322,7 +322,7 @@ check_list:
 	if (count < total_length) {
 		rc = 0;
 		printk(KERN_WARNING "%s: Only given user buffer of "
-		       "size [%Zd], but we need [%Zd] to read the "
+		       "size [%zd], but we need [%zd] to read the "
 		       "pending message\n", __func__, count, total_length);
 		goto out_unlock_msg_ctx;
 	}
@@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
 
 	if ((sizeof(*msg) + msg->data_len) != data_size) {
 		printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = "
-		       "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__,
+		       "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__,
 		       (sizeof(*msg) + msg->data_len), data_size);
 		rc = -EINVAL;
 		goto out;
@@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 	data = kmalloc(count, GFP_KERNEL);
 	if (!data) {
 		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count);
+		       "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
 		goto out;
 	}
 	rc = copy_from_user(data, buf, count);
@@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 	case ECRYPTFS_MSG_RESPONSE:
 		if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
 			printk(KERN_WARNING "%s: Minimum acceptable packet "
-			       "size is [%Zd], but amount of data written is "
-			       "only [%Zd]. Discarding response packet.\n",
+			       "size is [%zd], but amount of data written is "
+			       "only [%zd]. Discarding response packet.\n",
 			       __func__,
 			       (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
 			       count);
@@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 		}
 		i += packet_size_length;
 		if ((1 + 4 + packet_size_length + packet_size) != count) {
-			printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])"
-			       " + packet_size([%Zd]))([%Zd]) != "
-			       "count([%Zd]). Invalid packet format.\n",
+			printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
+			       " + packet_size([%zd]))([%zd]) != "
+			       "count([%zd]). Invalid packet format.\n",
 			       __func__, packet_size_length, packet_size,
 			       (1 + packet_size_length + packet_size), count);
 			goto out_free;
diff --git a/fs/exec.c b/fs/exec.c
index 3ef9cf9b1871..71a6efe5d8bd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -51,6 +51,7 @@
 #include <linux/audit.h>
 #include <linux/tracehook.h>
 #include <linux/kmod.h>
+#include <linux/fsnotify.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -132,6 +133,8 @@ asmlinkage long sys_uselib(const char __user * library)
 	if (IS_ERR(file))
 		goto out;
 
+	fsnotify_open(file->f_path.dentry);
+
 	error = -ENOEXEC;
 	if(file->f_op) {
 		struct linux_binfmt * fmt;
@@ -229,13 +232,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 
 static int __bprm_mm_init(struct linux_binprm *bprm)
 {
-	int err = -ENOMEM;
+	int err;
 	struct vm_area_struct *vma = NULL;
 	struct mm_struct *mm = bprm->mm;
 
 	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (!vma)
-		goto err;
+		return -ENOMEM;
 
 	down_write(&mm->mmap_sem);
 	vma->vm_mm = mm;
@@ -248,28 +251,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	 */
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-
 	vma->vm_flags = VM_STACK_FLAGS;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	err = insert_vm_struct(mm, vma);
-	if (err) {
-		up_write(&mm->mmap_sem);
+	if (err)
 		goto err;
-	}
 
 	mm->stack_vm = mm->total_vm = 1;
 	up_write(&mm->mmap_sem);
-
 	bprm->p = vma->vm_end - sizeof(void *);
-
 	return 0;
-
 err:
-	if (vma) {
-		bprm->vma = NULL;
-		kmem_cache_free(vm_area_cachep, vma);
-	}
-
+	up_write(&mm->mmap_sem);
+	bprm->vma = NULL;
+	kmem_cache_free(vm_area_cachep, vma);
 	return err;
 }
 
@@ -684,6 +679,8 @@ struct file *open_exec(const char *name)
 	if (IS_ERR(file))
 		return file;
 
+	fsnotify_open(file->f_path.dentry);
+
 	err = deny_write_access(file);
 	if (err) {
 		fput(file);
@@ -1689,7 +1686,7 @@ int get_dumpable(struct mm_struct *mm)
 	return (ret >= 2) ? 2 : ret;
 }
 
-int do_coredump(long signr, int exit_code, struct pt_regs * regs)
+void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
 	struct core_state core_state;
 	char corename[CORENAME_MAX_SIZE + 1];
@@ -1773,6 +1770,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 
  	if (ispipe) {
 		helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+		if (!helper_argv) {
+			printk(KERN_WARNING "%s failed to allocate memory\n",
+			       __func__);
+			goto fail_unlock;
+		}
 		/* Terminate the string before the first option */
 		delimit = strchr(corename, ' ');
 		if (delimit)
@@ -1840,5 +1842,5 @@ fail_unlock:
 	put_cred(cred);
 	coredump_finish(mm);
 fail:
-	return retval;
+	return;
 }
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c454d5db28a5..66321a877e74 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,12 +565,8 @@ got:
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	memset(ei->i_data, 0, sizeof(ei->i_data));
-	ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL;
-	if (S_ISLNK(mode))
-		ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
-	/* dirsync is only applied to directories */
-	if (!S_ISDIR(mode))
-		ei->i_flags &= ~EXT2_DIRSYNC_FL;
+	ei->i_flags =
+		ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
 	ei->i_faddr = 0;
 	ei->i_frag_no = 0;
 	ei->i_frag_size = 0;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 02b39a5deb74..23fff2f87783 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -498,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode,
  * ext2_splice_branch - splice the allocated branch onto inode.
  * @inode: owner
  * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- *	ext2_alloc_branch)
  * @where: location of missing link
  * @num:   number of indirect blocks we are adding
  * @blks:  number of direct blocks we are adding
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index de876fa793e1..7cb4badef927 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			goto setflags_out;
 		}
 
-		if (!S_ISDIR(inode->i_mode))
-			flags &= ~EXT2_DIRSYNC_FL;
+		flags = ext2_mask_flags(inode->i_mode, flags);
 
 		mutex_lock(&inode->i_mutex);
 		/* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac87..da8bdeaa2e6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb)
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	brelse (sbi->s_sbh);
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 
 	return;
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (!sbi->s_blockgroup_lock) {
+		kfree(sbi);
+		return -ENOMEM;
+	}
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
 
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	bgl_lock_init(&sbi->s_blockgroup_lock);
+	bgl_lock_init(sbi->s_blockgroup_lock);
 	sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
 	if (!sbi->s_debts) {
 		printk ("EXT2-fs: not enough memory\n");
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index c30e149fbd2e..7d215b4d4f2e 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 
 
 /* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
 {
-	__u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const unsigned char *ucp = (const unsigned char *) name;
+
+	while (len--) {
+		hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
+		hash1 = hash0;
+		hash0 = hash;
+	}
+	return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const signed char *scp = (const signed char *) name;
+
 	while (len--) {
-		__u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+		hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
 
-		if (hash & 0x80000000) hash -= 0x7fffffff;
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
 		hash1 = hash0;
 		hash0 = hash;
 	}
-	return (hash0 << 1);
+	return hash0 << 1;
 }
 
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
 {
 	__u32	pad, val;
 	int	i;
+	const signed char *scp = (const signed char *) msg;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = ((int) scp[i]) + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+	const unsigned char *ucp = (const unsigned char *) msg;
 
 	pad = (__u32)len | ((__u32)len << 8);
 	pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
 	for (i=0; i < len; i++) {
 		if ((i % 4) == 0)
 			val = pad;
-		val = msg[i] + (val << 8);
+		val = ((int) ucp[i]) + (val << 8);
 		if ((i % 4) == 3) {
 			*buf++ = val;
 			val = pad;
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 	const char	*p;
 	int		i;
 	__u32		in[8], buf[4];
+	void		(*str2hashbuf)(const char *, int, __u32 *, int) =
+				str2hashbuf_signed;
 
 	/* Initialize the default seed for the hash checksum functions */
 	buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 	}
 
 	switch (hinfo->hash_version) {
+	case DX_HASH_LEGACY_UNSIGNED:
+		hash = dx_hack_hash_unsigned(name, len);
+		break;
 	case DX_HASH_LEGACY:
-		hash = dx_hack_hash(name, len);
+		hash = dx_hack_hash_signed(name, len);
 		break;
+	case DX_HASH_HALF_MD4_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
 	case DX_HASH_HALF_MD4:
 		p = name;
 		while (len > 0) {
-			str2hashbuf(p, len, in, 8);
+			(*str2hashbuf)(p, len, in, 8);
 			half_md4_transform(buf, in);
 			len -= 32;
 			p += 32;
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 		minor_hash = buf[2];
 		hash = buf[1];
 		break;
+	case DX_HASH_TEA_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
 	case DX_HASH_TEA:
 		p = name;
 		while (len > 0) {
-			str2hashbuf(p, len, in, 4);
+			(*str2hashbuf)(p, len, in, 4);
 			TEA_transform(buf, in);
 			len -= 16;
 			p += 16;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 5655fbcbd11f..8de6c720e510 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,12 +559,8 @@ got:
 	ei->i_dir_start_lookup = 0;
 	ei->i_disksize = 0;
 
-	ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
-	if (S_ISLNK(mode))
-		ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
-	/* dirsync only applies to directories */
-	if (!S_ISDIR(mode))
-		ei->i_flags &= ~EXT3_DIRSYNC_FL;
+	ei->i_flags =
+		ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
 #ifdef EXT3_FRAGMENTS
 	ei->i_faddr = 0;
 	ei->i_frag_no = 0;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index b7394d05ee8e..5e86ce9a86e0 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 			goto flags_out;
 		}
 
-		if (!S_ISDIR(inode->i_mode))
-			flags &= ~EXT3_DIRSYNC_FL;
+		flags = ext3_mask_flags(inode->i_mode, flags);
 
 		mutex_lock(&inode->i_mutex);
 		/* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 1dd2abe6313e..69a3d19ca9fd 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle,
 #define assert(test) J_ASSERT(test)
 #endif
 
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
-
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
@@ -368,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir,
 		goto fail;
 	}
 	hinfo->hash_version = root->info.hash_version;
+	if (hinfo->hash_version <= DX_HASH_TEA)
+		hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
 	if (entry)
 		ext3fs_dirhash(entry->name, entry->len, hinfo);
@@ -636,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	dir = dir_file->f_path.dentry->d_inode;
 	if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
 		hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+		if (hinfo.hash_version <= DX_HASH_TEA)
+			hinfo.hash_version +=
+				EXT3_SB(dir->i_sb)->s_hash_unsigned;
 		hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
 		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
 					       start_hash, start_minor_hash);
@@ -1156,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	u32 hash2;
 	struct dx_map_entry *map;
 	char *data1 = (*bh)->b_data, *data2;
-	unsigned split, move, size, i;
+	unsigned split, move, size;
 	struct ext3_dir_entry_2 *de = NULL, *de2;
-	int	err = 0;
+	int	err = 0, i;
 
 	bh2 = ext3_append (handle, dir, &newblock, &err);
 	if (!(bh2)) {
@@ -1398,6 +1399,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 
 	/* Initialize as for dx_probe */
 	hinfo.hash_version = root->info.hash_version;
+	if (hinfo.hash_version <= DX_HASH_TEA)
+		hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
 	ext3fs_dirhash(name, namelen, &hinfo);
 	frame = frames;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f6c94f232ec1..5d047a030a73 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb)
 		ext3_blkdev_remove(sbi);
 	}
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	return;
 }
@@ -682,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
 				    ext3_nfs_get_inode);
 }
 
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device.  Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+				 gfp_t wait)
+{
+	journal_t *journal = EXT3_SB(sb)->s_journal;
+
+	WARN_ON(PageChecked(page));
+	if (!page_has_buffers(page))
+		return 0;
+	if (journal)
+		return journal_try_to_free_buffers(journal, page, 
+						   wait & ~__GFP_WAIT);
+	return try_to_free_buffers(page);
+}
+
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -713,7 +734,9 @@ static struct dquot_operations ext3_quota_operations = {
 	.acquire_dquot	= ext3_acquire_dquot,
 	.release_dquot	= ext3_release_dquot,
 	.mark_dirty	= ext3_mark_dquot_dirty,
-	.write_info	= ext3_write_info
+	.write_info	= ext3_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 
 static struct quotactl_ops ext3_qctl_operations = {
@@ -746,6 +769,7 @@ static const struct super_operations ext3_sops = {
 	.quota_read	= ext3_quota_read,
 	.quota_write	= ext3_quota_write,
 #endif
+	.bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
 static const struct export_operations ext3_export_ops = {
@@ -1035,8 +1059,7 @@ static int parse_options (char *options, struct super_block *sb,
 		case Opt_grpjquota:
 			qtype = GRPQUOTA;
 set_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
 					"EXT3-fs: Cannot change journaled "
@@ -1075,8 +1098,7 @@ set_qf_name:
 		case Opt_offgrpjquota:
 			qtype = GRPQUOTA;
 clear_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR "EXT3-fs: Cannot change "
 					"journaled quota options when "
@@ -1095,8 +1117,7 @@ clear_qf_name:
 		case Opt_jqfmt_vfsv0:
 			qfmt = QFMT_VFS_V0;
 set_qf_format:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
 				printk(KERN_ERR "EXT3-fs: Cannot change "
 					"journaled quota options when "
@@ -1115,8 +1136,7 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 		case Opt_noquota:
-			if (sb_any_quota_enabled(sb) ||
-			    sb_any_quota_suspended(sb)) {
+			if (sb_any_quota_loaded(sb)) {
 				printk(KERN_ERR "EXT3-fs: Cannot change quota "
 					"options when quota turned on.\n");
 				return 0;
@@ -1548,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (!sbi->s_blockgroup_lock) {
+		kfree(sbi);
+		return -ENOMEM;
+	}
 	sb->s_fs_info = sbi;
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT3_DEF_RESUID;
@@ -1744,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	for (i=0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
+	i = le32_to_cpu(es->s_flags);
+	if (i & EXT2_FLAGS_UNSIGNED_HASH)
+		sbi->s_hash_unsigned = 3;
+	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+		sbi->s_hash_unsigned = 3;
+#else
+		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+		sb->s_dirt = 1;
+	}
 
 	if (sbi->s_blocks_per_group > blocksize * 8) {
 		printk (KERN_ERR
@@ -1788,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	bgl_lock_init(&sbi->s_blockgroup_lock);
+	bgl_lock_init(sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logic_sb_block, i);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38b3acf5683b..6bba06b09dd1 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "group.h"
+#include "mballoc.h"
 
 /*
  * balloc.c contains the blocks allocation and deallocation routines
@@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * essentially implementing a per-group read-only flag. */
 		if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
 			ext4_error(sb, __func__,
-				  "Checksum bad for group %lu\n", block_group);
-			gdp->bg_free_blocks_count = 0;
-			gdp->bg_free_inodes_count = 0;
-			gdp->bg_itable_unused = 0;
+				  "Checksum bad for group %u", block_group);
+			ext4_free_blks_set(sb, gdp, 0);
+			ext4_free_inodes_set(sb, gdp, 0);
+			ext4_itable_unused_set(sb, gdp, 0);
 			memset(bh->b_data, 0xff, sb->s_blocksize);
 			return 0;
 		}
@@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 					     ext4_group_t block_group,
 					     struct buffer_head **bh)
 {
-	unsigned long group_desc;
-	unsigned long offset;
+	unsigned int group_desc;
+	unsigned int offset;
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= sbi->s_groups_count) {
 		ext4_error(sb, "ext4_get_group_desc",
 			   "block_group >= groups_count - "
-			   "block_group = %lu, groups_count = %lu",
+			   "block_group = %u, groups_count = %u",
 			   block_group, sbi->s_groups_count);
 
 		return NULL;
@@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	if (!sbi->s_group_desc[group_desc]) {
 		ext4_error(sb, "ext4_get_group_desc",
 			   "Group descriptor not loaded - "
-			   "block_group = %lu, group_desc = %lu, desc = %lu",
+			   "block_group = %u, group_desc = %u, desc = %u",
 			   block_group, group_desc, offset);
 		return NULL;
 	}
@@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	if (unlikely(!bh)) {
 		ext4_error(sb, __func__,
 			    "Cannot read block bitmap - "
-			    "block_group = %lu, block_bitmap = %llu",
+			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (buffer_uptodate(bh) &&
-	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+
+	if (bitmap_uptodate(bh))
 		return bh;
 
 	lock_buffer(bh);
+	if (bitmap_uptodate(bh)) {
+		unlock_buffer(bh);
+		return bh;
+	}
 	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
+		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
 		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		unlock_buffer(bh);
 		return bh;
 	}
 	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	if (buffer_uptodate(bh)) {
+		/*
+		 * if not uninit if bh is uptodate,
+		 * bitmap is also uptodate
+		 */
+		set_bitmap_uptodate(bh);
+		unlock_buffer(bh);
+		return bh;
+	}
+	/*
+	 * submit the buffer_head for read. We can
+	 * safely mark the bitmap as uptodate now.
+	 * We do it here so the bitmap uptodate bit
+	 * get set with buffer lock held.
+	 */
+	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
 		ext4_error(sb, __func__,
 			    "Cannot read block bitmap - "
-			    "block_group = %lu, block_bitmap = %llu",
+			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
 	}
@@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_free_blocks_sb() -- Free given blocks and update quota
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
  * @handle:			handle to this transaction
  * @sb:				super block
- * @block:			start physcial block to free
+ * @block:			start physcial block to add to the block group
  * @count:			number of blocks to free
- * @pdquot_freed_blocks:	pointer to quota
  *
- * XXX This function is only used by the on-line resizing code, which
- * should probably be fixed up to call the mballoc variant.  There
- * this needs to be cleaned up later; in fact, I'm not convinced this
- * is 100% correct in the face of the mballoc code.  The online resizing
- * code needs to be fixed up to more tightly (and correctly) interlock
- * with the mballoc code.
+ * This marks the blocks as free in the bitmap. We ask the
+ * mballoc to reload the buddy after this by setting group
+ * EXT4_GROUP_INFO_NEED_INIT_BIT flag
  */
-void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-			 ext4_fsblk_t block, unsigned long count,
-			 unsigned long *pdquot_freed_blocks)
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+			 ext4_fsblk_t block, unsigned long count)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gd_bh;
 	ext4_group_t block_group;
 	ext4_grpblk_t bit;
-	unsigned long i;
-	unsigned long overflow;
+	unsigned int i;
 	struct ext4_group_desc *desc;
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
-	int err = 0, ret;
-	ext4_grpblk_t group_freed;
+	int err = 0, ret, blk_free_count;
+	ext4_grpblk_t blocks_freed;
+	struct ext4_group_info *grp;
 
-	*pdquot_freed_blocks = 0;
 	sbi = EXT4_SB(sb);
 	es = sbi->s_es;
-	if (block < le32_to_cpu(es->s_first_data_block) ||
-	    block + count < block ||
-	    block + count > ext4_blocks_count(es)) {
-		ext4_error(sb, "ext4_free_blocks",
-			   "Freeing blocks not in datazone - "
-			   "block = %llu, count = %lu", block, count);
-		goto error_return;
-	}
-
-	ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
+	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
 
-do_more:
-	overflow = 0;
 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+	grp = ext4_get_group_info(sb, block_group);
 	/*
 	 * Check to see if we are freeing blocks across a group
 	 * boundary.
 	 */
 	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-		overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
-		count -= overflow;
+		goto error_return;
 	}
-	brelse(bitmap_bh);
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
@@ -418,18 +422,17 @@ do_more:
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error(sb, "ext4_free_blocks",
-			   "Freeing blocks in system zones - "
+		ext4_error(sb, __func__,
+			   "Adding blocks in system zones - "
 			   "Block = %llu, count = %lu",
 			   block, count);
 		goto error_return;
 	}
 
 	/*
-	 * We are about to start releasing blocks in the bitmap,
+	 * We are about to add blocks to the bitmap,
 	 * so we need undo access.
 	 */
-	/* @@@ check errors */
 	BUFFER_TRACE(bitmap_bh, "getting undo access");
 	err = ext4_journal_get_undo_access(handle, bitmap_bh);
 	if (err)
@@ -444,107 +447,55 @@ do_more:
 	err = ext4_journal_get_write_access(handle, gd_bh);
 	if (err)
 		goto error_return;
-
-	jbd_lock_bh_state(bitmap_bh);
-
-	for (i = 0, group_freed = 0; i < count; i++) {
-		/*
-		 * An HJ special.  This is expensive...
-		 */
-#ifdef CONFIG_JBD2_DEBUG
-		jbd_unlock_bh_state(bitmap_bh);
-		{
-			struct buffer_head *debug_bh;
-			debug_bh = sb_find_get_block(sb, block + i);
-			if (debug_bh) {
-				BUFFER_TRACE(debug_bh, "Deleted!");
-				if (!bh2jh(bitmap_bh)->b_committed_data)
-					BUFFER_TRACE(debug_bh,
-						"No commited data in bitmap");
-				BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
-				__brelse(debug_bh);
-			}
-		}
-		jbd_lock_bh_state(bitmap_bh);
-#endif
-		if (need_resched()) {
-			jbd_unlock_bh_state(bitmap_bh);
-			cond_resched();
-			jbd_lock_bh_state(bitmap_bh);
-		}
-		/* @@@ This prevents newly-allocated data from being
-		 * freed and then reallocated within the same
-		 * transaction.
-		 *
-		 * Ideally we would want to allow that to happen, but to
-		 * do so requires making jbd2_journal_forget() capable of
-		 * revoking the queued write of a data block, which
-		 * implies blocking on the journal lock.  *forget()
-		 * cannot block due to truncate races.
-		 *
-		 * Eventually we can fix this by making jbd2_journal_forget()
-		 * return a status indicating whether or not it was able
-		 * to revoke the buffer.  On successful revoke, it is
-		 * safe not to set the allocation bit in the committed
-		 * bitmap, because we know that there is no outstanding
-		 * activity on the buffer any more and so it is safe to
-		 * reallocate it.
-		 */
-		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
-		J_ASSERT_BH(bitmap_bh,
-				bh2jh(bitmap_bh)->b_committed_data != NULL);
-		ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
-				bh2jh(bitmap_bh)->b_committed_data);
-
-		/*
-		 * We clear the bit in the bitmap after setting the committed
-		 * data bit, because this is the reverse order to that which
-		 * the allocator uses.
-		 */
+	/*
+	 * make sure we don't allow a parallel init on other groups in the
+	 * same buddy cache
+	 */
+	down_write(&grp->alloc_sem);
+	for (i = 0, blocks_freed = 0; i < count; i++) {
 		BUFFER_TRACE(bitmap_bh, "clear bit");
 		if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
 						bit + i, bitmap_bh->b_data)) {
-			jbd_unlock_bh_state(bitmap_bh);
 			ext4_error(sb, __func__,
 				   "bit already cleared for block %llu",
 				   (ext4_fsblk_t)(block + i));
-			jbd_lock_bh_state(bitmap_bh);
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
-			group_freed++;
+			blocks_freed++;
 		}
 	}
-	jbd_unlock_bh_state(bitmap_bh);
-
 	spin_lock(sb_bgl_lock(sbi, block_group));
-	le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
+	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+	ext4_free_blks_set(sb, desc, blk_free_count);
 	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
-	percpu_counter_add(&sbi->s_freeblocks_counter, count);
+	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
 		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += count;
+		sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
 		spin_unlock(sb_bgl_lock(sbi, flex_group));
 	}
+	/*
+	 * request to reload the buddy with the
+	 * new bitmap information
+	 */
+	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+	ext4_mb_update_group_info(grp, blocks_freed);
+	up_write(&grp->alloc_sem);
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 
 	/* And the group descriptor block */
 	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_journal_dirty_metadata(handle, gd_bh);
-	if (!err) err = ret;
-	*pdquot_freed_blocks += group_freed;
-
-	if (overflow && !err) {
-		block += count;
-		count = overflow;
-		goto do_more;
-	}
+	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+	if (!err)
+		err = ret;
 	sb->s_dirt = 1;
+
 error_return:
 	brelse(bitmap_bh);
 	ext4_std_error(sb, err);
@@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 		if (dirty_blocks < 0) {
 			printk(KERN_CRIT "Dirty block accounting "
 					"went wrong %lld\n",
-					dirty_blocks);
+					(long long)dirty_blocks);
 		}
 	}
 	/* Check whether we have space after
@@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
 }
 
-#define EXT4_META_BLOCK 0x1
-
-static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
-				ext4_lblk_t iblock, ext4_fsblk_t goal,
-				unsigned long *count, int *errp, int flags)
-{
-	struct ext4_allocation_request ar;
-	ext4_fsblk_t ret;
-
-	memset(&ar, 0, sizeof(ar));
-	/* Fill with neighbour allocated blocks */
-
-	ar.inode = inode;
-	ar.goal = goal;
-	ar.len = *count;
-	ar.logical = iblock;
-
-	if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
-		/* enable in-core preallocation for data block allocation */
-		ar.flags = EXT4_MB_HINT_DATA;
-	else
-		/* disable in-core preallocation for non-regular files */
-		ar.flags = 0;
-
-	ret = ext4_mb_new_blocks(handle, &ar, errp);
-	*count = ar.len;
-	return ret;
-}
-
 /*
  * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
  *
  * @handle:             handle to this transaction
  * @inode:              file inode
  * @goal:               given target block(filesystem wide)
- * @count:		total number of blocks need
+ * @count:		pointer to total number of blocks needed
  * @errp:               error code
  *
- * Return 1st allocated block numberon success, *count stores total account
+ * Return 1st allocated block number on success, *count stores total account
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 		ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
+	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
-	ret = do_blk_alloc(handle, inode, 0, goal,
-				count, errp, EXT4_META_BLOCK);
+
+	memset(&ar, 0, sizeof(ar));
+	/* Fill with neighbour allocated blocks */
+	ar.inode = inode;
+	ar.goal = goal;
+	ar.len = count ? *count : 1;
+
+	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	if (count)
+		*count = ar.len;
+
 	/*
 	 * Account for the allocated meta blocks
 	 */
 	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-		EXT4_I(inode)->i_allocated_meta_blocks += *count;
+		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	}
 	return ret;
 }
 
-/*
- * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
- *
- * @handle:             handle to this transaction
- * @inode:              file inode
- * @goal:               given target block(filesystem wide)
- * @errp:               error code
- *
- * Return allocated block number on success
- */
-ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, int *errp)
-{
-	unsigned long count = 1;
-	return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
-}
-
-/*
- * ext4_new_blocks() -- allocate data blocks
- *
- * @handle:             handle to this transaction
- * @inode:              file inode
- * @goal:               given target block(filesystem wide)
- * @count:		total number of blocks need
- * @errp:               error code
- *
- * Return 1st allocated block numberon success, *count stores total account
- * error stores in errp pointer
- */
-
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
-				ext4_lblk_t iblock, ext4_fsblk_t goal,
-				unsigned long *count, int *errp)
-{
-	return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
-}
-
 /**
  * ext4_count_free_blocks() -- count filesystem free blocks
  * @sb:		superblock
@@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	ext4_fsblk_t bitmap_count;
-	unsigned long x;
+	unsigned int x;
 	struct buffer_head *bitmap_bh = NULL;
 
 	es = EXT4_SB(sb)->s_es;
@@ -796,7 +691,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 			continue;
 
 		x = ext4_count_free(bitmap_bh, sb->s_blocksize);
-		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
+		printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n",
 			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
 		bitmap_count += x;
 	}
@@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+		desc_count += ext4_free_blks_count(sb, gdp);
 	}
 
 	return desc_count;
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 0a7a6663c190..fa3af81ac565 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,10 +15,9 @@
 
 static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
 
-unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
+unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
 {
-	unsigned int i;
-	unsigned long sum = 0;
+	unsigned int i, sum = 0;
 
 	if (!map)
 		return 0;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index fed5b610df5a..2df2e40b01af 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 int ext4_check_dir_entry(const char *function, struct inode *dir,
 			 struct ext4_dir_entry_2 *de,
 			 struct buffer_head *bh,
-			 unsigned long offset)
+			 unsigned int offset)
 {
 	const char *error_msg = NULL;
 	const int rlen = ext4_rec_len_from_disk(de->rec_len);
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 	if (error_msg != NULL)
 		ext4_error(dir->i_sb, function,
 			"bad entry in directory #%lu: %s - "
-			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+			"offset=%u, inode=%u, rec_len=%d, name_len=%d",
 			dir->i_ino, error_msg, offset,
-			(unsigned long) le32_to_cpu(de->inode),
+			le32_to_cpu(de->inode),
 			rlen, de->name_len);
 	return error_msg == NULL ? 1 : 0;
 }
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp,
 			 void *dirent, filldir_t filldir)
 {
 	int error = 0;
-	unsigned long offset;
+	unsigned int offset;
 	int i, stored;
 	struct ext4_dir_entry_2 *de;
 	struct super_block *sb;
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent,
 	sb = inode->i_sb;
 
 	if (!fname) {
-		printk(KERN_ERR "ext4: call_filldir: called with "
+		printk(KERN_ERR "EXT4-fs: call_filldir: called with "
 		       "null fname?!?\n");
 		return 0;
 	}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0537c827024..c668e4377d76 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -19,6 +19,7 @@
 #include <linux/types.h>
 #include <linux/blkdev.h>
 #include <linux/magic.h>
+#include <linux/jbd2.h>
 #include "ext4_i.h"
 
 /*
@@ -94,9 +95,9 @@ struct ext4_allocation_request {
 	/* phys. block for ^^^ */
 	ext4_fsblk_t pright;
 	/* how many blocks we want to allocate */
-	unsigned long len;
+	unsigned int len;
 	/* flags. see above EXT4_MB_HINT_* */
-	unsigned long flags;
+	unsigned int flags;
 };
 
 /*
@@ -156,12 +157,12 @@ struct ext4_group_desc
 	__le32	bg_block_bitmap_lo;	/* Blocks bitmap block */
 	__le32	bg_inode_bitmap_lo;	/* Inodes bitmap block */
 	__le32	bg_inode_table_lo;	/* Inodes table block */
-	__le16	bg_free_blocks_count;	/* Free blocks count */
-	__le16	bg_free_inodes_count;	/* Free inodes count */
-	__le16	bg_used_dirs_count;	/* Directories count */
+	__le16	bg_free_blocks_count_lo;/* Free blocks count */
+	__le16	bg_free_inodes_count_lo;/* Free inodes count */
+	__le16	bg_used_dirs_count_lo;	/* Directories count */
 	__le16	bg_flags;		/* EXT4_BG_flags (INODE_UNINIT, etc) */
 	__u32	bg_reserved[2];		/* Likely block/inode bitmap checksum */
-	__le16  bg_itable_unused;	/* Unused inodes count */
+	__le16  bg_itable_unused_lo;	/* Unused inodes count */
 	__le16  bg_checksum;		/* crc16(sb_uuid+group+desc) */
 	__le32	bg_block_bitmap_hi;	/* Blocks bitmap block MSB */
 	__le32	bg_inode_bitmap_hi;	/* Inodes bitmap block MSB */
@@ -169,7 +170,7 @@ struct ext4_group_desc
 	__le16	bg_free_blocks_count_hi;/* Free blocks count MSB */
 	__le16	bg_free_inodes_count_hi;/* Free inodes count MSB */
 	__le16	bg_used_dirs_count_hi;	/* Directories count MSB */
-	__le16	bg_itable_unused_hi;	/* Unused inodes count MSB */
+	__le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
 	__u32	bg_reserved2[3];
 };
 
@@ -328,6 +329,7 @@ struct ext4_mount_options {
 	uid_t s_resuid;
 	gid_t s_resgid;
 	unsigned long s_commit_interval;
+	u32 s_min_batch_time, s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	int s_jquota_fmt;
 	char *s_qf_names[MAXQUOTAS];
@@ -534,7 +536,6 @@ do {									       \
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
-#define EXT4_MOUNT_EXTENTS		0x400000 /* Extents support */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
@@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
  */
 
 #define EXT4_HAS_COMPAT_FEATURE(sb,mask)			\
-	(EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
+	((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
 #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask)			\
-	(EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
+	((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
 #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask)			\
-	(EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
+	((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
 #define EXT4_SET_COMPAT_FEATURE(sb,mask)			\
 	EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
 #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask)			\
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEFM_JMODE_WBACK	0x0060
 
 /*
+ * Default journal batch times
+ */
+#define EXT4_DEF_MIN_BATCH_TIME	0
+#define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
+
+/*
  * Structure of a directory entry
  */
 #define EXT4_NAME_LEN 255
@@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
 #define DX_HASH_LEGACY		0
 #define DX_HASH_HALF_MD4	1
 #define DX_HASH_TEA		2
+#define DX_HASH_LEGACY_UNSIGNED	3
+#define DX_HASH_HALF_MD4_UNSIGNED	4
+#define DX_HASH_TEA_UNSIGNED		5
 
 #ifdef __KERNEL__
 
@@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 #define ERR_BAD_DX_DIR	-75000
 
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-			unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
+			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 
 extern struct proc_dir_entry *ext4_proc_root;
 
@@ -987,6 +997,9 @@ do {									\
 # define ATTRIB_NORET	__attribute__((noreturn))
 # define NORET_AND	noreturn,
 
+/* bitmap.c */
+extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
+
 /* balloc.c */
 extern unsigned int ext4_block_group(struct super_block *sb,
 			ext4_fsblk_t blocknr);
@@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
-extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, int *errp);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
-					ext4_lblk_t iblock, ext4_fsblk_t goal,
-					unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
-extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-				ext4_fsblk_t block, unsigned long count,
-				unsigned long *pdquot_freed_blocks);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
 				struct ext4_dir_entry_2 *,
-				struct buffer_head *, unsigned long);
+				struct buffer_head *, unsigned int);
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 				    __u32 minor_hash,
 				    struct ext4_dir_entry_2 *dirent);
@@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
 		unsigned long, unsigned long, int, unsigned long *);
-extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
 extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
 		ext4_grpblk_t add);
-
-
+extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
+extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
+						ext4_group_t, int);
 /* inode.c */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 		struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
 int ext4_get_block(struct inode *inode, sector_t iblock,
 				struct buffer_head *bh_result, int create);
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-				ext4_lblk_t iblock, unsigned long maxblocks,
-				struct buffer_head *bh_result,
-				int create, int extend_disksize);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int  ext4_write_inode(struct inode *, int);
@@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern void ext4_warning(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
+				const char *, const char *, ...)
+	__attribute__ ((format (printf, 4, 5)));
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
@@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 				      struct ext4_group_desc *bg);
 extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 				     struct ext4_group_desc *bg);
+extern __u32 ext4_free_blks_count(struct super_block *sb,
+				struct ext4_group_desc *bg);
+extern __u32 ext4_free_inodes_count(struct super_block *sb,
+				 struct ext4_group_desc *bg);
+extern __u32 ext4_used_dirs_count(struct super_block *sb,
+				struct ext4_group_desc *bg);
+extern __u32 ext4_itable_unused_count(struct super_block *sb,
+				   struct ext4_group_desc *bg);
 extern void ext4_block_bitmap_set(struct super_block *sb,
 				  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_bitmap_set(struct super_block *sb,
 				  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_table_set(struct super_block *sb,
 				 struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_free_blks_set(struct super_block *sb,
+			       struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_inodes_set(struct super_block *sb,
+				struct ext4_group_desc *bg, __u32 count);
+extern void ext4_used_dirs_set(struct super_block *sb,
+				struct ext4_group_desc *bg, __u32 count);
+extern void ext4_itable_unused_set(struct super_block *sb,
+				   struct ext4_group_desc *bg, __u32 count);
 
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
@@ -1225,11 +1247,11 @@ do {								\
 } while (0)
 
 #ifdef CONFIG_SMP
-/* Each CPU can accumulate FBC_BATCH blocks in their local
+/* Each CPU can accumulate percpu_counter_batch blocks in their local
  * counters. So we need to make sure we have free blocks more
- * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
+ * than percpu_counter_batch  * nr_cpu_ids. Also add a window of 4 times.
  */
-#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
 #else
 #define EXT4_FREEBLOCKS_WATERMARK 0
 #endif
@@ -1246,6 +1268,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 	return ;
 }
 
+struct ext4_group_info {
+	unsigned long   bb_state;
+	struct rb_root  bb_free_root;
+	unsigned short  bb_first_free;
+	unsigned short  bb_free;
+	unsigned short  bb_fragments;
+	struct          list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+	void            *bb_bitmap;
+#endif
+	struct rw_semaphore alloc_sem;
+	unsigned short  bb_counters[];
+};
+
+#define EXT4_GROUP_INFO_NEED_INIT_BIT	0
+#define EXT4_GROUP_INFO_LOCKED_BIT	1
+
+#define EXT4_MB_GRP_NEED_INIT(grp)	\
+	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+	bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline void ext4_unlock_group(struct super_block *sb,
+					ext4_group_t group)
+{
+	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+	bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline int ext4_is_group_locked(struct super_block *sb,
+					ext4_group_t group)
+{
+	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+	return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
+						&(grinfo->bb_state));
+}
+
 /*
  * Inodes and files operations
  */
@@ -1271,18 +1337,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 				       int chunk);
 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-			ext4_lblk_t iblock,
-			unsigned long max_blocks, struct buffer_head *bh_result,
-			int create, int extend_disksize);
+			       ext4_lblk_t iblock, unsigned int max_blocks,
+			       struct buffer_head *bh_result,
+			       int create, int extend_disksize);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
-			sector_t block, unsigned long max_blocks,
+			sector_t block, unsigned int max_blocks,
 			struct buffer_head *bh, int create,
 			int extend_disksize, int flag);
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+			__u64 start, __u64 len);
+
+/*
+ * Add new method to test wether block and inode bitmaps are properly
+ * initialized. With uninit_bg reading the block from disk is not enough
+ * to mark the bitmap uptodate. We need to also zero-out the bitmap
+ */
+#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
+
+static inline int bitmap_uptodate(struct buffer_head *bh)
+{
+	return (buffer_uptodate(bh) &&
+			test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
+}
+static inline void set_bitmap_uptodate(struct buffer_head *bh)
+{
+	set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
+}
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bec7ce59fc0d..18cb67b2cbbc 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode)
 	return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
 }
 
-static inline void ext4_ext_tree_changed(struct inode *inode)
-{
-	EXT4_I(inode)->i_ext_generation++;
-}
-
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 5c124c0ac6d3..e69acc16f5c4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t;
 typedef __u32 ext4_lblk_t;
 
 /* data type for block group number */
-typedef unsigned long ext4_group_t;
+typedef unsigned int ext4_group_t;
 
 #define rsv_start rsv_window._rsv_start
 #define rsv_end rsv_window._rsv_end
@@ -100,9 +100,6 @@ struct ext4_inode_info {
 	 */
 	loff_t	i_disksize;
 
-	/* on-disk additional length */
-	__u16 i_extra_isize;
-
 	/*
 	 * i_data_sem is for serialising ext4_truncate() against
 	 * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
@@ -117,7 +114,6 @@ struct ext4_inode_info {
 	struct inode vfs_inode;
 	struct jbd2_inode jinode;
 
-	unsigned long i_ext_generation;
 	struct ext4_ext_cache i_cached_extent;
 	/*
 	 * File creation time. Its function is same as that of
@@ -130,10 +126,14 @@ struct ext4_inode_info {
 	spinlock_t i_prealloc_lock;
 
 	/* allocation reservation info for delalloc */
-	unsigned long i_reserved_data_blocks;
-	unsigned long i_reserved_meta_blocks;
-	unsigned long i_allocated_meta_blocks;
+	unsigned int i_reserved_data_blocks;
+	unsigned int i_reserved_meta_blocks;
+	unsigned int i_allocated_meta_blocks;
 	unsigned short i_delalloc_reserved_flag;
+
+	/* on-disk additional length */
+	__u16 i_extra_isize;
+
 	spinlock_t i_block_reservation_lock;
 };
 
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index c75384b34f2c..ad13a84644e1 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,53 +7,96 @@
 int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
 				struct buffer_head *bh)
 {
-	int err = jbd2_journal_get_undo_access(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_get_undo_access(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	}
 	return err;
 }
 
 int __ext4_journal_get_write_access(const char *where, handle_t *handle,
 				struct buffer_head *bh)
 {
-	int err = jbd2_journal_get_write_access(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_get_write_access(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	}
 	return err;
 }
 
 int __ext4_journal_forget(const char *where, handle_t *handle,
 				struct buffer_head *bh)
 {
-	int err = jbd2_journal_forget(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_forget(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	}
 	return err;
 }
 
 int __ext4_journal_revoke(const char *where, handle_t *handle,
 				ext4_fsblk_t blocknr, struct buffer_head *bh)
 {
-	int err = jbd2_journal_revoke(handle, blocknr, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_revoke(handle, blocknr, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	}
 	return err;
 }
 
 int __ext4_journal_get_create_access(const char *where,
 				handle_t *handle, struct buffer_head *bh)
 {
-	int err = jbd2_journal_get_create_access(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_get_create_access(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	}
 	return err;
 }
 
-int __ext4_journal_dirty_metadata(const char *where,
-				handle_t *handle, struct buffer_head *bh)
+int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
+				 struct inode *inode, struct buffer_head *bh)
 {
-	int err = jbd2_journal_dirty_metadata(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_dirty_metadata(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	} else {
+		mark_buffer_dirty(bh);
+		if (inode && inode_needs_sync(inode)) {
+			sync_dirty_buffer(bh);
+			if (buffer_req(bh) && !buffer_uptodate(bh)) {
+				ext4_error(inode->i_sb, __func__,
+					   "IO error syncing inode, "
+					   "inode=%lu, block=%llu",
+					   inode->i_ino,
+					   (unsigned long long) bh->b_blocknr);
+				err = -EIO;
+			}
+		}
+	}
 	return err;
 }
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b455c685a98b..be2f426f6805 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -32,8 +32,8 @@
  * 5 levels of tree + root which are stored in the inode. */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
-	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)	\
-		|| test_opt(sb, EXTENTS) ? 27U : 8U)
+	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
+	 ? 27U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
  * been done yet.
  */
 
-static inline void ext4_journal_release_buffer(handle_t *handle,
-						struct buffer_head *bh)
-{
-	jbd2_journal_release_buffer(handle, bh);
-}
-
 void ext4_journal_abort_handle(const char *caller, const char *err_fn,
 		struct buffer_head *bh, handle_t *handle, int err);
 
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
 int __ext4_journal_get_create_access(const char *where,
 				handle_t *handle, struct buffer_head *bh);
 
-int __ext4_journal_dirty_metadata(const char *where,
-				handle_t *handle, struct buffer_head *bh);
+int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
+				 struct inode *inode, struct buffer_head *bh);
 
 #define ext4_journal_get_undo_access(handle, bh) \
 	__ext4_journal_get_undo_access(__func__, (handle), (bh))
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where,
 	__ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
 #define ext4_journal_get_create_access(handle, bh) \
 	__ext4_journal_get_create_access(__func__, (handle), (bh))
-#define ext4_journal_dirty_metadata(handle, bh) \
-	__ext4_journal_dirty_metadata(__func__, (handle), (bh))
 #define ext4_journal_forget(handle, bh) \
 	__ext4_journal_forget(__func__, (handle), (bh))
+#define ext4_handle_dirty_metadata(handle, inode, bh) \
+	__ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
 
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
 
+#define EXT4_NOJOURNAL_HANDLE	((handle_t *) 0x1)
+
+static inline int ext4_handle_valid(handle_t *handle)
+{
+	if (handle == EXT4_NOJOURNAL_HANDLE)
+		return 0;
+	return 1;
+}
+
+static inline void ext4_handle_sync(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		handle->h_sync = 1;
+}
+
+static inline void ext4_handle_release_buffer(handle_t *handle,
+						struct buffer_head *bh)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_release_buffer(handle, bh);
+}
+
+static inline int ext4_handle_is_aborted(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		return is_handle_aborted(handle);
+	return 0;
+}
+
+static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+{
+	if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
+		return 0;
+	return 1;
+}
+
+static inline void ext4_journal_release_buffer(handle_t *handle,
+						struct buffer_head *bh)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_release_buffer(handle, bh);
+}
+
 static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 {
 	return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void)
 
 static inline int ext4_journal_extend(handle_t *handle, int nblocks)
 {
-	return jbd2_journal_extend(handle, nblocks);
+	if (ext4_handle_valid(handle))
+		return jbd2_journal_extend(handle, nblocks);
+	return 0;
 }
 
 static inline int ext4_journal_restart(handle_t *handle, int nblocks)
 {
-	return jbd2_journal_restart(handle, nblocks);
+	if (ext4_handle_valid(handle))
+		return jbd2_journal_restart(handle, nblocks);
+	return 0;
 }
 
 static inline int ext4_journal_blocks_per_page(struct inode *inode)
 {
-	return jbd2_journal_blocks_per_page(inode);
+	if (EXT4_JOURNAL(inode) != NULL)
+		return jbd2_journal_blocks_per_page(inode);
+	return 0;
 }
 
 static inline int ext4_journal_force_commit(journal_t *journal)
 {
-	return jbd2_journal_force_commit(journal);
+	if (journal)
+		return jbd2_journal_force_commit(journal);
+	return 0;
 }
 
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
-	return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+	if (ext4_handle_valid(handle))
+		return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+	return 0;
 }
 
 /* super.c */
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb);
 
 static inline int ext4_should_journal_data(struct inode *inode)
 {
+	if (EXT4_JOURNAL(inode) == NULL)
+		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 1;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode)
 
 static inline int ext4_should_order_data(struct inode *inode)
 {
+	if (EXT4_JOURNAL(inode) == NULL)
+		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
 	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode)
 
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
+	if (EXT4_JOURNAL(inode) == NULL)
+		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
 	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index b21f16713db0..039b6ea1a042 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -57,6 +57,7 @@ struct ext4_sb_info {
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
 	int s_def_hash_version;
+	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
 	struct percpu_counter s_freeblocks_counter;
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
@@ -73,6 +74,8 @@ struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	unsigned long s_commit_interval;
+	u32 s_max_batch_time;
+	u32 s_min_batch_time;
 	struct block_device *journal_bdev;
 #ifdef CONFIG_JBD2_DEBUG
 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
@@ -101,7 +104,8 @@ struct ext4_sb_info {
 	spinlock_t s_reserve_lock;
 	spinlock_t s_md_lock;
 	tid_t s_last_transaction;
-	unsigned short *s_mb_offsets, *s_mb_maxs;
+	unsigned short *s_mb_offsets;
+	unsigned int *s_mb_maxs;
 
 	/* tunables */
 	unsigned long s_stripe;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea2ce3c0ae66..54bf0623a9ae 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
 {
 	int err;
 
+	if (!ext4_handle_valid(handle))
+		return 0;
 	if (handle->h_buffer_credits > needed)
 		return 0;
 	err = ext4_journal_extend(handle, needed);
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
 	int err;
 	if (path->p_bh) {
 		/* path points to block */
-		err = ext4_journal_dirty_metadata(handle, path->p_bh);
+		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
 	} else {
 		/* path points to leaf/index in inode body */
 		err = ext4_mark_inode_dirty(handle, inode);
@@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t goal, newblock;
 
 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_meta_block(handle, inode, goal, err);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
 	return newblock;
 }
 
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 
-	err = ext4_journal_dirty_metadata(handle, bh);
+	err = ext4_handle_dirty_metadata(handle, inode, bh);
 	if (err)
 		goto cleanup;
 	brelse(bh);
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
 
-		err = ext4_journal_dirty_metadata(handle, bh);
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (err)
 			goto cleanup;
 		brelse(bh);
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 
-	err = ext4_journal_dirty_metadata(handle, bh);
+	err = ext4_handle_dirty_metadata(handle, inode, bh);
 	if (err)
 		goto out;
 
@@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	while (--depth >= 0) {
 		ix = path[depth].p_idx;
 		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
-			break;
+			goto got_index;
 	}
 
-	if (depth < 0) {
-		/* we've gone up to the root and
-		 * found no index to the right */
-		return 0;
-	}
+	/* we've gone up to the root and found no index to the right */
+	return 0;
 
+got_index:
 	/* we've found index to the right, let's
 	 * follow it and find the closest allocated
 	 * block to the right */
@@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	*phys = ext_pblock(ex);
 	put_bh(bh);
 	return 0;
-
 }
 
 /*
@@ -1622,7 +1621,6 @@ cleanup:
 		ext4_ext_drop_refs(npath);
 		kfree(npath);
 	}
-	ext4_ext_tree_changed(inode);
 	ext4_ext_invalidate_cache(inode);
 	return err;
 }
@@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 		}
 	}
 out:
-	ext4_ext_tree_changed(inode);
 	ext4_ext_drop_refs(path);
 	kfree(path);
 	ext4_journal_stop(handle);
@@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb)
 	 * possible initialization would be here
 	 */
 
-	if (test_opt(sb, EXTENTS)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 		printk(KERN_INFO "EXT4-fs: file extents enabled");
 #ifdef AGGRESSIVE_TEST
 		printk(", aggressive tests");
@@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb)
  */
 void ext4_ext_release(struct super_block *sb)
 {
-	if (!test_opt(sb, EXTENTS))
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
 		return;
 
 #ifdef EXTENTS_STATS
@@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 						struct inode *inode,
 						struct ext4_ext_path *path,
 						ext4_lblk_t iblock,
-						unsigned long max_blocks)
+						unsigned int max_blocks)
 {
 	struct ext4_extent *ex, newex, orig_ex;
 	struct ext4_extent *ex1 = NULL;
@@ -2536,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		 */
 		newdepth = ext_depth(inode);
 		/*
-		 * update the extent length after successfull insert of the
+		 * update the extent length after successful insert of the
 		 * split extent
 		 */
 		orig_ex.ee_len = cpu_to_le16(ee_len -
@@ -2678,26 +2675,26 @@ fix_extent_len:
  */
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock,
-			unsigned long max_blocks, struct buffer_head *bh_result,
+			unsigned int max_blocks, struct buffer_head *bh_result,
 			int create, int extend_disksize)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
 	struct ext4_extent newex, *ex;
-	ext4_fsblk_t goal, newblock;
-	int err = 0, depth, ret;
-	unsigned long allocated = 0;
+	ext4_fsblk_t newblock;
+	int err = 0, depth, ret, cache_type;
+	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
 	loff_t disksize;
 
 	__clear_bit(BH_New, &bh_result->b_state);
-	ext_debug("blocks %u/%lu requested for inode %u\n",
+	ext_debug("blocks %u/%u requested for inode %u\n",
 			iblock, max_blocks, inode->i_ino);
 
 	/* check in cache */
-	goal = ext4_ext_in_cache(inode, iblock, &newex);
-	if (goal) {
-		if (goal == EXT4_EXT_CACHE_GAP) {
+	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
+	if (cache_type) {
+		if (cache_type == EXT4_EXT_CACHE_GAP) {
 			if (!create) {
 				/*
 				 * block isn't allocated yet and
@@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 				goto out2;
 			}
 			/* we should allocate requested block */
-		} else if (goal == EXT4_EXT_CACHE_EXTENT) {
+		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
 			/* block is already allocated */
 			newblock = iblock
 				   - le32_to_cpu(newex.ee_block)
@@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	if (!newblock)
 		goto out2;
 	ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
-			goal, newblock, allocated);
+		  ar.goal, newblock, allocated);
 
 	/* try to insert new extent into found leaf and return */
 	ext4_ext_store_pblock(&newex, newblock);
@@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode)
 	 * transaction synchronous.
 	 */
 	if (IS_SYNC(inode))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 out_stop:
 	up_write(&EXT4_I(inode)->i_data_sem);
@@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	handle_t *handle;
 	ext4_lblk_t block;
 	loff_t new_size;
-	unsigned long max_blocks;
+	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
 	int retries = 0;
@@ -3083,7 +3080,7 @@ retry:
 /*
  * Callback function called for each extent to gather FIEMAP information.
  */
-int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
 		       void *data)
 {
@@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 /* fiemap flags we can handle specified here */
 #define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
 
-int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+static int ext4_xattr_fiemap(struct inode *inode,
+				struct fiemap_extent_info *fieinfo)
 {
 	__u64 physical = 0;
 	__u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fba71f7..f731cb545a03 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		__u64 start, __u64 len);
-
 const struct file_operations ext4_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 556ca8eba3db..ac8f168c8ab4 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 
 
 /* The old legacy hash */
-static __u32 dx_hack_hash(const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
 {
-	__u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const unsigned char *ucp = (const unsigned char *) name;
+
+	while (len--) {
+		hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
+		hash1 = hash0;
+		hash0 = hash;
+	}
+	return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const signed char *scp = (const signed char *) name;
+
 	while (len--) {
-		__u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+		hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
 
-		if (hash & 0x80000000) hash -= 0x7fffffff;
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
 		hash1 = hash0;
 		hash0 = hash;
 	}
-	return (hash0 << 1);
+	return hash0 << 1;
+}
+
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+	const signed char *scp = (const signed char *) msg;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = ((int) scp[i]) + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
 }
 
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
 {
 	__u32	pad, val;
 	int	i;
+	const unsigned char *ucp = (const unsigned char *) msg;
 
 	pad = (__u32)len | ((__u32)len << 8);
 	pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
 	for (i = 0; i < len; i++) {
 		if ((i % 4) == 0)
 			val = pad;
-		val = msg[i] + (val << 8);
+		val = ((int) ucp[i]) + (val << 8);
 		if ((i % 4) == 3) {
 			*buf++ = val;
 			val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 	const char	*p;
 	int		i;
 	__u32		in[8], buf[4];
+	void		(*str2hashbuf)(const char *, int, __u32 *, int) =
+				str2hashbuf_signed;
 
 	/* Initialize the default seed for the hash checksum functions */
 	buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 	}
 
 	switch (hinfo->hash_version) {
+	case DX_HASH_LEGACY_UNSIGNED:
+		hash = dx_hack_hash_unsigned(name, len);
+		break;
 	case DX_HASH_LEGACY:
-		hash = dx_hack_hash(name, len);
+		hash = dx_hack_hash_signed(name, len);
 		break;
+	case DX_HASH_HALF_MD4_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
 	case DX_HASH_HALF_MD4:
 		p = name;
 		while (len > 0) {
-			str2hashbuf(p, len, in, 8);
+			(*str2hashbuf)(p, len, in, 8);
 			half_md4_transform(buf, in);
 			len -= 32;
 			p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 		minor_hash = buf[2];
 		hash = buf[1];
 		break;
+	case DX_HASH_TEA_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
 	case DX_HASH_TEA:
 		p = name;
 		while (len > 0) {
-			str2hashbuf(p, len, in, 4);
+			(*str2hashbuf)(p, len, in, 4);
 			TEA_transform(buf, in);
 			len -= 16;
 			p += 16;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 6e6052879aa2..4fb86a0061d0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 	/* If checksum is bad mark all blocks and inodes use to prevent
 	 * allocation, essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-		ext4_error(sb, __func__, "Checksum bad for group %lu\n",
+		ext4_error(sb, __func__, "Checksum bad for group %u",
 			   block_group);
-		gdp->bg_free_blocks_count = 0;
-		gdp->bg_free_inodes_count = 0;
-		gdp->bg_itable_unused = 0;
+		ext4_free_blks_set(sb, gdp, 0);
+		ext4_free_inodes_set(sb, gdp, 0);
+		ext4_itable_unused_set(sb, gdp, 0);
 		memset(bh->b_data, 0xff, sb->s_blocksize);
 		return 0;
 	}
 
 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 			bh->b_data);
 
 	return EXT4_INODES_PER_GROUP(sb);
@@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	if (unlikely(!bh)) {
 		ext4_error(sb, __func__,
 			    "Cannot read inode bitmap - "
-			    "block_group = %lu, inode_bitmap = %llu",
+			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (buffer_uptodate(bh) &&
-	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+	if (bitmap_uptodate(bh))
 		return bh;
 
 	lock_buffer(bh);
+	if (bitmap_uptodate(bh)) {
+		unlock_buffer(bh);
+		return bh;
+	}
 	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
+		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
 		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		unlock_buffer(bh);
 		return bh;
 	}
 	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	if (buffer_uptodate(bh)) {
+		/*
+		 * if not uninit if bh is uptodate,
+		 * bitmap is also uptodate
+		 */
+		set_bitmap_uptodate(bh);
+		unlock_buffer(bh);
+		return bh;
+	}
+	/*
+	 * submit the buffer_head for read. We can
+	 * safely mark the bitmap as uptodate now.
+	 * We do it here so the bitmap uptodate bit
+	 * get set with buffer lock held.
+	 */
+	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
 		ext4_error(sb, __func__,
 			    "Cannot read inode bitmap - "
-			    "block_group = %lu, inode_bitmap = %llu",
+			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
 	}
@@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	struct ext4_group_desc *gdp;
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
-	int fatal = 0, err;
+	int fatal = 0, err, count;
 	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
@@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
 	ino = inode->i_ino;
 	ext4_debug("freeing inode %lu\n", ino);
+	trace_mark(ext4_free_inode,
+		   "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
+		   sb->s_id, inode->i_ino, inode->i_mode,
+		   (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
+		   (unsigned long long) inode->i_blocks);
 
 	/*
 	 * Note: we must free any quota before locking the superblock,
@@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
 		if (gdp) {
 			spin_lock(sb_bgl_lock(sbi, block_group));
-			le16_add_cpu(&gdp->bg_free_inodes_count, 1);
-			if (is_directory)
-				le16_add_cpu(&gdp->bg_used_dirs_count, -1);
+			count = ext4_free_inodes_count(sb, gdp) + 1;
+			ext4_free_inodes_set(sb, gdp, count);
+			if (is_directory) {
+				count = ext4_used_dirs_count(sb, gdp) - 1;
+				ext4_used_dirs_set(sb, gdp, count);
+			}
 			gdp->bg_checksum = ext4_group_desc_csum(sbi,
 							block_group, gdp);
 			spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 				spin_unlock(sb_bgl_lock(sbi, flex_group));
 			}
 		}
-		BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
-		err = ext4_journal_dirty_metadata(handle, bh2);
+		BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, NULL, bh2);
 		if (!fatal) fatal = err;
 	}
-	BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
-	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+	BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 	if (!fatal)
 		fatal = err;
 	sb->s_dirt = 1;
@@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
 
 	for (group = 0; group < ngroups; group++) {
 		desc = ext4_get_group_desc(sb, group, NULL);
-		if (!desc || !desc->bg_free_inodes_count)
+		if (!desc || !ext4_free_inodes_count(sb, desc))
 			continue;
-		if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+		if (ext4_free_inodes_count(sb, desc) < avefreei)
 			continue;
 		if (!best_desc ||
-		    (le16_to_cpu(desc->bg_free_blocks_count) >
-		     le16_to_cpu(best_desc->bg_free_blocks_count))) {
+		    (ext4_free_blks_count(sb, desc) >
+		     ext4_free_blks_count(sb, best_desc))) {
 			*best_group = group;
 			best_desc = desc;
 			ret = 0;
@@ -369,7 +397,7 @@ found_flexbg:
 	for (i = best_flex * flex_size; i < ngroups &&
 		     i < (best_flex + 1) * flex_size; i++) {
 		desc = ext4_get_group_desc(sb, i, &bh);
-		if (le16_to_cpu(desc->bg_free_inodes_count)) {
+		if (ext4_free_inodes_count(sb, desc)) {
 			*best_group = i;
 			goto out;
 		}
@@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 		for (i = 0; i < ngroups; i++) {
 			grp = (parent_group + i) % ngroups;
 			desc = ext4_get_group_desc(sb, grp, NULL);
-			if (!desc || !desc->bg_free_inodes_count)
+			if (!desc || !ext4_free_inodes_count(sb, desc))
 				continue;
-			if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+			if (ext4_used_dirs_count(sb, desc) >= best_ndir)
 				continue;
-			if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+			if (ext4_free_inodes_count(sb, desc) < avefreei)
 				continue;
-			if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+			if (ext4_free_blks_count(sb, desc) < avefreeb)
 				continue;
 			*group = grp;
 			ret = 0;
-			best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+			best_ndir = ext4_used_dirs_count(sb, desc);
 		}
 		if (ret == 0)
 			return ret;
@@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	for (i = 0; i < ngroups; i++) {
 		*group = (parent_group + i) % ngroups;
 		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (!desc || !desc->bg_free_inodes_count)
+		if (!desc || !ext4_free_inodes_count(sb, desc))
 			continue;
-		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+		if (ext4_used_dirs_count(sb, desc) >= max_dirs)
 			continue;
-		if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+		if (ext4_free_inodes_count(sb, desc) < min_inodes)
 			continue;
-		if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+		if (ext4_free_blks_count(sb, desc) < min_blocks)
 			continue;
 		return 0;
 	}
@@ -494,8 +522,8 @@ fallback:
 	for (i = 0; i < ngroups; i++) {
 		*group = (parent_group + i) % ngroups;
 		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (desc && desc->bg_free_inodes_count &&
-			le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+		if (desc && ext4_free_inodes_count(sb, desc) &&
+			ext4_free_inodes_count(sb, desc) >= avefreei)
 			return 0;
 	}
 
@@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 	 */
 	*group = parent_group;
 	desc = ext4_get_group_desc(sb, *group, NULL);
-	if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
-			le16_to_cpu(desc->bg_free_blocks_count))
+	if (desc && ext4_free_inodes_count(sb, desc) &&
+			ext4_free_blks_count(sb, desc))
 		return 0;
 
 	/*
@@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 		if (*group >= ngroups)
 			*group -= ngroups;
 		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
-				le16_to_cpu(desc->bg_free_blocks_count))
+		if (desc && ext4_free_inodes_count(sb, desc) &&
+				ext4_free_blks_count(sb, desc))
 			return 0;
 	}
 
@@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 		if (++*group >= ngroups)
 			*group = 0;
 		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+		if (desc && ext4_free_inodes_count(sb, desc))
 			return 0;
 	}
 
@@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 }
 
 /*
+ * claim the inode from the inode bitmap. If the group
+ * is uninit we need to take the groups's sb_bgl_lock
+ * and clear the uninit flag. The inode bitmap update
+ * and group desc uninit flag clear should be done
+ * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * doesn't race with the ext4_claim_inode
+ */
+static int ext4_claim_inode(struct super_block *sb,
+			struct buffer_head *inode_bitmap_bh,
+			unsigned long ino, ext4_group_t group, int mode)
+{
+	int free = 0, retval = 0, count;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+	spin_lock(sb_bgl_lock(sbi, group));
+	if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+		/* not a free inode */
+		retval = 1;
+		goto err_ret;
+	}
+	ino++;
+	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+			ino > EXT4_INODES_PER_GROUP(sb)) {
+		spin_unlock(sb_bgl_lock(sbi, group));
+		ext4_error(sb, __func__,
+			   "reserved inode or inode > inodes count - "
+			   "block_group = %u, inode=%lu", group,
+			   ino + group * EXT4_INODES_PER_GROUP(sb));
+		return 1;
+	}
+	/* If we didn't allocate from within the initialized part of the inode
+	 * table then we need to initialize up to this inode. */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+
+		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+			/* When marking the block group with
+			 * ~EXT4_BG_INODE_UNINIT we don't want to depend
+			 * on the value of bg_itable_unused even though
+			 * mke2fs could have initialized the same for us.
+			 * Instead we calculated the value below
+			 */
+
+			free = 0;
+		} else {
+			free = EXT4_INODES_PER_GROUP(sb) -
+				ext4_itable_unused_count(sb, gdp);
+		}
+
+		/*
+		 * Check the relative inode number against the last used
+		 * relative inode number in this group. if it is greater
+		 * we need to  update the bg_itable_unused count
+		 *
+		 */
+		if (ino > free)
+			ext4_itable_unused_set(sb, gdp,
+					(EXT4_INODES_PER_GROUP(sb) - ino));
+	}
+	count = ext4_free_inodes_count(sb, gdp) - 1;
+	ext4_free_inodes_set(sb, gdp, count);
+	if (S_ISDIR(mode)) {
+		count = ext4_used_dirs_count(sb, gdp) + 1;
+		ext4_used_dirs_set(sb, gdp, count);
+	}
+	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+err_ret:
+	spin_unlock(sb_bgl_lock(sbi, group));
+	return retval;
+}
+
+/*
  * There are two policies for allocating an inode.  If the new inode is
  * a directory, then a forward search is made for a block group with both
  * free space and a low directory-to-inode ratio; if that fails, then of
@@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 {
 	struct super_block *sb;
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *bh2;
+	struct buffer_head *inode_bitmap_bh = NULL;
+	struct buffer_head *group_desc_bh;
 	ext4_group_t group = 0;
 	unsigned long ino = 0;
 	struct inode *inode;
@@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		return ERR_PTR(-EPERM);
 
 	sb = dir->i_sb;
+	trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
+		   dir->i_ino, mode);
 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
@@ -631,40 +734,52 @@ got_group:
 	for (i = 0; i < sbi->s_groups_count; i++) {
 		err = -EIO;
 
-		gdp = ext4_get_group_desc(sb, group, &bh2);
+		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
 		if (!gdp)
 			goto fail;
 
-		brelse(bitmap_bh);
-		bitmap_bh = ext4_read_inode_bitmap(sb, group);
-		if (!bitmap_bh)
+		brelse(inode_bitmap_bh);
+		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+		if (!inode_bitmap_bh)
 			goto fail;
 
 		ino = 0;
 
 repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
-				bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
+					      inode_bitmap_bh->b_data,
+					      EXT4_INODES_PER_GROUP(sb), ino);
+
 		if (ino < EXT4_INODES_PER_GROUP(sb)) {
 
-			BUFFER_TRACE(bitmap_bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle, bitmap_bh);
+			BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+			err = ext4_journal_get_write_access(handle,
+							    inode_bitmap_bh);
 			if (err)
 				goto fail;
 
-			if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
-						ino, bitmap_bh->b_data)) {
+			BUFFER_TRACE(group_desc_bh, "get_write_access");
+			err = ext4_journal_get_write_access(handle,
+								group_desc_bh);
+			if (err)
+				goto fail;
+			if (!ext4_claim_inode(sb, inode_bitmap_bh,
+						ino, group, mode)) {
 				/* we won it */
-				BUFFER_TRACE(bitmap_bh,
-					"call ext4_journal_dirty_metadata");
-				err = ext4_journal_dirty_metadata(handle,
-								bitmap_bh);
+				BUFFER_TRACE(inode_bitmap_bh,
+					"call ext4_handle_dirty_metadata");
+				err = ext4_handle_dirty_metadata(handle,
+								 inode,
+							inode_bitmap_bh);
 				if (err)
 					goto fail;
+				/* zero bit is inode number 1*/
+				ino++;
 				goto got;
 			}
 			/* we lost it */
-			jbd2_journal_release_buffer(handle, bitmap_bh);
+			ext4_handle_release_buffer(handle, inode_bitmap_bh);
+			ext4_handle_release_buffer(handle, group_desc_bh);
 
 			if (++ino < EXT4_INODES_PER_GROUP(sb))
 				goto repeat_in_this_group;
@@ -684,30 +799,16 @@ repeat_in_this_group:
 	goto out;
 
 got:
-	ino++;
-	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
-	    ino > EXT4_INODES_PER_GROUP(sb)) {
-		ext4_error(sb, __func__,
-			   "reserved inode or inode > inodes count - "
-			   "block_group = %lu, inode=%lu", group,
-			   ino + group * EXT4_INODES_PER_GROUP(sb));
-		err = -EIO;
-		goto fail;
-	}
-
-	BUFFER_TRACE(bh2, "get_write_access");
-	err = ext4_journal_get_write_access(handle, bh2);
-	if (err) goto fail;
-
 	/* We may have to initialize the block bitmap if it isn't already */
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
 	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-		struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
+		struct buffer_head *block_bitmap_bh;
 
-		BUFFER_TRACE(block_bh, "get block bitmap access");
-		err = ext4_journal_get_write_access(handle, block_bh);
+		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
+		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
 		if (err) {
-			brelse(block_bh);
+			brelse(block_bitmap_bh);
 			goto fail;
 		}
 
@@ -715,9 +816,9 @@ got:
 		spin_lock(sb_bgl_lock(sbi, group));
 		/* recheck and clear flag under lock if we still need to */
 		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 			free = ext4_free_blocks_after_init(sb, group, gdp);
-			gdp->bg_free_blocks_count = cpu_to_le16(free);
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+			ext4_free_blks_set(sb, gdp, free);
 			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
 								gdp);
 		}
@@ -725,55 +826,19 @@ got:
 
 		/* Don't need to dirty bitmap block if we didn't change it */
 		if (free) {
-			BUFFER_TRACE(block_bh, "dirty block bitmap");
-			err = ext4_journal_dirty_metadata(handle, block_bh);
+			BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+			err = ext4_handle_dirty_metadata(handle,
+							NULL, block_bitmap_bh);
 		}
 
-		brelse(block_bh);
+		brelse(block_bitmap_bh);
 		if (err)
 			goto fail;
 	}
-
-	spin_lock(sb_bgl_lock(sbi, group));
-	/* If we didn't allocate from within the initialized part of the inode
-	 * table then we need to initialize up to this inode. */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-
-			/* When marking the block group with
-			 * ~EXT4_BG_INODE_UNINIT we don't want to depend
-			 * on the value of bg_itable_unused even though
-			 * mke2fs could have initialized the same for us.
-			 * Instead we calculated the value below
-			 */
-
-			free = 0;
-		} else {
-			free = EXT4_INODES_PER_GROUP(sb) -
-				le16_to_cpu(gdp->bg_itable_unused);
-		}
-
-		/*
-		 * Check the relative inode number against the last used
-		 * relative inode number in this group. if it is greater
-		 * we need to  update the bg_itable_unused count
-		 *
-		 */
-		if (ino > free)
-			gdp->bg_itable_unused =
-				cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
-	}
-
-	le16_add_cpu(&gdp->bg_free_inodes_count, -1);
-	if (S_ISDIR(mode)) {
-		le16_add_cpu(&gdp->bg_used_dirs_count, 1);
-	}
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-	spin_unlock(sb_bgl_lock(sbi, group));
-	BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
-	err = ext4_journal_dirty_metadata(handle, bh2);
-	if (err) goto fail;
+	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
+	if (err)
+		goto fail;
 
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
@@ -825,7 +890,7 @@ got:
 
 	ext4_set_inode_flags(inode);
 	if (IS_DIRSYNC(inode))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 	if (insert_inode_locked(inode) < 0) {
 		err = -EINVAL;
 		goto fail_drop;
@@ -852,7 +917,7 @@ got:
 	if (err)
 		goto fail_free_drop;
 
-	if (test_opt(sb, EXTENTS)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 		/* set extent flag only for directory, file and normal symlink*/
 		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
 			EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
@@ -867,6 +932,8 @@ got:
 	}
 
 	ext4_debug("allocating inode %lu\n", inode->i_ino);
+	trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
+		   sb->s_id, inode->i_ino, dir->i_ino, mode);
 	goto really_out;
 fail:
 	ext4_std_error(sb, err);
@@ -874,7 +941,7 @@ out:
 	iput(inode);
 	ret = ERR_PTR(err);
 really_out:
-	brelse(bitmap_bh);
+	brelse(inode_bitmap_bh);
 	return ret;
 
 fail_free_drop:
@@ -886,7 +953,7 @@ fail_drop:
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
 	iput(inode);
-	brelse(bitmap_bh);
+	brelse(inode_bitmap_bh);
 	return ERR_PTR(err);
 }
 
@@ -985,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+		desc_count += ext4_free_inodes_count(sb, gdp);
 		brelse(bitmap_bh);
 		bitmap_bh = ext4_read_inode_bitmap(sb, i);
 		if (!bitmap_bh)
@@ -993,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 
 		x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
 		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
-			i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+			i, ext4_free_inodes_count(sb, gdp), x);
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
@@ -1007,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+		desc_count += ext4_free_inodes_count(sb, gdp);
 		cond_resched();
 	}
 	return desc_count;
@@ -1024,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		count += le16_to_cpu(gdp->bg_used_dirs_count);
+		count += ext4_used_dirs_count(sb, gdp);
 	}
 	return count;
 }
-
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6702a49992a6..a6444cee0c7e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -72,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
  * "bh" may be NULL: a metadata block may have been freed from memory
  * but there may still be a record of it in the journal, and that record
  * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling so there's nothing to do.
  */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 			struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
 	int err;
 
+	if (!ext4_handle_valid(handle))
+		return 0;
+
 	might_sleep();
 
 	BUFFER_TRACE(bh, "enter");
@@ -170,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode)
  */
 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 {
-	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+	if (!ext4_handle_valid(handle))
+		return 0;
+	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
 		return 0;
 	if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
 		return 0;
@@ -184,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
  */
 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
 {
+	BUG_ON(EXT4_JOURNAL(inode) == NULL);
 	jbd_debug(2, "restarting handle %p\n", handle);
 	return ext4_journal_restart(handle, blocks_for_truncate(inode));
 }
@@ -216,7 +224,7 @@ void ext4_delete_inode(struct inode *inode)
 	}
 
 	if (IS_SYNC(inode))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 	inode->i_size = 0;
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
@@ -233,7 +241,7 @@ void ext4_delete_inode(struct inode *inode)
 	 * enough credits left in the handle to remove the inode from
 	 * the orphan list and set the dtime field.
 	 */
-	if (handle->h_buffer_credits < 3) {
+	if (!ext4_handle_has_enough_credits(handle, 3)) {
 		err = ext4_journal_extend(handle, 3);
 		if (err > 0)
 			err = ext4_journal_restart(handle, 3);
@@ -506,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
  *	return the total number of blocks to be allocate, including the
  *	direct and indirect blocks.
  */
-static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 		int blocks_to_boundary)
 {
-	unsigned long count = 0;
+	unsigned int count = 0;
 
 	/*
 	 * Simple case, [t,d]Indirect block(s) has not allocated yet
@@ -547,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 				int indirect_blks, int blks,
 				ext4_fsblk_t new_blocks[4], int *err)
 {
+	struct ext4_allocation_request ar;
 	int target, i;
 	unsigned long count = 0, blk_allocated = 0;
 	int index = 0;
@@ -595,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 	if (!target)
 		goto allocated;
 	/* Now allocate data blocks */
-	count = target;
-	/* allocating blocks for data blocks */
-	current_block = ext4_new_blocks(handle, inode, iblock,
-						goal, &count, err);
+	memset(&ar, 0, sizeof(ar));
+	ar.inode = inode;
+	ar.goal = goal;
+	ar.len = target;
+	ar.logical = iblock;
+	if (S_ISREG(inode->i_mode))
+		/* enable in-core preallocation only for regular files */
+		ar.flags = EXT4_MB_HINT_DATA;
+
+	current_block = ext4_mb_new_blocks(handle, &ar, err);
+
 	if (*err && (target == blks)) {
 		/*
 		 * if the allocation failed and we didn't allocate
@@ -614,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		 */
 			new_blocks[index] = current_block;
 		}
-		blk_allocated += count;
+		blk_allocated += ar.len;
 	}
 allocated:
 	/* total number of blocks allocated for direct blocks */
@@ -709,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
 
-		BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-		err = ext4_journal_dirty_metadata(handle, bh);
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (err)
 			goto failed;
 	}
@@ -792,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
 		 */
 		jbd_debug(5, "splicing indirect only\n");
-		BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
-		err = ext4_journal_dirty_metadata(handle, where->bh);
+		BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, where->bh);
 		if (err)
 			goto err_out;
 	} else {
@@ -840,10 +856,10 @@ err_out:
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
  */
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-		ext4_lblk_t iblock, unsigned long maxblocks,
-		struct buffer_head *bh_result,
-		int create, int extend_disksize)
+static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+				  ext4_lblk_t iblock, unsigned int maxblocks,
+				  struct buffer_head *bh_result,
+				  int create, int extend_disksize)
 {
 	int err = -EIO;
 	ext4_lblk_t offsets[4];
@@ -1045,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  * It returns the error in case of allocation failure.
  */
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
-			unsigned long max_blocks, struct buffer_head *bh,
+			unsigned int max_blocks, struct buffer_head *bh,
 			int create, int extend_disksize, int flag)
 {
 	int retval;
@@ -1221,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 				set_buffer_uptodate(bh);
 			}
 			unlock_buffer(bh);
-			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-			err = ext4_journal_dirty_metadata(handle, bh);
+			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+			err = ext4_handle_dirty_metadata(handle, inode, bh);
 			if (!fatal)
 				fatal = err;
 		} else {
@@ -1335,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
  	pgoff_t index;
 	unsigned from, to;
 
+	trace_mark(ext4_write_begin,
+		   "dev %s ino %lu pos %llu len %u flags %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, flags);
  	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
@@ -1387,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 	if (!buffer_mapped(bh) || buffer_freed(bh))
 		return 0;
 	set_buffer_uptodate(bh);
-	return ext4_journal_dirty_metadata(handle, bh);
+	return ext4_handle_dirty_metadata(handle, NULL, bh);
 }
 
 /*
@@ -1406,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file,
 	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
 
+	trace_mark(ext4_ordered_write_end,
+		   "dev %s ino %lu pos %llu len %u copied %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, copied);
 	ret = ext4_jbd2_file_inode(handle, inode);
 
 	if (ret == 0) {
@@ -1444,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file,
 	int ret = 0, ret2;
 	loff_t new_i_size;
 
+	trace_mark(ext4_writeback_write_end,
+		   "dev %s ino %lu pos %llu len %u copied %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, copied);
 	new_i_size = pos + copied;
 	if (new_i_size > EXT4_I(inode)->i_disksize) {
 		ext4_update_i_disksize(inode, new_i_size);
@@ -1479,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file,
 	unsigned from, to;
 	loff_t new_i_size;
 
+	trace_mark(ext4_journalled_write_end,
+		   "dev %s ino %lu pos %llu len %u copied %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, copied);
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
@@ -1625,7 +1657,7 @@ struct mpage_da_data {
 	get_block_t *get_block;
 	struct writeback_control *wbc;
 	int io_done;
-	long pages_written;
+	int pages_written;
 	int retval;
 };
 
@@ -1645,35 +1677,39 @@ struct mpage_da_data {
  */
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
-	struct address_space *mapping = mpd->inode->i_mapping;
-	int ret = 0, err, nr_pages, i;
-	unsigned long index, end;
-	struct pagevec pvec;
 	long pages_skipped;
+	struct pagevec pvec;
+	unsigned long index, end;
+	int ret = 0, err, nr_pages, i;
+	struct inode *inode = mpd->inode;
+	struct address_space *mapping = inode->i_mapping;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
-	pagevec_init(&pvec, 0);
+	/*
+	 * We need to start from the first_page to the next_page - 1
+	 * to make sure we also write the mapped dirty buffer_heads.
+	 * If we look at mpd->lbh.b_blocknr we would only be looking
+	 * at the currently mapped buffer_heads.
+	 */
 	index = mpd->first_page;
 	end = mpd->next_page - 1;
 
+	pagevec_init(&pvec, 0);
 	while (index <= end) {
-		/*
-		 * We can use PAGECACHE_TAG_DIRTY lookup here because
-		 * even though we have cleared the dirty flag on the page
-		 * We still keep the page in the radix tree with tag
-		 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
-		 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
-		 * which is called via the below writepage callback.
-		 */
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					PAGECACHE_TAG_DIRTY,
-					min(end - index,
-					(pgoff_t)PAGEVEC_SIZE-1) + 1);
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
+			index = page->index;
+			if (index > end)
+				break;
+			index++;
+
+			BUG_ON(!PageLocked(page));
+			BUG_ON(PageWriteback(page));
+
 			pages_skipped = mpd->wbc->pages_skipped;
 			err = mapping->a_ops->writepage(page, mpd->wbc);
 			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
@@ -1831,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode)
 			ext4_count_free_blocks(inode->i_sb));
 	printk(KERN_EMERG "Free/Dirty block details\n");
 	printk(KERN_EMERG "free_blocks=%lld\n",
-			percpu_counter_sum(&sbi->s_freeblocks_counter));
+			(long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
 	printk(KERN_EMERG "dirty_blocks=%lld\n",
-			percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+			(long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
 	printk(KERN_EMERG "Block reservation details\n");
-	printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+	printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
 			EXT4_I(inode)->i_reserved_data_blocks);
-	printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+	printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
 			EXT4_I(inode)->i_reserved_meta_blocks);
 	return;
 }
@@ -2087,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page,
 		bh = head;
 		do {
 			BUG_ON(buffer_locked(bh));
+			/*
+			 * We need to try to allocate
+			 * unmapped blocks in the same page.
+			 * Otherwise we won't make progress
+			 * with the page in ext4_da_writepage
+			 */
 			if (buffer_dirty(bh) &&
 				(!buffer_mapped(bh) || buffer_delay(bh))) {
 				mpage_add_bh_to_extent(mpd, logical, bh);
 				if (mpd->io_done)
 					return MPAGE_DA_EXTENT_TAIL;
+			} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
+				/*
+				 * mapped dirty buffer. We need to update
+				 * the b_state because we look at
+				 * b_state in mpage_da_map_blocks. We don't
+				 * update b_size because if we find an
+				 * unmapped buffer_head later we need to
+				 * use the b_state flag of that buffer_head.
+				 */
+				if (mpd->lbh.b_size == 0)
+					mpd->lbh.b_state =
+						bh->b_state & BH_FLAGS;
 			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
@@ -2269,10 +2323,13 @@ static int ext4_da_writepage(struct page *page,
 {
 	int ret = 0;
 	loff_t size;
-	unsigned long len;
+	unsigned int len;
 	struct buffer_head *page_bufs;
 	struct inode *inode = page->mapping->host;
 
+	trace_mark(ext4_da_writepage,
+		   "dev %s ino %lu page_index %lu",
+		   inode->i_sb->s_id, inode->i_ino, page->index);
 	size = i_size_read(inode);
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -2378,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping,
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
 	int no_nrwrite_index_update;
-	long pages_written = 0, pages_skipped;
+	int pages_written = 0;
+	long pages_skipped;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
+	trace_mark(ext4_da_writepages,
+		   "dev %s ino %lu nr_t_write %ld "
+		   "pages_skipped %ld range_start %llu "
+		   "range_end %llu nonblocking %d "
+		   "for_kupdate %d for_reclaim %d "
+		   "for_writepages %d range_cyclic %d",
+		   inode->i_sb->s_id, inode->i_ino,
+		   wbc->nr_to_write, wbc->pages_skipped,
+		   (unsigned long long) wbc->range_start,
+		   (unsigned long long) wbc->range_end,
+		   wbc->nonblocking, wbc->for_kupdate,
+		   wbc->for_reclaim, wbc->for_writepages,
+		   wbc->range_cyclic);
+
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
 	 * a transaction for special inodes like journal inode on last iput()
@@ -2389,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping,
 	 */
 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
+
+	/*
+	 * If the filesystem has aborted, it is read-only, so return
+	 * right away instead of dumping stack traces later on that
+	 * will obscure the real source of the problem.  We test
+	 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
+	 * the latter could be true if the filesystem is mounted
+	 * read-only, and in that case, ext4_da_writepages should
+	 * *never* be called, so if that ever happens, we would want
+	 * the stack trace.
+	 */
+	if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
+		return -EROFS;
+
 	/*
 	 * Make sure nr_to_write is >= sbi->s_mb_stream_request
 	 * This make sure small files blocks are allocated in
@@ -2433,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 		handle = ext4_journal_start(inode, needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
-			printk(KERN_EMERG "%s: jbd2_start: "
+			printk(KERN_CRIT "%s: jbd2_start: "
 			       "%ld pages, ino %lu; err %d\n", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
 			dump_stack();
@@ -2486,6 +2572,14 @@ out_writepages:
 	if (!no_nrwrite_index_update)
 		wbc->no_nrwrite_index_update = 0;
 	wbc->nr_to_write -= nr_to_writebump;
+	trace_mark(ext4_da_writepage_result,
+		   "dev %s ino %lu ret %d pages_written %d "
+		   "pages_skipped %ld congestion %d "
+		   "more_io %d no_nrwrite_index_update %d",
+		   inode->i_sb->s_id, inode->i_ino, ret,
+		   pages_written, wbc->pages_skipped,
+		   wbc->encountered_congestion, wbc->more_io,
+		   wbc->no_nrwrite_index_update);
 	return ret;
 }
 
@@ -2498,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb)
 	/*
 	 * switch to non delalloc mode if we are running low
 	 * on free block. The free block accounting via percpu
-	 * counters can get slightly wrong with FBC_BATCH getting
+	 * counters can get slightly wrong with percpu_counter_batch getting
 	 * accumulated on each CPU without updating global counters
 	 * Delalloc need an accurate free block accounting. So switch
 	 * to non delalloc when we are near to error range.
@@ -2537,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 					len, flags, pagep, fsdata);
 	}
 	*fsdata = (void *)0;
+
+	trace_mark(ext4_da_write_begin,
+		   "dev %s ino %lu pos %llu len %u flags %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, flags);
 retry:
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
@@ -2626,6 +2725,10 @@ static int ext4_da_write_end(struct file *file,
 		}
 	}
 
+	trace_mark(ext4_da_write_end,
+		   "dev %s ino %lu pos %llu len %u copied %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, copied);
 	start = pos & (PAGE_CACHE_SIZE - 1);
 	end = start + copied - 1;
 
@@ -2718,7 +2821,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		filemap_write_and_wait(mapping);
 	}
 
-	if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+	BUG_ON(!EXT4_JOURNAL(inode) &&
+	       EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
+
+	if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
 		 * bmap on dirty files is expected to be extremely rare:
@@ -2836,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page,
 	loff_t size = i_size_read(inode);
 	loff_t len;
 
+	trace_mark(ext4_normal_writepage,
+		   "dev %s ino %lu page_index %lu",
+		   inode->i_sb->s_id, inode->i_ino, page->index);
 	J_ASSERT(PageLocked(page));
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -2921,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page,
 	loff_t size = i_size_read(inode);
 	loff_t len;
 
+	trace_mark(ext4_journalled_writepage,
+		   "dev %s ino %lu page_index %lu",
+		   inode->i_sb->s_id, inode->i_ino, page->index);
 	J_ASSERT(PageLocked(page));
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -2989,7 +3101,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
 	if (offset == 0)
 		ClearPageChecked(page);
 
-	jbd2_journal_invalidatepage(journal, page, offset);
+	if (journal)
+		jbd2_journal_invalidatepage(journal, page, offset);
+	else
+		block_invalidatepage(page, offset);
 }
 
 static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2999,7 +3114,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 	WARN_ON(PageChecked(page));
 	if (!page_has_buffers(page))
 		return 0;
-	return jbd2_journal_try_to_free_buffers(journal, page, wait);
+	if (journal)
+		return jbd2_journal_try_to_free_buffers(journal, page, wait);
+	else
+		return try_to_free_buffers(page);
 }
 
 /*
@@ -3271,7 +3389,7 @@ int ext4_block_truncate_page(handle_t *handle,
 
 	err = 0;
 	if (ext4_should_journal_data(inode)) {
-		err = ext4_journal_dirty_metadata(handle, bh);
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
 	} else {
 		if (ext4_should_order_data(inode))
 			err = ext4_jbd2_file_inode(handle, inode);
@@ -3395,8 +3513,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 	__le32 *p;
 	if (try_to_extend_transaction(handle, inode)) {
 		if (bh) {
-			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-			ext4_journal_dirty_metadata(handle, bh);
+			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+			ext4_handle_dirty_metadata(handle, inode, bh);
 		}
 		ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_test_restart(handle, inode);
@@ -3496,7 +3614,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 				  count, block_to_free_p, p);
 
 	if (this_bh) {
-		BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
+		BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
 
 		/*
 		 * The buffer head should have an attached journal head at this
@@ -3505,7 +3623,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 		 * the block was cleared. Check for this instead of OOPSing.
 		 */
 		if (bh2jh(this_bh))
-			ext4_journal_dirty_metadata(handle, this_bh);
+			ext4_handle_dirty_metadata(handle, inode, this_bh);
 		else
 			ext4_error(inode->i_sb, __func__,
 				   "circular indirect block detected, "
@@ -3535,7 +3653,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t nr;
 	__le32 *p;
 
-	if (is_handle_aborted(handle))
+	if (ext4_handle_is_aborted(handle))
 		return;
 
 	if (depth--) {
@@ -3605,7 +3723,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			 * will merely complain about releasing a free block,
 			 * rather than leaking blocks.
 			 */
-			if (is_handle_aborted(handle))
+			if (ext4_handle_is_aborted(handle))
 				return;
 			if (try_to_extend_transaction(handle, inode)) {
 				ext4_mark_inode_dirty(handle, inode);
@@ -3624,9 +3742,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 								   parent_bh)){
 					*p = 0;
 					BUFFER_TRACE(parent_bh,
-					"call ext4_journal_dirty_metadata");
-					ext4_journal_dirty_metadata(handle,
-								    parent_bh);
+					"call ext4_handle_dirty_metadata");
+					ext4_handle_dirty_metadata(handle,
+								   inode,
+								   parent_bh);
 				}
 			}
 		}
@@ -3814,7 +3933,7 @@ do_indirects:
 	 * synchronous
 	 */
 	if (IS_SYNC(inode))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 out_stop:
 	/*
 	 * If this was a simple ftruncate(), and the file will remain alive
@@ -3844,7 +3963,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 	ext4_fsblk_t		block;
 	int			inodes_per_block, inode_offset;
 
-	iloc->bh = 0;
+	iloc->bh = NULL;
 	if (!ext4_valid_inum(sb, inode->i_ino))
 		return -EIO;
 
@@ -3951,7 +4070,7 @@ make_io:
 			num = EXT4_INODES_PER_GROUP(sb);
 			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
-				num -= le16_to_cpu(gdp->bg_itable_unused);
+				num -= ext4_itable_unused_count(sb, gdp);
 			table += num / inodes_per_block;
 			if (end > table)
 				end = table;
@@ -4313,8 +4432,8 @@ static int ext4_do_update_inode(handle_t *handle,
 			EXT4_SET_RO_COMPAT_FEATURE(sb,
 					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 			sb->s_dirt = 1;
-			handle->h_sync = 1;
-			err = ext4_journal_dirty_metadata(handle,
+			ext4_handle_sync(handle);
+			err = ext4_handle_dirty_metadata(handle, inode,
 					EXT4_SB(sb)->s_sbh);
 		}
 	}
@@ -4341,9 +4460,8 @@ static int ext4_do_update_inode(handle_t *handle,
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
 	}
 
-
-	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-	rc = ext4_journal_dirty_metadata(handle, bh);
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	rc = ext4_handle_dirty_metadata(handle, inode, bh);
 	if (!err)
 		err = rc;
 	ei->i_state &= ~EXT4_STATE_NEW;
@@ -4406,6 +4524,25 @@ int ext4_write_inode(struct inode *inode, int wait)
 	return ext4_force_commit(inode->i_sb);
 }
 
+int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
+{
+	int err = 0;
+
+	mark_buffer_dirty(bh);
+	if (inode && inode_needs_sync(inode)) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+			ext4_error(inode->i_sb, __func__,
+				   "IO error syncing inode, "
+				   "inode=%lu, block=%llu",
+				   inode->i_ino,
+				   (unsigned long long)bh->b_blocknr);
+			err = -EIO;
+		}
+	}
+	return err;
+}
+
 /*
  * ext4_setattr()
  *
@@ -4710,16 +4847,15 @@ int
 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 			 struct ext4_iloc *iloc)
 {
-	int err = 0;
-	if (handle) {
-		err = ext4_get_inode_loc(inode, iloc);
-		if (!err) {
-			BUFFER_TRACE(iloc->bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle, iloc->bh);
-			if (err) {
-				brelse(iloc->bh);
-				iloc->bh = NULL;
-			}
+	int err;
+
+	err = ext4_get_inode_loc(inode, iloc);
+	if (!err) {
+		BUFFER_TRACE(iloc->bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, iloc->bh);
+		if (err) {
+			brelse(iloc->bh);
+			iloc->bh = NULL;
 		}
 	}
 	ext4_std_error(inode->i_sb, err);
@@ -4791,7 +4927,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 
 	might_sleep();
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
-	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+	if (ext4_handle_valid(handle) &&
+	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
 	    !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
 		/*
 		 * We need extra buffer credits since we may write into EA block
@@ -4843,6 +4980,11 @@ void ext4_dirty_inode(struct inode *inode)
 	handle_t *current_handle = ext4_journal_current_handle();
 	handle_t *handle;
 
+	if (!ext4_handle_valid(current_handle)) {
+		ext4_mark_inode_dirty(current_handle, inode);
+		return;
+	}
+
 	handle = ext4_journal_start(inode, 2);
 	if (IS_ERR(handle))
 		goto out;
@@ -4880,8 +5022,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
 			BUFFER_TRACE(iloc.bh, "get_write_access");
 			err = jbd2_journal_get_write_access(handle, iloc.bh);
 			if (!err)
-				err = ext4_journal_dirty_metadata(handle,
-								  iloc.bh);
+				err = ext4_handle_dirty_metadata(handle,
+								 inode,
+								 iloc.bh);
 			brelse(iloc.bh);
 		}
 	}
@@ -4907,6 +5050,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	 */
 
 	journal = EXT4_JOURNAL(inode);
+	if (!journal)
+		return 0;
 	if (is_journal_aborted(journal))
 		return -EROFS;
 
@@ -4936,7 +5081,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		return PTR_ERR(handle);
 
 	err = ext4_mark_inode_dirty(handle, inode);
-	handle->h_sync = 1;
+	ext4_handle_sync(handle);
 	ext4_journal_stop(handle);
 	ext4_std_error(inode->i_sb, err);
 
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dc99b4776d58..42dc83fb247a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			goto flags_out;
 		}
 		if (IS_SYNC(inode))
-			handle->h_sync = 1;
+			ext4_handle_sync(handle);
 		err = ext4_reserve_inode_write(handle, inode, &iloc);
 		if (err)
 			goto flags_err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 444ad998f72e..918aec0c8a11 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
  * inode as:
  *
  *  {                        page                        }
- *  [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
  *
  *
  * one block each for bitmap and buddy information.  So for each group we
@@ -330,6 +330,18 @@
  *        object
  *
  */
+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+					ext4_group_t group);
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+						ext4_group_t group);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+
+
 
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
@@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 			blocknr += first + i;
 			blocknr +=
 			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
-			ext4_error(sb, __func__, "double-free of inode"
-				   " %lu's block %llu(bit %u in group %lu)\n",
+			ext4_grp_locked_error(sb, e4b->bd_group,
+				   __func__, "double-free of inode"
+				   " %lu's block %llu(bit %u in group %u)",
 				   inode ? inode->i_ino : 0, blocknr,
 				   first + i, e4b->bd_group);
 		}
@@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 		b2 = (unsigned char *) bitmap;
 		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 			if (b1[i] != b2[i]) {
-				printk(KERN_ERR "corruption in group %lu "
+				printk(KERN_ERR "corruption in group %u "
 				       "at byte %u(%u): %x in copy != %x "
 				       "on disk/prealloc\n",
 				       e4b->bd_group, i, i * 8, b1[i], b2[i]);
@@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 	grp->bb_fragments = fragments;
 
 	if (free != grp->bb_free) {
-		ext4_error(sb, __func__,
-			"EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
+		ext4_grp_locked_error(sb, group,  __func__,
+			"EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
 			group, free, grp->bb_free);
 		/*
 		 * If we intent to continue, we consider group descritor
@@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
  * stored in the inode as
  *
  * {                        page                        }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
  *
  *
  * one block each for bitmap and buddy information.
@@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		if (bh[i] == NULL)
 			goto out;
 
-		if (buffer_uptodate(bh[i]) &&
-		    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+		if (bitmap_uptodate(bh[i]))
 			continue;
 
 		lock_buffer(bh[i]);
+		if (bitmap_uptodate(bh[i])) {
+			unlock_buffer(bh[i]);
+			continue;
+		}
 		spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			ext4_init_block_bitmap(sb, bh[i],
 						first_group + i, desc);
+			set_bitmap_uptodate(bh[i]);
 			set_buffer_uptodate(bh[i]);
-			unlock_buffer(bh[i]);
 			spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+			unlock_buffer(bh[i]);
 			continue;
 		}
 		spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+		if (buffer_uptodate(bh[i])) {
+			/*
+			 * if not uninit if bh is uptodate,
+			 * bitmap is also uptodate
+			 */
+			set_bitmap_uptodate(bh[i]);
+			unlock_buffer(bh[i]);
+			continue;
+		}
 		get_bh(bh[i]);
+		/*
+		 * submit the buffer_head for read. We can
+		 * safely mark the bitmap as uptodate now.
+		 * We do it here so the bitmap uptodate bit
+		 * get set with buffer lock held.
+		 */
+		set_bitmap_uptodate(bh[i]);
 		bh[i]->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh[i]);
-		mb_debug("read bitmap for group %lu\n", first_group + i);
+		mb_debug("read bitmap for group %u\n", first_group + i);
 	}
 
 	/* wait for I/O completion */
@@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 	err = 0;
 	first_block = page->index * blocks_per_page;
+	/* init the page  */
+	memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
 	for (i = 0; i < blocks_per_page; i++) {
 		int group;
 		struct ext4_group_info *grinfo;
@@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore == NULL);
 			mb_debug("put buddy for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
-			memset(data, 0xff, blocksize);
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
 			memset(grinfo->bb_counters, 0,
@@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			/*
 			 * incore got set to the group block bitmap below
 			 */
+			ext4_lock_group(sb, group);
 			ext4_mb_generate_buddy(sb, data, incore, group);
+			ext4_unlock_group(sb, group);
 			incore = NULL;
 		} else {
 			/* this is block of bitmap */
@@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 			/* mark all preallocated blks used in in-core bitmap */
 			ext4_mb_generate_from_pa(sb, data, group);
+			ext4_mb_generate_from_freelist(sb, data, group);
 			ext4_unlock_group(sb, group);
 
 			/* set incore so that the buddy information can be
@@ -886,18 +922,20 @@ static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
 {
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct inode *inode = sbi->s_buddy_cache;
 	int blocks_per_page;
 	int block;
 	int pnum;
 	int poff;
 	struct page *page;
 	int ret;
+	struct ext4_group_info *grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
 
-	mb_debug("load group %lu\n", group);
+	mb_debug("load group %u\n", group);
 
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	grp = ext4_get_group_info(sb, group);
 
 	e4b->bd_blkbits = sb->s_blocksize_bits;
 	e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	e4b->bd_group = group;
 	e4b->bd_buddy_page = NULL;
 	e4b->bd_bitmap_page = NULL;
+	e4b->alloc_semp = &grp->alloc_sem;
+
+	/* Take the read lock on the group alloc
+	 * sem. This would make sure a parallel
+	 * ext4_mb_init_group happening on other
+	 * groups mapped by the page is blocked
+	 * till we are done with allocation
+	 */
+	down_read(e4b->alloc_semp);
 
 	/*
 	 * the buddy cache inode stores the block bitmap
@@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	page = find_get_page(inode->i_mapping, pnum);
 	if (page == NULL || !PageUptodate(page)) {
 		if (page)
+			/*
+			 * drop the page reference and try
+			 * to get the page with lock. If we
+			 * are not uptodate that implies
+			 * somebody just created the page but
+			 * is yet to initialize the same. So
+			 * wait for it to initialize.
+			 */
 			page_cache_release(page);
 		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
 		if (page) {
@@ -985,6 +1040,9 @@ err:
 		page_cache_release(e4b->bd_buddy_page);
 	e4b->bd_buddy = NULL;
 	e4b->bd_bitmap = NULL;
+
+	/* Done with the buddy cache */
+	up_read(e4b->alloc_semp);
 	return ret;
 }
 
@@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
 		page_cache_release(e4b->bd_bitmap_page);
 	if (e4b->bd_buddy_page)
 		page_cache_release(e4b->bd_buddy_page);
+	/* Done with the buddy cache */
+	if (e4b->alloc_semp)
+		up_read(e4b->alloc_semp);
 }
 
 
@@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
 			cur += 32;
 			continue;
 		}
-		mb_clear_bit_atomic(lock, cur, bm);
+		if (lock)
+			mb_clear_bit_atomic(lock, cur, bm);
+		else
+			mb_clear_bit(cur, bm);
 		cur++;
 	}
 }
@@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
 			cur += 32;
 			continue;
 		}
-		mb_set_bit_atomic(lock, cur, bm);
+		if (lock)
+			mb_set_bit_atomic(lock, cur, bm);
+		else
+			mb_set_bit(cur, bm);
 		cur++;
 	}
 }
@@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			blocknr += block;
 			blocknr +=
 			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-			ext4_unlock_group(sb, e4b->bd_group);
-			ext4_error(sb, __func__, "double-free of inode"
-				   " %lu's block %llu(bit %u in group %lu)\n",
+			ext4_grp_locked_error(sb, e4b->bd_group,
+				   __func__, "double-free of inode"
+				   " %lu's block %llu(bit %u in group %u)",
 				   inode ? inode->i_ino : 0, blocknr, block,
 				   e4b->bd_group);
-			ext4_lock_group(sb, e4b->bd_group);
 		}
 		mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
 		e4b->bd_info->bb_counters[order]++;
@@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 	ac->ac_tail = ret & 0xffff;
 	ac->ac_buddy = ret >> 16;
 
-	/* XXXXXXX: SUCH A HORRIBLE **CK */
-	/*FIXME!! Why ? */
+	/*
+	 * take the page reference. We want the page to be pinned
+	 * so that we don't get a ext4_mb_init_cache_call for this
+	 * group until we update the bitmap. That would mean we
+	 * double allocate blocks. The reference is dropped
+	 * in ext4_mb_release_context
+	 */
 	ac->ac_bitmap_page = e4b->bd_bitmap_page;
 	get_page(ac->ac_bitmap_page);
 	ac->ac_buddy_page = e4b->bd_buddy_page;
 	get_page(ac->ac_buddy_page);
-
+	/* on allocation we use ac to track the held semaphore */
+	ac->alloc_semp =  e4b->alloc_semp;
+	e4b->alloc_semp = NULL;
 	/* store last allocated for subsequent stream allocation */
 	if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
 		spin_lock(&sbi->s_md_lock);
@@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
 	struct ext4_free_extent ex;
 	int max;
 
+	if (ac->ac_status == AC_STATUS_FOUND)
+		return;
 	/*
 	 * We don't want to scan for a whole year
 	 */
@@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 			 * free blocks even though group info says we
 			 * we have free blocks
 			 */
-			ext4_error(sb, __func__, "%d free blocks as per "
-					"group info. But bitmap says 0\n",
+			ext4_grp_locked_error(sb, e4b->bd_group,
+					__func__, "%d free blocks as per "
+					"group info. But bitmap says 0",
 					free);
 			break;
 		}
@@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 		mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
 		BUG_ON(ex.fe_len <= 0);
 		if (free < ex.fe_len) {
-			ext4_error(sb, __func__, "%d free blocks as per "
-					"group info. But got %d blocks\n",
+			ext4_grp_locked_error(sb, e4b->bd_group,
+					__func__, "%d free blocks as per "
+					"group info. But got %d blocks",
 					free, ex.fe_len);
 			/*
 			 * The number of free blocks differs. This mostly
@@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
+/*
+ * lock the group_info alloc_sem of all the groups
+ * belonging to the same buddy cache page. This
+ * make sure other parallel operation on the buddy
+ * cache doesn't happen  whild holding the buddy cache
+ * lock
+ */
+int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
+{
+	int i;
+	int block, pnum;
+	int blocks_per_page;
+	int groups_per_page;
+	ext4_group_t first_group;
+	struct ext4_group_info *grp;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	first_group = pnum * blocks_per_page / 2;
+
+	groups_per_page = blocks_per_page >> 1;
+	if (groups_per_page == 0)
+		groups_per_page = 1;
+	/* read all groups the page covers into the cache */
+	for (i = 0; i < groups_per_page; i++) {
+
+		if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+			break;
+		grp = ext4_get_group_info(sb, first_group + i);
+		/* take all groups write allocation
+		 * semaphore. This make sure there is
+		 * no block allocation going on in any
+		 * of that groups
+		 */
+		down_write_nested(&grp->alloc_sem, i);
+	}
+	return i;
+}
+
+void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+					ext4_group_t group, int locked_group)
+{
+	int i;
+	int block, pnum;
+	int blocks_per_page;
+	ext4_group_t first_group;
+	struct ext4_group_info *grp;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	first_group = pnum * blocks_per_page / 2;
+	/* release locks on all the groups */
+	for (i = 0; i < locked_group; i++) {
+
+		grp = ext4_get_group_info(sb, first_group + i);
+		/* take all groups write allocation
+		 * semaphore. This make sure there is
+		 * no block allocation going on in any
+		 * of that groups
+		 */
+		up_write(&grp->alloc_sem);
+	}
+
+}
+
+static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+	int ret;
+	void *bitmap;
+	int blocks_per_page;
+	int block, pnum, poff;
+	int num_grp_locked = 0;
+	struct ext4_group_info *this_grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
+	struct page *page = NULL, *bitmap_page = NULL;
+
+	mb_debug("init group %lu\n", group);
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	this_grp = ext4_get_group_info(sb, group);
+	/*
+	 * This ensures we don't add group
+	 * to this buddy cache via resize
+	 */
+	num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
+	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+		/*
+		 * somebody initialized the group
+		 * return without doing anything
+		 */
+		ret = 0;
+		goto err;
+	}
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (page) {
+		BUG_ON(page->mapping != inode->i_mapping);
+		ret = ext4_mb_init_cache(page, NULL);
+		if (ret) {
+			unlock_page(page);
+			goto err;
+		}
+		unlock_page(page);
+	}
+	if (page == NULL || !PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+	mark_page_accessed(page);
+	bitmap_page = page;
+	bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+	/* init buddy cache */
+	block++;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (page == bitmap_page) {
+		/*
+		 * If both the bitmap and buddy are in
+		 * the same page we don't need to force
+		 * init the buddy
+		 */
+		unlock_page(page);
+	} else if (page) {
+		BUG_ON(page->mapping != inode->i_mapping);
+		ret = ext4_mb_init_cache(page, bitmap);
+		if (ret) {
+			unlock_page(page);
+			goto err;
+		}
+		unlock_page(page);
+	}
+	if (page == NULL || !PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+	mark_page_accessed(page);
+err:
+	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+	if (bitmap_page)
+		page_cache_release(bitmap_page);
+	if (page)
+		page_cache_release(page);
+	return ret;
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -1775,7 +2019,7 @@ repeat:
 				group = 0;
 
 			/* quick check to skip empty groups */
-			grp = ext4_get_group_info(ac->ac_sb, group);
+			grp = ext4_get_group_info(sb, group);
 			if (grp->bb_free == 0)
 				continue;
 
@@ -1788,10 +2032,9 @@ repeat:
 				 * we need full data about the group
 				 * to make a good selection
 				 */
-				err = ext4_mb_load_buddy(sb, group, &e4b);
+				err = ext4_mb_init_group(sb, group);
 				if (err)
 					goto out;
-				ext4_mb_release_desc(&e4b);
 			}
 
 			/*
@@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
 	if (hs->op == EXT4_MB_HISTORY_ALLOC) {
 		fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
 			"%-5u %-5s %-5u %-6u\n";
-		sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+		sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
 			hs->result.fe_start, hs->result.fe_len,
 			hs->result.fe_logical);
-		sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+		sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
 			hs->orig.fe_start, hs->orig.fe_len,
 			hs->orig.fe_logical);
-		sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group,
+		sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
 			hs->goal.fe_start, hs->goal.fe_len,
 			hs->goal.fe_logical);
 		seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
@@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
 				hs->buddy ? 1 << hs->buddy : 0);
 	} else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
 		fmt = "%-5u %-8u %-23s %-23s %-23s\n";
-		sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+		sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
 			hs->result.fe_start, hs->result.fe_len,
 			hs->result.fe_logical);
-		sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+		sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
 			hs->orig.fe_start, hs->orig.fe_len,
 			hs->orig.fe_logical);
 		seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
 	} else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
-		sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+		sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
 			hs->result.fe_start, hs->result.fe_len);
 		seq_printf(seq, "%-5u %-8u %-23s discard\n",
 				hs->pid, hs->ino, buf2);
 	} else if (hs->op == EXT4_MB_HISTORY_FREE) {
-		sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+		sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
 			hs->result.fe_start, hs->result.fe_len);
 		seq_printf(seq, "%-5u %-8u %-23s free\n",
 				hs->pid, hs->ino, buf2);
@@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 		return NULL;
 
 	group = *pos + 1;
-	return (void *) group;
+	return (void *) ((unsigned long) group);
 }
 
 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (*pos < 0 || *pos >= sbi->s_groups_count)
 		return NULL;
 	group = *pos + 1;
-	return (void *) group;;
+	return (void *) ((unsigned long) group);
 }
 
 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 {
 	struct super_block *sb = seq->private;
-	long group = (long) v;
+	ext4_group_t group = (ext4_group_t) ((unsigned long) v);
 	int i;
 	int err;
 	struct ext4_buddy e4b;
@@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 		sizeof(struct ext4_group_info);
 	err = ext4_mb_load_buddy(sb, group, &e4b);
 	if (err) {
-		seq_printf(seq, "#%-5lu: I/O error\n", group);
+		seq_printf(seq, "#%-5u: I/O error\n", group);
 		return 0;
 	}
 	ext4_lock_group(sb, group);
@@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	ext4_unlock_group(sb, group);
 	ext4_mb_release_desc(&e4b);
 
-	seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
 			sg.info.bb_fragments, sg.info.bb_first_free);
 	for (i = 0; i <= 13; i++)
 		seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			ext4_free_blocks_after_init(sb, group, desc);
 	} else {
 		meta_group_info[i]->bb_free =
-			le16_to_cpu(desc->bg_free_blocks_count);
+			ext4_free_blks_count(sb, desc);
 	}
 
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root.rb_node = NULL;;
 
 #ifdef DOUBLE_CHECK
@@ -2327,54 +2571,6 @@ exit_meta_group_info:
 } /* ext4_mb_add_groupinfo */
 
 /*
- * Add a group to the existing groups.
- * This function is used for online resize
- */
-int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
-			       struct ext4_group_desc *desc)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct inode *inode = sbi->s_buddy_cache;
-	int blocks_per_page;
-	int block;
-	int pnum;
-	struct page *page;
-	int err;
-
-	/* Add group based on group descriptor*/
-	err = ext4_mb_add_groupinfo(sb, group, desc);
-	if (err)
-		return err;
-
-	/*
-	 * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
-	 * datas) are set not up to date so that they will be re-initilaized
-	 * during the next call to ext4_mb_load_buddy
-	 */
-
-	/* Set buddy page as not up to date */
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	page = find_get_page(inode->i_mapping, pnum);
-	if (page != NULL) {
-		ClearPageUptodate(page);
-		page_cache_release(page);
-	}
-
-	/* Set bitmap page as not up to date */
-	block++;
-	pnum = block / blocks_per_page;
-	page = find_get_page(inode->i_mapping, pnum);
-	if (page != NULL) {
-		ClearPageUptodate(page);
-		page_cache_release(page);
-	}
-
-	return 0;
-}
-
-/*
  * Update an existing group.
  * This function is used for online resize
  */
@@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		desc = ext4_get_group_desc(sb, i, NULL);
 		if (desc == NULL) {
 			printk(KERN_ERR
-				"EXT4-fs: can't read descriptor %lu\n", i);
+				"EXT4-fs: can't read descriptor %u\n", i);
 			goto err_freebuddy;
 		}
 		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	if (sbi->s_mb_offsets == NULL) {
 		return -ENOMEM;
 	}
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
 		kfree(sbi->s_mb_maxs);
@@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	ext4_mb_init_per_dev_proc(sb);
 	ext4_mb_history_init(sb);
 
-	sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+	if (sbi->s_journal)
+		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
 
 	printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
 	return 0;
@@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	list_for_each_safe(l, ltmp, &txn->t_private_list) {
 		entry = list_entry(l, struct ext4_free_data, list);
 
-		mb_debug("gonna free %u blocks in group %lu (0x%p):",
+		mb_debug("gonna free %u blocks in group %u (0x%p):",
 			 entry->count, entry->group, entry);
 
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
 			+ entry->start_blk
 			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-		trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
-			   (unsigned long long) discard_block, entry->count);
+		trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
+			   sb->s_id, (unsigned long long) discard_block,
+			   entry->count);
 		sb_issue_discard(sb, discard_block, entry->count);
 
 		kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void)
  */
 static noinline_for_stack int
 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
-				handle_t *handle, unsigned long reserv_blks)
+				handle_t *handle, unsigned int reserv_blks)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_super_block *es;
@@ -2824,7 +3024,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	if (!gdp)
 		goto out_err;
 
-	ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
+	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
 			gdp->bg_free_blocks_count);
 
 	err = ext4_journal_get_write_access(handle, gdp_bh);
@@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	    in_range(block + len - 1, ext4_inode_table(sb, gdp),
 		     EXT4_SB(sb)->s_itb_per_group)) {
 		ext4_error(sb, __func__,
-			   "Allocating block in system zone - block = %llu",
-			   block);
+			   "Allocating block %llu in system zone of %d group\n",
+			   block, ac->ac_b_ex.fe_group);
 		/* File system mounted not to panic on error
 		 * Fix the bitmap and repeat the block allocation
 		 * We leak some of the blocks here.
@@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
 				bitmap_bh->b_data, ac->ac_b_ex.fe_start,
 				ac->ac_b_ex.fe_len);
-		err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 		if (!err)
 			err = -EAGAIN;
 		goto out_err;
@@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		}
 	}
 #endif
-	mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
-				ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
-
 	spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+	mb_set_bits(NULL, bitmap_bh->b_data,
+				ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
 	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-		gdp->bg_free_blocks_count =
-			cpu_to_le16(ext4_free_blocks_after_init(sb,
-						ac->ac_b_ex.fe_group,
-						gdp));
+		ext4_free_blks_set(sb, gdp,
+					ext4_free_blocks_after_init(sb,
+					ac->ac_b_ex.fe_group, gdp));
 	}
-	le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
+	len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
+	ext4_free_blks_set(sb, gdp, len);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
 	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		spin_unlock(sb_bgl_lock(sbi, flex_group));
 	}
 
-	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 	if (err)
 		goto out_err;
-	err = ext4_journal_dirty_metadata(handle, gdp_bh);
+	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
 
 out_err:
 	sb->s_dirt = 1;
@@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	/* check we don't cross already preallocated blocks */
 	rcu_read_lock();
 	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
-		unsigned long pa_end;
+		ext4_lblk_t pa_end;
 
 		if (pa->pa_deleted)
 			continue;
@@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	/* XXX: extra loop to check we really don't overlap preallocations */
 	rcu_read_lock();
 	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
-		unsigned long pa_end;
+		ext4_lblk_t pa_end;
 		spin_lock(&pa->pa_lock);
 		if (pa->pa_deleted == 0) {
 			pa_end = pa->pa_lstart + pa->pa_len;
@@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 }
 
 /*
+ * the function goes through all block freed in the group
+ * but not yet committed and marks them used in in-core bitmap.
+ * buddy must be generated from this bitmap
+ * Need to be called with ext4 group lock (ext4_lock_group)
+ */
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+						ext4_group_t group)
+{
+	struct rb_node *n;
+	struct ext4_group_info *grp;
+	struct ext4_free_data *entry;
+
+	grp = ext4_get_group_info(sb, group);
+	n = rb_first(&(grp->bb_free_root));
+
+	while (n) {
+		entry = rb_entry(n, struct ext4_free_data, node);
+		mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+				bitmap, entry->start_blk,
+				entry->count);
+		n = rb_next(n);
+	}
+	return;
+}
+
+/*
  * the function goes through all preallocation in this group and marks them
  * used in in-core bitmap. buddy must be generated from this bitmap
  * Need to be called with ext4 group lock (ext4_lock_group)
@@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 		preallocated += len;
 		count++;
 	}
-	mb_debug("prellocated %u for group %lu\n", preallocated, group);
+	mb_debug("prellocated %u for group %u\n", preallocated, group);
 }
 
 static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
 static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 			struct super_block *sb, struct ext4_prealloc_space *pa)
 {
-	unsigned long grp;
+	ext4_group_t grp;
 
 	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
 		return;
@@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 
 	mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+	trace_mark(ext4_mb_new_inode_pa,
+		   "dev %s ino %lu pstart %llu len %u lstart %u",
+		   sb->s_id, ac->ac_inode->i_ino,
+		   pa->pa_pstart, pa->pa_len, pa->pa_lstart);
 
 	ext4_mb_use_inode_pa(ac, pa);
 	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	pa->pa_linear = 1;
 
 	mb_debug("new group pa %p: %llu/%u for %u\n", pa,
-			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+		 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+	trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
+		   sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
 
 	ext4_mb_use_group_pa(ac, pa);
 	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 {
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	unsigned long end;
-	unsigned long next;
+	unsigned int end;
+	unsigned int next;
 	ext4_group_t group;
 	ext4_grpblk_t bit;
+	unsigned long long grp_blk_start;
 	sector_t start;
 	int err = 0;
 	int free = 0;
 
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+	grp_blk_start = pa->pa_pstart - bit;
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	end = bit + pa->pa_len;
 
@@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			ext4_mb_store_history(ac);
 		}
 
+		trace_mark(ext4_mb_release_inode_pa,
+			   "dev %s ino %lu block %llu count %u",
+			   sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
+			   next - bit);
 		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
 		bit = next + 1;
 	}
@@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			pa, (unsigned long) pa->pa_lstart,
 			(unsigned long) pa->pa_pstart,
 			(unsigned long) pa->pa_len);
-		ext4_error(sb, __func__, "free %u, pa_free %u\n",
-						free, pa->pa_free);
+		ext4_grp_locked_error(sb, group,
+					__func__, "free %u, pa_free %u",
+					free, pa->pa_free);
 		/*
 		 * pa is already deleted so we use the value obtained
 		 * from the bitmap and continue.
@@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
 	if (ac)
 		ac->ac_op = EXT4_MB_HISTORY_DISCARD;
 
+	trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
+		   sb->s_id, pa->pa_pstart, pa->pa_len);
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	int busy = 0;
 	int free = 0;
 
-	mb_debug("discard preallocation for group %lu\n", group);
+	mb_debug("discard preallocation for group %u\n", group);
 
 	if (list_empty(&grp->bb_prealloc_list))
 		return 0;
@@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	bitmap_bh = ext4_read_block_bitmap(sb, group);
 	if (bitmap_bh == NULL) {
 		ext4_error(sb, __func__, "Error in reading block "
-				"bitmap for %lu\n", group);
+				"bitmap for %u", group);
 		return 0;
 	}
 
 	err = ext4_mb_load_buddy(sb, group, &e4b);
 	if (err) {
 		ext4_error(sb, __func__, "Error in loading buddy "
-				"information for %lu\n", group);
+				"information for %u", group);
 		put_bh(bitmap_bh);
 		return 0;
 	}
@@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode)
 	}
 
 	mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+	trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
+		   inode->i_ino);
 
 	INIT_LIST_HEAD(&list);
 
@@ -3874,14 +4116,14 @@ repeat:
 		err = ext4_mb_load_buddy(sb, group, &e4b);
 		if (err) {
 			ext4_error(sb, __func__, "Error in loading buddy "
-					"information for %lu\n", group);
+					"information for %u", group);
 			continue;
 		}
 
 		bitmap_bh = ext4_read_block_bitmap(sb, group);
 		if (bitmap_bh == NULL) {
 			ext4_error(sb, __func__, "Error in reading block "
-					"bitmap for %lu\n", group);
+					"bitmap for %u", group);
 			ext4_mb_release_desc(&e4b);
 			continue;
 		}
@@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	ext4_group_t group;
-	unsigned long len;
-	unsigned long goal;
+	unsigned int len;
+	ext4_fsblk_t goal;
 	ext4_grpblk_t block;
 
 	/* we can't allocate > group size */
@@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	ac->ac_pa = NULL;
 	ac->ac_bitmap_page = NULL;
 	ac->ac_buddy_page = NULL;
+	ac->alloc_semp = NULL;
 	ac->ac_lg = NULL;
 
 	/* we have to define context: we'll we work with a file or
@@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
 		if (ext4_mb_load_buddy(sb, group, &e4b)) {
 			ext4_error(sb, __func__, "Error in loading buddy "
-					"information for %lu\n", group);
+					"information for %u", group);
 			continue;
 		}
 		ext4_lock_group(sb, group);
@@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 		}
 		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
+	if (ac->alloc_semp)
+		up_read(ac->alloc_semp);
 	if (ac->ac_bitmap_page)
 		page_cache_release(ac->ac_bitmap_page);
 	if (ac->ac_buddy_page)
@@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 	int ret;
 	int freed = 0;
 
+	trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
+		   sb->s_id, needed);
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
 		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
 		freed += ret;
@@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	ext4_fsblk_t block = 0;
-	unsigned long inquota;
-	unsigned long reserv_blks = 0;
+	unsigned int inquota;
+	unsigned int reserv_blks = 0;
 
 	sb = ar->inode->i_sb;
 	sbi = EXT4_SB(sb);
 
+	trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
+		   "lblk %llu goal %llu lleft %llu lright %llu "
+		   "pleft %llu pright %llu ",
+		   sb->s_id, ar->flags, ar->len,
+		   ar->inode ? ar->inode->i_ino : 0,
+		   (unsigned long long) ar->logical,
+		   (unsigned long long) ar->goal,
+		   (unsigned long long) ar->lleft,
+		   (unsigned long long) ar->lright,
+		   (unsigned long long) ar->pleft,
+		   (unsigned long long) ar->pright);
+
 	if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
 		/*
 		 * With delalloc we already reserved the blocks
@@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	}
 	if (ar->len == 0) {
 		*errp = -EDQUOT;
-		return 0;
+		goto out3;
 	}
 	inquota = ar->len;
 
@@ -4348,10 +4607,14 @@ repeat:
 				ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
 			ext4_mb_new_preallocation(ac);
 	}
-
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
 		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
 		if (*errp ==  -EAGAIN) {
+			/*
+			 * drop the reference that we took
+			 * in ext4_mb_use_best_found
+			 */
+			ext4_mb_release_context(ac);
 			ac->ac_b_ex.fe_group = 0;
 			ac->ac_b_ex.fe_start = 0;
 			ac->ac_b_ex.fe_len = 0;
@@ -4382,6 +4645,26 @@ out2:
 out1:
 	if (ar->len < inquota)
 		DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+out3:
+	if (!ar->len) {
+		if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+			/* release all the reserved blocks if non delalloc */
+			percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+						reserv_blks);
+	}
+
+	trace_mark(ext4_allocate_blocks,
+		   "dev %s block %llu flags %u len %u ino %lu "
+		   "logical %llu goal %llu lleft %llu lright %llu "
+		   "pleft %llu pright %llu ",
+		   sb->s_id, (unsigned long long) block,
+		   ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
+		   (unsigned long long) ar->logical,
+		   (unsigned long long) ar->goal,
+		   (unsigned long long) ar->lleft,
+		   (unsigned long long) ar->lright,
+		   (unsigned long long) ar->pleft,
+		   (unsigned long long) ar->pright);
 
 	return block;
 }
@@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1,
 
 static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
-			  ext4_group_t group, ext4_grpblk_t block, int count)
+		      struct ext4_free_data *new_entry)
 {
+	ext4_grpblk_t block;
+	struct ext4_free_data *entry;
 	struct ext4_group_info *db = e4b->bd_info;
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_free_data *entry, *new_entry;
 	struct rb_node **n = &db->bb_free_root.rb_node, *node;
 	struct rb_node *parent = NULL, *new_node;
 
-
+	BUG_ON(!ext4_handle_valid(handle));
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
-	new_entry->start_blk = block;
-	new_entry->group  = group;
-	new_entry->count = count;
-	new_entry->t_tid = handle->h_transaction->t_tid;
 	new_node = &new_entry->node;
+	block = new_entry->start_blk;
 
-	ext4_lock_group(sb, group);
 	if (!*n) {
 		/* first free block exent. We need to
 		   protect buddy cache from being freed,
@@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		else if (block >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
-			ext4_unlock_group(sb, group);
-			ext4_error(sb, __func__,
-			    "Double free of blocks %d (%d %d)\n",
-			    block, entry->start_blk, entry->count);
+			ext4_grp_locked_error(sb, e4b->bd_group, __func__,
+					"Double free of blocks %d (%d %d)",
+					block, entry->start_blk, entry->count);
 			return 0;
 		}
 	}
@@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	spin_lock(&sbi->s_md_lock);
 	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
 	spin_unlock(&sbi->s_md_lock);
-	ext4_unlock_group(sb, group);
 	return 0;
 }
 
@@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_allocation_context *ac = NULL;
 	struct ext4_group_desc *gdp;
 	struct ext4_super_block *es;
-	unsigned long overflow;
+	unsigned int overflow;
 	ext4_grpblk_t bit;
 	struct buffer_head *gd_bh;
 	ext4_group_t block_group;
@@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	ext4_debug("freeing block %lu\n", block);
+	trace_mark(ext4_free_blocks,
+		   "dev %s block %llu count %lu metadata %d ino %lu",
+		   sb->s_id, (unsigned long long) block, count, metadata,
+		   inode ? inode->i_ino : 0);
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 	if (ac) {
@@ -4581,11 +4862,6 @@ do_more:
 	err = ext4_journal_get_write_access(handle, gd_bh);
 	if (err)
 		goto error_return;
-
-	err = ext4_mb_load_buddy(sb, block_group, &e4b);
-	if (err)
-		goto error_return;
-
 #ifdef AGGRESSIVE_CHECK
 	{
 		int i;
@@ -4593,13 +4869,6 @@ do_more:
 			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
 	}
 #endif
-	mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-			bit, count);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
-
 	if (ac) {
 		ac->ac_b_ex.fe_group = block_group;
 		ac->ac_b_ex.fe_start = bit;
@@ -4607,19 +4876,41 @@ do_more:
 		ext4_mb_store_history(ac);
 	}
 
-	if (metadata) {
-		/* blocks being freed are metadata. these blocks shouldn't
-		 * be used until this transaction is committed */
-		ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+	err = ext4_mb_load_buddy(sb, block_group, &e4b);
+	if (err)
+		goto error_return;
+	if (metadata && ext4_handle_valid(handle)) {
+		struct ext4_free_data *new_entry;
+		/*
+		 * blocks being freed are metadata. these blocks shouldn't
+		 * be used until this transaction is committed
+		 */
+		new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+		new_entry->start_blk = bit;
+		new_entry->group  = block_group;
+		new_entry->count = count;
+		new_entry->t_tid = handle->h_transaction->t_tid;
+		ext4_lock_group(sb, block_group);
+		mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+				bit, count);
+		ext4_mb_free_metadata(handle, &e4b, new_entry);
+		ext4_unlock_group(sb, block_group);
 	} else {
 		ext4_lock_group(sb, block_group);
+		/* need to update group_info->bb_free and bitmap
+		 * with group lock held. generate_buddy look at
+		 * them with group lock_held
+		 */
+		mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+				bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
 		ext4_mb_return_to_preallocation(inode, &e4b, block, count);
 		ext4_unlock_group(sb, block_group);
 	}
 
 	spin_lock(sb_bgl_lock(sbi, block_group));
-	le16_add_cpu(&gdp->bg_free_blocks_count, count);
+	ret = ext4_free_blks_count(sb, gdp) + count;
+	ext4_free_blks_set(sb, gdp, ret);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
 	percpu_counter_add(&sbi->s_freeblocks_counter, count);
@@ -4635,9 +4926,13 @@ do_more:
 
 	*freed += count;
 
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
 	/* And the group descriptor block */
 	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_journal_dirty_metadata(handle, gd_bh);
+	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
 	if (!err)
 		err = ret;
 
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1fff1e5..10a2921baf14 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
 #include <linux/version.h>
 #include <linux/blkdev.h>
 #include <linux/marker.h>
+#include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "group.h"
@@ -98,9 +99,6 @@
  */
 #define MB_DEFAULT_GROUP_PREALLOC	512
 
-static struct kmem_cache *ext4_pspace_cachep;
-static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
 
 struct ext4_free_data {
 	/* this links the free block information from group_info */
@@ -120,26 +118,6 @@ struct ext4_free_data {
 	tid_t	t_tid;
 };
 
-struct ext4_group_info {
-	unsigned long	bb_state;
-	struct rb_root  bb_free_root;
-	unsigned short	bb_first_free;
-	unsigned short	bb_free;
-	unsigned short	bb_fragments;
-	struct		list_head bb_prealloc_list;
-#ifdef DOUBLE_CHECK
-	void		*bb_bitmap;
-#endif
-	unsigned short	bb_counters[];
-};
-
-#define EXT4_GROUP_INFO_NEED_INIT_BIT	0
-#define EXT4_GROUP_INFO_LOCKED_BIT	1
-
-#define EXT4_MB_GRP_NEED_INIT(grp)	\
-	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
-
-
 struct ext4_prealloc_space {
 	struct list_head	pa_inode_list;
 	struct list_head	pa_group_list;
@@ -217,6 +195,11 @@ struct ext4_allocation_context {
 	__u8 ac_op;		/* operation, for history only */
 	struct page *ac_bitmap_page;
 	struct page *ac_buddy_page;
+	/*
+	 * pointer to the held semaphore upon successful
+	 * block allocation
+	 */
+	struct rw_semaphore *alloc_semp;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
 };
@@ -250,6 +233,7 @@ struct ext4_buddy {
 	struct super_block *bd_sb;
 	__u16 bd_blkbits;
 	ext4_group_t bd_group;
+	struct rw_semaphore *alloc_semp;
 };
 #define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)
@@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
 {
 	return;
 }
-#else
-static void ext4_mb_store_history(struct ext4_allocation_context *ac);
 #endif
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
-
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
-					ext4_group_t group);
-static void ext4_mb_return_to_preallocation(struct inode *inode,
-					struct ext4_buddy *e4b, sector_t block,
-					int count);
-static void ext4_mb_put_pa(struct ext4_allocation_context *,
-			struct super_block *, struct ext4_prealloc_space *pa);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
-
-
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
-{
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-	bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline void ext4_unlock_group(struct super_block *sb,
-					ext4_group_t group)
-{
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-	bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline int ext4_is_group_locked(struct super_block *sb,
-					ext4_group_t group)
-{
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-	return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
-						&(grinfo->bb_state));
-}
-
-static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 					struct ext4_free_extent *fex)
 {
 	ext4_fsblk_t block;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f2a9cf498ecd..734abca25e35 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
 	/*
 	 * Make sure the credit we accumalated is not really high
 	 */
-	if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) {
+	if (needed && ext4_handle_has_enough_credits(handle,
+						EXT4_RESERVE_TRANS_BLOCKS)) {
 		retval = ext4_journal_restart(handle, needed);
 		if (retval)
 			goto err_out;
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
 {
 	int retval = 0, needed;
 
-	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
 		return 0;
 	/*
 	 * We are freeing a blocks. During this we touch
@@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
 	struct list_blocks_struct lb;
 	unsigned long max_entries;
 
-	if (!test_opt(inode->i_sb, EXTENTS))
-		/*
-		 * if mounted with noextents we don't allow the migrate
-		 */
-		return -EINVAL;
-
-	if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	/*
+	 * If the filesystem does not support extents, or the inode
+	 * already is extent-based, error out.
+	 */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+	    (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
 		return -EINVAL;
 
 	if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9fd2a5e1be4d..fec0b4c2f5f1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
 #define assert(test) J_ASSERT(test)
 #endif
 
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
-
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
@@ -372,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 		goto fail;
 	}
 	hinfo->hash_version = root->info.hash_version;
+	if (hinfo->hash_version <= DX_HASH_TEA)
+		hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 	if (d_name)
 		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
@@ -641,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	dir = dir_file->f_path.dentry->d_inode;
 	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+		if (hinfo.hash_version <= DX_HASH_TEA)
+			hinfo.hash_version +=
+				EXT4_SB(dir->i_sb)->s_hash_unsigned;
 		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
 					       start_hash, start_minor_hash);
@@ -806,7 +807,7 @@ static inline int ext4_match (int len, const char * const name,
 static inline int search_dirblock(struct buffer_head *bh,
 				  struct inode *dir,
 				  const struct qstr *d_name,
-				  unsigned long offset,
+				  unsigned int offset,
 				  struct ext4_dir_entry_2 ** res_dir)
 {
 	struct ext4_dir_entry_2 * de;
@@ -1043,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	inode = NULL;
 	if (bh) {
-		unsigned long ino = le32_to_cpu(de->inode);
+		__u32 ino = le32_to_cpu(de->inode);
 		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
 			ext4_error(dir->i_sb, "ext4_lookup",
-				   "bad inode number: %lu", ino);
+				   "bad inode number: %u", ino);
 			return ERR_PTR(-EIO);
 		}
 		inode = ext4_iget(dir->i_sb, ino);
@@ -1060,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 
 struct dentry *ext4_get_parent(struct dentry *child)
 {
-	unsigned long ino;
+	__u32 ino;
 	struct inode *inode;
 	static const struct qstr dotdot = {
 		.name = "..",
@@ -1078,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
 
 	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
 		ext4_error(child->d_inode->i_sb, "ext4_get_parent",
-			   "bad inode number: %lu", ino);
+			   "bad inode number: %u", ino);
 		return ERR_PTR(-EIO);
 	}
 
@@ -1166,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	u32 hash2;
 	struct dx_map_entry *map;
 	char *data1 = (*bh)->b_data, *data2;
-	unsigned split, move, size, i;
+	unsigned split, move, size;
 	struct ext4_dir_entry_2 *de = NULL, *de2;
-	int	err = 0;
+	int	err = 0, i;
 
 	bh2 = ext4_append (handle, dir, &newblock, &err);
 	if (!(bh2)) {
@@ -1228,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 		de = de2;
 	}
 	dx_insert_block(frame, hash2 + continued, newblock);
-	err = ext4_journal_dirty_metadata(handle, bh2);
+	err = ext4_handle_dirty_metadata(handle, dir, bh2);
 	if (err)
 		goto journal_error;
-	err = ext4_journal_dirty_metadata(handle, frame->bh);
+	err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
 	if (err)
 		goto journal_error;
 	brelse(bh2);
@@ -1266,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	struct inode	*dir = dentry->d_parent->d_inode;
 	const char	*name = dentry->d_name.name;
 	int		namelen = dentry->d_name.len;
-	unsigned long	offset = 0;
+	unsigned int	offset = 0;
 	unsigned short	reclen;
 	int		nlen, rlen, err;
 	char		*top;
@@ -1335,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	ext4_update_dx_flag(dir);
 	dir->i_version++;
 	ext4_mark_inode_dirty(handle, dir);
-	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-	err = ext4_journal_dirty_metadata(handle, bh);
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, dir, bh);
 	if (err)
 		ext4_std_error(dir->i_sb, err);
 	brelse(bh);
@@ -1408,6 +1409,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 
 	/* Initialize as for dx_probe */
 	hinfo.hash_version = root->info.hash_version;
+	if (hinfo.hash_version <= DX_HASH_TEA)
+		hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 	ext4fs_dirhash(name, namelen, &hinfo);
 	frame = frames;
@@ -1437,7 +1440,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 			  struct inode *inode)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
-	unsigned long offset;
 	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de;
 	struct super_block *sb;
@@ -1459,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		ext4_mark_inode_dirty(handle, dir);
 	}
 	blocks = dir->i_size >> sb->s_blocksize_bits;
-	for (block = 0, offset = 0; block < blocks; block++) {
+	for (block = 0; block < blocks; block++) {
 		bh = ext4_bread(handle, dir, block, 0, &retval);
 		if(!bh)
 			return retval;
@@ -1574,7 +1576,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			dxtrace(dx_show_index("node", frames[1].entries));
 			dxtrace(dx_show_index("node",
 			       ((struct dx_node *) bh2->b_data)->entries));
-			err = ext4_journal_dirty_metadata(handle, bh2);
+			err = ext4_handle_dirty_metadata(handle, inode, bh2);
 			if (err)
 				goto journal_error;
 			brelse (bh2);
@@ -1600,7 +1602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			if (err)
 				goto journal_error;
 		}
-		ext4_journal_dirty_metadata(handle, frames[0].bh);
+		ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
 	}
 	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
 	if (!de)
@@ -1646,8 +1648,8 @@ static int ext4_delete_entry(handle_t *handle,
 			else
 				de->inode = 0;
 			dir->i_version++;
-			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-			ext4_journal_dirty_metadata(handle, bh);
+			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+			ext4_handle_dirty_metadata(handle, dir, bh);
 			return 0;
 		}
 		i += ext4_rec_len_from_disk(de->rec_len);
@@ -1725,7 +1727,7 @@ retry:
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode = ext4_new_inode (handle, dir, mode);
 	err = PTR_ERR(inode);
@@ -1759,7 +1761,7 @@ retry:
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode = ext4_new_inode(handle, dir, mode);
 	err = PTR_ERR(inode);
@@ -1795,7 +1797,7 @@ retry:
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
 	err = PTR_ERR(inode);
@@ -1824,8 +1826,8 @@ retry:
 	strcpy(de->name, "..");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	inode->i_nlink = 2;
-	BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
-	ext4_journal_dirty_metadata(handle, dir_block);
+	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+	ext4_handle_dirty_metadata(handle, dir, dir_block);
 	brelse(dir_block);
 	ext4_mark_inode_dirty(handle, inode);
 	err = ext4_add_entry(handle, dentry, inode);
@@ -1854,7 +1856,7 @@ out_stop:
  */
 static int empty_dir(struct inode *inode)
 {
-	unsigned long offset;
+	unsigned int offset;
 	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de, *de1;
 	struct super_block *sb;
@@ -1899,7 +1901,7 @@ static int empty_dir(struct inode *inode)
 				if (err)
 					ext4_error(sb, __func__,
 						   "error %d reading directory"
-						   " #%lu offset %lu",
+						   " #%lu offset %u",
 						   err, inode->i_ino, offset);
 				offset += sb->s_blocksize;
 				continue;
@@ -1937,6 +1939,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	struct ext4_iloc iloc;
 	int err = 0, rc;
 
+	if (!ext4_handle_valid(handle))
+		return 0;
+
 	lock_super(sb);
 	if (!list_empty(&EXT4_I(inode)->i_orphan))
 		goto out_unlock;
@@ -1965,7 +1970,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	/* Insert this inode at the head of the on-disk orphan list... */
 	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
 	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-	err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+	err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
 	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
 	if (!err)
 		err = rc;
@@ -1999,10 +2004,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	struct list_head *prev;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi;
-	unsigned long ino_next;
+	__u32 ino_next;
 	struct ext4_iloc iloc;
 	int err = 0;
 
+	if (!ext4_handle_valid(handle))
+		return 0;
+
 	lock_super(inode->i_sb);
 	if (list_empty(&ei->i_orphan)) {
 		unlock_super(inode->i_sb);
@@ -2021,7 +2029,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	 * transaction handle with which to update the orphan list on
 	 * disk, but we still need to remove the inode from the linked
 	 * list in memory. */
-	if (!handle)
+	if (sbi->s_journal && !handle)
 		goto out;
 
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2029,19 +2037,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 		goto out_err;
 
 	if (prev == &sbi->s_orphan) {
-		jbd_debug(4, "superblock will point to %lu\n", ino_next);
+		jbd_debug(4, "superblock will point to %u\n", ino_next);
 		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
 		if (err)
 			goto out_brelse;
 		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-		err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+		err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
 	} else {
 		struct ext4_iloc iloc2;
 		struct inode *i_prev =
 			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
 
-		jbd_debug(4, "orphan inode %lu will point to %lu\n",
+		jbd_debug(4, "orphan inode %lu will point to %u\n",
 			  i_prev->i_ino, ino_next);
 		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
 		if (err)
@@ -2086,7 +2094,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 		goto end_rmdir;
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode = dentry->d_inode;
 
@@ -2140,7 +2148,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	retval = -ENOENT;
 	bh = ext4_find_entry(dir, &dentry->d_name, &de);
@@ -2197,7 +2205,7 @@ retry:
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
 	err = PTR_ERR(inode);
@@ -2260,7 +2268,7 @@ retry:
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode->i_ctime = ext4_current_time(inode);
 	ext4_inc_count(handle, inode);
@@ -2309,7 +2317,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
 	/*
@@ -2363,8 +2371,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_dir->i_ctime = new_dir->i_mtime =
 					ext4_current_time(new_dir);
 		ext4_mark_inode_dirty(handle, new_dir);
-		BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
-		ext4_journal_dirty_metadata(handle, new_bh);
+		BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
+		ext4_handle_dirty_metadata(handle, new_dir, new_bh);
 		brelse(new_bh);
 		new_bh = NULL;
 	}
@@ -2414,8 +2422,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		BUFFER_TRACE(dir_bh, "get_write_access");
 		ext4_journal_get_write_access(handle, dir_bh);
 		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
-		BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
-		ext4_journal_dirty_metadata(handle, dir_bh);
+		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
+		ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
 		ext4_dec_count(handle, old_dir);
 		if (new_inode) {
 			/* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6ec1843a015..c328be5d6885 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb,
 	ext4_get_group_no_and_offset(sb, start, NULL, &offset);
 	if (group != sbi->s_groups_count)
 		ext4_warning(sb, __func__,
-			     "Cannot add at group %u (only %lu groups)",
+			     "Cannot add at group %u (only %u groups)",
 			     input->group, sbi->s_groups_count);
 	else if (offset != 0)
 			ext4_warning(sb, __func__, "Last group not full");
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
 {
 	int err;
 
-	if (handle->h_buffer_credits >= thresh)
+	if (ext4_handle_has_enough_credits(handle, thresh))
 		return 0;
 
 	err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
 		set_buffer_uptodate(gdb);
 		unlock_buffer(gdb);
-		ext4_journal_dirty_metadata(handle, gdb);
+		ext4_handle_dirty_metadata(handle, NULL, gdb);
 		ext4_set_bit(bit, bh->b_data);
 		brelse(gdb);
 	}
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 			err = PTR_ERR(bh);
 			goto exit_bh;
 		}
-		ext4_journal_dirty_metadata(handle, gdb);
+		ext4_handle_dirty_metadata(handle, NULL, gdb);
 		ext4_set_bit(bit, bh->b_data);
 		brelse(gdb);
 	}
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 			err = PTR_ERR(it);
 			goto exit_bh;
 		}
-		ext4_journal_dirty_metadata(handle, it);
+		ext4_handle_dirty_metadata(handle, NULL, it);
 		brelse(it);
 		ext4_set_bit(bit, bh->b_data);
 	}
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
 	if ((err = extend_or_restart_transaction(handle, 2, bh)))
 		goto exit_bh;
 
-	mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
-			bh->b_data);
-	ext4_journal_dirty_metadata(handle, bh);
+	mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
+	ext4_handle_dirty_metadata(handle, NULL, bh);
 	brelse(bh);
-
 	/* Mark unused entries in inode bitmap used */
 	ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
 		   input->inode_bitmap, input->inode_bitmap - start);
@@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb,
 		goto exit_journal;
 	}
 
-	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 			bh->b_data);
-	ext4_journal_dirty_metadata(handle, bh);
+	ext4_handle_dirty_metadata(handle, NULL, bh);
 exit_bh:
 	brelse(bh);
 
@@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	 * reserved inode, and will become GDT blocks (primary and backup).
 	 */
 	data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
-	ext4_journal_dirty_metadata(handle, dind);
+	ext4_handle_dirty_metadata(handle, NULL, dind);
 	brelse(dind);
 	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
 	ext4_mark_iloc_dirty(handle, inode, &iloc);
 	memset((*primary)->b_data, 0, sb->s_blocksize);
-	ext4_journal_dirty_metadata(handle, *primary);
+	ext4_handle_dirty_metadata(handle, NULL, *primary);
 
 	o_group_desc = EXT4_SB(sb)->s_group_desc;
 	memcpy(n_group_desc, o_group_desc,
@@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	kfree(o_group_desc);
 
 	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-	ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 
 	return 0;
 
@@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 		       primary[i]->b_blocknr, gdbackups,
 		       blk + primary[i]->b_blocknr); */
 		data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
-		err2 = ext4_journal_dirty_metadata(handle, primary[i]);
+		err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
 		if (!err)
 			err = err2;
 	}
@@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb,
 		struct buffer_head *bh;
 
 		/* Out of journal space, and can't get more - abort - so sad */
-		if (handle->h_buffer_credits == 0 &&
+		if (ext4_handle_valid(handle) &&
+		    handle->h_buffer_credits == 0 &&
 		    ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
 		    (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
 			break;
@@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb,
 			memset(bh->b_data + size, 0, rest);
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
-		ext4_journal_dirty_metadata(handle, bh);
+		ext4_handle_dirty_metadata(handle, NULL, bh);
 		brelse(bh);
 	}
 	if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb,
 exit_err:
 	if (err) {
 		ext4_warning(sb, __func__,
-			     "can't update backup for group %lu (err %d), "
+			     "can't update backup for group %u (err %d), "
 			     "forcing fsck on next reboot", group, err);
 		sbi->s_mount_state &= ~EXT4_VALID_FS;
 		sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	struct inode *inode = NULL;
 	handle_t *handle;
 	int gdb_off, gdb_num;
+	int num_grp_locked = 0;
 	int err, err2;
 
 	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	if (ext4_blocks_count(es) + input->blocks_count <
 	    ext4_blocks_count(es)) {
-		ext4_warning(sb, __func__, "blocks_count overflow\n");
+		ext4_warning(sb, __func__, "blocks_count overflow");
 		return -EINVAL;
 	}
 
 	if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
 	    le32_to_cpu(es->s_inodes_count)) {
-		ext4_warning(sb, __func__, "inodes_count overflow\n");
+		ext4_warning(sb, __func__, "inodes_count overflow");
 		return -EINVAL;
 	}
 
@@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 		}
 	}
 
+
 	if ((err = verify_group_input(sb, input)))
 		goto exit_put;
 
@@ -855,6 +856,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
          * using the new disk blocks.
          */
 
+	num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
 	/* Update group descriptor block for new group */
 	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
 					 gdb_off * EXT4_DESC_SIZE(sb));
@@ -862,17 +864,20 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
 	ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
 	ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
-	gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
-	gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
+	ext4_free_blks_set(sb, gdp, input->free_blocks_count);
+	ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
 
 	/*
 	 * We can allocate memory for mb_alloc based on the new group
 	 * descriptor
 	 */
-	err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
-	if (err)
+	err = ext4_mb_add_groupinfo(sb, input->group, gdp);
+	if (err) {
+		ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
 		goto exit_journal;
+	}
 
 	/*
 	 * Make the new blocks and inodes valid next.  We do this before
@@ -914,8 +919,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	/* Update the global fs size fields */
 	sbi->s_groups_count++;
+	ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
 
-	ext4_journal_dirty_metadata(handle, primary);
+	ext4_handle_dirty_metadata(handle, NULL, primary);
 
 	/* Update the reserved block counts only once the new group is
 	 * active. */
@@ -937,7 +943,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 			EXT4_INODES_PER_GROUP(sb);
 	}
 
-	ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+	ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
 	sb->s_dirt = 1;
 
 exit_journal:
@@ -975,9 +981,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	struct buffer_head *bh;
 	handle_t *handle;
 	int err;
-	unsigned long freed_blocks;
 	ext4_group_t group;
-	struct ext4_group_info *grp;
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
@@ -997,8 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			" too large to resize to %llu blocks safely\n",
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
-			ext4_warning(sb, __func__,
-			"CONFIG_LBD not enabled\n");
+			ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
 		return -EINVAL;
 	}
 
@@ -1071,62 +1074,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		goto exit_put;
 	}
 	ext4_blocks_count_set(es, o_blocks_count + add);
-	ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
 	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
-	ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
+	/* We add the blocks to the bitmap and set the group need init bit */
+	ext4_add_groupblocks(handle, sb, o_blocks_count, add);
 	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext4_journal_stop(handle)))
 		goto exit_put;
 
-	/*
-	 * Mark mballoc pages as not up to date so that they will be updated
-	 * next time they are loaded by ext4_mb_load_buddy.
-	 *
-	 * XXX Bad, Bad, BAD!!!  We should not be overloading the
-	 * Uptodate flag, particularly on thte bitmap bh, as way of
-	 * hinting to ext4_mb_load_buddy() that it needs to be
-	 * overloaded.  A user could take a LVM snapshot, then do an
-	 * on-line fsck, and clear the uptodate flag, and this would
-	 * not be a bug in userspace, but a bug in the kernel.  FIXME!!!
-	 */
-	{
-		struct ext4_sb_info *sbi = EXT4_SB(sb);
-		struct inode *inode = sbi->s_buddy_cache;
-		int blocks_per_page;
-		int block;
-		int pnum;
-		struct page *page;
-
-		/* Set buddy page as not up to date */
-		blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-		block = group * 2;
-		pnum = block / blocks_per_page;
-		page = find_get_page(inode->i_mapping, pnum);
-		if (page != NULL) {
-			ClearPageUptodate(page);
-			page_cache_release(page);
-		}
-
-		/* Set bitmap page as not up to date */
-		block++;
-		pnum = block / blocks_per_page;
-		page = find_get_page(inode->i_mapping, pnum);
-		if (page != NULL) {
-			ClearPageUptodate(page);
-			page_cache_release(page);
-		}
-
-		/* Get the info on the last group */
-		grp = ext4_get_group_info(sb, group);
-
-		/* Update free blocks in group info */
-		ext4_mb_update_group_info(grp, add);
-	}
-
 	if (test_opt(sb, DEBUG))
 		printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
 		       ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04158ad74dbb..8f7e0be8ab1b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -51,8 +51,6 @@ struct proc_dir_entry *ext4_proc_root;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
-static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
-			       unsigned int);
 static void ext4_commit_super(struct super_block *sb,
 			      struct ext4_super_block *es, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
@@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
 
+__u32 ext4_free_blks_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		(__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+}
+
+__u32 ext4_free_inodes_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		(__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+}
+
+__u32 ext4_used_dirs_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		(__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+}
+
+__u32 ext4_itable_unused_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_itable_unused_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		(__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+}
+
 void ext4_block_bitmap_set(struct super_block *sb,
 			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
 {
@@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb,
 		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 }
 
+void ext4_free_blks_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_free_inodes_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_used_dirs_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_itable_unused_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
+}
+
 /*
  * Wrappers for jbd2_journal_start/end.
  *
@@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly. */
 	journal = EXT4_SB(sb)->s_journal;
-	if (is_journal_aborted(journal)) {
-		ext4_abort(sb, __func__,
-			   "Detected aborted journal");
-		return ERR_PTR(-EROFS);
+	if (journal) {
+		if (is_journal_aborted(journal)) {
+			ext4_abort(sb, __func__,
+				   "Detected aborted journal");
+			return ERR_PTR(-EROFS);
+		}
+		return jbd2_journal_start(journal, nblocks);
 	}
-
-	return jbd2_journal_start(journal, nblocks);
+	/*
+	 * We're not journaling, return the appropriate indication.
+	 */
+	current->journal_info = EXT4_NOJOURNAL_HANDLE;
+	return current->journal_info;
 }
 
 /*
@@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
 	int err;
 	int rc;
 
+	if (!ext4_handle_valid(handle)) {
+		/*
+		 * Do this here since we don't call jbd2_journal_stop() in
+		 * no-journal mode.
+		 */
+		current->journal_info = NULL;
+		return 0;
+	}
 	sb = handle->h_transaction->t_journal->j_private;
 	err = handle->h_err;
 	rc = jbd2_journal_stop(handle);
@@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
 	char nbuf[16];
 	const char *errstr = ext4_decode_error(NULL, err, nbuf);
 
+	BUG_ON(!ext4_handle_valid(handle));
+
 	if (bh)
 		BUFFER_TRACE(bh, "abort");
 
@@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function,
 	va_end(args);
 }
 
+void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
+				const char *function, const char *fmt, ...)
+__releases(bitlock)
+__acquires(bitlock)
+{
+	va_list args;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	va_start(args, fmt);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	if (test_opt(sb, ERRORS_CONT)) {
+		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+		ext4_commit_super(sb, es, 0);
+		return;
+	}
+	ext4_unlock_group(sb, grp);
+	ext4_handle_error(sb);
+	/*
+	 * We only get here in the ERRORS_RO case; relocking the group
+	 * may be dangerous, but nothing bad will happen since the
+	 * filesystem will have already been marked read/only and the
+	 * journal has been aborted.  We return 1 as a hint to callers
+	 * who might what to use the return value from
+	 * ext4_grp_locked_error() to distinguish beween the
+	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
+	 * aggressively from the ext4 function in question, with a
+	 * more appropriate error code.
+	 */
+	ext4_lock_group(sb, grp);
+	return;
+}
+
+
 void ext4_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
 	return bdev;
 
 fail:
-	printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
+	printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
 			__bdevname(dev, b), PTR_ERR(bdev));
 	return NULL;
 }
@@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
-	err = jbd2_journal_destroy(sbi->s_journal);
-	sbi->s_journal = NULL;
-	if (err < 0)
-		ext4_abort(sb, __func__, "Couldn't clean up the journal");
-
+	if (sbi->s_journal) {
+		err = jbd2_journal_destroy(sbi->s_journal);
+		sbi->s_journal = NULL;
+		if (err < 0)
+			ext4_abort(sb, __func__,
+				   "Couldn't clean up the journal");
+	}
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
 	INIT_LIST_HEAD(&ei->i_prealloc_list);
 	spin_lock_init(&ei->i_prealloc_lock);
+	/*
+	 * Note:  We can be called before EXT4_SB(sb)->s_journal is set,
+	 * therefore it can be null here.  Don't check it, just initialize
+	 * jinode.
+	 */
 	jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
 	ei->i_reserved_data_blocks = 0;
 	ei->i_reserved_meta_blocks = 0;
@@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode)
 	}
 #endif
 	ext4_discard_preallocations(inode);
-	jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+	if (EXT4_JOURNAL(inode))
+		jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
 				       &EXT4_I(inode)->jinode);
 }
 
@@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 #endif
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
-	if (sbi->s_commit_interval) {
+	if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
 		seq_printf(seq, ",commit=%u",
 			   (unsigned) (sbi->s_commit_interval / HZ));
 	}
+	if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
+		seq_printf(seq, ",min_batch_time=%u",
+			   (unsigned) sbi->s_min_batch_time);
+	}
+	if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
+		seq_printf(seq, ",max_batch_time=%u",
+			   (unsigned) sbi->s_min_batch_time);
+	}
+
 	/*
 	 * We're changing the default of barrier mount option, so
 	 * let's always display its mount state so it's clear what its
@@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",journal_async_commit");
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
-	if (!test_opt(sb, EXTENTS))
-		seq_puts(seq, ",noextents");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
 	if (!test_opt(sb, DELALLOC))
@@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
 				    ext4_nfs_get_inode);
 }
 
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device.  Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd2 layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+{
+	journal_t *journal = EXT4_SB(sb)->s_journal;
+
+	WARN_ON(PageChecked(page));
+	if (!page_has_buffers(page))
+		return 0;
+	if (journal)
+		return jbd2_journal_try_to_free_buffers(journal, page,
+							wait & ~__GFP_WAIT);
+	return try_to_free_buffers(page);
+}
+
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -803,7 +953,9 @@ static struct dquot_operations ext4_quota_operations = {
 	.acquire_dquot	= ext4_acquire_dquot,
 	.release_dquot	= ext4_release_dquot,
 	.mark_dirty	= ext4_mark_dquot_dirty,
-	.write_info	= ext4_write_info
+	.write_info	= ext4_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 
 static struct quotactl_ops ext4_qctl_operations = {
@@ -836,6 +988,7 @@ static const struct super_operations ext4_sops = {
 	.quota_read	= ext4_quota_read,
 	.quota_write	= ext4_quota_write,
 #endif
+	.bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
 static const struct export_operations ext4_export_ops = {
@@ -850,16 +1003,17 @@ enum {
 	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
-	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+	Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
+	Opt_journal_update, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+	Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
-	Opt_inode_readahead_blks
+	Opt_inode_readahead_blks, Opt_journal_ioprio
 };
 
 static const match_table_t tokens = {
@@ -889,8 +1043,9 @@ static const match_table_t tokens = {
 	{Opt_nobh, "nobh"},
 	{Opt_bh, "bh"},
 	{Opt_commit, "commit=%u"},
+	{Opt_min_batch_time, "min_batch_time=%u"},
+	{Opt_max_batch_time, "max_batch_time=%u"},
 	{Opt_journal_update, "journal=update"},
-	{Opt_journal_inum, "journal=%u"},
 	{Opt_journal_dev, "journal_dev=%u"},
 	{Opt_journal_checksum, "journal_checksum"},
 	{Opt_journal_async_commit, "journal_async_commit"},
@@ -911,14 +1066,13 @@ static const match_table_t tokens = {
 	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
-	{Opt_extents, "extents"},
-	{Opt_noextents, "noextents"},
 	{Opt_i_version, "i_version"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
+	{Opt_journal_ioprio, "journal_ioprio=%u"},
 	{Opt_err, NULL},
 };
 
@@ -943,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data)
 	return sb_block;
 }
 
+#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
 static int parse_options(char *options, struct super_block *sb,
-			 unsigned int *inum, unsigned long *journal_devnum,
+			 unsigned long *journal_devnum,
+			 unsigned int *journal_ioprio,
 			 ext4_fsblk_t *n_blocks_count, int is_remount)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -956,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb,
 	int qtype, qfmt;
 	char *qname;
 #endif
-	ext4_fsblk_t last_block;
 
 	if (!options)
 		return 1;
@@ -1068,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
 			}
 			set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
 			break;
-		case Opt_journal_inum:
-			if (is_remount) {
-				printk(KERN_ERR "EXT4-fs: cannot specify "
-				       "journal on remount\n");
-				return 0;
-			}
-			if (match_int(&args[0], &option))
-				return 0;
-			*inum = option;
-			break;
 		case Opt_journal_dev:
 			if (is_remount) {
 				printk(KERN_ERR "EXT4-fs: cannot specify "
@@ -1107,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb,
 				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
 			sbi->s_commit_interval = HZ * option;
 			break;
+		case Opt_max_batch_time:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			if (option == 0)
+				option = EXT4_DEF_MAX_BATCH_TIME;
+			sbi->s_max_batch_time = option;
+			break;
+		case Opt_min_batch_time:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			sbi->s_min_batch_time = option;
+			break;
 		case Opt_data_journal:
 			data_opt = EXT4_MOUNT_JOURNAL_DATA;
 			goto datacheck;
@@ -1142,8 +1304,7 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_grpjquota:
 			qtype = GRPQUOTA;
 set_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
 				       "EXT4-fs: Cannot change journaled "
@@ -1182,8 +1343,7 @@ set_qf_name:
 		case Opt_offgrpjquota:
 			qtype = GRPQUOTA;
 clear_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR "EXT4-fs: Cannot change "
 					"journaled quota options when "
@@ -1202,8 +1362,7 @@ clear_qf_name:
 		case Opt_jqfmt_vfsv0:
 			qfmt = QFMT_VFS_V0;
 set_qf_format:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
 				printk(KERN_ERR "EXT4-fs: Cannot change "
 					"journaled quota options when "
@@ -1222,7 +1381,7 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 		case Opt_noquota:
-			if (sb_any_quota_enabled(sb)) {
+			if (sb_any_quota_loaded(sb)) {
 				printk(KERN_ERR "EXT4-fs: Cannot change quota "
 					"options when quota turned on.\n");
 				return 0;
@@ -1280,33 +1439,6 @@ set_qf_format:
 		case Opt_bh:
 			clear_opt(sbi->s_mount_opt, NOBH);
 			break;
-		case Opt_extents:
-			if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
-					EXT4_FEATURE_INCOMPAT_EXTENTS)) {
-				ext4_warning(sb, __func__,
-					"extents feature not enabled "
-					"on this filesystem, use tune2fs\n");
-				return 0;
-			}
-			set_opt(sbi->s_mount_opt, EXTENTS);
-			break;
-		case Opt_noextents:
-			/*
-			 * When e2fsprogs support resizing an already existing
-			 * ext3 file system to greater than 2**32 we need to
-			 * add support to block allocator to handle growing
-			 * already existing block  mapped inode so that blocks
-			 * allocated for them fall within 2**32
-			 */
-			last_block = ext4_blocks_count(sbi->s_es) - 1;
-			if (last_block  > 0xffffffffULL) {
-				printk(KERN_ERR "EXT4-fs: Filesystem too "
-						"large to mount with "
-						"-o noextents options\n");
-				return 0;
-			}
-			clear_opt(sbi->s_mount_opt, EXTENTS);
-			break;
 		case Opt_i_version:
 			set_opt(sbi->s_mount_opt, I_VERSION);
 			sb->s_flags |= MS_I_VERSION;
@@ -1331,6 +1463,14 @@ set_qf_format:
 				return 0;
 			sbi->s_inode_readahead_blks = option;
 			break;
+		case Opt_journal_ioprio:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > 7)
+				break;
+			*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
+							    option);
+			break;
 		default:
 			printk(KERN_ERR
 			       "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1406,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 		printk(KERN_WARNING
 		       "EXT4-fs warning: checktime reached, "
 		       "running e2fsck is recommended\n");
-#if 0
-		/* @@@ We _will_ want to clear the valid bit if we find
-		 * inconsistencies, to force a fsck at reboot.  But for
-		 * a plain journaled filesystem we can keep it set as
-		 * valid forever! :)
-		 */
-	es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
-#endif
+	if (!sbi->s_journal) 
+		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
 	es->s_mtime = cpu_to_le32(get_seconds());
 	ext4_update_dynamic_rev(sb);
-	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+	if (sbi->s_journal)
+		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 
 	ext4_commit_super(sb, es, 1);
 	if (test_opt(sb, DEBUG))
-		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
+		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
 				"bpg=%lu, ipg=%lu, mo=%04lx]\n",
 			sb->s_blocksize,
 			sbi->s_groups_count,
@@ -1431,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			EXT4_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt);
 
-	printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
-	       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
-	       "external", EXT4_SB(sb)->s_journal->j_devname);
+	if (EXT4_SB(sb)->s_journal) {
+		printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+		       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+		       "external", EXT4_SB(sb)->s_journal->j_devname);
+	} else {
+		printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+	}
 	return res;
 }
 
@@ -1445,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	ext4_group_t flex_group_count;
 	ext4_group_t flex_group;
 	int groups_per_flex = 0;
-	__u64 block_bitmap = 0;
 	int i;
 
 	if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1464,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb)
 				     sizeof(struct flex_groups), GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
 		printk(KERN_ERR "EXT4-fs: not enough memory for "
-				"%lu flex groups\n", flex_group_count);
+				"%u flex groups\n", flex_group_count);
 		goto failed;
 	}
 
-	gdp = ext4_get_group_desc(sb, 1, &bh);
-	block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
-
 	for (i = 0; i < sbi->s_groups_count; i++) {
 		gdp = ext4_get_group_desc(sb, i, &bh);
 
 		flex_group = ext4_flex_group(sbi, i);
 		sbi->s_flex_groups[flex_group].free_inodes +=
-			le16_to_cpu(gdp->bg_free_inodes_count);
+			ext4_free_inodes_count(sb, gdp);
 		sbi->s_flex_groups[flex_group].free_blocks +=
-			le16_to_cpu(gdp->bg_free_blocks_count);
+			ext4_free_blks_count(sb, gdp);
 	}
 
 	return 1;
@@ -1552,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb)
 		block_bitmap = ext4_block_bitmap(sb, gdp);
 		if (block_bitmap < first_block || block_bitmap > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Block bitmap for group %lu not in group "
+			       "Block bitmap for group %u not in group "
 			       "(block %llu)!\n", i, block_bitmap);
 			return 0;
 		}
 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Inode bitmap for group %lu not in group "
+			       "Inode bitmap for group %u not in group "
 			       "(block %llu)!\n", i, inode_bitmap);
 			return 0;
 		}
@@ -1567,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb)
 		if (inode_table < first_block ||
 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Inode table for group %lu not in group "
+			       "Inode table for group %u not in group "
 			       "(block %llu)!\n", i, inode_table);
 			return 0;
 		}
 		spin_lock(sb_bgl_lock(sbi, i));
 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Checksum for group %lu failed (%u!=%u)\n",
+			       "Checksum for group %u failed (%u!=%u)\n",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			       gdp)), le16_to_cpu(gdp->bg_checksum));
 			if (!(sb->s_flags & MS_RDONLY)) {
@@ -1866,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	ext4_fsblk_t sb_block = get_sb_block(&data);
 	ext4_fsblk_t logical_sb_block;
 	unsigned long offset = 0;
-	unsigned int journal_inum = 0;
 	unsigned long journal_devnum = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
 	char *cp;
+	const char *descr;
 	int ret = -EINVAL;
 	int blocksize;
-	int db_count;
-	int i;
+	unsigned int db_count;
+	unsigned int i;
 	int needs_recovery, has_huge_files;
-	__le32 features;
+	int features;
 	__u64 blocks_count;
 	int err;
+	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -1959,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
 	set_opt(sbi->s_mount_opt, RESERVATION);
 	set_opt(sbi->s_mount_opt, BARRIER);
 
 	/*
-	 * turn on extents feature by default in ext4 filesystem
-	 * only if feature flag already set by mkfs or tune2fs.
-	 * Use -o noextents to turn it off
-	 */
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
-		set_opt(sbi->s_mount_opt, EXTENTS);
-	else
-		ext4_warning(sb, __func__,
-			"extents feature not enabled on this filesystem, "
-			"use tune2fs.\n");
-
-	/*
 	 * enable delayed allocation by default
 	 * Use -o nodelalloc to turn it off
 	 */
 	set_opt(sbi->s_mount_opt, DELALLOC);
 
 
-	if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
-			   NULL, 0))
+	if (!parse_options((char *) data, sb, &journal_devnum,
+			   &journal_ioprio, NULL, 0))
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2005,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
 	if (features) {
 		printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
-		       "unsupported optional features (%x).\n",
-		       sb->s_id, le32_to_cpu(features));
+		       "unsupported optional features (%x).\n", sb->s_id,
+			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+			~EXT4_FEATURE_INCOMPAT_SUPP));
 		goto failed_mount;
 	}
 	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
 	if (!(sb->s_flags & MS_RDONLY) && features) {
 		printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
-		       "unsupported optional features (%x).\n",
-		       sb->s_id, le32_to_cpu(features));
+		       "unsupported optional features (%x).\n", sb->s_id,
+			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+			~EXT4_FEATURE_RO_COMPAT_SUPP));
 		goto failed_mount;
 	}
 	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2118,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
+	i = le32_to_cpu(es->s_flags);
+	if (i & EXT2_FLAGS_UNSIGNED_HASH)
+		sbi->s_hash_unsigned = 3;
+	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+		sbi->s_hash_unsigned = 3;
+#else
+		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+		sb->s_dirt = 1;
+	}
 
 	if (sbi->s_blocks_per_group > blocksize * 8) {
 		printk(KERN_ERR
@@ -2145,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
 		goto cantfind_ext4;
 
-	/* ensure blocks_count calculation below doesn't sign-extend */
-	if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
-	    le32_to_cpu(es->s_first_data_block) + 1) {
-		printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
-		       "first data block %u, blocks per group %lu\n",
-			ext4_blocks_count(es),
-			le32_to_cpu(es->s_first_data_block),
-			EXT4_BLOCKS_PER_GROUP(sb));
+        /*
+         * It makes no sense for the first data block to be beyond the end
+         * of the filesystem.
+         */
+        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+                printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
+		       "block %u is beyond end of filesystem (%llu)\n",
+		       le32_to_cpu(es->s_first_data_block),
+		       ext4_blocks_count(es));
 		goto failed_mount;
 	}
 	blocks_count = (ext4_blocks_count(es) -
 			le32_to_cpu(es->s_first_data_block) +
 			EXT4_BLOCKS_PER_GROUP(sb) - 1);
 	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+		printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
+		       "(block count %llu, first data block %u, "
+		       "blocks per group %lu)\n", sbi->s_groups_count,
+		       ext4_blocks_count(es),
+		       le32_to_cpu(es->s_first_data_block),
+		       EXT4_BLOCKS_PER_GROUP(sb));
+		goto failed_mount;
+	}
 	sbi->s_groups_count = blocks_count;
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
@@ -2270,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
 				ext4_commit_super(sb, es, 1);
-				printk(KERN_CRIT
-				       "EXT4-fs (device %s): mount failed\n",
-				      sb->s_id);
 				goto failed_mount4;
 			}
 		}
-	} else if (journal_inum) {
-		if (ext4_create_journal(sb, es, journal_inum))
-			goto failed_mount3;
+	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
+	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+		printk(KERN_ERR "EXT4-fs: required journal recovery "
+		       "suppressed and not mounted read-only\n");
+		goto failed_mount4;
 	} else {
-		if (!silent)
-			printk(KERN_ERR
-			       "ext4: No journal on filesystem on %s\n",
-			       sb->s_id);
-		goto failed_mount3;
+		clear_opt(sbi->s_mount_opt, DATA_FLAGS);
+		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+		sbi->s_journal = NULL;
+		needs_recovery = 0;
+		goto no_journal;
 	}
 
 	if (ext4_blocks_count(es) > 0xffffffffULL &&
 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
-		printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n");
+		printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
 		goto failed_mount4;
 	}
 
@@ -2335,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	default:
 		break;
 	}
+	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+
+no_journal:
 
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
@@ -2420,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
 	ext4_orphan_cleanup(sb, es);
 	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
-	if (needs_recovery)
+	if (needs_recovery) {
 		printk(KERN_INFO "EXT4-fs: recovery complete.\n");
-	ext4_mark_recovery_complete(sb, es);
-	printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
-	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
-	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
-	       "writeback");
+		ext4_mark_recovery_complete(sb, es);
+	}
+	if (EXT4_SB(sb)->s_journal) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+			descr = " journalled data mode";
+		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+			descr = " ordered data mode";
+		else
+			descr = " writeback data mode";
+	} else
+		descr = "out journal";
+
+	printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
+	       sb->s_id, descr);
 
 	lock_kernel();
 	return 0;
@@ -2438,8 +2600,11 @@ cantfind_ext4:
 	goto failed_mount;
 
 failed_mount4:
-	jbd2_journal_destroy(sbi->s_journal);
-	sbi->s_journal = NULL;
+	printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+	if (sbi->s_journal) {
+		jbd2_journal_destroy(sbi->s_journal);
+		sbi->s_journal = NULL;
+	}
 failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2476,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (sbi->s_commit_interval)
-		journal->j_commit_interval = sbi->s_commit_interval;
-	/* We could also set up an ext4-specific default for the commit
-	 * interval here, but for now we'll just fall back to the jbd
-	 * default. */
+	journal->j_commit_interval = sbi->s_commit_interval;
+	journal->j_min_batch_time = sbi->s_min_batch_time;
+	journal->j_max_batch_time = sbi->s_max_batch_time;
 
 	spin_lock(&journal->j_state_lock);
 	if (test_opt(sb, BARRIER))
@@ -2500,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
 	struct inode *journal_inode;
 	journal_t *journal;
 
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
 	/* First, test for the existence of a valid inode on disk.  Bad
 	 * things happen if we iget() an unused inode, as the subsequent
 	 * iput() will try to delete it. */
@@ -2548,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	struct ext4_super_block *es;
 	struct block_device *bdev;
 
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
 	bdev = ext4_blkdev_get(j_dev);
 	if (bdev == NULL)
 		return NULL;
 
 	if (bd_claim(bdev, sb)) {
 		printk(KERN_ERR
-			"EXT4: failed to claim external journal device.\n");
+			"EXT4-fs: failed to claim external journal device.\n");
 		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
@@ -2635,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb,
 	int err = 0;
 	int really_read_only;
 
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		printk(KERN_INFO "EXT4-fs: external journal device major/minor "
@@ -2719,48 +2888,6 @@ static int ext4_load_journal(struct super_block *sb,
 	return 0;
 }
 
-static int ext4_create_journal(struct super_block *sb,
-			       struct ext4_super_block *es,
-			       unsigned int journal_inum)
-{
-	journal_t *journal;
-	int err;
-
-	if (sb->s_flags & MS_RDONLY) {
-		printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
-				"create journal.\n");
-		return -EROFS;
-	}
-
-	journal = ext4_get_journal(sb, journal_inum);
-	if (!journal)
-		return -EINVAL;
-
-	printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
-	       journal_inum);
-
-	err = jbd2_journal_create(journal);
-	if (err) {
-		printk(KERN_ERR "EXT4-fs: error creating journal.\n");
-		jbd2_journal_destroy(journal);
-		return -EIO;
-	}
-
-	EXT4_SB(sb)->s_journal = journal;
-
-	ext4_update_dynamic_rev(sb);
-	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-	EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
-
-	es->s_journal_inum = cpu_to_le32(journal_inum);
-	sb->s_dirt = 1;
-
-	/* Make sure we flush the recovery flag to disk. */
-	ext4_commit_super(sb, es, 1);
-
-	return 0;
-}
-
 static void ext4_commit_super(struct super_block *sb,
 			      struct ext4_super_block *es, int sync)
 {
@@ -2777,20 +2904,23 @@ static void ext4_commit_super(struct super_block *sb,
 		 * be remapped.  Nothing we can do but to retry the
 		 * write and hope for the best.
 		 */
-		printk(KERN_ERR "ext4: previous I/O error to "
+		printk(KERN_ERR "EXT4-fs: previous I/O error to "
 		       "superblock detected for %s.\n", sb->s_id);
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
 	es->s_wtime = cpu_to_le32(get_seconds());
-	ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
-	es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+					&EXT4_SB(sb)->s_freeblocks_counter));
+	es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+					&EXT4_SB(sb)->s_freeinodes_counter));
+
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
 	if (sync) {
 		sync_dirty_buffer(sbh);
 		if (buffer_write_io_error(sbh)) {
-			printk(KERN_ERR "ext4: I/O error while writing "
+			printk(KERN_ERR "EXT4-fs: I/O error while writing "
 			       "superblock for %s.\n", sb->s_id);
 			clear_buffer_write_io_error(sbh);
 			set_buffer_uptodate(sbh);
@@ -2809,6 +2939,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 {
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+		BUG_ON(journal != NULL);
+		return;
+	}
 	jbd2_journal_lock_updates(journal);
 	if (jbd2_journal_flush(journal) < 0)
 		goto out;
@@ -2838,6 +2972,8 @@ static void ext4_clear_journal_err(struct super_block *sb,
 	int j_errno;
 	const char *errstr;
 
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
 	journal = EXT4_SB(sb)->s_journal;
 
 	/*
@@ -2870,14 +3006,17 @@ static void ext4_clear_journal_err(struct super_block *sb,
 int ext4_force_commit(struct super_block *sb)
 {
 	journal_t *journal;
-	int ret;
+	int ret = 0;
 
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
 	journal = EXT4_SB(sb)->s_journal;
-	sb->s_dirt = 0;
-	ret = ext4_journal_force_commit(journal);
+	if (journal) {
+		sb->s_dirt = 0;
+		ret = ext4_journal_force_commit(journal);
+	}
+
 	return ret;
 }
 
@@ -2889,9 +3028,13 @@ int ext4_force_commit(struct super_block *sb)
  */
 static void ext4_write_super(struct super_block *sb)
 {
-	if (mutex_trylock(&sb->s_lock) != 0)
-		BUG();
-	sb->s_dirt = 0;
+	if (EXT4_SB(sb)->s_journal) {
+		if (mutex_trylock(&sb->s_lock) != 0)
+			BUG();
+		sb->s_dirt = 0;
+	} else {
+		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+	}
 }
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -2900,10 +3043,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
-	if (wait)
-		ret = ext4_force_commit(sb);
-	else
-		jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+	if (EXT4_SB(sb)->s_journal) {
+		if (wait)
+			ret = ext4_force_commit(sb);
+		else
+ 			jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+	} else {
+		ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
+	}
 	return ret;
 }
 
@@ -2918,15 +3065,17 @@ static void ext4_write_super_lockfs(struct super_block *sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		journal_t *journal = EXT4_SB(sb)->s_journal;
 
-		/* Now we set up the journal barrier. */
-		jbd2_journal_lock_updates(journal);
+		if (journal) {
+			/* Now we set up the journal barrier. */
+			jbd2_journal_lock_updates(journal);
 
-		/*
-		 * We don't want to clear needs_recovery flag when we failed
-		 * to flush the journal.
-		 */
-		if (jbd2_journal_flush(journal) < 0)
-			return;
+			/*
+			 * We don't want to clear needs_recovery flag when we
+			 * failed to flush the journal.
+			 */
+			if (jbd2_journal_flush(journal) < 0)
+				return;
+		}
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2940,7 +3089,7 @@ static void ext4_write_super_lockfs(struct super_block *sb)
  */
 static void ext4_unlockfs(struct super_block *sb)
 {
-	if (!(sb->s_flags & MS_RDONLY)) {
+	if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
 		lock_super(sb);
 		/* Reser the needs_recovery flag before the fs is unlocked. */
 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2958,6 +3107,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	unsigned long old_sb_flags;
 	struct ext4_mount_options old_opts;
 	ext4_group_t g;
+	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 	int err;
 #ifdef CONFIG_QUOTA
 	int i;
@@ -2969,16 +3119,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	old_opts.s_resuid = sbi->s_resuid;
 	old_opts.s_resgid = sbi->s_resgid;
 	old_opts.s_commit_interval = sbi->s_commit_interval;
+	old_opts.s_min_batch_time = sbi->s_min_batch_time;
+	old_opts.s_max_batch_time = sbi->s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++)
 		old_opts.s_qf_names[i] = sbi->s_qf_names[i];
 #endif
+	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
 
 	/*
 	 * Allow the "check" option to be passed as a remount option.
 	 */
-	if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+	if (!parse_options(data, sb, NULL, &journal_ioprio,
+			   &n_blocks_count, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
@@ -2991,7 +3146,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 
 	es = sbi->s_es;
 
-	ext4_init_journal_params(sb, sbi->s_journal);
+	if (sbi->s_journal) {
+		ext4_init_journal_params(sb, sbi->s_journal);
+		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+	}
 
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
 		n_blocks_count > ext4_blocks_count(es)) {
@@ -3020,17 +3178,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			 * We have to unlock super so that we can wait for
 			 * transactions.
 			 */
-			unlock_super(sb);
-			ext4_mark_recovery_complete(sb, es);
-			lock_super(sb);
+			if (sbi->s_journal) {
+				unlock_super(sb);
+				ext4_mark_recovery_complete(sb, es);
+				lock_super(sb);
+			}
 		} else {
-			__le32 ret;
+			int ret;
 			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
 					~EXT4_FEATURE_RO_COMPAT_SUPP))) {
 				printk(KERN_WARNING "EXT4-fs: %s: couldn't "
 				       "remount RDWR because of unsupported "
-				       "optional features (%x).\n",
-				       sb->s_id, le32_to_cpu(ret));
+				       "optional features (%x).\n", sb->s_id,
+				(le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
+					~EXT4_FEATURE_RO_COMPAT_SUPP));
 				err = -EROFS;
 				goto restore_opts;
 			}
@@ -3047,7 +3208,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
 					printk(KERN_ERR
 	       "EXT4-fs: ext4_remount: "
-		"Checksum for group %lu failed (%u!=%u)\n",
+		"Checksum for group %u failed (%u!=%u)\n",
 		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
 					       le16_to_cpu(gdp->bg_checksum));
 					err = -EINVAL;
@@ -3076,7 +3237,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			 * been changed by e2fsck since we originally mounted
 			 * the partition.)
 			 */
-			ext4_clear_journal_err(sb, es);
+			if (sbi->s_journal)
+				ext4_clear_journal_err(sb, es);
 			sbi->s_mount_state = le16_to_cpu(es->s_state);
 			if ((err = ext4_group_extend(sb, es, n_blocks_count)))
 				goto restore_opts;
@@ -3084,6 +3246,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				sb->s_flags &= ~MS_RDONLY;
 		}
 	}
+	if (sbi->s_journal == NULL)
+		ext4_commit_super(sb, es, 1);
+
 #ifdef CONFIG_QUOTA
 	/* Release old quota file names */
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -3098,6 +3263,8 @@ restore_opts:
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sbi->s_commit_interval = old_opts.s_commit_interval;
+	sbi->s_min_batch_time = old_opts.s_min_batch_time;
+	sbi->s_max_batch_time = old_opts.s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++) {
@@ -3360,7 +3527,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 	 * When we journal data on quota file, we have to flush journal to see
 	 * all updates to the file when we bypass pagecache...
 	 */
-	if (ext4_should_journal_data(path.dentry->d_inode)) {
+	if (EXT4_SB(sb)->s_journal &&
+	    ext4_should_journal_data(path.dentry->d_inode)) {
 		/*
 		 * We don't need to lock updates but journal_flush() could
 		 * otherwise be livelocked...
@@ -3434,7 +3602,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
-	if (!handle) {
+	if (EXT4_SB(sb)->s_journal && !handle) {
 		printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
 			" cancelled because transaction is not started.\n",
 			(unsigned long long)off, (unsigned long long)len);
@@ -3459,7 +3627,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 		flush_dcache_page(bh->b_page);
 		unlock_buffer(bh);
 		if (journal_quota)
-			err = ext4_journal_dirty_metadata(handle, bh);
+			err = ext4_handle_dirty_metadata(handle, NULL, bh);
 		else {
 			/* Always do at least ordered writes for quotas */
 			err = ext4_jbd2_file_inode(handle, inode);
@@ -3513,18 +3681,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file)
 static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
 			       size_t cnt, loff_t *ppos)
 {
-	unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
+	unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
 	char str[32];
-	unsigned long value;
 
 	if (cnt >= sizeof(str))
 		return -EINVAL;
 	if (copy_from_user(str, buf, cnt))
 		return -EFAULT;
-	value = simple_strtol(str, NULL, 0);
-	if (value < 0)
-		return -ERANGE;
-	*p = value;
+
+	*p = simple_strtoul(str, NULL, 0);
 	return cnt;
 }
 
@@ -3615,7 +3780,7 @@ static void __exit exit_ext4_fs(void)
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+MODULE_DESCRIPTION("Fourth Extended Filesystem");
 MODULE_LICENSE("GPL");
 module_init(init_ext4_fs)
 module_exit(exit_ext4_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 80626d516fee..157ce6589c54 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
 		EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
 		sb->s_dirt = 1;
-		ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+		ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 	}
 }
 
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
 	} else {
 		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
-		error = ext4_journal_dirty_metadata(handle, bh);
+		error = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (IS_SYNC(inode))
-			handle->h_sync = 1;
+			ext4_handle_sync(handle);
 		DQUOT_FREE_BLOCK(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			if (error == -EIO)
 				goto bad_block;
 			if (!error)
-				error = ext4_journal_dirty_metadata(handle,
-								    bs->bh);
+				error = ext4_handle_dirty_metadata(handle,
+								   inode,
+								   bs->bh);
 			if (error)
 				goto cleanup;
 			goto inserted;
@@ -794,8 +795,9 @@ inserted:
 				ea_bdebug(new_bh, "reusing; refcount now=%d",
 					le32_to_cpu(BHDR(new_bh)->h_refcount));
 				unlock_buffer(new_bh);
-				error = ext4_journal_dirty_metadata(handle,
-								    new_bh);
+				error = ext4_handle_dirty_metadata(handle,
+								   inode,
+								   new_bh);
 				if (error)
 					goto cleanup_dquot;
 			}
@@ -810,8 +812,8 @@ inserted:
 			/* We need to allocate a new block */
 			ext4_fsblk_t goal = ext4_group_first_block_no(sb,
 						EXT4_I(inode)->i_block_group);
-			ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
-							goal, &error);
+			ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+						  goal, NULL, &error);
 			if (error)
 				goto cleanup;
 			ea_idebug(inode, "creating block %d", block);
@@ -833,7 +835,8 @@ getblk_failed:
 			set_buffer_uptodate(new_bh);
 			unlock_buffer(new_bh);
 			ext4_xattr_cache_insert(new_bh);
-			error = ext4_journal_dirty_metadata(handle, new_bh);
+			error = ext4_handle_dirty_metadata(handle,
+							   inode, new_bh);
 			if (error)
 				goto cleanup;
 		}
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		 */
 		is.iloc.bh = NULL;
 		if (IS_SYNC(inode))
-			handle->h_sync = 1;
+			ext4_handle_sync(handle);
 	}
 
 cleanup:
diff --git a/fs/filesystems.c b/fs/filesystems.c
index d0e20ced62dd..d488dcd7f2bb 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void)
 module_init(proc_filesystems_init);
 #endif
 
-struct file_system_type *get_fs_type(const char *name)
+static struct file_system_type *__get_fs_type(const char *name, int len)
 {
 	struct file_system_type *fs;
-	const char *dot = strchr(name, '.');
-	unsigned len = dot ? dot - name : strlen(name);
 
 	read_lock(&file_systems_lock);
 	fs = *(find_filesystem(name, len));
 	if (fs && !try_module_get(fs->owner))
 		fs = NULL;
 	read_unlock(&file_systems_lock);
-	if (!fs && (request_module("%.*s", len, name) == 0)) {
-		read_lock(&file_systems_lock);
-		fs = *(find_filesystem(name, len));
-		if (fs && !try_module_get(fs->owner))
-			fs = NULL;
-		read_unlock(&file_systems_lock);
-	}
+	return fs;
+}
+
+struct file_system_type *get_fs_type(const char *name)
+{
+	struct file_system_type *fs;
+	const char *dot = strchr(name, '.');
+	int len = dot ? dot - name : strlen(name);
+
+	fs = __get_fs_type(name, len);
+	if (!fs && (request_module("%.*s", len, name) == 0))
+		fs = __get_fs_type(name, len);
 
 	if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
 		put_filesystem(fs);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0ff0b8cf309..e5eaa62fd17f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * If we're a pdlfush thread, then implement pdflush collision avoidance
  * against the entire list.
  *
- * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
- * that it can be located for waiting on in __writeback_single_inode().
- *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
  * This function assumes that the blockdev superblock's inodes are backed by
  * a variety of queues, so all inodes are searched.  For other superblocks,
@@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
 				struct writeback_control *wbc)
 {
 	const unsigned long start = jiffies;	/* livelock avoidance */
+	int sync = wbc->sync_mode == WB_SYNC_ALL;
 
 	spin_lock(&inode_lock);
 	if (!wbc->for_kupdate || list_empty(&sb->s_io))
@@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb,
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		__writeback_single_inode(inode, wbc);
-		if (wbc->sync_mode == WB_SYNC_HOLD) {
-			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &sb->s_dirty);
-		}
 		if (current_is_pdflush())
 			writeback_release(bdi);
 		if (wbc->pages_skipped != pages_skipped) {
@@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb,
 		if (!list_empty(&sb->s_more_io))
 			wbc->more_io = 1;
 	}
-	spin_unlock(&inode_lock);
+
+	if (sync) {
+		struct inode *inode, *old_inode = NULL;
+
+		/*
+		 * Data integrity sync. Must wait for all pages under writeback,
+		 * because there may have been pages dirtied before our sync
+		 * call, but which had writeout started before we write it out.
+		 * In which case, the inode may not be on the dirty list, but
+		 * we still have to wait for that writeout.
+		 */
+		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+			struct address_space *mapping;
+
+			if (inode->i_state & (I_FREEING|I_WILL_FREE))
+				continue;
+			mapping = inode->i_mapping;
+			if (mapping->nrpages == 0)
+				continue;
+			__iget(inode);
+			spin_unlock(&inode_lock);
+			/*
+			 * We hold a reference to 'inode' so it couldn't have
+			 * been removed from s_inodes list while we dropped the
+			 * inode_lock.  We cannot iput the inode now as we can
+			 * be holding the last reference and we cannot iput it
+			 * under inode_lock. So we keep the reference and iput
+			 * it later.
+			 */
+			iput(old_inode);
+			old_inode = inode;
+
+			filemap_fdatawait(mapping);
+
+			cond_resched();
+
+			spin_lock(&inode_lock);
+		}
+		spin_unlock(&inode_lock);
+		iput(old_inode);
+	} else
+		spin_unlock(&inode_lock);
+
 	return;		/* Leave any unwritten inodes on s_io */
 }
 EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
@@ -588,8 +624,7 @@ restart:
 
 /*
  * writeback and wait upon the filesystem's dirty inodes.  The caller will
- * do this in two passes - one to write, and one to wait.  WB_SYNC_HOLD is
- * used to park the written inodes on sb->s_dirty for the wait pass.
+ * do this in two passes - one to write, and one to wait.
  *
  * A finite limit is set on the number of pages which will be written.
  * To prevent infinite livelock of sys_sync().
@@ -600,30 +635,21 @@ restart:
 void sync_inodes_sb(struct super_block *sb, int wait)
 {
 	struct writeback_control wbc = {
-		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_NONE,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
 	};
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 
-	wbc.nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused) +
-			nr_dirty + nr_unstable;
-	wbc.nr_to_write += wbc.nr_to_write / 2;		/* Bit more for luck */
-	sync_sb_inodes(sb, &wbc);
-}
+	if (!wait) {
+		unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+		unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 
-/*
- * Rather lame livelock avoidance.
- */
-static void set_sb_syncing(int val)
-{
-	struct super_block *sb;
-	spin_lock(&sb_lock);
-	list_for_each_entry_reverse(sb, &super_blocks, s_list)
-		sb->s_syncing = val;
-	spin_unlock(&sb_lock);
+		wbc.nr_to_write = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	} else
+		wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
+
+	sync_sb_inodes(sb, &wbc);
 }
 
 /**
@@ -652,9 +678,6 @@ static void __sync_inodes(int wait)
 	spin_lock(&sb_lock);
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_syncing)
-			continue;
-		sb->s_syncing = 1;
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
@@ -672,13 +695,10 @@ restart:
 
 void sync_inodes(int wait)
 {
-	set_sb_syncing(0);
 	__sync_inodes(0);
 
-	if (wait) {
-		set_sb_syncing(0);
+	if (wait)
 		__sync_inodes(1);
-	}
 }
 
 /**
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 4f3cab321415..99c99dfb0373 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
 	size_t size;
 
 	if (!*ppos) {
+		long value;
 		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
 		if (!fc)
 			return 0;
 
-		file->private_data=(void *)(long)atomic_read(&fc->num_waiting);
+		value = atomic_read(&fc->num_waiting);
+		file->private_data = (void *)value;
 		fuse_conn_put(fc);
 	}
 	size = sprintf(tmp, "%ld\n", (long)file->private_data);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fba571648a8e..e0c7ada08a1f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
  * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-	__releases(fc->lock)
+__releases(&fc->lock)
 {
 	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 	req->end = NULL;
@@ -293,13 +293,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 	wake_up(&req->waitq);
 	if (end)
 		end(fc, req);
-	else
-		fuse_put_request(fc, req);
+	fuse_put_request(fc, req);
 }
 
 static void wait_answer_interruptible(struct fuse_conn *fc,
 				      struct fuse_req *req)
-	__releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	if (signal_pending(current))
 		return;
@@ -317,7 +317,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
 }
 
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-	__releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	if (!fc->no_interrupt) {
 		/* Any signal may interrupt this */
@@ -380,7 +381,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 	}
 }
 
-void request_send(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
 	spin_lock(&fc->lock);
@@ -399,8 +400,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
 	spin_unlock(&fc->lock);
 }
 
-static void request_send_nowait_locked(struct fuse_conn *fc,
-				       struct fuse_req *req)
+static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
+					    struct fuse_req *req)
 {
 	req->background = 1;
 	fc->num_background++;
@@ -414,11 +415,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc,
 	flush_bg_queue(fc);
 }
 
-static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
 	spin_lock(&fc->lock);
 	if (fc->connected) {
-		request_send_nowait_locked(fc, req);
+		fuse_request_send_nowait_locked(fc, req);
 		spin_unlock(&fc->lock);
 	} else {
 		req->out.h.error = -ENOTCONN;
@@ -426,16 +427,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 	}
 }
 
-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 0;
-	request_send_nowait(fc, req);
+	fuse_request_send_nowait(fc, req);
 }
 
-void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
-	request_send_nowait(fc, req);
+	fuse_request_send_nowait(fc, req);
 }
 
 /*
@@ -443,10 +444,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
  *
  * fc->connected must have been checked previously
  */
-void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+					 struct fuse_req *req)
 {
 	req->isreply = 1;
-	request_send_nowait_locked(fc, req);
+	fuse_request_send_nowait_locked(fc, req);
 }
 
 /*
@@ -539,8 +541,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 		BUG_ON(!cs->nr_segs);
 		cs->seglen = cs->iov[0].iov_len;
 		cs->addr = (unsigned long) cs->iov[0].iov_base;
-		cs->iov ++;
-		cs->nr_segs --;
+		cs->iov++;
+		cs->nr_segs--;
 	}
 	down_read(&current->mm->mmap_sem);
 	err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
@@ -589,9 +591,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
 		kunmap_atomic(mapaddr, KM_USER1);
 	}
 	while (count) {
-		int err;
-		if (!cs->len && (err = fuse_copy_fill(cs)))
-			return err;
+		if (!cs->len) {
+			int err = fuse_copy_fill(cs);
+			if (err)
+				return err;
+		}
 		if (page) {
 			void *mapaddr = kmap_atomic(page, KM_USER1);
 			void *buf = mapaddr + offset;
@@ -631,9 +635,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
 {
 	while (size) {
-		int err;
-		if (!cs->len && (err = fuse_copy_fill(cs)))
-			return err;
+		if (!cs->len) {
+			int err = fuse_copy_fill(cs);
+			if (err)
+				return err;
+		}
 		fuse_copy_do(cs, &val, &size);
 	}
 	return 0;
@@ -664,6 +670,8 @@ static int request_pending(struct fuse_conn *fc)
 
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -691,7 +699,7 @@ static void request_wait(struct fuse_conn *fc)
  */
 static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
 			       const struct iovec *iov, unsigned long nr_segs)
-	__releases(fc->lock)
+__releases(&fc->lock)
 {
 	struct fuse_copy_state cs;
 	struct fuse_in_header ih;
@@ -813,6 +821,34 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
 	return err;
 }
 
+static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
+			    struct fuse_copy_state *cs)
+{
+	struct fuse_notify_poll_wakeup_out outarg;
+	int err;
+
+	if (size != sizeof(outarg))
+		return -EINVAL;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		return err;
+
+	return fuse_notify_poll_wakeup(fc, &outarg);
+}
+
+static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
+		       unsigned int size, struct fuse_copy_state *cs)
+{
+	switch (code) {
+	case FUSE_NOTIFY_POLL:
+		return fuse_notify_poll(fc, size, cs);
+
+	default:
+		return -EINVAL;
+	}
+}
+
 /* Look up request on processing list by unique ID */
 static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 {
@@ -876,9 +912,23 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 	err = fuse_copy_one(&cs, &oh, sizeof(oh));
 	if (err)
 		goto err_finish;
+
+	err = -EINVAL;
+	if (oh.len != nbytes)
+		goto err_finish;
+
+	/*
+	 * Zero oh.unique indicates unsolicited notification message
+	 * and error contains notification code.
+	 */
+	if (!oh.unique) {
+		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
+		fuse_copy_finish(&cs);
+		return err ? err : nbytes;
+	}
+
 	err = -EINVAL;
-	if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
-	    oh.len != nbytes)
+	if (oh.error <= -1000 || oh.error > 0)
 		goto err_finish;
 
 	spin_lock(&fc->lock);
@@ -966,6 +1016,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
  * This function releases and reacquires fc->lock
  */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	while (!list_empty(head)) {
 		struct fuse_req *req;
@@ -988,7 +1040,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
  * locked).
  */
 static void end_io_requests(struct fuse_conn *fc)
-	__releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	while (!list_empty(&fc->io)) {
 		struct fuse_req *req =
@@ -1002,11 +1055,11 @@ static void end_io_requests(struct fuse_conn *fc)
 		wake_up(&req->waitq);
 		if (end) {
 			req->end = NULL;
-			/* The end function will consume this reference */
 			__fuse_get_request(req);
 			spin_unlock(&fc->lock);
 			wait_event(req->waitq, !req->locked);
 			end(fc, req);
+			fuse_put_request(fc, req);
 			spin_lock(&fc->lock);
 		}
 	}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 95bc22bdd060..fdff346e96fd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		parent = dget_parent(entry);
 		fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
 				 &entry->d_name, &outarg);
-		request_send(fc, req);
+		fuse_request_send(fc, req);
 		dput(parent);
 		err = req->out.h.error;
 		fuse_put_request(fc, req);
@@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 				return 0;
 			}
 			spin_lock(&fc->lock);
-			fi->nlookup ++;
+			fi->nlookup++;
 			spin_unlock(&fc->lock);
 		}
 		fuse_put_request(fc, forget_req);
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 	attr_version = fuse_get_attr_version(fc);
 
 	fuse_lookup_init(fc, req, nodeid, name, outarg);
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	/* Zero nodeid is same as -ENOENT, but with valid timeout */
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
 {
 	fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
 	ff->reserved_req->force = 1;
-	request_send(fc, ff->reserved_req);
+	fuse_request_send(fc, ff->reserved_req);
 	fuse_put_request(fc, ff->reserved_req);
 	kfree(ff);
 }
@@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 		goto out_put_forget_req;
 
 	err = -ENOMEM;
-	ff = fuse_file_alloc();
+	ff = fuse_file_alloc(fc);
 	if (!ff)
 		goto out_put_request;
 
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	req->out.args[0].value = &outentry;
 	req->out.args[1].size = sizeof(outopen);
 	req->out.args[1].value = &outopen;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	if (err) {
 		if (err == -ENOSYS)
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 	else
 		req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err)
@@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 	req->in.numargs = 1;
 	req->in.args[0].size = entry->d_name.len + 1;
 	req->in.args[0].value = entry->d_name.name;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err) {
 		struct inode *inode = entry->d_inode;
 
-		/* Set nlink to zero so the inode can be cleared, if
-                   the inode does have more links this will be
-                   discovered at the next lookup/getattr */
+		/*
+		 * Set nlink to zero so the inode can be cleared, if the inode
+		 * does have more links this will be discovered at the next
+		 * lookup/getattr.
+		 */
 		clear_nlink(inode);
 		fuse_invalidate_attr(inode);
 		fuse_invalidate_attr(dir);
@@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 	req->in.numargs = 1;
 	req->in.args[0].size = entry->d_name.len + 1;
 	req->in.args[0].value = entry->d_name.name;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err) {
@@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	req->in.args[1].value = oldent->d_name.name;
 	req->in.args[2].size = newent->d_name.len + 1;
 	req->in.args[2].value = newent->d_name.name;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err) {
@@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
 	else
 		req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err) {
@@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask)
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
@@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	req->num_pages = 1;
 	req->pages[0] = page;
 	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	nbytes = req->out.args[0].size;
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
@@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry)
 	req->out.numargs = 1;
 	req->out.args[0].size = PAGE_SIZE - 1;
 	req->out.args[0].value = link;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	if (req->out.h.error) {
 		free_page((unsigned long) link);
 		link = ERR_PTR(req->out.h.error);
@@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 	else
 		req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err) {
@@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	req->in.args[1].value = name;
 	req->in.args[2].size = size;
 	req->in.args[2].value = value;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
@@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
 		req->out.args[0].size = sizeof(outarg);
 		req->out.args[0].value = &outarg;
 	}
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	ret = req->out.h.error;
 	if (!ret)
 		ret = size ? req->out.args[0].size : outarg.size;
@@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
 		req->out.args[0].size = sizeof(outarg);
 		req->out.args[0].value = &outarg;
 	}
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	ret = req->out.h.error;
 	if (!ret)
 		ret = size ? req->out.args[0].size : outarg.size;
@@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 	req->in.numargs = 1;
 	req->in.args[0].size = strlen(name) + 1;
 	req->in.args[0].value = name;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4c9ee7011265..e8162646a9b5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(*outargp);
 	req->out.args[0].value = outargp;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 
 	return err;
 }
 
-struct fuse_file *fuse_file_alloc(void)
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 {
 	struct fuse_file *ff;
 	ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
@@ -58,7 +58,12 @@ struct fuse_file *fuse_file_alloc(void)
 		} else {
 			INIT_LIST_HEAD(&ff->write_entry);
 			atomic_set(&ff->count, 0);
+			spin_lock(&fc->lock);
+			ff->kh = ++fc->khctr;
+			spin_unlock(&fc->lock);
 		}
+		RB_CLEAR_NODE(&ff->polled_node);
+		init_waitqueue_head(&ff->poll_wait);
 	}
 	return ff;
 }
@@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
 {
 	dput(req->misc.release.dentry);
 	mntput(req->misc.release.vfsmount);
-	fuse_put_request(fc, req);
 }
 
 static void fuse_file_put(struct fuse_file *ff)
@@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff)
 		struct inode *inode = req->misc.release.dentry->d_inode;
 		struct fuse_conn *fc = get_fuse_conn(inode);
 		req->end = fuse_release_end;
-		request_send_background(fc, req);
+		fuse_request_send_background(fc, req);
 		kfree(ff);
 	}
 }
@@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file,
 
 int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 {
+	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_open_out outarg;
 	struct fuse_file *ff;
 	int err;
@@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 	if (err)
 		return err;
 
-	ff = fuse_file_alloc();
+	ff = fuse_file_alloc(fc);
 	if (!ff)
 		return -ENOMEM;
 
@@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
 
 		spin_lock(&fc->lock);
 		list_del(&ff->write_entry);
+		if (!RB_EMPTY_NODE(&ff->polled_node))
+			rb_erase(&ff->polled_node, &fc->polled_files);
 		spin_unlock(&fc->lock);
+
+		wake_up_interruptible_sync(&ff->poll_wait);
 		/*
 		 * Normally this will send the RELEASE request,
 		 * however if some asynchronous READ or WRITE requests
@@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
 	req->force = 1;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
@@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
@@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
 		inarg->read_flags |= FUSE_READ_LOCKOWNER;
 		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
 	}
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	return req->out.args[0].size;
 }
 
@@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 	}
 	if (req->ff)
 		fuse_file_put(req->ff);
-	fuse_put_request(fc, req);
 }
 
 static void fuse_send_readpages(struct fuse_req *req, struct file *file,
@@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 		struct fuse_file *ff = file->private_data;
 		req->ff = fuse_file_get(ff);
 		req->end = fuse_readpages_end;
-		request_send_background(fc, req);
+		fuse_request_send_background(fc, req);
 	} else {
-		request_send(fc, req);
+		fuse_request_send(fc, req);
 		fuse_readpages_end(fc, req);
+		fuse_put_request(fc, req);
 	}
 }
 
@@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 		}
 	}
 	req->pages[req->num_pages] = page;
-	req->num_pages ++;
+	req->num_pages++;
 	return 0;
 }
 
@@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
 		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
 		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
 	}
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	return req->misc.write.out.size;
 }
 
@@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
 	__free_page(req->pages[0]);
 	fuse_file_put(req->ff);
-	fuse_put_request(fc, req);
 }
 
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
 /* Called under fc->lock, may release and reacquire it */
 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	struct fuse_inode *fi = get_fuse_inode(req->inode);
 	loff_t size = i_size_read(req->inode);
@@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
 
 	req->in.args[1].size = inarg->size;
 	fi->writectr++;
-	request_send_background_locked(fc, req);
+	fuse_request_send_background_locked(fc, req);
 	return;
 
  out_free:
 	fuse_writepage_finish(fc, req);
 	spin_unlock(&fc->lock);
 	fuse_writepage_free(fc, req);
+	fuse_put_request(fc, req);
 	spin_lock(&fc->lock);
 }
 
@@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
  * Called with fc->lock
  */
 void fuse_flush_writepages(struct inode *inode)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err)
@@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
 		return PTR_ERR(req);
 
 	fuse_lk_fill(req, file, fl, opcode, pid, flock);
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	/* locking is restartable */
 	if (err == -EINTR)
@@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS)
@@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
 	return retval;
 }
 
+static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
+			unsigned int nr_segs, size_t bytes, bool to_user)
+{
+	struct iov_iter ii;
+	int page_idx = 0;
+
+	if (!bytes)
+		return 0;
+
+	iov_iter_init(&ii, iov, nr_segs, bytes, 0);
+
+	while (iov_iter_count(&ii)) {
+		struct page *page = pages[page_idx++];
+		size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
+		void *kaddr, *map;
+
+		kaddr = map = kmap(page);
+
+		while (todo) {
+			char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
+			size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+			size_t copy = min(todo, iov_len);
+			size_t left;
+
+			if (!to_user)
+				left = copy_from_user(kaddr, uaddr, copy);
+			else
+				left = copy_to_user(uaddr, kaddr, copy);
+
+			if (unlikely(left))
+				return -EFAULT;
+
+			iov_iter_advance(&ii, copy);
+			todo -= copy;
+			kaddr += copy;
+		}
+
+		kunmap(map);
+	}
+
+	return 0;
+}
+
+/*
+ * For ioctls, there is no generic way to determine how much memory
+ * needs to be read and/or written.  Furthermore, ioctls are allowed
+ * to dereference the passed pointer, so the parameter requires deep
+ * copying but FUSE has no idea whatsoever about what to copy in or
+ * out.
+ *
+ * This is solved by allowing FUSE server to retry ioctl with
+ * necessary in/out iovecs.  Let's assume the ioctl implementation
+ * needs to read in the following structure.
+ *
+ * struct a {
+ *	char	*buf;
+ *	size_t	buflen;
+ * }
+ *
+ * On the first callout to FUSE server, inarg->in_size and
+ * inarg->out_size will be NULL; then, the server completes the ioctl
+ * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
+ * the actual iov array to
+ *
+ * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a) } }
+ *
+ * which tells FUSE to copy in the requested area and retry the ioctl.
+ * On the second round, the server has access to the structure and
+ * from that it can tell what to look for next, so on the invocation,
+ * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
+ *
+ * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a)	},
+ *   { .iov_base = a.buf,	.iov_len = a.buflen		} }
+ *
+ * FUSE will copy both struct a and the pointed buffer from the
+ * process doing the ioctl and retry ioctl with both struct a and the
+ * buffer.
+ *
+ * This time, FUSE server has everything it needs and completes ioctl
+ * without FUSE_IOCTL_RETRY which finishes the ioctl call.
+ *
+ * Copying data out works the same way.
+ *
+ * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
+ * automatically initializes in and out iovs by decoding @cmd with
+ * _IOC_* macros and the server is not allowed to request RETRY.  This
+ * limits ioctl data transfers to well-formed ioctls and is the forced
+ * behavior for all FUSE servers.
+ */
+static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg, unsigned int flags)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_ioctl_in inarg = {
+		.fh = ff->fh,
+		.cmd = cmd,
+		.arg = arg,
+		.flags = flags
+	};
+	struct fuse_ioctl_out outarg;
+	struct fuse_req *req = NULL;
+	struct page **pages = NULL;
+	struct page *iov_page = NULL;
+	struct iovec *in_iov = NULL, *out_iov = NULL;
+	unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
+	size_t in_size, out_size, transferred;
+	int err;
+
+	/* assume all the iovs returned by client always fits in a page */
+	BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+
+	if (!fuse_allow_task(fc, current))
+		return -EACCES;
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	err = -ENOMEM;
+	pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
+	iov_page = alloc_page(GFP_KERNEL);
+	if (!pages || !iov_page)
+		goto out;
+
+	/*
+	 * If restricted, initialize IO parameters as encoded in @cmd.
+	 * RETRY from server is not allowed.
+	 */
+	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
+		struct iovec *iov = page_address(iov_page);
+
+		iov->iov_base = (void __user *)arg;
+		iov->iov_len = _IOC_SIZE(cmd);
+
+		if (_IOC_DIR(cmd) & _IOC_WRITE) {
+			in_iov = iov;
+			in_iovs = 1;
+		}
+
+		if (_IOC_DIR(cmd) & _IOC_READ) {
+			out_iov = iov;
+			out_iovs = 1;
+		}
+	}
+
+ retry:
+	inarg.in_size = in_size = iov_length(in_iov, in_iovs);
+	inarg.out_size = out_size = iov_length(out_iov, out_iovs);
+
+	/*
+	 * Out data can be used either for actual out data or iovs,
+	 * make sure there always is at least one page.
+	 */
+	out_size = max_t(size_t, out_size, PAGE_SIZE);
+	max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
+
+	/* make sure there are enough buffer pages and init request with them */
+	err = -ENOMEM;
+	if (max_pages > FUSE_MAX_PAGES_PER_REQ)
+		goto out;
+	while (num_pages < max_pages) {
+		pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+		if (!pages[num_pages])
+			goto out;
+		num_pages++;
+	}
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		req = NULL;
+		goto out;
+	}
+	memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
+	req->num_pages = num_pages;
+
+	/* okay, let's send it to the client */
+	req->in.h.opcode = FUSE_IOCTL;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	if (in_size) {
+		req->in.numargs++;
+		req->in.args[1].size = in_size;
+		req->in.argpages = 1;
+
+		err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
+					   false);
+		if (err)
+			goto out;
+	}
+
+	req->out.numargs = 2;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	req->out.args[1].size = out_size;
+	req->out.argpages = 1;
+	req->out.argvar = 1;
+
+	fuse_request_send(fc, req);
+	err = req->out.h.error;
+	transferred = req->out.args[1].size;
+	fuse_put_request(fc, req);
+	req = NULL;
+	if (err)
+		goto out;
+
+	/* did it ask for retry? */
+	if (outarg.flags & FUSE_IOCTL_RETRY) {
+		char *vaddr;
+
+		/* no retry if in restricted mode */
+		err = -EIO;
+		if (!(flags & FUSE_IOCTL_UNRESTRICTED))
+			goto out;
+
+		in_iovs = outarg.in_iovs;
+		out_iovs = outarg.out_iovs;
+
+		/*
+		 * Make sure things are in boundary, separate checks
+		 * are to protect against overflow.
+		 */
+		err = -ENOMEM;
+		if (in_iovs > FUSE_IOCTL_MAX_IOV ||
+		    out_iovs > FUSE_IOCTL_MAX_IOV ||
+		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
+			goto out;
+
+		err = -EIO;
+		if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
+			goto out;
+
+		/* okay, copy in iovs and retry */
+		vaddr = kmap_atomic(pages[0], KM_USER0);
+		memcpy(page_address(iov_page), vaddr, transferred);
+		kunmap_atomic(vaddr, KM_USER0);
+
+		in_iov = page_address(iov_page);
+		out_iov = in_iov + in_iovs;
+
+		goto retry;
+	}
+
+	err = -EIO;
+	if (transferred > inarg.out_size)
+		goto out;
+
+	err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
+ out:
+	if (req)
+		fuse_put_request(fc, req);
+	if (iov_page)
+		__free_page(iov_page);
+	while (num_pages)
+		__free_page(pages[--num_pages]);
+	kfree(pages);
+
+	return err ? err : outarg.result;
+}
+
+static long fuse_file_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	return fuse_file_do_ioctl(file, cmd, arg, 0);
+}
+
+static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+				   unsigned long arg)
+{
+	return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
+}
+
+/*
+ * All files which have been polled are linked to RB tree
+ * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
+ * find the matching one.
+ */
+static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
+					      struct rb_node **parent_out)
+{
+	struct rb_node **link = &fc->polled_files.rb_node;
+	struct rb_node *last = NULL;
+
+	while (*link) {
+		struct fuse_file *ff;
+
+		last = *link;
+		ff = rb_entry(last, struct fuse_file, polled_node);
+
+		if (kh < ff->kh)
+			link = &last->rb_left;
+		else if (kh > ff->kh)
+			link = &last->rb_right;
+		else
+			return link;
+	}
+
+	if (parent_out)
+		*parent_out = last;
+	return link;
+}
+
+/*
+ * The file is about to be polled.  Make sure it's on the polled_files
+ * RB tree.  Note that files once added to the polled_files tree are
+ * not removed before the file is released.  This is because a file
+ * polled once is likely to be polled again.
+ */
+static void fuse_register_polled_file(struct fuse_conn *fc,
+				      struct fuse_file *ff)
+{
+	spin_lock(&fc->lock);
+	if (RB_EMPTY_NODE(&ff->polled_node)) {
+		struct rb_node **link, *parent;
+
+		link = fuse_find_polled_node(fc, ff->kh, &parent);
+		BUG_ON(*link);
+		rb_link_node(&ff->polled_node, parent, link);
+		rb_insert_color(&ff->polled_node, &fc->polled_files);
+	}
+	spin_unlock(&fc->lock);
+}
+
+static unsigned fuse_file_poll(struct file *file, poll_table *wait)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
+	struct fuse_poll_out outarg;
+	struct fuse_req *req;
+	int err;
+
+	if (fc->no_poll)
+		return DEFAULT_POLLMASK;
+
+	poll_wait(file, &ff->poll_wait, wait);
+
+	/*
+	 * Ask for notification iff there's someone waiting for it.
+	 * The client may ignore the flag and always notify.
+	 */
+	if (waitqueue_active(&ff->poll_wait)) {
+		inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
+		fuse_register_polled_file(fc, ff);
+	}
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->in.h.opcode = FUSE_POLL;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	fuse_request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+
+	if (!err)
+		return outarg.revents;
+	if (err == -ENOSYS) {
+		fc->no_poll = 1;
+		return DEFAULT_POLLMASK;
+	}
+	return POLLERR;
+}
+
+/*
+ * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
+ * wakes up the poll waiters.
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+			    struct fuse_notify_poll_wakeup_out *outarg)
+{
+	u64 kh = outarg->kh;
+	struct rb_node **link;
+
+	spin_lock(&fc->lock);
+
+	link = fuse_find_polled_node(fc, kh, NULL);
+	if (*link) {
+		struct fuse_file *ff;
+
+		ff = rb_entry(*link, struct fuse_file, polled_node);
+		wake_up_interruptible_sync(&ff->poll_wait);
+	}
+
+	spin_unlock(&fc->lock);
+	return 0;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= do_sync_read,
@@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = {
 	.lock		= fuse_file_lock,
 	.flock		= fuse_file_flock,
 	.splice_read	= generic_file_splice_read,
+	.unlocked_ioctl	= fuse_file_ioctl,
+	.compat_ioctl	= fuse_file_compat_ioctl,
+	.poll		= fuse_file_poll,
 };
 
 static const struct file_operations fuse_direct_io_file_operations = {
@@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.fsync		= fuse_fsync,
 	.lock		= fuse_file_lock,
 	.flock		= fuse_file_flock,
+	.unlocked_ioctl	= fuse_file_ioctl,
+	.compat_ioctl	= fuse_file_compat_ioctl,
+	.poll		= fuse_file_poll,
 	/* no mmap and splice_read */
 };
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 35accfdd747f..5e64b815a5a1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -19,6 +19,8 @@
 #include <linux/backing-dev.h>
 #include <linux/mutex.h>
 #include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/poll.h>
 
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -100,6 +102,9 @@ struct fuse_file {
 	/** Request reserved for flush and release */
 	struct fuse_req *reserved_req;
 
+	/** Kernel file handle guaranteed to be unique */
+	u64 kh;
+
 	/** File handle used by userspace */
 	u64 fh;
 
@@ -108,6 +113,12 @@ struct fuse_file {
 
 	/** Entry on inode's write_files list */
 	struct list_head write_entry;
+
+	/** RB node to be linked on fuse_conn->polled_files */
+	struct rb_node polled_node;
+
+	/** Wait queue head for poll */
+	wait_queue_head_t poll_wait;
 };
 
 /** One input argument of a request */
@@ -322,6 +333,12 @@ struct fuse_conn {
 	/** The list of requests under I/O */
 	struct list_head io;
 
+	/** The next unique kernel file handle */
+	u64 khctr;
+
+	/** rbtree of fuse_files waiting for poll events indexed by ph */
+	struct rb_root polled_files;
+
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
@@ -355,19 +372,19 @@ struct fuse_conn {
 	/** Connection failed (version mismatch).  Cannot race with
 	    setting other bitfields since it is only set once in INIT
 	    reply, before any other request, and never cleared */
-	unsigned conn_error : 1;
+	unsigned conn_error:1;
 
 	/** Connection successful.  Only set in INIT */
-	unsigned conn_init : 1;
+	unsigned conn_init:1;
 
 	/** Do readpages asynchronously?  Only set in INIT */
-	unsigned async_read : 1;
+	unsigned async_read:1;
 
 	/** Do not send separate SETATTR request before open(O_TRUNC)  */
-	unsigned atomic_o_trunc : 1;
+	unsigned atomic_o_trunc:1;
 
 	/** Filesystem supports NFS exporting.  Only set in INIT */
-	unsigned export_support : 1;
+	unsigned export_support:1;
 
 	/*
 	 * The following bitfields are only for optimization purposes
@@ -375,43 +392,46 @@ struct fuse_conn {
 	 */
 
 	/** Is fsync not implemented by fs? */
-	unsigned no_fsync : 1;
+	unsigned no_fsync:1;
 
 	/** Is fsyncdir not implemented by fs? */
-	unsigned no_fsyncdir : 1;
+	unsigned no_fsyncdir:1;
 
 	/** Is flush not implemented by fs? */
-	unsigned no_flush : 1;
+	unsigned no_flush:1;
 
 	/** Is setxattr not implemented by fs? */
-	unsigned no_setxattr : 1;
+	unsigned no_setxattr:1;
 
 	/** Is getxattr not implemented by fs? */
-	unsigned no_getxattr : 1;
+	unsigned no_getxattr:1;
 
 	/** Is listxattr not implemented by fs? */
-	unsigned no_listxattr : 1;
+	unsigned no_listxattr:1;
 
 	/** Is removexattr not implemented by fs? */
-	unsigned no_removexattr : 1;
+	unsigned no_removexattr:1;
 
 	/** Are file locking primitives not implemented by fs? */
-	unsigned no_lock : 1;
+	unsigned no_lock:1;
 
 	/** Is access not implemented by fs? */
-	unsigned no_access : 1;
+	unsigned no_access:1;
 
 	/** Is create not implemented by fs? */
-	unsigned no_create : 1;
+	unsigned no_create:1;
 
 	/** Is interrupt not implemented by fs? */
-	unsigned no_interrupt : 1;
+	unsigned no_interrupt:1;
 
 	/** Is bmap not implemented by fs? */
-	unsigned no_bmap : 1;
+	unsigned no_bmap:1;
+
+	/** Is poll not implemented by fs? */
+	unsigned no_poll:1;
 
 	/** Do multi-page cached writes */
-	unsigned big_writes : 1;
+	unsigned big_writes:1;
 
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
@@ -445,6 +465,9 @@ struct fuse_conn {
 
 	/** Version counter for attribute changes */
 	u64 attr_version;
+
+	/** Called on final put */
+	void (*release)(struct fuse_conn *);
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
  */
 int fuse_open_common(struct inode *inode, struct file *file, int isdir);
 
-struct fuse_file *fuse_file_alloc(void);
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
 void fuse_file_free(struct fuse_file *ff);
 void fuse_finish_open(struct inode *inode, struct file *file,
 		      struct fuse_file *ff, struct fuse_open_out *outarg);
@@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 		      int isdir);
 
 /**
+ * Notify poll wakeup
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+			    struct fuse_notify_poll_wakeup_out *outarg);
+
+/**
  * Initialize file operations on a regular file
  */
 void fuse_init_file_inode(struct inode *inode);
@@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
 /**
  * Send a request (synchronous)
  */
-void request_send(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
  * Send a request with no reply
  */
-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
  * Send a request in the background
  */
-void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
-void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+					 struct fuse_req *req);
 
 /* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
@@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
 struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
 
 /**
+ * Initialize fuse_conn
+ */
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
+
+/**
  * Release reference to fuse_conn
  */
 void fuse_conn_put(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e99f34b4435..47c96fdca1ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -37,10 +37,10 @@ struct fuse_mount_data {
 	unsigned rootmode;
 	unsigned user_id;
 	unsigned group_id;
-	unsigned fd_present : 1;
-	unsigned rootmode_present : 1;
-	unsigned user_id_present : 1;
-	unsigned group_id_present : 1;
+	unsigned fd_present:1;
+	unsigned rootmode_present:1;
+	unsigned user_id_present:1;
+	unsigned group_id_present:1;
 	unsigned flags;
 	unsigned max_read;
 	unsigned blksize;
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_forget_in);
 	req->in.args[0].value = inarg;
-	request_send_noreply(fc, req);
+	fuse_request_send_noreply(fc, req);
 }
 
 static void fuse_clear_inode(struct inode *inode)
@@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 
 	fi = get_fuse_inode(inode);
 	spin_lock(&fc->lock);
-	fi->nlookup ++;
+	fi->nlookup++;
 	spin_unlock(&fc->lock);
 	fuse_change_attributes(inode, attr, attr_valid, attr_version);
 
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
 		fc->destroy_req = NULL;
 		req->in.h.opcode = FUSE_DESTROY;
 		req->force = 1;
-		request_send(fc, req);
+		fuse_request_send(fc, req);
 		fuse_put_request(fc, req);
 	}
 }
@@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 	req->out.args[0].size =
 		fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	if (!err)
 		convert_fuse_statfs(buf, &outarg.st);
@@ -462,68 +462,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
-static struct fuse_conn *new_conn(struct super_block *sb)
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
 {
-	struct fuse_conn *fc;
 	int err;
 
-	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
-	if (fc) {
-		spin_lock_init(&fc->lock);
-		mutex_init(&fc->inst_mutex);
-		atomic_set(&fc->count, 1);
-		init_waitqueue_head(&fc->waitq);
-		init_waitqueue_head(&fc->blocked_waitq);
-		init_waitqueue_head(&fc->reserved_req_waitq);
-		INIT_LIST_HEAD(&fc->pending);
-		INIT_LIST_HEAD(&fc->processing);
-		INIT_LIST_HEAD(&fc->io);
-		INIT_LIST_HEAD(&fc->interrupts);
-		INIT_LIST_HEAD(&fc->bg_queue);
-		atomic_set(&fc->num_waiting, 0);
-		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-		fc->bdi.unplug_io_fn = default_unplug_io_fn;
-		/* fuse does it's own writeback accounting */
-		fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
-		fc->dev = sb->s_dev;
-		err = bdi_init(&fc->bdi);
-		if (err)
-			goto error_kfree;
-		if (sb->s_bdev) {
-			err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
-					   MAJOR(fc->dev), MINOR(fc->dev));
-		} else {
-			err = bdi_register_dev(&fc->bdi, fc->dev);
-		}
-		if (err)
-			goto error_bdi_destroy;
-		/*
-		 * For a single fuse filesystem use max 1% of dirty +
-		 * writeback threshold.
-		 *
-		 * This gives about 1M of write buffer for memory maps on a
-		 * machine with 1G and 10% dirty_ratio, which should be more
-		 * than enough.
-		 *
-		 * Privileged users can raise it by writing to
-		 *
-		 *    /sys/class/bdi/<bdi>/max_ratio
-		 */
-		bdi_set_max_ratio(&fc->bdi, 1);
-		fc->reqctr = 0;
-		fc->blocked = 1;
-		fc->attr_version = 1;
-		get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+	memset(fc, 0, sizeof(*fc));
+	spin_lock_init(&fc->lock);
+	mutex_init(&fc->inst_mutex);
+	atomic_set(&fc->count, 1);
+	init_waitqueue_head(&fc->waitq);
+	init_waitqueue_head(&fc->blocked_waitq);
+	init_waitqueue_head(&fc->reserved_req_waitq);
+	INIT_LIST_HEAD(&fc->pending);
+	INIT_LIST_HEAD(&fc->processing);
+	INIT_LIST_HEAD(&fc->io);
+	INIT_LIST_HEAD(&fc->interrupts);
+	INIT_LIST_HEAD(&fc->bg_queue);
+	INIT_LIST_HEAD(&fc->entry);
+	atomic_set(&fc->num_waiting, 0);
+	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+	fc->bdi.unplug_io_fn = default_unplug_io_fn;
+	/* fuse does it's own writeback accounting */
+	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
+	fc->khctr = 0;
+	fc->polled_files = RB_ROOT;
+	fc->dev = sb->s_dev;
+	err = bdi_init(&fc->bdi);
+	if (err)
+		goto error_mutex_destroy;
+	if (sb->s_bdev) {
+		err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+				   MAJOR(fc->dev), MINOR(fc->dev));
+	} else {
+		err = bdi_register_dev(&fc->bdi, fc->dev);
 	}
-	return fc;
+	if (err)
+		goto error_bdi_destroy;
+	/*
+	 * For a single fuse filesystem use max 1% of dirty +
+	 * writeback threshold.
+	 *
+	 * This gives about 1M of write buffer for memory maps on a
+	 * machine with 1G and 10% dirty_ratio, which should be more
+	 * than enough.
+	 *
+	 * Privileged users can raise it by writing to
+	 *
+	 *    /sys/class/bdi/<bdi>/max_ratio
+	 */
+	bdi_set_max_ratio(&fc->bdi, 1);
+	fc->reqctr = 0;
+	fc->blocked = 1;
+	fc->attr_version = 1;
+	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
 
-error_bdi_destroy:
+	return 0;
+
+ error_bdi_destroy:
 	bdi_destroy(&fc->bdi);
-error_kfree:
+ error_mutex_destroy:
 	mutex_destroy(&fc->inst_mutex);
-	kfree(fc);
-	return NULL;
+	return err;
 }
+EXPORT_SYMBOL_GPL(fuse_conn_init);
 
 void fuse_conn_put(struct fuse_conn *fc)
 {
@@ -532,7 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc)
 			fuse_request_free(fc->destroy_req);
 		mutex_destroy(&fc->inst_mutex);
 		bdi_destroy(&fc->bdi);
-		kfree(fc);
+		fc->release(fc);
 	}
 }
 
@@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
 	return fc;
 }
 
-static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
+static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
 {
 	struct fuse_attr attr;
 	memset(&attr, 0, sizeof(attr));
@@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
 	return fuse_iget(sb, 1, 0, &attr, 0, 0);
 }
 
-struct fuse_inode_handle
-{
+struct fuse_inode_handle {
 	u64 nodeid;
 	u32 generation;
 };
@@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
 		fc->conn_init = 1;
 	}
-	fuse_put_request(fc, req);
 	fc->blocked = 0;
 	wake_up_all(&fc->blocked_waitq);
 }
@@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	req->out.args[0].size = sizeof(struct fuse_init_out);
 	req->out.args[0].value = &req->misc.init_out;
 	req->end = process_init_reply;
-	request_send_background(fc, req);
+	fuse_request_send_background(fc, req);
+}
+
+static void fuse_free_conn(struct fuse_conn *fc)
+{
+	kfree(fc);
 }
 
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -828,10 +832,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (file->f_op != &fuse_dev_operations)
 		return -EINVAL;
 
-	fc = new_conn(sb);
+	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
 	if (!fc)
 		return -ENOMEM;
 
+	err = fuse_conn_init(fc, sb);
+	if (err) {
+		kfree(fc);
+		return err;
+	}
+
+	fc->release = fuse_free_conn;
 	fc->flags = d.flags;
 	fc->user_id = d.user_id;
 	fc->group_id = d.group_id;
@@ -841,7 +852,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = fc;
 
 	err = -ENOMEM;
-	root = get_root_inode(sb, d.rootmode);
+	root = fuse_get_root_inode(sb, d.rootmode);
 	if (!root)
 		goto err;
 
@@ -952,7 +963,7 @@ static inline void unregister_fuseblk(void)
 
 static void fuse_inode_init_once(void *foo)
 {
-	struct inode * inode = foo;
+	struct inode *inode = foo;
 
 	inode_init_once(inode);
 }
@@ -1031,7 +1042,7 @@ static int __init fuse_init(void)
 {
 	int res;
 
-	printk("fuse init (API version %i.%i)\n",
+	printk(KERN_INFO "fuse init (API version %i.%i)\n",
 	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
 
 	INIT_LIST_HEAD(&fuse_conn_list);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index ab2f57e3fb87..e563a6449811 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
-	depends on EXPERIMENTAL && (64BIT || (LSF && LBD))
+	depends on EXPERIMENTAL && (64BIT || LBD)
 	select FS_POSIX_ACL
 	select CRC32
 	help
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index ec65851ec80a..c1b4ec6a9650 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
 	glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
 	mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
 	ops_fstype.o ops_inode.o ops_super.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3e9bd46f27e3..e335dceb6a4f 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
 	struct gfs2_ea_location el_this;
 	int error;
 
-	if (!ip->i_di.di_eattr)
+	if (!ip->i_eattr)
 		return 0;
 
 	memset(&er, 0, sizeof(struct gfs2_ea_request));
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index bec76b1c2bb0..11ffc56f1f81 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 		void *kaddr = kmap(page);
 
 		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-		       ip->i_di.di_size);
-		memset(kaddr + ip->i_di.di_size, 0,
-		       PAGE_CACHE_SIZE - ip->i_di.di_size);
+		       ip->i_disksize);
+		memset(kaddr + ip->i_disksize, 0,
+		       PAGE_CACHE_SIZE - ip->i_disksize);
 		kunmap(page);
 
 		SetPageUptodate(page);
@@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	if (error)
 		goto out;
 
-	if (ip->i_di.di_size) {
+	if (ip->i_disksize) {
 		/* Get a free block, fill it with the stuffed data,
 		   and write it out to disk */
 
@@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	di = (struct gfs2_dinode *)dibh->b_data;
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 
-	if (ip->i_di.di_size) {
+	if (ip->i_disksize) {
 		*(__be64 *)(di + 1) = cpu_to_be64(block);
 		gfs2_add_inode_blocks(&ip->i_inode, 1);
 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 		}
 	}
 
-	ip->i_di.di_size = size;
+	ip->i_disksize = size;
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 		goto out;
 
 	if (gfs2_is_stuffed(ip)) {
-		ip->i_di.di_size = size;
+		ip->i_disksize = size;
 		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
@@ -1045,9 +1045,9 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 			error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
 
 		if (!error) {
-			ip->i_di.di_size = size;
+			ip->i_disksize = size;
 			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-			ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
+			ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
 			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 			gfs2_dinode_out(ip, dibh->b_data);
 		}
@@ -1114,13 +1114,13 @@ static int trunc_end(struct gfs2_inode *ip)
 	if (error)
 		goto out;
 
-	if (!ip->i_di.di_size) {
+	if (!ip->i_disksize) {
 		ip->i_height = 0;
 		ip->i_goal = ip->i_no_addr;
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 	}
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-	ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
+	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
 	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
 		return -EINVAL;
 
-	if (size > ip->i_di.di_size)
+	if (size > ip->i_disksize)
 		error = do_grow(ip, size);
-	else if (size < ip->i_di.di_size)
+	else if (size < ip->i_disksize)
 		error = do_shrink(ip, size);
 	else
 		/* update time stamps */
@@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
 int gfs2_truncatei_resume(struct gfs2_inode *ip)
 {
 	int error;
-	error = trunc_dealloc(ip, ip->i_di.di_size);
+	error = trunc_dealloc(ip, ip->i_disksize);
 	if (!error)
 		error = trunc_end(ip);
 	return error;
@@ -1231,35 +1231,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
 }
 
 /**
- * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
- * @ip: the file
- * @len: the number of bytes to be written to the file
- * @data_blocks: returns the number of data blocks required
- * @ind_blocks: returns the number of indirect blocks required
- *
- */
-
-void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
-			    unsigned int *data_blocks, unsigned int *ind_blocks)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned int tmp;
-
-	if (gfs2_is_dir(ip)) {
-		*data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
-		*ind_blocks = 3 * (sdp->sd_max_jheight - 1);
-	} else {
-		*data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
-		*ind_blocks = 3 * (sdp->sd_max_height - 1);
-	}
-
-	for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
-		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
-		*ind_blocks += tmp;
-	}
-}
-
-/**
  * gfs2_write_alloc_required - figure out if a write will require an allocation
  * @ip: the file being written to
  * @offset: the offset to write to
@@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 	struct buffer_head bh;
 	unsigned int shift;
 	u64 lblock, lblock_stop, size;
+	u64 end_of_file;
 
 	*alloc_required = 0;
 
@@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 
 	*alloc_required = 1;
 	shift = sdp->sd_sb.sb_bsize_shift;
-	if (gfs2_is_dir(ip)) {
-		unsigned int bsize = sdp->sd_jbsize;
-		lblock = offset;
-		do_div(lblock, bsize);
-		lblock_stop = offset + len + bsize - 1;
-		do_div(lblock_stop, bsize);
-	} else {
-		u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
-		lblock = offset >> shift;
-		lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
-		if (lblock_stop > end_of_file)
-			return 0;
-	}
+	BUG_ON(gfs2_is_dir(ip));
+	end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
+	lblock = offset >> shift;
+	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+	if (lblock_stop > end_of_file)
+		return 0;
 
 	size = (lblock_stop - lblock) << shift;
 	do {
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e6cde2943bd..c983177e05ac 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -10,10 +10,40 @@
 #ifndef __BMAP_DOT_H__
 #define __BMAP_DOT_H__
 
+#include "inode.h"
+
 struct inode;
 struct gfs2_inode;
 struct page;
 
+
+/**
+ * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
+ * @ip: the file
+ * @len: the number of bytes to be written to the file
+ * @data_blocks: returns the number of data blocks required
+ * @ind_blocks: returns the number of indirect blocks required
+ *
+ */
+
+static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
+					  unsigned int len,
+					  unsigned int *data_blocks,
+					  unsigned int *ind_blocks)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int tmp;
+
+	BUG_ON(gfs2_is_dir(ip));
+	*data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
+	*ind_blocks = 3 * (sdp->sd_max_height - 1);
+
+	for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
+		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+		*ind_blocks += tmp;
+	}
+}
+
 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
 int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
@@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
 int gfs2_truncatei_resume(struct gfs2_inode *ip);
 int gfs2_file_dealloc(struct gfs2_inode *ip);
-
-void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
-			    unsigned int *data_blocks,
-			    unsigned int *ind_blocks);
 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 			      unsigned int len, int *alloc_required);
 
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
deleted file mode 100644
index e51991947d2c..000000000000
--- a/fs/gfs2/daemon.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
-#include <linux/freezer.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "daemon.h"
-#include "glock.h"
-#include "log.h"
-#include "quota.h"
-#include "recovery.h"
-#include "super.h"
-#include "util.h"
-
-/* This uses schedule_timeout() instead of msleep() because it's good for
-   the daemons to wake up more often than the timeout when unmounting so
-   the user's unmount doesn't sit there forever.
-
-   The kthread functions used to start these daemons block and flush signals. */
-
-/**
- * gfs2_glockd - Reclaim unused glock structures
- * @sdp: Pointer to GFS2 superblock
- *
- * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
- * Number of daemons can be set by user, with num_glockd mount option.
- */
-
-int gfs2_glockd(void *data)
-{
-	struct gfs2_sbd *sdp = data;
-
-	while (!kthread_should_stop()) {
-		while (atomic_read(&sdp->sd_reclaim_count))
-			gfs2_reclaim_glock(sdp);
-
-		wait_event_interruptible(sdp->sd_reclaim_wq,
-					 (atomic_read(&sdp->sd_reclaim_count) ||
-					 kthread_should_stop()));
-		if (freezing(current))
-			refrigerator();
-	}
-
-	return 0;
-}
-
-/**
- * gfs2_recoverd - Recover dead machine's journals
- * @sdp: Pointer to GFS2 superblock
- *
- */
-
-int gfs2_recoverd(void *data)
-{
-	struct gfs2_sbd *sdp = data;
-	unsigned long t;
-
-	while (!kthread_should_stop()) {
-		gfs2_check_journals(sdp);
-		t = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
-		if (freezing(current))
-			refrigerator();
-		schedule_timeout_interruptible(t);
-	}
-
-	return 0;
-}
-
-/**
- * gfs2_quotad - Write cached quota changes into the quota file
- * @sdp: Pointer to GFS2 superblock
- *
- */
-
-int gfs2_quotad(void *data)
-{
-	struct gfs2_sbd *sdp = data;
-	unsigned long t;
-	int error;
-
-	while (!kthread_should_stop()) {
-		/* Update the master statfs file */
-
-		t = sdp->sd_statfs_sync_time +
-		    gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
-
-		if (time_after_eq(jiffies, t)) {
-			error = gfs2_statfs_sync(sdp);
-			if (error &&
-			    error != -EROFS &&
-			    !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-				fs_err(sdp, "quotad: (1) error=%d\n", error);
-			sdp->sd_statfs_sync_time = jiffies;
-		}
-
-		/* Update quota file */
-
-		t = sdp->sd_quota_sync_time +
-		    gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
-
-		if (time_after_eq(jiffies, t)) {
-			error = gfs2_quota_sync(sdp);
-			if (error &&
-			    error != -EROFS &&
-			    !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-				fs_err(sdp, "quotad: (2) error=%d\n", error);
-			sdp->sd_quota_sync_time = jiffies;
-		}
-
-		gfs2_quota_scan(sdp);
-
-		t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
-		if (freezing(current))
-			refrigerator();
-		schedule_timeout_interruptible(t);
-	}
-
-	return 0;
-}
-
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
deleted file mode 100644
index 4be084fb6a62..000000000000
--- a/fs/gfs2/daemon.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __DAEMON_DOT_H__
-#define __DAEMON_DOT_H__
-
-int gfs2_glockd(void *data);
-int gfs2_recoverd(void *data);
-int gfs2_quotad(void *data);
-
-#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index eed040d8ba3a..b7c8e5c70791 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -36,7 +36,7 @@
  * the block.  In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
  * beginning of the leaf block. The dirents reside in leaves when
  *
- * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
+ * dip->i_diskflags & GFS2_DIF_EXHASH is true
  *
  * Otherwise, the dirents are "linear", within a single stuffed dinode block.
  *
@@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
-	if (ip->i_di.di_size < offset + size)
-		ip->i_di.di_size = offset + size;
+	if (ip->i_disksize < offset + size)
+		ip->i_disksize = offset + size;
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_dinode_out(ip, dibh->b_data);
 
@@ -226,8 +226,8 @@ out:
 	if (error)
 		return error;
 
-	if (ip->i_di.di_size < offset + copied)
-		ip->i_di.di_size = offset + copied;
+	if (ip->i_disksize < offset + copied)
+		ip->i_disksize = offset + copied;
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
 	int copied = 0;
 	int error = 0;
 
-	if (offset >= ip->i_di.di_size)
+	if (offset >= ip->i_disksize)
 		return 0;
 
-	if (offset + size > ip->i_di.di_size)
-		size = ip->i_di.di_size - offset;
+	if (offset + size > ip->i_disksize)
+		size = ip->i_disksize - offset;
 
 	if (!size)
 		return 0;
@@ -755,12 +755,12 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	int error;
 
-	if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+	if (ip->i_diskflags & GFS2_DIF_EXHASH) {
 		struct gfs2_leaf *leaf;
 		unsigned hsize = 1 << ip->i_depth;
 		unsigned index;
 		u64 ln;
-		if (hsize * sizeof(u64) != ip->i_di.di_size) {
+		if (hsize * sizeof(u64) != ip->i_disksize) {
 			gfs2_consist_inode(ip);
 			return ERR_PTR(-EIO);
 		}
@@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode)
 		return -ENOSPC;
 	bn = bh->b_blocknr;
 
-	gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
-	leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
+	gfs2_assert(sdp, dip->i_entries < (1 << 16));
+	leaf->lf_entries = cpu_to_be16(dip->i_entries);
 
 	/*  Copy dirents  */
 
@@ -905,9 +905,9 @@ static int dir_make_exhash(struct inode *inode)
 	for (x = sdp->sd_hash_ptrs; x--; lp++)
 		*lp = cpu_to_be64(bn);
 
-	dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
+	dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
 	gfs2_add_inode_blocks(&dip->i_inode, 1);
-	dip->i_di.di_flags |= GFS2_DIF_EXHASH;
+	dip->i_diskflags |= GFS2_DIF_EXHASH;
 
 	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
 	dip->i_depth = y;
@@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	int error = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_di.di_size) {
+	if (hsize * sizeof(u64) != dip->i_disksize) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
@@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 
 	buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
 
-	for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
+	for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
 		error = gfs2_dir_read_data(dip, (char *)buf,
 					    block * sdp->sd_hash_bsize,
 					    sdp->sd_hash_bsize, 1);
@@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	unsigned depth = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_di.di_size) {
+	if (hsize * sizeof(u64) != dip->i_disksize) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
@@ -1426,10 +1426,10 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 	int copied = 0;
 	int error;
 
-	if (!dip->i_di.di_entries)
+	if (!dip->i_entries)
 		return 0;
 
-	if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+	if (dip->i_diskflags & GFS2_DIF_EXHASH)
 		return dir_e_read(inode, offset, opaque, filldir);
 
 	if (!gfs2_is_stuffed(dip)) {
@@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 			error = PTR_ERR(dent);
 			goto out;
 		}
-		if (dip->i_di.di_entries != g.offset) {
+		if (dip->i_entries != g.offset) {
 			fs_warn(sdp, "Number of entries corrupt in dir %llu, "
-				"ip->i_di.di_entries (%u) != g.offset (%u)\n",
+				"ip->i_entries (%u) != g.offset (%u)\n",
 				(unsigned long long)dip->i_no_addr,
-				dip->i_di.di_entries,
+				dip->i_entries,
 				g.offset);
 			error = -EIO;
 			goto out;
 		}
 		error = do_filldir_main(dip, offset, opaque, filldir, darr,
-					dip->i_di.di_entries, &copied);
+					dip->i_entries, &copied);
 out:
 		kfree(darr);
 	}
@@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			dent = gfs2_init_dirent(inode, dent, name, bh);
 			gfs2_inum_out(nip, dent);
 			dent->de_type = cpu_to_be16(type);
-			if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+			if (ip->i_diskflags & GFS2_DIF_EXHASH) {
 				leaf = (struct gfs2_leaf *)bh->b_data;
 				be16_add_cpu(&leaf->lf_entries, 1);
 			}
@@ -1621,14 +1621,14 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			if (error)
 				break;
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
-			ip->i_di.di_entries++;
+			ip->i_entries++;
 			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 			gfs2_dinode_out(ip, bh->b_data);
 			brelse(bh);
 			error = 0;
 			break;
 		}
-		if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+		if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
 			error = dir_make_exhash(inode);
 			if (error)
 				break;
@@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
 	}
 
 	dirent_del(dip, bh, prev, dent);
-	if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+	if (dip->i_diskflags & GFS2_DIF_EXHASH) {
 		struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
 		u16 entries = be16_to_cpu(leaf->lf_entries);
 		if (!entries)
@@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
 	if (error)
 		return error;
 
-	if (!dip->i_di.di_entries)
+	if (!dip->i_entries)
 		gfs2_consist_inode(dip);
 	gfs2_trans_add_bh(dip->i_gl, bh, 1);
-	dip->i_di.di_entries--;
+	dip->i_entries--;
 	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_dinode_out(dip, bh->b_data);
 	brelse(bh);
@@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 	gfs2_inum_out(nip, dent);
 	dent->de_type = cpu_to_be16(new_type);
 
-	if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+	if (dip->i_diskflags & GFS2_DIF_EXHASH) {
 		brelse(bh);
 		error = gfs2_meta_inode_buffer(dip, &bh);
 		if (error)
@@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
 	int error = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_di.di_size) {
+	if (hsize * sizeof(u64) != dip->i_disksize) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 8a468cac9328..4f919440c3be 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -11,6 +11,7 @@
 #define __DIR_DOT_H__
 
 #include <linux/dcache.h>
+#include <linux/crc32.h>
 
 struct inode;
 struct gfs2_inode;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index e3f76f451b0a..0d1c76d906ae 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -114,11 +114,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 	__be64 *eablk, *end;
 	int error;
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
 	if (error)
 		return error;
 
-	if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
 		error = ea_foreach_i(ip, bh, ea_call, data);
 		goto out;
 	}
@@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	if (error)
 		return error;
 
-	if (ip->i_di.di_eattr) {
+	if (ip->i_eattr) {
 		struct ea_list ei = { .ei_er = er, .ei_size = 0 };
 
 		error = ea_foreach(ip, ea_list_i, &ei);
@@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	struct gfs2_ea_location el;
 	int error;
 
-	if (!ip->i_di.di_eattr)
+	if (!ip->i_eattr)
 		return -ENODATA;
 
 	error = gfs2_ea_find(ip, er, &el);
@@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (error)
 		return error;
 
-	ip->i_di.di_eattr = bh->b_blocknr;
+	ip->i_eattr = bh->b_blocknr;
 	error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
 
 	brelse(bh);
@@ -935,10 +935,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	int error;
 	int mh_size = sizeof(struct gfs2_meta_header);
 
-	if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
 		__be64 *end;
 
-		error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
+		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
 				       &indbh);
 		if (error)
 			return error;
@@ -972,9 +972,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		gfs2_buffer_clear_tail(indbh, mh_size);
 
 		eablk = (__be64 *)(indbh->b_data + mh_size);
-		*eablk = cpu_to_be64(ip->i_di.di_eattr);
-		ip->i_di.di_eattr = blk;
-		ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
+		*eablk = cpu_to_be64(ip->i_eattr);
+		ip->i_eattr = blk;
+		ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
 		gfs2_add_inode_blocks(&ip->i_inode, 1);
 
 		eablk++;
@@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (error)
 		return error;
 
-	if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
 		blks++;
 	if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
 		blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
@@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	struct gfs2_ea_location el;
 	int error;
 
-	if (!ip->i_di.di_eattr) {
+	if (!ip->i_eattr) {
 		if (er->er_flags & XATTR_REPLACE)
 			return -ENODATA;
 		return ea_init(ip, er);
@@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 		return error;
 
 	if (el.el_ea) {
-		if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
+		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
 			brelse(el.el_bh);
 			return -EPERM;
 		}
@@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	struct gfs2_ea_location el;
 	int error;
 
-	if (!ip->i_di.di_eattr)
+	if (!ip->i_eattr)
 		return -ENODATA;
 
 	error = gfs2_ea_find(ip, er, &el);
@@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
 	if (error)
 		return error;
 
@@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 	if (bstart)
 		gfs2_free_meta(ip, bstart, blen);
 
-	ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
+	ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
@@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
 	struct buffer_head *dibh;
 	int error;
 
-	rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
 	if (!rgd) {
 		gfs2_consist_inode(ip);
 		return -EIO;
@@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
 	if (error)
 		goto out_gunlock;
 
-	gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
+	gfs2_free_meta(ip, ip->i_eattr, 1);
 
-	ip->i_di.di_eattr = 0;
+	ip->i_eattr = 0;
 	gfs2_add_inode_blocks(&ip->i_inode, -1);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
 	if (error)
 		goto out_rindex;
 
-	if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
 		error = ea_dealloc_indirect(ip);
 		if (error)
 			goto out_rindex;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c962283d4e7f..6b983aef785d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -40,6 +40,7 @@
 #include "quota.h"
 #include "super.h"
 #include "util.h"
+#include "bmap.h"
 
 struct gfs2_gl_hash_bucket {
         struct hlist_head hb_list;
@@ -61,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
 
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
-static struct task_struct *scand_process;
-static unsigned int scand_secs = 5;
 static struct workqueue_struct *glock_workqueue;
+static LIST_HEAD(lru_list);
+static atomic_t lru_count = ATOMIC_INIT(0);
+static DEFINE_SPINLOCK(lru_lock);
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
@@ -174,6 +176,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
 }
 
 /**
+ * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * @gl: the glock
+ *
+ */
+
+static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+	spin_lock(&lru_lock);
+	if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
+		list_add_tail(&gl->gl_lru, &lru_list);
+		atomic_inc(&lru_count);
+	}
+	spin_unlock(&lru_lock);
+}
+
+/**
  * gfs2_glock_put() - Decrement reference count on glock
  * @gl: The glock to put
  *
@@ -187,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 	if (atomic_dec_and_test(&gl->gl_ref)) {
 		hlist_del(&gl->gl_list);
 		write_unlock(gl_lock_addr(gl->gl_hash));
+		spin_lock(&lru_lock);
+		if (!list_empty(&gl->gl_lru)) {
+			list_del_init(&gl->gl_lru);
+			atomic_dec(&lru_count);
+		}
+		spin_unlock(&lru_lock);
 		GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
-		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
+		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
 		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
 		glock_free(gl);
 		rv = 1;
 		goto out;
 	}
 	write_unlock(gl_lock_addr(gl->gl_hash));
+	/* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
+	if (atomic_read(&gl->gl_ref) == 2)
+		gfs2_glock_schedule_for_reclaim(gl);
 out:
 	return rv;
 }
@@ -289,10 +316,13 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
  * do_promote - promote as many requests as possible on the current queue
  * @gl: The glock
  * 
- * Returns: true if there is a blocked holder at the head of the list
+ * Returns: 1 if there is a blocked holder at the head of the list, or 2
+ *          if a type specific operation is underway.
  */
 
 static int do_promote(struct gfs2_glock *gl)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_holder *gh, *tmp;
@@ -310,6 +340,8 @@ restart:
 				ret = glops->go_lock(gh);
 				spin_lock(&gl->gl_spin);
 				if (ret) {
+					if (ret == 1)
+						return 2;
 					gh->gh_error = ret;
 					list_del_init(&gh->gh_list);
 					gfs2_holder_wake(gh);
@@ -414,6 +446,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_holder *gh;
 	unsigned state = ret & LM_OUT_ST_MASK;
+	int rv;
 
 	spin_lock(&gl->gl_spin);
 	state_change(gl, state);
@@ -468,7 +501,6 @@ retry:
 		gfs2_demote_wake(gl);
 	if (state != LM_ST_UNLOCKED) {
 		if (glops->go_xmote_bh) {
-			int rv;
 			spin_unlock(&gl->gl_spin);
 			rv = glops->go_xmote_bh(gl, gh);
 			if (rv == -EAGAIN)
@@ -479,10 +511,13 @@ retry:
 				goto out;
 			}
 		}
-		do_promote(gl);
+		rv = do_promote(gl);
+		if (rv == 2)
+			goto out_locked;
 	}
 out:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
+out_locked:
 	spin_unlock(&gl->gl_spin);
 	gfs2_glock_put(gl);
 }
@@ -511,6 +546,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
  */
 
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -576,8 +613,11 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
  */
 
 static void run_queue(struct gfs2_glock *gl, const int nonblock)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
 	struct gfs2_holder *gh = NULL;
+	int ret;
 
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
 		return;
@@ -596,8 +636,11 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock)
 	} else {
 		if (test_bit(GLF_DEMOTE, &gl->gl_flags))
 			gfs2_demote_wake(gl);
-		if (do_promote(gl) == 0)
+		ret = do_promote(gl);
+		if (ret == 0)
 			goto out;
+		if (ret == 2)
+			return;
 		gh = find_first_waiter(gl);
 		gl->gl_target = gh->gh_state;
 		if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
@@ -820,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl)
  */
 
 static void handle_callback(struct gfs2_glock *gl, unsigned int state,
-			    int remote, unsigned long delay)
+			    unsigned long delay)
 {
 	int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
 
@@ -828,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 	if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
 		gl->gl_demote_state = state;
 		gl->gl_demote_time = jiffies;
-		if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
-		    gl->gl_object)
-			gfs2_glock_schedule_for_reclaim(gl);
 	} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
 			gl->gl_demote_state != state) {
 		gl->gl_demote_state = LM_ST_UNLOCKED;
@@ -877,6 +917,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
  */
 
 static inline void add_to_queue(struct gfs2_holder *gh)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
 	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -998,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 
 	spin_lock(&gl->gl_spin);
 	if (gh->gh_flags & GL_NOCACHE)
-		handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+		handle_callback(gl, LM_ST_UNLOCKED, 0);
 
 	list_del_init(&gh->gh_list);
 	if (find_first_holder(gl) == NULL) {
@@ -1269,12 +1311,26 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
 		delay = gl->gl_ops->go_min_hold_time;
 
 	spin_lock(&gl->gl_spin);
-	handle_callback(gl, state, 1, delay);
+	handle_callback(gl, state, delay);
 	spin_unlock(&gl->gl_spin);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
 		gfs2_glock_put(gl);
 }
 
+static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+{
+	struct gfs2_jdesc *jd;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (jd->jd_jid != jid)
+			continue;
+		jd->jd_dirty = 1;
+		break;
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+}
+
 /**
  * gfs2_glock_cb - Callback used by locking module
  * @sdp: Pointer to the superblock
@@ -1338,80 +1394,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
  * Returns: 1 if it's ok
  */
 
-static int demote_ok(struct gfs2_glock *gl)
+static int demote_ok(const struct gfs2_glock *gl)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
-	int demote = 1;
-
-	if (test_bit(GLF_STICKY, &gl->gl_flags))
-		demote = 0;
-	else if (glops->go_demote_ok)
-		demote = glops->go_demote_ok(gl);
-
-	return demote;
-}
-
-/**
- * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
- * @gl: the glock
- *
- */
-
-void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
-{
-	struct gfs2_sbd *sdp = gl->gl_sbd;
 
-	spin_lock(&sdp->sd_reclaim_lock);
-	if (list_empty(&gl->gl_reclaim)) {
-		gfs2_glock_hold(gl);
-		list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
-		atomic_inc(&sdp->sd_reclaim_count);
-		spin_unlock(&sdp->sd_reclaim_lock);
-		wake_up(&sdp->sd_reclaim_wq);
-	} else
-		spin_unlock(&sdp->sd_reclaim_lock);
+	if (gl->gl_state == LM_ST_UNLOCKED)
+		return 0;
+	if (!list_empty(&gl->gl_holders))
+		return 0;
+	if (glops->go_demote_ok)
+		return glops->go_demote_ok(gl);
+	return 1;
 }
 
-/**
- * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
- * @sdp: the filesystem
- *
- * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
- * different glock and we notice that there are a lot of glocks in the
- * reclaim list.
- *
- */
 
-void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
+static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 {
 	struct gfs2_glock *gl;
-	int done_callback = 0;
+	int may_demote;
+	int nr_skipped = 0;
+	int got_ref = 0;
+	LIST_HEAD(skipped);
 
-	spin_lock(&sdp->sd_reclaim_lock);
-	if (list_empty(&sdp->sd_reclaim_list)) {
-		spin_unlock(&sdp->sd_reclaim_lock);
-		return;
-	}
-	gl = list_entry(sdp->sd_reclaim_list.next,
-			struct gfs2_glock, gl_reclaim);
-	list_del_init(&gl->gl_reclaim);
-	spin_unlock(&sdp->sd_reclaim_lock);
+	if (nr == 0)
+		goto out;
 
-	atomic_dec(&sdp->sd_reclaim_count);
-	atomic_inc(&sdp->sd_reclaimed);
+	if (!(gfp_mask & __GFP_FS))
+		return -1;
 
-	spin_lock(&gl->gl_spin);
-	if (find_first_holder(gl) == NULL &&
-	    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
-		handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
-		done_callback = 1;
+	spin_lock(&lru_lock);
+	while(nr && !list_empty(&lru_list)) {
+		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
+		list_del_init(&gl->gl_lru);
+		atomic_dec(&lru_count);
+
+		/* Test for being demotable */
+		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+			gfs2_glock_hold(gl);
+			got_ref = 1;
+			spin_unlock(&lru_lock);
+			spin_lock(&gl->gl_spin);
+			may_demote = demote_ok(gl);
+			spin_unlock(&gl->gl_spin);
+			clear_bit(GLF_LOCK, &gl->gl_flags);
+			if (may_demote) {
+				handle_callback(gl, LM_ST_UNLOCKED, 0);
+				nr--;
+				if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+					gfs2_glock_put(gl);
+			}
+			spin_lock(&lru_lock);
+			if (may_demote)
+				continue;
+		}
+		if (list_empty(&gl->gl_lru) &&
+		    (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
+			nr_skipped++;
+			list_add(&gl->gl_lru, &skipped);
+		}
+		if (got_ref) {
+			spin_unlock(&lru_lock);
+			gfs2_glock_put(gl);
+			spin_lock(&lru_lock);
+			got_ref = 0;
+		}
 	}
-	spin_unlock(&gl->gl_spin);
-	if (!done_callback ||
-	    queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-		gfs2_glock_put(gl);
+	list_splice(&skipped, &lru_list);
+	atomic_add(nr_skipped, &lru_count);
+	spin_unlock(&lru_lock);
+out:
+	return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
 }
 
+static struct shrinker glock_shrinker = {
+	.shrink = gfs2_shrink_glock_memory,
+	.seeks = DEFAULT_SEEKS,
+};
+
 /**
  * examine_bucket - Call a function for glock in a hash bucket
  * @examiner: the function
@@ -1457,26 +1516,6 @@ out:
 }
 
 /**
- * scan_glock - look at a glock and see if we can reclaim it
- * @gl: the glock to look at
- *
- */
-
-static void scan_glock(struct gfs2_glock *gl)
-{
-	if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
-		return;
-	if (test_bit(GLF_LOCK, &gl->gl_flags))
-		return;
-
-	spin_lock(&gl->gl_spin);
-	if (find_first_holder(gl) == NULL &&
-	    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
-		gfs2_glock_schedule_for_reclaim(gl);
-	spin_unlock(&gl->gl_spin);
-}
-
-/**
  * clear_glock - look at a glock and see if we can free it from glock cache
  * @gl: the glock to look at
  *
@@ -1484,23 +1523,16 @@ static void scan_glock(struct gfs2_glock *gl)
 
 static void clear_glock(struct gfs2_glock *gl)
 {
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-	int released;
-
-	spin_lock(&sdp->sd_reclaim_lock);
-	if (!list_empty(&gl->gl_reclaim)) {
-		list_del_init(&gl->gl_reclaim);
-		atomic_dec(&sdp->sd_reclaim_count);
-		spin_unlock(&sdp->sd_reclaim_lock);
-		released = gfs2_glock_put(gl);
-		gfs2_assert(sdp, !released);
-	} else {
-		spin_unlock(&sdp->sd_reclaim_lock);
+	spin_lock(&lru_lock);
+	if (!list_empty(&gl->gl_lru)) {
+		list_del_init(&gl->gl_lru);
+		atomic_dec(&lru_count);
 	}
+	spin_unlock(&lru_lock);
 
 	spin_lock(&gl->gl_spin);
 	if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
-		handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+		handle_callback(gl, LM_ST_UNLOCKED, 0);
 	spin_unlock(&gl->gl_spin);
 	gfs2_glock_hold(gl);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1548,6 +1580,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 	}
 }
 
+void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
+{
+	struct gfs2_glock *gl = ip->i_gl;
+	int ret;
+
+	ret = gfs2_truncatei_resume(ip);
+	gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
+
+	spin_lock(&gl->gl_spin);
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+	run_queue(gl, 1);
+	spin_unlock(&gl->gl_spin);
+}
+
 static const char *state2str(unsigned state)
 {
 	switch(state) {
@@ -1623,8 +1669,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
 	char *p = buf;
 	if (test_bit(GLF_LOCK, gflags))
 		*p++ = 'l';
-	if (test_bit(GLF_STICKY, gflags))
-		*p++ = 's';
 	if (test_bit(GLF_DEMOTE, gflags))
 		*p++ = 'D';
 	if (test_bit(GLF_PENDING_DEMOTE, gflags))
@@ -1743,34 +1787,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
 	return error;
 }
 
-/**
- * gfs2_scand - Look for cached glocks and inodes to toss from memory
- * @sdp: Pointer to GFS2 superblock
- *
- * One of these daemons runs, finding candidates to add to sd_reclaim_list.
- * See gfs2_glockd()
- */
-
-static int gfs2_scand(void *data)
-{
-	unsigned x;
-	unsigned delay;
-
-	while (!kthread_should_stop()) {
-		for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
-			examine_bucket(scan_glock, NULL, x);
-		if (freezing(current))
-			refrigerator();
-		delay = scand_secs;
-		if (delay < 1)
-			delay = 1;
-		schedule_timeout_interruptible(delay * HZ);
-	}
-
-	return 0;
-}
-
-
 
 int __init gfs2_glock_init(void)
 {
@@ -1784,28 +1800,21 @@ int __init gfs2_glock_init(void)
 	}
 #endif
 
-	scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
-	if (IS_ERR(scand_process))
-		return PTR_ERR(scand_process);
-
 	glock_workqueue = create_workqueue("glock_workqueue");
-	if (IS_ERR(glock_workqueue)) {
-		kthread_stop(scand_process);
+	if (IS_ERR(glock_workqueue))
 		return PTR_ERR(glock_workqueue);
-	}
+
+	register_shrinker(&glock_shrinker);
 
 	return 0;
 }
 
 void gfs2_glock_exit(void)
 {
+	unregister_shrinker(&glock_shrinker);
 	destroy_workqueue(glock_workqueue);
-	kthread_stop(scand_process);
 }
 
-module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
-
 static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 {
 	struct gfs2_glock *gl;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 695c6b193611..543ec7ecfbda 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -129,9 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
 void gfs2_lvb_unhold(struct gfs2_glock *gl);
 
 void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
-void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
 
 int __init gfs2_glock_init(void);
 void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c6c318c2a0f6..8522d3aa64fc 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
  * Returns: 1 if it's ok
  */
 
-static int inode_go_demote_ok(struct gfs2_glock *gl)
+static int inode_go_demote_ok(const struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	int demote = 0;
-
-	if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
-		demote = 1;
-	else if (!sdp->sd_args.ar_localcaching &&
-		 time_after_eq(jiffies, gl->gl_stamp +
-			       gfs2_tune_get(sdp, gt_demote_secs) * HZ))
-		demote = 1;
-
-	return demote;
+	if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
+		return 0;
+	return 1;
 }
 
 /**
@@ -227,6 +220,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl)
 static int inode_go_lock(struct gfs2_holder *gh)
 {
 	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct gfs2_inode *ip = gl->gl_object;
 	int error = 0;
 
@@ -239,10 +233,16 @@ static int inode_go_lock(struct gfs2_holder *gh)
 			return error;
 	}
 
-	if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
+	if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
 	    (gl->gl_state == LM_ST_EXCLUSIVE) &&
-	    (gh->gh_state == LM_ST_EXCLUSIVE))
-		error = gfs2_truncatei_resume(ip);
+	    (gh->gh_state == LM_ST_EXCLUSIVE)) {
+		spin_lock(&sdp->sd_trunc_lock);
+		if (list_empty(&ip->i_trunc_list))
+			list_add(&sdp->sd_trunc_list, &ip->i_trunc_list);
+		spin_unlock(&sdp->sd_trunc_lock);
+		wake_up(&sdp->sd_quota_wait);
+		return 1;
+	}
 
 	return error;
 }
@@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	const struct gfs2_inode *ip = gl->gl_object;
 	if (ip == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
+	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
 		  (unsigned long long)ip->i_no_formal_ino,
 		  (unsigned long long)ip->i_no_addr,
-		  IF2DT(ip->i_inode.i_mode), ip->i_flags);
+		  IF2DT(ip->i_inode.i_mode), ip->i_flags,
+		  (unsigned int)ip->i_diskflags,
+		  (unsigned long long)ip->i_inode.i_size,
+		  (unsigned long long)ip->i_disksize);
 	return 0;
 }
 
@@ -274,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
  * Returns: 1 if it's ok
  */
 
-static int rgrp_go_demote_ok(struct gfs2_glock *gl)
+static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
 {
 	return !gl->gl_aspace->i_mapping->nrpages;
 }
@@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	const struct gfs2_rgrpd *rgd = gl->gl_object;
 	if (rgd == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
+	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
+		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
 	return 0;
 }
 
@@ -374,13 +379,25 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
 }
 
 /**
+ * trans_go_demote_ok
+ * @gl: the glock
+ *
+ * Always returns 0
+ */
+
+static int trans_go_demote_ok(const struct gfs2_glock *gl)
+{
+	return 0;
+}
+
+/**
  * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
  * @gl: the glock
  *
  * Returns: 1 if it's ok
  */
 
-static int quota_go_demote_ok(struct gfs2_glock *gl)
+static int quota_go_demote_ok(const struct gfs2_glock *gl)
 {
 	return !atomic_read(&gl->gl_lvb_count);
 }
@@ -414,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 const struct gfs2_glock_operations gfs2_trans_glops = {
 	.go_xmote_th = trans_go_sync,
 	.go_xmote_bh = trans_go_xmote_bh,
+	.go_demote_ok = trans_go_demote_ok,
 	.go_type = LM_TYPE_NONDISK,
 };
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f566ec1b4e8e..608849d00021 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -68,12 +68,6 @@ struct gfs2_bitmap {
 	u32 bi_len;
 };
 
-struct gfs2_rgrp_host {
-	u32 rg_free;
-	u32 rg_dinodes;
-	u64 rg_igeneration;
-};
-
 struct gfs2_rgrpd {
 	struct list_head rd_list;	/* Link with superblock */
 	struct list_head rd_list_mru;
@@ -83,14 +77,16 @@ struct gfs2_rgrpd {
 	u32 rd_length;			/* length of rgrp header in fs blocks */
 	u32 rd_data;			/* num of data blocks in rgrp */
 	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
-	struct gfs2_rgrp_host rd_rg;
+	u32 rd_free;
+	u32 rd_free_clone;
+	u32 rd_dinodes;
+	u64 rd_igeneration;
 	struct gfs2_bitmap *rd_bits;
-	unsigned int rd_bh_count;
 	struct mutex rd_mutex;
-	u32 rd_free_clone;
 	struct gfs2_log_element rd_le;
-	u32 rd_last_alloc;
 	struct gfs2_sbd *rd_sbd;
+	unsigned int rd_bh_count;
+	u32 rd_last_alloc;
 	unsigned char rd_flags;
 #define GFS2_RDF_CHECK        0x01      /* Need to check for unlinked inodes */
 #define GFS2_RDF_NOALLOC      0x02      /* rg prohibits allocation */
@@ -129,7 +125,7 @@ struct gfs2_glock_operations {
 	void (*go_xmote_th) (struct gfs2_glock *gl);
 	int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
 	void (*go_inval) (struct gfs2_glock *gl, int flags);
-	int (*go_demote_ok) (struct gfs2_glock *gl);
+	int (*go_demote_ok) (const struct gfs2_glock *gl);
 	int (*go_lock) (struct gfs2_holder *gh);
 	void (*go_unlock) (struct gfs2_holder *gh);
 	int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
@@ -159,7 +155,6 @@ struct gfs2_holder {
 
 enum {
 	GLF_LOCK			= 1,
-	GLF_STICKY			= 2,
 	GLF_DEMOTE			= 3,
 	GLF_PENDING_DEMOTE		= 4,
 	GLF_DEMOTE_IN_PROGRESS		= 5,
@@ -194,7 +189,7 @@ struct gfs2_glock {
 	unsigned long gl_tchange;
 	void *gl_object;
 
-	struct list_head gl_reclaim;
+	struct list_head gl_lru;
 
 	struct gfs2_sbd *gl_sbd;
 
@@ -233,29 +228,24 @@ enum {
 	GIF_USER                = 4, /* user inode, not metadata addr space */
 };
 
-struct gfs2_dinode_host {
-	u64 di_size;		/* number of bytes in file */
-	u64 di_generation;	/* generation number for NFS */
-	u32 di_flags;		/* GFS2_DIF_... */
-	/* These only apply to directories  */
-	u32 di_entries;		/* The number of entries in the directory */
-	u64 di_eattr;		/* extended attribute block number */
-};
 
 struct gfs2_inode {
 	struct inode i_inode;
 	u64 i_no_addr;
 	u64 i_no_formal_ino;
+	u64 i_generation;
+	u64 i_eattr;
+	loff_t i_disksize;
 	unsigned long i_flags;		/* GIF_... */
-
-	struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
-
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
 	struct gfs2_alloc *i_alloc;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
+	struct list_head i_trunc_list;
+	u32 i_entries;
+	u32 i_diskflags;
 	u8 i_height;
 	u8 i_depth;
 };
@@ -406,13 +396,11 @@ struct gfs2_args {
 struct gfs2_tune {
 	spinlock_t gt_spin;
 
-	unsigned int gt_demote_secs; /* Cache retention for unheld glock */
 	unsigned int gt_incore_log_blocks;
 	unsigned int gt_log_flush_secs;
 
 	unsigned int gt_recoverd_secs;
 	unsigned int gt_logd_secs;
-	unsigned int gt_quotad_secs;
 
 	unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
 	unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
@@ -488,10 +476,6 @@ struct gfs2_sbd {
 	/* Lock Stuff */
 
 	struct lm_lockstruct sd_lockstruct;
-	struct list_head sd_reclaim_list;
-	spinlock_t sd_reclaim_lock;
-	wait_queue_head_t sd_reclaim_wq;
-	atomic_t sd_reclaim_count;
 	struct gfs2_holder sd_live_gh;
 	struct gfs2_glock *sd_rename_gl;
 	struct gfs2_glock *sd_trans_gl;
@@ -519,7 +503,6 @@ struct gfs2_sbd {
 	spinlock_t sd_statfs_spin;
 	struct gfs2_statfs_change_host sd_statfs_master;
 	struct gfs2_statfs_change_host sd_statfs_local;
-	unsigned long sd_statfs_sync_time;
 
 	/* Resource group stuff */
 
@@ -552,8 +535,6 @@ struct gfs2_sbd {
 	struct task_struct *sd_recoverd_process;
 	struct task_struct *sd_logd_process;
 	struct task_struct *sd_quotad_process;
-	struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
-	unsigned int sd_glockd_num;
 
 	/* Quota stuff */
 
@@ -561,13 +542,15 @@ struct gfs2_sbd {
 	atomic_t sd_quota_count;
 	spinlock_t sd_quota_spin;
 	struct mutex sd_quota_mutex;
+	wait_queue_head_t sd_quota_wait;
+	struct list_head sd_trunc_list;
+	spinlock_t sd_trunc_lock;
 
 	unsigned int sd_quota_slots;
 	unsigned int sd_quota_chunks;
 	unsigned char **sd_quota_bitmap;
 
 	u64 sd_quota_sync_gen;
-	unsigned long sd_quota_sync_time;
 
 	/* Log stuff */
 
@@ -624,10 +607,6 @@ struct gfs2_sbd {
 	struct mutex sd_freeze_lock;
 	unsigned int sd_freeze_count;
 
-	/* Counters */
-
-	atomic_t sd_reclaimed;
-
 	char sd_fsname[GFS2_FSNAME_LEN];
 	char sd_table_name[GFS2_FSNAME_LEN];
 	char sd_proto_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d57616840e89..3b87c188da41 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -32,7 +32,6 @@
 #include "log.h"
 #include "meta_io.h"
 #include "ops_address.h"
-#include "ops_inode.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
@@ -248,7 +247,6 @@ fail:
 
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
-	struct gfs2_dinode_host *di = &ip->i_di;
 	const struct gfs2_dinode *str = buf;
 	struct timespec atime;
 	u16 height, depth;
@@ -274,8 +272,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	 * to do that.
 	 */
 	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
-	di->di_size = be64_to_cpu(str->di_size);
-	i_size_write(&ip->i_inode, di->di_size);
+	ip->i_disksize = be64_to_cpu(str->di_size);
+	i_size_write(&ip->i_inode, ip->i_disksize);
 	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
 	atime.tv_sec = be64_to_cpu(str->di_atime);
 	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -287,9 +285,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
 
 	ip->i_goal = be64_to_cpu(str->di_goal_meta);
-	di->di_generation = be64_to_cpu(str->di_generation);
+	ip->i_generation = be64_to_cpu(str->di_generation);
 
-	di->di_flags = be32_to_cpu(str->di_flags);
+	ip->i_diskflags = be32_to_cpu(str->di_flags);
 	gfs2_set_inode_flags(&ip->i_inode);
 	height = be16_to_cpu(str->di_height);
 	if (unlikely(height > GFS2_MAX_META_HEIGHT))
@@ -300,9 +298,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
 		goto corrupt;
 	ip->i_depth = (u8)depth;
-	di->di_entries = be32_to_cpu(str->di_entries);
+	ip->i_entries = be32_to_cpu(str->di_entries);
 
-	di->di_eattr = be64_to_cpu(str->di_eattr);
+	ip->i_eattr = be64_to_cpu(str->di_eattr);
 	if (S_ISREG(ip->i_inode.i_mode))
 		gfs2_set_aops(&ip->i_inode);
 
@@ -388,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 	gfs2_free_di(rgd, ip);
 
 	gfs2_trans_end(sdp);
-	clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
 
 out_rg_gunlock:
 	gfs2_glock_dq_uninit(&al->al_rgd_gh);
@@ -690,7 +687,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 		return error;
 	}
 
-	if (dip->i_di.di_entries == (u32)-1)
+	if (dip->i_entries == (u32)-1)
 		return -EFBIG;
 	if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
 		return -EMLINK;
@@ -790,11 +787,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	di->di_flags = 0;
 
 	if (S_ISREG(mode)) {
-		if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
+		if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
 		    gfs2_tune_get(sdp, gt_new_files_jdata))
 			di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
 	} else if (S_ISDIR(mode)) {
-		di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
+		di->di_flags |= cpu_to_be32(dip->i_diskflags &
 					    GFS2_DIF_INHERIT_JDATA);
 	}
 
@@ -1068,7 +1065,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 	struct qstr dotname;
 	int error;
 
-	if (ip->i_di.di_entries != 2) {
+	if (ip->i_entries != 2) {
 		if (gfs2_consist_inode(ip))
 			gfs2_dinode_print(ip);
 		return -EIO;
@@ -1168,7 +1165,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
 		return error;
 	}
 
-	if (!ip->i_di.di_size) {
+	if (!ip->i_disksize) {
 		gfs2_consist_inode(ip);
 		error = -EIO;
 		goto out;
@@ -1178,7 +1175,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
 	if (error)
 		goto out;
 
-	x = ip->i_di.di_size + 1;
+	x = ip->i_disksize + 1;
 	if (x > *len) {
 		*buf = kmalloc(x, GFP_NOFS);
 		if (!*buf) {
@@ -1242,7 +1239,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 {
-	const struct gfs2_dinode_host *di = &ip->i_di;
 	struct gfs2_dinode *str = buf;
 
 	str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -1256,7 +1252,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
 	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
 	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
-	str->di_size = cpu_to_be64(di->di_size);
+	str->di_size = cpu_to_be64(ip->i_disksize);
 	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
 	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1264,17 +1260,17 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 
 	str->di_goal_meta = cpu_to_be64(ip->i_goal);
 	str->di_goal_data = cpu_to_be64(ip->i_goal);
-	str->di_generation = cpu_to_be64(di->di_generation);
+	str->di_generation = cpu_to_be64(ip->i_generation);
 
-	str->di_flags = cpu_to_be32(di->di_flags);
+	str->di_flags = cpu_to_be32(ip->i_diskflags);
 	str->di_height = cpu_to_be16(ip->i_height);
 	str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
-					     !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
+					     !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
 					     GFS2_FORMAT_DE : 0);
 	str->di_depth = cpu_to_be16(ip->i_depth);
-	str->di_entries = cpu_to_be32(di->di_entries);
+	str->di_entries = cpu_to_be32(ip->i_entries);
 
-	str->di_eattr = cpu_to_be64(di->di_eattr);
+	str->di_eattr = cpu_to_be64(ip->i_eattr);
 	str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
 	str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
 	str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
@@ -1282,22 +1278,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 
 void gfs2_dinode_print(const struct gfs2_inode *ip)
 {
-	const struct gfs2_dinode_host *di = &ip->i_di;
-
 	printk(KERN_INFO "  no_formal_ino = %llu\n",
 	       (unsigned long long)ip->i_no_formal_ino);
 	printk(KERN_INFO "  no_addr = %llu\n",
 	       (unsigned long long)ip->i_no_addr);
-	printk(KERN_INFO "  di_size = %llu\n", (unsigned long long)di->di_size);
+	printk(KERN_INFO "  i_disksize = %llu\n",
+	       (unsigned long long)ip->i_disksize);
 	printk(KERN_INFO "  blocks = %llu\n",
 	       (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
 	printk(KERN_INFO "  i_goal = %llu\n",
 	       (unsigned long long)ip->i_goal);
-	printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
+	printk(KERN_INFO "  i_diskflags = 0x%.8X\n", ip->i_diskflags);
 	printk(KERN_INFO "  i_height = %u\n", ip->i_height);
 	printk(KERN_INFO "  i_depth = %u\n", ip->i_depth);
-	printk(KERN_INFO "  di_entries = %u\n", di->di_entries);
-	printk(KERN_INFO "  di_eattr = %llu\n",
-	       (unsigned long long)di->di_eattr);
+	printk(KERN_INFO "  i_entries = %u\n", ip->i_entries);
+	printk(KERN_INFO "  i_eattr = %llu\n",
+	       (unsigned long long)ip->i_eattr);
 }
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 2d43f69610a0..d5329364cdff 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,6 +10,7 @@
 #ifndef __INODE_DOT_H__
 #define __INODE_DOT_H__
 
+#include <linux/fs.h>
 #include "util.h"
 
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -19,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
 
 static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
 {
-	return ip->i_di.di_flags & GFS2_DIF_JDATA;
+	return ip->i_diskflags & GFS2_DIF_JDATA;
 }
 
 static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
@@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
 void gfs2_dinode_print(const struct gfs2_inode *ip);
 
+extern const struct inode_operations gfs2_file_iops;
+extern const struct inode_operations gfs2_dir_iops;
+extern const struct inode_operations gfs2_symlink_iops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
+
+extern void gfs2_set_inode_flags(struct inode *inode);
+
 #endif /* __INODE_DOT_H__ */
 
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 0c4cbe6c8285..1aa7eb6a0226 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -194,17 +194,25 @@ out:
 static void gdlm_recovery_done(void *lockspace, unsigned int jid,
                                unsigned int message)
 {
+	char env_jid[20];
+	char env_status[20];
+	char *envp[] = { env_jid, env_status, NULL };
 	struct gdlm_ls *ls = lockspace;
 	ls->recover_jid_done = jid;
 	ls->recover_jid_status = message;
-	kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+	sprintf(env_jid, "JID=%d", jid);
+	sprintf(env_status, "RECOVERY=%s",
+		message == LM_RD_SUCCESS ? "Done" : "Failed");
+	kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
 }
 
 static void gdlm_others_may_mount(void *lockspace)
 {
+	char *message = "FIRSTMOUNT=Done";
+	char *envp[] = { message, NULL };
 	struct gdlm_ls *ls = lockspace;
 	ls->first_done = 1;
-	kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+	kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
 }
 
 /* Userspace gets the offline uevent, blocks new gfs locks on
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4ec571c3d8a9..9b7edcf7bd49 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
 	kobject_put(&ls->kobj);
 }
 
+static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
+		       struct kobj_uevent_env *env)
+{
+        struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
+        add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
+        add_uevent_var(env, "LOCKPROTO=lock_dlm");
+        return 0;
+}
+
+static struct kset_uevent_ops gdlm_uevent_ops = {
+	.uevent = gdlm_uevent,
+};
+
+
 int gdlm_sysfs_init(void)
 {
-	gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
+	gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
 	if (!gdlm_kset) {
 		printk(KERN_WARNING "%s: can not create kset\n", __func__);
 		return -ENOMEM;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bb2cc303ac29..7cacfde32194 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -19,7 +19,7 @@
 
 #include "gfs2.h"
 #include "incore.h"
-#include "ops_fstype.h"
+#include "super.h"
 #include "sys.h"
 #include "util.h"
 #include "glock.h"
@@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo)
 
 	inode_init_once(&ip->i_inode);
 	init_rwsem(&ip->i_rw_mutex);
+	INIT_LIST_HEAD(&ip->i_trunc_list);
 	ip->i_alloc = NULL;
 }
 
@@ -42,7 +43,7 @@ static void gfs2_init_glock_once(void *foo)
 	INIT_LIST_HEAD(&gl->gl_holders);
 	gl->gl_lvb = NULL;
 	atomic_set(&gl->gl_lvb_count, 0);
-	INIT_LIST_HEAD(&gl->gl_reclaim);
+	INIT_LIST_HEAD(&gl->gl_lru);
 	INIT_LIST_HEAD(&gl->gl_ail_list);
 	atomic_set(&gl->gl_ail_count, 0);
 }
@@ -93,6 +94,12 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_rgrpd_cachep)
 		goto fail;
 
+	gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad",
+					       sizeof(struct gfs2_quota_data),
+					       0, 0, NULL);
+	if (!gfs2_quotad_cachep)
+		goto fail;
+
 	error = register_filesystem(&gfs2_fs_type);
 	if (error)
 		goto fail;
@@ -112,6 +119,9 @@ fail_unregister:
 fail:
 	gfs2_glock_exit();
 
+	if (gfs2_quotad_cachep)
+		kmem_cache_destroy(gfs2_quotad_cachep);
+
 	if (gfs2_rgrpd_cachep)
 		kmem_cache_destroy(gfs2_rgrpd_cachep);
 
@@ -140,6 +150,7 @@ static void __exit exit_gfs2_fs(void)
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
 
+	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
 	kmem_cache_destroy(gfs2_bufdata_cachep);
 	kmem_cache_destroy(gfs2_inode_cachep);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index f96eb90a2cfa..3cb0a44ba023 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -32,7 +32,6 @@ enum {
 	Opt_debug,
 	Opt_nodebug,
 	Opt_upgrade,
-	Opt_num_glockd,
 	Opt_acl,
 	Opt_noacl,
 	Opt_quota_off,
@@ -57,7 +56,6 @@ static const match_table_t tokens = {
 	{Opt_debug, "debug"},
 	{Opt_nodebug, "nodebug"},
 	{Opt_upgrade, "upgrade"},
-	{Opt_num_glockd, "num_glockd=%d"},
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
 	{Opt_quota_off, "quota=off"},
@@ -87,16 +85,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 	int error = 0;
 
 	if (!remount) {
-		/*  If someone preloaded options, use those instead  */
-		spin_lock(&gfs2_sys_margs_lock);
-		if (gfs2_sys_margs) {
-			data = gfs2_sys_margs;
-			gfs2_sys_margs = NULL;
-		}
-		spin_unlock(&gfs2_sys_margs_lock);
-
 		/*  Set some defaults  */
-		args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
 		args->ar_quota = GFS2_QUOTA_DEFAULT;
 		args->ar_data = GFS2_DATA_DEFAULT;
 	}
@@ -105,7 +94,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 	   process them */
 
 	for (options = data; (o = strsep(&options, ",")); ) {
-		int token, option;
+		int token;
 		substring_t tmp[MAX_OPT_ARGS];
 
 		if (!*o)
@@ -196,22 +185,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 				goto cant_remount;
 			args->ar_upgrade = 1;
 			break;
-		case Opt_num_glockd:
-			if ((error = match_int(&tmp[0], &option))) {
-				fs_info(sdp, "problem getting num_glockd\n");
-				goto out_error;
-			}
-
-			if (remount && option != args->ar_num_glockd)
-				goto cant_remount;
-			if (!option || option > GFS2_GLOCKD_MAX) {
-				fs_info(sdp, "0 < num_glockd <= %u  (not %u)\n",
-				        GFS2_GLOCKD_MAX, option);
-				error = -EINVAL;
-				goto out_error;
-			}
-			args->ar_num_glockd = option;
-			break;
 		case Opt_acl:
 			args->ar_posix_acl = 1;
 			sdp->sd_vfs->s_flags |= MS_POSIXACL;
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 15f710f2d4da..4ddab67867eb 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
 {
 	struct inode *inode = page->mapping->host;
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	int error;
+	int ret;
 	int done_trans = 0;
 
-	error = gfs2_writepage_common(page, wbc);
-	if (error <= 0)
-		return error;
-
 	if (PageChecked(page)) {
 		if (wbc->sync_mode != WB_SYNC_ALL)
 			goto out_ignore;
-		error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
-		if (error)
+		ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+		if (ret)
 			goto out_ignore;
 		done_trans = 1;
 	}
-	error = __gfs2_jdata_writepage(page, wbc);
+	ret = gfs2_writepage_common(page, wbc);
+	if (ret > 0)
+		ret = __gfs2_jdata_writepage(page, wbc);
 	if (done_trans)
 		gfs2_trans_end(sdp);
-	return error;
+	return ret;
 
 out_ignore:
 	redirty_page_for_writepage(wbc, page);
@@ -453,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 
 	kaddr = kmap_atomic(page, KM_USER0);
 	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-	       ip->i_di.di_size);
-	memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
+	       ip->i_disksize);
+	memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
 	kunmap_atomic(kaddr, KM_USER0);
 	flush_dcache_page(page);
 	brelse(dibh);
@@ -627,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 {
 	struct gfs2_inode *ip = GFS2_I(mapping->host);
 	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
-	unsigned int data_blocks, ind_blocks, rblocks;
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
 	int alloc_required;
 	int error = 0;
 	struct gfs2_alloc *al;
@@ -641,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(error))
 		goto out_uninit;
 
-	gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
 	error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
 	if (error)
 		goto out_unlock;
 
+	if (alloc_required || gfs2_is_jdata(ip))
+		gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
+
 	if (alloc_required) {
 		al = gfs2_alloc_get(ip);
 		if (!al) {
@@ -675,6 +675,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		goto out_trans_fail;
 
 	error = -ENOMEM;
+	flags |= AOP_FLAG_NOFS;
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	*pagep = page;
 	if (unlikely(!page))
@@ -782,7 +783,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 
 	if (inode->i_size < to) {
 		i_size_write(inode, to);
-		ip->i_di.di_size = inode->i_size;
+		ip->i_disksize = inode->i_size;
 		di->di_size = cpu_to_be64(inode->i_size);
 		mark_inode_dirty(inode);
 	}
@@ -847,9 +848,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
 
-	if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) {
+	if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
 		di = (struct gfs2_dinode *)dibh->b_data;
-		ip->i_di.di_size = inode->i_size;
+		ip->i_disksize = inode->i_size;
 		di->di_size = cpu_to_be64(inode->i_size);
 		mark_inode_dirty(inode);
 	}
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 4a5e676b4420..c2ad36330ca3 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -19,7 +19,7 @@
 #include "incore.h"
 #include "dir.h"
 #include "glock.h"
-#include "ops_dentry.h"
+#include "super.h"
 #include "util.h"
 #include "inode.h"
 
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
deleted file mode 100644
index 5caa3db4d3f5..000000000000
--- a/fs/gfs2/ops_dentry.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_DENTRY_DOT_H__
-#define __OPS_DENTRY_DOT_H__
-
-#include <linux/dcache.h>
-
-extern struct dentry_operations gfs2_dops;
-
-#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index bbb8c36403a9..7fdeb14ddd1a 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -22,8 +22,7 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "ops_dentry.h"
-#include "ops_fstype.h"
+#include "super.h"
 #include "rgrp.h"
 #include "util.h"
 
@@ -214,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	}
 
 	error = -EIO;
-	if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
+	if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
 		iput(inode);
 		goto fail;
 	}
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3a747f8e2188..93fe41b67f97 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -39,7 +39,6 @@
 #include "util.h"
 #include "eaops.h"
 #include "ops_address.h"
-#include "ops_inode.h"
 
 /**
  * gfs2_llseek - seek to a location in a file
@@ -158,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 	if (error)
 		return error;
 
-	fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
-	if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
+	fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
+	if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
 		fsflags |= FS_JOURNAL_DATA_FL;
 	if (put_user(fsflags, ptr))
 		error = -EFAULT;
@@ -172,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 void gfs2_set_inode_flags(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_dinode_host *di = &ip->i_di;
 	unsigned int flags = inode->i_flags;
 
 	flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
-	if (di->di_flags & GFS2_DIF_IMMUTABLE)
+	if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
 		flags |= S_IMMUTABLE;
-	if (di->di_flags & GFS2_DIF_APPENDONLY)
+	if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
 		flags |= S_APPEND;
-	if (di->di_flags & GFS2_DIF_NOATIME)
+	if (ip->i_diskflags & GFS2_DIF_NOATIME)
 		flags |= S_NOATIME;
-	if (di->di_flags & GFS2_DIF_SYNC)
+	if (ip->i_diskflags & GFS2_DIF_SYNC)
 		flags |= S_SYNC;
 	inode->i_flags = flags;
 }
@@ -221,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if (error)
 		goto out_drop_write;
 
-	flags = ip->i_di.di_flags;
+	flags = ip->i_diskflags;
 	new_flags = (flags & ~mask) | (reqflags & mask);
 	if ((new_flags ^ flags) == 0)
 		goto out;
@@ -260,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if (error)
 		goto out_trans_end;
 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-	ip->i_di.di_flags = new_flags;
+	ip->i_diskflags = new_flags;
 	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
 	gfs2_set_inode_flags(inode);
@@ -344,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	unsigned long last_index;
-	u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits);
+	u64 pos = page->index << PAGE_CACHE_SHIFT;
 	unsigned int data_blocks, ind_blocks, rblocks;
 	int alloc_required = 0;
 	struct gfs2_holder gh;
@@ -357,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		goto out;
 
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
-	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
 	ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
 	if (ret || !alloc_required)
 		goto out_unlock;
@@ -369,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	ret = gfs2_quota_lock_check(ip);
 	if (ret)
 		goto out_alloc_put;
+	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
 	al->al_requested = data_blocks + ind_blocks;
 	ret = gfs2_inplace_reserve(ip);
 	if (ret)
@@ -479,7 +477,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
 			goto fail;
 
 		if (!(file->f_flags & O_LARGEFILE) &&
-		    ip->i_di.di_size > MAX_NON_LFS) {
+		    ip->i_disksize > MAX_NON_LFS) {
 			error = -EOVERFLOW;
 			goto fail_gunlock;
 		}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b117fcf2c4f5..f91eebdde581 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -22,20 +22,18 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "bmap.h"
-#include "daemon.h"
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
 #include "mount.h"
-#include "ops_fstype.h"
-#include "ops_dentry.h"
-#include "ops_super.h"
 #include "recovery.h"
 #include "rgrp.h"
 #include "super.h"
 #include "sys.h"
 #include "util.h"
 #include "log.h"
+#include "quota.h"
+#include "dir.h"
 
 #define DO 0
 #define UNDO 1
@@ -58,12 +56,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 {
 	spin_lock_init(&gt->gt_spin);
 
-	gt->gt_demote_secs = 300;
 	gt->gt_incore_log_blocks = 1024;
 	gt->gt_log_flush_secs = 60;
 	gt->gt_recoverd_secs = 60;
 	gt->gt_logd_secs = 1;
-	gt->gt_quotad_secs = 5;
 	gt->gt_quota_simul_sync = 64;
 	gt->gt_quota_warn_period = 10;
 	gt->gt_quota_scale_num = 1;
@@ -91,10 +87,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	gfs2_tune_init(&sdp->sd_tune);
 
-	INIT_LIST_HEAD(&sdp->sd_reclaim_list);
-	spin_lock_init(&sdp->sd_reclaim_lock);
-	init_waitqueue_head(&sdp->sd_reclaim_wq);
-
 	mutex_init(&sdp->sd_inum_mutex);
 	spin_lock_init(&sdp->sd_statfs_spin);
 
@@ -110,6 +102,9 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	INIT_LIST_HEAD(&sdp->sd_quota_list);
 	spin_lock_init(&sdp->sd_quota_spin);
 	mutex_init(&sdp->sd_quota_mutex);
+	init_waitqueue_head(&sdp->sd_quota_wait);
+	INIT_LIST_HEAD(&sdp->sd_trunc_list);
+	spin_lock_init(&sdp->sd_trunc_lock);
 
 	spin_lock_init(&sdp->sd_log_lock);
 
@@ -443,24 +438,11 @@ out:
 static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
 			int undo)
 {
-	struct task_struct *p;
 	int error = 0;
 
 	if (undo)
 		goto fail_trans;
 
-	for (sdp->sd_glockd_num = 0;
-	     sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
-	     sdp->sd_glockd_num++) {
-		p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
-		error = IS_ERR(p);
-		if (error) {
-			fs_err(sdp, "can't start glockd thread: %d\n", error);
-			goto fail;
-		}
-		sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
-	}
-
 	error = gfs2_glock_nq_num(sdp,
 				  GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
 				  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
@@ -493,7 +475,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
 		fs_err(sdp, "can't create transaction glock: %d\n", error);
 		goto fail_rename;
 	}
-	set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
 
 	return 0;
 
@@ -506,9 +487,6 @@ fail_live:
 fail_mount:
 	gfs2_glock_dq_uninit(mount_gh);
 fail:
-	while (sdp->sd_glockd_num--)
-		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
-
 	return error;
 }
 
@@ -620,7 +598,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 
 	prev_db = 0;
 
-	for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) {
+	for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
 		bh.b_state = 0;
 		bh.b_blocknr = 0;
 		bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -661,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
 					sdp->sd_lockstruct.ls_lockspace);
 }
 
+/**
+ * gfs2_jindex_hold - Grab a lock on the jindex
+ * @sdp: The GFS2 superblock
+ * @ji_gh: the holder for the jindex glock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
+{
+	struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
+	struct qstr name;
+	char buf[20];
+	struct gfs2_jdesc *jd;
+	int error;
+
+	name.name = buf;
+
+	mutex_lock(&sdp->sd_jindex_mutex);
+
+	for (;;) {
+		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
+		if (error)
+			break;
+
+		name.len = sprintf(buf, "journal%u", sdp->sd_journals);
+		name.hash = gfs2_disk_hash(name.name, name.len);
+
+		error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
+		if (error == -ENOENT) {
+			error = 0;
+			break;
+		}
+
+		gfs2_glock_dq_uninit(ji_gh);
+
+		if (error)
+			break;
+
+		error = -ENOMEM;
+		jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
+		if (!jd)
+			break;
+
+		INIT_LIST_HEAD(&jd->extent_list);
+		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
+		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
+			if (!jd->jd_inode)
+				error = -ENOENT;
+			else
+				error = PTR_ERR(jd->jd_inode);
+			kfree(jd);
+			break;
+		}
+
+		spin_lock(&sdp->sd_jindex_spin);
+		jd->jd_jid = sdp->sd_journals++;
+		list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
+		spin_unlock(&sdp->sd_jindex_spin);
+	}
+
+	mutex_unlock(&sdp->sd_jindex_mutex);
+
+	return error;
+}
+
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
 	struct inode *master = sdp->sd_master_dir->d_inode;
@@ -681,7 +725,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 		return PTR_ERR(sdp->sd_jindex);
 	}
 	ip = GFS2_I(sdp->sd_jindex);
-	set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
 
 	/* Load in the journal index special file */
 
@@ -832,7 +875,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 		goto fail_statfs;
 	}
 	ip = GFS2_I(sdp->sd_rindex);
-	set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
 	sdp->sd_rindex_uptodate = 0;
 
 	/* Read in the quota inode */
@@ -973,9 +1015,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 	}
 	sdp->sd_logd_process = p;
 
-	sdp->sd_statfs_sync_time = jiffies;
-	sdp->sd_quota_sync_time = jiffies;
-
 	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
 	error = IS_ERR(p);
 	if (error) {
@@ -1224,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 static void gfs2_kill_sb(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-	if (sdp) {
-		gfs2_meta_syncfs(sdp);
-		dput(sdp->sd_root_dir);
-		dput(sdp->sd_master_dir);
-		sdp->sd_root_dir = NULL;
-		sdp->sd_master_dir = NULL;
+
+	if (sdp == NULL) {
+		kill_block_super(sb);
+		return;
 	}
+
+	gfs2_meta_syncfs(sdp);
+	dput(sdp->sd_root_dir);
+	dput(sdp->sd_master_dir);
+	sdp->sd_root_dir = NULL;
+	sdp->sd_master_dir = NULL;
 	shrink_dcache_sb(sb);
 	kill_block_super(sb);
-	if (sdp)
-		gfs2_delete_debugfs_file(sdp);
+	gfs2_delete_debugfs_file(sdp);
+	kfree(sdp);
 }
 
 struct file_system_type gfs2_fs_type = {
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
deleted file mode 100644
index da8490511836..000000000000
--- a/fs/gfs2/ops_fstype.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_FSTYPE_DOT_H__
-#define __OPS_FSTYPE_DOT_H__
-
-#include <linux/fs.h>
-
-extern struct file_system_type gfs2_fs_type;
-extern struct file_system_type gfs2meta_fs_type;
-extern const struct export_operations gfs2_export_ops;
-
-#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d232991b9046..49877546beb9 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -19,6 +19,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/fiemap.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
@@ -31,12 +32,11 @@
 #include "glock.h"
 #include "inode.h"
 #include "meta_io.h"
-#include "ops_dentry.h"
-#include "ops_inode.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
+#include "super.h"
 
 /**
  * gfs2_create - Create a file
@@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (!dip->i_inode.i_nlink)
 		goto out_gunlock;
 	error = -EFBIG;
-	if (dip->i_di.di_entries == (u32)-1)
+	if (dip->i_entries == (u32)-1)
 		goto out_gunlock;
 	error = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 
 	ip = ghs[1].gh_gl->gl_object;
 
-	ip->i_di.di_size = size;
+	ip->i_disksize = size;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 
@@ -425,9 +425,9 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ip = ghs[1].gh_gl->gl_object;
 
 	ip->i_inode.i_nlink = 2;
-	ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
-	ip->i_di.di_flags |= GFS2_DIF_JDATA;
-	ip->i_di.di_entries = 2;
+	ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+	ip->i_diskflags |= GFS2_DIF_JDATA;
+	ip->i_entries = 2;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 
@@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
 	if (error)
 		goto out_gunlock;
 
-	if (ip->i_di.di_entries < 2) {
+	if (ip->i_entries < 2) {
 		if (gfs2_consist_inode(ip))
 			gfs2_dinode_print(ip);
 		error = -EIO;
 		goto out_gunlock;
 	}
-	if (ip->i_di.di_entries > 2) {
+	if (ip->i_entries > 2) {
 		error = -ENOTEMPTY;
 		goto out_gunlock;
 	}
@@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			goto out_gunlock;
 
 		if (S_ISDIR(nip->i_inode.i_mode)) {
-			if (nip->i_di.di_entries < 2) {
+			if (nip->i_entries < 2) {
 				if (gfs2_consist_inode(nip))
 					gfs2_dinode_print(nip);
 				error = -EIO;
 				goto out_gunlock;
 			}
-			if (nip->i_di.di_entries > 2) {
+			if (nip->i_entries > 2) {
 				error = -ENOTEMPTY;
 				goto out_gunlock;
 			}
@@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 				error = -EINVAL;
 				goto out_gunlock;
 			}
-			if (ndip->i_di.di_entries == (u32)-1) {
+			if (ndip->i_entries == (u32)-1) {
 				error = -EFBIG;
 				goto out_gunlock;
 			}
@@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	int error;
 
-	if (attr->ia_size != ip->i_di.di_size) {
+	if (attr->ia_size != ip->i_disksize) {
 		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 		if (error)
 			return error;
@@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
 	}
 
 	error = gfs2_truncatei(ip, attr->ia_size);
-	if (error && (inode->i_size != ip->i_di.di_size))
-		i_size_write(inode, ip->i_di.di_size);
+	if (error && (inode->i_size != ip->i_disksize))
+		i_size_write(inode, ip->i_disksize);
 
 	return error;
 }
@@ -1212,6 +1212,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 	return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
 }
 
+static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+
+	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	if (ret)
+		goto out;
+
+	if (gfs2_is_stuffed(ip)) {
+		u64 phys = ip->i_no_addr << inode->i_blkbits;
+		u64 size = i_size_read(inode);
+		u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
+			    FIEMAP_EXTENT_DATA_INLINE;
+		phys += sizeof(struct gfs2_dinode);
+		phys += start;
+		if (start + len > size)
+			len = size - start;
+		if (start < size)
+			ret = fiemap_fill_next_extent(fieinfo, start, phys,
+						      len, flags);
+		if (ret == 1)
+			ret = 0;
+	} else {
+		ret = __generic_block_fiemap(inode, fieinfo, start, len,
+					     gfs2_block_map);
+	}
+
+	gfs2_glock_dq_uninit(&gh);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
 const struct inode_operations gfs2_file_iops = {
 	.permission = gfs2_permission,
 	.setattr = gfs2_setattr,
@@ -1220,6 +1262,7 @@ const struct inode_operations gfs2_file_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
 };
 
 const struct inode_operations gfs2_dir_iops = {
@@ -1239,6 +1282,7 @@ const struct inode_operations gfs2_dir_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
 };
 
 const struct inode_operations gfs2_symlink_iops = {
@@ -1251,5 +1295,6 @@ const struct inode_operations gfs2_symlink_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
 };
 
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
deleted file mode 100644
index 14b4b797622a..000000000000
--- a/fs/gfs2/ops_inode.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_INODE_DOT_H__
-#define __OPS_INODE_DOT_H__
-
-#include <linux/fs.h>
-
-extern const struct inode_operations gfs2_file_iops;
-extern const struct inode_operations gfs2_dir_iops;
-extern const struct inode_operations gfs2_symlink_iops;
-extern const struct file_operations gfs2_file_fops;
-extern const struct file_operations gfs2_dir_fops;
-extern const struct file_operations gfs2_file_fops_nolock;
-extern const struct file_operations gfs2_dir_fops_nolock;
-
-extern void gfs2_set_inode_flags(struct inode *inode);
-
-#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index d5355d9b5926..777783deddcb 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -28,7 +28,6 @@
 #include "inode.h"
 #include "log.h"
 #include "mount.h"
-#include "ops_super.h"
 #include "quota.h"
 #include "recovery.h"
 #include "rgrp.h"
@@ -143,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb)
 	kthread_stop(sdp->sd_quotad_process);
 	kthread_stop(sdp->sd_logd_process);
 	kthread_stop(sdp->sd_recoverd_process);
-	while (sdp->sd_glockd_num--)
-		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		error = gfs2_make_fs_ro(sdp);
@@ -185,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb)
 
 	/*  At this point, we're through participating in the lockspace  */
 	gfs2_sys_fs_del(sdp);
-	kfree(sdp);
 }
 
 /**
@@ -260,6 +256,137 @@ static void gfs2_unlockfs(struct super_block *sb)
 }
 
 /**
+ * statfs_fill - fill in the sg for a given RG
+ * @rgd: the RG
+ * @sc: the sc structure
+ *
+ * Returns: 0 on success, -ESTALE if the LVB is invalid
+ */
+
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+			    struct gfs2_statfs_change_host *sc)
+{
+	gfs2_rgrp_verify(rgd);
+	sc->sc_total += rgd->rd_data;
+	sc->sc_free += rgd->rd_free;
+	sc->sc_dinodes += rgd->rd_dinodes;
+	return 0;
+}
+
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+	struct gfs2_holder ri_gh;
+	struct gfs2_rgrpd *rgd_next;
+	struct gfs2_holder *gha, *gh;
+	unsigned int slots = 64;
+	unsigned int x;
+	int done;
+	int error = 0, err;
+
+	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
+	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+	if (!gha)
+		return -ENOMEM;
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		goto out;
+
+	rgd_next = gfs2_rgrpd_get_first(sdp);
+
+	for (;;) {
+		done = 1;
+
+		for (x = 0; x < slots; x++) {
+			gh = gha + x;
+
+			if (gh->gh_gl && gfs2_glock_poll(gh)) {
+				err = gfs2_glock_wait(gh);
+				if (err) {
+					gfs2_holder_uninit(gh);
+					error = err;
+				} else {
+					if (!error)
+						error = statfs_slow_fill(
+							gh->gh_gl->gl_object, sc);
+					gfs2_glock_dq_uninit(gh);
+				}
+			}
+
+			if (gh->gh_gl)
+				done = 0;
+			else if (rgd_next && !error) {
+				error = gfs2_glock_nq_init(rgd_next->rd_gl,
+							   LM_ST_SHARED,
+							   GL_ASYNC,
+							   gh);
+				rgd_next = gfs2_rgrpd_get_next(rgd_next);
+				done = 0;
+			}
+
+			if (signal_pending(current))
+				error = -ERESTARTSYS;
+		}
+
+		if (done)
+			break;
+
+		yield();
+	}
+
+	gfs2_glock_dq_uninit(&ri_gh);
+
+out:
+	kfree(gha);
+	return error;
+}
+
+/**
+ * gfs2_statfs_i - Do a statfs
+ * @sdp: the filesystem
+ * @sg: the sg structure
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+
+	spin_lock(&sdp->sd_statfs_spin);
+
+	*sc = *m_sc;
+	sc->sc_total += l_sc->sc_total;
+	sc->sc_free += l_sc->sc_free;
+	sc->sc_dinodes += l_sc->sc_dinodes;
+
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	if (sc->sc_free < 0)
+		sc->sc_free = 0;
+	if (sc->sc_free > sc->sc_total)
+		sc->sc_free = sc->sc_total;
+	if (sc->sc_dinodes < 0)
+		sc->sc_dinodes = 0;
+
+	return 0;
+}
+
+/**
  * gfs2_statfs - Gather and return stats about the filesystem
  * @sb: The superblock
  * @statfsbuf: The buffer
@@ -370,7 +497,6 @@ static void gfs2_clear_inode(struct inode *inode)
 	 */
 	if (test_bit(GIF_USER, &ip->i_flags)) {
 		ip->i_gl->gl_object = NULL;
-		gfs2_glock_schedule_for_reclaim(ip->i_gl);
 		gfs2_glock_put(ip->i_gl);
 		ip->i_gl = NULL;
 		if (ip->i_iopen_gh.gh_gl) {
@@ -423,8 +549,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",debug");
 	if (args->ar_upgrade)
 		seq_printf(s, ",upgrade");
-	if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
-		seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
 	if (args->ar_posix_acl)
 		seq_printf(s, ",acl");
 	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -494,16 +618,16 @@ static void gfs2_delete_inode(struct inode *inode)
 	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
 	error = gfs2_glock_nq(&ip->i_iopen_gh);
 	if (error)
-		goto out_uninit;
+		goto out_truncate;
 
 	if (S_ISDIR(inode->i_mode) &&
-	    (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
 		error = gfs2_dir_exhash_dealloc(ip);
 		if (error)
 			goto out_unlock;
 	}
 
-	if (ip->i_di.di_eattr) {
+	if (ip->i_eattr) {
 		error = gfs2_ea_dealloc(ip);
 		if (error)
 			goto out_unlock;
@@ -519,6 +643,7 @@ static void gfs2_delete_inode(struct inode *inode)
 	if (error)
 		goto out_unlock;
 
+out_truncate:
 	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 	if (error)
 		goto out_unlock;
@@ -527,8 +652,8 @@ static void gfs2_delete_inode(struct inode *inode)
 	gfs2_trans_end(sdp);
 
 out_unlock:
-	gfs2_glock_dq(&ip->i_iopen_gh);
-out_uninit:
+	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+		gfs2_glock_dq(&ip->i_iopen_gh);
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
 	if (error && error != GLR_TRYFAILED)
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
deleted file mode 100644
index 442a274c6272..000000000000
--- a/fs/gfs2/ops_super.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_SUPER_DOT_H__
-#define __OPS_SUPER_DOT_H__
-
-#include <linux/fs.h>
-
-extern const struct super_operations gfs2_super_ops;
-
-#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e073f5144fa..b08d09696b3e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -46,6 +46,8 @@
 #include <linux/bio.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
 	struct gfs2_quota_data *qd;
 	int error;
 
-	qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS);
+	qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
 	if (!qd)
 		return -ENOMEM;
 
@@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
 	return 0;
 
 fail:
-	kfree(qd);
+	kmem_cache_free(gfs2_quotad_cachep, qd);
 	return error;
 }
 
@@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
 		if (qd || !create) {
 			if (new_qd) {
 				gfs2_lvb_unhold(new_qd->qd_gl);
-				kfree(new_qd);
+				kmem_cache_free(gfs2_quotad_cachep, new_qd);
 			}
 			*qdp = qd;
 			return 0;
@@ -1013,7 +1015,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 
 	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
 		return;
-	if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
+	if (ip->i_diskflags & GFS2_DIF_SYSTEM)
 		return;
 
 	for (x = 0; x < al->al_qd_num; x++) {
@@ -1100,15 +1102,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
 int gfs2_quota_init(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
-	unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+	unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 	unsigned int x, slot = 0;
 	unsigned int found = 0;
 	u64 dblock;
 	u32 extlen = 0;
 	int error;
 
-	if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) ||
-	    ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
+	if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
+	    ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
 		gfs2_consist_inode(ip);
 		return -EIO;
 	}
@@ -1195,7 +1197,7 @@ fail:
 	return error;
 }
 
-void gfs2_quota_scan(struct gfs2_sbd *sdp)
+static void gfs2_quota_scan(struct gfs2_sbd *sdp)
 {
 	struct gfs2_quota_data *qd, *safe;
 	LIST_HEAD(dead);
@@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp)
 		gfs2_assert_warn(sdp, !qd->qd_bh_count);
 
 		gfs2_lvb_unhold(qd->qd_gl);
-		kfree(qd);
+		kmem_cache_free(gfs2_quotad_cachep, qd);
 	}
 }
 
@@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 		gfs2_assert_warn(sdp, !qd->qd_bh_count);
 
 		gfs2_lvb_unhold(qd->qd_gl);
-		kfree(qd);
+		kmem_cache_free(gfs2_quotad_cachep, qd);
 
 		spin_lock(&sdp->sd_quota_spin);
 	}
@@ -1272,3 +1274,94 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 	}
 }
 
+static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
+{
+	if (error == 0 || error == -EROFS)
+		return;
+	if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
+}
+
+static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
+			       int (*fxn)(struct gfs2_sbd *sdp),
+			       unsigned long t, unsigned long *timeo,
+			       unsigned int *new_timeo)
+{
+	if (t >= *timeo) {
+		int error = fxn(sdp);
+		quotad_error(sdp, msg, error);
+		*timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
+	} else {
+		*timeo -= t;
+	}
+}
+
+static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip;
+
+	while(1) {
+		ip = NULL;
+		spin_lock(&sdp->sd_trunc_lock);
+		if (!list_empty(&sdp->sd_trunc_list)) {
+			ip = list_entry(sdp->sd_trunc_list.next,
+					struct gfs2_inode, i_trunc_list);
+			list_del_init(&ip->i_trunc_list);
+		}
+		spin_unlock(&sdp->sd_trunc_lock);
+		if (ip == NULL)
+			return;
+		gfs2_glock_finish_truncate(ip);
+	}
+}
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_quotad(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	struct gfs2_tune *tune = &sdp->sd_tune;
+	unsigned long statfs_timeo = 0;
+	unsigned long quotad_timeo = 0;
+	unsigned long t = 0;
+	DEFINE_WAIT(wait);
+	int empty;
+
+	while (!kthread_should_stop()) {
+
+		/* Update the master statfs file */
+		quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+				   &statfs_timeo, &tune->gt_statfs_quantum);
+
+		/* Update quota file */
+		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
+				   &quotad_timeo, &tune->gt_quota_quantum);
+
+		/* FIXME: This should be turned into a shrinker */
+		gfs2_quota_scan(sdp);
+
+		/* Check for & recover partially truncated inodes */
+		quotad_check_trunc_list(sdp);
+
+		if (freezing(current))
+			refrigerator();
+		t = min(quotad_timeo, statfs_timeo);
+
+		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_lock(&sdp->sd_trunc_lock);
+		empty = list_empty(&sdp->sd_trunc_list);
+		spin_unlock(&sdp->sd_trunc_lock);
+		if (empty)
+			t -= schedule_timeout(t);
+		else
+			t = 0;
+		finish_wait(&sdp->sd_quota_wait, &wait);
+	}
+
+	return 0;
+}
+
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 3b7f4b0e5dfe..cec9032be97d 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,22 +15,22 @@ struct gfs2_sbd;
 
 #define NO_QUOTA_CHANGE ((u32)-1)
 
-int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_unhold(struct gfs2_inode *ip);
+extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern void gfs2_quota_unhold(struct gfs2_inode *ip);
 
-int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_unlock(struct gfs2_inode *ip);
+extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern void gfs2_quota_unlock(struct gfs2_inode *ip);
 
-int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
-		       u32 uid, u32 gid);
+extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+			      u32 uid, u32 gid);
 
-int gfs2_quota_sync(struct gfs2_sbd *sdp);
-int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
+extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
+extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
 
-int gfs2_quota_init(struct gfs2_sbd *sdp);
-void gfs2_quota_scan(struct gfs2_sbd *sdp);
-void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+extern int gfs2_quota_init(struct gfs2_sbd *sdp);
+extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+extern int gfs2_quotad(void *data);
 
 static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 {
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index d5e91f4f6a0b..efd09c3d2b26 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,6 +14,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -583,13 +585,35 @@ fail:
 	return error;
 }
 
+static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
+{
+	struct gfs2_jdesc *jd;
+	int found = 0;
+
+	spin_lock(&sdp->sd_jindex_spin);
+
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (jd->jd_dirty) {
+			jd->jd_dirty = 0;
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	if (!found)
+		jd = NULL;
+
+	return jd;
+}
+
 /**
  * gfs2_check_journals - Recover any dirty journals
  * @sdp: the filesystem
  *
  */
 
-void gfs2_check_journals(struct gfs2_sbd *sdp)
+static void gfs2_check_journals(struct gfs2_sbd *sdp)
 {
 	struct gfs2_jdesc *jd;
 
@@ -603,3 +627,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp)
 	}
 }
 
+/**
+ * gfs2_recoverd - Recover dead machine's journals
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_recoverd(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	unsigned long t;
+
+	while (!kthread_should_stop()) {
+		gfs2_check_journals(sdp);
+		t = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
+		if (freezing(current))
+			refrigerator();
+		schedule_timeout_interruptible(t);
+	}
+
+	return 0;
+}
+
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index f7235e61c723..a8218ea15b57 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
 	        *blk = 0;
 }
 
-int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
 			   struct buffer_head **bh);
 
-int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
-int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
-void gfs2_revoke_clean(struct gfs2_sbd *sdp);
+extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
 
-int gfs2_find_jhead(struct gfs2_jdesc *jd,
+extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
 		    struct gfs2_log_header_host *head);
-int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
-void gfs2_check_journals(struct gfs2_sbd *sdp);
+extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
+extern int gfs2_recoverd(void *data);
 
 #endif /* __RECOVERY_DOT_H__ */
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2d90fb253505..8b01c635d925 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -269,16 +269,14 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 						  bi->bi_len, x);
 	}
 
-	if (count[0] != rgd->rd_rg.rg_free) {
+	if (count[0] != rgd->rd_free) {
 		if (gfs2_consist_rgrpd(rgd))
 			fs_err(sdp, "free data mismatch:  %u != %u\n",
-			       count[0], rgd->rd_rg.rg_free);
+			       count[0], rgd->rd_free);
 		return;
 	}
 
-	tmp = rgd->rd_data -
-		rgd->rd_rg.rg_free -
-		rgd->rd_rg.rg_dinodes;
+	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
 	if (count[1] + count[2] != tmp) {
 		if (gfs2_consist_rgrpd(rgd))
 			fs_err(sdp, "used data mismatch:  %u != %u\n",
@@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 		return;
 	}
 
-	if (count[3] != rgd->rd_rg.rg_dinodes) {
+	if (count[3] != rgd->rd_dinodes) {
 		if (gfs2_consist_rgrpd(rgd))
 			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
-			       count[3], rgd->rd_rg.rg_dinodes);
+			       count[3], rgd->rd_dinodes);
 		return;
 	}
 
@@ -501,7 +499,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
 	for (rgrps = 0;; rgrps++) {
 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
 
-		if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size)
+		if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
 			break;
 		error = gfs2_internal_read(ip, &ra_state, buf, &pos,
 					   sizeof(struct gfs2_rindex));
@@ -590,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
-	u64 rgrp_count = ip->i_di.di_size;
+	u64 rgrp_count = ip->i_disksize;
 	int error;
 
 	if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) {
@@ -634,7 +632,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 	for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
 		/* Ignore partials */
 		if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-		    ip->i_di.di_size)
+		    ip->i_disksize)
 			break;
 		error = read_rindex_entry(ip, &ra_state);
 		if (error) {
@@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 {
 	const struct gfs2_rgrp *str = buf;
-	struct gfs2_rgrp_host *rg = &rgd->rd_rg;
 	u32 rg_flags;
 
 	rg_flags = be32_to_cpu(str->rg_flags);
@@ -700,24 +697,23 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 		rgd->rd_flags |= GFS2_RDF_NOALLOC;
 	else
 		rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
-	rg->rg_free = be32_to_cpu(str->rg_free);
-	rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
-	rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
+	rgd->rd_free = be32_to_cpu(str->rg_free);
+	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
+	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
 }
 
 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
 	struct gfs2_rgrp *str = buf;
-	struct gfs2_rgrp_host *rg = &rgd->rd_rg;
 	u32 rg_flags = 0;
 
 	if (rgd->rd_flags & GFS2_RDF_NOALLOC)
 		rg_flags |= GFS2_RGF_NOALLOC;
 	str->rg_flags = cpu_to_be32(rg_flags);
-	str->rg_free = cpu_to_be32(rg->rg_free);
-	str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
+	str->rg_free = cpu_to_be32(rgd->rd_free);
+	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
 	str->__pad = cpu_to_be32(0);
-	str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
+	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
 }
 
@@ -776,7 +772,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 	}
 
 	spin_lock(&sdp->sd_rindex_spin);
-	rgd->rd_free_clone = rgd->rd_rg.rg_free;
+	rgd->rd_free_clone = rgd->rd_free;
 	rgd->rd_bh_count++;
 	spin_unlock(&sdp->sd_rindex_spin);
 
@@ -850,7 +846,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 	}
 
 	spin_lock(&sdp->sd_rindex_spin);
-	rgd->rd_free_clone = rgd->rd_rg.rg_free;
+	rgd->rd_free_clone = rgd->rd_free;
 	spin_unlock(&sdp->sd_rindex_spin);
 }
 
@@ -1403,8 +1399,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 	block = rgd->rd_data0 + blk;
 	ip->i_goal = block;
 
-	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n);
-	rgd->rd_rg.rg_free -= *n;
+	gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
+	rgd->rd_free -= *n;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1445,10 +1441,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 
 	block = rgd->rd_data0 + blk;
 
-	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
-	rgd->rd_rg.rg_free--;
-	rgd->rd_rg.rg_dinodes++;
-	*generation = rgd->rd_rg.rg_igeneration++;
+	gfs2_assert_withdraw(sdp, rgd->rd_free);
+	rgd->rd_free--;
+	rgd->rd_dinodes++;
+	*generation = rgd->rd_igeneration++;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
@@ -1481,7 +1477,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	if (!rgd)
 		return;
 
-	rgd->rd_rg.rg_free += blen;
+	rgd->rd_free += blen;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1509,7 +1505,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	if (!rgd)
 		return;
 
-	rgd->rd_rg.rg_free += blen;
+	rgd->rd_free += blen;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1546,10 +1542,10 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
 		return;
 	gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
 
-	if (!rgd->rd_rg.rg_dinodes)
+	if (!rgd->rd_dinodes)
 		gfs2_consist_rgrpd(rgd);
-	rgd->rd_rg.rg_dinodes--;
-	rgd->rd_rg.rg_free++;
+	rgd->rd_dinodes--;
+	rgd->rd_free++;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c3ba3d9d0aac..141b781f2fcc 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -34,76 +34,6 @@
 #include "util.h"
 
 /**
- * gfs2_jindex_hold - Grab a lock on the jindex
- * @sdp: The GFS2 superblock
- * @ji_gh: the holder for the jindex glock
- *
- * This is very similar to the gfs2_rindex_hold() function, except that
- * in general we hold the jindex lock for longer periods of time and
- * we grab it far less frequently (in general) then the rgrp lock.
- *
- * Returns: errno
- */
-
-int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
-{
-	struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
-	struct qstr name;
-	char buf[20];
-	struct gfs2_jdesc *jd;
-	int error;
-
-	name.name = buf;
-
-	mutex_lock(&sdp->sd_jindex_mutex);
-
-	for (;;) {
-		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
-		if (error)
-			break;
-
-		name.len = sprintf(buf, "journal%u", sdp->sd_journals);
-		name.hash = gfs2_disk_hash(name.name, name.len);
-
-		error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
-		if (error == -ENOENT) {
-			error = 0;
-			break;
-		}
-
-		gfs2_glock_dq_uninit(ji_gh);
-
-		if (error)
-			break;
-
-		error = -ENOMEM;
-		jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
-		if (!jd)
-			break;
-
-		INIT_LIST_HEAD(&jd->extent_list);
-		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
-		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
-			if (!jd->jd_inode)
-				error = -ENOENT;
-			else
-				error = PTR_ERR(jd->jd_inode);
-			kfree(jd);
-			break;
-		}
-
-		spin_lock(&sdp->sd_jindex_spin);
-		jd->jd_jid = sdp->sd_journals++;
-		list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
-		spin_unlock(&sdp->sd_jindex_spin);
-	}
-
-	mutex_unlock(&sdp->sd_jindex_mutex);
-
-	return error;
-}
-
-/**
  * gfs2_jindex_free - Clear all the journal index information
  * @sdp: The GFS2 superblock
  *
@@ -166,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
 	return jd;
 }
 
-void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
-{
-	struct gfs2_jdesc *jd;
-
-	spin_lock(&sdp->sd_jindex_spin);
-	jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
-	if (jd)
-		jd->jd_dirty = 1;
-	spin_unlock(&sdp->sd_jindex_spin);
-}
-
-struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
-{
-	struct gfs2_jdesc *jd;
-	int found = 0;
-
-	spin_lock(&sdp->sd_jindex_spin);
-
-	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-		if (jd->jd_dirty) {
-			jd->jd_dirty = 0;
-			found = 1;
-			break;
-		}
-	}
-	spin_unlock(&sdp->sd_jindex_spin);
-
-	if (!found)
-		jd = NULL;
-
-	return jd;
-}
-
 int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -206,14 +103,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 	int ar;
 	int error;
 
-	if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
-	    (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
+	if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
+	    (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
 		gfs2_consist_inode(ip);
 		return -EIO;
 	}
-	jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+	jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 
-	error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
+	error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
 	if (!error && ar) {
 		gfs2_consist_inode(ip);
 		error = -EIO;
@@ -423,137 +320,6 @@ out:
 	return error;
 }
 
-/**
- * gfs2_statfs_i - Do a statfs
- * @sdp: the filesystem
- * @sg: the sg structure
- *
- * Returns: errno
- */
-
-int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
-	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
-
-	spin_lock(&sdp->sd_statfs_spin);
-
-	*sc = *m_sc;
-	sc->sc_total += l_sc->sc_total;
-	sc->sc_free += l_sc->sc_free;
-	sc->sc_dinodes += l_sc->sc_dinodes;
-
-	spin_unlock(&sdp->sd_statfs_spin);
-
-	if (sc->sc_free < 0)
-		sc->sc_free = 0;
-	if (sc->sc_free > sc->sc_total)
-		sc->sc_free = sc->sc_total;
-	if (sc->sc_dinodes < 0)
-		sc->sc_dinodes = 0;
-
-	return 0;
-}
-
-/**
- * statfs_fill - fill in the sg for a given RG
- * @rgd: the RG
- * @sc: the sc structure
- *
- * Returns: 0 on success, -ESTALE if the LVB is invalid
- */
-
-static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
-			    struct gfs2_statfs_change_host *sc)
-{
-	gfs2_rgrp_verify(rgd);
-	sc->sc_total += rgd->rd_data;
-	sc->sc_free += rgd->rd_rg.rg_free;
-	sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
-	return 0;
-}
-
-/**
- * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
- * @sdp: the filesystem
- * @sc: the sc info that will be returned
- *
- * Any error (other than a signal) will cause this routine to fall back
- * to the synchronous version.
- *
- * FIXME: This really shouldn't busy wait like this.
- *
- * Returns: errno
- */
-
-int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-	struct gfs2_holder ri_gh;
-	struct gfs2_rgrpd *rgd_next;
-	struct gfs2_holder *gha, *gh;
-	unsigned int slots = 64;
-	unsigned int x;
-	int done;
-	int error = 0, err;
-
-	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
-	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
-	if (!gha)
-		return -ENOMEM;
-
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		goto out;
-
-	rgd_next = gfs2_rgrpd_get_first(sdp);
-
-	for (;;) {
-		done = 1;
-
-		for (x = 0; x < slots; x++) {
-			gh = gha + x;
-
-			if (gh->gh_gl && gfs2_glock_poll(gh)) {
-				err = gfs2_glock_wait(gh);
-				if (err) {
-					gfs2_holder_uninit(gh);
-					error = err;
-				} else {
-					if (!error)
-						error = statfs_slow_fill(
-							gh->gh_gl->gl_object, sc);
-					gfs2_glock_dq_uninit(gh);
-				}
-			}
-
-			if (gh->gh_gl)
-				done = 0;
-			else if (rgd_next && !error) {
-				error = gfs2_glock_nq_init(rgd_next->rd_gl,
-							   LM_ST_SHARED,
-							   GL_ASYNC,
-							   gh);
-				rgd_next = gfs2_rgrpd_get_next(rgd_next);
-				done = 0;
-			}
-
-			if (signal_pending(current))
-				error = -ERESTARTSYS;
-		}
-
-		if (done)
-			break;
-
-		yield();
-	}
-
-	gfs2_glock_dq_uninit(&ri_gh);
-
-out:
-	kfree(gha);
-	return error;
-}
-
 struct lfcc {
 	struct list_head list;
 	struct gfs2_holder gh;
@@ -580,10 +346,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
 	struct gfs2_log_header_host lh;
 	int error;
 
-	error = gfs2_jindex_hold(sdp, &ji_gh);
-	if (error)
-		return error;
-
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
 		if (!lfcc) {
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 50a4c9b1215e..f6b8b00ad881 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -10,6 +10,8 @@
 #ifndef __SUPER_DOT_H__
 #define __SUPER_DOT_H__
 
+#include <linux/fs.h>
+#include <linux/dcache.h>
 #include "incore.h"
 
 void gfs2_lm_unmount(struct gfs2_sbd *sdp);
@@ -23,12 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 	return x;
 }
 
-int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
 void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
 struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
-void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
-struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
 int gfs2_jdesc_check(struct gfs2_jdesc *jd);
 
 int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
@@ -40,11 +39,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp);
 void gfs2_statfs_change(struct gfs2_sbd *sdp,
 			s64 total, s64 free, s64 dinodes);
 int gfs2_statfs_sync(struct gfs2_sbd *sdp);
-int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
-int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
 
 int gfs2_freeze_fs(struct gfs2_sbd *sdp);
 void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
 
+extern struct file_system_type gfs2_fs_type;
+extern struct file_system_type gfs2meta_fs_type;
+extern const struct export_operations gfs2_export_ops;
+extern const struct super_operations gfs2_super_ops;
+extern struct dentry_operations gfs2_dops;
+
 #endif /* __SUPER_DOT_H__ */
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7e1879f1a02c..26c1fa777a95 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,9 +26,6 @@
 #include "quota.h"
 #include "util.h"
 
-char *gfs2_sys_margs;
-spinlock_t gfs2_sys_margs_lock;
-
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -263,7 +260,6 @@ ARGS_ATTR(localcaching,    "%d\n");
 ARGS_ATTR(localflocks,     "%d\n");
 ARGS_ATTR(debug,           "%d\n");
 ARGS_ATTR(upgrade,         "%d\n");
-ARGS_ATTR(num_glockd,      "%u\n");
 ARGS_ATTR(posix_acl,       "%d\n");
 ARGS_ATTR(quota,           "%u\n");
 ARGS_ATTR(suiddir,         "%d\n");
@@ -279,7 +275,6 @@ static struct attribute *args_attrs[] = {
 	&args_attr_localflocks.attr,
 	&args_attr_debug.attr,
 	&args_attr_upgrade.attr,
-	&args_attr_num_glockd.attr,
 	&args_attr_posix_acl.attr,
 	&args_attr_quota.attr,
 	&args_attr_suiddir.attr,
@@ -288,30 +283,6 @@ static struct attribute *args_attrs[] = {
 };
 
 /*
- * display counters from superblock
- */
-
-struct counters_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
-#define COUNTERS_ATTR(name, fmt)                                            \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
-{                                                                           \
-	return snprintf(buf, PAGE_SIZE, fmt,                                \
-			(unsigned int)atomic_read(&sdp->sd_##name));        \
-}                                                                           \
-static struct counters_attr counters_attr_##name = __ATTR_RO(name)
-
-COUNTERS_ATTR(reclaimed,        "%u\n");
-
-static struct attribute *counters_attrs[] = {
-	&counters_attr_reclaimed.attr,
-	NULL,
-};
-
-/*
  * get and set struct gfs2_tune fields
  */
 
@@ -393,7 +364,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 }                                                                             \
 TUNE_ATTR_2(name, name##_store)
 
-TUNE_ATTR(demote_secs, 0);
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
@@ -408,11 +378,9 @@ TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
 TUNE_ATTR_DAEMON(logd_secs, logd_process);
-TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
-	&tune_attr_demote_secs.attr,
 	&tune_attr_incore_log_blocks.attr,
 	&tune_attr_log_flush_secs.attr,
 	&tune_attr_quota_warn_period.attr,
@@ -426,7 +394,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_statfs_quantum.attr,
 	&tune_attr_recoverd_secs.attr,
 	&tune_attr_logd_secs.attr,
-	&tune_attr_quotad_secs.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
 	NULL,
@@ -437,11 +404,6 @@ static struct attribute_group lockstruct_group = {
 	.attrs = lockstruct_attrs,
 };
 
-static struct attribute_group counters_group = {
-	.name = "counters",
-	.attrs = counters_attrs,
-};
-
 static struct attribute_group args_group = {
 	.name = "args",
 	.attrs = args_attrs,
@@ -466,13 +428,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail_reg;
 
-	error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
-	if (error)
-		goto fail_lockstruct;
-
 	error = sysfs_create_group(&sdp->sd_kobj, &args_group);
 	if (error)
-		goto fail_counters;
+		goto fail_lockstruct;
 
 	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
 	if (error)
@@ -483,8 +441,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 
 fail_args:
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-fail_counters:
-	sysfs_remove_group(&sdp->sd_kobj, &counters_group);
 fail_lockstruct:
 	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 fail_reg:
@@ -498,16 +454,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-	sysfs_remove_group(&sdp->sd_kobj, &counters_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 	kobject_put(&sdp->sd_kobj);
 }
 
+static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
+		       struct kobj_uevent_env *env)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
+	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+	return 0;
+}
+
+static struct kset_uevent_ops gfs2_uevent_ops = {
+	.uevent = gfs2_uevent,
+};
+
+
 int gfs2_sys_init(void)
 {
-	gfs2_sys_margs = NULL;
-	spin_lock_init(&gfs2_sys_margs_lock);
-	gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
+	gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
 	if (!gfs2_kset)
 		return -ENOMEM;
 	return 0;
@@ -515,7 +482,6 @@ int gfs2_sys_init(void)
 
 void gfs2_sys_uninit(void)
 {
-	kfree(gfs2_sys_margs);
 	kset_unregister(gfs2_kset);
 }
 
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index 1ca8cdac5304..e94560e836d7 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -13,10 +13,6 @@
 #include <linux/spinlock.h>
 struct gfs2_sbd;
 
-/* Allow args to be passed to GFS2 when using an initial ram disk */
-extern char *gfs2_sys_margs;
-extern spinlock_t gfs2_sys_margs_lock;
-
 int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
 
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index d31e355c61fb..374f50e95496 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly;
 struct kmem_cache *gfs2_inode_cachep __read_mostly;
 struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
 struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
+struct kmem_cache *gfs2_quotad_cachep __read_mostly;
 
 void gfs2_assert_i(struct gfs2_sbd *sdp)
 {
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 7f48576289c9..33e96b0ce9ab 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep;
 extern struct kmem_cache *gfs2_inode_cachep;
 extern struct kmem_cache *gfs2_bufdata_cachep;
 extern struct kmem_cache *gfs2_rgrpd_cachep;
+extern struct kmem_cache *gfs2_quotad_cachep;
 
 static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
 					   unsigned int *p)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7d479ce3aceb..6903d37af037 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 	for (;;) {
 		struct page *page;
 		unsigned long nr, ret;
+		int ra;
 
 		/* nr is the maximum number of bytes to copy from this page */
 		nr = huge_page_size(h);
@@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 			 */
 			ret = len < nr ? len : nr;
 			if (clear_user(buf, ret))
-				ret = -EFAULT;
+				ra = -EFAULT;
+			else
+				ra = 0;
 		} else {
 			/*
 			 * We have the page, copy it to user space buffer.
 			 */
-			ret = hugetlbfs_read_actor(page, offset, buf, len, nr);
+			ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
+			ret = ra;
 		}
-		if (ret < 0) {
+		if (ra < 0) {
 			if (retval == 0)
-				retval = ret;
+				retval = ra;
 			if (page)
 				page_cache_release(page);
 			goto out;
@@ -506,7 +510,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_mode = mode;
 		inode->i_uid = uid;
 		inode->i_gid = gid;
-		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 7de1cda92489..0013ac1af8e7 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
 #include <linux/mount.h>
+#include <linux/async.h>
 
 /*
  * This is needed for the following functions:
@@ -110,8 +111,8 @@ static void wake_up_inode(struct inode *inode)
 
 /**
  * inode_init_always - perform inode structure intialisation
- * @sb		- superblock inode belongs to.
- * @inode	- inode to initialise
+ * @sb: superblock inode belongs to
+ * @inode: inode to initialise
  *
  * These are initializations that need to be done on every inode
  * allocation as the fields are not initialised by slab allocation.
@@ -131,6 +132,8 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_op = &empty_iops;
 	inode->i_fop = &empty_fops;
 	inode->i_nlink = 1;
+	inode->i_uid = 0;
+	inode->i_gid = 0;
 	atomic_set(&inode->i_writecount, 0);
 	inode->i_size = 0;
 	inode->i_blocks = 0;
@@ -164,7 +167,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	mapping->a_ops = &empty_aops;
 	mapping->host = inode;
 	mapping->flags = 0;
-	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
+	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
 	mapping->assoc_mapping = NULL;
 	mapping->backing_dev_info = &default_backing_dev_info;
 	mapping->writeback_index = 0;
@@ -574,8 +577,8 @@ __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
 
 /**
  * inode_add_to_lists - add a new inode to relevant lists
- * @sb		- superblock inode belongs to.
- * @inode	- inode to mark in use
+ * @sb: superblock inode belongs to
+ * @inode: inode to mark in use
  *
  * When an inode is allocated it needs to be accounted for, added to the in use
  * list, the owning superblock and the inode hash. This needs to be done under
@@ -599,7 +602,7 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
  *	@sb: superblock
  *
  *	Allocates a new inode for given superblock. The default gfp_mask
- *	for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE.
+ *	for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
  *	If HIGHMEM pages are unsuitable or it is known that pages allocated
  *	for the page cache are not reclaimable or migratable,
  *	mapping_set_gfp_mask() must be called with suitable flags on the
@@ -1136,16 +1139,11 @@ EXPORT_SYMBOL(remove_inode_hash);
  * I_FREEING is set so that no-one will take a new reference to the inode while
  * it is being deleted.
  */
-void generic_delete_inode(struct inode *inode)
+static void generic_delete_inode_async(void *data, async_cookie_t cookie)
 {
+	struct inode *inode = data;
 	const struct super_operations *op = inode->i_sb->s_op;
 
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
-	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-
 	security_inode_delete(inode);
 
 	if (op->delete_inode) {
@@ -1169,6 +1167,16 @@ void generic_delete_inode(struct inode *inode)
 	destroy_inode(inode);
 }
 
+void generic_delete_inode(struct inode *inode)
+{
+	list_del_init(&inode->i_list);
+	list_del_init(&inode->i_sb_list);
+	inode->i_state |= I_FREEING;
+	inodes_stat.nr_inodes--;
+	spin_unlock(&inode_lock);
+	async_schedule_special(generic_delete_inode_async, inode, &inode->i_sb->s_async_list);
+}
+
 EXPORT_SYMBOL(generic_delete_inode);
 
 static void generic_forget_inode(struct inode *inode)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 43e8b2c0664b..cc3f1aa1cf7b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 #define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
 #define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
 
-/*
+/**
+ * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
  * @inode - the inode to map
  * @arg - the pointer to userspace where we copy everything to
  * @get_block - the fs's get_block function
@@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
  *
  * If it is possible to have data blocks beyond a hole past @inode->i_size, then
  * please do not use this function, it will stop at the first unmapped block
- * beyond i_size
+ * beyond i_size.
+ *
+ * If you use this function directly, you need to do your own locking. Use
+ * generic_block_fiemap if you want the locking done for you.
  */
-int generic_block_fiemap(struct inode *inode,
-			 struct fiemap_extent_info *fieinfo, u64 start,
-			 u64 len, get_block_t *get_block)
+
+int __generic_block_fiemap(struct inode *inode,
+			   struct fiemap_extent_info *fieinfo, u64 start,
+			   u64 len, get_block_t *get_block)
 {
 	struct buffer_head tmp;
 	unsigned int start_blk;
@@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode,
 
 	start_blk = logical_to_blk(inode, start);
 
-	/* guard against change */
-	mutex_lock(&inode->i_mutex);
-
 	length = (long long)min_t(u64, len, i_size_read(inode));
 	map_len = length;
 
@@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode,
 		cond_resched();
 	} while (1);
 
-	mutex_unlock(&inode->i_mutex);
-
 	/* if ret is 1 then we just hit the end of the extent array */
 	if (ret == 1)
 		ret = 0;
 
 	return ret;
 }
+EXPORT_SYMBOL(__generic_block_fiemap);
+
+/**
+ * generic_block_fiemap - FIEMAP for block based inodes
+ * @inode: The inode to map
+ * @fieinfo: The mapping information
+ * @start: The initial block to map
+ * @len: The length of the extect to attempt to map
+ * @get_block: The block mapping function for the fs
+ *
+ * Calls __generic_block_fiemap to map the inode, after taking
+ * the inode's mutex lock.
+ */
+
+int generic_block_fiemap(struct inode *inode,
+			 struct fiemap_extent_info *fieinfo, u64 start,
+			 u64 len, get_block_t *get_block)
+{
+	int ret;
+	mutex_lock(&inode->i_mutex);
+	ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
 EXPORT_SYMBOL(generic_block_fiemap);
 
 #endif  /*  CONFIG_BLOCK  */
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 3569e0ad86a2..1a39ac370942 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -27,7 +27,7 @@
 #include <linux/security.h>
 #include <linux/pid_namespace.h>
 
-static int set_task_ioprio(struct task_struct *task, int ioprio)
+int set_task_ioprio(struct task_struct *task, int ioprio)
 {
 	int err;
 	struct io_context *ioc;
@@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 	task_unlock(task);
 	return err;
 }
+EXPORT_SYMBOL_GPL(set_task_ioprio);
 
 asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 {
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f8af0f1505b..6147ec3643a0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -855,10 +855,6 @@ root_found:
 	}
 	sbi->s_joliet_level = joliet_level;
 
-	/* check the root inode */
-	if (!inode->i_op)
-		goto out_bad_root;
-
 	/* Make sure the root inode is a directory */
 	if (!S_ISDIR(inode->i_mode)) {
 		printk(KERN_WARNING
@@ -886,8 +882,6 @@ root_found:
 	/*
 	 * Display error messages and free resources.
 	 */
-out_bad_root:
-	printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
 out_iput:
 	iput(inode);
 	goto out_no_inode;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c51..3fbffb1ea714 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
 	int flags;
 	int err;
 	unsigned long blocknr;
+	ktime_t start_time;
+	u64 commit_time;
 	char *tagp = NULL;
 	journal_header_t *header;
 	journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
 	commit_transaction->t_state = T_FLUSH;
 	journal->j_committing_transaction = commit_transaction;
 	journal->j_running_transaction = NULL;
+	start_time = ktime_get();
 	commit_transaction->t_log_start = journal->j_head;
 	wake_up(&journal->j_wait_transaction_locked);
 	spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
 	journal->j_commit_sequence = commit_transaction->t_tid;
 	journal->j_committing_transaction = NULL;
+	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+	/*
+	 * weight the commit time higher than the average time so we don't
+	 * react too strongly to vast changes in commit time
+	 */
+	if (likely(journal->j_average_commit_time))
+		journal->j_average_commit_time = (commit_time*3 +
+				journal->j_average_commit_time) / 4;
+	else
+		journal->j_average_commit_time = commit_time;
+
 	spin_unlock(&journal->j_state_lock);
 
 	if (commit_transaction->t_checkpoint_list == NULL &&
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c8808..e6a117431277 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 
 static void __journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
 {
 	transaction->t_journal = journal;
 	transaction->t_state = T_RUNNING;
+	transaction->t_start_time = ktime_get();
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
@@ -752,7 +754,6 @@ out:
  * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
  * @handle: transaction to add buffer modifications to
  * @bh:     bh to be used for metadata writes
- * @credits: variable that will receive credits for the buffer
  *
  * Returns an error code or 0 on success.
  *
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
-	int old_handle_count, err;
+	int err;
 	pid_t pid;
 
 	J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle)
 	 * on IO anyway.  Speeds up many-threaded, many-dir operations
 	 * by 30x or more...
 	 *
+	 * We try and optimize the sleep time against what the underlying disk
+	 * can do, instead of having a static sleep time.  This is usefull for
+	 * the case where our storage is so fast that it is more optimal to go
+	 * ahead and force a flush and wait for the transaction to be committed
+	 * than it is to wait for an arbitrary amount of time for new writers to
+	 * join the transaction.  We acheive this by measuring how long it takes
+	 * to commit a transaction, and compare it with how long this
+	 * transaction has been running, and if run time < commit time then we
+	 * sleep for the delta and commit.  This greatly helps super fast disks
+	 * that would see slowdowns as more threads started doing fsyncs.
+	 *
 	 * But don't do this if this process was the most recent one to
 	 * perform a synchronous write.  We do this to detect the case where a
 	 * single process is doing a stream of sync writes.  No point in waiting
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle)
 	 */
 	pid = current->pid;
 	if (handle->h_sync && journal->j_last_sync_writer != pid) {
+		u64 commit_time, trans_time;
+
 		journal->j_last_sync_writer = pid;
-		do {
-			old_handle_count = transaction->t_handle_count;
-			schedule_timeout_uninterruptible(1);
-		} while (old_handle_count != transaction->t_handle_count);
+
+		spin_lock(&journal->j_state_lock);
+		commit_time = journal->j_average_commit_time;
+		spin_unlock(&journal->j_state_lock);
+
+		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+						   transaction->t_start_time));
+
+		commit_time = min_t(u64, commit_time,
+				    1000*jiffies_to_usecs(1));
+
+		if (trans_time < commit_time) {
+			ktime_t expires = ktime_add_ns(ktime_get(),
+						       commit_time);
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+		}
 	}
 
 	current->journal_info = NULL;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9497718fe920..17159cacbd9e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -249,16 +249,14 @@ restart:
 	return ret;
 }
 
-#define NR_BATCH	64
-
 static void
-__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
+__flush_batch(journal_t *journal, int *batch_count)
 {
 	int i;
 
-	ll_rw_block(SWRITE, *batch_count, bhs);
+	ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
 	for (i = 0; i < *batch_count; i++) {
-		struct buffer_head *bh = bhs[i];
+		struct buffer_head *bh = journal->j_chkpt_bhs[i];
 		clear_buffer_jwrite(bh);
 		BUFFER_TRACE(bh, "brelse");
 		__brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
 static int __process_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count,
-			transaction_t *transaction)
+			    int *batch_count, transaction_t *transaction)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		get_bh(bh);
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
 		set_buffer_jwrite(bh);
-		bhs[*batch_count] = bh;
+		journal->j_chkpt_bhs[*batch_count] = bh;
 		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
 		transaction->t_chp_stats.cs_written++;
 		(*batch_count)++;
-		if (*batch_count == NR_BATCH) {
+		if (*batch_count == JBD2_NR_BATCH) {
 			spin_unlock(&journal->j_list_lock);
-			__flush_batch(journal, bhs, batch_count);
+			__flush_batch(journal, batch_count);
 			ret = 1;
 		}
 	}
@@ -388,7 +385,6 @@ restart:
 	if (journal->j_checkpoint_transactions == transaction &&
 			transaction->t_tid == this_tid) {
 		int batch_count = 0;
-		struct buffer_head *bhs[NR_BATCH];
 		struct journal_head *jh;
 		int retry = 0, err;
 
@@ -402,7 +398,7 @@ restart:
 				retry = 1;
 				break;
 			}
-			retry = __process_buffer(journal, jh, bhs, &batch_count,
+			retry = __process_buffer(journal, jh, &batch_count,
 						 transaction);
 			if (retry < 0 && !result)
 				result = retry;
@@ -419,7 +415,7 @@ restart:
 				spin_unlock(&journal->j_list_lock);
 				retry = 1;
 			}
-			__flush_batch(journal, bhs, &batch_count);
+			__flush_batch(journal, &batch_count);
 		}
 
 		if (retry) {
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 	   safely remove this transaction from the log */
 
 	__jbd2_journal_drop_transaction(journal, transaction);
+	kfree(transaction);
 
 	/* Just in case anybody was waiting for more transactions to be
            checkpointed... */
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 	J_ASSERT(journal->j_running_transaction != transaction);
 
 	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
-	kfree(transaction);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ebc667bc54a8..62804e57a44c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
 #include <linux/crc32.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
+#include <linux/bio.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
-	ret = submit_bh(WRITE, bh);
+	ret = submit_bh(WRITE_SYNC, bh);
 	if (barrier_done)
 		clear_buffer_ordered(bh);
 
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
 		clear_buffer_dirty(bh);
-		ret = submit_bh(WRITE, bh);
+		ret = submit_bh(WRITE_SYNC, bh);
 	}
 	*cbh = bh;
 	return ret;
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
  * This function along with journal_submit_commit_record
  * allows to write the commit record asynchronously.
  */
-static int journal_wait_on_commit_record(struct buffer_head *bh)
+static int journal_wait_on_commit_record(journal_t *journal,
+					 struct buffer_head *bh)
 {
 	int ret = 0;
 
+retry:
 	clear_buffer_dirty(bh);
 	wait_on_buffer(bh);
+	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
+		printk(KERN_WARNING
+		       "JBD2: wait_on_commit_record: sync failed on %s - "
+		       "disabling barriers\n", journal->j_devname);
+		spin_lock(&journal->j_state_lock);
+		journal->j_flags &= ~JBD2_BARRIER;
+		spin_unlock(&journal->j_state_lock);
+
+		lock_buffer(bh);
+		clear_buffer_dirty(bh);
+		set_buffer_uptodate(bh);
+		bh->b_end_io = journal_end_buffer_io_sync;
+
+		ret = submit_bh(WRITE_SYNC, bh);
+		if (ret) {
+			unlock_buffer(bh);
+			return ret;
+		}
+		goto retry;
+	}
 
 	if (unlikely(!buffer_uptodate(bh)))
 		ret = -EIO;
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int flags;
 	int err;
 	unsigned long long blocknr;
+	ktime_t start_time;
+	u64 commit_time;
 	char *tagp = NULL;
 	journal_header_t *header;
 	journal_block_tag_t *tag = NULL;
 	int space_left = 0;
 	int first_tag = 0;
 	int tag_flag;
-	int i;
+	int i, to_free = 0;
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction->t_state = T_FLUSH;
 	journal->j_committing_transaction = commit_transaction;
 	journal->j_running_transaction = NULL;
+	start_time = ktime_get();
 	commit_transaction->t_log_start = journal->j_head;
 	wake_up(&journal->j_wait_transaction_locked);
 	spin_unlock(&journal->j_state_lock);
@@ -509,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		if (is_journal_aborted(journal)) {
 			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
+			jbd2_buffer_abort_trigger(jh,
+						  jh->b_frozen_data ?
+						  jh->b_frozen_triggers :
+						  jh->b_triggers);
 			jbd2_journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
 			 * any descriptor buffers which may have been
@@ -799,7 +829,7 @@ wait_for_iobuf:
 			__jbd2_journal_abort_hard(journal);
 	}
 	if (!err && !is_journal_aborted(journal))
-		err = journal_wait_on_commit_record(cbh);
+		err = journal_wait_on_commit_record(journal, cbh);
 
 	if (err)
 		jbd2_journal_abort(journal, err);
@@ -844,6 +874,9 @@ restart_loop:
 		 * data.
 		 *
 		 * Otherwise, we can just throw away the frozen data now.
+		 *
+		 * We also know that the frozen data has already fired
+		 * its triggers if they exist, so we can clear that too.
 		 */
 		if (jh->b_committed_data) {
 			jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +884,12 @@ restart_loop:
 			if (jh->b_frozen_data) {
 				jh->b_committed_data = jh->b_frozen_data;
 				jh->b_frozen_data = NULL;
+				jh->b_frozen_triggers = NULL;
 			}
 		} else if (jh->b_frozen_data) {
 			jbd2_free(jh->b_frozen_data, bh->b_size);
 			jh->b_frozen_data = NULL;
+			jh->b_frozen_triggers = NULL;
 		}
 
 		spin_lock(&journal->j_list_lock);
@@ -972,14 +1007,23 @@ restart_loop:
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
 	journal->j_commit_sequence = commit_transaction->t_tid;
 	journal->j_committing_transaction = NULL;
-	spin_unlock(&journal->j_state_lock);
+	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
 
-	if (journal->j_commit_callback)
-		journal->j_commit_callback(journal, commit_transaction);
+	/*
+	 * weight the commit time higher than the average time so we don't
+	 * react too strongly to vast changes in the commit time
+	 */
+	if (likely(journal->j_average_commit_time))
+		journal->j_average_commit_time = (commit_time +
+				journal->j_average_commit_time*3) / 4;
+	else
+		journal->j_average_commit_time = commit_time;
+	spin_unlock(&journal->j_state_lock);
 
 	if (commit_transaction->t_checkpoint_list == NULL &&
 	    commit_transaction->t_checkpoint_io_list == NULL) {
 		__jbd2_journal_drop_transaction(journal, commit_transaction);
+		to_free = 1;
 	} else {
 		if (journal->j_checkpoint_transactions == NULL) {
 			journal->j_checkpoint_transactions = commit_transaction;
@@ -998,11 +1042,16 @@ restart_loop:
 	}
 	spin_unlock(&journal->j_list_lock);
 
+	if (journal->j_commit_callback)
+		journal->j_commit_callback(journal, commit_transaction);
+
 	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
-		   journal->j_devname, journal->j_commit_sequence,
+		   journal->j_devname, commit_transaction->t_tid,
 		   journal->j_tail_sequence);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
+	if (to_free)
+		kfree(commit_transaction);
 
 	wake_up(&journal->j_wait_done_commit);
 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657a19f8..56675306ed81 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -40,6 +40,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
+#include <asm/div64.h>
 
 EXPORT_SYMBOL(jbd2_journal_start);
 EXPORT_SYMBOL(jbd2_journal_restart);
@@ -50,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
 EXPORT_SYMBOL(jbd2_journal_get_write_access);
 EXPORT_SYMBOL(jbd2_journal_get_create_access);
 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
+EXPORT_SYMBOL(jbd2_journal_set_triggers);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_release_buffer);
 EXPORT_SYMBOL(jbd2_journal_forget);
@@ -65,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
 EXPORT_SYMBOL(jbd2_journal_check_used_features);
 EXPORT_SYMBOL(jbd2_journal_check_available_features);
 EXPORT_SYMBOL(jbd2_journal_set_features);
-EXPORT_SYMBOL(jbd2_journal_create);
 EXPORT_SYMBOL(jbd2_journal_load);
 EXPORT_SYMBOL(jbd2_journal_destroy);
 EXPORT_SYMBOL(jbd2_journal_abort);
@@ -131,8 +132,9 @@ static int kjournald2(void *arg)
 	journal->j_task = current;
 	wake_up(&journal->j_wait_done_commit);
 
-	printk(KERN_INFO "kjournald2 starting.  Commit interval %ld seconds\n",
-			journal->j_commit_interval / HZ);
+	printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
+	       "commit interval %ld seconds\n", current->pid,
+	       journal->j_devname, journal->j_commit_interval / HZ);
 
 	/*
 	 * And now, wait forever for commit wakeup events.
@@ -290,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	struct page *new_page;
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
+	struct jbd2_buffer_trigger_type *triggers;
 
 	/*
 	 * The buffer really shouldn't be locked: only the current committing
@@ -314,13 +317,23 @@ repeat:
 		done_copy_out = 1;
 		new_page = virt_to_page(jh_in->b_frozen_data);
 		new_offset = offset_in_page(jh_in->b_frozen_data);
+		triggers = jh_in->b_frozen_triggers;
 	} else {
 		new_page = jh2bh(jh_in)->b_page;
 		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+		triggers = jh_in->b_triggers;
 	}
 
 	mapped_data = kmap_atomic(new_page, KM_USER0);
 	/*
+	 * Fire any commit trigger.  Do this before checking for escaping,
+	 * as the trigger may modify the magic offset.  If a copy-out
+	 * happens afterwards, it will have the correct data in the buffer.
+	 */
+	jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
+				   triggers);
+
+	/*
 	 * Check for escaping
 	 */
 	if (*((__be32 *)(mapped_data + new_offset)) ==
@@ -352,6 +365,13 @@ repeat:
 		new_page = virt_to_page(tmp);
 		new_offset = offset_in_page(tmp);
 		done_copy_out = 1;
+
+		/*
+		 * This isn't strictly necessary, as we're using frozen
+		 * data for the escaping, but it keeps consistency with
+		 * b_frozen_data usage.
+		 */
+		jh_in->b_frozen_triggers = jh_in->b_triggers;
 	}
 
 	/*
@@ -631,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
 		return NULL;
 
 	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh)
+		return NULL;
 	lock_buffer(bh);
 	memset(bh->b_data, 0, journal->j_blocksize);
 	set_buffer_uptodate(bh);
@@ -824,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
 	    jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
 	seq_printf(seq, "  %ums logging transaction\n",
 	    jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
+	seq_printf(seq, "  %luus average transaction commit time\n",
+		   do_div(s->journal->j_average_commit_time, 1000));
 	seq_printf(seq, "  %lu handles per transaction\n",
 	    s->stats->u.run.rs_handle_count / s->stats->ts_tid);
 	seq_printf(seq, "  %lu blocks per transaction\n",
@@ -961,6 +985,8 @@ static journal_t * journal_init_common (void)
 	spin_lock_init(&journal->j_state_lock);
 
 	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
+	journal->j_min_batch_time = 0;
+	journal->j_max_batch_time = 15000; /* 15ms */
 
 	/* The journal is marked for error until we succeed with recovery! */
 	journal->j_flags = JBD2_ABORT;
@@ -1016,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 
 	/* journal descriptor can store up to n blocks -bzzz */
 	journal->j_blocksize = blocksize;
+	jbd2_stats_proc_init(journal);
 	n = journal->j_blocksize / sizeof(journal_block_tag_t);
 	journal->j_wbufsize = n;
 	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
-		kfree(journal);
-		journal = NULL;
-		goto out;
+		goto out_err;
 	}
 	journal->j_dev = bdev;
 	journal->j_fs_dev = fs_dev;
@@ -1034,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 	p = journal->j_devname;
 	while ((p = strchr(p, '/')))
 		*p = '!';
-	jbd2_stats_proc_init(journal);
 
 	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
-out:
+
 	return journal;
+out_err:
+	jbd2_stats_proc_exit(journal);
+	kfree(journal);
+	return NULL;
 }
 
 /**
@@ -1089,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
-		jbd2_stats_proc_exit(journal);
-		kfree(journal);
-		return NULL;
+		goto out_err;
 	}
 
 	err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1099,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	if (err) {
 		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
 		       __func__);
-		jbd2_stats_proc_exit(journal);
-		kfree(journal);
-		return NULL;
+		goto out_err;
 	}
 
 	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
 
 	return journal;
+out_err:
+	jbd2_stats_proc_exit(journal);
+	kfree(journal);
+	return NULL;
 }
 
 /*
@@ -1158,77 +1196,6 @@ static int journal_reset(journal_t *journal)
 }
 
 /**
- * int jbd2_journal_create() - Initialise the new journal file
- * @journal: Journal to create. This structure must have been initialised
- *
- * Given a journal_t structure which tells us which disk blocks we can
- * use, create a new journal superblock and initialise all of the
- * journal fields from scratch.
- **/
-int jbd2_journal_create(journal_t *journal)
-{
-	unsigned long long blocknr;
-	struct buffer_head *bh;
-	journal_superblock_t *sb;
-	int i, err;
-
-	if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
-		printk (KERN_ERR "Journal length (%d blocks) too short.\n",
-			journal->j_maxlen);
-		journal_fail_superblock(journal);
-		return -EINVAL;
-	}
-
-	if (journal->j_inode == NULL) {
-		/*
-		 * We don't know what block to start at!
-		 */
-		printk(KERN_EMERG
-		       "%s: creation of journal on external device!\n",
-		       __func__);
-		BUG();
-	}
-
-	/* Zero out the entire journal on disk.  We cannot afford to
-	   have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
-	jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
-	for (i = 0; i < journal->j_maxlen; i++) {
-		err = jbd2_journal_bmap(journal, i, &blocknr);
-		if (err)
-			return err;
-		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-		lock_buffer(bh);
-		memset (bh->b_data, 0, journal->j_blocksize);
-		BUFFER_TRACE(bh, "marking dirty");
-		mark_buffer_dirty(bh);
-		BUFFER_TRACE(bh, "marking uptodate");
-		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
-		__brelse(bh);
-	}
-
-	sync_blockdev(journal->j_dev);
-	jbd_debug(1, "JBD: journal cleared.\n");
-
-	/* OK, fill in the initial static fields in the new superblock */
-	sb = journal->j_superblock;
-
-	sb->s_header.h_magic	 = cpu_to_be32(JBD2_MAGIC_NUMBER);
-	sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
-
-	sb->s_blocksize	= cpu_to_be32(journal->j_blocksize);
-	sb->s_maxlen	= cpu_to_be32(journal->j_maxlen);
-	sb->s_first	= cpu_to_be32(1);
-
-	journal->j_transaction_sequence = 1;
-
-	journal->j_flags &= ~JBD2_ABORT;
-	journal->j_format_version = 2;
-
-	return journal_reset(journal);
-}
-
-/**
  * void jbd2_journal_update_superblock() - Update journal sb on disk.
  * @journal: The journal to update.
  * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -1472,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
 		jbd2_log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
 	}
 
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599a..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 {
 	transaction->t_journal = journal;
 	transaction->t_state = T_RUNNING;
+	transaction->t_start_time = ktime_get();
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
@@ -741,6 +743,12 @@ done:
 		source = kmap_atomic(page, KM_USER0);
 		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
 		kunmap_atomic(source, KM_USER0);
+
+		/*
+		 * Now that the frozen data is saved off, we need to store
+		 * any matching triggers.
+		 */
+		jh->b_frozen_triggers = jh->b_triggers;
 	}
 	jbd_unlock_bh_state(bh);
 
@@ -944,6 +952,47 @@ out:
 }
 
 /**
+ * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * @bh: buffer to trigger on
+ * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
+ *
+ * Set any triggers on this journal_head.  This is always safe, because
+ * triggers for a committing buffer will be saved off, and triggers for
+ * a running transaction will match the buffer in that transaction.
+ *
+ * Call with NULL to clear the triggers.
+ */
+void jbd2_journal_set_triggers(struct buffer_head *bh,
+			       struct jbd2_buffer_trigger_type *type)
+{
+	struct journal_head *jh = bh2jh(bh);
+
+	jh->b_triggers = type;
+}
+
+void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+				struct jbd2_buffer_trigger_type *triggers)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	if (!triggers || !triggers->t_commit)
+		return;
+
+	triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+}
+
+void jbd2_buffer_abort_trigger(struct journal_head *jh,
+			       struct jbd2_buffer_trigger_type *triggers)
+{
+	if (!triggers || !triggers->t_abort)
+		return;
+
+	triggers->t_abort(triggers, jh2bh(jh));
+}
+
+
+
+/**
  * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
  * @handle: transaction to add buffer to.
  * @bh: buffer to mark
@@ -1193,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
-	int old_handle_count, err;
+	int err;
 	pid_t pid;
 
 	J_ASSERT(journal_current_handle() == handle);
@@ -1216,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
 	/*
 	 * Implement synchronous transaction batching.  If the handle
 	 * was synchronous, don't force a commit immediately.  Let's
-	 * yield and let another thread piggyback onto this transaction.
-	 * Keep doing that while new threads continue to arrive.
-	 * It doesn't cost much - we're about to run a commit and sleep
-	 * on IO anyway.  Speeds up many-threaded, many-dir operations
-	 * by 30x or more...
+	 * yield and let another thread piggyback onto this
+	 * transaction.  Keep doing that while new threads continue to
+	 * arrive.  It doesn't cost much - we're about to run a commit
+	 * and sleep on IO anyway.  Speeds up many-threaded, many-dir
+	 * operations by 30x or more...
 	 *
-	 * But don't do this if this process was the most recent one to
-	 * perform a synchronous write.  We do this to detect the case where a
-	 * single process is doing a stream of sync writes.  No point in waiting
-	 * for joiners in that case.
+	 * We try and optimize the sleep time against what the
+	 * underlying disk can do, instead of having a static sleep
+	 * time.  This is useful for the case where our storage is so
+	 * fast that it is more optimal to go ahead and force a flush
+	 * and wait for the transaction to be committed than it is to
+	 * wait for an arbitrary amount of time for new writers to
+	 * join the transaction.  We achieve this by measuring how
+	 * long it takes to commit a transaction, and compare it with
+	 * how long this transaction has been running, and if run time
+	 * < commit time then we sleep for the delta and commit.  This
+	 * greatly helps super fast disks that would see slowdowns as
+	 * more threads started doing fsyncs.
+	 *
+	 * But don't do this if this process was the most recent one
+	 * to perform a synchronous write.  We do this to detect the
+	 * case where a single process is doing a stream of sync
+	 * writes.  No point in waiting for joiners in that case.
 	 */
 	pid = current->pid;
 	if (handle->h_sync && journal->j_last_sync_writer != pid) {
+		u64 commit_time, trans_time;
+
 		journal->j_last_sync_writer = pid;
-		do {
-			old_handle_count = transaction->t_handle_count;
-			schedule_timeout_uninterruptible(1);
-		} while (old_handle_count != transaction->t_handle_count);
+
+		spin_lock(&journal->j_state_lock);
+		commit_time = journal->j_average_commit_time;
+		spin_unlock(&journal->j_state_lock);
+
+		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+						   transaction->t_start_time));
+
+		commit_time = max_t(u64, commit_time,
+				    1000*journal->j_min_batch_time);
+		commit_time = min_t(u64, commit_time,
+				    1000*journal->j_max_batch_time);
+
+		if (trans_time < commit_time) {
+			ktime_t expires = ktime_add_ns(ktime_get(),
+						       commit_time);
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+		}
 	}
 
 	current->journal_info = NULL;
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index d6363d8309d0..0f94381ca6d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -58,9 +58,9 @@
 
 /*
  * __mark_inode_dirty expects inodes to be hashed.  Since we don't want
- * special inodes in the fileset inode space, we hash them to a dummy head
+ * special inodes in the fileset inode space, we make them appear hashed,
+ * but do not put on any lists.
  */
-static HLIST_HEAD(aggregate_hash);
 
 /*
  * imap locks
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
 	/* release the page */
 	release_metapage(mp);
 
-	hlist_add_head(&ip->i_hash, &aggregate_hash);
+	/*
+	 * that will look hashed, but won't be on any list; hlist_del()
+	 * will work fine and require no locking.
+	 */
+	ip->i_hash.pprev = &ip->i_hash.next;
 
 	return (ip);
 }
diff --git a/fs/libfs.c b/fs/libfs.c
index bdaec17fa388..49b44099dabb 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	 */
 	root->i_ino = 1;
 	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
-	root->i_uid = root->i_gid = 0;
 	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
 	dentry = d_alloc(NULL, &d_name);
 	if (!dentry) {
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 	 */
 	inode->i_ino = 1;
 	inode->i_mode = S_IFDIR | 0755;
-	inode->i_uid = inode->i_gid = 0;
-	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 		if (!inode)
 			goto out;
 		inode->i_mode = S_IFREG | files->mode;
-		inode->i_uid = inode->i_gid = 0;
-		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_fop = files->ops;
 		inode->i_ino = i;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 31668b690e03..dd7957064a8c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -16,7 +16,6 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 #define NLMCLNT_GRACE_WAIT	(5*HZ)
@@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	unsigned char fl_type;
 	int status = -ENOLCK;
 
-	if (nsm_monitor(host) < 0) {
-		printk(KERN_NOTICE "lockd: failed to monitor %s\n",
-					host->h_name);
+	if (nsm_monitor(host) < 0)
 		goto out;
-	}
+
 	fl->fl_flags |= FL_ACCESS;
 	status = do_vfs_lock(fl);
 	fl->fl_flags = fl_flags;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index abdebf76b820..99d737bd4325 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -15,7 +15,6 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 #include <linux/mutex.h>
 
 #include <net/ipv6.h>
@@ -32,11 +31,6 @@ static int			nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 
 static void			nlm_gc_hosts(void);
-static struct nsm_handle	*nsm_find(const struct sockaddr *sap,
-						const size_t salen,
-						const char *hostname,
-						const size_t hostname_len,
-						const int create);
 
 struct nlm_lookup_host_info {
 	const int		server;		/* search for server|client */
@@ -105,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap)
 	}
 }
 
-static void nlm_display_address(const struct sockaddr *sap,
-				char *buf, const size_t len)
-{
-	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-
-	switch (sap->sa_family) {
-	case AF_UNSPEC:
-		snprintf(buf, len, "unspecified");
-		break;
-	case AF_INET:
-		snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
-		break;
-	case AF_INET6:
-		if (ipv6_addr_v4mapped(&sin6->sin6_addr))
-			snprintf(buf, len, "%pI4",
-				 &sin6->sin6_addr.s6_addr32[3]);
-		else
-			snprintf(buf, len, "%pI6", &sin6->sin6_addr);
-		break;
-	default:
-		snprintf(buf, len, "unsupported address family");
-		break;
-	}
-}
-
 /*
  * Common host lookup routine for server & client
  */
@@ -190,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 		atomic_inc(&nsm->sm_count);
 	else {
 		host = NULL;
-		nsm = nsm_find(ni->sap, ni->salen,
-				ni->hostname, ni->hostname_len, 1);
+		nsm = nsm_get_handle(ni->sap, ni->salen,
+					ni->hostname, ni->hostname_len);
 		if (!nsm) {
 			dprintk("lockd: nlm_lookup_host failed; "
 				"no nsm handle\n");
@@ -206,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 		goto out;
 	}
 	host->h_name	   = nsm->sm_name;
+	host->h_addrbuf    = nsm->sm_addrbuf;
 	memcpy(nlm_addr(host), ni->sap, ni->salen);
 	host->h_addrlen = ni->salen;
 	nlm_clear_port(nlm_addr(host));
@@ -232,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 
 	nrhosts++;
 
-	nlm_display_address((struct sockaddr *)&host->h_addr,
-				host->h_addrbuf, sizeof(host->h_addrbuf));
-	nlm_display_address((struct sockaddr *)&host->h_srcaddr,
-				host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
-
 	dprintk("lockd: nlm_lookup_host created host %s\n",
 			host->h_name);
 
@@ -256,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host)
 	BUG_ON(!list_empty(&host->h_lockowners));
 	BUG_ON(atomic_read(&host->h_count));
 
-	/*
-	 * Release NSM handle and unmonitor host.
-	 */
 	nsm_unmonitor(host);
+	nsm_release(host->h_nsmhandle);
 
 	clnt = host->h_rpcclnt;
 	if (clnt != NULL)
@@ -378,8 +340,8 @@ nlm_bind_host(struct nlm_host *host)
 {
 	struct rpc_clnt	*clnt;
 
-	dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
-			host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
+	dprintk("lockd: nlm_bind_host %s (%s)\n",
+			host->h_name, host->h_addrbuf);
 
 	/* Lock host handle */
 	mutex_lock(&host->h_mutex);
@@ -481,35 +443,23 @@ void nlm_release_host(struct nlm_host *host)
 	}
 }
 
-/*
- * We were notified that the host indicated by address &sin
- * has rebooted.
- * Release all resources held by that peer.
+/**
+ * nlm_host_rebooted - Release all resources held by rebooted host
+ * @info: pointer to decoded results of NLM_SM_NOTIFY call
+ *
+ * We were notified that the specified host has rebooted.  Release
+ * all resources held by that peer.
  */
-void nlm_host_rebooted(const struct sockaddr_in *sin,
-				const char *hostname,
-				unsigned int hostname_len,
-				u32 new_state)
+void nlm_host_rebooted(const struct nlm_reboot *info)
 {
 	struct hlist_head *chain;
 	struct hlist_node *pos;
 	struct nsm_handle *nsm;
 	struct nlm_host	*host;
 
-	nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
-			hostname, hostname_len, 0);
-	if (nsm == NULL) {
-		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
-				hostname_len, hostname);
+	nsm = nsm_reboot_lookup(info);
+	if (unlikely(nsm == NULL))
 		return;
-	}
-
-	dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
-			hostname_len, hostname, nsm->sm_addrbuf);
-
-	/* When reclaiming locks on this peer, make sure that
-	 * we set up a new notification */
-	nsm->sm_monitored = 0;
 
 	/* Mark all hosts tied to this NSM state as having rebooted.
 	 * We run the loop repeatedly, because we drop the host table
@@ -520,8 +470,8 @@ again:	mutex_lock(&nlm_host_mutex);
 	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
 		hlist_for_each_entry(host, pos, chain, h_hash) {
 			if (host->h_nsmhandle == nsm
-			 && host->h_nsmstate != new_state) {
-				host->h_nsmstate = new_state;
+			 && host->h_nsmstate != info->state) {
+				host->h_nsmstate = info->state;
 				host->h_state++;
 
 				nlm_get_host(host);
@@ -629,89 +579,3 @@ nlm_gc_hosts(void)
 
 	next_gc = jiffies + NLM_HOST_COLLECT;
 }
-
-
-/*
- * Manage NSM handles
- */
-static LIST_HEAD(nsm_handles);
-static DEFINE_SPINLOCK(nsm_lock);
-
-static struct nsm_handle *nsm_find(const struct sockaddr *sap,
-				   const size_t salen,
-				   const char *hostname,
-				   const size_t hostname_len,
-				   const int create)
-{
-	struct nsm_handle *nsm = NULL;
-	struct nsm_handle *pos;
-
-	if (!sap)
-		return NULL;
-
-	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
-		if (printk_ratelimit()) {
-			printk(KERN_WARNING "Invalid hostname \"%.*s\" "
-					    "in NFS lock request\n",
-				(int)hostname_len, hostname);
-		}
-		return NULL;
-	}
-
-retry:
-	spin_lock(&nsm_lock);
-	list_for_each_entry(pos, &nsm_handles, sm_link) {
-
-		if (hostname && nsm_use_hostnames) {
-			if (strlen(pos->sm_name) != hostname_len
-			 || memcmp(pos->sm_name, hostname, hostname_len))
-				continue;
-		} else if (!nlm_cmp_addr(nsm_addr(pos), sap))
-			continue;
-		atomic_inc(&pos->sm_count);
-		kfree(nsm);
-		nsm = pos;
-		goto found;
-	}
-	if (nsm) {
-		list_add(&nsm->sm_link, &nsm_handles);
-		goto found;
-	}
-	spin_unlock(&nsm_lock);
-
-	if (!create)
-		return NULL;
-
-	nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
-	if (nsm == NULL)
-		return NULL;
-
-	memcpy(nsm_addr(nsm), sap, salen);
-	nsm->sm_addrlen = salen;
-	nsm->sm_name = (char *) (nsm + 1);
-	memcpy(nsm->sm_name, hostname, hostname_len);
-	nsm->sm_name[hostname_len] = '\0';
-	nlm_display_address((struct sockaddr *)&nsm->sm_addr,
-				nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
-	atomic_set(&nsm->sm_count, 1);
-	goto retry;
-
-found:
-	spin_unlock(&nsm_lock);
-	return nsm;
-}
-
-/*
- * Release an NSM handle
- */
-void
-nsm_release(struct nsm_handle *nsm)
-{
-	if (!nsm)
-		return;
-	if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
-		list_del(&nsm->sm_link);
-		spin_unlock(&nsm_lock);
-		kfree(nsm);
-	}
-}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index ffd3461f75ef..5e2c4d5ac827 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -9,35 +9,123 @@
 #include <linux/types.h>
 #include <linux/utsname.h>
 #include <linux/kernel.h>
+#include <linux/ktime.h>
+
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprtsock.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
-
 
 #define NLMDBG_FACILITY		NLMDBG_MONITOR
+#define NSM_PROGRAM		100024
+#define NSM_VERSION		1
+
+enum {
+	NSMPROC_NULL,
+	NSMPROC_STAT,
+	NSMPROC_MON,
+	NSMPROC_UNMON,
+	NSMPROC_UNMON_ALL,
+	NSMPROC_SIMU_CRASH,
+	NSMPROC_NOTIFY,
+};
+
+struct nsm_args {
+	struct nsm_private	*priv;
+	u32			prog;		/* RPC callback info */
+	u32			vers;
+	u32			proc;
 
-#define XDR_ADDRBUF_LEN		(20)
+	char			*mon_name;
+};
 
-static struct rpc_clnt *	nsm_create(void);
+struct nsm_res {
+	u32			status;
+	u32			state;
+};
 
 static struct rpc_program	nsm_program;
+static				LIST_HEAD(nsm_handles);
+static				DEFINE_SPINLOCK(nsm_lock);
 
 /*
  * Local NSM state
  */
-int				nsm_local_state;
+int	__read_mostly		nsm_local_state;
+int	__read_mostly		nsm_use_hostnames;
 
-/*
- * Common procedure for SM_MON/SM_UNMON calls
- */
-static int
-nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
+static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
+{
+	return (struct sockaddr *)&nsm->sm_addr;
+}
+
+static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
+				     const size_t len)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
+}
+
+static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
+				     const size_t len)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+
+	if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+		snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
+	else if (sin6->sin6_scope_id != 0)
+		snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
+				sin6->sin6_scope_id);
+	else
+		snprintf(buf, len, "%pI6", &sin6->sin6_addr);
+}
+
+static void nsm_display_address(const struct sockaddr *sap,
+				char *buf, const size_t len)
+{
+	switch (sap->sa_family) {
+	case AF_INET:
+		nsm_display_ipv4_address(sap, buf, len);
+		break;
+	case AF_INET6:
+		nsm_display_ipv6_address(sap, buf, len);
+		break;
+	default:
+		snprintf(buf, len, "unsupported address family");
+		break;
+	}
+}
+
+static struct rpc_clnt *nsm_create(void)
+{
+	struct sockaddr_in sin = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),
+	};
+	struct rpc_create_args args = {
+		.protocol		= XPRT_TRANSPORT_UDP,
+		.address		= (struct sockaddr *)&sin,
+		.addrsize		= sizeof(sin),
+		.servername		= "rpc.statd",
+		.program		= &nsm_program,
+		.version		= NSM_VERSION,
+		.authflavor		= RPC_AUTH_NULL,
+	};
+
+	return rpc_create(&args);
+}
+
+static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 {
 	struct rpc_clnt	*clnt;
 	int		status;
-	struct nsm_args	args;
+	struct nsm_args args = {
+		.priv		= &nsm->sm_priv,
+		.prog		= NLM_PROGRAM,
+		.vers		= 3,
+		.proc		= NLMPROC_NSM_NOTIFY,
+		.mon_name	= nsm->sm_mon_name,
+	};
 	struct rpc_message msg = {
 		.rpc_argp	= &args,
 		.rpc_resp	= res,
@@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 	clnt = nsm_create();
 	if (IS_ERR(clnt)) {
 		status = PTR_ERR(clnt);
+		dprintk("lockd: failed to create NSM upcall transport, "
+				"status=%d\n", status);
 		goto out;
 	}
 
-	memset(&args, 0, sizeof(args));
-	args.mon_name = nsm->sm_name;
-	args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
-	args.prog = NLM_PROGRAM;
-	args.vers = 3;
-	args.proc = NLMPROC_NSM_NOTIFY;
 	memset(res, 0, sizeof(*res));
 
 	msg.rpc_proc = &clnt->cl_procinfo[proc];
 	status = rpc_call_sync(clnt, &msg, 0);
 	if (status < 0)
-		printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n",
-			status);
+		dprintk("lockd: NSM upcall RPC failed, status=%d\n",
+				status);
 	else
 		status = 0;
 	rpc_shutdown_client(clnt);
@@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 	return status;
 }
 
-/*
- * Set up monitoring of a remote host
+/**
+ * nsm_monitor - Notify a peer in case we reboot
+ * @host: pointer to nlm_host of peer to notify
+ *
+ * If this peer is not already monitored, this function sends an
+ * upcall to the local rpc.statd to record the name/address of
+ * the peer to notify in case we reboot.
+ *
+ * Returns zero if the peer is monitored by the local rpc.statd;
+ * otherwise a negative errno value is returned.
  */
-int
-nsm_monitor(struct nlm_host *host)
+int nsm_monitor(const struct nlm_host *host)
 {
 	struct nsm_handle *nsm = host->h_nsmhandle;
 	struct nsm_res	res;
 	int		status;
 
-	dprintk("lockd: nsm_monitor(%s)\n", host->h_name);
-	BUG_ON(nsm == NULL);
+	dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
 
 	if (nsm->sm_monitored)
 		return 0;
 
-	status = nsm_mon_unmon(nsm, SM_MON, &res);
+	/*
+	 * Choose whether to record the caller_name or IP address of
+	 * this peer in the local rpc.statd's database.
+	 */
+	nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
 
-	if (status < 0 || res.status != 0)
-		printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name);
+	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
+	if (res.status != 0)
+		status = -EIO;
+	if (status < 0)
+		printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
 	else
 		nsm->sm_monitored = 1;
 	return status;
 }
 
-/*
- * Cease to monitor remote host
+/**
+ * nsm_unmonitor - Unregister peer notification
+ * @host: pointer to nlm_host of peer to stop monitoring
+ *
+ * If this peer is monitored, this function sends an upcall to
+ * tell the local rpc.statd not to send this peer a notification
+ * when we reboot.
  */
-int
-nsm_unmonitor(struct nlm_host *host)
+void nsm_unmonitor(const struct nlm_host *host)
 {
 	struct nsm_handle *nsm = host->h_nsmhandle;
 	struct nsm_res	res;
-	int		status = 0;
-
-	if (nsm == NULL)
-		return 0;
-	host->h_nsmhandle = NULL;
+	int status;
 
 	if (atomic_read(&nsm->sm_count) == 1
 	 && nsm->sm_monitored && !nsm->sm_sticky) {
-		dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name);
+		dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
 
-		status = nsm_mon_unmon(nsm, SM_UNMON, &res);
+		status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res);
+		if (res.status != 0)
+			status = -EIO;
 		if (status < 0)
 			printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
-					host->h_name);
+					nsm->sm_name);
 		else
 			nsm->sm_monitored = 0;
 	}
-	nsm_release(nsm);
-	return status;
+}
+
+static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
+					      const size_t len)
+{
+	struct nsm_handle *nsm;
+
+	list_for_each_entry(nsm, &nsm_handles, sm_link)
+		if (strlen(nsm->sm_name) == len &&
+		    memcmp(nsm->sm_name, hostname, len) == 0)
+			return nsm;
+	return NULL;
+}
+
+static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+{
+	struct nsm_handle *nsm;
+
+	list_for_each_entry(nsm, &nsm_handles, sm_link)
+		if (nlm_cmp_addr(nsm_addr(nsm), sap))
+			return nsm;
+	return NULL;
+}
+
+static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+{
+	struct nsm_handle *nsm;
+
+	list_for_each_entry(nsm, &nsm_handles, sm_link)
+		if (memcmp(nsm->sm_priv.data, priv->data,
+					sizeof(priv->data)) == 0)
+			return nsm;
+	return NULL;
 }
 
 /*
- * Create NSM client for the local host
+ * Construct a unique cookie to match this nsm_handle to this monitored
+ * host.  It is passed to the local rpc.statd via NSMPROC_MON, and
+ * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these
+ * requests.
+ *
+ * The NSM protocol requires that these cookies be unique while the
+ * system is running.  We prefer a stronger requirement of making them
+ * unique across reboots.  If user space bugs cause a stale cookie to
+ * be sent to the kernel, it could cause the wrong host to lose its
+ * lock state if cookies were not unique across reboots.
+ *
+ * The cookies are exposed only to local user space via loopback.  They
+ * do not appear on the physical network.  If we want greater security
+ * for some reason, nsm_init_private() could perform a one-way hash to
+ * obscure the contents of the cookie.
  */
-static struct rpc_clnt *
-nsm_create(void)
+static void nsm_init_private(struct nsm_handle *nsm)
 {
-	struct sockaddr_in	sin = {
-		.sin_family	= AF_INET,
-		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
-		.sin_port	= 0,
-	};
-	struct rpc_create_args args = {
-		.protocol	= XPRT_TRANSPORT_UDP,
-		.address	= (struct sockaddr *)&sin,
-		.addrsize	= sizeof(sin),
-		.servername	= "localhost",
-		.program	= &nsm_program,
-		.version	= SM_VERSION,
-		.authflavor	= RPC_AUTH_NULL,
-	};
+	u64 *p = (u64 *)&nsm->sm_priv.data;
+	struct timespec ts;
 
-	return rpc_create(&args);
+	ktime_get_ts(&ts);
+	*p++ = timespec_to_ns(&ts);
+	*p = (unsigned long)nsm;
+}
+
+static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
+					    const size_t salen,
+					    const char *hostname,
+					    const size_t hostname_len)
+{
+	struct nsm_handle *new;
+
+	new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
+	if (unlikely(new == NULL))
+		return NULL;
+
+	atomic_set(&new->sm_count, 1);
+	new->sm_name = (char *)(new + 1);
+	memcpy(nsm_addr(new), sap, salen);
+	new->sm_addrlen = salen;
+	nsm_init_private(new);
+	nsm_display_address((const struct sockaddr *)&new->sm_addr,
+				new->sm_addrbuf, sizeof(new->sm_addrbuf));
+	memcpy(new->sm_name, hostname, hostname_len);
+	new->sm_name[hostname_len] = '\0';
+
+	return new;
+}
+
+/**
+ * nsm_get_handle - Find or create a cached nsm_handle
+ * @sap: pointer to socket address of handle to find
+ * @salen: length of socket address
+ * @hostname: pointer to C string containing hostname to find
+ * @hostname_len: length of C string
+ *
+ * Behavior is modulated by the global nsm_use_hostnames variable.
+ *
+ * Returns a cached nsm_handle after bumping its ref count, or
+ * returns a fresh nsm_handle if a handle that matches @sap and/or
+ * @hostname cannot be found in the handle cache.  Returns NULL if
+ * an error occurs.
+ */
+struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+				  const size_t salen, const char *hostname,
+				  const size_t hostname_len)
+{
+	struct nsm_handle *cached, *new = NULL;
+
+	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
+		if (printk_ratelimit()) {
+			printk(KERN_WARNING "Invalid hostname \"%.*s\" "
+					    "in NFS lock request\n",
+				(int)hostname_len, hostname);
+		}
+		return NULL;
+	}
+
+retry:
+	spin_lock(&nsm_lock);
+
+	if (nsm_use_hostnames && hostname != NULL)
+		cached = nsm_lookup_hostname(hostname, hostname_len);
+	else
+		cached = nsm_lookup_addr(sap);
+
+	if (cached != NULL) {
+		atomic_inc(&cached->sm_count);
+		spin_unlock(&nsm_lock);
+		kfree(new);
+		dprintk("lockd: found nsm_handle for %s (%s), "
+				"cnt %d\n", cached->sm_name,
+				cached->sm_addrbuf,
+				atomic_read(&cached->sm_count));
+		return cached;
+	}
+
+	if (new != NULL) {
+		list_add(&new->sm_link, &nsm_handles);
+		spin_unlock(&nsm_lock);
+		dprintk("lockd: created nsm_handle for %s (%s)\n",
+				new->sm_name, new->sm_addrbuf);
+		return new;
+	}
+
+	spin_unlock(&nsm_lock);
+
+	new = nsm_create_handle(sap, salen, hostname, hostname_len);
+	if (unlikely(new == NULL))
+		return NULL;
+	goto retry;
+}
+
+/**
+ * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @info: pointer to NLMPROC_SM_NOTIFY arguments
+ *
+ * Returns a matching nsm_handle if found in the nsm cache; the returned
+ * nsm_handle's reference count is bumped and sm_monitored is cleared.
+ * Otherwise returns NULL if some error occurred.
+ */
+struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+{
+	struct nsm_handle *cached;
+
+	spin_lock(&nsm_lock);
+
+	cached = nsm_lookup_priv(&info->priv);
+	if (unlikely(cached == NULL)) {
+		spin_unlock(&nsm_lock);
+		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
+				info->len, info->mon);
+		return cached;
+	}
+
+	atomic_inc(&cached->sm_count);
+	spin_unlock(&nsm_lock);
+
+	/*
+	 * During subsequent lock activity, force a fresh
+	 * notification to be set up for this host.
+	 */
+	cached->sm_monitored = 0;
+
+	dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
+			cached->sm_name, cached->sm_addrbuf,
+			atomic_read(&cached->sm_count));
+	return cached;
+}
+
+/**
+ * nsm_release - Release an NSM handle
+ * @nsm: pointer to handle to be released
+ *
+ */
+void nsm_release(struct nsm_handle *nsm)
+{
+	if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
+		list_del(&nsm->sm_link);
+		spin_unlock(&nsm_lock);
+		dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
+				nsm->sm_name, nsm->sm_addrbuf);
+		kfree(nsm);
+	}
 }
 
 /*
@@ -154,127 +428,132 @@ nsm_create(void)
  * Status Monitor wire protocol.
  */
 
-static __be32 *xdr_encode_nsm_string(__be32 *p, char *string)
+static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
 {
-	size_t len = strlen(string);
-
-	if (len > SM_MAXSTRLEN)
-		len = SM_MAXSTRLEN;
-	return xdr_encode_opaque(p, string, len);
+	const u32 len = strlen(string);
+	__be32 *p;
+
+	if (unlikely(len > SM_MAXSTRLEN))
+		return -EIO;
+	p = xdr_reserve_space(xdr, sizeof(u32) + len);
+	if (unlikely(p == NULL))
+		return -EIO;
+	xdr_encode_opaque(p, string, len);
+	return 0;
 }
 
 /*
  * "mon_name" specifies the host to be monitored.
- *
- * Linux uses a text version of the IP address of the remote
- * host as the host identifier (the "mon_name" argument).
- *
- * Linux statd always looks up the canonical hostname first for
- * whatever remote hostname it receives, so this works alright.
  */
-static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp)
+static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	char	buffer[XDR_ADDRBUF_LEN + 1];
-	char	*name = argp->mon_name;
-
-	if (!nsm_use_hostnames) {
-		snprintf(buffer, XDR_ADDRBUF_LEN,
-			 "%pI4", &argp->addr);
-		name = buffer;
-	}
-
-	return xdr_encode_nsm_string(p, name);
+	return encode_nsm_string(xdr, argp->mon_name);
 }
 
 /*
  * The "my_id" argument specifies the hostname and RPC procedure
  * to be called when the status manager receives notification
- * (via the SM_NOTIFY call) that the state of host "mon_name"
+ * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
  * has changed.
  */
-static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp)
+static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	p = xdr_encode_nsm_string(p, utsname()->nodename);
-	if (!p)
-		return ERR_PTR(-EIO);
-
+	int status;
+	__be32 *p;
+
+	status = encode_nsm_string(xdr, utsname()->nodename);
+	if (unlikely(status != 0))
+		return status;
+	p = xdr_reserve_space(xdr, 3 * sizeof(u32));
+	if (unlikely(p == NULL))
+		return -EIO;
 	*p++ = htonl(argp->prog);
 	*p++ = htonl(argp->vers);
 	*p++ = htonl(argp->proc);
-
-	return p;
+	return 0;
 }
 
 /*
  * The "mon_id" argument specifies the non-private arguments
- * of an SM_MON or SM_UNMON call.
+ * of an NSMPROC_MON or NSMPROC_UNMON call.
  */
-static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp)
+static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	p = xdr_encode_mon_name(p, argp);
-	if (!p)
-		return ERR_PTR(-EIO);
+	int status;
 
-	return xdr_encode_my_id(p, argp);
+	status = encode_mon_name(xdr, argp);
+	if (unlikely(status != 0))
+		return status;
+	return encode_my_id(xdr, argp);
 }
 
 /*
  * The "priv" argument may contain private information required
- * by the SM_MON call. This information will be supplied in the
- * SM_NOTIFY call.
- *
- * Linux provides the raw IP address of the monitored host,
- * left in network byte order.
+ * by the NSMPROC_MON call. This information will be supplied in the
+ * NLMPROC_SM_NOTIFY call.
  */
-static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp)
+static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	*p++ = argp->addr;
-	*p++ = 0;
-	*p++ = 0;
-	*p++ = 0;
+	__be32 *p;
 
-	return p;
+	p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
+	if (unlikely(p == NULL))
+		return -EIO;
+	xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
+	return 0;
 }
 
-static int
-xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
+static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
+		       const struct nsm_args *argp)
 {
-	p = xdr_encode_mon_id(p, argp);
-	if (IS_ERR(p))
-		return PTR_ERR(p);
-
-	p = xdr_encode_priv(p, argp);
-	if (IS_ERR(p))
-		return PTR_ERR(p);
-
-	rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
-	return 0;
+	struct xdr_stream xdr;
+	int status;
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	status = encode_mon_id(&xdr, argp);
+	if (unlikely(status))
+		return status;
+	return encode_priv(&xdr, argp);
 }
 
-static int
-xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
+static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
+			 const struct nsm_args *argp)
 {
-	p = xdr_encode_mon_id(p, argp);
-	if (IS_ERR(p))
-		return PTR_ERR(p);
-	rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
-	return 0;
+	struct xdr_stream xdr;
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	return encode_mon_id(&xdr, argp);
 }
 
-static int
-xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
+static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
+			    struct nsm_res *resp)
 {
+	struct xdr_stream xdr;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
+	if (unlikely(p == NULL))
+		return -EIO;
 	resp->status = ntohl(*p++);
-	resp->state = ntohl(*p++);
-	dprintk("nsm: xdr_decode_stat_res status %d state %d\n",
+	resp->state = ntohl(*p);
+
+	dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
 			resp->status, resp->state);
 	return 0;
 }
 
-static int
-xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
+static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
+			struct nsm_res *resp)
 {
-	resp->state = ntohl(*p++);
+	struct xdr_stream xdr;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	p = xdr_inline_decode(&xdr, sizeof(u32));
+	if (unlikely(p == NULL))
+		return -EIO;
+	resp->state = ntohl(*p);
+
+	dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
 	return 0;
 }
 
@@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
 #define SM_unmonres_sz	1
 
 static struct rpc_procinfo	nsm_procedures[] = {
-[SM_MON] = {
-		.p_proc		= SM_MON,
-		.p_encode	= (kxdrproc_t) xdr_encode_mon,
-		.p_decode	= (kxdrproc_t) xdr_decode_stat_res,
+[NSMPROC_MON] = {
+		.p_proc		= NSMPROC_MON,
+		.p_encode	= (kxdrproc_t)xdr_enc_mon,
+		.p_decode	= (kxdrproc_t)xdr_dec_stat_res,
 		.p_arglen	= SM_mon_sz,
 		.p_replen	= SM_monres_sz,
-		.p_statidx	= SM_MON,
+		.p_statidx	= NSMPROC_MON,
 		.p_name		= "MONITOR",
 	},
-[SM_UNMON] = {
-		.p_proc		= SM_UNMON,
-		.p_encode	= (kxdrproc_t) xdr_encode_unmon,
-		.p_decode	= (kxdrproc_t) xdr_decode_stat,
+[NSMPROC_UNMON] = {
+		.p_proc		= NSMPROC_UNMON,
+		.p_encode	= (kxdrproc_t)xdr_enc_unmon,
+		.p_decode	= (kxdrproc_t)xdr_dec_stat,
 		.p_arglen	= SM_mon_id_sz,
 		.p_replen	= SM_unmonres_sz,
-		.p_statidx	= SM_UNMON,
+		.p_statidx	= NSMPROC_UNMON,
 		.p_name		= "UNMONITOR",
 	},
 };
@@ -322,7 +601,7 @@ static struct rpc_stat		nsm_stats;
 
 static struct rpc_program	nsm_program = {
 		.name		= "statd",
-		.number		= SM_PROGRAM,
+		.number		= NSM_PROGRAM,
 		.nrvers		= ARRAY_SIZE(nsm_version),
 		.version	= nsm_version,
 		.stats		= &nsm_stats
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 252d80163d02..64f1c31b5853 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,7 +35,6 @@
 #include <linux/sunrpc/svcsock.h>
 #include <net/ip.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 #include <linux/nfs.h>
 
 #define NLMDBG_FACILITY		NLMDBG_SVC
@@ -54,13 +53,26 @@ static struct svc_rqst		*nlmsvc_rqst;
 unsigned long			nlmsvc_timeout;
 
 /*
+ * If the kernel has IPv6 support available, always listen for
+ * both AF_INET and AF_INET6 requests.
+ */
+#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
+	defined(CONFIG_SUNRPC_REGISTER_V4)
+static const sa_family_t	nlmsvc_family = AF_INET6;
+#else	/* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
+static const sa_family_t	nlmsvc_family = AF_INET;
+#endif	/* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
+
+/*
  * These can be set at insmod time (useful for NFS as root filesystem),
  * and also changed through the sysctl interface.  -- Jamie Lokier, Aug 2003
  */
 static unsigned long		nlm_grace_period;
 static unsigned long		nlm_timeout = LOCKD_DFLT_TIMEO;
 static int			nlm_udpport, nlm_tcpport;
-int				nsm_use_hostnames = 0;
+
+/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
+static unsigned int		nlm_max_connections = 1024;
 
 /*
  * Constants needed for the sysctl interface.
@@ -143,6 +155,9 @@ lockd(void *vrqstp)
 		long timeout = MAX_SCHEDULE_TIMEOUT;
 		RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
 
+		/* update sv_maxconn if it has changed */
+		rqstp->rq_server->sv_maxconn = nlm_max_connections;
+
 		if (signalled()) {
 			flush_signals(current);
 			if (nlmsvc_ops) {
@@ -189,6 +204,19 @@ lockd(void *vrqstp)
 	return 0;
 }
 
+static int create_lockd_listener(struct svc_serv *serv, char *name,
+				 unsigned short port)
+{
+	struct svc_xprt *xprt;
+
+	xprt = svc_find_xprt(serv, name, 0, 0);
+	if (xprt == NULL)
+		return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
+
+	svc_xprt_put(xprt);
+	return 0;
+}
+
 /*
  * Ensure there are active UDP and TCP listeners for lockd.
  *
@@ -202,29 +230,23 @@ lockd(void *vrqstp)
 static int make_socks(struct svc_serv *serv)
 {
 	static int warned;
-	struct svc_xprt *xprt;
-	int err = 0;
+	int err;
 
-	xprt = svc_find_xprt(serv, "udp", 0, 0);
-	if (!xprt)
-		err = svc_create_xprt(serv, "udp", nlm_udpport,
-				      SVC_SOCK_DEFAULTS);
-	else
-		svc_xprt_put(xprt);
-	if (err >= 0) {
-		xprt = svc_find_xprt(serv, "tcp", 0, 0);
-		if (!xprt)
-			err = svc_create_xprt(serv, "tcp", nlm_tcpport,
-					      SVC_SOCK_DEFAULTS);
-		else
-			svc_xprt_put(xprt);
-	}
-	if (err >= 0) {
-		warned = 0;
-		err = 0;
-	} else if (warned++ == 0)
+	err = create_lockd_listener(serv, "udp", nlm_udpport);
+	if (err < 0)
+		goto out_err;
+
+	err = create_lockd_listener(serv, "tcp", nlm_tcpport);
+	if (err < 0)
+		goto out_err;
+
+	warned = 0;
+	return 0;
+
+out_err:
+	if (warned++ == 0)
 		printk(KERN_WARNING
-		       "lockd_up: makesock failed, error=%d\n", err);
+			"lockd_up: makesock failed, error=%d\n", err);
 	return err;
 }
 
@@ -252,7 +274,7 @@ int lockd_up(void)
 			"lockd_up: no pid, %d users??\n", nlmsvc_users);
 
 	error = -ENOMEM;
-	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
+	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
 	if (!serv) {
 		printk(KERN_WARNING "lockd_up: create service failed\n");
 		goto out;
@@ -276,6 +298,7 @@ int lockd_up(void)
 	}
 
 	svc_sock_update_bufs(serv);
+	serv->sv_maxconn = nlm_max_connections;
 
 	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
@@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
 module_param_call(nlm_tcpport, param_set_port, param_get_int,
 		  &nlm_tcpport, 0644);
 module_param(nsm_use_hostnames, bool, 0644);
+module_param(nlm_max_connections, uint, 0644);
 
 /*
  * Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4dfdcbc6bf68..1725037374c5 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -16,8 +16,6 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
-
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 
@@ -419,8 +417,6 @@ static __be32
 nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 					      void	        *resp)
 {
-	struct sockaddr_in	saddr;
-
 	dprintk("lockd: SM_NOTIFY     called\n");
 
 	if (!nlm_privileged_requester(rqstp)) {
@@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 		return rpc_system_err;
 	}
 
-	/* Obtain the host pointer for this NFS server and try to
-	 * reclaim all locks we hold on this server.
-	 */
-	memset(&saddr, 0, sizeof(saddr));
-	saddr.sin_family = AF_INET;
-	saddr.sin_addr.s_addr = argp->addr;
-	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
-
+	nlm_host_rebooted(argp);
 	return rpc_success;
 }
 
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3ca89e2a9381..3688e55901fc 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -16,8 +16,6 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
-
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 
@@ -451,8 +449,6 @@ static __be32
 nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 					      void	        *resp)
 {
-	struct sockaddr_in	saddr;
-
 	dprintk("lockd: SM_NOTIFY     called\n");
 
 	if (!nlm_privileged_requester(rqstp)) {
@@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 		return rpc_system_err;
 	}
 
-	/* Obtain the host pointer for this NFS server and try to
-	 * reclaim all locks we hold on this server.
-	 */
-	memset(&saddr, 0, sizeof(saddr));
-	saddr.sin_family = AF_INET;
-	saddr.sin_addr.s_addr = argp->addr;
-	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
-
+	nlm_host_rebooted(argp);
 	return rpc_success;
 }
 
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 34c2766e27c7..9e4d6aab611b 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -17,7 +17,6 @@
 #include <linux/nfsd/export.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
 #include <linux/module.h>
 #include <linux/mount.h>
 
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 1f226290c67c..0336f2beacde 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -16,7 +16,6 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 
 #define NLMDBG_FACILITY		NLMDBG_XDR
 
@@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
 	if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
 		return 0;
 	argp->state = ntohl(*p++);
-	/* Preserve the address in network byte order */
-	argp->addr = *p++;
+	memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+	p += XDR_QUADLEN(SM_PRIV_SIZE);
 	return xdr_argsize_check(rqstp, p);
 }
 
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 50c493a8ad8e..e1d528653192 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -17,7 +17,6 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 
 #define NLMDBG_FACILITY		NLMDBG_XDR
 
@@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
 	if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
 		return 0;
 	argp->state = ntohl(*p++);
-	/* Preserve the address in network byte order */
-	argp->addr  = *p++;
+	memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+	p += XDR_QUADLEN(SM_PRIV_SIZE);
 	return xdr_argsize_check(rqstp, p);
 }
 
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index f70433816a38..d4946c4c90e2 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
 	return -EINVAL;
 
 got_it:
-	pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page);
+	pos = page_offset(page) + p - (char *)page_address(page);
 	err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize,
 					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
 	if (err)
diff --git a/fs/mpage.c b/fs/mpage.c
index 552b80b3facc..16c3ef37eae3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 				first_hole = page_block;
 			page_block++;
 			block_in_file++;
-			clear_buffer_mapped(map_bh);
 			continue;
 		}
 
@@ -308,7 +307,10 @@ alloc_new:
 		goto alloc_new;
 	}
 
-	if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
+	relative_block = block_in_file - *first_logical_block;
+	nblocks = map_bh->b_size >> blkbits;
+	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
+	    (first_hole != blocks_per_page))
 		bio = mpage_bio_submit(READ, bio);
 	else
 		*last_block_in_bio = blocks[blocks_per_page - 1];
diff --git a/fs/namei.c b/fs/namei.c
index df2d3df4f049..f05bed242422 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -257,7 +257,7 @@ int inode_permission(struct inode *inode, int mask)
 			return -EACCES;
 	}
 
-	if (inode->i_op && inode->i_op->permission)
+	if (inode->i_op->permission)
 		retval = inode->i_op->permission(inode, mask);
 	else
 		retval = generic_permission(inode, mask, NULL);
@@ -432,7 +432,7 @@ static int exec_permission_lite(struct inode *inode)
 {
 	umode_t	mode = inode->i_mode;
 
-	if (inode->i_op && inode->i_op->permission)
+	if (inode->i_op->permission)
 		return -EAGAIN;
 
 	if (current_fsuid() == inode->i_uid)
@@ -908,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 		inode = next.dentry->d_inode;
 		if (!inode)
 			goto out_dput;
-		err = -ENOTDIR; 
-		if (!inode->i_op)
-			goto out_dput;
 
 		if (inode->i_op->follow_link) {
 			err = do_follow_link(&next, nd);
@@ -920,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 			inode = nd->path.dentry->d_inode;
 			if (!inode)
 				break;
-			err = -ENOTDIR; 
-			if (!inode->i_op)
-				break;
 		} else
 			path_to_nameidata(&next, nd);
 		err = -ENOTDIR; 
@@ -961,7 +955,7 @@ last_component:
 			break;
 		inode = next.dentry->d_inode;
 		if ((lookup_flags & LOOKUP_FOLLOW)
-		    && inode && inode->i_op && inode->i_op->follow_link) {
+		    && inode && inode->i_op->follow_link) {
 			err = do_follow_link(&next, nd);
 			if (err)
 				goto return_err;
@@ -973,7 +967,7 @@ last_component:
 			break;
 		if (lookup_flags & LOOKUP_DIRECTORY) {
 			err = -ENOTDIR; 
-			if (!inode->i_op || !inode->i_op->lookup)
+			if (!inode->i_op->lookup)
 				break;
 		}
 		goto return_base;
@@ -1469,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	if (error)
 		return error;
 
-	if (!dir->i_op || !dir->i_op->create)
+	if (!dir->i_op->create)
 		return -EACCES;	/* shouldn't it be ENOSYS? */
 	mode &= S_IALLUGO;
 	mode |= S_IFREG;
@@ -1752,7 +1746,7 @@ do_last:
 	error = -ENOENT;
 	if (!path.dentry->d_inode)
 		goto exit_dput;
-	if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
+	if (path.dentry->d_inode->i_op->follow_link)
 		goto do_link;
 
 	path_to_nameidata(&path, &nd);
@@ -1933,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
 		return -EPERM;
 
-	if (!dir->i_op || !dir->i_op->mknod)
+	if (!dir->i_op->mknod)
 		return -EPERM;
 
 	error = devcgroup_inode_mknod(mode, dev);
@@ -2035,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		return error;
 
-	if (!dir->i_op || !dir->i_op->mkdir)
+	if (!dir->i_op->mkdir)
 		return -EPERM;
 
 	mode &= (S_IRWXUGO|S_ISVTX);
@@ -2126,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (error)
 		return error;
 
-	if (!dir->i_op || !dir->i_op->rmdir)
+	if (!dir->i_op->rmdir)
 		return -EPERM;
 
 	DQUOT_INIT(dir);
@@ -2213,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (error)
 		return error;
 
-	if (!dir->i_op || !dir->i_op->unlink)
+	if (!dir->i_op->unlink)
 		return -EPERM;
 
 	DQUOT_INIT(dir);
@@ -2320,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 	if (error)
 		return error;
 
-	if (!dir->i_op || !dir->i_op->symlink)
+	if (!dir->i_op->symlink)
 		return -EPERM;
 
 	error = security_inode_symlink(dir, dentry, oldname);
@@ -2401,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	 */
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return -EPERM;
-	if (!dir->i_op || !dir->i_op->link)
+	if (!dir->i_op->link)
 		return -EPERM;
 	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
@@ -2608,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (error)
 		return error;
 
-	if (!old_dir->i_op || !old_dir->i_op->rename)
+	if (!old_dir->i_op->rename)
 		return -EPERM;
 
 	DQUOT_INIT(old_dir);
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 335b003dddf9..0af3349de851 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -16,7 +16,6 @@
  *	@opts: an array of &struct option entries controlling parser operations
  *	@optopt: output; will contain the current option
  *	@optarg: output; will contain the value (if one exists)
- *	@flag: output; may be NULL; should point to a long for or'ing flags
  *	@value: output; may be NULL; will be overwritten with the integer value
  *		of the current argument.
  *
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 6d04e050c74e..f54360f50a9c 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl
 {
 	s32		auth_type;
 	u32		object_name_len;
-	compat_caddr_t	object_name;	/* an userspace data, in most cases user name */
+	compat_caddr_t	object_name;	/* a userspace data, in most cases user name */
 };
 
 struct compat_ncp_fs_info_v2 {
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 0184fe9b514c..c903e04aa217 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -76,10 +76,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 
 	ret = set_groups(new, gi);
 	put_group_info(gi);
-	if (!ret)
+	if (ret < 0)
 		goto error;
 
-	if (new->uid)
+	if (new->fsuid)
 		new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
 	else
 		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6d7d8c02c197..c464181b5994 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -53,9 +53,6 @@
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
 
-/* declarations */
-static const struct rpc_call_ops nfs4_cb_null_ops;
-
 /* Index of predefined Linux callback client operations */
 
 enum {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 669461e291ae..9fa60a3ad48c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -946,6 +946,11 @@ encode_op:
 			nfsd4_encode_operation(resp, op);
 			status = op->status;
 		}
+
+		dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
+			args->ops, args->opcnt, resp->opcnt, op->opnum,
+			be32_to_cpu(status));
+
 		if (cstate->replay_owner) {
 			nfs4_put_stateowner(cstate->replay_owner);
 			cstate->replay_owner = NULL;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 0f9d6efaa62b..74f7b67567fd 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
 
 	md5_to_hex(dname, cksum.data);
 
-	kfree(cksum.data);
 	status = nfs_ok;
 out:
+	kfree(cksum.data);
 	crypto_free_hash(desc.tfm);
 out_no_tfm:
 	return status;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 13e0e074dbb8..88db7d3ec120 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2416,6 +2416,26 @@ out:
 #define LOCK_HASH_SIZE             (1 << LOCK_HASH_BITS)
 #define LOCK_HASH_MASK             (LOCK_HASH_SIZE - 1)
 
+static inline u64
+end_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	end = start + len;
+	return end >= start ? end: NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	BUG_ON(!len);
+	end = start + len;
+	return end > start ? end - 1: NFS4_MAX_UINT64;
+}
+
 #define lockownerid_hashval(id) \
         ((id) & LOCK_HASH_MASK)
 
@@ -2435,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
 static struct nfs4_stateid *
 find_stateid(stateid_t *stid, int flags)
 {
-	struct nfs4_stateid *local = NULL;
+	struct nfs4_stateid *local;
 	u32 st_id = stid->si_stateownerid;
 	u32 f_id = stid->si_fileid;
 	unsigned int hashval;
 
 	dprintk("NFSD: find_stateid flags 0x%x\n",flags);
-	if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) {
+	if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
 		hashval = stateid_hashval(st_id, f_id);
 		list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
 			if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2449,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags)
 				return local;
 		}
 	} 
-	if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) {
+
+	if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
 		hashval = stateid_hashval(st_id, f_id);
 		list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
 			if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2518,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 		deny->ld_clientid.cl_id = 0;
 	}
 	deny->ld_start = fl->fl_start;
-	deny->ld_length = ~(u64)0;
-	if (fl->fl_end != ~(u64)0)
+	deny->ld_length = NFS4_MAX_UINT64;
+	if (fl->fl_end != NFS4_MAX_UINT64)
 		deny->ld_length = fl->fl_end - fl->fl_start + 1;        
 	deny->ld_type = NFS4_READ_LT;
 	if (fl->fl_type != F_RDLCK)
@@ -2616,7 +2637,7 @@ out:
 static int
 check_lock_length(u64 offset, u64 length)
 {
-	return ((length == 0)  || ((length != ~(u64)0) &&
+	return ((length == 0)  || ((length != NFS4_MAX_UINT64) &&
 	     LOFF_OVERFLOW(offset, length)));
 }
 
@@ -2736,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 
 	file_lock.fl_start = lock->lk_offset;
-	if ((lock->lk_length == ~(u64)0) || 
-			LOFF_OVERFLOW(lock->lk_offset, lock->lk_length))
-		file_lock.fl_end = ~(u64)0;
-	else
-		file_lock.fl_end = lock->lk_offset + lock->lk_length - 1;
+	file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
 	nfs4_transform_lock_offset(&file_lock);
 
 	/*
@@ -2781,6 +2798,25 @@ out:
 }
 
 /*
+ * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
+ * so we do a temporary open here just to get an open file to pass to
+ * vfs_test_lock.  (Arguably perhaps test_lock should be done with an
+ * inode operation.)
+ */
+static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
+{
+	struct file *file;
+	int err;
+
+	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	if (err)
+		return err;
+	err = vfs_test_lock(file, lock);
+	nfsd_close(file);
+	return err;
+}
+
+/*
  * LOCKT operation
  */
 __be32
@@ -2788,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_lockt *lockt)
 {
 	struct inode *inode;
-	struct file file;
 	struct file_lock file_lock;
 	int error;
 	__be32 status;
@@ -2839,23 +2874,12 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 
 	file_lock.fl_start = lockt->lt_offset;
-	if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length))
-		file_lock.fl_end = ~(u64)0;
-	else
-		file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1;
+	file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
 
 	nfs4_transform_lock_offset(&file_lock);
 
-	/* vfs_test_lock uses the struct file _only_ to resolve the inode.
-	 * since LOCKT doesn't require an OPEN, and therefore a struct
-	 * file may not exist, pass vfs_test_lock a struct file with
-	 * only the dentry:inode set.
-	 */
-	memset(&file, 0, sizeof (struct file));
-	file.f_path.dentry = cstate->current_fh.fh_dentry;
-
 	status = nfs_ok;
-	error = vfs_test_lock(&file, &file_lock);
+	error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
 	if (error) {
 		status = nfserrno(error);
 		goto out;
@@ -2906,10 +2930,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 	file_lock.fl_start = locku->lu_offset;
 
-	if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length))
-		file_lock.fl_end = ~(u64)0;
-	else
-		file_lock.fl_end = locku->lu_offset + locku->lu_length - 1;
+	file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length);
 	nfs4_transform_lock_offset(&file_lock);
 
 	/*
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index afcdf4b76843..f65953be39c0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1,6 +1,4 @@
 /*
- *  fs/nfs/nfs4xdr.c
- *
  *  Server-side XDR for NFSv4
  *
  *  Copyright (c) 2002 The Regents of the University of Michigan.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77d7b8c531a6..3d93b2064ce5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size);
 static ssize_t write_getfd(struct file *file, char *buf, size_t size);
 static ssize_t write_getfs(struct file *file, char *buf, size_t size);
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
 static ssize_t write_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_versions(struct file *file, char *buf, size_t size);
@@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size);
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size);
-
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Svc] = write_svc,
 	[NFSD_Add] = write_add,
@@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Getfd] = write_getfd,
 	[NFSD_Getfs] = write_getfs,
 	[NFSD_Fh] = write_filehandle,
-	[NFSD_FO_UnlockIP] = failover_unlock_ip,
-	[NFSD_FO_UnlockFS] = failover_unlock_fs,
+	[NFSD_FO_UnlockIP] = write_unlock_ip,
+	[NFSD_FO_UnlockFS] = write_unlock_fs,
 	[NFSD_Threads] = write_threads,
 	[NFSD_Pool_Threads] = write_pool_threads,
 	[NFSD_Versions] = write_versions,
@@ -176,10 +175,24 @@ static const struct file_operations exports_operations = {
 /*----------------------------------------------------------------------------*/
 /*
  * payload - write methods
- * If the method has a response, the response should be put in buf,
- * and the length returned.  Otherwise return 0 or and -error.
  */
 
+/**
+ * write_svc - Start kernel's NFSD server
+ *
+ * Deprecated.  /proc/fs/nfsd/threads is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_svc
+ *				svc_port:	port number of this
+ *						server's listener
+ *				svc_nthreads:	number of threads to start
+ *			size:	size in bytes of passed in nfsctl_svc
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_svc(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_svc *data;
@@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size)
 	return nfsd_svc(data->svc_port, data->svc_nthreads);
 }
 
+/**
+ * write_add - Add or modify client entry in auth unix cache
+ *
+ * Deprecated.  /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_client
+ *				cl_ident:	'\0'-terminated C string
+ *						containing domain name
+ *						of client
+ *				cl_naddr:	no. of items in cl_addrlist
+ *				cl_addrlist:	array of client addresses
+ *				cl_fhkeytype:	ignored
+ *				cl_fhkeylen:	ignored
+ *				cl_fhkey:	ignored
+ *			size:	size in bytes of passed in nfsctl_client
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
 static ssize_t write_add(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_client *data;
@@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size)
 	return exp_addclient(data);
 }
 
+/**
+ * write_del - Remove client from auth unix cache
+ *
+ * Deprecated.  /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_client
+ *				cl_ident:	'\0'-terminated C string
+ *						containing domain name
+ *						of client
+ *				cl_naddr:	ignored
+ *				cl_addrlist:	ignored
+ *				cl_fhkeytype:	ignored
+ *				cl_fhkeylen:	ignored
+ *				cl_fhkey:	ignored
+ *			size:	size in bytes of passed in nfsctl_client
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
 static ssize_t write_del(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_client *data;
@@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size)
 	return exp_delclient(data);
 }
 
+/**
+ * write_export - Export part or all of a local file system
+ *
+ * Deprecated.  /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_export
+ *				ex_client:	'\0'-terminated C string
+ *						containing domain name
+ *						of client allowed to access
+ *						this export
+ *				ex_path:	'\0'-terminated C string
+ *						containing pathname of
+ *						directory in local file system
+ *				ex_dev:		fsid to use for this export
+ *				ex_ino:		ignored
+ *				ex_flags:	export flags for this export
+ *				ex_anon_uid:	UID to use for anonymous
+ *						requests
+ *				ex_anon_gid:	GID to use for anonymous
+ *						requests
+ *			size:	size in bytes of passed in nfsctl_export
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_export(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_export *data;
@@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size)
 	return exp_export(data);
 }
 
+/**
+ * write_unexport - Unexport a previously exported file system
+ *
+ * Deprecated.  /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_export
+ *				ex_client:	'\0'-terminated C string
+ *						containing domain name
+ *						of client no longer allowed
+ *						to access this export
+ *				ex_path:	'\0'-terminated C string
+ *						containing pathname of
+ *						directory in local file system
+ *				ex_dev:		ignored
+ *				ex_ino:		ignored
+ *				ex_flags:	ignored
+ *				ex_anon_uid:	ignored
+ *				ex_anon_gid:	ignored
+ *			size:	size in bytes of passed in nfsctl_export
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_unexport(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_export *data;
@@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size)
 	return exp_unexport(data);
 }
 
+/**
+ * write_getfs - Get a variable-length NFS file handle by path
+ *
+ * Deprecated.  /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_fsparm
+ *				gd_addr:	socket address of client
+ *				gd_path:	'\0'-terminated C string
+ *						containing pathname of
+ *						directory in local file system
+ *				gd_maxlen:	maximum size of returned file
+ *						handle
+ *			size:	size in bytes of passed in nfsctl_fsparm
+ * Output:
+ *	On success:	passed-in buffer filled with a knfsd_fh structure
+ *			(a variable-length raw NFS file handle);
+ *			return code is the size in bytes of the file handle
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
 static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_fsparm *data;
@@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 	return err;
 }
 
+/**
+ * write_getfd - Get a fixed-length NFS file handle by path (used by mountd)
+ *
+ * Deprecated.  /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_fdparm
+ *				gd_addr:	socket address of client
+ *				gd_path:	'\0'-terminated C string
+ *						containing pathname of
+ *						directory in local file system
+ *				gd_version:	fdparm structure version
+ *			size:	size in bytes of passed in nfsctl_fdparm
+ * Output:
+ *	On success:	passed-in buffer filled with nfsctl_res
+ *			(a fixed-length raw NFS file handle);
+ *			return code is the size in bytes of the file handle
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
 static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_fdparm *data;
@@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 	return err;
 }
 
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
+/**
+ * write_unlock_ip - Release all locks used by a client
+ *
+ * Experimental.
+ *
+ * Input:
+ *			buf:	'\n'-terminated C string containing a
+ *				presentation format IPv4 address
+ *			size:	length of C string in @buf
+ * Output:
+ *	On success:	returns zero if all specified locks were released;
+ *			returns one if one or more locks were not released
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in
+ */
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 {
 	struct sockaddr_in sin = {
 		.sin_family	= AF_INET,
@@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
 	return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
 }
 
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
+/**
+ * write_unlock_fs - Release all locks on a local file system
+ *
+ * Experimental.
+ *
+ * Input:
+ *			buf:	'\n'-terminated C string containing the
+ *				absolute pathname of a local file system
+ *			size:	length of C string in @buf
+ * Output:
+ *	On success:	returns zero if all specified locks were released;
+ *			returns one if one or more locks were not released
+ *	On error:	return code is negative errno value
+ */
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 {
 	struct path path;
 	char *fo_path;
@@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
 	if (error)
 		return error;
 
+	/*
+	 * XXX: Needs better sanity checking.  Otherwise we could end up
+	 * releasing locks on the wrong file system.
+	 *
+	 * For example:
+	 * 1.  Does the path refer to a directory?
+	 * 2.  Is that directory a mount point, or
+	 * 3.  Is that directory the root of an exported file system?
+	 */
 	error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
 
 	path_put(&path);
 	return error;
 }
 
+/**
+ * write_filehandle - Get a variable-length NFS file handle by path
+ *
+ * On input, the buffer contains a '\n'-terminated C string comprised of
+ * three alphanumeric words separated by whitespace.  The string may
+ * contain escape sequences.
+ *
+ * Input:
+ *			buf:
+ *				domain:		client domain name
+ *				path:		export pathname
+ *				maxsize:	numeric maximum size of
+ *						@buf
+ *			size:	length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing a ASCII hex text version
+ *			of the NFS file handle;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 {
-	/* request is:
-	 *   domain path maxsize
-	 * response is
-	 *   filehandle
-	 *
-	 * qword quoting is used, so filehandle will be \x....
-	 */
 	char *dname, *path;
 	int uninitialized_var(maxsize);
 	char *mesg = buf;
@@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 
 	dname = mesg;
 	len = qword_get(&mesg, dname, size);
-	if (len <= 0) return -EINVAL;
+	if (len <= 0)
+		return -EINVAL;
 	
 	path = dname+len+1;
 	len = qword_get(&mesg, path, size);
-	if (len <= 0) return -EINVAL;
+	if (len <= 0)
+		return -EINVAL;
 
 	len = get_int(&mesg, &maxsize);
 	if (len)
@@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 	if (len)
 		return len;
 	
-	mesg = buf; len = SIMPLE_TRANSACTION_LIMIT;
+	mesg = buf;
+	len = SIMPLE_TRANSACTION_LIMIT;
 	qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
 	mesg[-1] = '\n';
 	return mesg - buf;	
 }
 
+/**
+ * write_threads - Start NFSD, or report the current number of running threads
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string numeric value representing the number of
+ *			running NFSD threads;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the
+ *					number of NFSD threads to start
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service is started;
+ *			passed-in buffer filled with '\n'-terminated C
+ *			string numeric value representing the number of
+ *			running NFSD threads;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_threads(struct file *file, char *buf, size_t size)
 {
-	/* if size > 0, look for a number of threads and call nfsd_svc
-	 * then write out number of threads as reply
-	 */
 	char *mesg = buf;
 	int rv;
 	if (size > 0) {
@@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 		rv = get_int(&mesg, &newthreads);
 		if (rv)
 			return rv;
-		if (newthreads <0)
+		if (newthreads < 0)
 			return -EINVAL;
-		rv = nfsd_svc(2049, newthreads);
+		rv = nfsd_svc(NFS_PORT, newthreads);
 		if (rv)
 			return rv;
 	}
@@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_pool_threads - Set or report the current number of threads per pool
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing whitespace-
+ * 					separated unsigned integer values
+ *					representing the number of NFSD
+ *					threads to start in each pool
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing integer values representing the
+ *			number of NFSD threads in each pool;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
 {
 	/* if size > 0, look for an array of number of threads per node
@@ -517,10 +780,6 @@ out_free:
 
 static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
-	/*
-	 * Format:
-	 *   [-/+]vers [-/+]vers ...
-	 */
 	char *mesg = buf;
 	char *vers, sign;
 	int len, num;
@@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 	return len;
 }
 
+/**
+ * write_versions - Set or report the available NFS protocol versions
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing positive or negative integer
+ *			values representing the current status of each
+ *			protocol version;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing whitespace-
+ * 					separated positive or negative
+ * 					integer values representing NFS
+ * 					protocol versions to enable ("+n")
+ * 					or disable ("-n")
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	status of zero or more protocol versions has
+ *			been updated; passed-in buffer filled with
+ *			'\n'-terminated C string containing positive
+ *			or negative integer values representing the
+ *			current status of each protocol version;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_versions(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 	return -EINVAL;
 }
 
+/**
+ * write_ports - Pass a socket file descriptor or transport name to listen on
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with a '\n'-terminated C
+ *			string containing a whitespace-separated list of
+ *			named NFSD listeners;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing a bound
+ *					but unconnected socket that is to be
+ *					used as an NFSD listener
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service is started;
+ *			passed-in buffer filled with a '\n'-terminated C
+ *			string containing a unique alphanumeric name of
+ *			the listener;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a "-" followed
+ *					by an integer value representing a
+ *					previously passed in socket file
+ *					descriptor
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service no longer listens on that socket;
+ *			passed-in buffer filled with a '\n'-terminated C
+ *			string containing a unique name of the listener;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a transport
+ *					name and an unsigned integer value
+ *					representing the port to listen on,
+ *					separated by whitespace
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	returns zero; NFS service is started
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a "-" followed
+ *					by a transport name and an unsigned
+ *					integer value representing the port
+ *					to listen on, separated by whitespace
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	returns zero; NFS service no longer listens
+ *			on that transport
+ *	On error:	return code is a negative errno value
+ */
 static ssize_t write_ports(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 
 int nfsd_max_blksize;
 
+/**
+ * write_maxblksize - Set or report the current NFS blksize
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing an unsigned
+ * 					integer value representing the new
+ * 					NFS blksize
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing numeric value of the current NFS blksize
+ *			setting;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 {
 	char *mesg = buf;
@@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_leasetime - Set or report the current NFSv4 lease time
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the new
+ *					NFSv4 lease expiry time
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing unsigned integer value of the
+ *			current lease expiry time;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_recoverydir - Set or report the pathname of the recovery directory
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing the pathname
+ *					of the directory on a local file
+ *					system containing permanent NFSv4
+ *					recovery data
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing the current recovery pathname setting;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index f0da7d9c3a92..9f1ca17293d3 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -258,14 +258,32 @@ out:
 	return error;
 }
 
-/*
- * Perform sanity checks on the dentry in a client's file handle.
+/**
+ * fh_verify - filehandle lookup and access checking
+ * @rqstp: pointer to current rpc request
+ * @fhp: filehandle to be verified
+ * @type: expected type of object pointed to by filehandle
+ * @access: type of access needed to object
+ *
+ * Look up a dentry from the on-the-wire filehandle, check the client's
+ * access to the export, and set the current task's credentials.
+ *
+ * Regardless of success or failure of fh_verify(), fh_put() should be
+ * called on @fhp when the caller is finished with the filehandle.
  *
- * Note that the file handle dentry may need to be freed even after
- * an error return.
+ * fh_verify() may be called multiple times on a given filehandle, for
+ * example, when processing an NFSv4 compound.  The first call will look
+ * up a dentry using the on-the-wire filehandle.  Subsequent calls will
+ * skip the lookup and just perform the other checks and possibly change
+ * the current task's credentials.
  *
- * This is only called at the start of an nfsproc call, so fhp points to
- * a svc_fh which is all 0 except for the over-the-wire file handle.
+ * @type specifies the type of object expected using one of the S_IF*
+ * constants defined in include/linux/stat.h.  The caller may use zero
+ * to indicate that it doesn't care, or a negative integer to indicate
+ * that it expects something not of the given type.
+ *
+ * @access is formed from the NFSD_MAY_* constants defined in
+ * include/linux/nfsd/nfsd.h.
  */
 __be32
 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
@@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 				goto retry;
 			break;
 		}
+	} else if (exp->ex_flags & NFSEXP_FSID) {
+		fsid_type = FSID_NUM;
 	} else if (exp->ex_uuid) {
 		if (fhp->fh_maxsize >= 64) {
 			if (root_export)
@@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 			else
 				fsid_type = FSID_UUID4_INUM;
 		}
-	} else if (exp->ex_flags & NFSEXP_FSID)
-		fsid_type = FSID_NUM;
-	else if (!old_valid_dev(ex_dev))
+	} else if (!old_valid_dev(ex_dev))
 		/* for newer device numbers, we must use a newer fsid format */
 		fsid_type = FSID_ENCODE_DEV;
 	else
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cffeca7acef..6f7f26351227 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -622,6 +622,7 @@ nfserrno (int errno)
 		{ nfserr_badname, -ESRCH },
 		{ nfserr_io, -ETXTBSY },
 		{ nfserr_notsupp, -EOPNOTSUPP },
+		{ nfserr_toosmall, -ETOOSMALL },
 	};
 	int	i;
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d1c5f787b365..6e50aaa56ca2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -764,7 +764,6 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
 
 	return err;
 }
-	
 
 static int
 nfsd_sync(struct file *filp)
@@ -1211,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	dirp = dentry->d_inode;
 
 	err = nfserr_notdir;
-	if(!dirp->i_op || !dirp->i_op->lookup)
+	if (!dirp->i_op->lookup)
 		goto out;
 	/*
 	 * Check whether the response file handle has been verified yet.
@@ -1347,7 +1346,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	/* Get all the sanity checks out of the way before
 	 * we lock the parent. */
 	err = nfserr_notdir;
-	if(!dirp->i_op || !dirp->i_op->lookup)
+	if (!dirp->i_op->lookup)
 		goto out;
 	fh_lock_nested(fhp, I_MUTEX_PARENT);
 
@@ -1482,7 +1481,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
 	inode = dentry->d_inode;
 
 	err = nfserr_inval;
-	if (!inode->i_op || !inode->i_op->readlink)
+	if (!inode->i_op->readlink)
 		goto out;
 
 	touch_atime(fhp->fh_export->ex_path.mnt, dentry);
@@ -2162,7 +2161,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
 	size_t size;
 	int error;
 
-	if (!IS_POSIXACL(inode) || !inode->i_op ||
+	if (!IS_POSIXACL(inode) ||
 	    !inode->i_op->setxattr || !inode->i_op->removexattr)
 		return -EOPNOTSUPP;
 	switch(type) {
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 400f8064a548..81b8644b0136 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -704,7 +704,7 @@ fput_and_out:
 	return ret;
 }
 
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd)
 {
 	struct file *filp;
 	struct inotify_device *dev;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index e9da092e2772..86bef156cf0a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 		ni->allocated_size = sle64_to_cpu(
 				a->data.non_resident.allocated_size);
 	}
-	/* Setup the operations for this attribute inode. */
-	vi->i_op = NULL;
-	vi->i_fop = NULL;
 	if (NInoMstProtected(ni))
 		vi->i_mapping->a_ops = &ntfs_mst_aops;
 	else
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 589dcdfdfe3c..01596079dd63 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 ocfs2-objs := \
 	alloc.o 		\
 	aops.o 			\
+	blockcheck.o		\
 	buffer_head_io.o	\
 	dcache.o 		\
 	dir.o 			\
@@ -35,8 +36,14 @@ ocfs2-objs := \
 	sysfile.o 		\
 	uptodate.o		\
 	ver.o			\
+	quota_local.o		\
+	quota_global.o		\
 	xattr.o
 
+ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
+ocfs2-objs += acl.o
+endif
+
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 000000000000..12dfb44c22e5
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,479 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copy from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from xattr value to acl struct.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(struct posix_acl_entry))
+		return ERR_PTR(-EINVAL);
+
+	count = size / sizeof(struct posix_acl_entry);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n = 0; n < count; n++) {
+		struct ocfs2_acl_entry *entry =
+			(struct ocfs2_acl_entry *)value;
+
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id);
+		value += sizeof(struct posix_acl_entry);
+
+	}
+	return acl;
+}
+
+/*
+ * Convert acl struct to xattr value.
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+	struct ocfs2_acl_entry *entry = NULL;
+	char *ocfs2_acl;
+	size_t n;
+
+	*size = acl->a_count * sizeof(struct posix_acl_entry);
+
+	ocfs2_acl = kmalloc(*size, GFP_NOFS);
+	if (!ocfs2_acl)
+		return ERR_PTR(-ENOMEM);
+
+	entry = (struct ocfs2_acl_entry *)ocfs2_acl;
+	for (n = 0; n < acl->a_count; n++, entry++) {
+		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+		entry->e_id   = cpu_to_le32(acl->a_entries[n].e_id);
+	}
+	return ocfs2_acl;
+}
+
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+					      int type,
+					      struct buffer_head *di_bh)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+						"", value, retval);
+	}
+
+	if (retval > 0)
+		acl = ocfs2_acl_from_xattr(value, retval);
+	else if (retval == -ENODATA || retval == 0)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+
+	kfree(value);
+
+	return acl;
+}
+
+
+/*
+ * Get posix acl.
+ */
+static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct posix_acl *acl;
+	int ret;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		acl = ERR_PTR(ret);
+		return acl;
+	}
+
+	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+
+	ocfs2_inode_unlock(inode, 0);
+
+	brelse(di_bh);
+
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ */
+static int ocfs2_set_acl(handle_t *handle,
+			 struct inode *inode,
+			 struct buffer_head *di_bh,
+			 int type,
+			 struct posix_acl *acl,
+			 struct ocfs2_alloc_context *meta_ac,
+			 struct ocfs2_alloc_context *data_ac)
+{
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			ret = posix_acl_equiv_mode(acl, &mode);
+			if (ret < 0)
+				return ret;
+			else {
+				inode->i_mode = mode;
+				if (ret == 0)
+					acl = NULL;
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		value = ocfs2_acl_to_xattr(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	if (handle)
+		ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+					     "", value, size, 0,
+					     meta_ac, data_ac);
+	else
+		ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+
+	kfree(value);
+
+	return ret;
+}
+
+int ocfs2_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int ret = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return ret;
+	}
+
+	return -EAGAIN;
+}
+
+int ocfs2_acl_chmod(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl, *clone;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+	ret = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!ret)
+		ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+				    clone, NULL, NULL);
+	posix_acl_release(clone);
+	return ret;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ */
+int ocfs2_init_acl(handle_t *handle,
+		   struct inode *inode,
+		   struct inode *dir,
+		   struct buffer_head *di_bh,
+		   struct buffer_head *dir_bh,
+		   struct ocfs2_alloc_context *meta_ac,
+		   struct ocfs2_alloc_context *data_ac)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl = NULL;
+	int ret = 0;
+
+	if (!S_ISLNK(inode->i_mode)) {
+		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+			acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+						   dir_bh);
+			if (IS_ERR(acl))
+				return PTR_ERR(acl);
+		}
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+		struct posix_acl *clone;
+		mode_t mode;
+
+		if (S_ISDIR(inode->i_mode)) {
+			ret = ocfs2_set_acl(handle, inode, di_bh,
+					    ACL_TYPE_DEFAULT, acl,
+					    meta_ac, data_ac);
+			if (ret)
+				goto cleanup;
+		}
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		ret = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		mode = inode->i_mode;
+		ret = posix_acl_create_masq(clone, &mode);
+		if (ret >= 0) {
+			inode->i_mode = mode;
+			if (ret > 0) {
+				ret = ocfs2_set_acl(handle, inode,
+						    di_bh, ACL_TYPE_ACCESS,
+						    clone, meta_ac, data_ac);
+			}
+		}
+		posix_acl_release(clone);
+	}
+cleanup:
+	posix_acl_release(acl);
+	return ret;
+}
+
+static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+					  char *list,
+					  size_t list_len,
+					  const char *name,
+					  size_t name_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+	return size;
+}
+
+static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+					   char *list,
+					   size_t list_len,
+					   const char *name,
+					   size_t name_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+	return size;
+}
+
+static int ocfs2_xattr_get_acl(struct inode *inode,
+			       int type,
+			       void *buffer,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int ret;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	acl = ocfs2_get_acl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	ret = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+static int ocfs2_xattr_get_acl_access(struct inode *inode,
+				      const char *name,
+				      void *buffer,
+				      size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int ocfs2_xattr_get_acl_default(struct inode *inode,
+				       const char *name,
+				       void *buffer,
+				       size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int ocfs2_xattr_set_acl(struct inode *inode,
+			       int type,
+			       const void *value,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int ret = 0;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			ret = posix_acl_valid(acl);
+			if (ret)
+				goto cleanup;
+		}
+	} else
+		acl = NULL;
+
+	ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+
+cleanup:
+	posix_acl_release(acl);
+	return ret;
+}
+
+static int ocfs2_xattr_set_acl_access(struct inode *inode,
+				      const char *name,
+				      const void *value,
+				      size_t size,
+				      int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int ocfs2_xattr_set_acl_default(struct inode *inode,
+				       const char *name,
+				       const void *value,
+				       size_t size,
+				       int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+struct xattr_handler ocfs2_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.list	= ocfs2_xattr_list_acl_access,
+	.get	= ocfs2_xattr_get_acl_access,
+	.set	= ocfs2_xattr_set_acl_access,
+};
+
+struct xattr_handler ocfs2_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.list	= ocfs2_xattr_list_acl_default,
+	.get	= ocfs2_xattr_get_acl_default,
+	.set	= ocfs2_xattr_set_acl_default,
+};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 000000000000..8f6389ed4da5
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+
+#include <linux/posix_acl_xattr.h>
+
+struct ocfs2_acl_entry {
+	__le16 e_tag;
+	__le16 e_perm;
+	__le32 e_id;
+};
+
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+
+extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_acl_chmod(struct inode *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+			  struct buffer_head *, struct buffer_head *,
+			  struct ocfs2_alloc_context *,
+			  struct ocfs2_alloc_context *);
+
+#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#define ocfs2_check_acl NULL
+static inline int ocfs2_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+static inline int ocfs2_init_acl(handle_t *handle,
+				 struct inode *inode,
+				 struct inode *dir,
+				 struct buffer_head *di_bh,
+				 struct buffer_head *dir_bh,
+				 struct ocfs2_alloc_context *meta_ac,
+				 struct ocfs2_alloc_context *data_ac)
+{
+	return 0;
+}
+
+#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0cc2deb9394c..d861096c9d81 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -36,6 +37,7 @@
 
 #include "alloc.h"
 #include "aops.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -46,6 +48,7 @@
 #include "file.h"
 #include "super.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
 static int ocfs2_dinode_sanity_check(struct inode *inode,
 				     struct ocfs2_extent_tree *et)
 {
-	int ret = 0;
-	struct ocfs2_dinode *di;
+	struct ocfs2_dinode *di = et->et_object;
 
 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
 
-	di = et->et_object;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		ret = -EIO;
-		ocfs2_error(inode->i_sb,
-			"Inode %llu has invalid path root",
-			(unsigned long long)OCFS2_I(inode)->ip_blkno);
-	}
-
-	return ret;
+	return 0;
 }
 
 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
 
 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_xattr_value_root *xv = et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	et->et_root_el = &xv->xr_list;
+	et->et_root_el = &vb->vb_xv->xr_list;
 }
 
 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					      u64 blkno)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	xv->xr_last_eb_blk = cpu_to_le64(blkno);
+	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
 }
 
 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *) et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	return le64_to_cpu(xv->xr_last_eb_blk);
+	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
 }
 
 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
 					      struct ocfs2_extent_tree *et,
 					      u32 clusters)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	le32_add_cpu(&xv->xr_clusters, clusters);
+	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
 }
 
 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 				     struct inode *inode,
 				     struct buffer_head *bh,
+				     ocfs2_journal_access_func access,
 				     void *obj,
 				     struct ocfs2_extent_tree_operations *ops)
 {
 	et->et_ops = ops;
 	et->et_root_bh = bh;
+	et->et_root_journal_access = access;
 	if (!obj)
 		obj = (void *)bh->b_data;
 	et->et_object = obj;
@@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
 				   struct inode *inode,
 				   struct buffer_head *bh)
 {
-	__ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
+				 NULL, &ocfs2_dinode_et_ops);
 }
 
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 				       struct inode *inode,
 				       struct buffer_head *bh)
 {
-	__ocfs2_init_extent_tree(et, inode, bh, NULL,
-				 &ocfs2_xattr_tree_et_ops);
+	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
+				 NULL, &ocfs2_xattr_tree_et_ops);
 }
 
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 					struct inode *inode,
-					struct buffer_head *bh,
-					struct ocfs2_xattr_value_root *xv)
+					struct ocfs2_xattr_value_buf *vb)
 {
-	__ocfs2_init_extent_tree(et, inode, bh, xv,
+	__ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
 				 &ocfs2_xattr_value_et_ops);
 }
 
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
 	et->et_ops->eo_update_clusters(inode, et, clusters);
 }
 
+static inline int ocfs2_et_root_journal_access(handle_t *handle,
+					       struct inode *inode,
+					       struct ocfs2_extent_tree *et,
+					       int type)
+{
+	return et->et_root_journal_access(handle, inode, et->et_root_bh,
+					  type);
+}
+
 static inline int ocfs2_et_insert_check(struct inode *inode,
 					struct ocfs2_extent_tree *et,
 					struct ocfs2_extent_rec *rec)
@@ -402,12 +405,14 @@ struct ocfs2_path_item {
 #define OCFS2_MAX_PATH_DEPTH	5
 
 struct ocfs2_path {
-	int			p_tree_depth;
-	struct ocfs2_path_item	p_node[OCFS2_MAX_PATH_DEPTH];
+	int				p_tree_depth;
+	ocfs2_journal_access_func	p_root_access;
+	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
 };
 
 #define path_root_bh(_path) ((_path)->p_node[0].bh)
 #define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
 	 */
 	if (keep_root)
 		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+	else
+		path_root_access(path) = NULL;
 
 	path->p_tree_depth = depth;
 }
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
 
 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
 	BUG_ON(path_root_el(dest) != path_root_el(src));
+	BUG_ON(path_root_access(dest) != path_root_access(src));
 
 	ocfs2_reinit_path(dest, 1);
 
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
 	int i;
 
 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
+	BUG_ON(path_root_access(dest) != path_root_access(src));
 
 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
 		brelse(dest->p_node[i].bh);
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
 }
 
 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
-					 struct ocfs2_extent_list *root_el)
+					 struct ocfs2_extent_list *root_el,
+					 ocfs2_journal_access_func access)
 {
 	struct ocfs2_path *path;
 
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
 		get_bh(root_bh);
 		path_root_bh(path) = root_bh;
 		path_root_el(path) = root_el;
+		path_root_access(path) = access;
 	}
 
 	return path;
 }
 
+static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+{
+	return ocfs2_new_path(path_root_bh(path), path_root_el(path),
+			      path_root_access(path));
+}
+
+static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+{
+	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
+			      et->et_root_journal_access);
+}
+
+/*
+ * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
+ * otherwise it's the root_access function.
+ *
+ * I don't like the way this function's name looks next to
+ * ocfs2_journal_access_path(), but I don't have a better one.
+ */
+static int ocfs2_path_bh_journal_access(handle_t *handle,
+					struct inode *inode,
+					struct ocfs2_path *path,
+					int idx)
+{
+	ocfs2_journal_access_func access = path_root_access(path);
+
+	if (!access)
+		access = ocfs2_journal_access;
+
+	if (idx)
+		access = ocfs2_journal_access_eb;
+
+	return access(handle, inode, path->p_node[idx].bh,
+		      OCFS2_JOURNAL_ACCESS_WRITE);
+}
+
 /*
  * Convenience function to journal all components in a path.
  */
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
 		goto out;
 
 	for(i = 0; i < path_num_items(path); i++) {
-		ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt {
 	int			c_split_covers_rec;
 };
 
+static int ocfs2_validate_extent_block(struct super_block *sb,
+				       struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_extent_block *eb =
+		(struct ocfs2_extent_block *)bh->b_data;
+
+	mlog(0, "Validating extent block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
+	if (rc) {
+		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return rc;
+	}
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has bad signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    eb->h_signature);
+		return -EINVAL;
+	}
+
+	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has an invalid h_blkno "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(eb->h_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has an invalid "
+			    "h_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(eb->h_fs_generation));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+			    struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(inode, eb_blkno, &tmp,
+			      ocfs2_validate_extent_block);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
+
 /*
  * How many free extents have we got before we need more meta data?
  */
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 
 	if (last_eb_blk) {
-		retval = ocfs2_read_block(inode, last_eb_blk,
-					  &eb_bh);
+		retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
 		if (retval < 0) {
 			mlog_errno(retval);
 			goto bail;
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
 			}
 			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
 
-			status = ocfs2_journal_access(handle, inode, bhs[i],
-						      OCFS2_JOURNAL_ACCESS_CREATE);
+			status = ocfs2_journal_access_eb(handle, inode, bhs[i],
+							 OCFS2_JOURNAL_ACCESS_CREATE);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	for(i = 0; i < new_blocks; i++) {
 		bh = new_eb_bhs[i];
 		eb = (struct ocfs2_extent_block *) bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			status = -EIO;
-			goto bail;
-		}
+		/* ocfs2_create_new_meta_bhs() should create it right! */
+		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
 		eb_el = &eb->h_list;
 
-		status = ocfs2_journal_access(handle, inode, bh,
-					      OCFS2_JOURNAL_ACCESS_CREATE);
+		status = ocfs2_journal_access_eb(handle, inode, bh,
+						 OCFS2_JOURNAL_ACCESS_CREATE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	 * journal_dirty erroring as it won't unless we've aborted the
 	 * handle (in which case we would never be here) so reserving
 	 * the write with journal_access is all we need to do. */
-	status = ocfs2_journal_access(handle, inode, *last_eb_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
-	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_et_root_journal_access(handle, inode, et,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	if (eb_bh) {
-		status = ocfs2_journal_access(handle, inode, eb_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_eb(handle, inode, eb_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	}
 
 	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
-	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-		status = -EIO;
-		goto bail;
-	}
+	/* ocfs2_create_new_meta_bhs() should create it right! */
+	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
 
 	eb_el = &eb->h_list;
 	root_el = et->et_root_el;
 
-	status = ocfs2_journal_access(handle, inode, new_eb_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_et_root_journal_access(handle, inode, et,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 		brelse(bh);
 		bh = NULL;
 
-		status = ocfs2_read_block(inode, blkno, &bh);
+		status = ocfs2_read_extent_block(inode, blkno, &bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 
 		eb = (struct ocfs2_extent_block *) bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			status = -EIO;
-			goto bail;
-		}
 		el = &eb->h_list;
 
 		if (le16_to_cpu(el->l_next_free_rec) <
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode,
 
 		brelse(bh);
 		bh = NULL;
-		ret = ocfs2_read_block(inode, blkno, &bh);
+		ret = ocfs2_read_extent_block(inode, blkno, &bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode,
 
 		eb = (struct ocfs2_extent_block *) bh->b_data;
 		el = &eb->h_list;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			ret = -EIO;
-			goto out;
-		}
 
 		if (le16_to_cpu(el->l_next_free_rec) >
 		    le16_to_cpu(el->l_count)) {
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
 	root_bh = left_path->p_node[subtree_index].bh;
 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-	ret = ocfs2_journal_access(handle, inode, root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+					   subtree_index);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-		ret = ocfs2_journal_access(handle, inode,
-					   right_path->p_node[i].bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode,
+						   right_path, i);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		ret = ocfs2_journal_access(handle, inode,
-					   left_path->p_node[i].bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode,
+						   left_path, i);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 
 	*ret_left_path = NULL;
 
-	left_path = ocfs2_new_path(path_root_bh(right_path),
-				   path_root_el(right_path));
+	left_path = ocfs2_new_path_from_path(right_path);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 			return -EAGAIN;
 
 		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
-			ret = ocfs2_journal_access(handle, inode,
-						   path_leaf_bh(right_path),
-						   OCFS2_JOURNAL_ACCESS_WRITE);
+			ret = ocfs2_journal_access_eb(handle, inode,
+						      path_leaf_bh(right_path),
+						      OCFS2_JOURNAL_ACCESS_WRITE);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		 * We have to update i_last_eb_blk during the meta
 		 * data delete.
 		 */
-		ret = ocfs2_journal_access(handle, inode, et_root_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_et_root_journal_access(handle, inode, et,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 	 */
 	BUG_ON(right_has_empty && !del_right_subtree);
 
-	ret = ocfs2_journal_access(handle, inode, root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+					   subtree_index);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-		ret = ocfs2_journal_access(handle, inode,
-					   right_path->p_node[i].bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode,
+						   right_path, i);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		ret = ocfs2_journal_access(handle, inode,
-					   left_path->p_node[i].bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode,
+						   left_path, i);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2596,16 +2694,17 @@ out:
 
 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
 					    handle_t *handle,
-					    struct buffer_head *bh,
-					    struct ocfs2_extent_list *el)
+					    struct ocfs2_path *path)
 {
 	int ret;
+	struct buffer_head *bh = path_leaf_bh(path);
+	struct ocfs2_extent_list *el = path_leaf_el(path);
 
 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
 		return 0;
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_path_bh_journal_access(handle, inode, path,
+					   path_num_items(path) - 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 		goto out;
 	}
 
-	left_path = ocfs2_new_path(path_root_bh(path),
-				   path_root_el(path));
+	left_path = ocfs2_new_path_from_path(path);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 
 	ocfs2_cp_path(left_path, path);
 
-	right_path = ocfs2_new_path(path_root_bh(path),
-				    path_root_el(path));
+	right_path = ocfs2_new_path_from_path(path);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 		 * Caller might still want to make changes to the
 		 * tree root, so re-add it to the journal here.
 		 */
-		ret = ocfs2_journal_access(handle, inode,
-					   path_root_bh(left_path),
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode,
+						   left_path, 0);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		 * We have a path to the left of this one - it needs
 		 * an update too.
 		 */
-		left_path = ocfs2_new_path(path_root_bh(path),
-					   path_root_el(path));
+		left_path = ocfs2_new_path_from_path(path);
 		if (!left_path) {
 			ret = -ENOMEM;
 			mlog_errno(ret);
@@ -2875,8 +2970,7 @@ rightmost_no_delete:
 		 * it up front.
 		 */
 		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
-						       path_leaf_bh(path),
-						       path_leaf_el(path));
+						       path);
 		if (ret)
 			mlog_errno(ret);
 		goto out;
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode,
 	/* This function shouldn't be called for the rightmost leaf. */
 	BUG_ON(right_cpos == 0);
 
-	right_path = ocfs2_new_path(path_root_bh(left_path),
-				    path_root_el(left_path));
+	right_path = ocfs2_new_path_from_path(left_path);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 		root_bh = left_path->p_node[subtree_index].bh;
 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-		ret = ocfs2_journal_access(handle, inode, root_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+						   subtree_index);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 
 		for (i = subtree_index + 1;
 		     i < path_num_items(right_path); i++) {
-			ret = ocfs2_journal_access(handle, inode,
-						   right_path->p_node[i].bh,
-						   OCFS2_JOURNAL_ACCESS_WRITE);
+			ret = ocfs2_path_bh_journal_access(handle, inode,
+							   right_path, i);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
 
-			ret = ocfs2_journal_access(handle, inode,
-						   left_path->p_node[i].bh,
-						   OCFS2_JOURNAL_ACCESS_WRITE);
+			ret = ocfs2_path_bh_journal_access(handle, inode,
+							   left_path, i);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 		right_rec = &el->l_recs[index + 1];
 	}
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
+					   path_num_items(left_path) - 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode,
 	/* This function shouldn't be called for the leftmost leaf. */
 	BUG_ON(left_cpos == 0);
 
-	left_path = ocfs2_new_path(path_root_bh(right_path),
-				   path_root_el(right_path));
+	left_path = ocfs2_new_path_from_path(right_path);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 		root_bh = left_path->p_node[subtree_index].bh;
 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-		ret = ocfs2_journal_access(handle, inode, root_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+						   subtree_index);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 
 		for (i = subtree_index + 1;
 		     i < path_num_items(right_path); i++) {
-			ret = ocfs2_journal_access(handle, inode,
-						   right_path->p_node[i].bh,
-						   OCFS2_JOURNAL_ACCESS_WRITE);
+			ret = ocfs2_path_bh_journal_access(handle, inode,
+							   right_path, i);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
 
-			ret = ocfs2_journal_access(handle, inode,
-						   left_path->p_node[i].bh,
-						   OCFS2_JOURNAL_ACCESS_WRITE);
+			ret = ocfs2_path_bh_journal_access(handle, inode,
+							   left_path, i);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 			has_empty_extent = 1;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+					   path_num_items(right_path) - 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 		 * leftmost leaf.
 		 */
 		if (left_cpos) {
-			left_path = ocfs2_new_path(path_root_bh(right_path),
-						   path_root_el(right_path));
+			left_path = ocfs2_new_path_from_path(right_path);
 			if (!left_path) {
 				ret = -ENOMEM;
 				mlog_errno(ret);
@@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
 	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
 	struct ocfs2_extent_rec *rec, *tmprec;
 
-	right_el = path_leaf_el(right_path);;
+	right_el = path_leaf_el(right_path);
 	if (left_path)
 		left_el = path_leaf_el(left_path);
 
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 	el = et->et_root_el;
 
-	ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_et_root_journal_access(handle, inode, et,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		goto out_update_clusters;
 	}
 
-	right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	right_path = ocfs2_new_path_from_et(et);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		 * ocfs2_rotate_tree_right() might have extended the
 		 * transaction without re-journaling our tree root.
 		 */
-		ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_et_root_journal_access(handle, inode, et,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 			goto out;
 
 		if (left_cpos != 0) {
-			left_path = ocfs2_new_path(path_root_bh(path),
-						   path_root_el(path));
+			left_path = ocfs2_new_path_from_path(path);
 			if (!left_path)
 				goto out;
 
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 			    le16_to_cpu(new_el->l_count)) {
 				bh = path_leaf_bh(left_path);
 				eb = (struct ocfs2_extent_block *)bh->b_data;
-				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
-								 eb);
+				ocfs2_error(inode->i_sb,
+					    "Extent block #%llu has an "
+					    "invalid l_next_free_rec of "
+					    "%d.  It should have "
+					    "matched the l_count of %d",
+					    (unsigned long long)le64_to_cpu(eb->h_blkno),
+					    le16_to_cpu(new_el->l_next_free_rec),
+					    le16_to_cpu(new_el->l_count));
+				status = -EINVAL;
 				goto out;
 			}
 			rec = &new_el->l_recs[
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 		if (right_cpos == 0)
 			goto out;
 
-		right_path = ocfs2_new_path(path_root_bh(path),
-					    path_root_el(path));
+		right_path = ocfs2_new_path_from_path(path);
 		if (!right_path)
 			goto out;
 
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
 				bh = path_leaf_bh(right_path);
 				eb = (struct ocfs2_extent_block *)bh->b_data;
-				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
-								 eb);
+				ocfs2_error(inode->i_sb,
+					    "Extent block #%llu has an "
+					    "invalid l_next_free_rec of %d",
+					    (unsigned long long)le64_to_cpu(eb->h_blkno),
+					    le16_to_cpu(new_el->l_next_free_rec));
+				status = -EINVAL;
 				goto out;
 			}
 			rec = &new_el->l_recs[1];
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
 		 * may want it later.
 		 */
-		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
+		ret = ocfs2_read_extent_block(inode,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &bh);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		return 0;
 	}
 
-	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	path = ocfs2_new_path_from_et(et);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 
 	BUG_ON(num_bits > clusters_to_add);
 
-	/* reserve our write early -- insert_extent may update the inode */
-	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	/* reserve our write early -- insert_extent may update the tree root */
+	status = ocfs2_et_root_journal_access(handle, inode, et,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	if (path->p_tree_depth) {
 		struct ocfs2_extent_block *eb;
 
-		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
-				       &last_eb_bh);
+		ret = ocfs2_read_extent_block(inode,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &last_eb_bh);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
 		}
 
 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			ret = -EROFS;
-			goto out;
-		}
-
 		rightmost_el = &eb->h_list;
 	} else
 		rightmost_el = path_root_el(path);
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
 	if (et->et_ops == &ocfs2_dinode_et_ops)
 		ocfs2_extent_map_trunc(inode, 0);
 
-	left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	left_path = ocfs2_new_path_from_et(et);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 
 	depth = path->p_tree_depth;
 	if (depth > 0) {
-		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
-				       &last_eb_bh);
+		ret = ocfs2_read_extent_block(inode,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &last_eb_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 		}
 
 		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
-			left_path = ocfs2_new_path(path_root_bh(path),
-						   path_root_el(path));
+			left_path = ocfs2_new_path_from_path(path);
 			if (!left_path) {
 				ret = -ENOMEM;
 				mlog_errno(ret);
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode,
 
 	ocfs2_extent_map_trunc(inode, 0);
 
-	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	path = ocfs2_new_path_from_et(et);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -5255,6 +5348,78 @@ out:
 	return ret;
 }
 
+int ocfs2_remove_btree_range(struct inode *inode,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 phys_cpos, u32 len,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_et_root_journal_access(handle, inode, et,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
+				  dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_et_update_clusters(inode, et, -len);
+
+	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
 {
 	struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -5308,13 +5473,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
-	tl = &di->id2.i_dealloc;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
-		status = -EIO;
-		goto bail;
-	}
 
+	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+	 * by the underlying call to ocfs2_read_inode_block(), so any
+	 * corruption is a code bug */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+	tl = &di->id2.i_dealloc;
 	tl_count = le16_to_cpu(tl->tl_count);
 	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
 			tl_count == 0,
@@ -5332,8 +5497,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -5394,8 +5559,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 	while (i >= 0) {
 		/* Caller has given us at least enough credits to
 		 * update the truncate log dinode */
-		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -5464,13 +5629,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
-	tl = &di->id2.i_dealloc;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
-		status = -EIO;
-		goto out;
-	}
 
+	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+	 * by the underlying call to ocfs2_read_inode_block(), so any
+	 * corruption is a code bug */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+	tl = &di->id2.i_dealloc;
 	num_to_flush = le16_to_cpu(tl->tl_used);
 	mlog(0, "Flush %u records from truncate log #%llu\n",
 	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5586,7 +5751,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+	status = ocfs2_read_inode_block(inode, &bh);
 	if (status < 0) {
 		iput(inode);
 		mlog_errno(status);
@@ -5625,13 +5790,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 	}
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
-	tl = &di->id2.i_dealloc;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
-		status = -EIO;
-		goto bail;
-	}
 
+	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
+	 * validated by the underlying call to ocfs2_read_inode_block(),
+	 * so any corruption is a code bug */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+	tl = &di->id2.i_dealloc;
 	if (le16_to_cpu(tl->tl_used)) {
 		mlog(0, "We'll have %u logs to recover\n",
 		     le16_to_cpu(tl->tl_used));
@@ -5651,6 +5816,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 		 * tl_used. */
 		tl->tl_used = 0;
 
+		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
 		status = ocfs2_write_block(osb, tl_bh, tl_inode);
 		if (status < 0) {
 			mlog_errno(status);
@@ -5800,7 +5966,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
  */
 
 /*
- * Describes a single block free from a suballocator
+ * Describe a single bit freed from a suballocator.  For the block
+ * suballocators, it represents one block.  For the global cluster
+ * allocator, it represents some clusters and free_bit indicates
+ * clusters number.
  */
 struct ocfs2_cached_block_free {
 	struct ocfs2_cached_block_free		*free_next;
@@ -5815,10 +5984,10 @@ struct ocfs2_per_slot_free_list {
 	struct ocfs2_cached_block_free		*f_first;
 };
 
-static int ocfs2_free_cached_items(struct ocfs2_super *osb,
-				   int sysfile_type,
-				   int slot,
-				   struct ocfs2_cached_block_free *head)
+static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
+				    int sysfile_type,
+				    int slot,
+				    struct ocfs2_cached_block_free *head)
 {
 	int ret;
 	u64 bg_blkno;
@@ -5893,6 +6062,82 @@ out:
 	return ret;
 }
 
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+				u64 blkno, unsigned int bit)
+{
+	int ret = 0;
+	struct ocfs2_cached_block_free *item;
+
+	item = kmalloc(sizeof(*item), GFP_NOFS);
+	if (item == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
+	     bit, (unsigned long long)blkno);
+
+	item->free_blk = blkno;
+	item->free_bit = bit;
+	item->free_next = ctxt->c_global_allocator;
+
+	ctxt->c_global_allocator = item;
+	return ret;
+}
+
+static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
+				      struct ocfs2_cached_block_free *head)
+{
+	struct ocfs2_cached_block_free *tmp;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	int ret = 0;
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	while (head) {
+		if (ocfs2_truncate_log_needs_flush(osb)) {
+			ret = __ocfs2_flush_truncate_log(osb);
+			if (ret < 0) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
+						head->free_bit);
+
+		ocfs2_commit_trans(osb, handle);
+		tmp = head;
+		head = head->free_next;
+		kfree(tmp);
+
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	mutex_unlock(&tl_inode->i_mutex);
+
+	while (head) {
+		/* Premature exit may have left some dangling items. */
+		tmp = head;
+		head = head->free_next;
+		kfree(tmp);
+	}
+
+	return ret;
+}
+
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		       struct ocfs2_cached_dealloc_ctxt *ctxt)
 {
@@ -5908,8 +6153,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		if (fl->f_first) {
 			mlog(0, "Free items: (type %u, slot %d)\n",
 			     fl->f_inode_type, fl->f_slot);
-			ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
-						       fl->f_slot, fl->f_first);
+			ret2 = ocfs2_free_cached_blocks(osb,
+							fl->f_inode_type,
+							fl->f_slot,
+							fl->f_first);
 			if (ret2)
 				mlog_errno(ret2);
 			if (!ret)
@@ -5920,6 +6167,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		kfree(fl);
 	}
 
+	if (ctxt->c_global_allocator) {
+		ret2 = ocfs2_free_cached_clusters(osb,
+						  ctxt->c_global_allocator);
+		if (ret2)
+			mlog_errno(ret2);
+		if (!ret)
+			ret = ret2;
+
+		ctxt->c_global_allocator = NULL;
+	}
+
 	return ret;
 }
 
@@ -6075,11 +6333,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
 
 	eb = (struct ocfs2_extent_block *) bh->b_data;
 	el = &eb->h_list;
-	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-		ret = -EROFS;
-		goto out;
-	}
+
+	/* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
+	 * Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
 
 	*new_last_eb = bh;
 	get_bh(*new_last_eb);
@@ -6326,8 +6583,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	}
 
 	if (last_eb_bh) {
-		status = ocfs2_journal_access(handle, inode, last_eb_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -6350,6 +6607,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	vfs_dq_free_space_nodirty(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
 				      clusters_to_del;
@@ -6436,11 +6695,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
 		mlog_errno(ret);
 	else if (ocfs2_should_order_data(inode)) {
 		ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-		ret = walk_page_buffers(handle, page_buffers(page),
-					from, to, &partial,
-					ocfs2_journal_dirty_data);
-#endif
 		if (ret < 0)
 			mlog_errno(ret);
 	}
@@ -6663,6 +6917,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	struct page **pages = NULL;
 	loff_t end = osb->s_clustersize;
 	struct ocfs2_extent_tree et;
+	int did_quota = 0;
 
 	has_data = i_size_read(inode) ? 1 : 0;
 
@@ -6682,15 +6937,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
+	handle = ocfs2_start_trans(osb,
+				   ocfs2_inline_to_extents_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out_unlock;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -6701,6 +6957,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		unsigned int page_end;
 		u64 phys;
 
+		if (vfs_dq_alloc_space_nodirty(inode,
+				       ocfs2_clusters_to_bytes(osb->sb, 1))) {
+			ret = -EDQUOT;
+			goto out_commit;
+		}
+		did_quota = 1;
+
 		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
 					   &num);
 		if (ret) {
@@ -6774,6 +7037,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	}
 
 out_commit:
+	if (ret < 0 && did_quota)
+		vfs_dq_free_space_nodirty(inode,
+					  ocfs2_clusters_to_bytes(osb->sb, 1));
+
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock:
@@ -6813,7 +7080,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						     i_size_read(inode));
 
-	path = ocfs2_new_path(fe_bh, &di->id2.i_list);
+	path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+			      ocfs2_journal_access_di);
 	if (!path) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -6984,20 +7252,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
 
 	if (fe->id2.i_list.l_tree_depth) {
-		status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
-					  &last_eb_bh);
+		status = ocfs2_read_extent_block(inode,
+						 le64_to_cpu(fe->i_last_eb_blk),
+						 &last_eb_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-
-			brelse(last_eb_bh);
-			status = -EIO;
-			goto bail;
-		}
 	}
 
 	(*tc)->tc_last_eb_bh = last_eb_bh;
@@ -7052,8 +7314,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 70257c84cfbe..cceff5c37f47 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,9 @@
  *
  * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
  * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions.
+ * functions.  With metadata ecc, we now call different journal_access
+ * functions for each type of metadata, so it must have the
+ * root_journal_access function.
  * ocfs2_extent_tree_operations abstract the normal operations we do for
  * the root of extent b-tree.
  */
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
 	struct ocfs2_extent_tree_operations	*et_ops;
 	struct buffer_head			*et_root_bh;
 	struct ocfs2_extent_list		*et_root_el;
+	ocfs2_journal_access_func		et_root_journal_access;
 	void					*et_object;
 	unsigned int				et_max_leaf_clusters;
 };
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 				       struct inode *inode,
 				       struct buffer_head *bh);
+struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 					struct inode *inode,
-					struct buffer_head *bh,
-					struct ocfs2_xattr_value_root *xv);
+					struct ocfs2_xattr_value_buf *vb);
+
+/*
+ * Read an extent block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The extent block will be validated
+ * with ocfs2_validate_extent_block().
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+			    struct buffer_head **bh);
 
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_btree_range(struct inode *inode,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 phys_cpos, u32 len,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc);
+
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
  */
 struct ocfs2_cached_dealloc_ctxt {
 	struct ocfs2_per_slot_free_list		*c_first_suballocator;
+	struct ocfs2_cached_block_free 		*c_global_allocator;
 };
 static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 {
 	c->c_first_suballocator = NULL;
+	c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+				u64 blkno, unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+	return c->c_global_allocator != NULL;
 }
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		       struct ocfs2_cached_dealloc_ctxt *ctxt);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c22543b33420..a067a6cffb01 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+	status = ocfs2_read_inode_block(inode, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	fe = (struct ocfs2_dinode *) bh->b_data;
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-		     fe->i_signature);
-		goto bail;
-	}
-
 	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
 						    le32_to_cpu(fe->i_clusters))) {
 		mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(inode, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 
 	if (ocfs2_should_order_data(inode)) {
 		ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-		ret = walk_page_buffers(handle,
-					page_buffers(page),
-					from, to, NULL,
-					ocfs2_journal_dirty_data);
-#endif
 		if (ret < 0)
 			mlog_errno(ret);
 	}
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
 		tmppage = wc->w_pages[i];
 
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode)) {
+			if (ocfs2_should_order_data(inode))
 				ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-				walk_page_buffers(wc->w_handle,
-						  page_buffers(tmppage),
-						  from, to, NULL,
-						  ocfs2_journal_dirty_data);
-#endif
-			}
 
 			block_commit_write(tmppage, from, to);
 		}
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		ocfs2_commit_trans(osb, handle);
 
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
 	wc->w_handle = handle;
 
+	if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
+		ret = -EDQUOT;
+		goto out_commit;
+	}
 	/*
 	 * We don't want this to fail in ocfs2_write_end(), so do it
 	 * here.
 	 */
-	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out_quota;
 	}
 
 	/*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 					 mmap_page);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out_quota;
 	}
 
 	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
 					  len);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out_quota;
 	}
 
 	if (data_ac)
@@ -1790,6 +1776,10 @@ success:
 	*pagep = wc->w_target_page;
 	*fsdata = wc;
 	return 0;
+out_quota:
+	if (clusters_to_alloc)
+		vfs_dq_free_space(inode,
+			  ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 		}
 
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode)) {
+			if (ocfs2_should_order_data(inode))
 				ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-				walk_page_buffers(wc->w_handle,
-						  page_buffers(tmppage),
-						  from, to, NULL,
-						  ocfs2_journal_dirty_data);
-#endif
-			}
 			block_commit_write(tmppage, from, to);
 		}
 	}
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 000000000000..2a947c44e594
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,477 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.c
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2006, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "blockcheck.h"
+
+
+/*
+ * We use the following conventions:
+ *
+ * d = # data bits
+ * p = # parity bits
+ * c = # total code bits (d + p)
+ */
+
+
+/*
+ * Calculate the bit offset in the hamming code buffer based on the bit's
+ * offset in the data buffer.  Since the hamming code reserves all
+ * power-of-two bits for parity, the data bit number and the code bit
+ * number are offest by all the parity bits beforehand.
+ *
+ * Recall that bit numbers in hamming code are 1-based.  This function
+ * takes the 0-based data bit from the caller.
+ *
+ * An example.  Take bit 1 of the data buffer.  1 is a power of two (2^0),
+ * so it's a parity bit.  2 is a power of two (2^1), so it's a parity bit.
+ * 3 is not a power of two.  So bit 1 of the data buffer ends up as bit 3
+ * in the code buffer.
+ *
+ * The caller can pass in *p if it wants to keep track of the most recent
+ * number of parity bits added.  This allows the function to start the
+ * calculation at the last place.
+ */
+static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
+{
+	unsigned int b, p = 0;
+
+	/*
+	 * Data bits are 0-based, but we're talking code bits, which
+	 * are 1-based.
+	 */
+	b = i + 1;
+
+	/* Use the cache if it is there */
+	if (p_cache)
+		p = *p_cache;
+        b += p;
+
+	/*
+	 * For every power of two below our bit number, bump our bit.
+	 *
+	 * We compare with (b + 1) because we have to compare with what b
+	 * would be _if_ it were bumped up by the parity bit.  Capice?
+	 *
+	 * p is set above.
+	 */
+	for (; (1 << p) < (b + 1); p++)
+		b++;
+
+	if (p_cache)
+		*p_cache = p;
+
+	return b;
+}
+
+/*
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
+{
+	unsigned int i, b, p = 0;
+
+	BUG_ON(!d);
+
+	/*
+	 * b is the hamming code bit number.  Hamming code specifies a
+	 * 1-based array, but C uses 0-based.  So 'i' is for C, and 'b' is
+	 * for the algorithm.
+	 *
+	 * The i++ in the for loop is so that the start offset passed
+	 * to ocfs2_find_next_bit_set() is one greater than the previously
+	 * found bit.
+	 */
+	for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
+	{
+		/*
+		 * i is the offset in this hunk, nr + i is the total bit
+		 * offset.
+		 */
+		b = calc_code_bit(nr + i, &p);
+
+		/*
+		 * Data bits in the resultant code are checked by
+		 * parity bits that are part of the bit number
+		 * representation.  Huh?
+		 *
+		 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+		 * In other words, the parity bit at position 2^k
+		 * checks bits in positions having bit k set in
+		 * their binary representation.  Conversely, for
+		 * instance, bit 13, i.e. 1101(2), is checked by
+		 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+		 * </wikipedia>
+		 *
+		 * Note that 'k' is the _code_ bit number.  'b' in
+		 * our loop.
+		 */
+		parity ^= b;
+	}
+
+	/* While the data buffer was treated as little endian, the
+	 * return value is in host endian. */
+	return parity;
+}
+
+u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
+{
+	return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
+}
+
+/*
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one hunk, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+		       unsigned int fix)
+{
+	unsigned int i, b;
+
+	BUG_ON(!d);
+
+	/*
+	 * If the bit to fix has an hweight of 1, it's a parity bit.  One
+	 * busted parity bit is its own error.  Nothing to do here.
+	 */
+	if (hweight32(fix) == 1)
+		return;
+
+	/*
+	 * nr + d is the bit right past the data hunk we're looking at.
+	 * If fix after that, nothing to do
+	 */
+	if (fix >= calc_code_bit(nr + d, NULL))
+		return;
+
+	/*
+	 * nr is the offset in the data hunk we're starting at.  Let's
+	 * start b at the offset in the code buffer.  See hamming_encode()
+	 * for a more detailed description of 'b'.
+	 */
+	b = calc_code_bit(nr, NULL);
+	/* If the fix is before this hunk, nothing to do */
+	if (fix < b)
+		return;
+
+	for (i = 0; i < d; i++, b++)
+	{
+		/* Skip past parity bits */
+		while (hweight32(b) == 1)
+			b++;
+
+		/*
+		 * i is the offset in this data hunk.
+		 * nr + i is the offset in the total data buffer.
+		 * b is the offset in the total code buffer.
+		 *
+		 * Thus, when b == fix, bit i in the current hunk needs
+		 * fixing.
+		 */
+		if (b == fix)
+		{
+			if (ocfs2_test_bit(i, data))
+				ocfs2_clear_bit(i, data);
+			else
+				ocfs2_set_bit(i, data);
+			break;
+		}
+	}
+}
+
+void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+			     unsigned int fix)
+{
+	ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
+}
+
+/*
+ * This function generates check information for a block.
+ * data is the block to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc)
+{
+	u32 crc;
+	u32 ecc;
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	crc = crc32_le(~0, data, blocksize);
+	ecc = ocfs2_hamming_encode_block(data, blocksize);
+
+	/*
+	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+	 * larger than 16 bits.
+	 */
+	BUG_ON(ecc > USHORT_MAX);
+
+	bc->bc_crc32e = cpu_to_le32(crc);
+	bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information.  Like _compute,
+ * the function will take care of zeroing bc before calculating check codes.
+ * If bc is not a pointer inside data, the caller must have zeroed any
+ * inline ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+	struct ocfs2_block_check check;
+	u32 crc, ecc;
+
+	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	/* Fast path - if the crc32 validates, we're good to go */
+	crc = crc32_le(~0, data, blocksize);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	mlog(ML_ERROR,
+	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+	/* Ok, try ECC fixups */
+	ecc = ocfs2_hamming_encode_block(data, blocksize);
+	ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+
+	/* And check the crc32 again */
+	crc = crc32_le(~0, data, blocksize);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+	rc = -EIO;
+
+out:
+	bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+	bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+	return rc;
+}
+
+/*
+ * This function generates check information for a list of buffer_heads.
+ * bhs is the blocks to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc)
+{
+	int i;
+	u32 crc, ecc;
+
+	BUG_ON(nr < 0);
+
+	if (!nr)
+		return;
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+		/*
+		 * The number of bits in a buffer is obviously b_size*8.
+		 * The offset of this buffer is b_size*i, so the bit offset
+		 * of this buffer is b_size*8*i.
+		 */
+		ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+						bhs[i]->b_size * 8,
+						bhs[i]->b_size * 8 * i);
+	}
+
+	/*
+	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+	 * larger than 16 bits.
+	 */
+	BUG_ON(ecc > USHORT_MAX);
+
+	bc->bc_crc32e = cpu_to_le32(crc);
+	bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information on a list of
+ * buffer_heads.  Like _compute_bhs, the function will take care of
+ * zeroing bc before calculating check codes.  If bc is not a pointer
+ * inside data, the caller must have zeroed any inline
+ * ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc)
+{
+	int i, rc = 0;
+	struct ocfs2_block_check check;
+	u32 crc, ecc, fix;
+
+	BUG_ON(nr < 0);
+
+	if (!nr)
+		return 0;
+
+	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	/* Fast path - if the crc32 validates, we're good to go */
+	for (i = 0, crc = ~0; i < nr; i++)
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	mlog(ML_ERROR,
+	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+	/* Ok, try ECC fixups */
+	for (i = 0, ecc = 0; i < nr; i++) {
+		/*
+		 * The number of bits in a buffer is obviously b_size*8.
+		 * The offset of this buffer is b_size*i, so the bit offset
+		 * of this buffer is b_size*8*i.
+		 */
+		ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+						bhs[i]->b_size * 8,
+						bhs[i]->b_size * 8 * i);
+	}
+	fix = ecc ^ check.bc_ecc;
+	for (i = 0; i < nr; i++) {
+		/*
+		 * Try the fix against each buffer.  It will only affect
+		 * one of them.
+		 */
+		ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
+				  bhs[i]->b_size * 8 * i, fix);
+	}
+
+	/* And check the crc32 again */
+	for (i = 0, crc = ~0; i < nr; i++)
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+	rc = -EIO;
+
+out:
+	bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+	bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+	return rc;
+}
+
+/*
+ * These are the main API.  They check the superblock flag before
+ * calling the underlying operations.
+ *
+ * They expect the buffer(s) to be in disk format.
+ */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc)
+{
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		ocfs2_block_check_compute(data, sb->s_blocksize, bc);
+}
+
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+
+	return rc;
+}
+
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc)
+{
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		ocfs2_block_check_compute_bhs(bhs, nr, bc);
+}
+
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+
+	return rc;
+}
+
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 000000000000..70ec3feda32f
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.h
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_BLOCKCHECK_H
+#define OCFS2_BLOCKCHECK_H
+
+
+/* High level block API */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc);
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc);
+
+/* Lower level API */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc);
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc);
+
+/*
+ * Hamming code functions
+ */
+
+/*
+ * Encoding hamming code parity bits for a buffer.
+ *
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
+			 unsigned int nr);
+/*
+ * Fix a buffer with a bit error.  The 'fix' is the original parity
+ * xor'd with the parity calculated now.
+ *
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one buffer, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+		       unsigned int fix);
+
+/* Convenience wrappers for a single buffer of data */
+extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
+extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+				    unsigned int fix);
+#endif
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 3a178ec48d7c..15c8e6deee2e 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,6 +39,18 @@
 
 #include "buffer_head_io.h"
 
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+	BH_NeedsValidate = BH_JBDPrivateStart,
+};
+
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
+
 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		      struct inode *inode)
 {
@@ -166,7 +178,9 @@ bail:
 }
 
 int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
-		      struct buffer_head *bhs[], int flags)
+		      struct buffer_head *bhs[], int flags,
+		      int (*validate)(struct super_block *sb,
+				      struct buffer_head *bh))
 {
 	int status = 0;
 	int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 
 			clear_buffer_uptodate(bh);
 			get_bh(bh); /* for end_buffer_read_sync() */
+			if (validate)
+				set_buffer_needs_validate(bh);
 			bh->b_end_io = end_buffer_read_sync;
 			submit_bh(READ, bh);
 			continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 				bhs[i] = NULL;
 				continue;
 			}
+
+			if (buffer_needs_validate(bh)) {
+				/* We never set NeedsValidate if the
+				 * buffer was held by the journal, so
+				 * that better not have changed */
+				BUG_ON(buffer_jbd(bh));
+				clear_buffer_needs_validate(bh);
+				status = validate(inode->i_sb, bh);
+				if (status) {
+					put_bh(bh);
+					bhs[i] = NULL;
+					continue;
+				}
+			}
 		}
 
 		/* Always set the buffer in the cache, even if it was
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 75e1dcb1ade7..c75d682dadd8 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,24 @@
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
 			     int uptodate);
 
-static inline int ocfs2_read_block(struct inode	       *inode,
-				   u64                  off,
-				   struct buffer_head **bh);
-
 int ocfs2_write_block(struct ocfs2_super          *osb,
 		      struct buffer_head  *bh,
 		      struct inode        *inode);
-int ocfs2_read_blocks(struct inode	  *inode,
-		      u64                  block,
-		      int                  nr,
-		      struct buffer_head  *bhs[],
-		      int                  flags);
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 			   unsigned int nr, struct buffer_head *bhs[]);
 
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk.  It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      int (*validate)(struct super_block *sb,
+				      struct buffer_head *bh));
+
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 				struct buffer_head *bh);
 
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 #define OCFS2_BH_READAHEAD         8
 
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
-				   struct buffer_head **bh)
+				   struct buffer_head **bh,
+				   int (*validate)(struct super_block *sb,
+						   struct buffer_head *bh))
 {
 	int status = 0;
 
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
 		goto bail;
 	}
 
-	status = ocfs2_read_blocks(inode, off, 1, bh, 0);
+	status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
 
 bail:
 	return status;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6ebaa58e2c03..04697ba7f73e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data)
 
 	while (!kthread_should_stop() && !reg->hr_unclean_stop) {
 		/* We track the time spent inside
-		 * o2hb_do_disk_heartbeat so that we avoid more then
+		 * o2hb_do_disk_heartbeat so that we avoid more than
 		 * hr_timeout_ms between disk writes. On busy systems
 		 * this should result in a heartbeat which is less
 		 * likely to time itself out. */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d8a0cb92cef6..96df5416993e 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(QUORUM),
 	define_mask(EXPORT),
 	define_mask(XATTR),
+	define_mask(QUOTA),
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 57670c680471..7e72a81bc2d4 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
+#define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 026e6eb85187..f2c4098cf337 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -47,6 +48,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
 
-static struct buffer_head *ocfs2_bread(struct inode *inode,
-				       int block, int *err, int reada)
+/*
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_dir_has_trailer(struct inode *dir)
 {
-	struct buffer_head *bh = NULL;
-	int tmperr;
-	u64 p_blkno;
-	int readflags = 0;
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
 
-	if (reada)
-		readflags |= OCFS2_BH_READAHEAD;
+	return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+}
 
-	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
-	    i_size_read(inode)) {
-		BUG_ON(!reada);
-		return NULL;
-	}
+static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+{
+	return ocfs2_meta_ecc(osb);
+}
 
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
-					     NULL);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-	if (tmperr < 0) {
-		mlog_errno(tmperr);
-		goto fail;
-	}
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
+{
+	return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
+}
 
-	tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
-	if (tmperr < 0)
-		goto fail;
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
 
-	tmperr = 0;
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+							    void *data)
+{
+	char *p = data;
 
-	*err = 0;
-	return bh;
+	p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
+	return (struct ocfs2_dir_block_trailer *)p;
+}
 
-fail:
-	brelse(bh);
-	bh = NULL;
+/*
+ * XXX: This is executed once on every dirent. We should consider optimizing
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+				  struct ocfs2_dir_entry *de,
+				  unsigned long offset,
+				  unsigned long blklen)
+{
+	unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
 
-	*err = -EIO;
-	return NULL;
+	if (!ocfs2_dir_has_trailer(dir))
+		return 0;
+
+	if (offset != toff)
+		return 0;
+
+	return 1;
+}
+
+static void ocfs2_init_dir_trailer(struct inode *inode,
+				   struct buffer_head *bh)
+{
+	struct ocfs2_dir_block_trailer *trailer;
+
+	trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+	strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+	trailer->db_compat_rec_len =
+			cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+	trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
 }
 
 /*
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(dir, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -250,6 +277,108 @@ out:
 	return NULL;
 }
 
+static int ocfs2_validate_dir_block(struct super_block *sb,
+				    struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_trailer_from_bh(bh, sb);
+
+
+	/*
+	 * We don't validate dirents here, that's handled
+	 * in-place when the code walks them.
+	 */
+	mlog(0, "Validating dirblock %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 *
+	 * Note that we are safe to call this even if the directory
+	 * doesn't have a trailer.  Filesystems without metaecc will do
+	 * nothing, and filesystems with it will have one.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+	if (rc)
+		mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+
+	return rc;
+}
+
+/*
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread().  We haven't audited what returning the
+ * real error codes would do to callers.  We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+				struct buffer_head **bh, int flags)
+{
+	int rc = 0;
+	struct buffer_head *tmp = *bh;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+				    ocfs2_validate_dir_block);
+	if (rc) {
+		mlog_errno(rc);
+		goto out;
+	}
+
+	/*
+	 * We check the trailer here rather than in
+	 * ocfs2_validate_dir_block() because that function doesn't have
+	 * the inode to test.
+	 */
+	if (!(flags & OCFS2_BH_READAHEAD) &&
+	    ocfs2_dir_has_trailer(inode)) {
+		trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
+		if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+			rc = -EINVAL;
+			ocfs2_error(inode->i_sb,
+				    "Invalid dirblock #%llu: "
+				    "signature = %.*s\n",
+				    (unsigned long long)tmp->b_blocknr, 7,
+				    trailer->db_signature);
+			goto out;
+		}
+		if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
+			rc = -EINVAL;
+			ocfs2_error(inode->i_sb,
+				    "Directory block #%llu has an invalid "
+				    "db_blkno of %llu",
+				    (unsigned long long)tmp->b_blocknr,
+				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+			goto out;
+		}
+		if (le64_to_cpu(trailer->db_parent_dinode) !=
+		    OCFS2_I(inode)->ip_blkno) {
+			rc = -EINVAL;
+			ocfs2_error(inode->i_sb,
+				    "Directory block #%llu on dinode "
+				    "#%llu has an invalid parent_dinode "
+				    "of %llu",
+				    (unsigned long long)tmp->b_blocknr,
+				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+			goto out;
+		}
+	}
+
+	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc ? -EIO : 0;
+}
+
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
 					       struct inode *dir,
 					       struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
 				}
 				num++;
 
-				bh = ocfs2_bread(dir, b++, &err, 1);
+				bh = NULL;
+				err = ocfs2_read_dir_block(dir, b++, &bh,
+							   OCFS2_BH_READAHEAD);
 				bh_use[ra_max] = bh;
 			}
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
 			goto next;
-		if (ocfs2_read_block(dir, block, &bh)) {
+		if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
 			/* read error, skip block & hope for the best.
-			 * ocfs2_read_block() has released the bh. */
+			 * ocfs2_read_dir_block() has released the bh. */
 			ocfs2_error(dir->i_sb, "reading directory %llu, "
 				    "offset %lu\n",
 				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
 		       struct inode *new_entry_inode)
 {
 	int ret;
+	ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
 	/*
 	 * The same code works fine for both inline-data and extent
-	 * based directories, so no need to split this up.
+	 * based directories, so no need to split this up.  The only
+	 * difference is the journal_access function.
 	 */
 
-	ret = ocfs2_journal_access(handle, dir, de_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		access = ocfs2_journal_access_di;
+
+	ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 {
 	struct ocfs2_dir_entry *de, *pde;
 	int i, status = -ENOENT;
+	ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
 	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
 
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		access = ocfs2_journal_access_di;
+
 	i = 0;
 	pde = NULL;
 	de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 			goto bail;
 		}
 		if (de == de_del)  {
-			status = ocfs2_journal_access(handle, dir, bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
+			status = access(handle, dir, bh,
+					OCFS2_JOURNAL_ACCESS_WRITE);
 			if (status < 0) {
 				status = -EIO;
 				mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(dir, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
 			goto bail;
 		}
 
+		/* We're guaranteed that we should have space, so we
+		 * can't possibly have hit the trailer...right? */
+		mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+				"Hit dir trailer trying to insert %.*s "
+			        "(namelen %d) into directory %llu.  "
+				"offset is %lu, trailer offset is %d\n",
+				namelen, name, namelen,
+				(unsigned long long)parent_fe_bh->b_blocknr,
+				offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
+
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 			retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
 				goto bail;
 			}
 
-			status = ocfs2_journal_access(handle, dir, insert_bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (insert_bh == parent_fe_bh)
+				status = ocfs2_journal_access_di(handle, dir,
+								 insert_bh,
+								 OCFS2_JOURNAL_ACCESS_WRITE);
+			else
+				status = ocfs2_journal_access_db(handle, dir,
+								 insert_bh,
+								 OCFS2_JOURNAL_ACCESS_WRITE);
 			/* By now the buffer is marked for journaling */
 			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
 			retval = 0;
 			goto bail;
 		}
+
 		offset += le16_to_cpu(de->rec_len);
 		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
 	}
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 	struct ocfs2_inline_data *data;
 	struct ocfs2_dir_entry *de;
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(inode, &di_bh);
 	if (ret) {
 		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 	int i, stored;
 	struct buffer_head * bh, * tmp;
 	struct ocfs2_dir_entry * de;
-	int err;
 	struct super_block * sb = inode->i_sb;
 	unsigned int ra_sectors = 16;
 
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 
 	while (!error && !stored && *f_pos < i_size_read(inode)) {
 		blk = (*f_pos) >> sb->s_blocksize_bits;
-		bh = ocfs2_bread(inode, blk, &err, 0);
-		if (!bh) {
-			mlog(ML_ERROR,
-			     "directory #%llu contains a hole at offset %lld\n",
-			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-			     *f_pos);
+		if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
+			/* Skip the corrupt dirblock and keep trying */
 			*f_pos += sb->s_blocksize - offset;
 			continue;
 		}
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 		    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
 			for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
 			     i > 0; i--) {
-				tmp = ocfs2_bread(inode, ++blk, &err, 1);
-				brelse(tmp);
+				tmp = NULL;
+				if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+							  OCFS2_BH_READAHEAD))
+					brelse(tmp);
 			}
 			last_ra_blk = blk;
 			ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
 		}
 		offset = 0;
 		brelse(bh);
+		bh = NULL;
 	}
 
 	stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
 	return !priv.seen_other;
 }
 
-static void ocfs2_fill_initial_dirents(struct inode *inode,
-				       struct inode *parent,
-				       char *start, unsigned int size)
+/*
+ * Fills "." and ".." dirents in a new directory block. Returns dirent for
+ * "..", which might be used during creation of a directory with a trailing
+ * header. It is otherwise safe to ignore the return code.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+							  struct inode *parent,
+							  char *start,
+							  unsigned int size)
 {
 	struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
 
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
 	de->name_len = 2;
 	strcpy(de->name, "..");
 	ocfs2_set_de_type(de, S_IFDIR);
+
+	return de;
 }
 
 /*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
 	struct ocfs2_inline_data *data = &di->id2.i_data;
 	unsigned int size = le16_to_cpu(data->id_count);
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 				 struct ocfs2_alloc_context *data_ac)
 {
 	int status;
+	unsigned int size = osb->sb->s_blocksize;
 	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de;
 
 	mlog_entry_void();
 
+	if (ocfs2_supports_dir_trailer(osb))
+		size = ocfs2_dir_trailer_blk_off(parent->i_sb);
+
 	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
 				     data_ac, NULL, &new_bh);
 	if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
 	ocfs2_set_new_buffer_uptodate(inode, new_bh);
 
-	status = ocfs2_journal_access(handle, inode, new_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_db(handle, inode, new_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
-	ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
-				   osb->sb->s_blocksize);
+	de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
+	if (ocfs2_supports_dir_trailer(osb))
+		ocfs2_init_dir_trailer(inode, new_bh);
 
 	status = ocfs2_journal_dirty(handle, new_bh);
 	if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 				     data_ac);
 }
 
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
 static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-				     unsigned int new_size)
+				     struct super_block *sb)
 {
 	struct ocfs2_dir_entry *de;
 	struct ocfs2_dir_entry *prev_de;
 	char *de_buf, *limit;
-	unsigned int bytes = new_size - old_size;
+	unsigned int new_size = sb->s_blocksize;
+	unsigned int bytes;
+
+	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+		new_size = ocfs2_dir_trailer_blk_off(sb);
+
+	bytes = new_size - old_size;
 
 	limit = start + old_size;
 	de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 				   unsigned int blocks_wanted,
 				   struct buffer_head **first_block_bh)
 {
-	int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
 	u32 alloc, bit_off, len;
 	struct super_block *sb = dir->i_sb;
+	int ret, credits = ocfs2_inline_to_extents_credits(sb);
 	u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle;
 	struct ocfs2_extent_tree et;
+	int did_quota = 0;
 
 	ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out_sem;
 	}
 
+	if (vfs_dq_alloc_space_nodirty(dir,
+				ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+		ret = -EDQUOT;
+		goto out_commit;
+	}
+	did_quota = 1;
 	/*
 	 * Try to claim as many clusters as the bitmap can give though
 	 * if we only get one now, that's enough to continue. The rest
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 
 	ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
 
-	ret = ocfs2_journal_access(handle, dir, dirdata_bh,
-				   OCFS2_JOURNAL_ACCESS_CREATE);
+	ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
 	memset(dirdata_bh->b_data + i_size_read(dir), 0,
 	       sb->s_blocksize - i_size_read(dir));
-	ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
-				 sb->s_blocksize);
+	ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
+	if (ocfs2_supports_dir_trailer(osb))
+		ocfs2_init_dir_trailer(dir, dirdata_bh);
 
 	ret = ocfs2_journal_dirty(handle, dirdata_bh);
 	if (ret) {
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * We let the later dirent insert modify c/mtime - to the user
 	 * the data hasn't changed.
 	 */
-	ret = ocfs2_journal_access(handle, dir, di_bh,
-				   OCFS2_JOURNAL_ACCESS_CREATE);
+	ret = ocfs2_journal_access_di(handle, dir, di_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	dirdata_bh = NULL;
 
 out_commit:
+	if (ret < 0 && did_quota)
+		vfs_dq_free_space_nodirty(dir,
+			ocfs2_clusters_to_bytes(osb->sb, 2));
 	ocfs2_commit_trans(osb, handle);
 
 out_sem:
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct buffer_head **new_bh)
 {
 	int status;
-	int extend;
+	int extend, did_quota = 0;
 	u64 p_blkno, v_blkno;
 
 	spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	if (extend) {
 		u32 offset = OCFS2_I(dir)->ip_clusters;
 
+		if (vfs_dq_alloc_space_nodirty(dir,
+					ocfs2_clusters_to_bytes(sb, 1))) {
+			status = -EDQUOT;
+			goto bail;
+		}
+		did_quota = 1;
+
 		status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
 					      1, 0, parent_fe_bh, handle,
 					      data_ac, meta_ac, NULL);
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	}
 	status = 0;
 bail:
+	if (did_quota && status < 0)
+		vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
 	mlog_exit(status);
 	return status;
 }
@@ -1569,16 +1771,22 @@ do_extend:
 
 	ocfs2_set_new_buffer_uptodate(dir, new_bh);
 
-	status = ocfs2_journal_access(handle, dir, new_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_db(handle, dir, new_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	memset(new_bh->b_data, 0, sb->s_blocksize);
+
 	de = (struct ocfs2_dir_entry *) new_bh->b_data;
 	de->inode = 0;
-	de->rec_len = cpu_to_le16(sb->s_blocksize);
+	if (ocfs2_dir_has_trailer(dir)) {
+		de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
+		ocfs2_init_dir_trailer(dir, new_bh);
+	} else {
+		de->rec_len = cpu_to_le16(sb->s_blocksize);
+	}
 	status = ocfs2_journal_dirty(handle, new_bh);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 				   unsigned int *blocks_wanted)
 {
 	int ret;
+	struct super_block *sb = dir->i_sb;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_dir_entry *de, *last_de = NULL;
 	char *de_buf, *limit;
 	unsigned long offset = 0;
-	unsigned int rec_len, new_rec_len;
+	unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+
+	/*
+	 * This calculates how many free bytes we'd have in block zero, should
+	 * this function force expansion to an extent tree.
+	 */
+	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+		free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
+	else
+		free_space = dir->i_sb->s_blocksize - i_size_read(dir);
 
 	de_buf = di->id2.i_data.id_data;
 	limit = de_buf + i_size_read(dir);
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 			ret = -EEXIST;
 			goto out;
 		}
+		/*
+		 * No need to check for a trailing dirent record here as
+		 * they're not used for inline dirs.
+		 */
+
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			/* Ok, we found a spot. Return this bh and let
 			 * the caller actually fill it in. */
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 	 * dirent can be found.
 	 */
 	*blocks_wanted = 1;
-	new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir));
+	new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
 	if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
 		*blocks_wanted = 2;
 
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 	struct ocfs2_dir_entry *de;
 	struct super_block *sb = dir->i_sb;
 	int status;
+	int blocksize = dir->i_sb->s_blocksize;
 
-	bh = ocfs2_bread(dir, 0, &status, 0);
-	if (!bh) {
+	status = ocfs2_read_dir_block(dir, 0, &bh, 0);
+	if (status) {
 		mlog_errno(status);
 		goto bail;
 	}
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 				status = -ENOSPC;
 				goto bail;
 			}
-			bh = ocfs2_bread(dir,
-					 offset >> sb->s_blocksize_bits,
-					 &status,
-					 0);
-			if (!bh) {
+			status = ocfs2_read_dir_block(dir,
+					     offset >> sb->s_blocksize_bits,
+					     &bh, 0);
+			if (status) {
 				mlog_errno(status);
 				goto bail;
 			}
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 			status = -EEXIST;
 			goto bail;
 		}
+
+		if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
+					   blocksize))
+			goto next;
+
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			/* Ok, we found a spot. Return this bh and let
 			 * the caller actually fill it in. */
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 			status = 0;
 			goto bail;
 		}
+next:
 		offset += le16_to_cpu(de->rec_len);
 		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
 	}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index ce48b9080d87..c511e2e18e9f 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       struct buffer_head *fe_bh,
 		       struct ocfs2_alloc_context *data_ac);
 
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+							    void *data);
 #endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 644bee55d8ba..d07ddbe4b283 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	struct list_head *iter, *head=NULL;
 	u64 cookie;
 	u32 flags;
+	u8 node;
 
 	if (!dlm_grab(dlm)) {
 		dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	name = past->name;
 	locklen = past->namelen;
-	cookie = be64_to_cpu(past->cookie);
+	cookie = past->cookie;
 	flags = be32_to_cpu(past->flags);
+	node = past->node_idx;
 
 	if (locklen > DLM_LOCKID_NAME_MAX) {
 		ret = DLM_IVBUFLEN;
-		mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+		mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
+		     "handler!\n", locklen);
 		goto leave;
 	}
 
 	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
 	     (LKM_PUT_LVB|LKM_GET_LVB)) {
-		mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+		mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
+		     flags);
 		ret = DLM_BADARGS;
 		goto leave;
 	}
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	if (past->type != DLM_AST &&
 	    past->type != DLM_BAST) {
 		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
-		     "name=%.*s\n", past->type, 
-		     dlm_get_lock_cookie_node(cookie),
-		     dlm_get_lock_cookie_seq(cookie),
-		     locklen, name);
+		     "name=%.*s, node=%u\n", past->type,
+		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     locklen, name, node);
 		ret = DLM_IVLOCKID;
 		goto leave;
 	}
 
 	res = dlm_lookup_lockres(dlm, name, locklen);
 	if (!res) {
-		mlog(0, "got %sast for unknown lockres! "
-		     "cookie=%u:%llu, name=%.*s, namelen=%u\n",
-		     past->type == DLM_AST ? "" : "b",
-		     dlm_get_lock_cookie_node(cookie),
-		     dlm_get_lock_cookie_seq(cookie),
-		     locklen, name, locklen);
+		mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
+		     "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
+		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     locklen, name, node);
 		ret = DLM_IVLOCKID;
 		goto leave;
 	}
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	spin_lock(&res->spinlock);
 	if (res->state & DLM_LOCK_RES_RECOVERING) {
-		mlog(0, "responding with DLM_RECOVERING!\n");
+		mlog(0, "Responding with DLM_RECOVERING!\n");
 		ret = DLM_RECOVERING;
 		goto unlock_out;
 	}
 	if (res->state & DLM_LOCK_RES_MIGRATING) {
-		mlog(0, "responding with DLM_MIGRATING!\n");
+		mlog(0, "Responding with DLM_MIGRATING!\n");
 		ret = DLM_MIGRATING;
 		goto unlock_out;
 	}
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	lock = NULL;
 	list_for_each(iter, head) {
 		lock = list_entry (iter, struct dlm_lock, list);
-		if (be64_to_cpu(lock->ml.cookie) == cookie)
+		if (lock->ml.cookie == cookie)
 			goto do_ast;
 	}
 
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	list_for_each(iter, head) {
 		lock = list_entry (iter, struct dlm_lock, list);
-		if (be64_to_cpu(lock->ml.cookie) == cookie)
+		if (lock->ml.cookie == cookie)
 			goto do_ast;
 	}
 
-	mlog(0, "got %sast for unknown lock!  cookie=%u:%llu, "
-	     "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 
-	     dlm_get_lock_cookie_node(cookie),
-	     dlm_get_lock_cookie_seq(cookie),
-	     locklen, name, locklen);
+	mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
+	     "node=%u\n", past->type == DLM_AST ? "" : "b",
+	     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+	     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+	     locklen, name, node);
 
 	ret = DLM_NORMAL;
 unlock_out:
@@ -383,8 +386,8 @@ do_ast:
 	if (past->type == DLM_AST) {
 		/* do not alter lock refcount.  switching lists. */
 		list_move_tail(&lock->list, &res->granted);
-		mlog(0, "ast: adding to granted list... type=%d, "
-			  "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+		mlog(0, "ast: Adding to granted list... type=%d, "
+		     "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
 		if (lock->ml.convert_type != LKM_IVMODE) {
 			lock->ml.type = lock->ml.convert_type;
 			lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
 		dlm_do_local_bast(dlm, res, lock, past->blocked_type);
 
 leave:
-
 	if (res)
 		dlm_lockres_put(res);
 
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d5a86fb81a49..bb53714813ab 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
 	unsigned int purge_count;
 	spinlock_t spinlock;
 	spinlock_t ast_lock;
+	spinlock_t track_lock;
 	char *name;
 	u8 node_num;
 	u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
 	 * put on a list for the dlm thread to run. */
 	unsigned long    last_used;
 
+	struct dlm_ctxt *dlm;
+
 	unsigned migration_pending:1;
 	atomic_t asts_reserved;
 	spinlock_t spinlock;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 1b81dcba175d..b32f60a5acfb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
 {
 	struct debug_lockres *dl = m->private;
 	struct dlm_ctxt *dlm = dl->dl_ctxt;
+	struct dlm_lock_resource *oldres = dl->dl_res;
 	struct dlm_lock_resource *res = NULL;
+	struct list_head *track_list;
 
-	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->track_lock);
+	if (oldres)
+		track_list = &oldres->tracking;
+	else
+		track_list = &dlm->tracking_list;
 
-	if (dl->dl_res) {
-		list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
-			if (dl->dl_res) {
-				dlm_lockres_put(dl->dl_res);
-				dl->dl_res = NULL;
-			}
-			if (&res->tracking == &dlm->tracking_list) {
-				mlog(0, "End of list found, %p\n", res);
-				dl = NULL;
-				break;
-			}
+	list_for_each_entry(res, track_list, tracking) {
+		if (&res->tracking == &dlm->tracking_list)
+			res = NULL;
+		else
 			dlm_lockres_get(res);
-			dl->dl_res = res;
-			break;
-		}
-	} else {
-		if (!list_empty(&dlm->tracking_list)) {
-			list_for_each_entry(res, &dlm->tracking_list, tracking)
-				break;
-			dlm_lockres_get(res);
-			dl->dl_res = res;
-		} else
-			dl = NULL;
+		break;
 	}
+	spin_unlock(&dlm->track_lock);
 
-	if (dl) {
-		spin_lock(&dl->dl_res->spinlock);
-		dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
-		spin_unlock(&dl->dl_res->spinlock);
-	}
+	if (oldres)
+		dlm_lockres_put(oldres);
 
-	spin_unlock(&dlm->spinlock);
+	dl->dl_res = res;
+
+	if (res) {
+		spin_lock(&res->spinlock);
+		dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
+		spin_unlock(&res->spinlock);
+	} else
+		dl = NULL;
 
+	/* passed to seq_show */
 	return dl;
 }
 
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 63f8125824e8..d8d578f45613 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	spin_lock_init(&dlm->spinlock);
 	spin_lock_init(&dlm->master_lock);
 	spin_lock_init(&dlm->ast_lock);
+	spin_lock_init(&dlm->track_lock);
 	INIT_LIST_HEAD(&dlm->list);
 	INIT_LIST_HEAD(&dlm->dirty_list);
 	INIT_LIST_HEAD(&dlm->reco.resources);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 6f7a77d54020..1c9efb406a96 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
 		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inc_nlink(inode);
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
-	inode->i_blocks = 0;
 	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 44f87caf3683..54e182a27caf 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
 static void dlm_lockres_release(struct kref *kref)
 {
 	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm;
 
 	res = container_of(kref, struct dlm_lock_resource, refs);
+	dlm = res->dlm;
 
 	/* This should not happen -- all lockres' have a name
 	 * associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
 	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
 	     res->lockname.name);
 
+	spin_lock(&dlm->track_lock);
 	if (!list_empty(&res->tracking))
 		list_del_init(&res->tracking);
 	else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
 		     res->lockname.len, res->lockname.name);
 		dlm_print_one_lock_resource(res);
 	}
+	spin_unlock(&dlm->track_lock);
+
+	dlm_put(dlm);
 
 	if (!hlist_unhashed(&res->hash_node) ||
 	    !list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 	res->migration_pending = 0;
 	res->inflight_locks = 0;
 
+	/* put in dlm_lockres_release */
+	dlm_grab(dlm);
+	res->dlm = dlm;
+
 	kref_init(&res->refs);
 
 	/* just for consistency */
@@ -722,14 +732,21 @@ lookup:
 	if (tmpres) {
 		int dropping_ref = 0;
 
+		spin_unlock(&dlm->spinlock);
+
 		spin_lock(&tmpres->spinlock);
+		/* We wait for the other thread that is mastering the resource */
+		if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			__dlm_wait_on_lockres(tmpres);
+			BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+		}
+
 		if (tmpres->owner == dlm->node_num) {
 			BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
 			dlm_lockres_grab_inflight_ref(dlm, tmpres);
 		} else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
 			dropping_ref = 1;
 		spin_unlock(&tmpres->spinlock);
-		spin_unlock(&dlm->spinlock);
 
 		/* wait until done messaging the master, drop our ref to allow
 		 * the lockres to be purged, start over. */
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
 				  struct dlm_node_iter *iter)
 {
 	struct dlm_migrate_request migrate;
-	int ret, status = 0;
+	int ret, skip, status = 0;
 	int nodenum;
 
 	memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
 		    nodenum == new_master)
 			continue;
 
+		/* We could race exit domain. If exited, skip. */
+		spin_lock(&dlm->spinlock);
+		skip = (!test_bit(nodenum, dlm->domain_map));
+		spin_unlock(&dlm->spinlock);
+		if (skip) {
+			clear_bit(nodenum, iter->node_map);
+			continue;
+		}
+
 		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
 					 &migrate, sizeof(migrate), nodenum,
 					 &status);
-		if (ret < 0)
-			mlog_errno(ret);
-		else if (status < 0) {
+		if (ret < 0) {
+			mlog(0, "migrate_request returned %d!\n", ret);
+			if (!dlm_is_host_down(ret)) {
+				mlog(ML_ERROR, "unhandled error=%d!\n", ret);
+				BUG();
+			}
+			clear_bit(nodenum, iter->node_map);
+			ret = 0;
+		} else if (status < 0) {
 			mlog(0, "migrate request (node %u) returned %d!\n",
 			     nodenum, status);
 			ret = status;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d1295203029f 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
 		spin_lock(&res->spinlock);
 		/* This ensures that clear refmap is sent after the set */
-		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+		__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
+						  DLM_LOCK_RES_MIGRATING));
 		spin_unlock(&res->spinlock);
 
 		/* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e6cc0a2e5f7..b0c4cadd4c45 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,6 +32,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/time.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "uptodate.h"
+#include "quota.h"
 
 #include "buffer_head_io.h"
 
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
 static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
 
 /*
  * Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
 				     struct ocfs2_lock_res *lockres);
 
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
 
 #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
 
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
 				     unsigned int line,
 				     struct ocfs2_lock_res *lockres)
 {
-	struct ocfs2_meta_lvb *lvb =
-		(struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	mlog(level, "LVB information for %s (called from %s:%u):\n",
 	     lockres->l_name, function, line);
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
+	.set_lvb	= ocfs2_set_qinfo_lvb,
+	.get_osb	= ocfs2_get_qinfo_osb,
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
 	return (struct ocfs2_dentry_lock *)lockres->l_priv;
 }
 
+static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
+
+	return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
+}
+
 static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
 {
 	if (lockres->l_ops->get_osb)
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
 	return OCFS2_SB(inode->i_sb);
 }
 
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_mem_dqinfo *info = lockres->l_priv;
+
+	return OCFS2_SB(info->dqi_gi.dqi_sb);
+}
+
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
 {
 	struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 	lockres->l_flags |= OCFS2_LOCK_NOCACHE;
 }
 
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+			       struct ocfs2_mem_dqinfo *info)
+{
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
+			      0, lockres->l_name);
+	ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
+				   OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
+				   info);
+}
+
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
 	mlog_entry_void();
@@ -1290,7 +1324,7 @@ again:
 			goto out;
 		}
 
-		mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
+		mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
 		     lockres->l_name);
 
 		/* At this point we've gone inside the dlm and need to
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 
 	mlog_entry_void();
 
-	lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	/*
 	 * Invalidate the LVB of a deleted inode - this way other
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 
 	mlog_meta_lvb(0, lockres);
 
-	lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	/* We're safe here without the lockres lock... */
 	spin_lock(&oi->ip_lock);
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
 					      struct ocfs2_lock_res *lockres)
 {
-	struct ocfs2_meta_lvb *lvb =
-		(struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	if (lvb->lvb_version == OCFS2_LVB_VERSION
 	    && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 	} else {
 		/* Boo, we have to go to disk. */
 		/* read bh, cast, ocfs2_refresh_inode */
-		status = ocfs2_read_block(inode, oi->ip_blkno, bh);
+		status = ocfs2_read_inode_block(inode, bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail_refresh;
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 		fe = (struct ocfs2_dinode *) (*bh)->b_data;
 
 		/* This is a good chance to make sure we're not
-		 * locking an invalid object.
+		 * locking an invalid object.  ocfs2_read_inode_block()
+		 * already checked that the inode block is sane.
 		 *
 		 * We bug on a stale inode here because we checked
 		 * above whether it was wiped from disk. The wiping
 		 * node provides a guarantee that we receive that
 		 * message and can mark the inode before dropping any
 		 * locks associated with it. */
-		if (!OCFS2_IS_VALID_DINODE(fe)) {
-			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-			status = -EIO;
-			goto bail_refresh;
-		}
 		mlog_bug_on_msg(inode->i_generation !=
 				le32_to_cpu(fe->i_generation),
 				"Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode,
 		return 0;
 	}
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
+	status = ocfs2_read_inode_block(inode, ret_bh);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -2922,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 		ocfs2_dlm_dump_lksb(&lockres->l_lksb);
 		BUG();
 	}
-	mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
+	mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
 	     lockres->l_name);
 
 	ocfs2_wait_on_busy_lock(lockres);
@@ -3449,6 +3478,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 	return UNBLOCK_CONTINUE_POST;
 }
 
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_qinfo_lvb *lvb;
+	struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
+	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+					    oinfo->dqi_gi.dqi_type);
+
+	mlog_entry_void();
+
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
+	lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
+	lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
+	lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
+	lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
+	lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
+	lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
+
+	mlog_exit_void();
+}
+
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+	mlog_entry_void();
+	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, level);
+	mlog_exit_void();
+}
+
+static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
+{
+	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+					    oinfo->dqi_gi.dqi_type);
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_global_disk_dqinfo *gdinfo;
+	int status = 0;
+
+	if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
+		info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
+		info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
+		oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
+		oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
+		oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
+		oinfo->dqi_gi.dqi_free_entry =
+					be32_to_cpu(lvb->lvb_free_entry);
+	} else {
+		status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+		gdinfo = (struct ocfs2_global_disk_dqinfo *)
+					(bh->b_data + OCFS2_GLOBAL_INFO_OFF);
+		info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
+		info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
+		oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
+		oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
+		oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
+		oinfo->dqi_gi.dqi_free_entry =
+					le32_to_cpu(gdinfo->dqi_free_entry);
+		brelse(bh);
+		ocfs2_track_lock_refresh(lockres);
+	}
+
+bail:
+	return status;
+}
+
+/* Lock quota info, this function expects at least shared lock on the quota file
+ * so that we can safely refresh quota info from disk. */
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	int status = 0;
+
+	mlog_entry_void();
+
+	/* On RO devices, locking really isn't needed... */
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			status = -EROFS;
+		goto bail;
+	}
+	if (ocfs2_mount_local(osb))
+		goto bail;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;
+	/* OK, we have the lock but we need to refresh the quota info */
+	status = ocfs2_refresh_qinfo(oinfo);
+	if (status)
+		ocfs2_qinfo_unlock(oinfo, ex);
+	ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+	mlog_exit(status);
+	return status;
+}
+
 /*
  * This is the filesystem locking protocol.  It provides the lock handling
  * hooks for the underlying DLM.  It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2bb01f09c1b1..3f8d9986b8e0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
 	__be32       lvb_reserved2;
 };
 
+#define OCFS2_QINFO_LVB_VERSION 1
+
+struct ocfs2_qinfo_lvb {
+	__u8	lvb_version;
+	__u8	lvb_reserved[3];
+	__be32	lvb_bgrace;
+	__be32	lvb_igrace;
+	__be32	lvb_syncms;
+	__be32	lvb_blocks;
+	__be32	lvb_free_blk;
+	__be32	lvb_free_entry;
+};
+
 /* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY	(0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
 struct ocfs2_file_private;
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 			      struct ocfs2_file_private *fp);
+struct ocfs2_mem_dqinfo;
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+                               struct ocfs2_mem_dqinfo *info);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
 void ocfs2_file_unlock(struct file *file);
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+
 
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2baedac58234..f2bb1a04d253 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-	ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
+	ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 	eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 	el = &eb->h_list;
 
-	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-		ret = -EROFS;
-		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-		goto out;
-	}
-
 	if (el->l_tree_depth) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
 		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
 			goto no_more_extents;
 
-		ret = ocfs2_read_block(inode,
-				       le64_to_cpu(eb->h_next_leaf_blk),
-				       &next_eb_bh);
+		ret = ocfs2_read_extent_block(inode,
+					      le64_to_cpu(eb->h_next_leaf_blk),
+					      &next_eb_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
-		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
-
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
-			ret = -EROFS;
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
-			goto out;
-		}
 
+		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
 		el = &next_eb->h_list;
-
 		i = ocfs2_search_for_hole_index(el, v_cluster);
 	}
 
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	if (ret == 0)
 		goto out;
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(inode, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -819,3 +806,74 @@ out:
 
 	return ret;
 }
+
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+			   struct buffer_head *bhs[], int flags,
+			   int (*validate)(struct super_block *sb,
+					   struct buffer_head *bh))
+{
+	int rc = 0;
+	u64 p_block, p_count;
+	int i, count, done = 0;
+
+	mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
+		   "flags = %x, validate = %p)\n",
+		   inode, (unsigned long long)v_block, nr, bhs, flags,
+		   validate);
+
+	if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!(flags & OCFS2_BH_READAHEAD));
+		goto out;
+	}
+
+	while (done < nr) {
+		down_read(&OCFS2_I(inode)->ip_alloc_sem);
+		rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
+						 &p_block, &p_count, NULL);
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+
+		if (!p_block) {
+			rc = -EIO;
+			mlog(ML_ERROR,
+			     "Inode #%llu contains a hole at offset %llu\n",
+			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			     (unsigned long long)(v_block + done) <<
+			     inode->i_sb->s_blocksize_bits);
+			break;
+		}
+
+		count = nr - done;
+		if (p_count < count)
+			count = p_count;
+
+		/*
+		 * If the caller passed us bhs, they should have come
+		 * from a previous readahead call to this function.  Thus,
+		 * they should have the right b_blocknr.
+		 */
+		for (i = 0; i < count; i++) {
+			if (!bhs[done + i])
+				continue;
+			BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
+		}
+
+		rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
+				       flags, validate);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+		done += count;
+	}
+
+out:
+	mlog_exit(rc);
+	return rc;
+}
+
+
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1c4aa8b06f34..b7dd9731b462 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 			     u32 *p_cluster, u32 *num_clusters,
 			     struct ocfs2_extent_list *el);
 
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+			   struct buffer_head *bhs[], int flags,
+			   int (*validate)(struct super_block *sb,
+					   struct buffer_head *bh));
+static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
+					struct buffer_head **bh,
+					int (*validate)(struct super_block *sb,
+							struct buffer_head *bh))
+{
+	int status = 0;
+
+	if (bh == NULL) {
+		printk("ocfs2: bh == NULL\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
+
+bail:
+	return status;
+}
+
+
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e2570a3bc2b2..a5887df2cd8a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -35,6 +35,7 @@
 #include <linux/mount.h>
 #include <linux/writeback.h>
 #include <linux/falloc.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -56,6 +57,8 @@
 #include "suballoc.h"
 #include "super.h"
 #include "xattr.h"
+#include "acl.h"
+#include "quota.h"
 
 #include "buffer_head_io.h"
 
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -303,9 +306,9 @@ bail:
 	return status;
 }
 
-static int ocfs2_simple_size_update(struct inode *inode,
-				    struct buffer_head *di_bh,
-				    u64 new_i_size)
+int ocfs2_simple_size_update(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
 {
 	int ret;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_commit;
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode,
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		   (unsigned long long)new_i_size);
 
+	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
+	 * already validated it */
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
 
 	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 			"Inode %llu, inode i_size = %lld != di "
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
+	int did_quota = 0;
 
 	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	 */
 	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+	status = ocfs2_read_inode_block(inode, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
-
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		status = -EIO;
-		goto leave;
-	}
 
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -585,11 +580,18 @@ restart_all:
 	}
 
 restarted_transaction:
+	if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
+	    clusters_to_add))) {
+		status = -EDQUOT;
+		goto leave;
+	}
+	did_quota = 1;
+
 	/* reserve a write to the file entry early on - that we if we
 	 * run out of credits in the allocation path, we can still
 	 * update i_size. */
-	status = ocfs2_journal_access(handle, inode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -622,6 +624,10 @@ restarted_transaction:
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	/* Release unused quota reservation */
+	vfs_dq_free_space(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+	did_quota = 0;
 
 	if (why != RESTART_NONE && clusters_to_add) {
 		if (why == RESTART_META) {
@@ -654,6 +660,9 @@ restarted_transaction:
 	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 
 leave:
+	if (status < 0 && did_quota)
+		vfs_dq_free_space(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 	if (handle) {
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	struct buffer_head *bh = NULL;
 	handle_t *handle = NULL;
+	int locked[MAXQUOTAS] = {0, 0};
+	int credits, qtype;
+	struct ocfs2_mem_dqinfo *oinfo;
 
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 	           dentry->d_name.len, dentry->d_name.name);
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto bail_unlock;
+	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+		credits = OCFS2_INODE_UPDATE_CREDITS;
+		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
+		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+			oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0)
+				goto bail_unlock;
+			credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+				ocfs2_calc_qdel_credits(sb, USRQUOTA);
+			locked[USRQUOTA] = 1;
+		}
+		if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+			oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0)
+				goto bail_unlock;
+			credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
+				   ocfs2_calc_qdel_credits(sb, GRPQUOTA);
+			locked[GRPQUOTA] = 1;
+		}
+		handle = ocfs2_start_trans(osb, credits);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_unlock;
+		}
+		status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+		if (status < 0)
+			goto bail_commit;
+	} else {
+		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_unlock;
+		}
 	}
 
 	/*
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
 	ocfs2_commit_trans(osb, handle);
 bail_unlock:
+	for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+		if (!locked[qtype])
+			continue;
+		oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
+		ocfs2_unlock_global_qf(oinfo, 1);
+	}
 	ocfs2_inode_unlock(inode, 1);
 bail_unlock_rw:
 	if (size_change)
@@ -989,6 +1043,12 @@ bail_unlock_rw:
 bail:
 	brelse(bh);
 
+	if (!status && attr->ia_valid & ATTR_MODE) {
+		status = ocfs2_acl_chmod(inode);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
 	mlog_exit(status);
 	return status;
 }
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask)
 		goto out;
 	}
 
-	ret = generic_permission(inode, mask, NULL);
+	ret = generic_permission(inode, mask, ocfs2_check_acl);
 
 	ocfs2_inode_unlock(inode, 0);
 out:
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_trans;
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 {
 	int ret;
 	struct buffer_head *bh = NULL;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
+	ret = ocfs2_read_inode_block(inode, &bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
 	struct buffer_head *di_bh = NULL;
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
-				       &di_bh);
+		ret = ocfs2_read_inode_block(inode, &di_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1226,83 +1284,6 @@ out:
 	return ret;
 }
 
-static int __ocfs2_remove_inode_range(struct inode *inode,
-				      struct buffer_head *di_bh,
-				      u32 cpos, u32 phys_cpos, u32 len,
-				      struct ocfs2_cached_dealloc_ctxt *dealloc)
-{
-	int ret;
-	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct inode *tl_inode = osb->osb_tl_inode;
-	handle_t *handle;
-	struct ocfs2_alloc_context *meta_ac = NULL;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	struct ocfs2_extent_tree et;
-
-	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
-
-	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
-	if (ret) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	mutex_lock(&tl_inode->i_mutex);
-
-	if (ocfs2_truncate_log_needs_flush(osb)) {
-		ret = __ocfs2_flush_truncate_log(osb);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
-				  dealloc);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	OCFS2_I(inode)->ip_clusters -= len;
-	di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
-
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
-	if (ret)
-		mlog_errno(ret);
-
-out_commit:
-	ocfs2_commit_trans(osb, handle);
-out:
-	mutex_unlock(&tl_inode->i_mutex);
-
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
-
-	return ret;
-}
-
 /*
  * Truncate a byte range, avoiding pages within partial clusters. This
  * preserves those pages for the zeroing code to write to.
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct address_space *mapping = inode->i_mapping;
+	struct ocfs2_extent_tree et;
 
+	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
 	if (byte_len == 0)
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 
 		/* Only do work for non-holes */
 		if (phys_cpos != 0) {
-			ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
-							 phys_cpos, alloc_size,
-							 &dealloc);
+			ret = ocfs2_remove_btree_range(inode, &et, cpos,
+						       phys_cpos, alloc_size,
+						       &dealloc);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
 			    struct ocfs2_space_resv *sr)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
 	    !ocfs2_writes_unwritten_extents(osb))
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e92382cbca5f..172f9fbc9fc7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 			 struct ocfs2_alloc_context *data_ac,
 			 struct ocfs2_alloc_context *meta_ac,
 			 enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_simple_size_update(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 			  u64 zero_to);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7aa00d511874..229e707bc050 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
 
 #include <asm/byteorder.h>
 
@@ -37,6 +38,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 	return 0;
 }
 
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
-		     	 int create_ino)
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			  int create_ino)
 {
 	struct super_block *sb;
 	struct ocfs2_super *osb;
-	int status = -EINVAL;
 	int use_plocks = 1;
 
 	mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	    ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
 		use_plocks = 0;
 
-	/* this means that read_inode cannot create a superblock inode
-	 * today.  change if needed. */
-	if (!OCFS2_IS_VALID_DINODE(fe) ||
-	    !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
-		mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, "
-		     "signature = %.*s, flags = 0x%x\n",
-		     inode->i_ino,
-		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-		     fe->i_signature, le32_to_cpu(fe->i_flags));
-		goto bail;
-	}
+	/*
+	 * These have all been checked by ocfs2_read_inode_block() or set
+	 * by ocfs2_mknod_locked(), so a failure is a code bug.
+	 */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));  /* This means that read_inode
+						cannot create a superblock
+						inode today.  change if
+						that is needed. */
+	BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
+	BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
 
-	if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
-		mlog(ML_ERROR, "file entry generation does not match "
-		     "superblock! osb->fs_generation=%x, "
-		     "fe->i_fs_generation=%x\n",
-		     osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
-		goto bail;
-	}
 
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);
 
-	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+		inode->i_flags |= S_NOQUOTA;
+	}
 
 	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
 		mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
+		inode->i_flags |= S_NOQUOTA;
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
 		mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
 		/* we can't actually hit this as read_inode can't
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	ocfs2_set_inode_flags(inode);
 
-	status = 0;
-bail:
-	mlog_exit(status);
-	return status;
+	mlog_exit_void();
 }
 
 static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 		}
 	}
 
-	if (can_lock)
-		status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
-					   OCFS2_BH_IGNORE_CACHE);
-	else
+	if (can_lock) {
+		status = ocfs2_read_inode_block_full(inode, &bh,
+						     OCFS2_BH_IGNORE_CACHE);
+	} else {
 		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
+		if (!status)
+			status = ocfs2_validate_inode_block(osb->sb, bh);
+	}
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 
 	status = -EINVAL;
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)args->fi_blkno, 7,
-		     fe->i_signature);
-		goto bail;
-	}
 
 	/*
 	 * This is a code bug. Right now the caller needs to
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 
 	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
 	    S_ISBLK(le16_to_cpu(fe->i_mode)))
-    		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
 
-	if (ocfs2_populate_inode(inode, fe, 0) < 0)
-		goto bail;
+	ocfs2_populate_inode(inode, fe, 0);
 
 	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
 
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 			goto out;
 		}
 
-		status = ocfs2_journal_access(handle, inode, fe_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, inode, fe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto out;
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode,
 		goto bail;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS);
+	handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
+					ocfs2_quota_trans_credits(inode->i_sb));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 
 	/* set the inodes dtime */
-	status = ocfs2_journal_access(handle, inode, di_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, di_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail_commit;
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 
 	ocfs2_remove_from_cache(inode, di_bh);
+	vfs_dq_free_inode(inode);
 
 	status = ocfs2_free_dinode(handle, inode_alloc_inode,
 				   inode_alloc_bh, di);
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode)
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
-	if (is_bad_inode(inode)) {
+	/* When we fail in read_inode() we mark inode as bad. The second test
+	 * catches the case when inode allocation fails before allocating
+	 * a block for inode. */
+	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
 		mlog(0, "Skipping delete of bad inode\n");
 		goto bail;
 	}
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	mlog_entry("(inode %llu)\n",
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	status = ocfs2_journal_access(handle, inode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode,
 
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 }
+
+int ocfs2_validate_inode_block(struct super_block *sb,
+			       struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+	mlog(0, "Validating dinode %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+	if (rc) {
+		mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		goto bail;
+	}
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	rc = -EINVAL;
+
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    di->i_signature);
+		goto bail;
+	}
+
+	if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(di->i_blkno));
+		goto bail;
+	}
+
+	if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		ocfs2_error(sb,
+			    "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+			    (unsigned long long)bh->b_blocknr);
+		goto bail;
+	}
+
+	if (le32_to_cpu(di->i_fs_generation) !=
+	    OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Invalid dinode #%llu: fs_generation is %u\n",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(di->i_fs_generation));
+		goto bail;
+	}
+
+	rc = 0;
+
+bail:
+	return rc;
+}
+
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+				int flags)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
+			       flags, ocfs2_validate_inode_block);
+
+	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
+{
+	return ocfs2_read_inode_block_full(inode, bh, 0);
+}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 2f37af9bcc4a..eb3c302b38d3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
 			 int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
-			 int create_ino);
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			  int create_ino);
 void ocfs2_read_inode(struct inode *inode);
 void ocfs2_read_inode2(struct inode *inode, void *opaque);
 ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 			   struct buffer_head *bh);
 int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
 int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+struct buffer_head *ocfs2_bread(struct inode *inode,
+				int block, int *err, int reada);
 
 void ocfs2_set_inode_flags(struct inode *inode);
 void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
 	return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
 }
 
+/* Validate that a bh contains a valid inode */
+int ocfs2_validate_inode_block(struct super_block *sb,
+			       struct buffer_head *bh);
+/*
+ * Read an inode block into *bh.  If *bh is NULL, a bh will be allocated.
+ * This is a cached read.  The inode will be validated with
+ * ocfs2_validate_inode_block().
+ */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
+/* The same, but can be passed OCFS2_BH_* flags */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+				int flags);
 #endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 99fe9d584f3c..57d7d25a2b9a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -45,6 +46,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "sysfile.h"
+#include "quota.h"
 
 #include "buffer_head_io.h"
 
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock);
 
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-			      int node_num);
+			      int node_num, int slot_num);
 static int __ocfs2_recovery_thread(void *arg);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 				      int dirty, int replayed);
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 				 int slot);
 static int ocfs2_commit_thread(void *arg);
 
+static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+	return __ocfs2_wait_on_mount(osb, 0);
+}
+
+static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
+{
+	return __ocfs2_wait_on_mount(osb, 1);
+}
+
+
 
 /*
  * The recovery_list is a simple linked list of node numbers to recover.
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
 	BUG_ON(max_buffs <= 0);
 
-	/* JBD might support this, but our journalling code doesn't yet. */
-	if (journal_current_handle()) {
-		mlog(ML_ERROR, "Recursive transaction attempted!\n");
-		BUG();
-	}
+	/* Nested transaction? Just return the handle... */
+	if (journal_current_handle())
+		return jbd2_journal_start(journal, max_buffs);
 
 	down_read(&osb->journal->j_trans_barrier);
 
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 int ocfs2_commit_trans(struct ocfs2_super *osb,
 		       handle_t *handle)
 {
-	int ret;
+	int ret, nested;
 	struct ocfs2_journal *journal = osb->journal;
 
 	BUG_ON(!handle);
 
+	nested = handle->h_ref > 1;
 	ret = jbd2_journal_stop(handle);
 	if (ret < 0)
 		mlog_errno(ret);
 
-	up_read(&journal->j_trans_barrier);
+	if (!nested)
+		up_read(&journal->j_trans_barrier);
 
 	return ret;
 }
@@ -357,10 +370,137 @@ bail:
 	return status;
 }
 
-int ocfs2_journal_access(handle_t *handle,
-			 struct inode *inode,
-			 struct buffer_head *bh,
-			 int type)
+struct ocfs2_triggers {
+	struct jbd2_buffer_trigger_type	ot_triggers;
+	int				ot_offset;
+};
+
+static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
+{
+	return container_of(triggers, struct ocfs2_triggers, ot_triggers);
+}
+
+static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, data + ot->ot_offset);
+}
+
+/*
+ * Quota blocks have their own trigger because the struct ocfs2_block_check
+ * offset depends on the blocksize.
+ */
+static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_disk_dqtrailer *dqt =
+		ocfs2_block_dqtrailer(size, data);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, &dqt->dq_check);
+}
+
+/*
+ * Directory blocks also have their own trigger because the
+ * struct ocfs2_block_check offset depends on the blocksize.
+ */
+static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_dir_trailer_from_size(size, data);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, &trailer->db_check);
+}
+
+static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
+				struct buffer_head *bh)
+{
+	mlog(ML_ERROR,
+	     "ocfs2_abort_trigger called by JBD2.  bh = 0x%lx, "
+	     "bh->b_blocknr = %llu\n",
+	     (unsigned long)bh,
+	     (unsigned long long)bh->b_blocknr);
+
+	/* We aren't guaranteed to have the superblock here - but if we
+	 * don't, it'll just crash. */
+	ocfs2_error(bh->b_assoc_map->host->i_sb,
+		    "JBD2 has aborted our journal, ocfs2 cannot continue\n");
+}
+
+static struct ocfs2_triggers di_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dinode, i_check),
+};
+
+static struct ocfs2_triggers eb_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_extent_block, h_check),
+};
+
+static struct ocfs2_triggers gd_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_group_desc, bg_check),
+};
+
+static struct ocfs2_triggers db_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_db_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+};
+
+static struct ocfs2_triggers xb_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_xattr_block, xb_check),
+};
+
+static struct ocfs2_triggers dq_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_dq_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+};
+
+static int __ocfs2_journal_access(handle_t *handle,
+				  struct inode *inode,
+				  struct buffer_head *bh,
+				  struct ocfs2_triggers *triggers,
+				  int type)
 {
 	int status;
 
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle,
 		status = -EINVAL;
 		mlog(ML_ERROR, "Uknown access type!\n");
 	}
+	if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
+		jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
 	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 
 	if (status < 0)
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle,
 	return status;
 }
 
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+			       struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
+				      type);
+}
+
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+			 struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, NULL, type);
+}
+
 int ocfs2_journal_dirty(handle_t *handle,
 			struct buffer_head *bh)
 {
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle,
 	return status;
 }
 
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int ocfs2_journal_dirty_data(handle_t *handle,
-			     struct buffer_head *bh)
-{
-	int err = journal_dirty_data(handle, bh);
-	if (err)
-		mlog_errno(err);
-	/* TODO: When we can handle it, abort the handle and go RO on
-	 * error here. */
-
-	return err;
-}
-#endif
-
 #define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
 
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 	mlog_entry_void();
 
 	fe = (struct ocfs2_dinode *)bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		/* This is called from startup/shutdown which will
-		 * handle the errors in a specific manner, so no need
-		 * to call ocfs2_error() here. */
-		mlog(ML_ERROR, "Journal dinode %llu  has invalid "
-		     "signature: %.*s",
-		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-		     fe->i_signature);
-		status = -EIO;
-		goto out;
-	}
+
+	/* The journal bh on the osb always comes from ocfs2_journal_init()
+	 * and was validated there inside ocfs2_inode_lock_full().  It's a
+	 * code bug if we mess it up. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 
 	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
 	if (dirty)
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 	if (replayed)
 		ocfs2_bump_recovery_generation(fe);
 
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
 	status = ocfs2_write_block(osb, bh, journal->j_inode);
 	if (status < 0)
 		mlog_errno(status);
 
-out:
 	mlog_exit(status);
 	return status;
 }
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item {
 	int			lri_slot;
 	struct ocfs2_dinode	*lri_la_dinode;
 	struct ocfs2_dinode	*lri_tl_dinode;
+	struct ocfs2_quota_recovery *lri_qrec;
 };
 
 /* Does the second half of the recovery process. By this point, the
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 	struct ocfs2_super *osb = journal->j_osb;
 	struct ocfs2_dinode *la_dinode, *tl_dinode;
 	struct ocfs2_la_recovery_item *item, *n;
+	struct ocfs2_quota_recovery *qrec;
 	LIST_HEAD(tmp_la_list);
 
 	mlog_entry_void();
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
 
 		mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
 
+		ocfs2_wait_on_quotas(osb);
+
 		la_dinode = item->lri_la_dinode;
 		if (la_dinode) {
 			mlog(0, "Clean up local alloc %llu\n",
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		if (ret < 0)
 			mlog_errno(ret);
 
+		qrec = item->lri_qrec;
+		if (qrec) {
+			mlog(0, "Recovering quota files");
+			ret = ocfs2_finish_quota_recovery(osb, qrec,
+							  item->lri_slot);
+			if (ret < 0)
+				mlog_errno(ret);
+			/* Recovery info is already freed now */
+		}
+
 		kfree(item);
 	}
 
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 					    int slot_num,
 					    struct ocfs2_dinode *la_dinode,
-					    struct ocfs2_dinode *tl_dinode)
+					    struct ocfs2_dinode *tl_dinode,
+					    struct ocfs2_quota_recovery *qrec)
 {
 	struct ocfs2_la_recovery_item *item;
 
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 		if (tl_dinode)
 			kfree(tl_dinode);
 
+		if (qrec)
+			ocfs2_free_quota_recovery(qrec);
+
 		mlog_errno(-ENOMEM);
 		return;
 	}
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 	item->lri_la_dinode = la_dinode;
 	item->lri_slot = slot_num;
 	item->lri_tl_dinode = tl_dinode;
+	item->lri_qrec = qrec;
 
 	spin_lock(&journal->j_lock);
 	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 		ocfs2_queue_recovery_completion(journal,
 						osb->slot_num,
 						osb->local_alloc_copy,
+						NULL,
 						NULL);
 		ocfs2_schedule_truncate_log_flush(osb, 0);
 
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 	}
 }
 
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
+{
+	if (osb->quota_rec) {
+		ocfs2_queue_recovery_completion(osb->journal,
+						osb->slot_num,
+						NULL,
+						NULL,
+						osb->quota_rec);
+		osb->quota_rec = NULL;
+	}
+}
+
 static int __ocfs2_recovery_thread(void *arg)
 {
-	int status, node_num;
+	int status, node_num, slot_num;
 	struct ocfs2_super *osb = arg;
 	struct ocfs2_recovery_map *rm = osb->recovery_map;
+	int *rm_quota = NULL;
+	int rm_quota_used = 0, i;
+	struct ocfs2_quota_recovery *qrec;
 
 	mlog_entry_void();
 
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg)
 		goto bail;
 	}
 
+	rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
+	if (!rm_quota) {
+		status = -ENOMEM;
+		goto bail;
+	}
 restart:
 	status = ocfs2_super_lock(osb, 1);
 	if (status < 0) {
@@ -1032,8 +1242,28 @@ restart:
 		 * clear it until ocfs2_recover_node() has succeeded. */
 		node_num = rm->rm_entries[0];
 		spin_unlock(&osb->osb_lock);
-
-		status = ocfs2_recover_node(osb, node_num);
+		mlog(0, "checking node %d\n", node_num);
+		slot_num = ocfs2_node_num_to_slot(osb, node_num);
+		if (slot_num == -ENOENT) {
+			status = 0;
+			mlog(0, "no slot for this node, so no recovery"
+			     "required.\n");
+			goto skip_recovery;
+		}
+		mlog(0, "node %d was using slot %d\n", node_num, slot_num);
+
+		/* It is a bit subtle with quota recovery. We cannot do it
+		 * immediately because we have to obtain cluster locks from
+		 * quota files and we also don't want to just skip it because
+		 * then quota usage would be out of sync until some node takes
+		 * the slot. So we remember which nodes need quota recovery
+		 * and when everything else is done, we recover quotas. */
+		for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
+		if (i == rm_quota_used)
+			rm_quota[rm_quota_used++] = slot_num;
+
+		status = ocfs2_recover_node(osb, node_num, slot_num);
+skip_recovery:
 		if (!status) {
 			ocfs2_recovery_map_clear(osb, node_num);
 		} else {
@@ -1055,13 +1285,27 @@ restart:
 	if (status < 0)
 		mlog_errno(status);
 
+	/* Now it is right time to recover quotas... We have to do this under
+	 * superblock lock so that noone can start using the slot (and crash)
+	 * before we recover it */
+	for (i = 0; i < rm_quota_used; i++) {
+		qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+		if (IS_ERR(qrec)) {
+			status = PTR_ERR(qrec);
+			mlog_errno(status);
+			continue;
+		}
+		ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
+						NULL, NULL, qrec);
+	}
+
 	ocfs2_super_unlock(osb, 1);
 
 	/* We always run recovery on our own orphan dir - the dead
 	 * node(s) may have disallowd a previos inode delete. Re-processing
 	 * is therefore required. */
 	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-					NULL);
+					NULL, NULL);
 
 bail:
 	mutex_lock(&osb->recovery_lock);
@@ -1076,6 +1320,9 @@ bail:
 
 	mutex_unlock(&osb->recovery_lock);
 
+	if (rm_quota)
+		kfree(rm_quota);
+
 	mlog_exit(status);
 	/* no one is callint kthread_stop() for us so the kthread() api
 	 * requires that we call do_exit().  And it isn't exported, but
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
 	}
 	SET_INODE_JOURNAL(inode);
 
-	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
-				   OCFS2_BH_IGNORE_CACHE);
+	status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	osb->slot_recovery_generations[slot_num] =
 					ocfs2_get_recovery_generation(fe);
 
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
 	status = ocfs2_write_block(osb, bh, inode);
 	if (status < 0)
 		mlog_errno(status);
@@ -1304,31 +1551,19 @@ done:
  * far less concerning.
  */
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-			      int node_num)
+			      int node_num, int slot_num)
 {
 	int status = 0;
-	int slot_num;
 	struct ocfs2_dinode *la_copy = NULL;
 	struct ocfs2_dinode *tl_copy = NULL;
 
-	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
-		   node_num, osb->node_num);
-
-	mlog(0, "checking node %d\n", node_num);
+	mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
+		   node_num, slot_num, osb->node_num);
 
 	/* Should not ever be called to recover ourselves -- in that
 	 * case we should've called ocfs2_journal_load instead. */
 	BUG_ON(osb->node_num == node_num);
 
-	slot_num = ocfs2_node_num_to_slot(osb, node_num);
-	if (slot_num == -ENOENT) {
-		status = 0;
-		mlog(0, "no slot for this node, so no recovery required.\n");
-		goto done;
-	}
-
-	mlog(0, "node %d was using slot %d\n", node_num, slot_num);
-
 	status = ocfs2_replay_journal(osb, node_num, slot_num);
 	if (status < 0) {
 		if (status == -EBUSY) {
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
 	/* This will kfree the memory pointed to by la_copy and tl_copy */
 	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
-					tl_copy);
+					tl_copy, NULL);
 
 	status = 0;
 done:
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 	return ret;
 }
 
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
 {
 	/* This check is good because ocfs2 will wait on our recovery
 	 * thread before changing it to something other than MOUNTED
 	 * or DISABLED. */
 	wait_event(osb->osb_mount_event,
-		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+		  (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
+		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
 		   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
 
 	/* If there's an error on mount, then we may never get to the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d4d14e9a3cea..3c3532e1307c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,12 +27,7 @@
 #define OCFS2_JOURNAL_H
 
 #include <linux/fs.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
+#include <linux/jbd2.h>
 
 enum ocfs2_journal_state {
 	OCFS2_JOURNAL_FREE = 0,
@@ -173,6 +168,7 @@ void   ocfs2_recovery_thread(struct ocfs2_super *osb,
 			     int node_num);
 int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
 void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
 
 static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
 {
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
  *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
  *                          commit the handle to disk in the process, but will
  *                          not release any locks taken during the transaction.
- *  ocfs2_journal_access   - Notify the handle that we want to journal this
+ *  ocfs2_journal_access* - Notify the handle that we want to journal this
  *                          buffer. Will have to call ocfs2_journal_dirty once
  *                          we've actually dirtied it. Type is one of . or .
+ *                          Always call the specific flavor of
+ *                          ocfs2_journal_access_*() unless you intend to
+ *                          manage the checksum by hand.
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
  *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
  *                           the current handle commits.
@@ -248,10 +247,29 @@ int			     ocfs2_extend_trans(handle_t *handle, int nblocks);
 #define OCFS2_JOURNAL_ACCESS_WRITE  1
 #define OCFS2_JOURNAL_ACCESS_UNDO   2
 
-int                  ocfs2_journal_access(handle_t *handle,
-					  struct inode *inode,
-					  struct buffer_head *bh,
-					  int type);
+
+/* ocfs2_inode */
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_extent_block */
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_group_desc */
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_xattr_block */
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* quota blocks */
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* dirblock */
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* Anything that has no ecc */
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+			 struct buffer_head *bh, int type);
+
 /*
  * A word about the journal_access/journal_dirty "dance". It is
  * entirely legal to journal_access a buffer more than once (as long
@@ -273,10 +291,6 @@ int                  ocfs2_journal_access(handle_t *handle,
  */
 int                  ocfs2_journal_dirty(handle_t *handle,
 					 struct buffer_head *bh);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int                  ocfs2_journal_dirty_data(handle_t *handle,
-					      struct buffer_head *bh);
-#endif
 
 /*
  *  Credit Macros:
@@ -293,6 +307,37 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* extended attribute block update */
 #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
 
+/* global quotafile inode update, data block */
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * The two writes below can accidentally see global info dirty due
+ * to set_info() quotactl so make them prepared for the writes.
+ */
+/* quota data block, global info */
+/* Write to local quota file */
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+
+/* global quota data block, local quota data block, global quota inode,
+ * global quota info */
+#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+
+static inline int ocfs2_quota_trans_credits(struct super_block *sb)
+{
+	int credits = 0;
+
+	if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
+		credits += OCFS2_QWRITE_CREDITS;
+	if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
+		credits += OCFS2_QWRITE_CREDITS;
+	return credits;
+}
+
+/* Number of credits needed for removing quota structure from file */
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
+/* Number of credits needed for initialization of new quota structure */
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
+
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
 
@@ -303,8 +348,11 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
  * prev. group desc. if we relink. */
 #define OCFS2_SUBALLOC_ALLOC (3)
 
-#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC		\
-					 + OCFS2_INODE_UPDATE_CREDITS)
+static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
+{
+	return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 /* dinode + group descriptor update. We don't relink on free yet. */
 #define OCFS2_SUBALLOC_FREE  (2)
@@ -313,16 +361,23 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE 		      \
 					 + OCFS2_TRUNCATE_LOG_UPDATE)
 
-#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
+static inline int ocfs2_remove_extent_credits(struct super_block *sb)
+{
+	return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
  * bitmap block for the new bit) */
 #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
 
 /* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks */
-#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC                         \
-			    + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+ * group descriptor + mkdir/symlink blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb)
+{
+	return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 /* local alloc metadata change + main bitmap updates */
 #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
@@ -332,13 +387,21 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
  * for the dinode, one for the new block. */
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 
-/* file update (nlink, etc) + directory mtime/ctime + dir entry block */
-#define OCFS2_LINK_CREDITS  (2*OCFS2_INODE_UPDATE_CREDITS + 1)
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
+ * update on dir */
+static inline int ocfs2_link_credits(struct super_block *sb)
+{
+	return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
  * dir inode link */
-#define OCFS2_UNLINK_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 1             \
-			      + OCFS2_LINK_CREDITS)
+static inline int ocfs2_unlink_credits(struct super_block *sb)
+{
+	/* The quota update from ocfs2_link_credits is unused here... */
+	return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+}
 
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
  * inode alloc group descriptor */
@@ -347,8 +410,10 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* dinode update, old dir dinode update, new dir dinode update, old
  * dir dir entry, new dir dir entry, dir entry update for renaming
  * directory + target unlink */
-#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
-			     + OCFS2_UNLINK_CREDITS)
+static inline int ocfs2_rename_credits(struct super_block *sb)
+{
+	return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+}
 
 /* global bitmap dinode, group desc., relinked group,
  * suballocator dinode, group desc., relinked group,
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 	 * credit for the dinode there. */
 	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
 
-	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
+	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
+	       ocfs2_quota_trans_credits(sb);
 }
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
-	int blocks = OCFS2_MKNOD_CREDITS;
+	int blocks = ocfs2_mknod_credits(sb);
 
 	/* links can be longer than one block so we may update many
 	 * within our single allocated extent. */
 	blocks += ocfs2_clusters_to_blocks(sb, 1);
 
-	return blocks;
+	return blocks + ocfs2_quota_trans_credits(sb);
 }
 
 static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 	/* update to the truncate log. */
 	credits += OCFS2_TRUNCATE_LOG_UPDATE;
 
+	credits += ocfs2_quota_trans_credits(sb);
+
 	return credits;
 }
 
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 687b28713c32..ec70cdbe77fc 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -36,6 +36,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "journal.h"
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
-				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+	status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+					     OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	}
 	memcpy(alloc_copy, alloc, bh->b_size);
 
-	status = ocfs2_journal_access(handle, local_alloc_inode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_commit;
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 
 	mutex_lock(&inode->i_mutex);
 
-	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
-				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+	status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+					     OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
 	ocfs2_clear_local_alloc(alloc);
 
+	ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
 	status = ocfs2_write_block(osb, alloc_bh, inode);
 	if (status < 0)
 		mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	 * delete bits from it! */
 	*num_bits = bits_wanted;
 
-	status = ocfs2_journal_access(handle, local_alloc_inode,
-				      osb->local_alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, local_alloc_inode,
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 	}
 	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
 
-	status = ocfs2_journal_access(handle, local_alloc_inode,
-				      osb->local_alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, local_alloc_inode,
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2545e7402efe..084aba86c3b2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -61,17 +62,18 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "xattr.h"
+#include "acl.h"
 
 #include "buffer_head_io.h"
 
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
-			      struct dentry *dentry, int mode,
+			      struct inode *inode,
+			      struct dentry *dentry,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
 			      handle_t *handle,
-			      struct inode **ret_inode,
 			      struct ocfs2_alloc_context *inode_ac);
 
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +188,35 @@ bail:
 	return ret;
 }
 
+static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
+{
+	struct inode *inode;
+
+	inode = new_inode(dir->i_sb);
+	if (!inode) {
+		mlog(ML_ERROR, "new_inode failed!\n");
+		return NULL;
+	}
+
+	/* populate as many fields early on as possible - many of
+	 * these are used by the support functions here and in
+	 * callers. */
+	if (S_ISDIR(mode))
+		inode->i_nlink = 2;
+	else
+		inode->i_nlink = 1;
+	inode->i_uid = current_fsuid();
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current_fsgid();
+	inode->i_mode = mode;
+	vfs_dq_init(inode);
+	return inode;
+}
+
 static int ocfs2_mknod(struct inode *dir,
 		       struct dentry *dentry,
 		       int mode,
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir,
 	struct inode *inode = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *xattr_ac = NULL;
+	int want_clusters = 0;
+	int xattr_credits = 0;
+	struct ocfs2_security_xattr_info si = {
+		.enable = 1,
+	};
+	int did_quota_inode = 0;
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
-	/* Reserve a cluster if creating an extent based directory. */
-	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
-		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
-		if (status < 0) {
-			if (status != -ENOSPC)
-				mlog_errno(status);
+	inode = ocfs2_get_init_inode(dir, mode);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* get security xattr */
+	status = ocfs2_init_security_get(inode, dir, &si);
+	if (status) {
+		if (status == -EOPNOTSUPP)
+			si.enable = 0;
+		else {
+			mlog_errno(status);
 			goto leave;
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS);
+	/* calculate meta data/clusters for setting security and acl xattr */
+	status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
+					&si, &want_clusters,
+					&xattr_credits, &xattr_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* Reserve a cluster if creating an extent based directory. */
+	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+		want_clusters += 1;
+
+	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
+				   xattr_credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
+	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
+	 * to be called. */
+	if (sb_any_quota_active(osb->sb) &&
+	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+		status = -EDQUOT;
+		goto leave;
+	}
+	did_quota_inode = 1;
+
 	/* do the real work now. */
-	status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+	status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
 				    &new_fe_bh, parent_fe_bh, handle,
-				    &inode, inode_ac);
+				    inode_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
 			goto leave;
 		}
 
-		status = ocfs2_journal_access(handle, dir, parent_fe_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir,
 		inc_nlink(dir);
 	}
 
+	status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+				xattr_ac, data_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (si.enable) {
+		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+						 xattr_ac, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
 				 de_bh);
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
 	d_instantiate(dentry, inode);
 	status = 0;
 leave:
+	if (status < 0 && did_quota_inode)
+		vfs_dq_free_inode(inode);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
@@ -331,9 +425,13 @@ leave:
 	brelse(new_fe_bh);
 	brelse(de_bh);
 	brelse(parent_fe_bh);
+	kfree(si.name);
+	kfree(si.value);
 
-	if ((status < 0) && inode)
+	if ((status < 0) && inode) {
+		clear_nlink(inode);
 		iput(inode);
+	}
 
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
@@ -341,6 +439,9 @@ leave:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 
+	if (xattr_ac)
+		ocfs2_free_alloc_context(xattr_ac);
+
 	mlog_exit(status);
 
 	return status;
@@ -348,12 +449,12 @@ leave:
 
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
-			      struct dentry *dentry, int mode,
+			      struct inode *inode,
+			      struct dentry *dentry,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
 			      handle_t *handle,
-			      struct inode **ret_inode,
 			      struct ocfs2_alloc_context *inode_ac)
 {
 	int status = 0;
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	struct ocfs2_extent_list *fel;
 	u64 fe_blkno = 0;
 	u16 suballoc_bit;
-	struct inode *inode = NULL;
 
-	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
-		   (unsigned long)dev, dentry->d_name.len,
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
+		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
 		   dentry->d_name.name);
 
 	*new_fe_bh = NULL;
-	*ret_inode = NULL;
 
 	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
 				       &fe_blkno);
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	inode = new_inode(dir->i_sb);
-	if (!inode) {
-		status = -ENOMEM;
-		mlog(ML_ERROR, "new_inode failed!\n");
-		goto leave;
-	}
-
 	/* populate as many fields early on as possible - many of
 	 * these are used by the support functions here and in
 	 * callers. */
 	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
 	OCFS2_I(inode)->ip_blkno = fe_blkno;
-	if (S_ISDIR(mode))
-		inode->i_nlink = 2;
-	else
-		inode->i_nlink = 1;
-	inode->i_mode = mode;
 	spin_lock(&osb->osb_lock);
 	inode->i_generation = osb->s_next_generation++;
 	spin_unlock(&osb->osb_lock);
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	}
 	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
 
-	status = ocfs2_journal_access(handle, inode, *new_fe_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_blkno = cpu_to_le64(fe_blkno);
 	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
 	fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
-	fe->i_uid = cpu_to_le32(current_fsuid());
-	if (dir->i_mode & S_ISGID) {
-		fe->i_gid = cpu_to_le32(dir->i_gid);
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		fe->i_gid = cpu_to_le32(current_fsgid());
-	fe->i_mode = cpu_to_le16(mode);
-	if (S_ISCHR(mode) || S_ISBLK(mode))
+	fe->i_uid = cpu_to_le32(inode->i_uid);
+	fe->i_gid = cpu_to_le32(inode->i_gid);
+	fe->i_mode = cpu_to_le16(inode->i_mode);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
 
 	fe->i_last_eb_blk = 0;
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	/*
 	 * If supported, directories start with inline data.
 	 */
-	if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) {
+	if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
 		u16 feat = le16_to_cpu(fe->i_dyn_features);
 
 		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
-		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
-		     "i_blkno=%llu, i_ino=%lu\n",
-		     (unsigned long long)(*new_fe_bh)->b_blocknr,
-		     (unsigned long long)le64_to_cpu(fe->i_blkno),
-		     inode->i_ino);
-		BUG();
-	}
-
+	ocfs2_populate_inode(inode, fe, 1);
 	ocfs2_inode_set_new(osb, inode);
 	if (!ocfs2_mount_local(osb)) {
 		status = ocfs2_create_new_inode_locks(inode);
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	status = 0; /* error in ocfs2_create_new_inode_locks is not
 		     * critical */
 
-	*ret_inode = inode;
 leave:
 	if (status < 0) {
 		if (*new_fe_bh) {
 			brelse(*new_fe_bh);
 			*new_fe_bh = NULL;
 		}
-		if (inode) {
-			clear_nlink(inode);
-			iput(inode);
-		}
 	}
 
 	mlog_exit(status);
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out_unlock_inode;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
 		handle = NULL;
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out_unlock_inode;
 	}
 
-	err = ocfs2_journal_access(handle, inode, fe_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	err = ocfs2_journal_access_di(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (err < 0) {
 		mlog_errno(err);
 		goto out_commit;
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
 		goto leave;
 	}
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
 				goto bail;
 			}
 		}
-		status = ocfs2_journal_access(handle, new_inode, newfe_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
 	old_inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(old_inode);
 
-	status = ocfs2_journal_access(handle, old_inode, old_inode_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status >= 0) {
 		old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
 
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
 			     (int)old_dir_nlink, old_dir->i_nlink);
 		} else {
 			struct ocfs2_dinode *fe;
-			status = ocfs2_journal_access(handle, old_dir,
-						      old_dir_bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
+			status = ocfs2_journal_access_di(handle, old_dir,
+							 old_dir_bh,
+							 OCFS2_JOURNAL_ACCESS_WRITE);
 			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
 			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
 			status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir,
 	handle_t *handle = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *xattr_ac = NULL;
+	int want_clusters = 0;
+	int xattr_credits = 0;
+	struct ocfs2_security_xattr_info si = {
+		.enable = 1,
+	};
+	int did_quota = 0, did_quota_inode = 0;
 
 	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
 		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 	}
 
-	/* don't reserve bitmap space for fast symlinks. */
-	if (l > ocfs2_fast_symlink_chars(sb)) {
-		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+	inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* get security xattr */
+	status = ocfs2_init_security_get(inode, dir, &si);
+	if (status) {
+		if (status == -EOPNOTSUPP)
+			si.enable = 0;
+		else {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* calculate meta data/clusters for setting security xattr */
+	if (si.enable) {
+		status = ocfs2_calc_security_init(dir, &si, &want_clusters,
+						  &xattr_credits, &xattr_ac);
 		if (status < 0) {
-			if (status != -ENOSPC)
-				mlog_errno(status);
+			mlog_errno(status);
 			goto bail;
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, credits);
+	/* don't reserve bitmap space for fast symlinks. */
+	if (l > ocfs2_fast_symlink_chars(sb))
+		want_clusters += 1;
+
+	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, credits + xattr_credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 	}
 
-	status = ocfs2_mknod_locked(osb, dir, dentry,
-				    S_IFLNK | S_IRWXUGO, 0,
-				    &new_fe_bh, parent_fe_bh, handle,
-				    &inode, inode_ac);
+	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
+	 * to be called. */
+	if (sb_any_quota_active(osb->sb) &&
+	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+		status = -EDQUOT;
+		goto bail;
+	}
+	did_quota_inode = 1;
+
+	status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+				    0, &new_fe_bh, parent_fe_bh, handle,
+				    inode_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
 		u32 offset = 0;
 
 		inode->i_op = &ocfs2_symlink_inode_operations;
+		if (vfs_dq_alloc_space_nodirty(inode,
+		    ocfs2_clusters_to_bytes(osb->sb, 1))) {
+			status = -EDQUOT;
+			goto bail;
+		}
+		did_quota = 1;
 		status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
 					      new_fe_bh,
 					      handle, data_ac, NULL,
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir,
 		}
 	}
 
+	if (si.enable) {
+		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+						 xattr_ac, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
 				 de_bh);
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
 	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 bail:
+	if (status < 0 && did_quota)
+		vfs_dq_free_space_nodirty(inode,
+					ocfs2_clusters_to_bytes(osb->sb, 1));
+	if (status < 0 && did_quota_inode)
+		vfs_dq_free_inode(inode);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
@@ -1640,12 +1772,18 @@ bail:
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
 	brelse(de_bh);
+	kfree(si.name);
+	kfree(si.value);
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
-	if ((status < 0) && inode)
+	if (xattr_ac)
+		ocfs2_free_alloc_context(xattr_ac);
+	if ((status < 0) && inode) {
+		clear_nlink(inode);
 		iput(inode);
+	}
 
 	mlog_exit(status);
 
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
-	status = ocfs2_read_block(orphan_dir_inode,
-				  OCFS2_I(orphan_dir_inode)->ip_blkno,
-				  &orphan_dir_bh);
+	status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	status = ocfs2_journal_access(handle,orphan_dir_inode,  orphan_dir_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle,orphan_dir_inode,  orphan_dir_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3fed9e3d8992..ad5c24a29edd 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -161,6 +161,7 @@ enum ocfs2_vol_state
 {
 	VOLUME_INIT = 0,
 	VOLUME_MOUNTED,
+	VOLUME_MOUNTED_QUOTAS,
 	VOLUME_DISMOUNTED,
 	VOLUME_DISABLED
 };
@@ -195,6 +196,9 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
 	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
 	OCFS2_MOUNT_INODE64 = 1 << 7,	/* Allow inode numbers > 2^32 */
+	OCFS2_MOUNT_POSIX_ACL = 1 << 8,	/* POSIX access control lists */
+	OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
+	OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
@@ -205,6 +209,7 @@ enum ocfs2_mount_options
 struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
+struct ocfs2_quota_recovery;
 struct ocfs2_super
 {
 	struct task_struct *commit_task;
@@ -286,10 +291,11 @@ struct ocfs2_super
 	char *local_alloc_debug_buf;
 #endif
 
-	/* Next two fields are for local node slot recovery during
+	/* Next three fields are for local node slot recovery during
 	 * mount. */
 	int dirty;
 	struct ocfs2_dinode *local_alloc_copy;
+	struct ocfs2_quota_recovery *quota_rec;
 
 	struct ocfs2_alloc_stats alloc_stats;
 	char dev_str[20];		/* "major,minor" of the device */
@@ -333,6 +339,10 @@ struct ocfs2_super
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
 
+/* Useful typedef for passing around journal access functions */
+typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
+					 struct buffer_head *bh, int type);
+
 static inline int ocfs2_should_order_data(struct inode *inode)
 {
 	if (!S_ISREG(inode->i_mode))
@@ -376,6 +386,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
 	return 0;
 }
 
+static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
@@ -443,39 +460,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_DINODE(ptr)					\
 	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
 
-#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)	do {			\
-	typeof(__di) ____di = (__di);					\
-	ocfs2_error((__sb), 						\
-		"Dinode # %llu has bad signature %.*s",			\
-		(unsigned long long)le64_to_cpu((____di)->i_blkno), 7, 	\
-		(____di)->i_signature);					\
-} while (0)
-
 #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
 	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
 
-#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)	do {		\
-	typeof(__eb) ____eb = (__eb);					\
-	ocfs2_error((__sb), 						\
-		"Extent Block # %llu has bad signature %.*s",		\
-		(unsigned long long)le64_to_cpu((____eb)->h_blkno), 7,	\
-		(____eb)->h_signature);					\
-} while (0)
-
 #define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
 	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
 
-#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)	do {		\
-	typeof(__gd) ____gd = (__gd);					\
-		ocfs2_error((__sb),					\
-		"Group Descriptor # %llu has bad signature %.*s",	\
-		(unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
-		(____gd)->bg_signature);				\
-} while (0)
 
 #define OCFS2_IS_VALID_XATTR_BLOCK(ptr)					\
 	(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
 
+#define OCFS2_IS_VALID_DIR_TRAILER(ptr)					\
+	(!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
+
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
 {
@@ -632,5 +629,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
 #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#define ocfs2_find_next_bit ext2_find_next_bit
 #endif  /* OCFS2_H */
 
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5e0c0d0aef7d..c7ae45aaa36c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -65,6 +65,7 @@
 #define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 #define OCFS2_XATTR_BLOCK_SIGNATURE	"XATTR01"
+#define OCFS2_DIR_TRAILER_SIGNATURE	"DIRTRL1"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -93,8 +94,11 @@
 					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
 					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
 					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
-					 | OCFS2_FEATURE_INCOMPAT_XATTR)
-#define OCFS2_FEATURE_RO_COMPAT_SUPP	OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
+					 | OCFS2_FEATURE_INCOMPAT_XATTR \
+					 | OCFS2_FEATURE_INCOMPAT_META_ECC)
+#define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
+					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
+					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
 
 /*
  * Heartbeat-only devices are missing journals and other files.  The
@@ -147,6 +151,9 @@
 /* Support for extended attributes */
 #define OCFS2_FEATURE_INCOMPAT_XATTR		0x0200
 
+/* Metadata checksum and error correction */
+#define OCFS2_FEATURE_INCOMPAT_META_ECC		0x0800
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -163,6 +170,12 @@
  */
 #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN	0x0001
 
+/*
+ * Maintain quota information for this filesystem
+ */
+#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA	0x0002
+#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA	0x0004
+
 /* The byte offset of the first backup block will be 1G.
  * The following will be 4G, 16G, 64G, 256G and 1T.
  */
@@ -192,6 +205,7 @@
 #define OCFS2_HEARTBEAT_FL	(0x00000200)	/* Heartbeat area */
 #define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
 #define OCFS2_DEALLOC_FL	(0x00000800)	/* Truncate log */
+#define OCFS2_QUOTA_FL		(0x00001000)	/* Quota file */
 
 /*
  * Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +343,17 @@ enum {
 #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
 	HEARTBEAT_SYSTEM_INODE,
 	GLOBAL_BITMAP_SYSTEM_INODE,
-#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+	USER_QUOTA_SYSTEM_INODE,
+	GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
 	ORPHAN_DIR_SYSTEM_INODE,
 	EXTENT_ALLOC_SYSTEM_INODE,
 	INODE_ALLOC_SYSTEM_INODE,
 	JOURNAL_SYSTEM_INODE,
 	LOCAL_ALLOC_SYSTEM_INODE,
 	TRUNCATE_LOG_SYSTEM_INODE,
+	LOCAL_USER_QUOTA_SYSTEM_INODE,
+	LOCAL_GROUP_QUOTA_SYSTEM_INODE,
 	NUM_SYSTEM_INODES
 };
 
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 	[SLOT_MAP_SYSTEM_INODE]			= { "slot_map", 0, S_IFREG | 0644 },
 	[HEARTBEAT_SYSTEM_INODE]		= { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
 	[GLOBAL_BITMAP_SYSTEM_INODE]		= { "global_bitmap", 0, S_IFREG | 0644 },
+	[USER_QUOTA_SYSTEM_INODE]		= { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+	[GROUP_QUOTA_SYSTEM_INODE]		= { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
 
 	/* Slot-specific system inodes (one copy per slot) */
 	[ORPHAN_DIR_SYSTEM_INODE]		= { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 	[INODE_ALLOC_SYSTEM_INODE]		= { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
 	[JOURNAL_SYSTEM_INODE]			= { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
 	[LOCAL_ALLOC_SYSTEM_INODE]		= { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
-	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
+	[LOCAL_USER_QUOTA_SYSTEM_INODE]		= { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+	[LOCAL_GROUP_QUOTA_SYSTEM_INODE]	= { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
 };
 
 /* Parameter passed from mount.ocfs2 to module */
@@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
 #define OCFS2_RAW_SB(dinode)		(&((dinode)->id2.i_super))
 
 /*
+ * Block checking structure.  This is used in metadata to validate the
+ * contents.  If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
+ * zeros.
+ */
+struct ocfs2_block_check {
+/*00*/	__le32 bc_crc32e;	/* 802.3 Ethernet II CRC32 */
+	__le16 bc_ecc;		/* Single-error-correction parity vector.
+				   This is a simple Hamming code dependant
+				   on the blocksize.  OCFS2's maximum
+				   blocksize, 4K, requires 16 parity bits,
+				   so we fit in __le16. */
+	__le16 bc_reserved1;
+/*08*/
+};
+
+/*
  * On disk extent record for OCFS2
  * It describes a range of clusters on disk.
  *
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log {
 struct ocfs2_extent_block
 {
 /*00*/	__u8 h_signature[8];		/* Signature for verification */
-	__le64 h_reserved1;
+	struct ocfs2_block_check h_check;	/* Error checking */
 /*10*/	__le16 h_suballoc_slot;		/* Slot suballocator this
 					   extent_header belongs to */
 	__le16 h_suballoc_bit;		/* Bit offset in suballocator
@@ -666,7 +704,8 @@ struct ocfs2_dinode {
 					   was set in i_flags */
 	__le16 i_dyn_features;
 	__le64 i_xattr_loc;
-/*80*/	__le64 i_reserved2[7];
+/*80*/	struct ocfs2_block_check i_check;	/* Error checking */
+/*88*/	__le64 i_reserved2[6];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -715,6 +754,34 @@ struct ocfs2_dir_entry {
 } __attribute__ ((packed));
 
 /*
+ * Per-block record for the unindexed directory btree. This is carefully
+ * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
+ * mirrored. That way, the directory manipulation code needs a minimal amount
+ * of update.
+ *
+ * NOTE: Keep this structure aligned to a multiple of 4 bytes.
+ */
+struct ocfs2_dir_block_trailer {
+/*00*/	__le64		db_compat_inode;	/* Always zero. Was inode */
+
+	__le16		db_compat_rec_len;	/* Backwards compatible with
+						 * ocfs2_dir_entry. */
+	__u8		db_compat_name_len;	/* Always zero. Was name_len */
+	__u8		db_reserved0;
+	__le16		db_reserved1;
+	__le16		db_free_rec_len;	/* Size of largest empty hole
+						 * in this block. (unused) */
+/*10*/	__u8		db_signature[8];	/* Signature for verification */
+	__le64		db_reserved2;
+	__le64		db_free_next;		/* Next block in list (unused) */
+/*20*/	__le64		db_blkno;		/* Offset on disk, in blocks */
+	__le64		db_parent_dinode;	/* dinode which owns me, in
+						   blocks */
+/*30*/	struct ocfs2_block_check db_check;	/* Error checking */
+/*40*/
+};
+
+/*
  * On disk allocator group structure for OCFS2
  */
 struct ocfs2_group_desc
@@ -733,7 +800,8 @@ struct ocfs2_group_desc
 /*20*/	__le64   bg_parent_dinode;       /* dinode which owns me, in
 					   blocks */
 	__le64   bg_blkno;               /* Offset on disk, in blocks */
-/*30*/	__le64   bg_reserved2[2];
+/*30*/	struct ocfs2_block_check bg_check;	/* Error checking */
+	__le64   bg_reserved2;
 /*40*/	__u8    bg_bitmap[0];
 };
 
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header {
 						   in this extent record,
 						   only valid in the first
 						   bucket. */
-	__le64  xh_csum;
+	struct ocfs2_block_check xh_check;	/* Error checking
+						   (Note, this is only
+						    used for xattr
+						    buckets.  A block uses
+						    xb_check and sets
+						    this field to zero.) */
 	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 };
 
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block {
 					block group */
 	__le32	xb_fs_generation;    /* Must match super block */
 /*10*/	__le64	xb_blkno;            /* Offset on disk, in blocks */
-	__le64	xb_csum;
+	struct ocfs2_block_check xb_check;	/* Error checking */
 /*20*/	__le16	xb_flags;            /* Indicates whether this block contains
 					real xattr or a xattr tree. */
 	__le16	xb_reserved0;
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
 	return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
 }
 
+/*
+ *  On disk structures for global quota file
+ */
+
+/* Magic numbers and known versions for global quota files */
+#define OCFS2_GLOBAL_QMAGICS {\
+	0x0cf52470, /* USRQUOTA */ \
+	0x0cf52471  /* GRPQUOTA */ \
+}
+
+#define OCFS2_GLOBAL_QVERSIONS {\
+	0, \
+	0, \
+}
+
+
+/* Each block of each quota file has a certain fixed number of bytes reserved
+ * for OCFS2 internal use at its end. OCFS2 can use it for things like
+ * checksums, etc. */
+#define OCFS2_QBLK_RESERVED_SPACE 8
+
+/* Generic header of all quota files */
+struct ocfs2_disk_dqheader {
+	__le32 dqh_magic;	/* Magic number identifying file */
+	__le32 dqh_version;	/* Quota format version */
+};
+
+#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of global quota file (immediately follows the generic
+ * header) */
+struct ocfs2_global_disk_dqinfo {
+/*00*/	__le32 dqi_bgrace;	/* Grace time for space softlimit excess */
+	__le32 dqi_igrace;	/* Grace time for inode softlimit excess */
+	__le32 dqi_syncms;	/* Time after which we sync local changes to
+				 * global quota file */
+	__le32 dqi_blocks;	/* Number of blocks in quota file */
+/*10*/	__le32 dqi_free_blk;	/* First free block in quota file */
+	__le32 dqi_free_entry;	/* First block with free dquot entry in quota
+				 * file */
+};
+
+/* Structure with global user / group information. We reserve some space
+ * for future use. */
+struct ocfs2_global_disk_dqblk {
+/*00*/	__le32 dqb_id;          /* ID the structure belongs to */
+	__le32 dqb_use_count;   /* Number of nodes having reference to this structure */
+	__le64 dqb_ihardlimit;  /* absolute limit on allocated inodes */
+/*10*/	__le64 dqb_isoftlimit;  /* preferred inode limit */
+	__le64 dqb_curinodes;   /* current # allocated inodes */
+/*20*/	__le64 dqb_bhardlimit;  /* absolute limit on disk space */
+	__le64 dqb_bsoftlimit;  /* preferred limit on disk space */
+/*30*/	__le64 dqb_curspace;    /* current space occupied */
+	__le64 dqb_btime;       /* time limit for excessive disk use */
+/*40*/	__le64 dqb_itime;       /* time limit for excessive inode use */
+	__le64 dqb_pad1;
+/*50*/	__le64 dqb_pad2;
+};
+
+/*
+ *  On-disk structures for local quota file
+ */
+
+/* Magic numbers and known versions for local quota files */
+#define OCFS2_LOCAL_QMAGICS {\
+	0x0cf524c0, /* USRQUOTA */ \
+	0x0cf524c1  /* GRPQUOTA */ \
+}
+
+#define OCFS2_LOCAL_QVERSIONS {\
+	0, \
+	0, \
+}
+
+/* Quota flags in dqinfo header */
+#define OLQF_CLEAN	0x0001	/* Quota file is empty (this should be after\
+				 * quota has been cleanly turned off) */
+
+#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of local quota file (immediately follows the generic
+ * header) */
+struct ocfs2_local_disk_dqinfo {
+	__le32 dqi_flags;	/* Flags for quota file */
+	__le32 dqi_chunks;	/* Number of chunks of quota structures
+				 * with a bitmap */
+	__le32 dqi_blocks;	/* Number of blocks allocated for quota file */
+};
+
+/* Header of one chunk of a quota file */
+struct ocfs2_local_disk_chunk {
+	__le32 dqc_free;	/* Number of free entries in the bitmap */
+	u8 dqc_bitmap[0];	/* Bitmap of entries in the corresponding
+				 * chunk of quota file */
+};
+
+/* One entry in local quota file */
+struct ocfs2_local_disk_dqblk {
+/*00*/	__le64 dqb_id;		/* id this quota applies to */
+	__le64 dqb_spacemod;	/* Change in the amount of used space */
+/*10*/	__le64 dqb_inodemod;	/* Change in the amount of used inodes */
+};
+
+
+/*
+ * The quota trailer lives at the end of each quota block.
+ */
+
+struct ocfs2_disk_dqtrailer {
+/*00*/	struct ocfs2_block_check dq_check;	/* Error checking */
+/*08*/	/* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
+};
+
+static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
+								 void *buf)
+{
+	char *ptr = buf;
+	ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE;
+
+	return (struct ocfs2_disk_dqtrailer *)ptr;
+}
+
 #ifdef __KERNEL__
 static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 {
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
deleted file mode 100644
index b91c78f8f558..000000000000
--- a/fs/ocfs2/ocfs2_jbd_compat.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ocfs2_jbd_compat.h
- *
- * Compatibility defines for JBD.
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#ifndef OCFS2_JBD_COMPAT_H
-#define OCFS2_JBD_COMPAT_H
-
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# error Should not have been included
-#endif
-
-struct jbd2_inode {
-	unsigned int dummy;
-};
-
-#define JBD2_BARRIER			JFS_BARRIER
-#define JBD2_DEFAULT_MAX_COMMIT_AGE	JBD_DEFAULT_MAX_COMMIT_AGE
-
-#define jbd2_journal_ack_err			journal_ack_err
-#define jbd2_journal_clear_err			journal_clear_err
-#define jbd2_journal_destroy			journal_destroy
-#define jbd2_journal_dirty_metadata		journal_dirty_metadata
-#define jbd2_journal_errno			journal_errno
-#define jbd2_journal_extend			journal_extend
-#define jbd2_journal_flush			journal_flush
-#define jbd2_journal_force_commit		journal_force_commit
-#define jbd2_journal_get_write_access		journal_get_write_access
-#define jbd2_journal_get_undo_access		journal_get_undo_access
-#define jbd2_journal_init_inode			journal_init_inode
-#define jbd2_journal_invalidatepage		journal_invalidatepage
-#define jbd2_journal_load			journal_load
-#define jbd2_journal_lock_updates		journal_lock_updates
-#define jbd2_journal_restart			journal_restart
-#define jbd2_journal_start			journal_start
-#define jbd2_journal_start_commit		journal_start_commit
-#define jbd2_journal_stop			journal_stop
-#define jbd2_journal_try_to_free_buffers	journal_try_to_free_buffers
-#define jbd2_journal_unlock_updates		journal_unlock_updates
-#define jbd2_journal_wipe			journal_wipe
-#define jbd2_log_wait_commit			log_wait_commit
-
-static inline int jbd2_journal_file_inode(handle_t *handle,
-					  struct jbd2_inode *inode)
-{
-	return 0;
-}
-
-static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
-						      loff_t new_size)
-{
-	return 0;
-}
-
-static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
-					       struct inode *inode)
-{
-	return;
-}
-
-static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
-						  struct jbd2_inode *jinode)
-{
-	return;
-}
-
-
-#endif  /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 82c200f7a8f1..eb6f50c9ceca 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_DENTRY,
 	OCFS2_LOCK_TYPE_OPEN,
 	OCFS2_LOCK_TYPE_FLOCK,
+	OCFS2_LOCK_TYPE_QINFO,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
 		case OCFS2_LOCK_TYPE_FLOCK:
 			c = 'F';
 			break;
+		case OCFS2_LOCK_TYPE_QINFO:
+			c = 'Q';
+			break;
 		default:
 			c = '\0';
 	}
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
 	[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
 	[OCFS2_LOCK_TYPE_OPEN] = "Open",
 	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
+	[OCFS2_LOCK_TYPE_QINFO] = "Quota",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 000000000000..7365e2e08706
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,119 @@
+/*
+ * quota.h for OCFS2
+ *
+ * On disk quota structures for local and global quota file, in-memory
+ * structures.
+ *
+ */
+
+#ifndef _OCFS2_QUOTA_H
+#define _OCFS2_QUOTA_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/list.h>
+#include <linux/dqblk_qtree.h>
+
+#include "ocfs2.h"
+
+/* Common stuff */
+/* id number of quota format */
+#define QFMT_OCFS2 3
+
+/*
+ * In-memory structures
+ */
+struct ocfs2_dquot {
+	struct dquot dq_dquot;	/* Generic VFS dquot */
+	loff_t dq_local_off;	/* Offset in the local quota file */
+	struct ocfs2_quota_chunk *dq_chunk;	/* Chunk dquot is in */
+	unsigned int dq_use_count;	/* Number of nodes having reference to this entry in global quota file */
+	s64 dq_origspace;	/* Last globally synced space usage */
+	s64 dq_originodes;	/* Last globally synced inode usage */
+};
+
+/* Description of one chunk to recover in memory */
+struct ocfs2_recovery_chunk {
+	struct list_head rc_list;	/* List of chunks */
+	int rc_chunk;			/* Chunk number */
+	unsigned long *rc_bitmap;	/* Bitmap of entries to recover */
+};
+
+struct ocfs2_quota_recovery {
+	struct list_head r_list[MAXQUOTAS];	/* List of chunks to recover */
+};
+
+/* In-memory structure with quota header information */
+struct ocfs2_mem_dqinfo {
+	unsigned int dqi_type;		/* Quota type this structure describes */
+	unsigned int dqi_chunks;	/* Number of chunks in local quota file */
+	unsigned int dqi_blocks;	/* Number of blocks allocated for local quota file */
+	unsigned int dqi_syncms;	/* How often should we sync with other nodes */
+	unsigned int dqi_syncjiff;	/* Precomputed dqi_syncms in jiffies */
+	struct list_head dqi_chunk;	/* List of chunks */
+	struct inode *dqi_gqinode;	/* Global quota file inode */
+	struct ocfs2_lock_res dqi_gqlock;	/* Lock protecting quota information structure */
+	struct buffer_head *dqi_gqi_bh;	/* Buffer head with global quota file inode - set only if inode lock is obtained */
+	int dqi_gqi_count;		/* Number of holders of dqi_gqi_bh */
+	struct buffer_head *dqi_lqi_bh;	/* Buffer head with local quota file inode */
+	struct buffer_head *dqi_ibh;	/* Buffer with information header */
+	struct qtree_mem_dqinfo dqi_gi;	/* Info about global file */
+	struct delayed_work dqi_sync_work;	/* Work for syncing dquots */
+	struct ocfs2_quota_recovery *dqi_rec;	/* Pointer to recovery
+						 * information, in case we
+						 * enable quotas on file
+						 * needing it */
+};
+
+static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
+{
+	return container_of(dquot, struct ocfs2_dquot, dq_dquot);
+}
+
+struct ocfs2_quota_chunk {
+	struct list_head qc_chunk;	/* List of quotafile chunks */
+	int qc_num;			/* Number of quota chunk */
+	struct buffer_head *qc_headerbh;	/* Buffer head with chunk header */
+};
+
+extern struct kmem_cache *ocfs2_dquot_cachep;
+extern struct kmem_cache *ocfs2_qf_chunk_cachep;
+
+extern struct qtree_fmt_operations ocfs2_global_ops;
+
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+				struct ocfs2_super *osb, int slot_num);
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+				struct ocfs2_quota_recovery *rec,
+				int slot_num);
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+			 size_t len, loff_t off);
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+			  const char *data, size_t len, loff_t off);
+int ocfs2_global_read_info(struct super_block *sb, int type);
+int ocfs2_global_write_info(struct super_block *sb, int type);
+int ocfs2_global_read_dquot(struct dquot *dquot);
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
+static inline int ocfs2_sync_dquot(struct dquot *dquot)
+{
+	return __ocfs2_sync_dquot(dquot, 0);
+}
+static inline int ocfs2_global_release_dquot(struct dquot *dquot)
+{
+	return __ocfs2_sync_dquot(dquot, 1);
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+			   struct buffer_head **bh);
+
+extern struct dquot_operations ocfs2_quota_operations;
+extern struct quota_format_type ocfs2_quota_format;
+
+int ocfs2_quota_setup(void);
+void ocfs2_quota_shutdown(void);
+
+#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 000000000000..6aff8f2d3e49
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,1025 @@
+/*
+ *  Implementation of operations over global quota file
+ */
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/dqblk_qtree.h>
+#include <linux/jiffies.h>
+#include <linux/writeback.h>
+#include <linux/workqueue.h>
+
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "alloc.h"
+#include "blockcheck.h"
+#include "inode.h"
+#include "journal.h"
+#include "file.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "uptodate.h"
+#include "quota.h"
+
+static struct workqueue_struct *ocfs2_quota_wq = NULL;
+
+static void qsync_work_fn(struct work_struct *work);
+
+static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	/* Update from disk only entries not set by the admin */
+	if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
+		m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
+		m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
+	}
+	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+		m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
+	if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
+		m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
+		m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
+	}
+	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+		m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+	if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
+		m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
+		m->dqb_itime = le64_to_cpu(d->dqb_itime);
+	OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
+}
+
+static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	d->dqb_id = cpu_to_le32(dquot->dq_id);
+	d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
+	d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
+	d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
+	d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
+	d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
+	d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
+	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+	d->dqb_btime = cpu_to_le64(m->dqb_btime);
+	d->dqb_itime = cpu_to_le64(m->dqb_itime);
+}
+
+static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+	if (qtree_entry_unused(&oinfo->dqi_gi, dp))
+		return 0;
+	return le32_to_cpu(d->dqb_id) == dquot->dq_id;
+}
+
+struct qtree_fmt_operations ocfs2_global_ops = {
+	.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
+	.disk2mem_dqblk = ocfs2_global_disk2memdqb,
+	.is_id = ocfs2_global_is_id,
+};
+
+static int ocfs2_validate_quota_block(struct super_block *sb,
+				      struct buffer_head *bh)
+{
+	struct ocfs2_disk_dqtrailer *dqt =
+		ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
+
+	mlog(0, "Validating quota block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
+}
+
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+			   struct buffer_head **bh)
+{
+	int rc = 0;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
+				    ocfs2_validate_quota_block);
+	if (rc)
+		mlog_errno(rc);
+
+	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
+static int ocfs2_get_quota_block(struct inode *inode, int block,
+				 struct buffer_head **bh)
+{
+	u64 pblock, pcount;
+	int err;
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	if (err) {
+		mlog_errno(err);
+		return err;
+	}
+	*bh = sb_getblk(inode->i_sb, pblock);
+	if (!*bh) {
+		err = -EIO;
+		mlog_errno(err);
+	}
+	return err;;
+}
+
+/* Read data from global quotafile - avoid pagecache and such because we cannot
+ * afford acquiring the locks... We use quota cluster lock to serialize
+ * operations. Caller is responsible for acquiring it. */
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+			 size_t len, loff_t off)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct inode *gqinode = oinfo->dqi_gqinode;
+	loff_t i_size = i_size_read(gqinode);
+	int offset = off & (sb->s_blocksize - 1);
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0;
+	struct buffer_head *bh;
+	size_t toread, tocopy;
+
+	if (off > i_size)
+		return 0;
+	if (off + len > i_size)
+		len = i_size - off;
+	toread = len;
+	while (toread > 0) {
+		tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
+		bh = NULL;
+		err = ocfs2_read_quota_block(gqinode, blk, &bh);
+		if (err) {
+			mlog_errno(err);
+			return err;
+		}
+		memcpy(data, bh->b_data + offset, tocopy);
+		brelse(bh);
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
+}
+
+/* Write to quotafile (we know the transaction is already started and has
+ * enough credits) */
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+			  const char *data, size_t len, loff_t off)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct inode *gqinode = oinfo->dqi_gqinode;
+	int offset = off & (sb->s_blocksize - 1);
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0, new = 0, ja_type;
+	struct buffer_head *bh = NULL;
+	handle_t *handle = journal_current_handle();
+
+	if (!handle) {
+		mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
+		     "because transaction was not started.\n",
+		     (unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+	if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
+		WARN_ON(1);
+		len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
+	}
+
+	mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
+	if (gqinode->i_size < off + len) {
+		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		err = ocfs2_extend_no_holes(gqinode, off + len, off);
+		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		if (err < 0)
+			goto out;
+		err = ocfs2_simple_size_update(gqinode,
+					       oinfo->dqi_gqi_bh,
+					       off + len);
+		if (err < 0)
+			goto out;
+		new = 1;
+	}
+	/* Not rewriting whole block? */
+	if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
+	    !new) {
+		err = ocfs2_read_quota_block(gqinode, blk, &bh);
+		ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
+	} else {
+		err = ocfs2_get_quota_block(gqinode, blk, &bh);
+		ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
+	}
+	if (err) {
+		mlog_errno(err);
+		return err;
+	}
+	lock_buffer(bh);
+	if (new)
+		memset(bh->b_data, 0, sb->s_blocksize);
+	memcpy(bh->b_data + offset, data, len);
+	flush_dcache_page(bh->b_page);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	ocfs2_set_buffer_uptodate(gqinode, bh);
+	err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
+	if (err < 0) {
+		brelse(bh);
+		goto out;
+	}
+	err = ocfs2_journal_dirty(handle, bh);
+	brelse(bh);
+	if (err < 0)
+		goto out;
+out:
+	if (err) {
+		mutex_unlock(&gqinode->i_mutex);
+		mlog_errno(err);
+		return err;
+	}
+	gqinode->i_version++;
+	ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
+	mutex_unlock(&gqinode->i_mutex);
+	return len;
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	int status;
+	struct buffer_head *bh = NULL;
+
+	status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
+	if (status < 0)
+		return status;
+	spin_lock(&dq_data_lock);
+	if (!oinfo->dqi_gqi_count++)
+		oinfo->dqi_gqi_bh = bh;
+	else
+		WARN_ON(bh != oinfo->dqi_gqi_bh);
+	spin_unlock(&dq_data_lock);
+	return 0;
+}
+
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
+	brelse(oinfo->dqi_gqi_bh);
+	spin_lock(&dq_data_lock);
+	if (!--oinfo->dqi_gqi_count)
+		oinfo->dqi_gqi_bh = NULL;
+	spin_unlock(&dq_data_lock);
+}
+
+/* Read information header from global quota file */
+int ocfs2_global_read_info(struct super_block *sb, int type)
+{
+	struct inode *gqinode = NULL;
+	unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+					GROUP_QUOTA_SYSTEM_INODE };
+	struct ocfs2_global_disk_dqinfo dinfo;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	int status;
+
+	mlog_entry_void();
+
+	/* Read global header */
+	gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+			OCFS2_INVALID_SLOT);
+	if (!gqinode) {
+		mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
+			type);
+		status = -EINVAL;
+		goto out_err;
+	}
+	oinfo->dqi_gi.dqi_sb = sb;
+	oinfo->dqi_gi.dqi_type = type;
+	ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+	oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+	oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+	oinfo->dqi_gqi_bh = NULL;
+	oinfo->dqi_gqi_count = 0;
+	oinfo->dqi_gqinode = gqinode;
+	status = ocfs2_lock_global_qf(oinfo, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+	status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+				      sizeof(struct ocfs2_global_disk_dqinfo),
+				      OCFS2_GLOBAL_INFO_OFF);
+	ocfs2_unlock_global_qf(oinfo, 0);
+	if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
+		mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
+		     status);
+		if (status >= 0)
+			status = -EIO;
+		mlog_errno(status);
+		goto out_err;
+	}
+	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+	oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
+	oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
+	oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
+	oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
+						OCFS2_QBLK_RESERVED_SPACE;
+	oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
+	INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
+	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+			   oinfo->dqi_syncjiff);
+
+out_err:
+	mlog_exit(status);
+	return status;
+}
+
+/* Write information to global quota file. Expects exlusive lock on quota
+ * file inode and quota info */
+static int __ocfs2_global_write_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_global_disk_dqinfo dinfo;
+	ssize_t size;
+
+	spin_lock(&dq_data_lock);
+	info->dqi_flags &= ~DQF_INFO_DIRTY;
+	dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+	spin_unlock(&dq_data_lock);
+	dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
+	dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
+	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
+				     sizeof(struct ocfs2_global_disk_dqinfo),
+				     OCFS2_GLOBAL_INFO_OFF);
+	if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
+		mlog(ML_ERROR, "Cannot write global quota info structure\n");
+		if (size >= 0)
+			size = -EIO;
+		return size;
+	}
+	return 0;
+}
+
+int ocfs2_global_write_info(struct super_block *sb, int type)
+{
+	int err;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+
+	err = ocfs2_qinfo_lock(info, 1);
+	if (err < 0)
+		return err;
+	err = __ocfs2_global_write_info(sb, type);
+	ocfs2_qinfo_unlock(info, 1);
+	return err;
+}
+
+/* Read in information from global quota file and acquire a reference to it.
+ * dquot_acquire() has already started the transaction and locked quota file */
+int ocfs2_global_read_dquot(struct dquot *dquot)
+{
+	int err, err2, ex = 0;
+	struct ocfs2_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+	err = ocfs2_qinfo_lock(info, 0);
+	if (err < 0)
+		goto out;
+	err = qtree_read_dquot(&info->dqi_gi, dquot);
+	if (err < 0)
+		goto out_qlock;
+	OCFS2_DQUOT(dquot)->dq_use_count++;
+	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	if (!dquot->dq_off) {	/* No real quota entry? */
+		/* Upgrade to exclusive lock for allocation */
+		err = ocfs2_qinfo_lock(info, 1);
+		if (err < 0)
+			goto out_qlock;
+		ex = 1;
+	}
+	err = qtree_write_dquot(&info->dqi_gi, dquot);
+	if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
+		err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
+		if (!err)
+			err = err2;
+	}
+out_qlock:
+	if (ex)
+		ocfs2_qinfo_unlock(info, 1);
+	ocfs2_qinfo_unlock(info, 0);
+out:
+	if (err < 0)
+		mlog_errno(err);
+	return err;
+}
+
+/* Sync local information about quota modifications with global quota file.
+ * Caller must have started the transaction and obtained exclusive lock for
+ * global quota file inode */
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
+{
+	int err, err2;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_global_disk_dqblk dqblk;
+	s64 spacechange, inodechange;
+	time_t olditime, oldbtime;
+
+	err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+				   sizeof(struct ocfs2_global_disk_dqblk),
+				   dquot->dq_off);
+	if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
+		if (err >= 0) {
+			mlog(ML_ERROR, "Short read from global quota file "
+				       "(%u read)\n", err);
+			err = -EIO;
+		}
+		goto out;
+	}
+
+	/* Update space and inode usage. Get also other information from
+	 * global quota file so that we don't overwrite any changes there.
+	 * We are */
+	spin_lock(&dq_data_lock);
+	spacechange = dquot->dq_dqb.dqb_curspace -
+					OCFS2_DQUOT(dquot)->dq_origspace;
+	inodechange = dquot->dq_dqb.dqb_curinodes -
+					OCFS2_DQUOT(dquot)->dq_originodes;
+	olditime = dquot->dq_dqb.dqb_itime;
+	oldbtime = dquot->dq_dqb.dqb_btime;
+	ocfs2_global_disk2memdqb(dquot, &dqblk);
+	mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
+	     dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
+	     dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
+	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+		dquot->dq_dqb.dqb_curspace += spacechange;
+	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+		dquot->dq_dqb.dqb_curinodes += inodechange;
+	/* Set properly space grace time... */
+	if (dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
+		if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
+		    oldbtime > 0) {
+			if (dquot->dq_dqb.dqb_btime > 0)
+				dquot->dq_dqb.dqb_btime =
+					min(dquot->dq_dqb.dqb_btime, oldbtime);
+			else
+				dquot->dq_dqb.dqb_btime = oldbtime;
+		}
+	} else {
+		dquot->dq_dqb.dqb_btime = 0;
+		clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+	}
+	/* Set properly inode grace time... */
+	if (dquot->dq_dqb.dqb_isoftlimit &&
+	    dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
+		if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
+		    olditime > 0) {
+			if (dquot->dq_dqb.dqb_itime > 0)
+				dquot->dq_dqb.dqb_itime =
+					min(dquot->dq_dqb.dqb_itime, olditime);
+			else
+				dquot->dq_dqb.dqb_itime = olditime;
+		}
+	} else {
+		dquot->dq_dqb.dqb_itime = 0;
+		clear_bit(DQ_INODES_B, &dquot->dq_flags);
+	}
+	/* All information is properly updated, clear the flags */
+	__clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	spin_unlock(&dq_data_lock);
+	err = ocfs2_qinfo_lock(info, freeing);
+	if (err < 0) {
+		mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
+			       " (type=%d, id=%u)\n", dquot->dq_type,
+			       (unsigned)dquot->dq_id);
+		goto out;
+	}
+	if (freeing)
+		OCFS2_DQUOT(dquot)->dq_use_count--;
+	err = qtree_write_dquot(&info->dqi_gi, dquot);
+	if (err < 0)
+		goto out_qlock;
+	if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
+		err = qtree_release_dquot(&info->dqi_gi, dquot);
+		if (info_dirty(sb_dqinfo(sb, type))) {
+			err2 = __ocfs2_global_write_info(sb, type);
+			if (!err)
+				err = err2;
+		}
+	}
+out_qlock:
+	ocfs2_qinfo_unlock(info, freeing);
+out:
+	if (err < 0)
+		mlog_errno(err);
+	return err;
+}
+
+/*
+ *  Functions for periodic syncing of dquots with global file
+ */
+static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
+{
+	handle_t *handle;
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int status = 0;
+
+	mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
+		   dquot->dq_type, type, sb->s_id);
+	if (type != dquot->dq_type)
+		goto out;
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+	status = ocfs2_sync_dquot(dquot);
+	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+	if (status < 0)
+		mlog_errno(status);
+	/* We have to write local structure as well... */
+	dquot_mark_dquot_dirty(dquot);
+	status = dquot_commit(dquot);
+	if (status < 0)
+		mlog_errno(status);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static void qsync_work_fn(struct work_struct *work)
+{
+	struct ocfs2_mem_dqinfo *oinfo = container_of(work,
+						      struct ocfs2_mem_dqinfo,
+						      dqi_sync_work.work);
+	struct super_block *sb = oinfo->dqi_gqinode->i_sb;
+
+	dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+			   oinfo->dqi_syncjiff);
+}
+
+/*
+ *  Wrappers for generic quota functions
+ */
+
+static int ocfs2_write_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+
+	handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	status = dquot_commit(dquot);
+	ocfs2_commit_trans(osb, handle);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo;
+	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+		return 0;
+
+	oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	/* We modify tree, leaf block, global info, local chunk header,
+	 * global and local inode */
+	return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
+	       2 * OCFS2_INODE_UPDATE_CREDITS;
+}
+
+static int ocfs2_release_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb,
+		ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = dquot_release(dquot);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo;
+	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+	struct ocfs2_dinode *lfe, *gfe;
+
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+		return 0;
+
+	oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
+	lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
+	/* We can extend local file + global file. In local file we
+	 * can modify info, chunk header block and dquot block. In
+	 * global file we can modify info, tree and leaf block */
+	return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
+	       ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
+	       3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
+}
+
+static int ocfs2_acquire_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+	/* We need an exclusive lock, because we're going to update use count
+	 * and instantiate possibly new dquot structure */
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb,
+		ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = dquot_acquire(dquot);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
+{
+	unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_ITIME_B));
+	int sync = 0;
+	int status;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+	dquot_mark_dquot_dirty(dquot);
+
+	/* In case user set some limits, sync dquot immediately to global
+	 * quota file so that information propagates quicker */
+	spin_lock(&dq_data_lock);
+	if (dquot->dq_flags & mask)
+		sync = 1;
+	spin_unlock(&dq_data_lock);
+	if (!sync) {
+		status = ocfs2_write_dquot(dquot);
+		goto out;
+	}
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = ocfs2_sync_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	/* Now write updated local dquot structure */
+	status = dquot_commit(dquot);
+out_trans:
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* This should happen only after set_dqinfo(). */
+static int ocfs2_write_info(struct super_block *sb, int type)
+{
+	handle_t *handle;
+	int status = 0;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+	mlog_entry_void();
+
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = dquot_commit_info(sb, type);
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* This is difficult. We have to lock quota inode and start transaction
+ * in this function but we don't want to take the penalty of exlusive
+ * quota file lock when we are just going to use cached structures. So
+ * we just take read lock check whether we have dquot cached and if so,
+ * we don't have to take the write lock... */
+static int ocfs2_dquot_initialize(struct inode *inode, int type)
+{
+	handle_t *handle = NULL;
+	int status = 0;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo;
+	int exclusive = 0;
+	int cnt;
+	qid_t id;
+
+	mlog_entry_void();
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+		status = ocfs2_lock_global_qf(oinfo, 0);
+		if (status < 0)
+			goto out;
+		/* This is just a performance optimization not a reliable test.
+		 * Since we hold an inode lock, noone can actually release
+		 * the structure until we are finished with initialization. */
+		if (inode->i_dquot[cnt] != NODQUOT) {
+			ocfs2_unlock_global_qf(oinfo, 0);
+			continue;
+		}
+		/* When we have inode lock, we know that no dquot_release() can
+		 * run and thus we can safely check whether we need to
+		 * read+modify global file to get quota information or whether
+		 * our node already has it. */
+		if (cnt == USRQUOTA)
+			id = inode->i_uid;
+		else if (cnt == GRPQUOTA)
+			id = inode->i_gid;
+		else
+			BUG();
+		/* Obtain exclusion from quota off... */
+		down_write(&sb_dqopt(sb)->dqptr_sem);
+		exclusive = !dquot_is_cached(sb, id, cnt);
+		up_write(&sb_dqopt(sb)->dqptr_sem);
+		if (exclusive) {
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0) {
+				exclusive = 0;
+				mlog_errno(status);
+				goto out_ilock;
+			}
+			handle = ocfs2_start_trans(OCFS2_SB(sb),
+					ocfs2_calc_qinit_credits(sb, cnt));
+			if (IS_ERR(handle)) {
+				status = PTR_ERR(handle);
+				mlog_errno(status);
+				goto out_ilock;
+			}
+		}
+		dquot_initialize(inode, cnt);
+		if (exclusive) {
+			ocfs2_commit_trans(OCFS2_SB(sb), handle);
+			ocfs2_unlock_global_qf(oinfo, 1);
+		}
+		ocfs2_unlock_global_qf(oinfo, 0);
+	}
+	mlog_exit(0);
+	return 0;
+out_ilock:
+	if (exclusive)
+		ocfs2_unlock_global_qf(oinfo, 1);
+	ocfs2_unlock_global_qf(oinfo, 0);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_dquot_drop_slow(struct inode *inode)
+{
+	int status = 0;
+	int cnt;
+	int got_lock[MAXQUOTAS] = {0, 0};
+	handle_t *handle;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo;
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+		status = ocfs2_lock_global_qf(oinfo, 1);
+		if (status < 0)
+			goto out;
+		got_lock[cnt] = 1;
+	}
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+			ocfs2_calc_qinit_credits(sb, GRPQUOTA));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	dquot_drop(inode);
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (got_lock[cnt]) {
+			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+			ocfs2_unlock_global_qf(oinfo, 1);
+		}
+	return status;
+}
+
+/* See the comment before ocfs2_dquot_initialize. */
+static int ocfs2_dquot_drop(struct inode *inode)
+{
+	int status = 0;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo;
+	int exclusive = 0;
+	int cnt;
+	int got_lock[MAXQUOTAS] = {0, 0};
+
+	mlog_entry_void();
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+		status = ocfs2_lock_global_qf(oinfo, 0);
+		if (status < 0)
+			goto out;
+		got_lock[cnt] = 1;
+	}
+	/* Lock against anyone releasing references so that when when we check
+	 * we know we are not going to be last ones to release dquot */
+	down_write(&sb_dqopt(sb)->dqptr_sem);
+	/* Urgh, this is a terrible hack :( */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (inode->i_dquot[cnt] != NODQUOT &&
+		    atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
+			exclusive = 1;
+			break;
+		}
+	}
+	if (!exclusive)
+		dquot_drop_locked(inode);
+	up_write(&sb_dqopt(sb)->dqptr_sem);
+out:
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (got_lock[cnt]) {
+			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+			ocfs2_unlock_global_qf(oinfo, 0);
+		}
+	/* In case we bailed out because we had to do expensive locking
+	 * do it now... */
+	if (exclusive)
+		status = ocfs2_dquot_drop_slow(inode);
+	mlog_exit(status);
+	return status;
+}
+
+static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
+{
+	struct ocfs2_dquot *dquot =
+				kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
+
+	if (!dquot)
+		return NULL;
+	return &dquot->dq_dquot;
+}
+
+static void ocfs2_destroy_dquot(struct dquot *dquot)
+{
+	kmem_cache_free(ocfs2_dquot_cachep, dquot);
+}
+
+struct dquot_operations ocfs2_quota_operations = {
+	.initialize	= ocfs2_dquot_initialize,
+	.drop		= ocfs2_dquot_drop,
+	.alloc_space	= dquot_alloc_space,
+	.alloc_inode	= dquot_alloc_inode,
+	.free_space	= dquot_free_space,
+	.free_inode	= dquot_free_inode,
+	.transfer	= dquot_transfer,
+	.write_dquot	= ocfs2_write_dquot,
+	.acquire_dquot	= ocfs2_acquire_dquot,
+	.release_dquot	= ocfs2_release_dquot,
+	.mark_dirty	= ocfs2_mark_dquot_dirty,
+	.write_info	= ocfs2_write_info,
+	.alloc_dquot	= ocfs2_alloc_dquot,
+	.destroy_dquot	= ocfs2_destroy_dquot,
+};
+
+int ocfs2_quota_setup(void)
+{
+	ocfs2_quota_wq = create_workqueue("o2quot");
+	if (!ocfs2_quota_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+void ocfs2_quota_shutdown(void)
+{
+	if (ocfs2_quota_wq) {
+		flush_workqueue(ocfs2_quota_wq);
+		destroy_workqueue(ocfs2_quota_wq);
+		ocfs2_quota_wq = NULL;
+	}
+}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 000000000000..07deec5e9721
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,1253 @@
+/*
+ *  Implementation of operations over local quota file
+ */
+
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "file.h"
+#include "buffer_head_io.h"
+#include "journal.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "quota.h"
+
+/* Number of local quota structures per block */
+static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
+{
+	return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) /
+		sizeof(struct ocfs2_local_disk_dqblk));
+}
+
+/* Number of blocks with entries in one chunk */
+static inline unsigned int ol_chunk_blocks(struct super_block *sb)
+{
+	return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+		 OCFS2_QBLK_RESERVED_SPACE) << 3) /
+	       ol_quota_entries_per_block(sb);
+}
+
+/* Number of entries in a chunk bitmap */
+static unsigned int ol_chunk_entries(struct super_block *sb)
+{
+	return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb);
+}
+
+/* Offset of the chunk in quota file */
+static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
+{
+	/* 1 block for local quota file info, 1 block per chunk for chunk info */
+	return 1 + (ol_chunk_blocks(sb) + 1) * c;
+}
+
+static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+
+	return ol_quota_chunk_block(sb, c) + 1 + off / epb;
+}
+
+static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+
+	return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Offset of the dquot structure in the quota file */
+static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+{
+	return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
+	       ol_dqblk_block_off(sb, c, off);
+}
+
+/* Compute block number from given offset */
+static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
+{
+	return off >> sb->s_blocksize_bits;
+}
+
+static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
+{
+	return off & ((1 << sb->s_blocksize_bits) - 1);
+}
+
+/* Compute offset in the chunk of a structure with the given offset */
+static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+
+	return ((off >> sb->s_blocksize_bits) -
+			ol_quota_chunk_block(sb, c) - 1) * epb
+	       + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) /
+		 sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Write bufferhead into the fs */
+static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
+		void (*modify)(struct buffer_head *, void *), void *private)
+{
+	struct super_block *sb = inode->i_sb;
+	handle_t *handle;
+	int status;
+
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		return status;
+	}
+	status = ocfs2_journal_access_dq(handle, inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		ocfs2_commit_trans(OCFS2_SB(sb), handle);
+		return status;
+	}
+	lock_buffer(bh);
+	modify(bh, private);
+	unlock_buffer(bh);
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		ocfs2_commit_trans(OCFS2_SB(sb), handle);
+		return status;
+	}
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+	return 0;
+}
+
+/* Check whether we understand format of quota files */
+static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
+{
+	unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+	unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+	unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+	unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+	unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+					GROUP_QUOTA_SYSTEM_INODE };
+	struct buffer_head *bh = NULL;
+	struct inode *linode = sb_dqopt(sb)->files[type];
+	struct inode *ginode = NULL;
+	struct ocfs2_disk_dqheader *dqhead;
+	int status, ret = 0;
+
+	/* First check whether we understand local quota file */
+	status = ocfs2_read_quota_block(linode, 0, &bh);
+	if (status) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
+			type);
+		goto out_err;
+	}
+	dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+	if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
+		mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
+			" type=%d\n", le32_to_cpu(dqhead->dqh_magic),
+			lmagics[type], type);
+		goto out_err;
+	}
+	if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
+		mlog(ML_ERROR, "quota file version does not match (%u != %u),"
+			" type=%d\n", le32_to_cpu(dqhead->dqh_version),
+			lversions[type], type);
+		goto out_err;
+	}
+	brelse(bh);
+	bh = NULL;
+
+	/* Next check whether we understand global quota file */
+	ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+						OCFS2_INVALID_SLOT);
+	if (!ginode) {
+		mlog(ML_ERROR, "cannot get global quota file inode "
+				"(type=%d)\n", type);
+		goto out_err;
+	}
+	/* Since the header is read only, we don't care about locking */
+	status = ocfs2_read_quota_block(ginode, 0, &bh);
+	if (status) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read global quota file header "
+				"(type=%d)\n", type);
+		goto out_err;
+	}
+	dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+	if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
+		mlog(ML_ERROR, "global quota file magic does not match "
+			"(%u != %u), type=%d\n",
+			le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
+		goto out_err;
+	}
+	if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
+		mlog(ML_ERROR, "global quota file version does not match "
+			"(%u != %u), type=%d\n",
+			le32_to_cpu(dqhead->dqh_version), gversions[type],
+			type);
+		goto out_err;
+	}
+
+	ret = 1;
+out_err:
+	brelse(bh);
+	iput(ginode);
+	return ret;
+}
+
+/* Release given list of quota file chunks */
+static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
+{
+	struct ocfs2_quota_chunk *pos, *next;
+
+	list_for_each_entry_safe(pos, next, head, qc_chunk) {
+		list_del(&pos->qc_chunk);
+		brelse(pos->qc_headerbh);
+		kmem_cache_free(ocfs2_qf_chunk_cachep, pos);
+	}
+}
+
+/* Load quota bitmaps into memory */
+static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
+			struct ocfs2_local_disk_dqinfo *ldinfo,
+			struct list_head *head)
+{
+	struct ocfs2_quota_chunk *newchunk;
+	int i, status;
+
+	INIT_LIST_HEAD(head);
+	for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) {
+		newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+		if (!newchunk) {
+			ocfs2_release_local_quota_bitmaps(head);
+			return -ENOMEM;
+		}
+		newchunk->qc_num = i;
+		newchunk->qc_headerbh = NULL;
+		status = ocfs2_read_quota_block(inode,
+				ol_quota_chunk_block(inode->i_sb, i),
+				&newchunk->qc_headerbh);
+		if (status) {
+			mlog_errno(status);
+			kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
+			ocfs2_release_local_quota_bitmaps(head);
+			return status;
+		}
+		list_add_tail(&newchunk->qc_chunk, head);
+	}
+	return 0;
+}
+
+static void olq_update_info(struct buffer_head *bh, void *private)
+{
+	struct mem_dqinfo *info = private;
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+
+	ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+						OCFS2_LOCAL_INFO_OFF);
+	spin_lock(&dq_data_lock);
+	ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+	ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
+	ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
+	spin_unlock(&dq_data_lock);
+}
+
+static int ocfs2_add_recovery_chunk(struct super_block *sb,
+				    struct ocfs2_local_disk_chunk *dchunk,
+				    int chunk,
+				    struct list_head *head)
+{
+	struct ocfs2_recovery_chunk *rc;
+
+	rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS);
+	if (!rc)
+		return -ENOMEM;
+	rc->rc_chunk = chunk;
+	rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+	if (!rc->rc_bitmap) {
+		kfree(rc);
+		return -ENOMEM;
+	}
+	memcpy(rc->rc_bitmap, dchunk->dqc_bitmap,
+	       (ol_chunk_entries(sb) + 7) >> 3);
+	list_add_tail(&rc->rc_list, head);
+	return 0;
+}
+
+static void free_recovery_list(struct list_head *head)
+{
+	struct ocfs2_recovery_chunk *next;
+	struct ocfs2_recovery_chunk *rchunk;
+
+	list_for_each_entry_safe(rchunk, next, head, rc_list) {
+		list_del(&rchunk->rc_list);
+		kfree(rchunk->rc_bitmap);
+		kfree(rchunk);
+	}
+}
+
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
+{
+	int type;
+
+	for (type = 0; type < MAXQUOTAS; type++)
+		free_recovery_list(&(rec->r_list[type]));
+	kfree(rec);
+}
+
+/* Load entries in our quota file we have to recover*/
+static int ocfs2_recovery_load_quota(struct inode *lqinode,
+				     struct ocfs2_local_disk_dqinfo *ldinfo,
+				     int type,
+				     struct list_head *head)
+{
+	struct super_block *sb = lqinode->i_sb;
+	struct buffer_head *hbh;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int i, chunks = le32_to_cpu(ldinfo->dqi_chunks);
+	int status = 0;
+
+	for (i = 0; i < chunks; i++) {
+		hbh = NULL;
+		status = ocfs2_read_quota_block(lqinode,
+						ol_quota_chunk_block(sb, i),
+						&hbh);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+		if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb))
+			status = ocfs2_add_recovery_chunk(sb, dchunk, i, head);
+		brelse(hbh);
+		if (status < 0)
+			break;
+	}
+	if (status < 0)
+		free_recovery_list(head);
+	return status;
+}
+
+static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
+{
+	int type;
+	struct ocfs2_quota_recovery *rec;
+
+	rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
+	if (!rec)
+		return NULL;
+	for (type = 0; type < MAXQUOTAS; type++)
+		INIT_LIST_HEAD(&(rec->r_list[type]));
+	return rec;
+}
+
+/* Load information we need for quota recovery into memory */
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+						struct ocfs2_super *osb,
+						int slot_num)
+{
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	struct super_block *sb = osb->sb;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct inode *lqinode;
+	struct buffer_head *bh;
+	int type;
+	int status = 0;
+	struct ocfs2_quota_recovery *rec;
+
+	mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+	rec = ocfs2_alloc_quota_recovery();
+	if (!rec)
+		return ERR_PTR(-ENOMEM);
+	/* First init... */
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		/* At this point, journal of the slot is already replayed so
+		 * we can trust metadata and data of the quota file */
+		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+		if (!lqinode) {
+			status = -ENOENT;
+			goto out;
+		}
+		status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+					       OCFS2_META_LOCK_RECOVERY);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_put;
+		}
+		/* Now read local header */
+		bh = NULL;
+		status = ocfs2_read_quota_block(lqinode, 0, &bh);
+		if (status) {
+			mlog_errno(status);
+			mlog(ML_ERROR, "failed to read quota file info header "
+				"(slot=%d type=%d)\n", slot_num, type);
+			goto out_lock;
+		}
+		ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+							OCFS2_LOCAL_INFO_OFF);
+		status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+						   &rec->r_list[type]);
+		brelse(bh);
+out_lock:
+		ocfs2_inode_unlock(lqinode, 1);
+out_put:
+		iput(lqinode);
+		if (status < 0)
+			break;
+	}
+out:
+	if (status < 0) {
+		ocfs2_free_quota_recovery(rec);
+		rec = ERR_PTR(status);
+	}
+	return rec;
+}
+
+/* Sync changes in local quota file into global quota file and
+ * reinitialize local quota file.
+ * The function expects local quota file to be already locked and
+ * dqonoff_mutex locked. */
+static int ocfs2_recover_local_quota_file(struct inode *lqinode,
+					  int type,
+					  struct ocfs2_quota_recovery *rec)
+{
+	struct super_block *sb = lqinode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_local_disk_chunk *dchunk;
+	struct ocfs2_local_disk_dqblk *dqblk;
+	struct dquot *dquot;
+	handle_t *handle;
+	struct buffer_head *hbh = NULL, *qbh = NULL;
+	int status = 0;
+	int bit, chunk;
+	struct ocfs2_recovery_chunk *rchunk, *next;
+	qsize_t spacechange, inodechange;
+
+	mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+
+	list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
+		chunk = rchunk->rc_chunk;
+		hbh = NULL;
+		status = ocfs2_read_quota_block(lqinode,
+						ol_quota_chunk_block(sb, chunk),
+						&hbh);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+		for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+			qbh = NULL;
+			status = ocfs2_read_quota_block(lqinode,
+						ol_dqblk_block(sb, chunk, bit),
+						&qbh);
+			if (status) {
+				mlog_errno(status);
+				break;
+			}
+			dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
+				ol_dqblk_block_off(sb, chunk, bit));
+			dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
+			if (!dquot) {
+				status = -EIO;
+				mlog(ML_ERROR, "Failed to get quota structure "
+				     "for id %u, type %d. Cannot finish quota "
+				     "file recovery.\n",
+				     (unsigned)le64_to_cpu(dqblk->dqb_id),
+				     type);
+				goto out_put_bh;
+			}
+			handle = ocfs2_start_trans(OCFS2_SB(sb),
+						   OCFS2_QSYNC_CREDITS);
+			if (IS_ERR(handle)) {
+				status = PTR_ERR(handle);
+				mlog_errno(status);
+				goto out_put_dquot;
+			}
+			mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+			spin_lock(&dq_data_lock);
+			/* Add usage from quota entry into quota changes
+			 * of our node. Auxiliary variables are important
+			 * due to signedness */
+			spacechange = le64_to_cpu(dqblk->dqb_spacemod);
+			inodechange = le64_to_cpu(dqblk->dqb_inodemod);
+			dquot->dq_dqb.dqb_curspace += spacechange;
+			dquot->dq_dqb.dqb_curinodes += inodechange;
+			spin_unlock(&dq_data_lock);
+			/* We want to drop reference held by the crashed
+			 * node. Since we have our own reference we know
+			 * global structure actually won't be freed. */
+			status = ocfs2_global_release_dquot(dquot);
+			if (status < 0) {
+				mlog_errno(status);
+				goto out_commit;
+			}
+			/* Release local quota file entry */
+			status = ocfs2_journal_access_dq(handle, lqinode,
+					qbh, OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto out_commit;
+			}
+			lock_buffer(qbh);
+			WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
+			ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+			le32_add_cpu(&dchunk->dqc_free, 1);
+			unlock_buffer(qbh);
+			status = ocfs2_journal_dirty(handle, qbh);
+			if (status < 0)
+				mlog_errno(status);
+out_commit:
+			mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+			ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_put_dquot:
+			dqput(dquot);
+out_put_bh:
+			brelse(qbh);
+			if (status < 0)
+				break;
+		}
+		brelse(hbh);
+		list_del(&rchunk->rc_list);
+		kfree(rchunk->rc_bitmap);
+		kfree(rchunk);
+		if (status < 0)
+			break;
+	}
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	if (status < 0)
+		free_recovery_list(&(rec->r_list[type]));
+	mlog_exit(status);
+	return status;
+}
+
+/* Recover local quota files for given node different from us */
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+				struct ocfs2_quota_recovery *rec,
+				int slot_num)
+{
+	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	struct super_block *sb = osb->sb;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct buffer_head *bh;
+	handle_t *handle;
+	int type;
+	int status = 0;
+	struct inode *lqinode;
+	unsigned int flags;
+
+	mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (list_empty(&(rec->r_list[type])))
+			continue;
+		mlog(0, "Recovering quota in slot %d\n", slot_num);
+		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+		if (!lqinode) {
+			status = -ENOENT;
+			goto out;
+		}
+		status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+						       OCFS2_META_LOCK_NOQUEUE);
+		/* Someone else is holding the lock? Then he must be
+		 * doing the recovery. Just skip the file... */
+		if (status == -EAGAIN) {
+			mlog(ML_NOTICE, "skipping quota recovery for slot %d "
+			     "because quota file is locked.\n", slot_num);
+			status = 0;
+			goto out_put;
+		} else if (status < 0) {
+			mlog_errno(status);
+			goto out_put;
+		}
+		/* Now read local header */
+		bh = NULL;
+		status = ocfs2_read_quota_block(lqinode, 0, &bh);
+		if (status) {
+			mlog_errno(status);
+			mlog(ML_ERROR, "failed to read quota file info header "
+				"(slot=%d type=%d)\n", slot_num, type);
+			goto out_lock;
+		}
+		ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+							OCFS2_LOCAL_INFO_OFF);
+		/* Is recovery still needed? */
+		flags = le32_to_cpu(ldinfo->dqi_flags);
+		if (!(flags & OLQF_CLEAN))
+			status = ocfs2_recover_local_quota_file(lqinode,
+								type,
+								rec);
+		/* We don't want to mark file as clean when it is actually
+		 * active */
+		if (slot_num == osb->slot_num)
+			goto out_bh;
+		/* Mark quota file as clean if we are recovering quota file of
+		 * some other node. */
+		handle = ocfs2_start_trans(osb, 1);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto out_bh;
+		}
+		status = ocfs2_journal_access_dq(handle, lqinode, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_trans;
+		}
+		lock_buffer(bh);
+		ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
+		unlock_buffer(bh);
+		status = ocfs2_journal_dirty(handle, bh);
+		if (status < 0)
+			mlog_errno(status);
+out_trans:
+		ocfs2_commit_trans(osb, handle);
+out_bh:
+		brelse(bh);
+out_lock:
+		ocfs2_inode_unlock(lqinode, 1);
+out_put:
+		iput(lqinode);
+		if (status < 0)
+			break;
+	}
+out:
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	kfree(rec);
+	return status;
+}
+
+/* Read information header from quota file */
+static int ocfs2_local_read_info(struct super_block *sb, int type)
+{
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	int status;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_quota_recovery *rec;
+	int locked = 0;
+
+	info->dqi_maxblimit = 0x7fffffffffffffffLL;
+	info->dqi_maxilimit = 0x7fffffffffffffffLL;
+	oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
+	if (!oinfo) {
+		mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
+			       " info.");
+		goto out_err;
+	}
+	info->dqi_priv = oinfo;
+	oinfo->dqi_type = type;
+	INIT_LIST_HEAD(&oinfo->dqi_chunk);
+	oinfo->dqi_rec = NULL;
+	oinfo->dqi_lqi_bh = NULL;
+	oinfo->dqi_ibh = NULL;
+
+	status = ocfs2_global_read_info(sb, type);
+	if (status < 0)
+		goto out_err;
+
+	status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+	locked = 1;
+
+	/* Now read local header */
+	status = ocfs2_read_quota_block(lqinode, 0, &bh);
+	if (status) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read quota file info header "
+			"(type=%d)\n", type);
+		goto out_err;
+	}
+	ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+						OCFS2_LOCAL_INFO_OFF);
+	info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+	oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
+	oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
+	oinfo->dqi_ibh = bh;
+
+	/* We crashed when using local quota file? */
+	if (!(info->dqi_flags & OLQF_CLEAN)) {
+		rec = OCFS2_SB(sb)->quota_rec;
+		if (!rec) {
+			rec = ocfs2_alloc_quota_recovery();
+			if (!rec) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto out_err;
+			}
+			OCFS2_SB(sb)->quota_rec = rec;
+		}
+
+		status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+                                                   &rec->r_list[type]);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_err;
+		}
+	}
+
+	status = ocfs2_load_local_quota_bitmaps(lqinode,
+						ldinfo,
+						&oinfo->dqi_chunk);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	/* Now mark quota file as used */
+	info->dqi_flags &= ~OLQF_CLEAN;
+	status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	return 0;
+out_err:
+	if (oinfo) {
+		iput(oinfo->dqi_gqinode);
+		ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+		ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+		brelse(oinfo->dqi_lqi_bh);
+		if (locked)
+			ocfs2_inode_unlock(lqinode, 1);
+		ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+		kfree(oinfo);
+	}
+	brelse(bh);
+	return -1;
+}
+
+/* Write local info to quota file */
+static int ocfs2_local_write_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
+						->dqi_ibh;
+	int status;
+
+	status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
+				 info);
+	if (status < 0) {
+		mlog_errno(status);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Release info from memory */
+static int ocfs2_local_free_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int mark_clean = 1, len;
+	int status;
+
+	/* At this point we know there are no more dquots and thus
+	 * even if there's some sync in the pdflush queue, it won't
+	 * find any dquots and return without doing anything */
+	cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+	iput(oinfo->dqi_gqinode);
+	ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+	ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+	list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+		dchunk = (struct ocfs2_local_disk_chunk *)
+					(chunk->qc_headerbh->b_data);
+		if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+			len = ol_chunk_entries(sb);
+		} else {
+			len = (oinfo->dqi_blocks -
+			       ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+			      * ol_quota_entries_per_block(sb);
+		}
+		/* Not all entries free? Bug! */
+		if (le32_to_cpu(dchunk->dqc_free) != len) {
+			mlog(ML_ERROR, "releasing quota file with used "
+					"entries (type=%d)\n", type);
+			mark_clean = 0;
+		}
+	}
+	ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+
+	/* dqonoff_mutex protects us against racing with recovery thread... */
+	if (oinfo->dqi_rec) {
+		ocfs2_free_quota_recovery(oinfo->dqi_rec);
+		mark_clean = 0;
+	}
+
+	if (!mark_clean)
+		goto out;
+
+	/* Mark local file as clean */
+	info->dqi_flags |= OLQF_CLEAN;
+	status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
+				 oinfo->dqi_ibh,
+				 olq_update_info,
+				 info);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+out:
+	ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
+	brelse(oinfo->dqi_ibh);
+	brelse(oinfo->dqi_lqi_bh);
+	kfree(oinfo);
+	return 0;
+}
+
+static void olq_set_dquot(struct buffer_head *bh, void *private)
+{
+	struct ocfs2_dquot *od = private;
+	struct ocfs2_local_disk_dqblk *dqblk;
+	struct super_block *sb = od->dq_dquot.dq_sb;
+
+	dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
+		+ ol_dqblk_block_offset(sb, od->dq_local_off));
+
+	dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
+	spin_lock(&dq_data_lock);
+	dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
+					  od->dq_origspace);
+	dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
+					  od->dq_originodes);
+	spin_unlock(&dq_data_lock);
+	mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
+	     od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
+	     (long long)le64_to_cpu(dqblk->dqb_inodemod));
+}
+
+/* Write dquot to local quota file */
+static int ocfs2_local_write_dquot(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct buffer_head *bh = NULL;
+	int status;
+
+	status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
+				    ol_dqblk_file_block(sb, od->dq_local_off),
+				    &bh);
+	if (status) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh,
+				 olq_set_dquot, od);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+out:
+	brelse(bh);
+	return status;
+}
+
+/* Find free entry in local quota file */
+static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
+						       int type,
+						       int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int found = 0, len;
+
+	list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+		dchunk = (struct ocfs2_local_disk_chunk *)
+						chunk->qc_headerbh->b_data;
+		if (le32_to_cpu(dchunk->dqc_free) > 0) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found)
+		return NULL;
+
+	if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+		len = ol_chunk_entries(sb);
+	} else {
+		len = (oinfo->dqi_blocks -
+		       ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+		      * ol_quota_entries_per_block(sb);
+	}
+
+	found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
+	/* We failed? */
+	if (found == len) {
+		mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
+		     " entries free (type=%d)\n", chunk->qc_num,
+		     le32_to_cpu(dchunk->dqc_free), type);
+		return ERR_PTR(-EIO);
+	}
+	*offset = found;
+	return chunk;
+}
+
+/* Add new chunk to the local quota file */
+static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
+							struct super_block *sb,
+							int type,
+							int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_quota_chunk *chunk = NULL;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int status;
+	handle_t *handle;
+	struct buffer_head *bh = NULL;
+	u64 p_blkno;
+
+	/* We are protected by dqio_sem so no locking needed */
+	status = ocfs2_extend_no_holes(lqinode,
+				       lqinode->i_size + 2 * sb->s_blocksize,
+				       lqinode->i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+					  lqinode->i_size + 2 * sb->s_blocksize);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+	if (!chunk) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+					     &p_blkno, NULL, NULL);
+	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	bh = sb_getblk(sb, p_blkno);
+	if (!bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access_dq(handle, lqinode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(bh);
+	dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
+	memset(dchunk->dqc_bitmap, 0,
+	       sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+	       OCFS2_QBLK_RESERVED_SPACE);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	oinfo->dqi_blocks += 2;
+	oinfo->dqi_chunks++;
+	status = ocfs2_local_write_info(sb, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
+	chunk->qc_num = list_entry(chunk->qc_chunk.prev,
+				   struct ocfs2_quota_chunk,
+				   qc_chunk)->qc_num + 1;
+	chunk->qc_headerbh = bh;
+	*offset = 0;
+	return chunk;
+out_trans:
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	brelse(bh);
+	kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+	return ERR_PTR(status);
+}
+
+/* Find free entry in local quota file */
+static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
+						       struct super_block *sb,
+						       int type,
+						       int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_local_disk_chunk *dchunk;
+	int epb = ol_quota_entries_per_block(sb);
+	unsigned int chunk_blocks;
+	int status;
+	handle_t *handle;
+
+	if (list_empty(&oinfo->dqi_chunk))
+		return ocfs2_local_quota_add_chunk(sb, type, offset);
+	/* Is the last chunk full? */
+	chunk = list_entry(oinfo->dqi_chunk.prev,
+			struct ocfs2_quota_chunk, qc_chunk);
+	chunk_blocks = oinfo->dqi_blocks -
+			ol_quota_chunk_block(sb, chunk->qc_num) - 1;
+	if (ol_chunk_blocks(sb) == chunk_blocks)
+		return ocfs2_local_quota_add_chunk(sb, type, offset);
+
+	/* We are protected by dqio_sem so no locking needed */
+	status = ocfs2_extend_no_holes(lqinode,
+				       lqinode->i_size + sb->s_blocksize,
+				       lqinode->i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+					  lqinode->i_size + sb->s_blocksize);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
+				 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
+	lock_buffer(chunk->qc_headerbh);
+	le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
+	unlock_buffer(chunk->qc_headerbh);
+	status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	oinfo->dqi_blocks++;
+	status = ocfs2_local_write_info(sb, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	*offset = chunk_blocks * epb;
+	return chunk;
+out_trans:
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	return ERR_PTR(status);
+}
+
+static void olq_alloc_dquot(struct buffer_head *bh, void *private)
+{
+	int *offset = private;
+	struct ocfs2_local_disk_chunk *dchunk;
+
+	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+	ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
+	le32_add_cpu(&dchunk->dqc_free, -1);
+}
+
+/* Create dquot in the local file for given id */
+static int ocfs2_create_local_dquot(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_quota_chunk *chunk;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	int offset;
+	int status;
+
+	chunk = ocfs2_find_free_entry(sb, type, &offset);
+	if (!chunk) {
+		chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
+		if (IS_ERR(chunk))
+			return PTR_ERR(chunk);
+	} else if (IS_ERR(chunk)) {
+		return PTR_ERR(chunk);
+	}
+	od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
+	od->dq_chunk = chunk;
+
+	/* Initialize dquot structure on disk */
+	status = ocfs2_local_write_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	/* Mark structure as allocated */
+	status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
+				 &offset);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+out:
+	return status;
+}
+
+/* Create entry in local file for dquot, load data from the global file */
+static int ocfs2_local_read_dquot(struct dquot *dquot)
+{
+	int status;
+
+	mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
+
+	status = ocfs2_global_read_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	/* Now create entry in the local quota file */
+	status = ocfs2_create_local_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+	mlog_exit(0);
+	return 0;
+out_err:
+	mlog_exit(status);
+	return status;
+}
+
+/* Release dquot structure from local quota file. ocfs2_release_dquot() has
+ * already started a transaction and obtained exclusive lock for global
+ * quota file. */
+static int ocfs2_local_release_dquot(struct dquot *dquot)
+{
+	int status;
+	int type = dquot->dq_type;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int offset;
+	handle_t *handle = journal_current_handle();
+
+	BUG_ON(!handle);
+	/* First write all local changes to global file */
+	status = ocfs2_global_release_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
+			od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
+					     od->dq_local_off);
+	dchunk = (struct ocfs2_local_disk_chunk *)
+			(od->dq_chunk->qc_headerbh->b_data);
+	/* Mark structure as freed */
+	lock_buffer(od->dq_chunk->qc_headerbh);
+	ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
+	le32_add_cpu(&dchunk->dqc_free, 1);
+	unlock_buffer(od->dq_chunk->qc_headerbh);
+	status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = 0;
+out:
+	/* Clear the read bit so that next time someone uses this
+	 * dquot he reads fresh info from disk and allocates local
+	 * dquot structure */
+	clear_bit(DQ_READ_B, &dquot->dq_flags);
+	return status;
+}
+
+static struct quota_format_ops ocfs2_format_ops = {
+	.check_quota_file	= ocfs2_local_check_quota_file,
+	.read_file_info		= ocfs2_local_read_info,
+	.write_file_info	= ocfs2_global_write_info,
+	.free_file_info		= ocfs2_local_free_info,
+	.read_dqblk		= ocfs2_local_read_dquot,
+	.commit_dqblk		= ocfs2_local_write_dquot,
+	.release_dqblk		= ocfs2_local_release_dquot,
+};
+
+struct quota_format_type ocfs2_quota_format = {
+	.qf_fmt_id = QFMT_OCFS2,
+	.qf_ops = &ocfs2_format_ops,
+	.qf_owner = THIS_MODULE
+};
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ffd48db229a7..424adaa5f900 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
 		   new_clusters, first_new_cluster);
 
-	ret = ocfs2_journal_access(handle, bm_inode, group_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	}
 
 	/* update the inode accordingly. */
-	ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_rollback;
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 
 	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
 
+	/* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+	 * so any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
 				 ocfs2_group_bitmap_size(osb->sb) * 8) {
 		mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 		goto out_unlock;
 	}
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
-		ret = -EIO;
-		goto out_unlock;
-	}
-
 	first_new_cluster = le32_to_cpu(fe->i_clusters);
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 					      first_new_cluster - 1);
 
-	ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
+	ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+					  &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
 	}
-
 	group = (struct ocfs2_group_desc *)group_bh->b_data;
 
-	ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
-
 	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
 		le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode,
 				 struct buffer_head *group_bh)
 {
 	int ret;
-	struct ocfs2_group_desc *gd;
+	struct ocfs2_group_desc *gd =
+		(struct ocfs2_group_desc *)group_bh->b_data;
 	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
-	unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
-				le16_to_cpu(di->id2.i_chain.cl_bpc);
-
 
-	gd = (struct ocfs2_group_desc *)group_bh->b_data;
+	ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
+	if (ret)
+		goto out;
 
-	ret = -EIO;
-	if (!OCFS2_IS_VALID_GROUP_DESC(gd))
-		mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno));
-	else if (di->i_blkno != gd->bg_parent_dinode)
-		mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
-		     "pointer (%llu, expected %llu)\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-		     (unsigned long long)le64_to_cpu(di->i_blkno));
-	else if (le16_to_cpu(gd->bg_bits) > max_bits)
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits));
-	else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-		     "claims that %u are free\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits),
-		     le16_to_cpu(gd->bg_free_bits_count));
-	else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-		     "max bitmap bits of %u\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits),
-		     8 * le16_to_cpu(gd->bg_size));
-	else if (le16_to_cpu(gd->bg_chain) != input->chain)
+	ret = -EINVAL;
+	if (le16_to_cpu(gd->bg_chain) != input->chain)
 		mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
 		     "while input has %u set.\n",
 		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode,
 	else
 		ret = 0;
 
+out:
 	return ret;
 }
 
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	cl = &fe->id2.i_chain;
 	cr = &cl->cl_recs[input->chain];
 
-	ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_commit;
 	}
 
-	ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bdda2d8f8508..40661e7824e9 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
 	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
-				OCFS2_BH_IGNORE_CACHE);
+				OCFS2_BH_IGNORE_CACHE, NULL);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 
 		bh = NULL;  /* Acquire a fresh bh */
 		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
-					   OCFS2_BH_IGNORE_CACHE);
+					   OCFS2_BH_IGNORE_CACHE, NULL);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c5ff18b46b57..a69628603e18 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "journal.h"
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 }
 
-/* somewhat more expensive than our other checks, so use sparingly. */
-int ocfs2_check_group_descriptor(struct super_block *sb,
-				 struct ocfs2_dinode *di,
-				 struct ocfs2_group_desc *gd)
+#define do_error(fmt, ...)						\
+	do{								\
+		if (clean_error)					\
+			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
+		else							\
+			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
+	} while (0)
+
+static int ocfs2_validate_gd_self(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int clean_error)
 {
-	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
-		return -EIO;
+		do_error("Group descriptor #%llu has bad signature %.*s",
+			 (unsigned long long)bh->b_blocknr, 7,
+			 gd->bg_signature);
+		return -EINVAL;
 	}
 
+	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+		do_error("Group descriptor #%llu has an invalid bg_blkno "
+			 "of %llu",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+		do_error("Group descriptor #%llu has an invalid "
+			 "fs_generation of #%u",
+			 (unsigned long long)bh->b_blocknr,
+			 le32_to_cpu(gd->bg_generation));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "claims that %u are free",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 le16_to_cpu(gd->bg_free_bits_count));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "max bitmap bits of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 8 * le16_to_cpu(gd->bg_size));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+				    struct ocfs2_dinode *di,
+				    struct buffer_head *bh,
+				    int clean_error)
+{
+	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
 	if (di->i_blkno != gd->bg_parent_dinode) {
-		ocfs2_error(sb, "Group descriptor # %llu has bad parent "
-			    "pointer (%llu, expected %llu)",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-			    (unsigned long long)le64_to_cpu(di->i_blkno));
-		return -EIO;
+		do_error("Group descriptor #%llu has bad parent "
+			 "pointer (%llu, expected %llu)",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+			 (unsigned long long)le64_to_cpu(di->i_blkno));
+		return -EINVAL;
 	}
 
 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits));
-		return -EIO;
+		do_error("Group descriptor #%llu has bit count of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits));
+		return -EINVAL;
 	}
 
 	if (le16_to_cpu(gd->bg_chain) >=
 	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
-		ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_chain));
-		return -EIO;
+		do_error("Group descriptor #%llu has bad chain %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_chain));
+		return -EINVAL;
 	}
 
-	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-			    "claims that %u are free",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits),
-			    le16_to_cpu(gd->bg_free_bits_count));
-		return -EIO;
-	}
+	return 0;
+}
 
-	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-			    "max bitmap bits of %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits),
-			    8 * le16_to_cpu(gd->bg_size));
-		return -EIO;
+#undef do_error
+
+/*
+ * This version only prints errors.  It does not fail the filesystem, and
+ * exists only for resize.
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc) {
+		mlog(ML_ERROR,
+		     "Checksum failed for group descriptor %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+	} else
+		rc = ocfs2_validate_gd_self(sb, bh, 1);
+	if (!rc)
+		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
+
+	return rc;
+}
+
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+					   struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	mlog(0, "Validating group descriptor %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc)
+		return rc;
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	return ocfs2_validate_gd_self(sb, bh, 0);
+}
+
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+			      ocfs2_validate_group_descriptor);
+	if (rc)
+		goto out;
+
+	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
+	if (rc) {
+		brelse(tmp);
+		goto out;
 	}
 
-	return 0;
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc;
 }
 
 static int ocfs2_block_group_fill(handle_t *handle,
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      bg_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_gd(handle,
+					 alloc_inode,
+					 bg_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 
 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
-	status = ocfs2_journal_access(handle, alloc_inode,
-				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode,
+					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	ac->ac_alloc_slot = slot;
 
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+
+	/* The bh was validated by the inode read inside
+	 * ocfs2_inode_lock().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
 	int offset, start, found, status = 0;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
-		return -EIO;
-	}
+	/* Callers got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
 	found = start = best_offset = best_size = 0;
 	bitmap = bg->bg_bitmap;
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto bail;
-	}
+	/* All callers get the descriptor via
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
 
 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      group_bh,
-				      journal_type);
+	status = ocfs2_journal_access_gd(handle,
+					 alloc_inode,
+					 group_bh,
+					 journal_type);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto out;
-	}
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto out;
-	}
-	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
-		status = -EIO;
-		goto out;
-	}
+	/* The caller got these descriptors from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
 
 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
 	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 
-	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -1008,7 +1116,7 @@ out_rollback:
 		bg->bg_next_group = cpu_to_le64(bg_ptr);
 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
 	}
-out:
+
 	mlog_exit(status);
 	return status;
 }
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	u16 found;
 	struct buffer_head *group_bh = NULL;
 	struct ocfs2_group_desc *gd;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
 	struct inode *alloc_inode = ac->ac_inode;
 
-	ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
+	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
+					  &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
 
 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
-	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
-		ret = -EIO;
-		goto out;
-	}
-
 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
 				  ac->ac_max_block, bit_off, &found);
 	if (ret < 0) {
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	     bits_wanted, chain,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
 
-	status = ocfs2_read_block(alloc_inode,
-				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
-				  &group_bh);
+	status = ocfs2_read_group_descriptor(alloc_inode, fe,
+					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
+					     &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
-	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
 
 	status = -ENOSPC;
 	/* for now, the chain search is a bit simplistic. We just use
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 		next_group = le64_to_cpu(bg->bg_next_group);
 		prev_group_bh = group_bh;
 		group_bh = NULL;
-		status = ocfs2_read_block(alloc_inode,
-					  next_group, &group_bh);
+		status = ocfs2_read_group_descriptor(alloc_inode, fe,
+						     next_group, &group_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
-		status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-		if (status) {
-			mlog_errno(status);
-			goto bail;
-		}
 	}
 	if (status < 0) {
 		if (status != -ENOSPC)
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 
 	/* Ok, claim our bits now: set the info on dinode, chainlist
 	 * and then the group */
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      ac->ac_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle,
+					 alloc_inode,
+					 ac->ac_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 	BUG_ON(!ac->ac_bh);
 
 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+
+	/* The bh was validated by the inode read during
+	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto bail;
-	}
+	/* The caller got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
 
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
-	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
-				      journal_type);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
+					 journal_type);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+	/* The alloc_bh comes from ocfs2_free_dinode() or
+	 * ocfs2_free_clusters().  The callers have all locked the
+	 * allocator and gotten alloc_bh from the lock call.  This
+	 * validates the dinode buffer.  Any corruption that has happended
+	 * is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
 
 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
 	     (unsigned long long)bg_blkno, start_bit);
 
-	status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
+	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+					     &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
-
 	group = (struct ocfs2_group_desc *) group_bh->b_data;
-	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
+
 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
 
 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 4df159d8f450..e3c13c77f9e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
  * and return that block offset. */
 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 
-/* somewhat more expensive than our other checks, so use sparingly. */
+/*
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
+ * finds a problem.  A caller that wants to check a group descriptor
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then checking it with this function.  This is only resize, really.
+ * Everyone else should be using ocfs2_read_group_descriptor().
+ */
 int ocfs2_check_group_descriptor(struct super_block *sb,
 				 struct ocfs2_dinode *di,
-				 struct ocfs2_group_desc *gd);
+				 struct buffer_head *bh);
+/*
+ * Read a group descriptor block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh);
+
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 304b63ac78cf..43ed11345b59 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
 #include <linux/debugfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "ocfs1_fs_compat.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "export.h"
 #include "extent_map.h"
@@ -65,10 +67,13 @@
 #include "uptodate.h"
 #include "ver.h"
 #include "xattr.h"
+#include "quota.h"
 
 #include "buffer_head_io.h"
 
 static struct kmem_cache *ocfs2_inode_cachep = NULL;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
 
 /* OCFS2 needs to schedule several differnt types of work which
  * require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb,
 static void ocfs2_write_super(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
 
 static const struct super_operations ocfs2_sops = {
 	.statfs		= ocfs2_statfs,
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = {
 	.put_super	= ocfs2_put_super,
 	.remount_fs	= ocfs2_remount,
 	.show_options   = ocfs2_show_options,
+	.quota_read	= ocfs2_quota_read,
+	.quota_write	= ocfs2_quota_write,
 };
 
 enum {
@@ -158,6 +168,10 @@ enum {
 	Opt_user_xattr,
 	Opt_nouser_xattr,
 	Opt_inode64,
+	Opt_acl,
+	Opt_noacl,
+	Opt_usrquota,
+	Opt_grpquota,
 	Opt_err,
 };
 
@@ -180,6 +194,10 @@ static const match_table_t tokens = {
 	{Opt_user_xattr, "user_xattr"},
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_inode64, "inode64"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_err, NULL}
 };
 
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+	    && (ino == USER_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+		return 0;
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+	    && (ino == GROUP_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+		return 0;
+	return 1;
+}
+
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 {
 	struct inode *new = NULL;
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 
 	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
 	     i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
 	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
 	     i < NUM_SYSTEM_INODES;
 	     i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 
 	/* We're going to/from readonly mode. */
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		/* Disable quota accounting before remounting RO */
+		if (*flags & MS_RDONLY) {
+			ret = ocfs2_susp_quotas(osb, 0);
+			if (ret < 0)
+				goto out;
+		}
 		/* Lock here so the check of HARD_RO and the potential
 		 * setting of SOFT_RO is atomic. */
 		spin_lock(&osb->osb_lock);
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		}
 unlock_osb:
 		spin_unlock(&osb->osb_lock);
+		/* Enable quota accounting after remounting RW */
+		if (!ret && !(*flags & MS_RDONLY)) {
+			if (sb_any_quota_suspended(sb))
+				ret = ocfs2_susp_quotas(osb, 1);
+			else
+				ret = ocfs2_enable_quotas(osb);
+			if (ret < 0) {
+				/* Return back changes... */
+				spin_lock(&osb->osb_lock);
+				sb->s_flags |= MS_RDONLY;
+				osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+				spin_unlock(&osb->osb_lock);
+				goto out;
+			}
+		}
 	}
 
 	if (!ret) {
 		/* Only save off the new mount options in case of a successful
 		 * remount. */
+		if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+			parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
 		osb->s_mount_opt = parsed_options.mount_opt;
 		osb->s_atime_quantum = parsed_options.atime_quantum;
 		osb->preferred_slot = parsed_options.slot;
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
 	return 0;
 }
 
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
+{
+	int type;
+	struct super_block *sb = osb->sb;
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	int status = 0;
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		if (unsuspend)
+			status = vfs_quota_enable(
+					sb_dqopt(sb)->files[type],
+					type, QFMT_OCFS2,
+					DQUOT_SUSPENDED);
+		else
+			status = vfs_quota_disable(sb, type,
+						   DQUOT_SUSPENDED);
+		if (status < 0)
+			break;
+	}
+	if (status < 0)
+		mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
+		     "remount (error = %d).\n", status);
+	return status;
+}
+
+static int ocfs2_enable_quotas(struct ocfs2_super *osb)
+{
+	struct inode *inode[MAXQUOTAS] = { NULL, NULL };
+	struct super_block *sb = osb->sb;
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	int status;
+	int type;
+
+	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
+							osb->slot_num);
+		if (!inode[type]) {
+			status = -ENOENT;
+			goto out_quota_off;
+		}
+		status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
+						DQUOT_USAGE_ENABLED);
+		if (status < 0)
+			goto out_quota_off;
+	}
+
+	for (type = 0; type < MAXQUOTAS; type++)
+		iput(inode[type]);
+	return 0;
+out_quota_off:
+	ocfs2_disable_quotas(osb);
+	for (type = 0; type < MAXQUOTAS; type++)
+		iput(inode[type]);
+	mlog_errno(status);
+	return status;
+}
+
+static void ocfs2_disable_quotas(struct ocfs2_super *osb)
+{
+	int type;
+	struct inode *inode;
+	struct super_block *sb = osb->sb;
+
+	/* We mostly ignore errors in this function because there's not much
+	 * we can do when we see them */
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!sb_has_quota_loaded(sb, type))
+			continue;
+		inode = igrab(sb->s_dquot.files[type]);
+		/* Turn off quotas. This will remove all dquot structures from
+		 * memory and so they will be automatically synced to global
+		 * quota files */
+		vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
+					    DQUOT_LIMITS_ENABLED);
+		if (!inode)
+			continue;
+		iput(inode);
+	}
+}
+
+/* Handle quota on quotactl */
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
+			  char *path, int remount)
+{
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+		return -EINVAL;
+
+	if (remount)
+		return 0;	/* Just ignore it has been handled in
+				 * ocfs2_remount() */
+	return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
+				    format_id, DQUOT_LIMITS_ENABLED);
+}
+
+/* Handle quota off quotactl */
+static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+{
+	if (remount)
+		return 0;	/* Ignore now and handle later in
+				 * ocfs2_remount() */
+	return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+}
+
+static struct quotactl_ops ocfs2_quotactl_ops = {
+	.quota_on	= ocfs2_quota_on,
+	.quota_off	= ocfs2_quota_off,
+	.quota_sync	= vfs_quota_sync,
+	.get_info	= vfs_get_dqinfo,
+	.set_info	= vfs_set_dqinfo,
+	.get_dqblk	= vfs_get_dqblk,
+	.set_dqblk	= vfs_set_dqblk,
+};
+
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct dentry *root;
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	brelse(bh);
 	bh = NULL;
+
+	if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+		parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+
 	osb->s_mount_opt = parsed_options.mount_opt;
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->preferred_slot = parsed_options.slot;
 	osb->osb_commit_interval = parsed_options.commit_interval;
 	osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
 	osb->local_alloc_bits = osb->local_alloc_default_bits;
+	if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
+	    !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+					 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "User quotas were requested, but this "
+		     "filesystem does not have the feature enabled.\n");
+		goto read_super_error;
+	}
+	if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+	    !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+					 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Group quotas were requested, but this "
+		     "filesystem does not have the feature enabled.\n");
+		goto read_super_error;
+	}
 
 	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
 	if (status)
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_magic = OCFS2_SUPER_MAGIC;
 
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
 	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
 	 * heartbeat=none */
 	if (bdev_read_only(sb->s_bdev)) {
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
 	wake_up(&osb->osb_mount_event);
 
+	/* Now we can initialize quotas because we can afford to wait
+	 * for cluster locks recovery now. That also means that truncation
+	 * log recovery can happen but that waits for proper quota setup */
+	if (!(sb->s_flags & MS_RDONLY)) {
+		status = ocfs2_enable_quotas(osb);
+		if (status < 0) {
+			/* We have to err-out specially here because
+			 * s_root is already set */
+			mlog_errno(status);
+			atomic_set(&osb->vol_state, VOLUME_DISABLED);
+			wake_up(&osb->osb_mount_event);
+			mlog_exit(status);
+			return status;
+		}
+	}
+
+	ocfs2_complete_quota_recovery(osb);
+
+	/* Now we wake up again for processes waiting for quotas */
+	atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
+	wake_up(&osb->osb_mount_event);
+
 	mlog_exit(status);
 	return status;
 
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb,
 		case Opt_inode64:
 			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
 			break;
+		case Opt_usrquota:
+			/* We check only on remount, otherwise features
+			 * aren't yet initialized. */
+			if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+			    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+				mlog(ML_ERROR, "User quota requested but "
+				     "filesystem feature is not set\n");
+				status = 0;
+				goto bail;
+			}
+			mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
+			break;
+		case Opt_grpquota:
+			if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+			    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+				mlog(ML_ERROR, "Group quota requested but "
+				     "filesystem feature is not set\n");
+				status = 0;
+				goto bail;
+			}
+			mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
+			break;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+		case Opt_acl:
+			mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+			break;
+		case Opt_noacl:
+			mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+			break;
+#else
+		case Opt_acl:
+		case Opt_noacl:
+			printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
+			break;
+#endif
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (osb->osb_cluster_stack[0])
 		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
 			   osb->osb_cluster_stack);
+	if (opts & OCFS2_MOUNT_USRQUOTA)
+		seq_printf(s, ",usrquota");
+	if (opts & OCFS2_MOUNT_GRPQUOTA)
+		seq_printf(s, ",grpquota");
 
 	if (opts & OCFS2_MOUNT_NOUSERXATTR)
 		seq_printf(s, ",nouser_xattr");
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (opts & OCFS2_MOUNT_INODE64)
 		seq_printf(s, ",inode64");
 
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+	if (opts & OCFS2_MOUNT_POSIX_ACL)
+		seq_printf(s, ",acl");
+	else
+		seq_printf(s, ",noacl");
+#endif
+
 	return 0;
 }
 
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void)
 		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
 	}
 
+	status = ocfs2_quota_setup();
+	if (status)
+		goto leave;
+
 	ocfs2_set_locking_protocol();
 
+	status = register_quota_format(&ocfs2_quota_format);
 leave:
 	if (status < 0) {
+		ocfs2_quota_shutdown();
 		ocfs2_free_mem_caches();
 		exit_ocfs2_uptodate_cache();
 	}
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void)
 {
 	mlog_entry_void();
 
+	ocfs2_quota_shutdown();
+
 	if (ocfs2_wq) {
 		flush_workqueue(ocfs2_wq);
 		destroy_workqueue(ocfs2_wq);
 	}
 
+	unregister_quota_format(&ocfs2_quota_format);
+
 	debugfs_remove(ocfs2_debugfs_root);
 
 	ocfs2_free_mem_caches();
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void)
 				       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
 				       ocfs2_inode_init_once);
-	if (!ocfs2_inode_cachep)
+	ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
+					sizeof(struct ocfs2_dquot),
+					0,
+					(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					NULL);
+	ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
+					sizeof(struct ocfs2_quota_chunk),
+					0,
+					(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+					NULL);
+	if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
+	    !ocfs2_qf_chunk_cachep) {
+		if (ocfs2_inode_cachep)
+			kmem_cache_destroy(ocfs2_inode_cachep);
+		if (ocfs2_dquot_cachep)
+			kmem_cache_destroy(ocfs2_dquot_cachep);
+		if (ocfs2_qf_chunk_cachep)
+			kmem_cache_destroy(ocfs2_qf_chunk_cachep);
 		return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void)
 {
 	if (ocfs2_inode_cachep)
 		kmem_cache_destroy(ocfs2_inode_cachep);
-
 	ocfs2_inode_cachep = NULL;
+
+	if (ocfs2_dquot_cachep)
+		kmem_cache_destroy(ocfs2_dquot_cachep);
+	ocfs2_dquot_cachep = NULL;
+
+	if (ocfs2_qf_chunk_cachep)
+		kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+	ocfs2_qf_chunk_cachep = NULL;
 }
 
 static int ocfs2_get_sector(struct super_block *sb,
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	osb = OCFS2_SB(sb);
 	BUG_ON(!osb);
 
+	ocfs2_disable_quotas(osb);
+
 	ocfs2_shutdown_local_alloc(osb);
 
 	ocfs2_truncate_log_shutdown(osb);
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_fs_info = osb;
 	sb->s_op = &ocfs2_sops;
 	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_qcop = &ocfs2_quotactl_ops;
+	sb->dq_op = &ocfs2_quota_operations;
 	sb->s_xattr = ocfs2_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_flags |= MS_NOATIME;
@@ -1676,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 
 	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
 		   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+		/* We have to do a raw check of the feature here */
+		if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
+		    OCFS2_FEATURE_INCOMPAT_META_ECC) {
+			status = ocfs2_block_check_validate(bh->b_data,
+							    bh->b_size,
+							    &di->i_check);
+			if (status)
+				goto out;
+		}
 		status = -EINVAL;
 		if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
 			mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1717,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 		}
 	}
 
+out:
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index cbd03dfdc7b9..ed0a0cfd68d2 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
 
 	mlog_entry_void();
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
+	status = ocfs2_read_inode_block(inode, bh);
 	if (status < 0) {
 		mlog_errno(status);
 		link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d7367ade13..e1d638af6ac3 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -35,12 +35,14 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/security.h>
 
 #define MLOG_MASK_PREFIX ML_XATTR
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "file.h"
 #include "symlink.h"
@@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root {
 };
 
 struct ocfs2_xattr_bucket {
-	struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
-	struct ocfs2_xattr_header *xh;
+	/* The inode these xattrs are associated with */
+	struct inode *bu_inode;
+
+	/* The actual buffers that make up the bucket */
+	struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+
+	/* How many blocks make up one bucket for this filesystem */
+	int bu_blocks;
+};
+
+struct ocfs2_xattr_set_ctxt {
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
 };
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
 #define OCFS2_XATTR_INLINE_SIZE	80
+#define OCFS2_XATTR_FREE_IN_IBODY	(OCFS2_MIN_XATTR_INLINE_SIZE \
+					 - sizeof(struct ocfs2_xattr_header) \
+					 - sizeof(__u32))
+#define OCFS2_XATTR_FREE_IN_BLOCK(ptr)	((ptr)->i_sb->s_blocksize \
+					 - sizeof(struct ocfs2_xattr_block) \
+					 - sizeof(struct ocfs2_xattr_header) \
+					 - sizeof(__u32))
 
 static struct ocfs2_xattr_def_value_root def_xv = {
 	.xv.xr_list.l_count = cpu_to_le16(1),
@@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = {
 
 struct xattr_handler *ocfs2_xattr_handlers[] = {
 	&ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+	&ocfs2_xattr_acl_access_handler,
+	&ocfs2_xattr_acl_default_handler,
+#endif
 	&ocfs2_xattr_trusted_handler,
+	&ocfs2_xattr_security_handler,
 	NULL
 };
 
 static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
+					= &ocfs2_xattr_acl_access_handler,
+	[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
+					= &ocfs2_xattr_acl_default_handler,
+#endif
 	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
+	[OCFS2_XATTR_INDEX_SECURITY]	= &ocfs2_xattr_security_handler,
 };
 
 struct ocfs2_xattr_info {
@@ -98,7 +132,7 @@ struct ocfs2_xattr_search {
 	 */
 	struct buffer_head *xattr_bh;
 	struct ocfs2_xattr_header *header;
-	struct ocfs2_xattr_bucket bucket;
+	struct ocfs2_xattr_bucket *bucket;
 	void *base;
 	void *end;
 	struct ocfs2_xattr_entry *here;
@@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
 					size_t buffer_size);
 
 static int ocfs2_xattr_create_index_block(struct inode *inode,
-					  struct ocfs2_xattr_search *xs);
+					  struct ocfs2_xattr_search *xs,
+					  struct ocfs2_xattr_set_ctxt *ctxt);
 
 static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 					     struct ocfs2_xattr_info *xi,
-					     struct ocfs2_xattr_search *xs);
+					     struct ocfs2_xattr_search *xs,
+					     struct ocfs2_xattr_set_ctxt *ctxt);
 
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
 					  struct buffer_head *xb_bh);
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+				  u64 src_blk, u64 last_blk, u64 to_blk,
+				  unsigned int start_bucket,
+				  u32 *first_hash);
 
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
 	return len / sizeof(struct ocfs2_xattr_entry);
 }
 
+#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
+#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
+#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
+
+static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
+{
+	struct ocfs2_xattr_bucket *bucket;
+	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
+
+	bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS);
+	if (bucket) {
+		bucket->bu_inode = inode;
+		bucket->bu_blocks = blks;
+	}
+
+	return bucket;
+}
+
+static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
+{
+	int i;
+
+	for (i = 0; i < bucket->bu_blocks; i++) {
+		brelse(bucket->bu_bhs[i]);
+		bucket->bu_bhs[i] = NULL;
+	}
+}
+
+static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
+{
+	if (bucket) {
+		ocfs2_xattr_bucket_relse(bucket);
+		bucket->bu_inode = NULL;
+		kfree(bucket);
+	}
+}
+
+/*
+ * A bucket that has never been written to disk doesn't need to be
+ * read.  We just need the buffer_heads.  Don't call this for
+ * buckets that are already on disk.  ocfs2_read_xattr_bucket() initializes
+ * them fully.
+ */
+static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+				   u64 xb_blkno)
+{
+	int i, rc = 0;
+
+	for (i = 0; i < bucket->bu_blocks; i++) {
+		bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
+					      xb_blkno + i);
+		if (!bucket->bu_bhs[i]) {
+			rc = -EIO;
+			mlog_errno(rc);
+			break;
+		}
+
+		if (!ocfs2_buffer_uptodate(bucket->bu_inode,
+					   bucket->bu_bhs[i]))
+			ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
+						      bucket->bu_bhs[i]);
+	}
+
+	if (rc)
+		ocfs2_xattr_bucket_relse(bucket);
+	return rc;
+}
+
+/* Read the xattr bucket at xb_blkno */
+static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+				   u64 xb_blkno)
+{
+	int rc;
+
+	rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
+			       bucket->bu_blocks, bucket->bu_bhs, 0,
+			       NULL);
+	if (!rc) {
+		rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
+						 bucket->bu_bhs,
+						 bucket->bu_blocks,
+						 &bucket_xh(bucket)->xh_check);
+		if (rc)
+			mlog_errno(rc);
+	}
+
+	if (rc)
+		ocfs2_xattr_bucket_relse(bucket);
+	return rc;
+}
+
+static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
+					     struct ocfs2_xattr_bucket *bucket,
+					     int type)
+{
+	int i, rc = 0;
+
+	for (i = 0; i < bucket->bu_blocks; i++) {
+		rc = ocfs2_journal_access(handle, bucket->bu_inode,
+					  bucket->bu_bhs[i], type);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+	}
+
+	return rc;
+}
+
+static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
+					     struct ocfs2_xattr_bucket *bucket)
+{
+	int i;
+
+	ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
+				   bucket->bu_bhs, bucket->bu_blocks,
+				   &bucket_xh(bucket)->xh_check);
+
+	for (i = 0; i < bucket->bu_blocks; i++)
+		ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
+}
+
+static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
+					 struct ocfs2_xattr_bucket *src)
+{
+	int i;
+	int blocksize = src->bu_inode->i_sb->s_blocksize;
+
+	BUG_ON(dest->bu_blocks != src->bu_blocks);
+	BUG_ON(dest->bu_inode != src->bu_inode);
+
+	for (i = 0; i < src->bu_blocks; i++) {
+		memcpy(bucket_block(dest, i), bucket_block(src, i),
+		       blocksize);
+	}
+}
+
+static int ocfs2_validate_xattr_block(struct super_block *sb,
+				      struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)bh->b_data;
+
+	mlog(0, "Validating xattr block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
+	if (rc)
+		return rc;
+
+	/*
+	 * Errors after here are fatal
+	 */
+
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ocfs2_error(sb,
+			    "Extended attribute block #%llu has bad "
+			    "signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    xb->xb_signature);
+		return -EINVAL;
+	}
+
+	if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb,
+			    "Extended attribute block #%llu has an "
+			    "invalid xb_blkno of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(xb->xb_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Extended attribute block #%llu has an invalid "
+			    "xb_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(xb->xb_fs_generation));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
+				  struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(inode, xb_blkno, &tmp,
+			      ocfs2_validate_xattr_block);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
 	return;
 }
 
+static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
+{
+	int size = 0;
+
+	if (value_len <= OCFS2_XATTR_INLINE_SIZE)
+		size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
+	else
+		size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	size += sizeof(struct ocfs2_xattr_entry);
+
+	return size;
+}
+
+int ocfs2_calc_security_init(struct inode *dir,
+			     struct ocfs2_security_xattr_info *si,
+			     int *want_clusters,
+			     int *xattr_credits,
+			     struct ocfs2_alloc_context **xattr_ac)
+{
+	int ret = 0;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	int s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+						 si->value_len);
+
+	/*
+	 * The max space of security xattr taken inline is
+	 * 256(name) + 80(value) + 16(entry) = 352 bytes,
+	 * So reserve one metadata block for it is ok.
+	 */
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+	    s_size > OCFS2_XATTR_FREE_IN_IBODY) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+		*xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
+
+	/* reserve clusters for xattr value which will be set in B tree*/
+	if (si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+							    si->value_len);
+
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
+	}
+	return ret;
+}
+
+int ocfs2_calc_xattr_init(struct inode *dir,
+			  struct buffer_head *dir_bh,
+			  int mode,
+			  struct ocfs2_security_xattr_info *si,
+			  int *want_clusters,
+			  int *xattr_credits,
+			  struct ocfs2_alloc_context **xattr_ac)
+{
+	int ret = 0;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
+
+	if (si->enable)
+		s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+						     si->value_len);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+		acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
+					OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+					"", NULL, 0);
+		if (acl_len > 0) {
+			a_size = ocfs2_xattr_entry_real_size(0, acl_len);
+			if (S_ISDIR(mode))
+				a_size <<= 1;
+		} else if (acl_len != 0 && acl_len != -ENODATA) {
+			mlog_errno(ret);
+			return ret;
+		}
+	}
+
+	if (!(s_size + a_size))
+		return ret;
+
+	/*
+	 * The max space of security xattr taken inline is
+	 * 256(name) + 80(value) + 16(entry) = 352 bytes,
+	 * The max space of acl xattr taken inline is
+	 * 80(value) + 16(entry) * 2(if directory) = 192 bytes,
+	 * when blocksize = 512, may reserve one more cluser for
+	 * xattr bucket, otherwise reserve one metadata block
+	 * for them is ok.
+	 */
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+	    (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+		*xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
+
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
+	    (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
+		*want_clusters += 1;
+		*xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
+	}
+
+	/*
+	 * reserve credits and clusters for xattrs which has large value
+	 * and have to be set outside
+	 */
+	if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+							si->value_len);
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
+	}
+	if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL &&
+	    acl_len > OCFS2_XATTR_INLINE_SIZE) {
+		/* for directory, it has DEFAULT and ACCESS two types of acls */
+		new_clusters = (S_ISDIR(mode) ? 2 : 1) *
+				ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
+	}
+
+	return ret;
+}
+
 static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 u32 clusters_to_add,
-					 struct buffer_head *xattr_bh,
-					 struct ocfs2_xattr_value_root *xv)
+					 struct ocfs2_xattr_value_buf *vb,
+					 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int status = 0;
-	int restart_func = 0;
-	int credits = 0;
-	handle_t *handle = NULL;
-	struct ocfs2_alloc_context *data_ac = NULL;
-	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle = ctxt->handle;
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+	u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
 	struct ocfs2_extent_tree et;
 
 	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
 
-	ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
-
-restart_all:
-
-	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
-				       &data_ac, &meta_ac);
-	if (status) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
-					    clusters_to_add);
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		handle = NULL;
-		mlog_errno(status);
-		goto leave;
-	}
+	ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
 
-restarted_transaction:
-	status = ocfs2_journal_access(handle, inode, xattr_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = vb->vb_access(handle, inode, vb->vb_bh,
+			      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	prev_clusters = le32_to_cpu(xv->xr_clusters);
+	prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
 	status = ocfs2_add_clusters_in_btree(osb,
 					     inode,
 					     &logical_start,
@@ -255,157 +614,84 @@ restarted_transaction:
 					     0,
 					     &et,
 					     handle,
-					     data_ac,
-					     meta_ac,
+					     ctxt->data_ac,
+					     ctxt->meta_ac,
 					     &why);
-	if ((status < 0) && (status != -EAGAIN)) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
+	if (status < 0) {
+		mlog_errno(status);
 		goto leave;
 	}
 
-	status = ocfs2_journal_dirty(handle, xattr_bh);
+	status = ocfs2_journal_dirty(handle, vb->vb_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+	clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
 
-	if (why != RESTART_NONE && clusters_to_add) {
-		if (why == RESTART_META) {
-			mlog(0, "restarting function.\n");
-			restart_func = 1;
-		} else {
-			BUG_ON(why != RESTART_TRANS);
-
-			mlog(0, "restarting transaction.\n");
-			/* TODO: This can be more intelligent. */
-			credits = ocfs2_calc_extend_credits(osb->sb,
-							    et.et_root_el,
-							    clusters_to_add);
-			status = ocfs2_extend_trans(handle, credits);
-			if (status < 0) {
-				/* handle still has to be committed at
-				 * this point. */
-				status = -ENOMEM;
-				mlog_errno(status);
-				goto leave;
-			}
-			goto restarted_transaction;
-		}
-	}
+	/*
+	 * We should have already allocated enough space before the transaction,
+	 * so no need to restart.
+	 */
+	BUG_ON(why != RESTART_NONE || clusters_to_add);
 
 leave:
-	if (handle) {
-		ocfs2_commit_trans(osb, handle);
-		handle = NULL;
-	}
-	if (data_ac) {
-		ocfs2_free_alloc_context(data_ac);
-		data_ac = NULL;
-	}
-	if (meta_ac) {
-		ocfs2_free_alloc_context(meta_ac);
-		meta_ac = NULL;
-	}
-	if ((!status) && restart_func) {
-		restart_func = 0;
-		goto restart_all;
-	}
 
 	return status;
 }
 
 static int __ocfs2_remove_xattr_range(struct inode *inode,
-				      struct buffer_head *root_bh,
-				      struct ocfs2_xattr_value_root *xv,
+				      struct ocfs2_xattr_value_buf *vb,
 				      u32 cpos, u32 phys_cpos, u32 len,
-				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret;
 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct inode *tl_inode = osb->osb_tl_inode;
-	handle_t *handle;
-	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle = ctxt->handle;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
+	ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
 
-	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+	ret = vb->vb_access(handle, inode, vb->vb_bh,
+			    OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		return ret;
-	}
-
-	mutex_lock(&tl_inode->i_mutex);
-
-	if (ocfs2_truncate_log_needs_flush(osb)) {
-		ret = __ocfs2_flush_truncate_log(osb);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
-				  dealloc);
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
+				  &ctxt->dealloc);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
-	le32_add_cpu(&xv->xr_clusters, -len);
+	le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
 
-	ret = ocfs2_journal_dirty(handle, root_bh);
+	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
-	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
 	if (ret)
 		mlog_errno(ret);
 
-out_commit:
-	ocfs2_commit_trans(osb, handle);
 out:
-	mutex_unlock(&tl_inode->i_mutex);
-
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
-
 	return ret;
 }
 
 static int ocfs2_xattr_shrink_size(struct inode *inode,
 				   u32 old_clusters,
 				   u32 new_clusters,
-				   struct buffer_head *root_bh,
-				   struct ocfs2_xattr_value_root *xv)
+				   struct ocfs2_xattr_value_buf *vb,
+				   struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret = 0;
 	u32 trunc_len, cpos, phys_cpos, alloc_size;
 	u64 block;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_cached_dealloc_ctxt dealloc;
-
-	ocfs2_init_dealloc_ctxt(&dealloc);
 
 	if (old_clusters <= new_clusters)
 		return 0;
@@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 	trunc_len = old_clusters - new_clusters;
 	while (trunc_len) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
-					       &alloc_size, &xv->xr_list);
+					       &alloc_size,
+					       &vb->vb_xv->xr_list);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 		if (alloc_size > trunc_len)
 			alloc_size = trunc_len;
 
-		ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+		ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
 						 phys_cpos, alloc_size,
-						 &dealloc);
+						 ctxt);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 	}
 
 out:
-	ocfs2_schedule_truncate_log_flush(osb, 1);
-	ocfs2_run_deallocs(osb, &dealloc);
-
 	return ret;
 }
 
 static int ocfs2_xattr_value_truncate(struct inode *inode,
-				      struct buffer_head *root_bh,
-				      struct ocfs2_xattr_value_root *xv,
-				      int len)
+				      struct ocfs2_xattr_value_buf *vb,
+				      int len,
+				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret;
 	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
-	u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+	u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
 
 	if (new_clusters == old_clusters)
 		return 0;
@@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
 	if (new_clusters > old_clusters)
 		ret = ocfs2_xattr_extend_allocation(inode,
 						    new_clusters - old_clusters,
-						    root_bh, xv);
+						    vb, ctxt);
 	else
 		ret = ocfs2_xattr_shrink_size(inode,
 					      old_clusters, new_clusters,
-					      root_bh, xv);
+					      vb, ctxt);
 
 	return ret;
 }
@@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 	if (!di->i_xattr_loc)
 		return ret;
 
-	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+	ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-		ret = -EIO;
-		goto cleanup;
-	}
-
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
 		ret = ocfs2_xattr_list_entries(inode, header,
@@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 		ret = ocfs2_xattr_tree_list_index_block(inode, xt,
 						   buffer, buffer_size);
 	}
-cleanup:
+
 	brelse(blk_bh);
 
 	return ret;
@@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 		/* Copy ocfs2_xattr_value */
 		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-			ret = ocfs2_read_block(inode, blkno, &bh);
+			ret = ocfs2_read_block(inode, blkno, &bh, NULL);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	size_t size;
 	int ret = -ENODATA, name_offset, name_len, block_off, i;
 
-	memset(&xs->bucket, 0, sizeof(xs->bucket));
+	xs->bucket = ocfs2_xattr_bucket_new(inode);
+	if (!xs->bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto cleanup;
+	}
 
 	ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
 	if (ret) {
@@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
 			ret = ocfs2_xattr_bucket_get_name_value(inode,
-								xs->bucket.xh,
+								bucket_xh(xs->bucket),
 								i,
 								&block_off,
 								&name_offset);
-			xs->base = xs->bucket.bhs[block_off]->b_data;
+			xs->base = bucket_block(xs->bucket, block_off);
 		}
 		if (ocfs2_xattr_is_local(xs->here)) {
 			memcpy(buffer, (void *)xs->base +
@@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	}
 	ret = size;
 cleanup:
-	for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
-		brelse(xs->bucket.bhs[i]);
-	memset(&xs->bucket, 0, sizeof(xs->bucket));
+	ocfs2_xattr_bucket_free(xs->bucket);
 
 	brelse(xs->xattr_bh);
 	xs->xattr_bh = NULL;
 	return ret;
 }
 
-/* ocfs2_xattr_get()
- *
- * Copy an extended attribute into the buffer provided.
- * Buffer is NULL to compute the size of buffer required.
- */
-static int ocfs2_xattr_get(struct inode *inode,
+int ocfs2_xattr_get_nolock(struct inode *inode,
+			   struct buffer_head *di_bh,
 			   int name_index,
 			   const char *name,
 			   void *buffer,
@@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode,
 {
 	int ret;
 	struct ocfs2_dinode *di = NULL;
-	struct buffer_head *di_bh = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_xattr_search xis = {
 		.not_found = -ENODATA,
@@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode,
 	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
 		ret = -ENODATA;
 
-	ret = ocfs2_inode_lock(inode, &di_bh, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
 	xis.inode_bh = xbs.inode_bh = di_bh;
 	di = (struct ocfs2_dinode *)di_bh->b_data;
 
@@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode,
 		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
 					    buffer_size, &xbs);
 	up_read(&oi->ip_xattr_sem);
+
+	return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+static int ocfs2_xattr_get(struct inode *inode,
+			   int name_index,
+			   const char *name,
+			   void *buffer,
+			   size_t buffer_size)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+				     name, buffer, buffer_size);
+
 	ocfs2_inode_unlock(inode, 0);
 
 	brelse(di_bh);
@@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode,
 }
 
 static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+					   handle_t *handle,
 					   struct ocfs2_xattr_value_root *xv,
 					   const void *value,
 					   int value_len)
 {
-	int ret = 0, i, cp_len, credits;
+	int ret = 0, i, cp_len;
 	u16 blocksize = inode->i_sb->s_blocksize;
 	u32 p_cluster, num_clusters;
 	u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 	u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
 	u64 blkno;
 	struct buffer_head *bh = NULL;
-	handle_t *handle;
 
 	BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
 
-	credits = clusters * bpc;
-	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
 	while (cpos < clusters) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
 					       &num_clusters, &xv->xr_list);
 		if (ret) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto out;
 		}
 
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 
 		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-			ret = ocfs2_read_block(inode, blkno, &bh);
+			ret = ocfs2_read_block(inode, blkno, &bh, NULL);
 			if (ret) {
 				mlog_errno(ret);
-				goto out_commit;
+				goto out;
 			}
 
 			ret = ocfs2_journal_access(handle,
@@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 						   OCFS2_JOURNAL_ACCESS_WRITE);
 			if (ret < 0) {
 				mlog_errno(ret);
-				goto out_commit;
+				goto out;
 			}
 
 			cp_len = value_len > blocksize ? blocksize : value_len;
@@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 			ret = ocfs2_journal_dirty(handle, bh);
 			if (ret < 0) {
 				mlog_errno(ret);
-				goto out_commit;
+				goto out;
 			}
 			brelse(bh);
 			bh = NULL;
@@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 		}
 		cpos += num_clusters;
 	}
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	brelse(bh);
 
@@ -960,28 +1249,22 @@ out:
 }
 
 static int ocfs2_xattr_cleanup(struct inode *inode,
+			       handle_t *handle,
 			       struct ocfs2_xattr_info *xi,
 			       struct ocfs2_xattr_search *xs,
+			       struct ocfs2_xattr_value_buf *vb,
 			       size_t offs)
 {
-	handle_t *handle = NULL;
 	int ret = 0;
 	size_t name_len = strlen(xi->name);
 	void *val = xs->base + offs;
 	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
-				   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = vb->vb_access(handle, inode, vb->vb_bh,
+			    OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 	/* Decrease xattr count */
 	le16_add_cpu(&xs->header->xh_count, -1);
@@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
 	memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
 	memset(val, 0, size);
 
-	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
 	if (ret < 0)
 		mlog_errno(ret);
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	return ret;
 }
 
 static int ocfs2_xattr_update_entry(struct inode *inode,
+				    handle_t *handle,
 				    struct ocfs2_xattr_info *xi,
 				    struct ocfs2_xattr_search *xs,
+				    struct ocfs2_xattr_value_buf *vb,
 				    size_t offs)
 {
-	handle_t *handle = NULL;
-	int ret = 0;
+	int ret;
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
-				   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = vb->vb_access(handle, inode, vb->vb_bh,
+			    OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	xs->here->xe_name_offset = cpu_to_le16(offs);
@@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
 		ocfs2_xattr_set_local(xs->here, 0);
 	ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
 
-	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
 	if (ret < 0)
 		mlog_errno(ret);
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	return ret;
 }
@@ -1045,6 +1318,8 @@ out:
 static int ocfs2_xattr_set_value_outside(struct inode *inode,
 					 struct ocfs2_xattr_info *xi,
 					 struct ocfs2_xattr_search *xs,
+					 struct ocfs2_xattr_set_ctxt *ctxt,
+					 struct ocfs2_xattr_value_buf *vb,
 					 size_t offs)
 {
 	size_t name_len = strlen(xi->name);
@@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 	xv->xr_list.l_tree_depth = 0;
 	xv->xr_list.l_count = cpu_to_le16(1);
 	xv->xr_list.l_next_free_rec = 0;
+	vb->vb_xv = xv;
 
-	ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
-					 xi->value_len);
+	ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
-	ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
-					      xi->value_len);
+	ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
-	ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
+					      xi->value, xi->value_len);
 	if (ret < 0)
 		mlog_errno(ret);
 
@@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 static int ocfs2_xattr_set_entry(struct inode *inode,
 				 struct ocfs2_xattr_info *xi,
 				 struct ocfs2_xattr_search *xs,
+				 struct ocfs2_xattr_set_ctxt *ctxt,
 				 int flag)
 {
 	struct ocfs2_xattr_entry *last;
@@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
 	size_t size_l = 0;
-	handle_t *handle = NULL;
+	handle_t *handle = ctxt->handle;
 	int free, i, ret;
 	struct ocfs2_xattr_info xi_l = {
 		.name_index = xi->name_index,
@@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		.value = xi->value,
 		.value_len = xi->value_len,
 	};
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = xs->xattr_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
+
+	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+		BUG_ON(xs->xattr_bh == xs->inode_bh);
+		vb.vb_access = ocfs2_journal_access_xb;
+	} else
+		BUG_ON(xs->xattr_bh != xs->inode_bh);
 
 	/* Compute min_offs, last and free space. */
 	last = xs->header->xh_entries;
@@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
 			/* Replace existing local xattr with tree root */
 			ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
-							    offs);
+							    ctxt, &vb, offs);
 			if (ret < 0)
 				mlog_errno(ret);
 			goto out;
 		} else if (!ocfs2_xattr_is_local(xs->here)) {
 			/* For existing xattr which has value outside */
-			struct ocfs2_xattr_value_root *xv = NULL;
-			xv = (struct ocfs2_xattr_value_root *)(val +
-				OCFS2_XATTR_SIZE(name_len));
+			vb.vb_xv = (struct ocfs2_xattr_value_root *)
+				(val + OCFS2_XATTR_SIZE(name_len));
 
 			if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
 				/*
@@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 				 * then set new value with set_value_outside().
 				 */
 				ret = ocfs2_xattr_value_truncate(inode,
-								 xs->xattr_bh,
-								 xv,
-								 xi->value_len);
+								 &vb,
+								 xi->value_len,
+								 ctxt);
 				if (ret < 0) {
 					mlog_errno(ret);
 					goto out;
 				}
 
-				ret = __ocfs2_xattr_set_value_outside(inode,
-								xv,
-								xi->value,
-								xi->value_len);
+				ret = ocfs2_xattr_update_entry(inode,
+							       handle,
+							       xi,
+							       xs,
+							       &vb,
+							       offs);
 				if (ret < 0) {
 					mlog_errno(ret);
 					goto out;
 				}
 
-				ret = ocfs2_xattr_update_entry(inode,
-							       xi,
-							       xs,
-							       offs);
+				ret = __ocfs2_xattr_set_value_outside(inode,
+								handle,
+								vb.vb_xv,
+								xi->value,
+								xi->value_len);
 				if (ret < 0)
 					mlog_errno(ret);
 				goto out;
@@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 				 * just trucate old value to zero.
 				 */
 				 ret = ocfs2_xattr_value_truncate(inode,
-								 xs->xattr_bh,
-								 xv,
-								 0);
+								  &vb,
+								  0,
+								  ctxt);
 				if (ret < 0)
 					mlog_errno(ret);
 			}
 		}
 	}
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
-				   OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-		/* set extended attribute in external block. */
-		ret = ocfs2_extend_trans(handle,
-					 OCFS2_INODE_UPDATE_CREDITS +
-					 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_commit;
-		}
-		ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = vb.vb_access(handle, inode, vb.vb_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto out;
 		}
 	}
 
@@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto out;
 		}
 	}
 
@@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	oi->ip_dyn_features |= flag;
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
-	/* Update inode ctime */
-	inode->i_ctime = CURRENT_TIME;
-	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
-	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
 	ret = ocfs2_journal_dirty(handle, xs->inode_bh);
 	if (ret < 0)
 		mlog_errno(ret);
 
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
-
 	if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
 		/*
 		 * Set value outside in B tree.
 		 * This is the second step for value size > INLINE_SIZE.
 		 */
 		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
-		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
+						    &vb, offs);
 		if (ret < 0) {
 			int ret2;
 
@@ -1418,41 +1684,56 @@ out_commit:
 			 * If set value outside failed, we have to clean
 			 * the junk tree root we have already set in local.
 			 */
-			ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+			ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
+						   xi, xs, &vb, offs);
 			if (ret2 < 0)
 				mlog_errno(ret2);
 		}
 	}
 out:
 	return ret;
-
 }
 
 static int ocfs2_remove_value_outside(struct inode*inode,
-				      struct buffer_head *bh,
+				      struct ocfs2_xattr_value_buf *vb,
 				      struct ocfs2_xattr_header *header)
 {
 	int ret = 0, i;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+
+	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+
+	ctxt.handle = ocfs2_start_trans(osb,
+					ocfs2_remove_extent_credits(osb->sb));
+	if (IS_ERR(ctxt.handle)) {
+		ret = PTR_ERR(ctxt.handle);
+		mlog_errno(ret);
+		goto out;
+	}
 
 	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
 		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
 
 		if (!ocfs2_xattr_is_local(entry)) {
-			struct ocfs2_xattr_value_root *xv;
 			void *val;
 
 			val = (void *)header +
 				le16_to_cpu(entry->xe_name_offset);
-			xv = (struct ocfs2_xattr_value_root *)
+			vb->vb_xv = (struct ocfs2_xattr_value_root *)
 				(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
-			ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+			ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
 			if (ret < 0) {
 				mlog_errno(ret);
-				return ret;
+				break;
 			}
 		}
 	}
 
+	ocfs2_commit_trans(osb, ctxt.handle);
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &ctxt.dealloc);
+out:
 	return ret;
 }
 
@@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_xattr_header *header;
 	int ret;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = di_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
 
 	header = (struct ocfs2_xattr_header *)
 		 ((void *)di + inode->i_sb->s_blocksize -
 		 le16_to_cpu(di->i_xattr_inline_size));
 
-	ret = ocfs2_remove_value_outside(inode, di_bh, header);
+	ret = ocfs2_remove_value_outside(inode, &vb, header);
 
 	return ret;
 }
@@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 {
 	struct ocfs2_xattr_block *xb;
 	int ret = 0;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = blk_bh,
+		.vb_access = ocfs2_journal_access_xb,
+	};
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
-		ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+		ret = ocfs2_remove_value_outside(inode, &vb, header);
 	} else
 		ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
 
@@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 	u64 blk, bg_blkno;
 	u16 bit;
 
-	ret = ocfs2_read_block(inode, block, &blk_bh);
+	ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-		ret = -EIO;
-		goto out;
-	}
-
 	ret = ocfs2_xattr_block_remove(inode, blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	blk = le64_to_cpu(xb->xb_blkno);
 	bit = le16_to_cpu(xb->xb_suballoc_bit);
 	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 		mlog_errno(ret);
 		goto out;
 	}
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
  */
 static int ocfs2_xattr_ibody_set(struct inode *inode,
 				 struct ocfs2_xattr_info *xi,
-				 struct ocfs2_xattr_search *xs)
+				 struct ocfs2_xattr_search *xs,
+				 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
@@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 		}
 	}
 
-	ret = ocfs2_xattr_set_entry(inode, xi, xs,
+	ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
 				(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
 out:
 	up_write(&oi->ip_alloc_sem);
@@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 	if (!di->i_xattr_loc)
 		return ret;
 
-	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+	ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
 
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-		ret = -EIO;
-		goto cleanup;
-	}
-
 	xs->xattr_bh = blk_bh;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		xs->header = &xb->xb_attrs.xb_header;
@@ -1804,13 +2085,13 @@ cleanup:
  */
 static int ocfs2_xattr_block_set(struct inode *inode,
 				 struct ocfs2_xattr_info *xi,
-				 struct ocfs2_xattr_search *xs)
+				 struct ocfs2_xattr_search *xs,
+				 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	struct ocfs2_alloc_context *meta_ac = NULL;
-	handle_t *handle = NULL;
+	handle_t *handle = ctxt->handle;
 	struct ocfs2_xattr_block *xblk = NULL;
 	u16 suballoc_bit_start;
 	u32 num_got;
@@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 	int ret;
 
 	if (!xs->xattr_bh) {
-		/*
-		 * Alloc one external block for extended attribute
-		 * outside of inode.
-		 */
-		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+		ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out;
-		}
-		handle = ocfs2_start_trans(osb,
-					   OCFS2_XATTR_BLOCK_CREATE_CREDITS);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			mlog_errno(ret);
-			goto out;
-		}
-		ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
-					   OCFS2_JOURNAL_ACCESS_CREATE);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out_commit;
+			goto end;
 		}
 
-		ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+		ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
 					   &suballoc_bit_start, &num_got,
 					   &first_blkno);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto end;
 		}
 
 		new_bh = sb_getblk(inode->i_sb, first_blkno);
 		ocfs2_set_new_buffer_uptodate(inode, new_bh);
 
-		ret = ocfs2_journal_access(handle, inode, new_bh,
-					   OCFS2_JOURNAL_ACCESS_CREATE);
+		ret = ocfs2_journal_access_xb(handle, inode, new_bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto end;
 		}
 
 		/* Initialize ocfs2_xattr_block */
@@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 		xs->end = (void *)xblk + inode->i_sb->s_blocksize;
 		xs->here = xs->header->xh_entries;
 
-
 		ret = ocfs2_journal_dirty(handle, new_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto end;
 		}
 		di->i_xattr_loc = cpu_to_le64(first_blkno);
-		ret = ocfs2_journal_dirty(handle, xs->inode_bh);
-		if (ret < 0)
-			mlog_errno(ret);
-out_commit:
-		ocfs2_commit_trans(osb, handle);
-out:
-		if (meta_ac)
-			ocfs2_free_alloc_context(meta_ac);
-		if (ret < 0)
-			return ret;
+		ocfs2_journal_dirty(handle, xs->inode_bh);
 	} else
 		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 
 	if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		/* Set extended attribute into external block */
-		ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+		ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
+					    OCFS2_HAS_XATTR_FL);
 		if (!ret || ret != -ENOSPC)
 			goto end;
 
-		ret = ocfs2_xattr_create_index_block(inode, xs);
+		ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
 		if (ret)
 			goto end;
 	}
 
-	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
 
 end:
 
 	return ret;
 }
 
+/* Check whether the new xattr can be inserted into the inode. */
+static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
+				       struct ocfs2_xattr_info *xi,
+				       struct ocfs2_xattr_search *xs)
+{
+	u64 value_size;
+	struct ocfs2_xattr_entry *last;
+	int free, i;
+	size_t min_offs = xs->end - xs->base;
+
+	if (!xs->header)
+		return 0;
+
+	last = xs->header->xh_entries;
+
+	for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+		size_t offs = le16_to_cpu(last->xe_name_offset);
+		if (offs < min_offs)
+			min_offs = offs;
+		last += 1;
+	}
+
+	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	if (free < 0)
+		return 0;
+
+	BUG_ON(!xs->not_found);
+
+	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+		value_size = OCFS2_XATTR_ROOT_SIZE;
+	else
+		value_size = OCFS2_XATTR_SIZE(xi->value_len);
+
+	if (free >= sizeof(struct ocfs2_xattr_entry) +
+		   OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
+		return 1;
+
+	return 0;
+}
+
+static int ocfs2_calc_xattr_set_need(struct inode *inode,
+				     struct ocfs2_dinode *di,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xis,
+				     struct ocfs2_xattr_search *xbs,
+				     int *clusters_need,
+				     int *meta_need,
+				     int *credits_need)
+{
+	int ret = 0, old_in_xb = 0;
+	int clusters_add = 0, meta_add = 0, credits = 0;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_xattr_block *xb = NULL;
+	struct ocfs2_xattr_entry *xe = NULL;
+	struct ocfs2_xattr_value_root *xv = NULL;
+	char *base = NULL;
+	int name_offset, name_len = 0;
+	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+						    xi->value_len);
+	u64 value_size;
+
+	/*
+	 * Calculate the clusters we need to write.
+	 * No matter whether we replace an old one or add a new one,
+	 * we need this for writing.
+	 */
+	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+		credits += new_clusters *
+			   ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
+	if (xis->not_found && xbs->not_found) {
+		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+			clusters_add += new_clusters;
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							&def_xv.xv.xr_list,
+							new_clusters);
+		}
+
+		goto meta_guess;
+	}
+
+	if (!xis->not_found) {
+		xe = xis->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		base = xis->base;
+		credits += OCFS2_INODE_UPDATE_CREDITS;
+	} else {
+		int i, block_off = 0;
+		xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+		xe = xbs->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		i = xbs->here - xbs->header->xh_entries;
+		old_in_xb = 1;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+							bucket_xh(xbs->bucket),
+							i, &block_off,
+							&name_offset);
+			base = bucket_block(xbs->bucket, block_off);
+			credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		} else {
+			base = xbs->base;
+			credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
+		}
+	}
+
+	/*
+	 * delete a xattr doesn't need metadata and cluster allocation.
+	 * so just calculate the credits and return.
+	 *
+	 * The credits for removing the value tree will be extended
+	 * by ocfs2_remove_extent itself.
+	 */
+	if (!xi->value) {
+		if (!ocfs2_xattr_is_local(xe))
+			credits += ocfs2_remove_extent_credits(inode->i_sb);
+
+		goto out;
+	}
+
+	/* do cluster allocation guess first. */
+	value_size = le64_to_cpu(xe->xe_value_size);
+
+	if (old_in_xb) {
+		/*
+		 * In xattr set, we always try to set the xe in inode first,
+		 * so if it can be inserted into inode successfully, the old
+		 * one will be removed from the xattr block, and this xattr
+		 * will be inserted into inode as a new xattr in inode.
+		 */
+		if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
+			clusters_add += new_clusters;
+			credits += ocfs2_remove_extent_credits(inode->i_sb) +
+				    OCFS2_INODE_UPDATE_CREDITS;
+			if (!ocfs2_xattr_is_local(xe))
+				credits += ocfs2_calc_extend_credits(
+							inode->i_sb,
+							&def_xv.xv.xr_list,
+							new_clusters);
+			goto out;
+		}
+	}
+
+	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/* the new values will be stored outside. */
+		u32 old_clusters = 0;
+
+		if (!ocfs2_xattr_is_local(xe)) {
+			old_clusters =	ocfs2_clusters_for_bytes(inode->i_sb,
+								 value_size);
+			xv = (struct ocfs2_xattr_value_root *)
+			     (base + name_offset + name_len);
+			value_size = OCFS2_XATTR_ROOT_SIZE;
+		} else
+			xv = &def_xv.xv;
+
+		if (old_clusters >= new_clusters) {
+			credits += ocfs2_remove_extent_credits(inode->i_sb);
+			goto out;
+		} else {
+			meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
+			clusters_add += new_clusters - old_clusters;
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							     &xv->xr_list,
+							     new_clusters -
+							     old_clusters);
+			if (value_size >= OCFS2_XATTR_ROOT_SIZE)
+				goto out;
+		}
+	} else {
+		/*
+		 * Now the new value will be stored inside. So if the new
+		 * value is smaller than the size of value root or the old
+		 * value, we don't need any allocation, otherwise we have
+		 * to guess metadata allocation.
+		 */
+		if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
+		    (!ocfs2_xattr_is_local(xe) &&
+		     OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
+			goto out;
+	}
+
+meta_guess:
+	/* calculate metadata allocation. */
+	if (di->i_xattr_loc) {
+		if (!xbs->xattr_bh) {
+			ret = ocfs2_read_xattr_block(inode,
+						     le64_to_cpu(di->i_xattr_loc),
+						     &bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			xb = (struct ocfs2_xattr_block *)bh->b_data;
+		} else
+			xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+
+		/*
+		 * If there is already an xattr tree, good, we can calculate
+		 * like other b-trees. Otherwise we may have the chance of
+		 * create a tree, the credit calculation is borrowed from
+		 * ocfs2_calc_extend_credits with root_el = NULL. And the
+		 * new tree will be cluster based, so no meta is needed.
+		 */
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			struct ocfs2_extent_list *el =
+				 &xb->xb_attrs.xb_root.xt_list;
+			meta_add += ocfs2_extend_meta_needed(el);
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							     el, 1);
+		} else
+			credits += OCFS2_SUBALLOC_ALLOC + 1;
+
+		/*
+		 * This cluster will be used either for new bucket or for
+		 * new xattr block.
+		 * If the cluster size is the same as the bucket size, one
+		 * more is needed since we may need to extend the bucket
+		 * also.
+		 */
+		clusters_add += 1;
+		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		if (OCFS2_XATTR_BUCKET_SIZE ==
+			OCFS2_SB(inode->i_sb)->s_clustersize) {
+			credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+			clusters_add += 1;
+		}
+	} else {
+		meta_add += 1;
+		credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
+out:
+	if (clusters_need)
+		*clusters_need = clusters_add;
+	if (meta_need)
+		*meta_need = meta_add;
+	if (credits_need)
+		*credits_need = credits;
+	brelse(bh);
+	return ret;
+}
+
+static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
+				     struct ocfs2_dinode *di,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xis,
+				     struct ocfs2_xattr_search *xbs,
+				     struct ocfs2_xattr_set_ctxt *ctxt,
+				     int *credits)
+{
+	int clusters_add, meta_add, ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
+
+	ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
+
+	ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
+					&clusters_add, &meta_add, credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
+	     "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
+
+	if (meta_add) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
+							&ctxt->meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (clusters_add) {
+		ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (ctxt->meta_ac) {
+			ocfs2_free_alloc_context(ctxt->meta_ac);
+			ctxt->meta_ac = NULL;
+		}
+
+		/*
+		 * We cannot have an error and a non null ctxt->data_ac.
+		 */
+	}
+
+	return ret;
+}
+
+static int __ocfs2_xattr_set_handle(struct inode *inode,
+				    struct ocfs2_dinode *di,
+				    struct ocfs2_xattr_info *xi,
+				    struct ocfs2_xattr_search *xis,
+				    struct ocfs2_xattr_search *xbs,
+				    struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret = 0, credits, old_found;
+
+	if (!xi->value) {
+		/* Remove existing extended attribute */
+		if (!xis->not_found)
+			ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+		else if (!xbs->not_found)
+			ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+	} else {
+		/* We always try to set extended attribute into inode first*/
+		ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+		if (!ret && !xbs->not_found) {
+			/*
+			 * If succeed and that extended attribute existing in
+			 * external block, then we will remove it.
+			 */
+			xi->value = NULL;
+			xi->value_len = 0;
+
+			old_found = xis->not_found;
+			xis->not_found = -ENODATA;
+			ret = ocfs2_calc_xattr_set_need(inode,
+							di,
+							xi,
+							xis,
+							xbs,
+							NULL,
+							NULL,
+							&credits);
+			xis->not_found = old_found;
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_extend_trans(ctxt->handle, credits +
+					ctxt->handle->h_buffer_credits);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+			ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+		} else if (ret == -ENOSPC) {
+			if (di->i_xattr_loc && !xbs->xattr_bh) {
+				ret = ocfs2_xattr_block_find(inode,
+							     xi->name_index,
+							     xi->name, xbs);
+				if (ret)
+					goto out;
+
+				old_found = xis->not_found;
+				xis->not_found = -ENODATA;
+				ret = ocfs2_calc_xattr_set_need(inode,
+								di,
+								xi,
+								xis,
+								xbs,
+								NULL,
+								NULL,
+								&credits);
+				xis->not_found = old_found;
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = ocfs2_extend_trans(ctxt->handle, credits +
+					ctxt->handle->h_buffer_credits);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+			}
+			/*
+			 * If no space in inode, we will set extended attribute
+			 * into external block.
+			 */
+			ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+			if (ret)
+				goto out;
+			if (!xis->not_found) {
+				/*
+				 * If succeed and that extended attribute
+				 * existing in inode, we will remove it.
+				 */
+				xi->value = NULL;
+				xi->value_len = 0;
+				xbs->not_found = -ENODATA;
+				ret = ocfs2_calc_xattr_set_need(inode,
+								di,
+								xi,
+								xis,
+								xbs,
+								NULL,
+								NULL,
+								&credits);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = ocfs2_extend_trans(ctxt->handle, credits +
+						ctxt->handle->h_buffer_credits);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+				ret = ocfs2_xattr_ibody_set(inode, xi,
+							    xis, ctxt);
+			}
+		}
+	}
+
+	if (!ret) {
+		/* Update inode ctime. */
+		ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		inode->i_ctime = CURRENT_TIME;
+		di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+		di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+		ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
+	}
+out:
+	return ret;
+}
+
+/*
+ * This function only called duing creating inode
+ * for init security/acl xattrs of the new inode.
+ * All transanction credits have been reserved in mknod.
+ */
+int ocfs2_xattr_set_handle(handle_t *handle,
+			   struct inode *inode,
+			   struct buffer_head *di_bh,
+			   int name_index,
+			   const char *name,
+			   const void *value,
+			   size_t value_len,
+			   int flags,
+			   struct ocfs2_alloc_context *meta_ac,
+			   struct ocfs2_alloc_context *data_ac)
+{
+	struct ocfs2_dinode *di;
+	int ret;
+
+	struct ocfs2_xattr_info xi = {
+		.name_index = name_index,
+		.name = name,
+		.value = value,
+		.value_len = value_len,
+	};
+
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	struct ocfs2_xattr_set_ctxt ctxt = {
+		.handle = handle,
+		.meta_ac = meta_ac,
+		.data_ac = data_ac,
+	};
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	/*
+	 * In extreme situation, may need xattr bucket when
+	 * block size is too small. And we have already reserved
+	 * the credits for bucket in mknod.
+	 */
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
+		xbs.bucket = ocfs2_xattr_bucket_new(inode);
+		if (!xbs.bucket) {
+			mlog_errno(-ENOMEM);
+			return -ENOMEM;
+		}
+	}
+
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+	ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+	if (ret)
+		goto cleanup;
+	if (xis.not_found) {
+		ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+		if (ret)
+			goto cleanup;
+	}
+
+	ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+
+cleanup:
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+	brelse(xbs.xattr_bh);
+	ocfs2_xattr_bucket_free(xbs.bucket);
+
+	return ret;
+}
+
 /*
  * ocfs2_xattr_set()
  *
@@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode,
 {
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
-	int ret;
-	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
 
 	struct ocfs2_xattr_info xi = {
 		.name_index = name_index,
@@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode,
 	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
 		return -EOPNOTSUPP;
 
+	/*
+	 * Only xbs will be used on indexed trees.  xis doesn't need a
+	 * bucket.
+	 */
+	xbs.bucket = ocfs2_xattr_bucket_new(inode);
+	if (!xbs.bucket) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (ret < 0) {
 		mlog_errno(ret);
-		return ret;
+		goto cleanup_nolock;
 	}
 	xis.inode_bh = xbs.inode_bh = di_bh;
 	di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode,
 			goto cleanup;
 	}
 
-	if (!value) {
-		/* Remove existing extended attribute */
-		if (!xis.not_found)
-			ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
-		else if (!xbs.not_found)
-			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
-	} else {
-		/* We always try to set extended attribute into inode first*/
-		ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
-		if (!ret && !xbs.not_found) {
-			/*
-			 * If succeed and that extended attribute existing in
-			 * external block, then we will remove it.
-			 */
-			xi.value = NULL;
-			xi.value_len = 0;
-			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
-		} else if (ret == -ENOSPC) {
-			if (di->i_xattr_loc && !xbs.xattr_bh) {
-				ret = ocfs2_xattr_block_find(inode, name_index,
-							     name, &xbs);
-				if (ret)
-					goto cleanup;
-			}
-			/*
-			 * If no space in inode, we will set extended attribute
-			 * into external block.
-			 */
-			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
-			if (ret)
-				goto cleanup;
-			if (!xis.not_found) {
-				/*
-				 * If succeed and that extended attribute
-				 * existing in inode, we will remove it.
-				 */
-				xi.value = NULL;
-				xi.value_len = 0;
-				ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
-			}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mutex_unlock(&tl_inode->i_mutex);
+			mlog_errno(ret);
+			goto cleanup;
 		}
 	}
+	mutex_unlock(&tl_inode->i_mutex);
+
+	ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
+					&xbs, &ctxt, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto cleanup;
+	}
+
+	/* we need to update inode's ctime field, so add credit for it. */
+	credits += OCFS2_INODE_UPDATE_CREDITS;
+	ctxt.handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(ctxt.handle)) {
+		ret = PTR_ERR(ctxt.handle);
+		mlog_errno(ret);
+		goto cleanup;
+	}
+
+	ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+
+	ocfs2_commit_trans(osb, ctxt.handle);
+
+	if (ctxt.data_ac)
+		ocfs2_free_alloc_context(ctxt.data_ac);
+	if (ctxt.meta_ac)
+		ocfs2_free_alloc_context(ctxt.meta_ac);
+	if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &ctxt.dealloc);
 cleanup:
 	up_write(&OCFS2_I(inode)->ip_xattr_sem);
 	ocfs2_inode_unlock(inode, 1);
+cleanup_nolock:
 	brelse(di_bh);
 	brelse(xbs.xattr_bh);
-	for (i = 0; i < blk_per_bucket; i++)
-		brelse(xbs.bucket.bhs[i]);
+	ocfs2_xattr_bucket_free(xbs.bucket);
 
 	return ret;
 }
@@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode,
 				void *para);
 
 static int ocfs2_find_xe_in_bucket(struct inode *inode,
-				   struct buffer_head *header_bh,
+				   struct ocfs2_xattr_bucket *bucket,
 				   int name_index,
 				   const char *name,
 				   u32 name_hash,
@@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
 				   int *found)
 {
 	int i, ret = 0, cmp = 1, block_off, new_offset;
-	struct ocfs2_xattr_header *xh =
-			(struct ocfs2_xattr_header *)header_bh->b_data;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	size_t name_len = strlen(name);
 	struct ocfs2_xattr_entry *xe = NULL;
-	struct buffer_head *name_bh = NULL;
 	char *xe_name;
 
 	/*
@@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
 			break;
 		}
 
-		ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
-				       &name_bh);
-		if (ret) {
-			mlog_errno(ret);
-			break;
-		}
-		xe_name = name_bh->b_data + new_offset;
 
-		cmp = memcmp(name, xe_name, name_len);
-		brelse(name_bh);
-		name_bh = NULL;
-
-		if (cmp == 0) {
+		xe_name = bucket_block(bucket, block_off) + new_offset;
+		if (!memcmp(name, xe_name, name_len)) {
 			*xe_index = i;
 			*found = 1;
 			ret = 0;
@@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 				   struct ocfs2_xattr_search *xs)
 {
 	int ret, found = 0;
-	struct buffer_head *bh = NULL;
-	struct buffer_head *lower_bh = NULL;
 	struct ocfs2_xattr_header *xh = NULL;
 	struct ocfs2_xattr_entry *xe = NULL;
 	u16 index = 0;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int low_bucket = 0, bucket, high_bucket;
+	struct ocfs2_xattr_bucket *search;
 	u32 last_hash;
-	u64 blkno;
+	u64 blkno, lower_blkno = 0;
 
-	ret = ocfs2_read_block(inode, p_blkno, &bh);
+	search = ocfs2_xattr_bucket_new(inode);
+	if (!search) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(search, p_blkno);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	xh = (struct ocfs2_xattr_header *)bh->b_data;
+	xh = bucket_xh(search);
 	high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
-
 	while (low_bucket <= high_bucket) {
-		brelse(bh);
-		bh = NULL;
-		bucket = (low_bucket + high_bucket) / 2;
+		ocfs2_xattr_bucket_relse(search);
 
+		bucket = (low_bucket + high_bucket) / 2;
 		blkno = p_blkno + bucket * blk_per_bucket;
-
-		ret = ocfs2_read_block(inode, blkno, &bh);
+		ret = ocfs2_read_xattr_bucket(search, blkno);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		xh = (struct ocfs2_xattr_header *)bh->b_data;
+		xh = bucket_xh(search);
 		xe = &xh->xh_entries[0];
 		if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
 			high_bucket = bucket - 1;
@@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 
 		last_hash = le32_to_cpu(xe->xe_name_hash);
 
-		/* record lower_bh which may be the insert place. */
-		brelse(lower_bh);
-		lower_bh = bh;
-		bh = NULL;
+		/* record lower_blkno which may be the insert place. */
+		lower_blkno = blkno;
 
 		if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
 			low_bucket = bucket + 1;
@@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		}
 
 		/* the searched xattr should reside in this bucket if exists. */
-		ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+		ret = ocfs2_find_xe_in_bucket(inode, search,
 					      name_index, name, name_hash,
 					      &index, &found);
 		if (ret) {
@@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 	 * When the xattr's hash value is in the gap of 2 buckets, we will
 	 * always set it to the previous bucket.
 	 */
-	if (!lower_bh) {
-		/*
-		 * We can't find any bucket whose first name_hash is less
-		 * than the find name_hash.
-		 */
-		BUG_ON(bh->b_blocknr != p_blkno);
-		lower_bh = bh;
-		bh = NULL;
+	if (!lower_blkno)
+		lower_blkno = p_blkno;
+
+	/* This should be in cache - we just read it during the search */
+	ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
-	xs->bucket.bhs[0] = lower_bh;
-	xs->bucket.xh = (struct ocfs2_xattr_header *)
-					xs->bucket.bhs[0]->b_data;
-	lower_bh = NULL;
 
-	xs->header = xs->bucket.xh;
-	xs->base = xs->bucket.bhs[0]->b_data;
+	xs->header = bucket_xh(xs->bucket);
+	xs->base = bucket_block(xs->bucket, 0);
 	xs->end = xs->base + inode->i_sb->s_blocksize;
 
 	if (found) {
-		/*
-		 * If we have found the xattr enty, read all the blocks in
-		 * this bucket.
-		 */
-		ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
-					blk_per_bucket - 1, &xs->bucket.bhs[1],
-					0);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-
 		xs->here = &xs->header->xh_entries[index];
 		mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
-		     (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+		     (unsigned long long)bucket_blkno(xs->bucket), index);
 	} else
 		ret = -ENODATA;
 
 out:
-	brelse(bh);
-	brelse(lower_bh);
+	ocfs2_xattr_bucket_free(search);
 	return ret;
 }
 
@@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 				       xattr_bucket_func *func,
 				       void *para)
 {
-	int i, j, ret = 0;
-	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int i, ret = 0;
 	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
 	u32 num_buckets = clusters * bpc;
-	struct ocfs2_xattr_bucket bucket;
+	struct ocfs2_xattr_bucket *bucket;
 
-	memset(&bucket, 0, sizeof(bucket));
+	bucket = ocfs2_xattr_bucket_new(inode);
+	if (!bucket) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
 
 	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
 	     clusters, (unsigned long long)blkno);
 
-	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
-		ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
-					bucket.bhs, 0);
+	for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
+		ret = ocfs2_read_xattr_bucket(bucket, blkno);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			break;
 		}
 
-		bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
 		/*
 		 * The real bucket num in this series of blocks is stored
 		 * in the 1st bucket.
 		 */
 		if (i == 0)
-			num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+			num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
 
 		mlog(0, "iterating xattr bucket %llu, first hash %u\n",
 		     (unsigned long long)blkno,
-		     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
+		     le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
 		if (func) {
-			ret = func(inode, &bucket, para);
-			if (ret) {
+			ret = func(inode, bucket, para);
+			if (ret)
 				mlog_errno(ret);
-				break;
-			}
+			/* Fall through to bucket_relse() */
 		}
 
-		for (j = 0; j < blk_per_bucket; j++)
-			brelse(bucket.bhs[j]);
-		memset(&bucket, 0, sizeof(bucket));
+		ocfs2_xattr_bucket_relse(bucket);
+		if (ret)
+			break;
 	}
 
-out:
-	for (j = 0; j < blk_per_bucket; j++)
-		brelse(bucket.bhs[j]);
-
+	ocfs2_xattr_bucket_free(bucket);
 	return ret;
 }
 
@@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 	int i, block_off, new_offset;
 	const char *prefix, *name;
 
-	for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
-		struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+	for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
 		type = ocfs2_xattr_get_type(entry);
 		prefix = ocfs2_xattr_prefix(type);
 
 		if (prefix) {
 			ret = ocfs2_xattr_bucket_get_name_value(inode,
-								bucket->xh,
+								bucket_xh(bucket),
 								i,
 								&block_off,
 								&new_offset);
 			if (ret)
 				break;
 
-			name = (const char *)bucket->bhs[block_off]->b_data +
+			name = (const char *)bucket_block(bucket, block_off) +
 				new_offset;
 			ret = ocfs2_xattr_list_entry(xl->buffer,
 						     xl->buffer_size,
@@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size)
 /*
  * When the ocfs2_xattr_block is filled up, new bucket will be created
  * and all the xattr entries will be moved to the new bucket.
+ * The header goes at the start of the bucket, and the names+values are
+ * filled from the end.  This is why *target starts as the last buffer.
  * Note: we need to sort the entries since they are not saved in order
  * in the ocfs2_xattr_block.
  */
 static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
 					   struct buffer_head *xb_bh,
-					   struct buffer_head *xh_bh,
-					   struct buffer_head *data_bh)
+					   struct ocfs2_xattr_bucket *bucket)
 {
 	int i, blocksize = inode->i_sb->s_blocksize;
+	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	u16 offset, size, off_change;
 	struct ocfs2_xattr_entry *xe;
 	struct ocfs2_xattr_block *xb =
 				(struct ocfs2_xattr_block *)xb_bh->b_data;
 	struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
-	struct ocfs2_xattr_header *xh =
-				(struct ocfs2_xattr_header *)xh_bh->b_data;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	u16 count = le16_to_cpu(xb_xh->xh_count);
-	char *target = xh_bh->b_data, *src = xb_bh->b_data;
+	char *src = xb_bh->b_data;
+	char *target = bucket_block(bucket, blks - 1);
 
 	mlog(0, "cp xattr from block %llu to bucket %llu\n",
 	     (unsigned long long)xb_bh->b_blocknr,
-	     (unsigned long long)xh_bh->b_blocknr);
+	     (unsigned long long)bucket_blkno(bucket));
+
+	for (i = 0; i < blks; i++)
+		memset(bucket_block(bucket, i), 0, blocksize);
 
-	memset(xh_bh->b_data, 0, blocksize);
-	if (data_bh)
-		memset(data_bh->b_data, 0, blocksize);
 	/*
 	 * Since the xe_name_offset is based on ocfs2_xattr_header,
 	 * there is a offset change corresponding to the change of
@@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
 	size = blocksize - offset;
 
 	/* copy all the names and values. */
-	if (data_bh)
-		target = data_bh->b_data;
 	memcpy(target + offset, src + offset, size);
 
 	/* Init new header now. */
@@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
 	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
 
 	/* copy all the entries. */
-	target = xh_bh->b_data;
+	target = bucket_block(bucket, 0);
 	offset = offsetof(struct ocfs2_xattr_header, xh_entries);
 	size = count * sizeof(struct ocfs2_xattr_entry);
 	memcpy(target + offset, (char *)xb_xh + offset, size);
@@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
  * While if the entry is in index b-tree, "bucket" indicates the
  * real place of the xattr.
  */
-static int ocfs2_xattr_update_xattr_search(struct inode *inode,
-					   struct ocfs2_xattr_search *xs,
-					   struct buffer_head *old_bh,
-					   struct buffer_head *new_bh)
+static void ocfs2_xattr_update_xattr_search(struct inode *inode,
+					    struct ocfs2_xattr_search *xs,
+					    struct buffer_head *old_bh)
 {
-	int ret = 0;
 	char *buf = old_bh->b_data;
 	struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
 	struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
-	int i, blocksize = inode->i_sb->s_blocksize;
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
-	xs->bucket.bhs[0] = new_bh;
-	get_bh(new_bh);
-	xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
-	xs->header = xs->bucket.xh;
+	int i;
 
-	xs->base = new_bh->b_data;
+	xs->header = bucket_xh(xs->bucket);
+	xs->base = bucket_block(xs->bucket, 0);
 	xs->end = xs->base + inode->i_sb->s_blocksize;
 
-	if (!xs->not_found) {
-		if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
-			ret = ocfs2_read_blocks(inode,
-					xs->bucket.bhs[0]->b_blocknr + 1,
-					blk_per_bucket - 1, &xs->bucket.bhs[1],
-					0);
-			if (ret) {
-				mlog_errno(ret);
-				return ret;
-			}
-
-		}
-		i = xs->here - old_xh->xh_entries;
-		xs->here = &xs->header->xh_entries[i];
-	}
+	if (xs->not_found)
+		return;
 
-	return ret;
+	i = xs->here - old_xh->xh_entries;
+	xs->here = &xs->header->xh_entries[i];
 }
 
 static int ocfs2_xattr_create_index_block(struct inode *inode,
-					  struct ocfs2_xattr_search *xs)
+					  struct ocfs2_xattr_search *xs,
+					  struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	int ret, credits = OCFS2_SUBALLOC_ALLOC;
+	int ret;
 	u32 bit_off, len;
 	u64 blkno;
-	handle_t *handle;
+	handle_t *handle = ctxt->handle;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_alloc_context *data_ac;
-	struct buffer_head *xh_bh = NULL, *data_bh = NULL;
 	struct buffer_head *xb_bh = xs->xattr_bh;
 	struct ocfs2_xattr_block *xb =
 			(struct ocfs2_xattr_block *)xb_bh->b_data;
 	struct ocfs2_xattr_tree_root *xr;
 	u16 xb_flags = le16_to_cpu(xb->xb_flags);
-	u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
 	mlog(0, "create xattr index block for %llu\n",
 	     (unsigned long long)xb_bh->b_blocknr);
 
 	BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
-
-	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	BUG_ON(!xs->bucket);
 
 	/*
 	 * XXX:
@@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	 */
 	down_write(&oi->ip_alloc_sem);
 
-	/*
-	 * 3 more credits, one for xattr block update, one for the 1st block
-	 * of the new xattr bucket and one for the value/data.
-	 */
-	credits += 3;
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out_sem;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, xb_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
-	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+				     1, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	/*
@@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	mlog(0, "allocate 1 cluster from %llu to xattr block\n",
 	     (unsigned long long)blkno);
 
-	xh_bh = sb_getblk(inode->i_sb, blkno);
-	if (!xh_bh) {
-		ret = -EIO;
+	ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
+	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
-	ocfs2_set_new_buffer_uptodate(inode, xh_bh);
-
-	ret = ocfs2_journal_access(handle, inode, xh_bh,
-				   OCFS2_JOURNAL_ACCESS_CREATE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+						OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	if (bpb > 1) {
-		data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
-		if (!data_bh) {
-			ret = -EIO;
-			mlog_errno(ret);
-			goto out_commit;
-		}
-
-		ocfs2_set_new_buffer_uptodate(inode, data_bh);
-
-		ret = ocfs2_journal_access(handle, inode, data_bh,
-					   OCFS2_JOURNAL_ACCESS_CREATE);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_commit;
-		}
+		goto out;
 	}
 
-	ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+	ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
+	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
 
-	ocfs2_journal_dirty(handle, xh_bh);
-	if (data_bh)
-		ocfs2_journal_dirty(handle, data_bh);
-
-	ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
 
 	/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
 	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 
 	xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
 
-	ret = ocfs2_journal_dirty(handle, xb_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-out_commit:
-	ocfs2_commit_trans(osb, handle);
-
-out_sem:
-	up_write(&oi->ip_alloc_sem);
+	ocfs2_journal_dirty(handle, xb_bh);
 
 out:
-	if (data_ac)
-		ocfs2_free_alloc_context(data_ac);
-
-	brelse(xh_bh);
-	brelse(data_bh);
+	up_write(&oi->ip_alloc_sem);
 
 	return ret;
 }
@@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b)
  * so that we can spare some space for insertion.
  */
 static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+				     handle_t *handle,
 				     struct ocfs2_xattr_bucket *bucket)
 {
 	int ret, i;
 	size_t end, offset, len, value_len;
 	struct ocfs2_xattr_header *xh;
 	char *entries, *buf, *bucket_buf = NULL;
-	u64 blkno = bucket->bhs[0]->b_blocknr;
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u64 blkno = bucket_blkno(bucket);
 	u16 xh_free_start;
 	size_t blocksize = inode->i_sb->s_blocksize;
-	handle_t *handle;
-	struct buffer_head **bhs;
 	struct ocfs2_xattr_entry *xe;
 
-	bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
-			GFP_NOFS);
-	if (!bhs)
-		return -ENOMEM;
-
-	ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
-	if (ret)
-		goto out;
-
 	/*
 	 * In order to make the operation more efficient and generic,
 	 * we copy all the blocks into a contiguous memory and do the
@@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	}
 
 	buf = bucket_buf;
-	for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
-		memcpy(buf, bhs[i]->b_data, blocksize);
+	for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+		memcpy(buf, bucket_block(bucket, i), blocksize);
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
+	ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, bhs[i],
-					   OCFS2_JOURNAL_ACCESS_WRITE);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto commit;
-		}
-	}
-
 	xh = (struct ocfs2_xattr_header *)bucket_buf;
 	entries = (char *)xh->xh_entries;
 	xh_free_start = le16_to_cpu(xh->xh_free_start);
@@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 			"bucket %llu\n", (unsigned long long)blkno);
 
 	if (xh_free_start == end)
-		goto commit;
+		goto out;
 
 	memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
 	xh->xh_free_start = cpu_to_le16(end);
@@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	     cmp_xe, swap_xe);
 
 	buf = bucket_buf;
-	for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
-		memcpy(bhs[i]->b_data, buf, blocksize);
-		ocfs2_journal_dirty(handle, bhs[i]);
-	}
+	for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+		memcpy(bucket_block(bucket, i), buf, blocksize);
+	ocfs2_xattr_bucket_journal_dirty(handle, bucket);
 
-commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
-
-	if (bhs) {
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(bhs[i]);
-	}
-	kfree(bhs);
-
 	kfree(bucket_buf);
 	return ret;
 }
 
 /*
- * Move half nums of the xattr bucket in the previous cluster to this new
- * cluster. We only touch the last cluster of the previous extend record.
+ * prev_blkno points to the start of an existing extent.  new_blkno
+ * points to a newly allocated extent.  Because we know each of our
+ * clusters contains more than bucket, we can easily split one cluster
+ * at a bucket boundary.  So we take the last cluster of the existing
+ * extent and split it down the middle.  We move the last half of the
+ * buckets in the last cluster of the existing extent over to the new
+ * extent.
+ *
+ * first_bh is the buffer at prev_blkno so we can update the existing
+ * extent's bucket count.  header_bh is the bucket were we were hoping
+ * to insert our xattr.  If the bucket move places the target in the new
+ * extent, we'll update first_bh and header_bh after modifying the old
+ * extent.
  *
- * first_bh is the first buffer_head of a series of bucket in the same
- * extent rec and header_bh is the header of one bucket in this cluster.
- * They will be updated if we move the data header_bh contains to the new
- * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
+ * first_hash will be set as the 1st xe's name_hash in the new extent.
  */
 static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 					       handle_t *handle,
-					       struct buffer_head **first_bh,
-					       struct buffer_head **header_bh,
+					       struct ocfs2_xattr_bucket *first,
+					       struct ocfs2_xattr_bucket *target,
 					       u64 new_blkno,
-					       u64 prev_blkno,
 					       u32 num_clusters,
 					       u32 *first_hash)
 {
-	int i, ret, credits;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
-	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
-	int blocksize = inode->i_sb->s_blocksize;
-	struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
-	struct ocfs2_xattr_header *new_xh;
-	struct ocfs2_xattr_header *xh =
-			(struct ocfs2_xattr_header *)((*first_bh)->b_data);
-
-	BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
-	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
-
-	prev_bh = *first_bh;
-	get_bh(prev_bh);
-	xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
+	int ret;
+	struct super_block *sb = inode->i_sb;
+	int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
+	int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
+	int to_move = num_buckets / 2;
+	u64 src_blkno;
+	u64 last_cluster_blkno = bucket_blkno(first) +
+		((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
 
-	prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+	BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
 
 	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
-	     (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
+	     (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
 
-	/*
-	 * We need to update the 1st half of the new cluster and
-	 * 1 more for the update of the 1st bucket of the previous
-	 * extent record.
-	 */
-	credits = bpc / 2 + 1;
-	ret = ocfs2_extend_trans(handle, credits);
+	ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
+				     last_cluster_blkno, new_blkno,
+				     to_move, first_hash);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, prev_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	/* This is the first bucket that got moved */
+	src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
 
-	for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
-		old_bh = new_bh = NULL;
-		new_bh = sb_getblk(inode->i_sb, new_blkno);
-		if (!new_bh) {
-			ret = -EIO;
-			mlog_errno(ret);
-			goto out;
-		}
+	/*
+	 * If the target bucket was part of the moved buckets, we need to
+	 * update first and target.
+	 */
+	if (bucket_blkno(target) >= src_blkno) {
+		/* Find the block for the new target bucket */
+		src_blkno = new_blkno +
+			(bucket_blkno(target) - src_blkno);
 
-		ocfs2_set_new_buffer_uptodate(inode, new_bh);
+		ocfs2_xattr_bucket_relse(first);
+		ocfs2_xattr_bucket_relse(target);
 
-		ret = ocfs2_journal_access(handle, inode, new_bh,
-					   OCFS2_JOURNAL_ACCESS_CREATE);
-		if (ret < 0) {
+		/*
+		 * These shouldn't fail - the buffers are in the
+		 * journal from ocfs2_cp_xattr_bucket().
+		 */
+		ret = ocfs2_read_xattr_bucket(first, new_blkno);
+		if (ret) {
 			mlog_errno(ret);
-			brelse(new_bh);
 			goto out;
 		}
-
-		ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
-		if (ret < 0) {
+		ret = ocfs2_read_xattr_bucket(target, src_blkno);
+		if (ret)
 			mlog_errno(ret);
-			brelse(new_bh);
-			goto out;
-		}
 
-		memcpy(new_bh->b_data, old_bh->b_data, blocksize);
-
-		if (i == 0) {
-			new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
-			new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
-
-			if (first_hash)
-				*first_hash = le32_to_cpu(
-					new_xh->xh_entries[0].xe_name_hash);
-			new_first_bh = new_bh;
-			get_bh(new_first_bh);
-		}
-
-		ocfs2_journal_dirty(handle, new_bh);
-
-		if (*header_bh == old_bh) {
-			brelse(*header_bh);
-			*header_bh = new_bh;
-			get_bh(*header_bh);
-
-			brelse(*first_bh);
-			*first_bh = new_first_bh;
-			get_bh(*first_bh);
-		}
-		brelse(new_bh);
-		brelse(old_bh);
 	}
 
-	le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
-
-	ocfs2_journal_dirty(handle, prev_bh);
 out:
-	brelse(prev_bh);
-	brelse(new_first_bh);
-	return ret;
-}
-
-static int ocfs2_read_xattr_bucket(struct inode *inode,
-				   u64 blkno,
-				   struct buffer_head **bhs,
-				   int new)
-{
-	int ret = 0;
-	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
-	if (!new)
-		return ocfs2_read_blocks(inode, blkno,
-					 blk_per_bucket, bhs, 0);
-
-	for (i = 0; i < blk_per_bucket; i++) {
-		bhs[i] = sb_getblk(inode->i_sb, blkno + i);
-		if (bhs[i] == NULL) {
-			ret = -EIO;
-			mlog_errno(ret);
-			break;
-		}
-		ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
-	}
-
 	return ret;
 }
 
@@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 {
 	int ret, i;
 	int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	struct buffer_head **s_bhs, **t_bhs = NULL;
+	struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
 	int blocksize = inode->i_sb->s_blocksize;
@@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	mlog(0, "move some of xattrs from bucket %llu to %llu\n",
 	     (unsigned long long)blk, (unsigned long long)new_blk);
 
-	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
-	if (!s_bhs)
-		return -ENOMEM;
-
-	ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
-	if (ret) {
+	s_bucket = ocfs2_xattr_bucket_new(inode);
+	t_bucket = ocfs2_xattr_bucket_new(inode);
+	if (!s_bucket || !t_bucket) {
+		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, s_bhs[0],
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_read_xattr_bucket(s_bucket, blk);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
-	if (!t_bhs) {
-		ret = -ENOMEM;
+	ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+	/*
+	 * Even if !new_bucket_head, we're overwriting t_bucket.  Thus,
+	 * there's no need to read it.
+	 */
+	ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
-					   new_bucket_head ?
-					   OCFS2_JOURNAL_ACCESS_CREATE :
-					   OCFS2_JOURNAL_ACCESS_WRITE);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
+	/*
+	 * Hey, if we're overwriting t_bucket, what difference does
+	 * ACCESS_CREATE vs ACCESS_WRITE make?  See the comment in the
+	 * same part of ocfs2_cp_xattr_bucket().
+	 */
+	ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+						new_bucket_head ?
+						OCFS2_JOURNAL_ACCESS_CREATE :
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
 
-	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	xh = bucket_xh(s_bucket);
 	count = le16_to_cpu(xh->xh_count);
 	start = ocfs2_xattr_find_divide_pos(xh);
 
@@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 		 * The hash value is set as one larger than
 		 * that of the last entry in the previous bucket.
 		 */
-		for (i = 0; i < blk_per_bucket; i++)
-			memset(t_bhs[i]->b_data, 0, blocksize);
+		for (i = 0; i < t_bucket->bu_blocks; i++)
+			memset(bucket_block(t_bucket, i), 0, blocksize);
 
-		xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+		xh = bucket_xh(t_bucket);
 		xh->xh_free_start = cpu_to_le16(blocksize);
 		xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
 		le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	}
 
 	/* copy the whole bucket to the new first. */
-	for (i = 0; i < blk_per_bucket; i++)
-		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+	ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
 
 	/* update the new bucket. */
-	xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+	xh = bucket_xh(t_bucket);
 
 	/*
 	 * Calculate the total name/value len and xh_free_start for
@@ -3319,11 +3902,7 @@ set_num_buckets:
 	else
 		xh->xh_num_buckets = 0;
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		ocfs2_journal_dirty(handle, t_bhs[i]);
-		if (ret)
-			mlog_errno(ret);
-	}
+	ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
 
 	/* store the first_hash of the new bucket. */
 	if (first_hash)
@@ -3337,29 +3916,18 @@ set_num_buckets:
 	if (start == count)
 		goto out;
 
-	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	xh = bucket_xh(s_bucket);
 	memset(&xh->xh_entries[start], 0,
 	       sizeof(struct ocfs2_xattr_entry) * (count - start));
 	xh->xh_count = cpu_to_le16(start);
 	xh->xh_free_start = cpu_to_le16(name_offset);
 	xh->xh_name_value_len = cpu_to_le16(name_value_len);
 
-	ocfs2_journal_dirty(handle, s_bhs[0]);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
 
 out:
-	if (s_bhs) {
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(s_bhs[i]);
-	}
-	kfree(s_bhs);
-
-	if (t_bhs) {
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(t_bhs[i]);
-	}
-	kfree(t_bhs);
+	ocfs2_xattr_bucket_free(s_bucket);
+	ocfs2_xattr_bucket_free(t_bucket);
 
 	return ret;
 }
@@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 				 u64 t_blkno,
 				 int t_is_new)
 {
-	int ret, i;
-	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int blocksize = inode->i_sb->s_blocksize;
-	struct buffer_head **s_bhs, **t_bhs = NULL;
+	int ret;
+	struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
 
 	BUG_ON(s_blkno == t_blkno);
 
@@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	     (unsigned long long)s_blkno, (unsigned long long)t_blkno,
 	     t_is_new);
 
-	s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
-			GFP_NOFS);
-	if (!s_bhs)
-		return -ENOMEM;
+	s_bucket = ocfs2_xattr_bucket_new(inode);
+	t_bucket = ocfs2_xattr_bucket_new(inode);
+	if (!s_bucket || !t_bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
 
-	ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+	ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
 	if (ret)
 		goto out;
 
-	t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
-			GFP_NOFS);
-	if (!t_bhs) {
-		ret = -ENOMEM;
+	/*
+	 * Even if !t_is_new, we're overwriting t_bucket.  Thus,
+	 * there's no need to read it.
+	 */
+	ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
+	if (ret)
 		goto out;
-	}
 
-	ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+	/*
+	 * Hey, if we're overwriting t_bucket, what difference does
+	 * ACCESS_CREATE vs ACCESS_WRITE make?  Well, if we allocated a new
+	 * cluster to fill, we came here from
+	 * ocfs2_mv_xattr_buckets(), and it is really new -
+	 * ACCESS_CREATE is required.  But we also might have moved data
+	 * out of t_bucket before extending back into it.
+	 * ocfs2_add_new_xattr_bucket() can do this - its call to
+	 * ocfs2_add_new_xattr_cluster() may have created a new extent
+	 * and copied out the end of the old extent.  Then it re-extends
+	 * the old extent back to create space for new xattrs.  That's
+	 * how we get here, and the bucket isn't really new.
+	 */
+	ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+						t_is_new ?
+						OCFS2_JOURNAL_ACCESS_CREATE :
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret)
 		goto out;
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
-					   t_is_new ?
-					   OCFS2_JOURNAL_ACCESS_CREATE :
-					   OCFS2_JOURNAL_ACCESS_WRITE);
-		if (ret)
-			goto out;
-	}
-
-	for (i = 0; i < blk_per_bucket; i++) {
-		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
-		ocfs2_journal_dirty(handle, t_bhs[i]);
-	}
+	ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
+	ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
 
 out:
-	if (s_bhs) {
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(s_bhs[i]);
-	}
-	kfree(s_bhs);
-
-	if (t_bhs) {
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(t_bhs[i]);
-	}
-	kfree(t_bhs);
+	ocfs2_xattr_bucket_free(t_bucket);
+	ocfs2_xattr_bucket_free(s_bucket);
 
 	return ret;
 }
 
 /*
- * Copy one xattr cluster from src_blk to to_blk.
- * The to_blk will become the first bucket header of the cluster, so its
- * xh_num_buckets will be initialized as the bucket num in the cluster.
+ * src_blk points to the start of an existing extent.  last_blk points to
+ * last cluster in that extent.  to_blk points to a newly allocated
+ * extent.  We copy the buckets from the cluster at last_blk to the new
+ * extent.  If start_bucket is non-zero, we skip that many buckets before
+ * we start copying.  The new extent's xh_num_buckets gets set to the
+ * number of buckets we copied.  The old extent's xh_num_buckets shrinks
+ * by the same amount.
  */
-static int ocfs2_cp_xattr_cluster(struct inode *inode,
-				  handle_t *handle,
-				  struct buffer_head *first_bh,
-				  u64 src_blk,
-				  u64 to_blk,
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+				  u64 src_blk, u64 last_blk, u64 to_blk,
+				  unsigned int start_bucket,
 				  u32 *first_hash)
 {
 	int i, ret, credits;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
-	struct buffer_head *bh = NULL;
-	struct ocfs2_xattr_header *xh;
-	u64 to_blk_start = to_blk;
+	struct ocfs2_xattr_bucket *old_first, *new_first;
+
+	mlog(0, "mv xattrs from cluster %llu to %llu\n",
+	     (unsigned long long)last_blk, (unsigned long long)to_blk);
+
+	BUG_ON(start_bucket >= num_buckets);
+	if (start_bucket) {
+		num_buckets -= start_bucket;
+		last_blk += (start_bucket * blks_per_bucket);
+	}
+
+	/* The first bucket of the original extent */
+	old_first = ocfs2_xattr_bucket_new(inode);
+	/* The first bucket of the new extent */
+	new_first = ocfs2_xattr_bucket_new(inode);
+	if (!old_first || !new_first) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
 
-	mlog(0, "cp xattrs from cluster %llu to %llu\n",
-	     (unsigned long long)src_blk, (unsigned long long)to_blk);
+	ret = ocfs2_read_xattr_bucket(old_first, src_blk);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
 	/*
-	 * We need to update the new cluster and 1 more for the update of
-	 * the 1st bucket of the previous extent rec.
+	 * We need to update the first bucket of the old extent and all
+	 * the buckets going to the new extent.
 	 */
-	credits = bpc + 1;
+	credits = ((num_buckets + 1) * blks_per_bucket) +
+		handle->h_buffer_credits;
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, first_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 
 	for (i = 0; i < num_buckets; i++) {
 		ret = ocfs2_cp_xattr_bucket(inode, handle,
-					    src_blk, to_blk, 1);
+					    last_blk + (i * blks_per_bucket),
+					    to_blk + (i * blks_per_bucket),
+					    1);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
-
-		src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-		to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	}
 
-	/* update the old bucket header. */
-	xh = (struct ocfs2_xattr_header *)first_bh->b_data;
-	le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
-
-	ocfs2_journal_dirty(handle, first_bh);
-
-	/* update the new bucket header. */
-	ret = ocfs2_read_block(inode, to_blk_start, &bh);
-	if (ret < 0) {
+	/*
+	 * Get the new bucket ready before we dirty anything
+	 * (This actually shouldn't fail, because we already dirtied
+	 * it once in ocfs2_cp_xattr_bucket()).
+	 */
+	ret = ocfs2_read_xattr_bucket(new_first, to_blk);
+	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
-
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	xh = (struct ocfs2_xattr_header *)bh->b_data;
-	xh->xh_num_buckets = cpu_to_le16(num_buckets);
+	/* Now update the headers */
+	le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
+	ocfs2_xattr_bucket_journal_dirty(handle, old_first);
 
-	ocfs2_journal_dirty(handle, bh);
+	bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
+	ocfs2_xattr_bucket_journal_dirty(handle, new_first);
 
 	if (first_hash)
-		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+		*first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
+
 out:
-	brelse(bh);
+	ocfs2_xattr_bucket_free(new_first);
+	ocfs2_xattr_bucket_free(old_first);
 	return ret;
 }
 
@@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
 				      u32 *first_hash)
 {
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int ret, credits = 2 * blk_per_bucket;
+	int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
 
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
 
@@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
  */
 static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 					    handle_t *handle,
-					    struct buffer_head **first_bh,
-					    struct buffer_head **header_bh,
+					    struct ocfs2_xattr_bucket *first,
+					    struct ocfs2_xattr_bucket *target,
 					    u64 new_blk,
-					    u64 prev_blk,
 					    u32 prev_clusters,
 					    u32 *v_start,
 					    int *extend)
 {
-	int ret = 0;
-	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int ret;
 
 	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
-	     (unsigned long long)prev_blk, prev_clusters,
+	     (unsigned long long)bucket_blkno(first), prev_clusters,
 	     (unsigned long long)new_blk);
 
-	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
 		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
 							  handle,
-							  first_bh,
-							  header_bh,
+							  first, target,
 							  new_blk,
-							  prev_blk,
 							  prev_clusters,
 							  v_start);
-	else {
-		u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
-
-		if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
-			ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
-						     last_blk, new_blk,
+		if (ret)
+			mlog_errno(ret);
+	} else {
+		/* The start of the last cluster in the first extent */
+		u64 last_blk = bucket_blkno(first) +
+			((prev_clusters - 1) *
+			 ocfs2_clusters_to_blocks(inode->i_sb, 1));
+
+		if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
+			ret = ocfs2_mv_xattr_buckets(inode, handle,
+						     bucket_blkno(first),
+						     last_blk, new_blk, 0,
 						     v_start);
-		else {
+			if (ret)
+				mlog_errno(ret);
+		} else {
 			ret = ocfs2_divide_xattr_cluster(inode, handle,
 							 last_blk, new_blk,
 							 v_start);
+			if (ret)
+				mlog_errno(ret);
 
-			if ((*header_bh)->b_blocknr == last_blk && extend)
+			if ((bucket_blkno(target) == last_blk) && extend)
 				*extend = 0;
 		}
 	}
@@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
  */
 static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 				       struct buffer_head *root_bh,
-				       struct buffer_head **first_bh,
-				       struct buffer_head **header_bh,
+				       struct ocfs2_xattr_bucket *first,
+				       struct ocfs2_xattr_bucket *target,
 				       u32 *num_clusters,
 				       u32 prev_cpos,
-				       u64 prev_blkno,
-				       int *extend)
+				       int *extend,
+				       struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	int ret, credits;
+	int ret;
 	u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 	u32 prev_clusters = *num_clusters;
 	u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
 	u64 block;
-	handle_t *handle = NULL;
-	struct ocfs2_alloc_context *data_ac = NULL;
-	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle = ctxt->handle;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
 
 	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
 	     "previous xattr blkno = %llu\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     prev_cpos, (unsigned long long)prev_blkno);
+	     prev_cpos, (unsigned long long)bucket_blkno(first));
 
 	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
-	ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
-				    &data_ac, &meta_ac);
-	if (ret) {
-		mlog_errno(ret);
-		goto leave;
-	}
-
-	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
-					    clusters_to_add);
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
-		mlog_errno(ret);
-		goto leave;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto leave;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
 				     clusters_to_add, &bit_off, &num_bits);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
@@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	if (prev_blkno + prev_clusters * bpc == block &&
+	if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
 	    (prev_clusters + num_bits) << osb->s_clustersize_bits <=
 	     OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
 		/*
@@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	} else {
 		ret = ocfs2_adjust_xattr_cross_cluster(inode,
 						       handle,
-						       first_bh,
-						       header_bh,
+						       first,
+						       target,
 						       block,
-						       prev_blkno,
 						       prev_clusters,
 						       &v_start,
 						       extend);
@@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		}
 	}
 
-	if (handle->h_buffer_credits < credits) {
-		/*
-		 * The journal has been restarted before, and don't
-		 * have enough space for the insertion, so extend it
-		 * here.
-		 */
-		ret = ocfs2_extend_trans(handle, credits);
-		if (ret) {
-			mlog_errno(ret);
-			goto leave;
-		}
-	}
 	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
 	     num_bits, (unsigned long long)block, v_start);
 	ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
-				  num_bits, 0, meta_ac);
+				  num_bits, 0, ctxt->meta_ac);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto leave;
 	}
 
 	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret < 0) {
+	if (ret < 0)
 		mlog_errno(ret);
-		goto leave;
-	}
 
 leave:
-	if (handle)
-		ocfs2_commit_trans(osb, handle);
-	if (data_ac)
-		ocfs2_free_alloc_context(data_ac);
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
-
 	return ret;
 }
 
 /*
- * Extend a new xattr bucket and move xattrs to the end one by one until
- * We meet with start_bh. Only move half of the xattrs to the bucket after it.
+ * We are given an extent.  'first' is the bucket at the very front of
+ * the extent.  The extent has space for an additional bucket past
+ * bucket_xh(first)->xh_num_buckets.  'target_blkno' is the block number
+ * of the target bucket.  We wish to shift every bucket past the target
+ * down one, filling in that additional space.  When we get back to the
+ * target, we split the target between itself and the now-empty bucket
+ * at target+1 (aka, target_blkno + blks_per_bucket).
  */
 static int ocfs2_extend_xattr_bucket(struct inode *inode,
-				     struct buffer_head *first_bh,
-				     struct buffer_head *start_bh,
+				     handle_t *handle,
+				     struct ocfs2_xattr_bucket *first,
+				     u64 target_blk,
 				     u32 num_clusters)
 {
 	int ret, credits;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	u64 start_blk = start_bh->b_blocknr, end_blk;
-	u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
-	handle_t *handle;
-	struct ocfs2_xattr_header *first_xh =
-				(struct ocfs2_xattr_header *)first_bh->b_data;
-	u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
+	u64 end_blk;
+	u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
 
 	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
-	     "from %llu, len = %u\n", (unsigned long long)start_blk,
-	     (unsigned long long)first_bh->b_blocknr, num_clusters);
+	     "from %llu, len = %u\n", (unsigned long long)target_blk,
+	     (unsigned long long)bucket_blkno(first), num_clusters);
 
-	BUG_ON(bucket >= num_buckets);
+	/* The extent must have room for an additional bucket */
+	BUG_ON(new_bucket >=
+	       (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
 
-	end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+	/* end_blk points to the last existing bucket */
+	end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
 
 	/*
-	 * We will touch all the buckets after the start_bh(include it).
-	 * Add one more bucket and modify the first_bh.
+	 * end_blk is the start of the last existing bucket.
+	 * Thus, (end_blk - target_blk) covers the target bucket and
+	 * every bucket after it up to, but not including, the last
+	 * existing bucket.  Then we add the last existing bucket, the
+	 * new bucket, and the first bucket (3 * blk_per_bucket).
 	 */
-	credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
+	credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
+		  handle->h_buffer_credits;
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, first_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto commit;
+		goto out;
 	}
 
-	while (end_blk != start_blk) {
+	while (end_blk != target_blk) {
 		ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
 					    end_blk + blk_per_bucket, 0);
 		if (ret)
-			goto commit;
+			goto out;
 		end_blk -= blk_per_bucket;
 	}
 
-	/* Move half of the xattr in start_blk to the next bucket. */
-	ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk,
-					start_blk + blk_per_bucket, NULL, 0);
+	/* Move half of the xattr in target_blkno to the next bucket. */
+	ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
+					target_blk + blk_per_bucket, NULL, 0);
 
-	le16_add_cpu(&first_xh->xh_num_buckets, 1);
-	ocfs2_journal_dirty(handle, first_bh);
+	le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
+	ocfs2_xattr_bucket_journal_dirty(handle, first);
 
-commit:
-	ocfs2_commit_trans(osb, handle);
 out:
 	return ret;
 }
 
 /*
- * Add new xattr bucket in an extent record and adjust the buckets accordingly.
- * xb_bh is the ocfs2_xattr_block.
- * We will move all the buckets starting from header_bh to the next place. As
- * for this one, half num of its xattrs will be moved to the next one.
+ * Add new xattr bucket in an extent record and adjust the buckets
+ * accordingly.  xb_bh is the ocfs2_xattr_block, and target is the
+ * bucket we want to insert into.
+ *
+ * In the easy case, we will move all the buckets after target down by
+ * one. Half of target's xattrs will be moved to the next bucket.
  *
- * We will allocate a new cluster if current cluster is full and adjust
- * header_bh and first_bh if the insert place is moved to the new cluster.
+ * If current cluster is full, we'll allocate a new one.  This may not
+ * be contiguous.  The underlying calls will make sure that there is
+ * space for the insert, shifting buckets around if necessary.
+ * 'target' may be moved by those calls.
  */
 static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 				      struct buffer_head *xb_bh,
-				      struct buffer_head *header_bh)
+				      struct ocfs2_xattr_bucket *target,
+				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	struct ocfs2_xattr_header *first_xh = NULL;
-	struct buffer_head *first_bh = NULL;
 	struct ocfs2_xattr_block *xb =
 			(struct ocfs2_xattr_block *)xb_bh->b_data;
 	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
 	struct ocfs2_extent_list *el = &xb_root->xt_list;
-	struct ocfs2_xattr_header *xh =
-			(struct ocfs2_xattr_header *)header_bh->b_data;
-	u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
-	struct super_block *sb = inode->i_sb;
-	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u32 name_hash =
+		le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int ret, num_buckets, extend = 1;
 	u64 p_blkno;
 	u32 e_cpos, num_clusters;
+	/* The bucket at the front of the extent */
+	struct ocfs2_xattr_bucket *first;
 
-	mlog(0, "Add new xattr bucket starting form %llu\n",
-	     (unsigned long long)header_bh->b_blocknr);
+	mlog(0, "Add new xattr bucket starting from %llu\n",
+	     (unsigned long long)bucket_blkno(target));
 
-	/*
-	 * Add refrence for header_bh here because it may be
-	 * changed in ocfs2_add_new_xattr_cluster and we need
-	 * to free it in the end.
-	 */
-	get_bh(header_bh);
+	/* The first bucket of the original extent */
+	first = ocfs2_xattr_bucket_new(inode);
+	if (!first) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
 
 	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
 				  &num_clusters, el);
@@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_read_block(inode, p_blkno, &first_bh);
+	ret = ocfs2_read_xattr_bucket(first, p_blkno);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
 	num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
-	first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
-
-	if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+	if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
+		/*
+		 * This can move first+target if the target bucket moves
+		 * to the new extent.
+		 */
 		ret = ocfs2_add_new_xattr_cluster(inode,
 						  xb_bh,
-						  &first_bh,
-						  &header_bh,
+						  first,
+						  target,
 						  &num_clusters,
 						  e_cpos,
-						  p_blkno,
-						  &extend);
+						  &extend,
+						  ctxt);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 
-	if (extend)
+	if (extend) {
 		ret = ocfs2_extend_xattr_bucket(inode,
-						first_bh,
-						header_bh,
+						ctxt->handle,
+						first,
+						bucket_blkno(target),
 						num_clusters);
-	if (ret)
-		mlog_errno(ret);
+		if (ret)
+			mlog_errno(ret);
+	}
+
 out:
-	brelse(first_bh);
-	brelse(header_bh);
+	ocfs2_xattr_bucket_free(first);
+
 	return ret;
 }
 
@@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
 	int block_off = offs >> inode->i_sb->s_blocksize_bits;
 
 	offs = offs % inode->i_sb->s_blocksize;
-	return bucket->bhs[block_off]->b_data + offs;
+	return bucket_block(bucket, block_off) + offs;
 }
 
 /*
@@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 				xe->xe_value_size = 0;
 
 			val = ocfs2_xattr_bucket_get_val(inode,
-							 &xs->bucket, offs);
+							 xs->bucket, offs);
 			memset(val + OCFS2_XATTR_SIZE(name_len), 0,
 			       size - OCFS2_XATTR_SIZE(name_len));
 			if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
@@ -4062,8 +4630,7 @@ set_new_name_value:
 		xh->xh_free_start = cpu_to_le16(offs);
 	}
 
-	val = ocfs2_xattr_bucket_get_val(inode,
-					 &xs->bucket, offs - size);
+	val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
 	xe->xe_name_offset = cpu_to_le16(offs - size);
 
 	memset(val, 0, size);
@@ -4079,125 +4646,45 @@ set_new_name_value:
 	return;
 }
 
-static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
-					     handle_t *handle,
-					     struct ocfs2_xattr_search *xs,
-					     struct buffer_head **bhs,
-					     u16 bh_num)
-{
-	int ret = 0, off, block_off;
-	struct ocfs2_xattr_entry *xe = xs->here;
-
-	/*
-	 * First calculate all the blocks we should journal_access
-	 * and journal_dirty. The first block should always be touched.
-	 */
-	ret = ocfs2_journal_dirty(handle, bhs[0]);
-	if (ret)
-		mlog_errno(ret);
-
-	/* calc the data. */
-	off = le16_to_cpu(xe->xe_name_offset);
-	block_off = off >> inode->i_sb->s_blocksize_bits;
-	ret = ocfs2_journal_dirty(handle, bhs[block_off]);
-	if (ret)
-		mlog_errno(ret);
-
-	return ret;
-}
-
 /*
  * Set the xattr entry in the specified bucket.
  * The bucket is indicated by xs->bucket and it should have the enough
  * space for the xattr insertion.
  */
 static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+					   handle_t *handle,
 					   struct ocfs2_xattr_info *xi,
 					   struct ocfs2_xattr_search *xs,
 					   u32 name_hash,
 					   int local)
 {
-	int i, ret;
-	handle_t *handle = NULL;
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int ret;
+	u64 blkno;
 
 	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
 	     (unsigned long)xi->value_len, xi->name_index,
-	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+	     (unsigned long long)bucket_blkno(xs->bucket));
 
-	if (!xs->bucket.bhs[1]) {
-		ret = ocfs2_read_blocks(inode,
-					xs->bucket.bhs[0]->b_blocknr + 1,
-					blk_per_bucket - 1, &xs->bucket.bhs[1],
-					0);
+	if (!xs->bucket->bu_bhs[1]) {
+		blkno = bucket_blkno(xs->bucket);
+		ocfs2_xattr_bucket_relse(xs->bucket);
+		ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, blk_per_bucket);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
+	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
-					   OCFS2_JOURNAL_ACCESS_WRITE);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
 	ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
 
-	/*Only dirty the blocks we have touched in set xattr. */
-	ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
-						xs->bucket.bhs, blk_per_bucket);
-	if (ret)
-		mlog_errno(ret);
-out:
-	ocfs2_commit_trans(osb, handle);
-
-	return ret;
-}
-
-static int ocfs2_xattr_value_update_size(struct inode *inode,
-					 struct buffer_head *xe_bh,
-					 struct ocfs2_xattr_entry *xe,
-					 u64 new_size)
-{
-	int ret;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	handle_t *handle = NULL;
-
-	handle = ocfs2_start_trans(osb, 1);
-	if (IS_ERR(handle)) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, xe_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	xe->xe_value_size = cpu_to_le64(new_size);
-
-	ret = ocfs2_journal_dirty(handle, xe_bh);
-	if (ret < 0)
-		mlog_errno(ret);
-
-out_commit:
-	ocfs2_commit_trans(osb, handle);
 out:
 	return ret;
 }
@@ -4210,18 +4697,19 @@ out:
  * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
  */
 static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
-					     struct buffer_head *header_bh,
+					     struct ocfs2_xattr_bucket *bucket,
 					     int xe_off,
-					     int len)
+					     int len,
+					     struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret, offset;
 	u64 value_blk;
-	struct buffer_head *value_bh = NULL;
-	struct ocfs2_xattr_value_root *xv;
 	struct ocfs2_xattr_entry *xe;
-	struct ocfs2_xattr_header *xh =
-			(struct ocfs2_xattr_header *)header_bh->b_data;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	size_t blocksize = inode->i_sb->s_blocksize;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
 
 	xe = &xh->xh_entries[xe_off];
 
@@ -4234,49 +4722,58 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 
 	/* We don't allow ocfs2_xattr_value to be stored in different block. */
 	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
-	value_blk += header_bh->b_blocknr;
 
-	ret = ocfs2_read_block(inode, value_blk, &value_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	vb.vb_bh = bucket->bu_bhs[value_blk];
+	BUG_ON(!vb.vb_bh);
 
-	xv = (struct ocfs2_xattr_value_root *)
-		(value_bh->b_data + offset % blocksize);
+	vb.vb_xv = (struct ocfs2_xattr_value_root *)
+		(vb.vb_bh->b_data + offset % blocksize);
 
-	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
-	     xe_off, (unsigned long long)header_bh->b_blocknr, len);
-	ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+	ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+	/*
+	 * From here on out we have to dirty the bucket.  The generic
+	 * value calls only modify one of the bucket's bhs, but we need
+	 * to send the bucket at once.  So if they error, they *could* have
+	 * modified something.  We have to assume they did, and dirty
+	 * the whole bucket.  This leaves us in a consistent state.
+	 */
+	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
+	     xe_off, (unsigned long long)bucket_blkno(bucket), len);
+	ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		goto out_dirty;
 	}
 
+	xe->xe_value_size = cpu_to_le64(len);
+
+out_dirty:
+	ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
+
 out:
-	brelse(value_bh);
 	return ret;
 }
 
 static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
-						struct ocfs2_xattr_search *xs,
-						int len)
+					struct ocfs2_xattr_search *xs,
+					int len,
+					struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret, offset;
 	struct ocfs2_xattr_entry *xe = xs->here;
 	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
 
-	BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+	BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
 
 	offset = xe - xh->xh_entries;
-	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
-						offset, len);
+	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
+						offset, len, ctxt);
 	if (ret)
 		mlog_errno(ret);
 
@@ -4284,6 +4781,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
 }
 
 static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+						handle_t *handle,
 						struct ocfs2_xattr_search *xs,
 						char *val,
 						int value_len)
@@ -4299,7 +4797,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
 
 	xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
 
-	return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
+	return __ocfs2_xattr_set_value_outside(inode, handle,
+					       xv, val, value_len);
 }
 
 static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -4343,15 +4842,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -4392,26 +4891,19 @@ out:
 }
 
 static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+					 handle_t *handle,
 					 struct ocfs2_xattr_search *xs)
 {
-	handle_t *handle = NULL;
-	struct ocfs2_xattr_header *xh = xs->bucket.xh;
+	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
 	struct ocfs2_xattr_entry *last = &xh->xh_entries[
 						le16_to_cpu(xh->xh_count) - 1];
 	int ret = 0;
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		return;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		return;
 	}
 
 	/* Remove the old entry. */
@@ -4420,11 +4912,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 	memset(last, 0, sizeof(struct ocfs2_xattr_entry));
 	le16_add_cpu(&xh->xh_count, -1);
 
-	ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
-	if (ret < 0)
-		mlog_errno(ret);
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
 }
 
 /*
@@ -4440,7 +4928,8 @@ out_commit:
  */
 static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 				     struct ocfs2_xattr_info *xi,
-				     struct ocfs2_xattr_search *xs)
+				     struct ocfs2_xattr_search *xs,
+				     struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret, local = 1;
 	size_t value_len;
@@ -4468,7 +4957,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 			value_len = 0;
 
 		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-							   value_len);
+							   value_len,
+							   ctxt);
 		if (ret)
 			goto out;
 
@@ -4488,7 +4978,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 		xi->value_len = OCFS2_XATTR_ROOT_SIZE;
 	}
 
-	ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
+	ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
+					      name_hash, local);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4499,7 +4990,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 
 	/* allocate the space now for the outside block storage. */
 	ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-						   value_len);
+						   value_len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
 
@@ -4509,13 +5000,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 			 * storage and we have allocated xattr already,
 			 * so need to remove it.
 			 */
-			ocfs2_xattr_bucket_remove_xs(inode, xs);
+			ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
 		}
 		goto out;
 	}
 
 set_value_outside:
-	ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+	ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
+						   xs, val, value_len);
 out:
 	return ret;
 }
@@ -4530,7 +5022,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 					      struct ocfs2_xattr_bucket *bucket,
 					      const char *name)
 {
-	struct ocfs2_xattr_header *xh = bucket->xh;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
 
 	if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4540,7 +5032,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 	    xh->xh_entries[0].xe_name_hash) {
 		mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
 		     "hash = %u\n",
-		     (unsigned long long)bucket->bhs[0]->b_blocknr,
+		     (unsigned long long)bucket_blkno(bucket),
 		     le32_to_cpu(xh->xh_entries[0].xe_name_hash));
 		return -ENOSPC;
 	}
@@ -4550,16 +5042,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 
 static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 					     struct ocfs2_xattr_info *xi,
-					     struct ocfs2_xattr_search *xs)
+					     struct ocfs2_xattr_search *xs,
+					     struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
 	u16 count, header_size, xh_free_start;
-	int i, free, max_free, need, old;
+	int free, max_free, need, old;
 	size_t value_size = 0, name_len = strlen(xi->name);
 	size_t blocksize = inode->i_sb->s_blocksize;
 	int ret, allocation = 0;
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
 	mlog_entry("Set xattr %s in xattr index block\n", xi->name);
 
@@ -4574,7 +5066,7 @@ try_again:
 
 	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
 			"of %u which exceed block size\n",
-			(unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+			(unsigned long long)bucket_blkno(xs->bucket),
 			header_size);
 
 	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,11 +5106,13 @@ try_again:
 	mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
 	     "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
 	     " %u\n", xs->not_found,
-	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+	     (unsigned long long)bucket_blkno(xs->bucket),
 	     free, need, max_free, le16_to_cpu(xh->xh_free_start),
 	     le16_to_cpu(xh->xh_name_value_len));
 
-	if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+	if (free < need ||
+	    (xs->not_found &&
+	     count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
 		if (need <= max_free &&
 		    count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
 			/*
@@ -4626,7 +5120,8 @@ try_again:
 			 * name/value will be moved, the xe shouldn't be changed
 			 * in xs.
 			 */
-			ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+			ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+							xs->bucket);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -4658,7 +5153,7 @@ try_again:
 		 * add a new bucket for the insert.
 		 */
 		ret = ocfs2_check_xattr_bucket_collision(inode,
-							 &xs->bucket,
+							 xs->bucket,
 							 xi->name);
 		if (ret) {
 			mlog_errno(ret);
@@ -4667,17 +5162,21 @@ try_again:
 
 		ret = ocfs2_add_new_xattr_bucket(inode,
 						 xs->xattr_bh,
-						 xs->bucket.bhs[0]);
+						 xs->bucket,
+						 ctxt);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(xs->bucket.bhs[i]);
-
-		memset(&xs->bucket, 0, sizeof(xs->bucket));
-
+		/*
+		 * ocfs2_add_new_xattr_bucket() will have updated
+		 * xs->bucket if it moved, but it will not have updated
+		 * any of the other search fields.  Thus, we drop it and
+		 * re-search.  Everything should be cached, so it'll be
+		 * quick.
+		 */
+		ocfs2_xattr_bucket_relse(xs->bucket);
 		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
 						   xi->name_index,
 						   xi->name, xs);
@@ -4689,7 +5188,7 @@ try_again:
 	}
 
 xattr_set:
-	ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+	ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
 out:
 	mlog_exit(ret);
 	return ret;
@@ -4700,24 +5199,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 					void *para)
 {
 	int ret = 0;
-	struct ocfs2_xattr_header *xh = bucket->xh;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	u16 i;
 	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
+	int credits = ocfs2_remove_extent_credits(osb->sb) +
+		ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+
+	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
 	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
 		xe = &xh->xh_entries[i];
 		if (ocfs2_xattr_is_local(xe))
 			continue;
 
-		ret = ocfs2_xattr_bucket_value_truncate(inode,
-							bucket->bhs[0],
-							i, 0);
+		ctxt.handle = ocfs2_start_trans(osb, credits);
+		if (IS_ERR(ctxt.handle)) {
+			ret = PTR_ERR(ctxt.handle);
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
+							i, 0, &ctxt);
+
+		ocfs2_commit_trans(osb, ctxt.handle);
 		if (ret) {
 			mlog_errno(ret);
 			break;
 		}
 	}
 
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &ctxt.dealloc);
 	return ret;
 }
 
@@ -4768,6 +5284,74 @@ out:
 }
 
 /*
+ * 'security' attributes support
+ */
+static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
+					size_t list_size, const char *name,
+					size_t name_len)
+{
+	const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
+				    void *buffer, size_t size)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
+			       buffer, size);
+}
+
+static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
+				    const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
+			       size, flags);
+}
+
+int ocfs2_init_security_get(struct inode *inode,
+			    struct inode *dir,
+			    struct ocfs2_security_xattr_info *si)
+{
+	/* check whether ocfs2 support feature xattr */
+	if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
+		return -EOPNOTSUPP;
+	return security_inode_init_security(inode, dir, &si->name, &si->value,
+					    &si->value_len);
+}
+
+int ocfs2_init_security_set(handle_t *handle,
+			    struct inode *inode,
+			    struct buffer_head *di_bh,
+			    struct ocfs2_security_xattr_info *si,
+			    struct ocfs2_alloc_context *xattr_ac,
+			    struct ocfs2_alloc_context *data_ac)
+{
+	return ocfs2_xattr_set_handle(handle, inode, di_bh,
+				     OCFS2_XATTR_INDEX_SECURITY,
+				     si->name, si->value, si->value_len, 0,
+				     xattr_ac, data_ac);
+}
+
+struct xattr_handler ocfs2_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= ocfs2_xattr_security_list,
+	.get	= ocfs2_xattr_security_get,
+	.set	= ocfs2_xattr_security_set,
+};
+
+/*
  * 'trusted' attributes support
  */
 static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1d8314c7656d..5a1ebc789f7e 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,13 +30,58 @@ enum ocfs2_xattr_type {
 	OCFS2_XATTR_MAX
 };
 
+struct ocfs2_security_xattr_info {
+	int enable;
+	char *name;
+	void *value;
+	size_t value_len;
+};
+
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
+extern struct xattr_handler ocfs2_xattr_security_handler;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+extern struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern struct xattr_handler ocfs2_xattr_acl_default_handler;
+#endif
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
+			   const char *, void *, size_t);
 int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
 		    size_t, int);
+int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
+			   int, const char *, const void *, size_t, int,
+			   struct ocfs2_alloc_context *,
+			   struct ocfs2_alloc_context *);
 int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
+int ocfs2_init_security_get(struct inode *, struct inode *,
+			    struct ocfs2_security_xattr_info *);
+int ocfs2_init_security_set(handle_t *, struct inode *,
+			    struct buffer_head *,
+			    struct ocfs2_security_xattr_info *,
+			    struct ocfs2_alloc_context *,
+			    struct ocfs2_alloc_context *);
+int ocfs2_calc_security_init(struct inode *,
+			     struct ocfs2_security_xattr_info *,
+			     int *, int *, struct ocfs2_alloc_context **);
+int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
+			  int, struct ocfs2_security_xattr_info *,
+			  int *, int *, struct ocfs2_alloc_context **);
+
+/*
+ * xattrs can live inside an inode, as part of an external xattr block,
+ * or inside an xattr bucket, which is the leaf of a tree rooted in an
+ * xattr block.  Some of the xattr calls, especially the value setting
+ * functions, want to treat each of these locations as equal.  Let's wrap
+ * them in a structure that we can pass around instead of raw buffer_heads.
+ */
+struct ocfs2_xattr_value_buf {
+	struct buffer_head		*vb_bh;
+	ocfs2_journal_access_func	vb_access;
+	struct ocfs2_xattr_value_root	*vb_xv;
+};
+
 
 #endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 6afe57c84f84..633e9dc972bb 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
-	inode->i_blocks = 0;
 	inode->i_mapping->a_ops = &omfs_aops;
 
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index 1cd7d40e9991..d882fd2351d6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -412,7 +412,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
 	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
 		goto out_fput;
 
-	if (inode->i_op && inode->i_op->fallocate)
+	if (inode->i_op->fallocate)
 		ret = inode->i_op->fallocate(inode, mode, offset, len);
 	else
 		ret = -EOPNOTSUPP;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d41bdc784de4..ffcd04f0012c 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,9 +256,6 @@ found:
 		break;
 	}
 
-	inode->i_gid = 0;
-	inode->i_uid = 0;
-
 	d_add(dentry, inode);
 	return NULL;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d5b213b8a9b..5198ada67398 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -384,9 +384,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	dname = dev_name(ddev);
 	if (isdigit(dname[strlen(dname) - 1]))
-		snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
+		dev_set_name(pdev, "%sp%d", dname, partno);
 	else
-		snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
+		dev_set_name(pdev, "%s%d", dname, partno);
 
 	device_initialize(pdev);
 	pdev->class = &block_class;
@@ -447,16 +447,11 @@ void register_disk(struct gendisk *disk)
 	struct block_device *bdev;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
-	char *s;
 	int err;
 
 	ddev->parent = disk->driverfs_dev;
 
-	strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
-	/* ewww... some of these buggers have / in the name... */
-	s = strchr(ddev->bus_id, '/');
-	if (s)
-		*s = '!';
+	dev_set_name(ddev, disk->disk_name);
 
 	/* delay uevents, until we scanned partition table */
 	ddev->uevent_suppress = 1;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cad92c1ac2b3..0c9de19a1633 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
 #include <linux/mm.h>
 #include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
 #include <linux/resource.h>
 #include <linux/module.h>
 #include <linux/mount.h>
@@ -109,25 +110,22 @@ struct pid_entry {
 	.op   = OP,					\
 }
 
-#define DIR(NAME, MODE, OTYPE)							\
-	NOD(NAME, (S_IFDIR|(MODE)),						\
-		&proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations,	\
-		{} )
-#define LNK(NAME, OTYPE)					\
+#define DIR(NAME, MODE, iops, fops)	\
+	NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
+#define LNK(NAME, get_link)					\
 	NOD(NAME, (S_IFLNK|S_IRWXUGO),				\
 		&proc_pid_link_inode_operations, NULL,		\
-		{ .proc_get_link = &proc_##OTYPE##_link } )
-#define REG(NAME, MODE, OTYPE)				\
-	NOD(NAME, (S_IFREG|(MODE)), NULL,		\
-		&proc_##OTYPE##_operations, {})
-#define INF(NAME, MODE, OTYPE)				\
+		{ .proc_get_link = get_link } )
+#define REG(NAME, MODE, fops)				\
+	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
+#define INF(NAME, MODE, read)				\
 	NOD(NAME, (S_IFREG|(MODE)), 			\
 		NULL, &proc_info_file_operations,	\
-		{ .proc_read = &proc_##OTYPE } )
-#define ONE(NAME, MODE, OTYPE)				\
+		{ .proc_read = read } )
+#define ONE(NAME, MODE, show)				\
 	NOD(NAME, (S_IFREG|(MODE)), 			\
 		NULL, &proc_single_file_operations,	\
-		{ .proc_show = &proc_##OTYPE } )
+		{ .proc_show = show } )
 
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
@@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
 	struct mm_struct *mm = get_task_mm(task);
 	if (mm) {
 		unsigned int nwords = 0;
-		do
+		do {
 			nwords += 2;
-		while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
+		} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
 		res = nwords * sizeof(mm->saved_auxv[0]);
 		if (res > PAGE_SIZE)
 			res = PAGE_SIZE;
@@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
 }
 #endif /* CONFIG_KALLSYMS */
 
+#ifdef CONFIG_STACKTRACE
+
+#define MAX_STACK_TRACE_DEPTH	64
+
+static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
+			  struct pid *pid, struct task_struct *task)
+{
+	struct stack_trace trace;
+	unsigned long *entries;
+	int i;
+
+	entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
+	if (!entries)
+		return -ENOMEM;
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
+	trace.entries		= entries;
+	trace.skip		= 0;
+	save_stack_trace_tsk(task, &trace);
+
+	for (i = 0; i < trace.nr_entries; i++) {
+		seq_printf(m, "[<%p>] %pS\n",
+			   (void *)entries[i], (void *)entries[i]);
+	}
+	kfree(entries);
+
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 /*
  * Provides /proc/PID/schedstat
@@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v)
 	struct inode *inode = m->private;
 	struct task_struct *p;
 
-	WARN_ON(!inode);
-
 	p = get_proc_task(inode);
 	if (!p)
 		return -ESRCH;
@@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf,
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct task_struct *p;
 
-	WARN_ON(!inode);
-
 	p = get_proc_task(inode);
 	if (!p)
 		return -ESRCH;
@@ -1426,8 +1451,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	if (!ei->pid)
 		goto out_unlock;
 
-	inode->i_uid = 0;
-	inode->i_gid = 0;
 	if (task_dumpable(task)) {
 		rcu_read_lock();
 		cred = __task_cred(task);
@@ -1976,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 					 const struct pid_entry *ents,
 					 unsigned int nents)
 {
-	struct inode *inode;
 	struct dentry *error;
 	struct task_struct *task = get_proc_task(dir);
 	const struct pid_entry *p, *last;
 
 	error = ERR_PTR(-ENOENT);
-	inode = NULL;
 
 	if (!task)
 		goto out_no_task;
@@ -2138,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = {
 };
 
 static const struct pid_entry attr_dir_stuff[] = {
-	REG("current",    S_IRUGO|S_IWUGO, pid_attr),
-	REG("prev",       S_IRUGO,	   pid_attr),
-	REG("exec",       S_IRUGO|S_IWUGO, pid_attr),
-	REG("fscreate",   S_IRUGO|S_IWUGO, pid_attr),
-	REG("keycreate",  S_IRUGO|S_IWUGO, pid_attr),
-	REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr),
+	REG("current",    S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("prev",       S_IRUGO,	   proc_pid_attr_operations),
+	REG("exec",       S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("fscreate",   S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("keycreate",  S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
 };
 
 static int proc_attr_dir_readdir(struct file * filp,
@@ -2349,8 +2370,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	if (!ei->pid)
 		goto out_iput;
 
-	inode->i_uid = 0;
-	inode->i_gid = 0;
 	inode->i_mode = p->mode;
 	if (S_ISDIR(inode->i_mode))
 		inode->i_nlink = 2;
@@ -2465,74 +2484,77 @@ static const struct file_operations proc_task_operations;
 static const struct inode_operations proc_task_inode_operations;
 
 static const struct pid_entry tgid_base_stuff[] = {
-	DIR("task",       S_IRUGO|S_IXUGO, task),
-	DIR("fd",         S_IRUSR|S_IXUSR, fd),
-	DIR("fdinfo",     S_IRUSR|S_IXUSR, fdinfo),
+	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
+	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 #ifdef CONFIG_NET
-	DIR("net",        S_IRUGO|S_IXUGO, net),
+	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
 #endif
-	REG("environ",    S_IRUSR, environ),
-	INF("auxv",       S_IRUSR, pid_auxv),
-	ONE("status",     S_IRUGO, pid_status),
-	ONE("personality", S_IRUSR, pid_personality),
-	INF("limits",	  S_IRUSR, pid_limits),
+	REG("environ",    S_IRUSR, proc_environ_operations),
+	INF("auxv",       S_IRUSR, proc_pid_auxv),
+	ONE("status",     S_IRUGO, proc_pid_status),
+	ONE("personality", S_IRUSR, proc_pid_personality),
+	INF("limits",	  S_IRUSR, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
-	REG("sched",      S_IRUGO|S_IWUSR, pid_sched),
+	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-	INF("syscall",    S_IRUSR, pid_syscall),
+	INF("syscall",    S_IRUSR, proc_pid_syscall),
 #endif
-	INF("cmdline",    S_IRUGO, pid_cmdline),
-	ONE("stat",       S_IRUGO, tgid_stat),
-	ONE("statm",      S_IRUGO, pid_statm),
-	REG("maps",       S_IRUGO, maps),
+	INF("cmdline",    S_IRUGO, proc_pid_cmdline),
+	ONE("stat",       S_IRUGO, proc_tgid_stat),
+	ONE("statm",      S_IRUGO, proc_pid_statm),
+	REG("maps",       S_IRUGO, proc_maps_operations),
 #ifdef CONFIG_NUMA
-	REG("numa_maps",  S_IRUGO, numa_maps),
+	REG("numa_maps",  S_IRUGO, proc_numa_maps_operations),
 #endif
-	REG("mem",        S_IRUSR|S_IWUSR, mem),
-	LNK("cwd",        cwd),
-	LNK("root",       root),
-	LNK("exe",        exe),
-	REG("mounts",     S_IRUGO, mounts),
-	REG("mountinfo",  S_IRUGO, mountinfo),
-	REG("mountstats", S_IRUSR, mountstats),
+	REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
+	LNK("cwd",        proc_cwd_link),
+	LNK("root",       proc_root_link),
+	LNK("exe",        proc_exe_link),
+	REG("mounts",     S_IRUGO, proc_mounts_operations),
+	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
+	REG("mountstats", S_IRUSR, proc_mountstats_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
-	REG("clear_refs", S_IWUSR, clear_refs),
-	REG("smaps",      S_IRUGO, smaps),
-	REG("pagemap",    S_IRUSR, pagemap),
+	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+	REG("smaps",      S_IRUGO, proc_smaps_operations),
+	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
-	DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
+	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
 #endif
 #ifdef CONFIG_KALLSYMS
-	INF("wchan",      S_IRUGO, pid_wchan),
+	INF("wchan",      S_IRUGO, proc_pid_wchan),
+#endif
+#ifdef CONFIG_STACKTRACE
+	ONE("stack",      S_IRUSR, proc_pid_stack),
 #endif
 #ifdef CONFIG_SCHEDSTATS
-	INF("schedstat",  S_IRUGO, pid_schedstat),
+	INF("schedstat",  S_IRUGO, proc_pid_schedstat),
 #endif
 #ifdef CONFIG_LATENCYTOP
-	REG("latency",  S_IRUGO, lstats),
+	REG("latency",  S_IRUGO, proc_lstats_operations),
 #endif
 #ifdef CONFIG_PROC_PID_CPUSET
-	REG("cpuset",     S_IRUGO, cpuset),
+	REG("cpuset",     S_IRUGO, proc_cpuset_operations),
 #endif
 #ifdef CONFIG_CGROUPS
-	REG("cgroup",  S_IRUGO, cgroup),
+	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
-	INF("oom_score",  S_IRUGO, oom_score),
-	REG("oom_adj",    S_IRUGO|S_IWUSR, oom_adjust),
+	INF("oom_score",  S_IRUGO, proc_oom_score),
+	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
 #ifdef CONFIG_AUDITSYSCALL
-	REG("loginuid",   S_IWUSR|S_IRUGO, loginuid),
-	REG("sessionid",  S_IRUGO, sessionid),
+	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
+	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
-	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
+	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
 #endif
 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
-	REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
+	REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	INF("io",	S_IRUGO, tgid_io_accounting),
+	INF("io",	S_IRUGO, proc_tgid_io_accounting),
 #endif
 };
 
@@ -2805,66 +2827,69 @@ out_no_task:
  * Tasks
  */
 static const struct pid_entry tid_base_stuff[] = {
-	DIR("fd",        S_IRUSR|S_IXUSR, fd),
-	DIR("fdinfo",    S_IRUSR|S_IXUSR, fdinfo),
-	REG("environ",   S_IRUSR, environ),
-	INF("auxv",      S_IRUSR, pid_auxv),
-	ONE("status",    S_IRUGO, pid_status),
-	ONE("personality", S_IRUSR, pid_personality),
-	INF("limits",	 S_IRUSR, pid_limits),
+	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
+	REG("environ",   S_IRUSR, proc_environ_operations),
+	INF("auxv",      S_IRUSR, proc_pid_auxv),
+	ONE("status",    S_IRUGO, proc_pid_status),
+	ONE("personality", S_IRUSR, proc_pid_personality),
+	INF("limits",	 S_IRUSR, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
-	REG("sched",     S_IRUGO|S_IWUSR, pid_sched),
+	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-	INF("syscall",   S_IRUSR, pid_syscall),
+	INF("syscall",   S_IRUSR, proc_pid_syscall),
 #endif
-	INF("cmdline",   S_IRUGO, pid_cmdline),
-	ONE("stat",      S_IRUGO, tid_stat),
-	ONE("statm",     S_IRUGO, pid_statm),
-	REG("maps",      S_IRUGO, maps),
+	INF("cmdline",   S_IRUGO, proc_pid_cmdline),
+	ONE("stat",      S_IRUGO, proc_tid_stat),
+	ONE("statm",     S_IRUGO, proc_pid_statm),
+	REG("maps",      S_IRUGO, proc_maps_operations),
 #ifdef CONFIG_NUMA
-	REG("numa_maps", S_IRUGO, numa_maps),
+	REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
 #endif
-	REG("mem",       S_IRUSR|S_IWUSR, mem),
-	LNK("cwd",       cwd),
-	LNK("root",      root),
-	LNK("exe",       exe),
-	REG("mounts",    S_IRUGO, mounts),
-	REG("mountinfo",  S_IRUGO, mountinfo),
+	REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
+	LNK("cwd",       proc_cwd_link),
+	LNK("root",      proc_root_link),
+	LNK("exe",       proc_exe_link),
+	REG("mounts",    S_IRUGO, proc_mounts_operations),
+	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
-	REG("clear_refs", S_IWUSR, clear_refs),
-	REG("smaps",     S_IRUGO, smaps),
-	REG("pagemap",    S_IRUSR, pagemap),
+	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+	REG("smaps",     S_IRUGO, proc_smaps_operations),
+	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
-	DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
+	DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
 #endif
 #ifdef CONFIG_KALLSYMS
-	INF("wchan",     S_IRUGO, pid_wchan),
+	INF("wchan",     S_IRUGO, proc_pid_wchan),
+#endif
+#ifdef CONFIG_STACKTRACE
+	ONE("stack",      S_IRUSR, proc_pid_stack),
 #endif
 #ifdef CONFIG_SCHEDSTATS
-	INF("schedstat", S_IRUGO, pid_schedstat),
+	INF("schedstat", S_IRUGO, proc_pid_schedstat),
 #endif
 #ifdef CONFIG_LATENCYTOP
-	REG("latency",  S_IRUGO, lstats),
+	REG("latency",  S_IRUGO, proc_lstats_operations),
 #endif
 #ifdef CONFIG_PROC_PID_CPUSET
-	REG("cpuset",    S_IRUGO, cpuset),
+	REG("cpuset",    S_IRUGO, proc_cpuset_operations),
 #endif
 #ifdef CONFIG_CGROUPS
-	REG("cgroup",  S_IRUGO, cgroup),
+	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
-	INF("oom_score", S_IRUGO, oom_score),
-	REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
+	INF("oom_score", S_IRUGO, proc_oom_score),
+	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
 #ifdef CONFIG_AUDITSYSCALL
-	REG("loginuid",  S_IWUSR|S_IRUGO, loginuid),
-	REG("sessionid",  S_IRUSR, sessionid),
+	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
+	REG("sessionid",  S_IRUSR, proc_sessionid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
-	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
+	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	INF("io",	S_IRUGO, tid_io_accounting),
+	INF("io",	S_IRUGO, proc_tid_io_accounting),
 #endif
 };
 
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 60a359b35582..db7fa5cab988 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -14,7 +14,6 @@
 #include <linux/stat.h>
 #include <linux/module.h>
 #include <linux/mount.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/idr.h>
 #include <linux/namei.h>
@@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 	struct inode *inode = NULL;
 	int error = -ENOENT;
 
-	lock_kernel();
 	spin_lock(&proc_subdir_lock);
 	for (de = de->subdir; de ; de = de->next) {
 		if (de->namelen != dentry->d_name.len)
@@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 	}
 	spin_unlock(&proc_subdir_lock);
 out_unlock:
-	unlock_kernel();
 
 	if (inode) {
 		dentry->d_op = &proc_dentry_operations;
@@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = 0;
 
-	lock_kernel();
-
 	ino = inode->i_ino;
 	i = filp->f_pos;
 	switch (i) {
@@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 			spin_unlock(&proc_subdir_lock);
 	}
 	ret = 1;
-out:	unlock_kernel();
+out:
 	return ret;	
 }
 
@@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
  * the /proc directory.
  */
 static const struct file_operations proc_dir_operations = {
+	.llseek			= generic_file_llseek,
 	.read			= generic_read_dir,
 	.readdir		= proc_readdir,
 };
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2543fd00c658..3e76bb9b3ad6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de)
  */
 void de_put(struct proc_dir_entry *de)
 {
-	lock_kernel();
 	if (!atomic_read(&de->count)) {
 		printk("de_put: entry %s already free!\n", de->name);
-		unlock_kernel();
 		return;
 	}
 
 	if (atomic_dec_and_test(&de->count))
 		free_proc_entry(de);
-	unlock_kernel();
 }
 
 /*
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 7bc296f424ae..04d1270f1c38 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -18,7 +18,6 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
-#include <linux/smp_lock.h>
 #include <linux/mount.h>
 #include <linux/nsproxy.h>
 #include <net/net_namespace.h>
@@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
 }
 
 const struct file_operations proc_net_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= proc_tgid_net_readdir,
 };
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 06ed10b7da9e..94fcfff6863a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
 	inode->i_mode = table->mode;
-	inode->i_uid = inode->i_gid = 0;
 	if (!table->child) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 7761602af9de..f6299a25594e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,7 +16,6 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
-#include <linux/smp_lock.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
 
@@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp,
 	unsigned int nr = filp->f_pos;
 	int ret;
 
-	lock_kernel();
-
 	if (nr < FIRST_PROCESS_ENTRY) {
 		int error = proc_readdir(filp, dirent, filldir);
-		if (error <= 0) {
-			unlock_kernel();
+		if (error <= 0)
 			return error;
-		}
 		filp->f_pos = FIRST_PROCESS_ENTRY;
 	}
-	unlock_kernel();
 
 	ret = proc_pid_readdir(filp, dirent, filldir);
 	return ret;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3a8bdd7f5756..94063840832a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v)
 		   "Private_Clean:  %8lu kB\n"
 		   "Private_Dirty:  %8lu kB\n"
 		   "Referenced:     %8lu kB\n"
-		   "Swap:           %8lu kB\n",
+		   "Swap:           %8lu kB\n"
+		   "KernelPageSize: %8lu kB\n"
+		   "MMUPageSize:    %8lu kB\n",
 		   (vma->vm_end - vma->vm_start) >> 10,
 		   mss.resident >> 10,
 		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.private_clean >> 10,
 		   mss.private_dirty >> 10,
 		   mss.referenced >> 10,
-		   mss.swap >> 10);
+		   mss.swap >> 10,
+		   vma_kernel_pagesize(vma) >> 10,
+		   vma_mmu_pagesize(vma) >> 10);
 
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 219bd79ea894..d4a8be32b902 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -9,7 +9,7 @@
 
 /*
  * Logic: we've got two memory sums for each process, "shared", and
- * "non-shared". Shared memory may get counted more then once, for
+ * "non-shared". Shared memory may get counted more than once, for
  * each process that owns it. Non-shared memory is counted
  * accurately.
  */
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 03ec59504906..5edcc3f92ba7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
 
 	offset = (unsigned long)(*ppos % PAGE_SIZE);
 	pfn = (unsigned long)(*ppos / PAGE_SIZE);
-	if (pfn > saved_max_pfn)
-		return -EINVAL;
 
 	do {
 		if (count > (PAGE_SIZE - offset))
diff --git a/fs/quota.c b/fs/quota.c
index b7fe44e01618..4a8c94f05f76 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
 		case Q_SETQUOTA:
 		case Q_GETQUOTA:
 			/* This is just informative test so we are satisfied without a lock */
-			if (!sb_has_quota_enabled(sb, type))
+			if (!sb_has_quota_active(sb, type))
 				return -ESRCH;
 	}
 
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	int cnt;
 
 	sb->s_qcop->quota_sync(sb, type);
+
+	if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
+		return;
 	/* This is not very clever (and fast) but currently I don't know about
 	 * any other simple way of getting quota data to disk and we must get
 	 * them there for userspace to be visible... */
@@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
-		if (!sb_has_quota_enabled(sb, cnt))
+		if (!sb_has_quota_active(sb, cnt))
 			continue;
 		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
 		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -201,7 +204,7 @@ restart:
 		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 			if (type != -1 && type != cnt)
 				continue;
-			if (!sb_has_quota_enabled(sb, cnt))
+			if (!sb_has_quota_active(sb, cnt))
 				continue;
 			if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
 			    list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
@@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 			__u32 fmt;
 
 			down_read(&sb_dqopt(sb)->dqptr_sem);
-			if (!sb_has_quota_enabled(sb, type)) {
+			if (!sb_has_quota_active(sb, type)) {
 				up_read(&sb_dqopt(sb)->dqptr_sem);
 				return -ESRCH;
 			}
diff --git a/fs/quota_tree.c b/fs/quota_tree.c
new file mode 100644
index 000000000000..953404c95b17
--- /dev/null
+++ b/fs/quota_tree.c
@@ -0,0 +1,645 @@
+/*
+ *	vfsv0 quota IO operations on file
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/dqblk_v2.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/quotaops.h>
+
+#include <asm/byteorder.h>
+
+#include "quota_tree.h"
+
+MODULE_AUTHOR("Jan Kara");
+MODULE_DESCRIPTION("Quota trie support");
+MODULE_LICENSE("GPL");
+
+#define __QUOTA_QT_PARANOIA
+
+typedef char *dqbuf_t;
+
+static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
+{
+	unsigned int epb = info->dqi_usable_bs >> 2;
+
+	depth = info->dqi_qtree_depth - depth - 1;
+	while (depth--)
+		id /= epb;
+	return id % epb;
+}
+
+/* Number of entries in one blocks */
+static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
+{
+	return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
+	       / info->dqi_entry_size;
+}
+
+static dqbuf_t getdqbuf(size_t size)
+{
+	dqbuf_t buf = kmalloc(size, GFP_NOFS);
+	if (!buf)
+		printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
+	return buf;
+}
+
+static inline void freedqbuf(dqbuf_t buf)
+{
+	kfree(buf);
+}
+
+static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+	struct super_block *sb = info->dqi_sb;
+
+	memset(buf, 0, info->dqi_usable_bs);
+	return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
+	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+}
+
+static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+	struct super_block *sb = info->dqi_sb;
+
+	return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
+	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+}
+
+/* Remove empty block from list and return it */
+static int get_free_dqblk(struct qtree_mem_dqinfo *info)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int ret, blk;
+
+	if (!buf)
+		return -ENOMEM;
+	if (info->dqi_free_blk) {
+		blk = info->dqi_free_blk;
+		ret = read_blk(info, blk, buf);
+		if (ret < 0)
+			goto out_buf;
+		info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
+	}
+	else {
+		memset(buf, 0, info->dqi_usable_bs);
+		/* Assure block allocation... */
+		ret = write_blk(info, info->dqi_blocks, buf);
+		if (ret < 0)
+			goto out_buf;
+		blk = info->dqi_blocks++;
+	}
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	ret = blk;
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Insert empty block to the list */
+static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int err;
+
+	dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
+	dh->dqdh_prev_free = cpu_to_le32(0);
+	dh->dqdh_entries = cpu_to_le16(0);
+	err = write_blk(info, blk, buf);
+	if (err < 0)
+		return err;
+	info->dqi_free_blk = blk;
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	return 0;
+}
+
+/* Remove given block from the list of blocks with free entries */
+static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+	dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	uint nextblk = le32_to_cpu(dh->dqdh_next_free);
+	uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
+	int err;
+
+	if (!tmpbuf)
+		return -ENOMEM;
+	if (nextblk) {
+		err = read_blk(info, nextblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+							dh->dqdh_prev_free;
+		err = write_blk(info, nextblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	}
+	if (prevblk) {
+		err = read_blk(info, prevblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
+							dh->dqdh_next_free;
+		err = write_blk(info, prevblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	} else {
+		info->dqi_free_entry = nextblk;
+		mark_info_dirty(info->dqi_sb, info->dqi_type);
+	}
+	freedqbuf(tmpbuf);
+	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
+	/* No matter whether write succeeds block is out of list */
+	if (write_blk(info, blk, buf) < 0)
+		printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
+	return 0;
+out_buf:
+	freedqbuf(tmpbuf);
+	return err;
+}
+
+/* Insert given block to the beginning of list with free entries */
+static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+	dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int err;
+
+	if (!tmpbuf)
+		return -ENOMEM;
+	dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
+	dh->dqdh_prev_free = cpu_to_le32(0);
+	err = write_blk(info, blk, buf);
+	if (err < 0)
+		goto out_buf;
+	if (info->dqi_free_entry) {
+		err = read_blk(info, info->dqi_free_entry, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+							cpu_to_le32(blk);
+		err = write_blk(info, info->dqi_free_entry, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	}
+	freedqbuf(tmpbuf);
+	info->dqi_free_entry = blk;
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	return 0;
+out_buf:
+	freedqbuf(tmpbuf);
+	return err;
+}
+
+/* Is the entry in the block free? */
+int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
+{
+	int i;
+
+	for (i = 0; i < info->dqi_entry_size; i++)
+		if (disk[i])
+			return 0;
+	return 1;
+}
+EXPORT_SYMBOL(qtree_entry_unused);
+
+/* Find space for dquot */
+static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
+			      struct dquot *dquot, int *err)
+{
+	uint blk, i;
+	struct qt_disk_dqdbheader *dh;
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *ddquot;
+
+	*err = 0;
+	if (!buf) {
+		*err = -ENOMEM;
+		return 0;
+	}
+	dh = (struct qt_disk_dqdbheader *)buf;
+	if (info->dqi_free_entry) {
+		blk = info->dqi_free_entry;
+		*err = read_blk(info, blk, buf);
+		if (*err < 0)
+			goto out_buf;
+	} else {
+		blk = get_free_dqblk(info);
+		if ((int)blk < 0) {
+			*err = blk;
+			freedqbuf(buf);
+			return 0;
+		}
+		memset(buf, 0, info->dqi_usable_bs);
+		/* This is enough as block is already zeroed and entry list is empty... */
+		info->dqi_free_entry = blk;
+		mark_info_dirty(dquot->dq_sb, dquot->dq_type);
+	}
+	/* Block will be full? */
+	if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
+		*err = remove_free_dqentry(info, buf, blk);
+		if (*err < 0) {
+			printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
+			       "remove block (%u) from entry free list.\n",
+			       blk);
+			goto out_buf;
+		}
+	}
+	le16_add_cpu(&dh->dqdh_entries, 1);
+	/* Find free structure in block */
+	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+	     i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
+	     i++, ddquot += info->dqi_entry_size);
+#ifdef __QUOTA_QT_PARANOIA
+	if (i == qtree_dqstr_in_blk(info)) {
+		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
+				"but it shouldn't.\n");
+		*err = -EIO;
+		goto out_buf;
+	}
+#endif
+	*err = write_blk(info, blk, buf);
+	if (*err < 0) {
+		printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
+				"data block %u.\n", blk);
+		goto out_buf;
+	}
+	dquot->dq_off = (blk << info->dqi_blocksize_bits) +
+			sizeof(struct qt_disk_dqdbheader) +
+			i * info->dqi_entry_size;
+	freedqbuf(buf);
+	return blk;
+out_buf:
+	freedqbuf(buf);
+	return 0;
+}
+
+/* Insert reference to structure into the trie */
+static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+			  uint *treeblk, int depth)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0, newson = 0, newact = 0;
+	__le32 *ref;
+	uint newblk;
+
+	if (!buf)
+		return -ENOMEM;
+	if (!*treeblk) {
+		ret = get_free_dqblk(info);
+		if (ret < 0)
+			goto out_buf;
+		*treeblk = ret;
+		memset(buf, 0, info->dqi_usable_bs);
+		newact = 1;
+	} else {
+		ret = read_blk(info, *treeblk, buf);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: Can't read tree quota block "
+					"%u.\n", *treeblk);
+			goto out_buf;
+		}
+	}
+	ref = (__le32 *)buf;
+	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (!newblk)
+		newson = 1;
+	if (depth == info->dqi_qtree_depth - 1) {
+#ifdef __QUOTA_QT_PARANOIA
+		if (newblk) {
+			printk(KERN_ERR "VFS: Inserting already present quota "
+					"entry (block %u).\n",
+			       le32_to_cpu(ref[get_index(info,
+						dquot->dq_id, depth)]));
+			ret = -EIO;
+			goto out_buf;
+		}
+#endif
+		newblk = find_free_dqentry(info, dquot, &ret);
+	} else {
+		ret = do_insert_tree(info, dquot, &newblk, depth+1);
+	}
+	if (newson && ret >= 0) {
+		ref[get_index(info, dquot->dq_id, depth)] =
+							cpu_to_le32(newblk);
+		ret = write_blk(info, *treeblk, buf);
+	} else if (newact && ret < 0) {
+		put_free_dqblk(info, buf, *treeblk);
+	}
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Wrapper for inserting quota structure into tree */
+static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
+				 struct dquot *dquot)
+{
+	int tmp = QT_TREEOFF;
+	return do_insert_tree(info, dquot, &tmp, 0);
+}
+
+/*
+ *	We don't have to be afraid of deadlocks as we never have quotas on quota files...
+ */
+int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	int type = dquot->dq_type;
+	struct super_block *sb = dquot->dq_sb;
+	ssize_t ret;
+	dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
+
+	if (!ddquot)
+		return -ENOMEM;
+
+	/* dq_off is guarded by dqio_mutex */
+	if (!dquot->dq_off) {
+		ret = dq_insert_tree(info, dquot);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: Error %zd occurred while "
+					"creating quota.\n", ret);
+			freedqbuf(ddquot);
+			return ret;
+		}
+	}
+	spin_lock(&dq_data_lock);
+	info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
+	spin_unlock(&dq_data_lock);
+	ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
+					info->dqi_entry_size, dquot->dq_off);
+	if (ret != info->dqi_entry_size) {
+		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
+		       sb->s_id);
+		if (ret >= 0)
+			ret = -ENOSPC;
+	} else {
+		ret = 0;
+	}
+	dqstats.writes++;
+	freedqbuf(ddquot);
+
+	return ret;
+}
+EXPORT_SYMBOL(qtree_write_dquot);
+
+/* Free dquot entry in data block */
+static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+			uint blk)
+{
+	struct qt_disk_dqdbheader *dh;
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0;
+
+	if (!buf)
+		return -ENOMEM;
+	if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
+		printk(KERN_ERR "VFS: Quota structure has offset to other "
+		  "block (%u) than it should (%u).\n", blk,
+		  (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+		goto out_buf;
+	}
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+		goto out_buf;
+	}
+	dh = (struct qt_disk_dqdbheader *)buf;
+	le16_add_cpu(&dh->dqdh_entries, -1);
+	if (!le16_to_cpu(dh->dqdh_entries)) {	/* Block got free? */
+		ret = remove_free_dqentry(info, buf, blk);
+		if (ret >= 0)
+			ret = put_free_dqblk(info, buf, blk);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: Can't move quota data block (%u) "
+			  "to free list.\n", blk);
+			goto out_buf;
+		}
+	} else {
+		memset(buf +
+		       (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
+		       0, info->dqi_entry_size);
+		if (le16_to_cpu(dh->dqdh_entries) ==
+		    qtree_dqstr_in_blk(info) - 1) {
+			/* Insert will write block itself */
+			ret = insert_free_dqentry(info, buf, blk);
+			if (ret < 0) {
+				printk(KERN_ERR "VFS: Can't insert quota data "
+				       "block (%u) to free entry list.\n", blk);
+				goto out_buf;
+			}
+		} else {
+			ret = write_blk(info, blk, buf);
+			if (ret < 0) {
+				printk(KERN_ERR "VFS: Can't write quota data "
+				  "block %u\n", blk);
+				goto out_buf;
+			}
+		}
+	}
+	dquot->dq_off = 0;	/* Quota is now unattached */
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Remove reference to dquot from tree */
+static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+		       uint *blk, int depth)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0;
+	uint newblk;
+	__le32 *ref = (__le32 *)buf;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, *blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+		goto out_buf;
+	}
+	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (depth == info->dqi_qtree_depth - 1) {
+		ret = free_dqentry(info, dquot, newblk);
+		newblk = 0;
+	} else {
+		ret = remove_tree(info, dquot, &newblk, depth+1);
+	}
+	if (ret >= 0 && !newblk) {
+		int i;
+		ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
+		/* Block got empty? */
+		for (i = 0;
+		     i < (info->dqi_usable_bs >> 2) && !ref[i];
+		     i++);
+		/* Don't put the root block into the free block list */
+		if (i == (info->dqi_usable_bs >> 2)
+		    && *blk != QT_TREEOFF) {
+			put_free_dqblk(info, buf, *blk);
+			*blk = 0;
+		} else {
+			ret = write_blk(info, *blk, buf);
+			if (ret < 0)
+				printk(KERN_ERR "VFS: Can't write quota tree "
+				  "block %u.\n", *blk);
+		}
+	}
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Delete dquot from tree */
+int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	uint tmp = QT_TREEOFF;
+
+	if (!dquot->dq_off)	/* Even not allocated? */
+		return 0;
+	return remove_tree(info, dquot, &tmp, 0);
+}
+EXPORT_SYMBOL(qtree_delete_dquot);
+
+/* Find entry in block */
+static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
+				 struct dquot *dquot, uint blk)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	loff_t ret = 0;
+	int i;
+	char *ddquot;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		goto out_buf;
+	}
+	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+	     i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
+	     i++, ddquot += info->dqi_entry_size);
+	if (i == qtree_dqstr_in_blk(info)) {
+		printk(KERN_ERR "VFS: Quota for id %u referenced "
+		  "but not present.\n", dquot->dq_id);
+		ret = -EIO;
+		goto out_buf;
+	} else {
+		ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
+		  qt_disk_dqdbheader) + i * info->dqi_entry_size;
+	}
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Find entry for given id in the tree */
+static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
+				struct dquot *dquot, uint blk, int depth)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	loff_t ret = 0;
+	__le32 *ref = (__le32 *)buf;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		goto out_buf;
+	}
+	ret = 0;
+	blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (!blk)	/* No reference? */
+		goto out_buf;
+	if (depth < info->dqi_qtree_depth - 1)
+		ret = find_tree_dqentry(info, dquot, blk, depth+1);
+	else
+		ret = find_block_dqentry(info, dquot, blk);
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Find entry for given id in the tree - wrapper function */
+static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
+				  struct dquot *dquot)
+{
+	return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
+}
+
+int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	int type = dquot->dq_type;
+	struct super_block *sb = dquot->dq_sb;
+	loff_t offset;
+	dqbuf_t ddquot;
+	int ret = 0;
+
+#ifdef __QUOTA_QT_PARANOIA
+	/* Invalidated quota? */
+	if (!sb_dqopt(dquot->dq_sb)->files[type]) {
+		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+		return -EIO;
+	}
+#endif
+	/* Do we know offset of the dquot entry in the quota file? */
+	if (!dquot->dq_off) {
+		offset = find_dqentry(info, dquot);
+		if (offset <= 0) {	/* Entry not present? */
+			if (offset < 0)
+				printk(KERN_ERR "VFS: Can't read quota "
+				  "structure for id %u.\n", dquot->dq_id);
+			dquot->dq_off = 0;
+			set_bit(DQ_FAKE_B, &dquot->dq_flags);
+			memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+			ret = offset;
+			goto out;
+		}
+		dquot->dq_off = offset;
+	}
+	ddquot = getdqbuf(info->dqi_entry_size);
+	if (!ddquot)
+		return -ENOMEM;
+	ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
+				   info->dqi_entry_size, dquot->dq_off);
+	if (ret != info->dqi_entry_size) {
+		if (ret >= 0)
+			ret = -EIO;
+		printk(KERN_ERR "VFS: Error while reading quota "
+				"structure for id %u.\n", dquot->dq_id);
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+		freedqbuf(ddquot);
+		goto out;
+	}
+	spin_lock(&dq_data_lock);
+	info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
+	if (!dquot->dq_dqb.dqb_bhardlimit &&
+	    !dquot->dq_dqb.dqb_bsoftlimit &&
+	    !dquot->dq_dqb.dqb_ihardlimit &&
+	    !dquot->dq_dqb.dqb_isoftlimit)
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+	spin_unlock(&dq_data_lock);
+	freedqbuf(ddquot);
+out:
+	dqstats.reads++;
+	return ret;
+}
+EXPORT_SYMBOL(qtree_read_dquot);
+
+/* Check whether dquot should not be deleted. We know we are
+ * the only one operating on dquot (thanks to dq_lock) */
+int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
+		return qtree_delete_dquot(info, dquot);
+	return 0;
+}
+EXPORT_SYMBOL(qtree_release_dquot);
diff --git a/fs/quota_tree.h b/fs/quota_tree.h
new file mode 100644
index 000000000000..a1ab8db81a51
--- /dev/null
+++ b/fs/quota_tree.h
@@ -0,0 +1,25 @@
+/*
+ *	Definitions of structures for vfsv0 quota format
+ */
+
+#ifndef _LINUX_QUOTA_TREE_H
+#define _LINUX_QUOTA_TREE_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ *  Structure of header of block with quota structures. It is padded to 16 bytes so
+ *  there will be space for exactly 21 quota-entries in a block
+ */
+struct qt_disk_dqdbheader {
+	__le32 dqdh_next_free;	/* Number of next block with free entry */
+	__le32 dqdh_prev_free;	/* Number of previous block with free entry */
+	__le16 dqdh_entries;	/* Number of valid entries in block */
+	__le16 dqdh_pad1;
+	__le32 dqdh_pad2;
+};
+
+#define QT_TREEOFF	1		/* Offset of tree in file in blocks */
+
+#endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index 5ae15b13eeb0..b4af1c69ad16 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -3,25 +3,39 @@
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/dqblk_v1.h>
-#include <linux/quotaio_v1.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
 
 #include <asm/byteorder.h>
 
+#include "quotaio_v1.h"
+
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Old quota format support");
 MODULE_LICENSE("GPL");
 
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+static inline qsize_t v1_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v1_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
+
 static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
 {
 	m->dqb_ihardlimit = d->dqb_ihardlimit;
 	m->dqb_isoftlimit = d->dqb_isoftlimit;
 	m->dqb_curinodes = d->dqb_curinodes;
-	m->dqb_bhardlimit = d->dqb_bhardlimit;
-	m->dqb_bsoftlimit = d->dqb_bsoftlimit;
-	m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS;
+	m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
+	m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
+	m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
 	m->dqb_itime = d->dqb_itime;
 	m->dqb_btime = d->dqb_btime;
 }
@@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
 	d->dqb_ihardlimit = m->dqb_ihardlimit;
 	d->dqb_isoftlimit = m->dqb_isoftlimit;
 	d->dqb_curinodes = m->dqb_curinodes;
-	d->dqb_bhardlimit = m->dqb_bhardlimit;
-	d->dqb_bsoftlimit = m->dqb_bsoftlimit;
-	d->dqb_curblocks = toqb(m->dqb_curspace);
+	d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
+	d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
+	d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
 	d->dqb_itime = m->dqb_itime;
 	d->dqb_btime = m->dqb_btime;
 }
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index b53827dc02d9..b618b563635c 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -6,7 +6,6 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/dqblk_v2.h>
-#include <linux/quotaio_v2.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -15,16 +14,37 @@
 
 #include <asm/byteorder.h>
 
+#include "quota_tree.h"
+#include "quotaio_v2.h"
+
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Quota format v2 support");
 MODULE_LICENSE("GPL");
 
 #define __QUOTA_V2_PARANOIA
 
-typedef char *dqbuf_t;
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2_is_id(void *dp, struct dquot *dquot);
+
+static struct qtree_fmt_operations v2_qtree_ops = {
+	.mem2disk_dqblk = v2_mem2diskdqb,
+	.disk2mem_dqblk = v2_disk2memdqb,
+	.is_id = v2_is_id,
+};
+
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
 
-#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
+static inline qsize_t v2_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v2_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
 
 /* Check whether given file is really vfsv0 quotafile */
 static int v2_check_quota_file(struct super_block *sb, int type)
@@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
 static int v2_read_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo;
 	ssize_t size;
 
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type)
 			sb->s_id);
 		return -1;
 	}
+	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+	if (!info->dqi_priv) {
+		printk(KERN_WARNING
+		       "Not enough memory for quota information structure.\n");
+		return -1;
+	}
+	qinfo = info->dqi_priv;
 	/* limits are stored as unsigned 32-bit data */
 	info->dqi_maxblimit = 0xffffffff;
 	info->dqi_maxilimit = 0xffffffff;
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
-	info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
-	info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
-	info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	qinfo->dqi_sb = sb;
+	qinfo->dqi_type = type;
+	qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+	qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+	qinfo->dqi_qtree_depth = qtree_depth(qinfo);
+	qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
+	qinfo->dqi_ops = &v2_qtree_ops;
 	return 0;
 }
 
@@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type)
 static int v2_write_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
 	ssize_t size;
 
 	spin_lock(&dq_data_lock);
@@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
 	dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
 	spin_unlock(&dq_data_lock);
-	dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks);
-	dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-	dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry);
+	dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	return 0;
 }
 
-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
+static void v2_disk2memdqb(struct dquot *dquot, void *dp)
 {
+	struct v2_disk_dqblk *d = dp, empty;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
 	m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
 	m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
 	m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
 	m->dqb_itime = le64_to_cpu(d->dqb_itime);
-	m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
-	m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+	m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
+	m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
 	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
 	m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	/* We need to escape back all-zero structure */
+	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+	empty.dqb_itime = cpu_to_le64(1);
+	if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
+		m->dqb_itime = 0;
 }
 
-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
 {
+	struct v2_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
 	d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
 	d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
 	d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
 	d->dqb_itime = cpu_to_le64(m->dqb_itime);
-	d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
-	d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+	d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
+	d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
-	d->dqb_id = cpu_to_le32(id);
-}
-
-static dqbuf_t getdqbuf(void)
-{
-	dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS);
-	if (!buf)
-		printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
-	return buf;
-}
-
-static inline void freedqbuf(dqbuf_t buf)
-{
-	kfree(buf);
-}
-
-static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-	memset(buf, 0, V2_DQBLKSIZE);
-	return sb->s_op->quota_read(sb, type, (char *)buf,
-	       V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-	return sb->s_op->quota_write(sb, type, (char *)buf,
-	       V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-/* Remove empty block from list and return it */
-static int get_free_dqblk(struct super_block *sb, int type)
-{
-	dqbuf_t buf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int ret, blk;
-
-	if (!buf)
-		return -ENOMEM;
-	if (info->u.v2_i.dqi_free_blk) {
-		blk = info->u.v2_i.dqi_free_blk;
-		if ((ret = read_blk(sb, type, blk, buf)) < 0)
-			goto out_buf;
-		info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
-	}
-	else {
-		memset(buf, 0, V2_DQBLKSIZE);
-		/* Assure block allocation... */
-		if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0)
-			goto out_buf;
-		blk = info->u.v2_i.dqi_blocks++;
-	}
-	mark_info_dirty(sb, type);
-	ret = blk;
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Insert empty block to the list */
-static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int err;
-
-	dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-	dh->dqdh_prev_free = cpu_to_le32(0);
-	dh->dqdh_entries = cpu_to_le16(0);
-	info->u.v2_i.dqi_free_blk = blk;
-	mark_info_dirty(sb, type);
-	/* Some strange block. We had better leave it... */
-	if ((err = write_blk(sb, type, blk, buf)) < 0)
-		return err;
-	return 0;
+	d->dqb_id = cpu_to_le32(dquot->dq_id);
+	if (qtree_entry_unused(info, dp))
+		d->dqb_itime = cpu_to_le64(1);
 }
 
-/* Remove given block from the list of blocks with free entries */
-static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
+static int v2_is_id(void *dp, struct dquot *dquot)
 {
-	dqbuf_t tmpbuf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
-	int err;
+	struct v2_disk_dqblk *d = dp;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
 
-	if (!tmpbuf)
-		return -ENOMEM;
-	if (nextblk) {
-		if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
-		if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	if (prevblk) {
-		if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
-		if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	else {
-		info->u.v2_i.dqi_free_entry = nextblk;
-		mark_info_dirty(sb, type);
-	}
-	freedqbuf(tmpbuf);
-	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
-	/* No matter whether write succeeds block is out of list */
-	if (write_blk(sb, type, blk, buf) < 0)
-		printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
-	return 0;
-out_buf:
-	freedqbuf(tmpbuf);
-	return err;
-}
-
-/* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	dqbuf_t tmpbuf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int err;
-
-	if (!tmpbuf)
-		return -ENOMEM;
-	dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry);
-	dh->dqdh_prev_free = cpu_to_le32(0);
-	if ((err = write_blk(sb, type, blk, buf)) < 0)
-		goto out_buf;
-	if (info->u.v2_i.dqi_free_entry) {
-		if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
-		if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	freedqbuf(tmpbuf);
-	info->u.v2_i.dqi_free_entry = blk;
-	mark_info_dirty(sb, type);
-	return 0;
-out_buf:
-	freedqbuf(tmpbuf);
-	return err;
-}
-
-/* Find space for dquot */
-static uint find_free_dqentry(struct dquot *dquot, int *err)
-{
-	struct super_block *sb = dquot->dq_sb;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
-	uint blk, i;
-	struct v2_disk_dqdbheader *dh;
-	struct v2_disk_dqblk *ddquot;
-	struct v2_disk_dqblk fakedquot;
-	dqbuf_t buf;
-
-	*err = 0;
-	if (!(buf = getdqbuf())) {
-		*err = -ENOMEM;
+	if (qtree_entry_unused(info, dp))
 		return 0;
-	}
-	dh = (struct v2_disk_dqdbheader *)buf;
-	ddquot = GETENTRIES(buf);
-	if (info->u.v2_i.dqi_free_entry) {
-		blk = info->u.v2_i.dqi_free_entry;
-		if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0)
-			goto out_buf;
-	}
-	else {
-		blk = get_free_dqblk(sb, dquot->dq_type);
-		if ((int)blk < 0) {
-			*err = blk;
-			freedqbuf(buf);
-			return 0;
-		}
-		memset(buf, 0, V2_DQBLKSIZE);
-		/* This is enough as block is already zeroed and entry list is empty... */
-		info->u.v2_i.dqi_free_entry = blk;
-		mark_info_dirty(sb, dquot->dq_type);
-	}
-	if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK)	/* Block will be full? */
-		if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
-			printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
-			goto out_buf;
-		}
-	le16_add_cpu(&dh->dqdh_entries, 1);
-	memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-	/* Find free structure in block */
-	for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
-#ifdef __QUOTA_V2_PARANOIA
-	if (i == V2_DQSTRINBLK) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
-		*err = -EIO;
-		goto out_buf;
-	}
-#endif
-	if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
-		goto out_buf;
-	}
-	dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
-	freedqbuf(buf);
-	return blk;
-out_buf:
-	freedqbuf(buf);
-	return 0;
-}
-
-/* Insert reference to structure into the trie */
-static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth)
-{
-	struct super_block *sb = dquot->dq_sb;
-	dqbuf_t buf;
-	int ret = 0, newson = 0, newact = 0;
-	__le32 *ref;
-	uint newblk;
-
-	if (!(buf = getdqbuf()))
-		return -ENOMEM;
-	if (!*treeblk) {
-		ret = get_free_dqblk(sb, dquot->dq_type);
-		if (ret < 0)
-			goto out_buf;
-		*treeblk = ret;
-		memset(buf, 0, V2_DQBLKSIZE);
-		newact = 1;
-	}
-	else {
-		if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) {
-			printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
-			goto out_buf;
-		}
-	}
-	ref = (__le32 *)buf;
-	newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (!newblk)
-		newson = 1;
-	if (depth == V2_DQTREEDEPTH-1) {
-#ifdef __QUOTA_V2_PARANOIA
-		if (newblk) {
-			printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]));
-			ret = -EIO;
-			goto out_buf;
-		}
-#endif
-		newblk = find_free_dqentry(dquot, &ret);
-	}
-	else
-		ret = do_insert_tree(dquot, &newblk, depth+1);
-	if (newson && ret >= 0) {
-		ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
-		ret = write_blk(sb, dquot->dq_type, *treeblk, buf);
-	}
-	else if (newact && ret < 0)
-		put_free_dqblk(sb, dquot->dq_type, buf, *treeblk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
+	return le32_to_cpu(d->dqb_id) == dquot->dq_id;
 }
 
-/* Wrapper for inserting quota structure into tree */
-static inline int dq_insert_tree(struct dquot *dquot)
+static int v2_read_dquot(struct dquot *dquot)
 {
-	int tmp = V2_DQTREEOFF;
-	return do_insert_tree(dquot, &tmp, 0);
+	return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
 }
 
-/*
- *	We don't have to be afraid of deadlocks as we never have quotas on quota files...
- */
 static int v2_write_dquot(struct dquot *dquot)
 {
-	int type = dquot->dq_type;
-	ssize_t ret;
-	struct v2_disk_dqblk ddquot, empty;
-
-	/* dq_off is guarded by dqio_mutex */
-	if (!dquot->dq_off)
-		if ((ret = dq_insert_tree(dquot)) < 0) {
-			printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
-			return ret;
-		}
-	spin_lock(&dq_data_lock);
-	mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
-	/* Argh... We may need to write structure full of zeroes but that would be
-	 * treated as an empty place by the rest of the code. Format change would
-	 * be definitely cleaner but the problems probably are not worth it */
-	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-	if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-		ddquot.dqb_itime = cpu_to_le64(1);
-	spin_unlock(&dq_data_lock);
-	ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
-	      (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
-	if (ret != sizeof(struct v2_disk_dqblk)) {
-		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
-		if (ret >= 0)
-			ret = -ENOSPC;
-	}
-	else
-		ret = 0;
-	dqstats.writes++;
-
-	return ret;
+	return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
 }
 
-/* Free dquot entry in data block */
-static int free_dqentry(struct dquot *dquot, uint blk)
-{
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	struct v2_disk_dqdbheader *dh;
-	dqbuf_t buf = getdqbuf();
-	int ret = 0;
-
-	if (!buf)
-		return -ENOMEM;
-	if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) {
-		printk(KERN_ERR "VFS: Quota structure has offset to other "
-		  "block (%u) than it should (%u).\n", blk,
-		  (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS));
-		goto out_buf;
-	}
-	if ((ret = read_blk(sb, type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
-		goto out_buf;
-	}
-	dh = (struct v2_disk_dqdbheader *)buf;
-	le16_add_cpu(&dh->dqdh_entries, -1);
-	if (!le16_to_cpu(dh->dqdh_entries)) {	/* Block got free? */
-		if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 ||
-		    (ret = put_free_dqblk(sb, type, buf, blk)) < 0) {
-			printk(KERN_ERR "VFS: Can't move quota data block (%u) "
-			  "to free list.\n", blk);
-			goto out_buf;
-		}
-	}
-	else {
-		memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
-		  sizeof(struct v2_disk_dqblk));
-		if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
-			/* Insert will write block itself */
-			if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
-				printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
-				goto out_buf;
-			}
-		}
-		else
-			if ((ret = write_blk(sb, type, blk, buf)) < 0) {
-				printk(KERN_ERR "VFS: Can't write quota data "
-				  "block %u\n", blk);
-				goto out_buf;
-			}
-	}
-	dquot->dq_off = 0;	/* Quota is now unattached */
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Remove reference to dquot from tree */
-static int remove_tree(struct dquot *dquot, uint *blk, int depth)
-{
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	dqbuf_t buf = getdqbuf();
-	int ret = 0;
-	uint newblk;
-	__le32 *ref = (__le32 *)buf;
-	
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(sb, type, *blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
-		goto out_buf;
-	}
-	newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (depth == V2_DQTREEDEPTH-1) {
-		ret = free_dqentry(dquot, newblk);
-		newblk = 0;
-	}
-	else
-		ret = remove_tree(dquot, &newblk, depth+1);
-	if (ret >= 0 && !newblk) {
-		int i;
-		ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
-		for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++);	/* Block got empty? */
-		/* Don't put the root block into the free block list */
-		if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) {
-			put_free_dqblk(sb, type, buf, *blk);
-			*blk = 0;
-		}
-		else
-			if ((ret = write_blk(sb, type, *blk, buf)) < 0)
-				printk(KERN_ERR "VFS: Can't write quota tree "
-				  "block %u.\n", *blk);
-	}
-out_buf:
-	freedqbuf(buf);
-	return ret;	
-}
-
-/* Delete dquot from tree */
-static int v2_delete_dquot(struct dquot *dquot)
-{
-	uint tmp = V2_DQTREEOFF;
-
-	if (!dquot->dq_off)	/* Even not allocated? */
-		return 0;
-	return remove_tree(dquot, &tmp, 0);
-}
-
-/* Find entry in block */
-static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
-{
-	dqbuf_t buf = getdqbuf();
-	loff_t ret = 0;
-	int i;
-	struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
-
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
-		goto out_buf;
-	}
-	if (dquot->dq_id)
-		for (i = 0; i < V2_DQSTRINBLK &&
-		     le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
-	else {	/* ID 0 as a bit more complicated searching... */
-		struct v2_disk_dqblk fakedquot;
-
-		memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-		for (i = 0; i < V2_DQSTRINBLK; i++)
-			if (!le32_to_cpu(ddquot[i].dqb_id) &&
-			    memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
-				break;
-	}
-	if (i == V2_DQSTRINBLK) {
-		printk(KERN_ERR "VFS: Quota for id %u referenced "
-		  "but not present.\n", dquot->dq_id);
-		ret = -EIO;
-		goto out_buf;
-	}
-	else
-		ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
-		  v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Find entry for given id in the tree */
-static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth)
-{
-	dqbuf_t buf = getdqbuf();
-	loff_t ret = 0;
-	__le32 *ref = (__le32 *)buf;
-
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
-		goto out_buf;
-	}
-	ret = 0;
-	blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (!blk)	/* No reference? */
-		goto out_buf;
-	if (depth < V2_DQTREEDEPTH-1)
-		ret = find_tree_dqentry(dquot, blk, depth+1);
-	else
-		ret = find_block_dqentry(dquot, blk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Find entry for given id in the tree - wrapper function */
-static inline loff_t find_dqentry(struct dquot *dquot)
-{
-	return find_tree_dqentry(dquot, V2_DQTREEOFF, 0);
-}
-
-static int v2_read_dquot(struct dquot *dquot)
+static int v2_release_dquot(struct dquot *dquot)
 {
-	int type = dquot->dq_type;
-	loff_t offset;
-	struct v2_disk_dqblk ddquot, empty;
-	int ret = 0;
-
-#ifdef __QUOTA_V2_PARANOIA
-	/* Invalidated quota? */
-	if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) {
-		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
-		return -EIO;
-	}
-#endif
-	offset = find_dqentry(dquot);
-	if (offset <= 0) {	/* Entry not present? */
-		if (offset < 0)
-			printk(KERN_ERR "VFS: Can't read quota "
-			  "structure for id %u.\n", dquot->dq_id);
-		dquot->dq_off = 0;
-		set_bit(DQ_FAKE_B, &dquot->dq_flags);
-		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
-		ret = offset;
-	}
-	else {
-		dquot->dq_off = offset;
-		if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
-		    (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
-		    != sizeof(struct v2_disk_dqblk)) {
-			if (ret >= 0)
-				ret = -EIO;
-			printk(KERN_ERR "VFS: Error while reading quota "
-			  "structure for id %u.\n", dquot->dq_id);
-			memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
-		}
-		else {
-			ret = 0;
-			/* We need to escape back all-zero structure */
-			memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-			empty.dqb_itime = cpu_to_le64(1);
-			if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-				ddquot.dqb_itime = 0;
-		}
-		disk2memdqb(&dquot->dq_dqb, &ddquot);
-		if (!dquot->dq_dqb.dqb_bhardlimit &&
-			!dquot->dq_dqb.dqb_bsoftlimit &&
-			!dquot->dq_dqb.dqb_ihardlimit &&
-			!dquot->dq_dqb.dqb_isoftlimit)
-			set_bit(DQ_FAKE_B, &dquot->dq_flags);
-	}
-	dqstats.reads++;
-
-	return ret;
+	return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
 }
 
-/* Check whether dquot should not be deleted. We know we are
- * the only one operating on dquot (thanks to dq_lock) */
-static int v2_release_dquot(struct dquot *dquot)
+static int v2_free_file_info(struct super_block *sb, int type)
 {
-	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
-		return v2_delete_dquot(dquot);
+	kfree(sb_dqinfo(sb, type)->dqi_priv);
 	return 0;
 }
 
@@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = {
 	.check_quota_file	= v2_check_quota_file,
 	.read_file_info		= v2_read_file_info,
 	.write_file_info	= v2_write_file_info,
-	.free_file_info		= NULL,
+	.free_file_info		= v2_free_file_info,
 	.read_dqblk		= v2_read_dquot,
 	.commit_dqblk		= v2_write_dquot,
 	.release_dqblk		= v2_release_dquot,
diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h
new file mode 100644
index 000000000000..746654b5de70
--- /dev/null
+++ b/fs/quotaio_v1.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_QUOTAIO_V1_H
+#define _LINUX_QUOTAIO_V1_H
+
+#include <linux/types.h>
+
+/*
+ * The following constants define the amount of time given a user
+ * before the soft limits are treated as hard limits (usually resulting
+ * in an allocation failure). The timer is started when the user crosses
+ * their soft limit, it is reset when they go below their soft limit.
+ */
+#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
+#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
+
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is an array of these structures
+ * indexed by user or group number.
+ */
+struct v1_disk_dqblk {
+	__u32 dqb_bhardlimit;	/* absolute limit on disk blks alloc */
+	__u32 dqb_bsoftlimit;	/* preferred limit on disk blks */
+	__u32 dqb_curblocks;	/* current block count */
+	__u32 dqb_ihardlimit;	/* absolute limit on allocated inodes */
+	__u32 dqb_isoftlimit;	/* preferred inode limit */
+	__u32 dqb_curinodes;	/* current # allocated inodes */
+	time_t dqb_btime;	/* time limit for excessive disk use */
+	time_t dqb_itime;	/* time limit for excessive inode use */
+};
+
+#define v1_dqoff(UID)      ((loff_t)((UID) * sizeof (struct v1_disk_dqblk)))
+
+#endif	/* _LINUX_QUOTAIO_V1_H */
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h
new file mode 100644
index 000000000000..530fe580685c
--- /dev/null
+++ b/fs/quotaio_v2.h
@@ -0,0 +1,60 @@
+/*
+ *	Definitions of structures for vfsv0 quota format
+ */
+
+#ifndef _LINUX_QUOTAIO_V2_H
+#define _LINUX_QUOTAIO_V2_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ * Definitions of magics and versions of current quota files
+ */
+#define V2_INITQMAGICS {\
+	0xd9c01f11,	/* USRQUOTA */\
+	0xd9c01927	/* GRPQUOTA */\
+}
+
+#define V2_INITQVERSIONS {\
+	0,		/* USRQUOTA */\
+	0		/* GRPQUOTA */\
+}
+
+/* First generic header */
+struct v2_disk_dqheader {
+	__le32 dqh_magic;	/* Magic number identifying file */
+	__le32 dqh_version;	/* File version */
+};
+
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is a radix tree whose leaves point
+ * to blocks of these structures.
+ */
+struct v2_disk_dqblk {
+	__le32 dqb_id;		/* id this quota applies to */
+	__le32 dqb_ihardlimit;	/* absolute limit on allocated inodes */
+	__le32 dqb_isoftlimit;	/* preferred inode limit */
+	__le32 dqb_curinodes;	/* current # allocated inodes */
+	__le32 dqb_bhardlimit;	/* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+	__le32 dqb_bsoftlimit;	/* preferred limit on disk space (in QUOTABLOCK_SIZE) */
+	__le64 dqb_curspace;	/* current space occupied (in bytes) */
+	__le64 dqb_btime;	/* time limit for excessive disk use */
+	__le64 dqb_itime;	/* time limit for excessive inode use */
+};
+
+/* Header with type and version specific information */
+struct v2_disk_dqinfo {
+	__le32 dqi_bgrace;	/* Time before block soft limit becomes hard limit */
+	__le32 dqi_igrace;	/* Time before inode soft limit becomes hard limit */
+	__le32 dqi_flags;	/* Flags for quotafile (DQF_*) */
+	__le32 dqi_blocks;	/* Number of blocks in file */
+	__le32 dqi_free_blk;	/* Number of first free block in the list */
+	__le32 dqi_free_entry;	/* Number of block with at least one free entry */
+};
+
+#define V2_DQINFOOFF	sizeof(struct v2_disk_dqheader)	/* Offset of info header in file */
+#define V2_DQBLKSIZE_BITS 10				/* Size of leaf block in tree */
+
+#endif /* _LINUX_QUOTAIO_V2_H */
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a83a3518ae33..b7e6ac706b87 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9c020b..5cc6924eb158 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 		offset += inode->i_size;
 		break;
 	case SEEK_CUR:
+		/*
+		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
+		 * position-querying operation.  Avoid rewriting the "same"
+		 * f_pos value back to the file because a concurrent read(),
+		 * write() or lseek() might have altered it
+		 */
+		if (offset == 0)
+			return file->f_pos;
 		offset += file->f_pos;
 		break;
 	}
@@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 			offset += i_size_read(file->f_path.dentry->d_inode);
 			break;
 		case SEEK_CUR:
+			if (offset == 0) {
+				retval = file->f_pos;
+				goto out;
+			}
 			offset += file->f_pos;
 	}
 	retval = -EINVAL;
@@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 		}
 		retval = offset;
 	}
+out:
 	unlock_kernel();
 	return retval;
 }
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ed04f47007f8..55fce92cdf18 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1782,6 +1782,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		goto out_bad_inode;
 	}
 	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+	if (old_format_only(sb))
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+	else
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
 	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
 	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
 	if (insert_inode_locked4(inode, args.objectid,
@@ -1834,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	reiserfs_init_acl_default(inode);
 	reiserfs_init_xattr_rwsem(inode);
 
-	if (old_format_only(sb))
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
-	else
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
-
 	/* key to search for correct place for new stat data */
 	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
 		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 663a91f5dce8..c55651f1407c 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -649,6 +649,8 @@ static struct dquot_operations reiserfs_quota_operations = {
 	.release_dquot = reiserfs_release_dquot,
 	.mark_dirty = reiserfs_mark_dquot_dirty,
 	.write_info = reiserfs_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 
 static struct quotactl_ops reiserfs_qctl_operations = {
@@ -994,8 +996,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 		if (c == 'u' || c == 'g') {
 			int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
 
-			if ((sb_any_quota_enabled(s) ||
-			     sb_any_quota_suspended(s)) &&
+			if (sb_any_quota_loaded(s) &&
 			    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
 				reiserfs_warning(s,
 						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1041,8 +1042,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 						 "reiserfs_parse_options: unknown quota format specified.");
 				return 0;
 			}
-			if ((sb_any_quota_enabled(s) ||
-			     sb_any_quota_suspended(s)) &&
+			if (sb_any_quota_loaded(s) &&
 			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
 				reiserfs_warning(s,
 						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1067,7 +1067,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 	}
 	/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
 	if (!(*mount_options & (1 << REISERFS_QUOTA))
-	    && sb_any_quota_enabled(s)) {
+	    && sb_any_quota_loaded(s)) {
 		reiserfs_warning(s,
 				 "reiserfs_parse_options: quota options must be present when quota is turned on.");
 		return 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 60d2f822e87b..98a232f7196b 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] =
 static struct inode *
 romfs_iget(struct super_block *sb, unsigned long ino)
 {
-	int nextfh;
+	int nextfh, ret;
 	struct romfs_inode ri;
 	struct inode *i;
 
@@ -524,14 +524,13 @@ romfs_iget(struct super_block *sb, unsigned long ino)
 	i->i_size = be32_to_cpu(ri.size);
 	i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
 	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
-	i->i_uid = i->i_gid = 0;
 
         /* Precalculate the data offset */
-        ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN);
-        if (ino >= 0)
-                ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK);
-        else
-                ino = 0;
+	ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
+	if (ret >= 0)
+		ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
+	else
+		ino = 0;
 
         ROMFS_I(i)->i_metasize = ino;
         ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
diff --git a/fs/select.c b/fs/select.c
index 87df51eadcf2..08b91beed806 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 void poll_initwait(struct poll_wqueues *pwq)
 {
 	init_poll_funcptr(&pwq->pt, __pollwait);
+	pwq->polling_task = current;
 	pwq->error = 0;
 	pwq->table = NULL;
 	pwq->inline_index = 0;
 }
-
 EXPORT_SYMBOL(poll_initwait);
 
 static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
 		free_page((unsigned long) old);
 	}
 }
-
 EXPORT_SYMBOL(poll_freewait);
 
-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
 {
-	struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
 	struct poll_table_page *table = p->table;
 
 	if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
 		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
 		if (!new_table) {
 			p->error = -ENOMEM;
-			__set_current_state(TASK_RUNNING);
 			return NULL;
 		}
 		new_table->entry = new_table->entries;
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
 	return table->entry++;
 }
 
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_wqueues *pwq = wait->private;
+	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+	/*
+	 * Although this function is called under waitqueue lock, LOCK
+	 * doesn't imply write barrier and the users expect write
+	 * barrier semantics on wakeup functions.  The following
+	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+	 * and is paired with set_mb() in poll_schedule_timeout.
+	 */
+	smp_wmb();
+	pwq->triggered = 1;
+
+	/*
+	 * Perform the default wake up operation using a dummy
+	 * waitqueue.
+	 *
+	 * TODO: This is hacky but there currently is no interface to
+	 * pass in @sync.  @sync is scheduled to be removed and once
+	 * that happens, wake_up_process() can be used directly.
+	 */
+	return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
 /* Add a new entry */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 				poll_table *p)
 {
-	struct poll_table_entry *entry = poll_get_entry(p);
+	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+	struct poll_table_entry *entry = poll_get_entry(pwq);
 	if (!entry)
 		return;
 	get_file(filp);
 	entry->filp = filp;
 	entry->wait_address = wait_address;
-	init_waitqueue_entry(&entry->wait, current);
+	init_waitqueue_func_entry(&entry->wait, pollwake);
+	entry->wait.private = pwq;
 	add_wait_queue(wait_address, &entry->wait);
 }
 
+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+			  ktime_t *expires, unsigned long slack)
+{
+	int rc = -EINTR;
+
+	set_current_state(state);
+	if (!pwq->triggered)
+		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+	__set_current_state(TASK_RUNNING);
+
+	/*
+	 * Prepare for the next iteration.
+	 *
+	 * The following set_mb() serves two purposes.  First, it's
+	 * the counterpart rmb of the wmb in pollwake() such that data
+	 * written before wake up is always visible after wake up.
+	 * Second, the full barrier guarantees that triggered clearing
+	 * doesn't pass event check of the next iteration.  Note that
+	 * this problem doesn't exist for the first iteration as
+	 * add_wait_queue() has full barrier semantics.
+	 */
+	set_mb(pwq->triggered, 0);
+
+	return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
+
 /**
  * poll_select_set_timeout - helper function to setup the timeout value
  * @to:		pointer to timespec variable for the final timeout
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
 
-		set_current_state(TASK_INTERRUPTIBLE);
-
 		inp = fds->in; outp = fds->out; exp = fds->ex;
 		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
 
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+					   to, slack))
 			timed_out = 1;
 	}
-	__set_current_state(TASK_RUNNING);
 
 	poll_freewait(&table);
 
@@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 	for (;;) {
 		struct poll_list *walk;
 
-		set_current_state(TASK_INTERRUPTIBLE);
 		for (walk = list; walk != NULL; walk = walk->next) {
 			struct pollfd * pfd, * pfd_end;
 
@@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
 			timed_out = 1;
 	}
-	__set_current_state(TASK_RUNNING);
 	return count;
 }
 
diff --git a/fs/splice.c b/fs/splice.c
index 1abab5cee4ba..a54b3e3f10a7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/splice.h>
+#include <linux/memcontrol.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
diff --git a/fs/stat.c b/fs/stat.c
index 7c46fbeb8b76..7e12a6f82795 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -305,7 +305,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
 		struct inode *inode = path.dentry->d_inode;
 
 		error = -EINVAL;
-		if (inode->i_op && inode->i_op->readlink) {
+		if (inode->i_op->readlink) {
 			error = security_inode_readlink(path.dentry);
 			if (!error) {
 				touch_atime(path.mnt, path.dentry);
diff --git a/fs/super.c b/fs/super.c
index ddba069d7a99..ed080c417167 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,6 +38,7 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
+#include <linux/async.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
 		INIT_LIST_HEAD(&s->s_dentry_lru);
+		INIT_LIST_HEAD(&s->s_async_list);
 		init_rwsem(&s->s_umount);
 		mutex_init(&s->s_lock);
 		lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb)
 {
 	const struct super_operations *sop = sb->s_op;
 
+
 	if (sb->s_root) {
 		shrink_dcache_for_umount(sb);
 		fsync_super(sb);
 		lock_super(sb);
 		sb->s_flags &= ~MS_ACTIVE;
+
+		/*
+		 * wait for asynchronous fs operations to finish before going further
+		 */
+		async_synchronize_full_special(&sb->s_async_list);
+
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
 		lock_kernel();
@@ -461,6 +470,7 @@ restart:
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
+		async_synchronize_full_special(&sb->s_async_list);
 		if (sb->s_root && (wait || sb->s_dirt))
 			sb->s_op->sync_fs(sb, wait);
 		up_read(&sb->s_umount);
@@ -800,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
 		}
 
 		s->s_flags |= MS_ACTIVE;
+		bdev->bd_super = s;
 	}
 
 	return simple_set_mnt(mnt, s);
@@ -819,6 +830,7 @@ void kill_block_super(struct super_block *sb)
 	struct block_device *bdev = sb->s_bdev;
 	fmode_t mode = sb->s_mode;
 
+	bdev->bd_super = 0;
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
 	close_bdev_exclusive(bdev, mode);
diff --git a/fs/sync.c b/fs/sync.c
index 2967562d416f..ac02b56548bc 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 	return ret;
 }
 
-long do_fsync(struct file *file, int datasync)
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:		file to sync
+ * @dentry:		dentry of @file
+ * @data:		only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk.  If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set.  This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-	int ret;
-	int err;
-	struct address_space *mapping = file->f_mapping;
+	const struct file_operations *fop;
+	struct address_space *mapping;
+	int err, ret;
+
+	/*
+	 * Get mapping and operations from the file in case we have
+	 * as file, or get the default values for them in case we
+	 * don't have a struct file available.  Damn nfsd..
+	 */
+	if (file) {
+		mapping = file->f_mapping;
+		fop = file->f_op;
+	} else {
+		mapping = dentry->d_inode->i_mapping;
+		fop = dentry->d_inode->i_fop;
+	}
 
-	if (!file->f_op || !file->f_op->fsync) {
-		/* Why?  We can still call filemap_fdatawrite */
+	if (!fop || !fop->fsync) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync)
 	 * livelocks in fsync_buffers_list().
 	 */
 	mutex_lock(&mapping->host->i_mutex);
-	err = file->f_op->fsync(file, file->f_path.dentry, datasync);
+	err = fop->fsync(file, dentry, datasync);
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
@@ -104,15 +129,16 @@ long do_fsync(struct file *file, int datasync)
 out:
 	return ret;
 }
+EXPORT_SYMBOL(vfs_fsync);
 
-static long __do_fsync(unsigned int fd, int datasync)
+static int do_fsync(unsigned int fd, int datasync)
 {
 	struct file *file;
 	int ret = -EBADF;
 
 	file = fget(fd);
 	if (file) {
-		ret = do_fsync(file, datasync);
+		ret = vfs_fsync(file, file->f_path.dentry, datasync);
 		fput(file);
 	}
 	return ret;
@@ -120,12 +146,12 @@ static long __do_fsync(unsigned int fd, int datasync)
 
 asmlinkage long sys_fsync(unsigned int fd)
 {
-	return __do_fsync(fd, 0);
+	return do_fsync(fd, 0);
 }
 
 asmlinkage long sys_fdatasync(unsigned int fd)
 {
-	return __do_fsync(fd, 1);
+	return do_fsync(fd, 1);
 }
 
 /*
@@ -269,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
 
 	if (flags & SYNC_FILE_RANGE_WRITE) {
 		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
-						WB_SYNC_NONE);
+						WB_SYNC_ALL);
 		if (ret < 0)
 			goto out;
 	}
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index eb53c632f856..dfa3d94cfc74 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
 {
 	inode->i_mode = mode;
-	inode->i_uid = 0;
-	inode->i_gid = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 }
 
@@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
 	struct bin_attribute *bin_attr;
 
-	inode->i_blocks = 0;
 	inode->i_mapping->a_ops = &sysfs_aops;
 	inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
 	inode->i_op = &sysfs_inode_operations;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 91ceeda7e5bf..e35b54d5059d 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB
 	depends on UBIFS_FS
 	default y
 	help
-	  Zlib copresses better then LZO but it is slower. Say 'Y' if unsure.
+	  Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
 
 # Debugging-related stuff
 config UBIFS_FS_DEBUG
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 0e5e54d82924..175f9c590b77 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -142,7 +142,7 @@ static long long get_liability(struct ubifs_info *c)
  *
  * This function is called when an operation cannot be budgeted because there
  * is supposedly no free space. But in most cases there is some free space:
- *   o budgeting is pessimistic, so it always budgets more then it is actually
+ *   o budgeting is pessimistic, so it always budgets more than it is actually
  *     needed, so shrinking the liability is one way to make free space - the
  *     cached data will take less space then it was budgeted for;
  *   o GC may turn some dark space into free space (budgeting treats dark space
@@ -606,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
  * @c: UBIFS file-system description object
  *
  * This function converts budget which was allocated for a new page of data to
- * the budget of changing an existing page of data. The latter is smaller then
+ * the budget of changing an existing page of data. The latter is smaller than
  * the former, so this function only does simple re-calculation and does not
  * involve any write-back.
  */
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 0bef6501d58a..9832f9abe28e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -45,7 +45,7 @@
 #define SMALL_NODE_WM  UBIFS_MAX_DENT_NODE_SZ
 
 /*
- * GC may need to move more then one LEB to make progress. The below constants
+ * GC may need to move more than one LEB to make progress. The below constants
  * define "soft" and "hard" limits on the number of LEBs the garbage collector
  * may move.
  */
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 10ae25b7d1db..9b7c54e0cd2a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -191,7 +191,7 @@ again:
 	if (wbuf->lnum != -1 && avail >= len) {
 		/*
 		 * Someone else has switched the journal head and we have
-		 * enough space now. This happens when more then one process is
+		 * enough space now. This happens when more than one process is
 		 * trying to write to the same journal head at the same time.
 		 */
 		dbg_jnl("return LEB %d back, already have LEB %d:%d",
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index f248533841a2..e7bab52a1410 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
  * @contention: if any contention, this is set to %1
  *
  * This function walks the list of mounted UBIFS file-systems and frees clean
- * znodes which are older then @age, until at least @nr znodes are freed.
+ * znodes which are older than @age, until at least @nr znodes are freed.
  * Returns the number of freed znodes.
  */
 static int shrink_tnc_trees(int nr, int age, int *contention)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 0d7564b95f8e..89556ee72518 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -432,12 +432,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	int i, err;
 	struct ubifs_info *c = sb->s_fs_info;
 	struct writeback_control wbc = {
-		.sync_mode   = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+		.sync_mode   = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
 		.range_start = 0,
 		.range_end   = LLONG_MAX,
 		.nr_to_write = LONG_MAX,
 	};
 
+	/*
+	 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an
+	 * advisory thing to help the file system shove lots of data into the
+	 * queues. If some gets missed then it'll be picked up on the second
+	 * '->sync_fs()' call, with non-zero @wait.
+	 */
+
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
diff --git a/fs/xattr.c b/fs/xattr.c
index 468377e66531..237804cd6b56 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size)
 	if (error)
 		return error;
 	error = -EOPNOTSUPP;
-	if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
+	if (d->d_inode->i_op->listxattr) {
 		error = d->d_inode->i_op->listxattr(d, list, size);
 	} else {
 		error = security_inode_listsecurity(d->d_inode, list, size);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 36f6cc703ef2..be846d606ae8 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1348,7 +1348,7 @@ xfs_finish_flags(
 {
 	int			ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
 
-	/* Fail a mount where the logbuf is smaller then the log stripe */
+	/* Fail a mount where the logbuf is smaller than the log stripe */
 	if (xfs_sb_version_haslogv2(&mp->m_sb)) {
 		if (mp->m_logbsize <= 0 &&
 		    mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {