151 files changed, 5407 insertions, 5005 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 312393f32948..db5dc1598716 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -233,9 +233,13 @@ if NETWORK_FILESYSTEMS
 source "fs/nfs/Kconfig"
 source "fs/nfsd/Kconfig"
 
+config GRACE_PERIOD
+	tristate
+
 config LOCKD
 	tristate
 	depends on FILE_LOCKING
+	select GRACE_PERIOD
 
 config LOCKD_V4
 	bool
@@ -249,7 +253,7 @@ config NFS_ACL_SUPPORT
 
 config NFS_COMMON
 	bool
-	depends on NFSD || NFS_FS
+	depends on NFSD || NFS_FS || LOCKD
 	default y
 
 source "net/sunrpc/Kconfig"
diff --git a/fs/aio.c b/fs/aio.c
index 733750096b71..84a751005f5b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -661,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
 		goto err;
 
-	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
 		goto err;
 
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6d7274619bf9..e2f3ad0879ce 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -304,6 +304,12 @@ static int blkdev_readpage(struct file * file, struct page * page)
 	return block_read_full_page(page, blkdev_get_block);
 }
 
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+}
+
 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -1622,6 +1628,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
+	.readpages	= blkdev_readpages,
 	.writepage	= blkdev_writepage,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 43527fd78825..56b8522d5767 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -234,8 +234,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	    BTRFS_I(inode)->last_sub_trans <=
 	    BTRFS_I(inode)->last_log_commit &&
 	    BTRFS_I(inode)->last_sub_trans <=
-	    BTRFS_I(inode)->root->last_log_commit)
-		return 1;
+	    BTRFS_I(inode)->root->last_log_commit) {
+		/*
+		 * After a ranged fsync we might have left some extent maps
+		 * (that fall outside the fsync's range). So return false
+		 * here if the list isn't empty, to make sure btrfs_log_inode()
+		 * will be called and process those extent maps.
+		 */
+		smp_mb();
+		if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+			return 1;
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a1d36e62179c..d0d78dc07792 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1183,7 +1183,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
 	if (!writers)
 		return ERR_PTR(-ENOMEM);
 
-	ret = percpu_counter_init(&writers->counter, 0);
+	ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
 	if (ret < 0) {
 		kfree(writers);
 		return ERR_PTR(ret);
@@ -2188,7 +2188,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_srcu;
 	}
 
-	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_bdi;
@@ -2196,13 +2196,13 @@ int open_ctree(struct super_block *sb,
 	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
 					(1 + ilog2(nr_cpu_ids));
 
-	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_dirty_metadata_bytes;
 	}
 
-	ret = percpu_counter_init(&fs_info->bio_counter, 0);
+	ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_delalloc_bytes;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3efe1c3877bf..caaf015d6e4b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3494,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
-	ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+	ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
 	if (ret) {
 		kfree(found);
 		return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d296efe2d3e7..d0262ceb85e1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3994,7 +3994,8 @@ again:
 		if (ret < 0) {
 			err = ret;
 			goto out_unlock;
-		} if (ret) {
+		}
+		if (ret) {
 			ins_nr = 0;
 			btrfs_release_path(path);
 			continue;
@@ -4093,18 +4094,8 @@ log_extents:
 		}
 	}
 
-	write_lock(&em_tree->lock);
-	/*
-	 * If we're doing a ranged fsync and there are still modified extents
-	 * in the list, we must run on the next fsync call as it might cover
-	 * those extents (a full fsync or an fsync for other range).
-	 */
-	if (list_empty(&em_tree->modified_extents)) {
-		BTRFS_I(inode)->logged_trans = trans->transid;
-		BTRFS_I(inode)->last_log_commit =
-			BTRFS_I(inode)->last_sub_trans;
-	}
-	write_unlock(&em_tree->lock);
+	BTRFS_I(inode)->logged_trans = trans->transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
 	if (unlikely(err))
 		btrfs_put_logged_extents(&logged_list);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 340a92d08e84..2c2d6d1d8eee 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -529,12 +529,12 @@ static noinline int device_list_add(const char *path,
 		 */
 
 		/*
-		 * As of now don't allow update to btrfs_fs_device through
-		 * the btrfs dev scan cli, after FS has been mounted.
+		 * For now, we do allow update to btrfs_fs_device through the
+		 * btrfs dev scan cli after FS has been mounted.  We're still
+		 * tracking a problem where systems fail mount by subvolume id
+		 * when we reject replacement on a mounted FS.
 		 */
-		if (fs_devices->opened) {
-			return -EBUSY;
-		} else {
+		if (!fs_devices->opened && found_transid < device->generation) {
 			/*
 			 * That is if the FS is _not_ mounted and if you
 			 * are here, that means there is more than one
@@ -542,8 +542,7 @@ static noinline int device_list_add(const char *path,
 			 * with larger generation number or the last-in if
 			 * generation are equal.
 			 */
-			if (found_transid < device->generation)
-				return -EEXIST;
+			return -EEXIST;
 		}
 
 		name = rcu_string_strdup(path, GFP_NOFS);
diff --git a/fs/buffer.c b/fs/buffer.c
index 8f05111bbb8b..44c14a87750e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1022,7 +1022,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 		bh = page_buffers(page);
 		if (bh->b_size == size) {
 			end_block = init_page_buffers(page, bdev,
-						index << sizebits, size);
+						(sector_t)index << sizebits,
+						size);
 			goto done;
 		}
 		if (!try_to_free_buffers(page))
@@ -1043,7 +1044,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	 */
 	spin_lock(&inode->i_mapping->private_lock);
 	link_dev_buffers(page, bh);
-	end_block = init_page_buffers(page, bdev, index << sizebits, size);
+	end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
+			size);
 	spin_unlock(&inode->i_mapping->private_lock);
 done:
 	ret = (block < end_block) ? 1 : -ENXIO;
@@ -1251,7 +1253,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
  * a local interrupt disable for that.
  */
 
-#define BH_LRU_SIZE	8
+#define BH_LRU_SIZE	16
 
 struct bh_lru {
 	struct buffer_head *bhs[BH_LRU_SIZE];
@@ -2954,7 +2956,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 
 /*
  * This allows us to do IO even on the odd last sectors
- * of a device, even if the bh block size is some multiple
+ * of a device, even if the block size is some multiple
  * of the physical sector size.
  *
  * We'll just truncate the bio to the size of the device,
@@ -2964,10 +2966,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
  * errors, this only handles the "we need to be able to
  * do IO at the final sector" case.
  */
-static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+void guard_bio_eod(int rw, struct bio *bio)
 {
 	sector_t maxsector;
-	unsigned bytes;
+	struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
+	unsigned truncated_bytes;
 
 	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
 	if (!maxsector)
@@ -2982,23 +2985,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
 		return;
 
 	maxsector -= bio->bi_iter.bi_sector;
-	bytes = bio->bi_iter.bi_size;
-	if (likely((bytes >> 9) <= maxsector))
+	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
 		return;
 
-	/* Uhhuh. We've got a bh that straddles the device size! */
-	bytes = maxsector << 9;
+	/* Uhhuh. We've got a bio that straddles the device size! */
+	truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
 
 	/* Truncate the bio.. */
-	bio->bi_iter.bi_size = bytes;
-	bio->bi_io_vec[0].bv_len = bytes;
+	bio->bi_iter.bi_size -= truncated_bytes;
+	bvec->bv_len -= truncated_bytes;
 
 	/* ..and clear the end of the buffer for reads */
 	if ((rw & RW_MASK) == READ) {
-		void *kaddr = kmap_atomic(bh->b_page);
-		memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
-		kunmap_atomic(kaddr);
-		flush_dcache_page(bh->b_page);
+		zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+				truncated_bytes);
 	}
 }
 
@@ -3039,7 +3039,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	bio->bi_flags |= bio_flags;
 
 	/* Take care of bh's that straddle the end of the device */
-	guard_bh_eod(rw, bio, bh);
+	guard_bio_eod(rw, bio);
 
 	if (buffer_meta(bh))
 		rw |= REQ_META;
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index d749731dc0ee..fbb08e97438d 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
 	       cache->brun_percent  < 100);
 
 	if (*args) {
-		pr_err("'bind' command doesn't take an argument");
+		pr_err("'bind' command doesn't take an argument\n");
 		return -EINVAL;
 	}
 
 	if (!cache->rootdirname) {
-		pr_err("No cache directory specified");
+		pr_err("No cache directory specified\n");
 		return -EINVAL;
 	}
 
 	/* don't permit already bound caches to be re-bound */
 	if (test_bit(CACHEFILES_READY, &cache->flags)) {
-		pr_err("Cache already bound");
+		pr_err("Cache already bound\n");
 		return -EBUSY;
 	}
 
@@ -248,7 +248,7 @@ error_open_root:
 	kmem_cache_free(cachefiles_object_jar, fsdef);
 error_root_object:
 	cachefiles_end_secure(cache, saved_cred);
-	pr_err("Failed to register: %d", ret);
+	pr_err("Failed to register: %d\n", ret);
 	return ret;
 }
 
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index b078d3081d6c..ce1b115dcc28 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -315,7 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file,
 static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
 					 char *args)
 {
-	pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%");
+	pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n");
 
 	return -EINVAL;
 }
@@ -475,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
 	_enter(",%s", args);
 
 	if (!*args) {
-		pr_err("Empty directory specified");
+		pr_err("Empty directory specified\n");
 		return -EINVAL;
 	}
 
 	if (cache->rootdirname) {
-		pr_err("Second cache directory specified");
+		pr_err("Second cache directory specified\n");
 		return -EEXIST;
 	}
 
@@ -503,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
 	_enter(",%s", args);
 
 	if (!*args) {
-		pr_err("Empty security context specified");
+		pr_err("Empty security context specified\n");
 		return -EINVAL;
 	}
 
 	if (cache->secctx) {
-		pr_err("Second security context specified");
+		pr_err("Second security context specified\n");
 		return -EINVAL;
 	}
 
@@ -531,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
 	_enter(",%s", args);
 
 	if (!*args) {
-		pr_err("Empty tag specified");
+		pr_err("Empty tag specified\n");
 		return -EINVAL;
 	}
 
@@ -562,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
 		goto inval;
 
 	if (!test_bit(CACHEFILES_READY, &cache->flags)) {
-		pr_err("cull applied to unready cache");
+		pr_err("cull applied to unready cache\n");
 		return -EIO;
 	}
 
 	if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
-		pr_err("cull applied to dead cache");
+		pr_err("cull applied to dead cache\n");
 		return -EIO;
 	}
 
@@ -587,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
 
 notdir:
 	path_put(&path);
-	pr_err("cull command requires dirfd to be a directory");
+	pr_err("cull command requires dirfd to be a directory\n");
 	return -ENOTDIR;
 
 inval:
-	pr_err("cull command requires dirfd and filename");
+	pr_err("cull command requires dirfd and filename\n");
 	return -EINVAL;
 }
 
@@ -614,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
 	return 0;
 
 inval:
-	pr_err("debug command requires mask");
+	pr_err("debug command requires mask\n");
 	return -EINVAL;
 }
 
@@ -634,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
 		goto inval;
 
 	if (!test_bit(CACHEFILES_READY, &cache->flags)) {
-		pr_err("inuse applied to unready cache");
+		pr_err("inuse applied to unready cache\n");
 		return -EIO;
 	}
 
 	if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
-		pr_err("inuse applied to dead cache");
+		pr_err("inuse applied to dead cache\n");
 		return -EIO;
 	}
 
@@ -659,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
 
 notdir:
 	path_put(&path);
-	pr_err("inuse command requires dirfd to be a directory");
+	pr_err("inuse command requires dirfd to be a directory\n");
 	return -ENOTDIR;
 
 inval:
-	pr_err("inuse command requires dirfd and filename");
+	pr_err("inuse command requires dirfd and filename\n");
 	return -EINVAL;
 }
 
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 3d50998abf57..8c52472d2efa 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -255,7 +255,7 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
 
 #define cachefiles_io_error(___cache, FMT, ...)		\
 do {							\
-	pr_err("I/O Error: " FMT, ##__VA_ARGS__);	\
+	pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__);	\
 	fscache_io_error(&(___cache)->cache);		\
 	set_bit(CACHEFILES_DEAD, &(___cache)->flags);	\
 } while (0)
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 180edfb45f66..711f13d8c2de 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -84,7 +84,7 @@ error_proc:
 error_object_jar:
 	misc_deregister(&cachefiles_dev);
 error_dev:
-	pr_err("failed to register: %d", ret);
+	pr_err("failed to register: %d\n", ret);
 	return ret;
 }
 
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 5bf2b41e66d3..dad7d9542a24 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -543,7 +543,7 @@ lookup_again:
 			       next, next->d_inode, next->d_inode->i_ino);
 
 		} else if (!S_ISDIR(next->d_inode->i_mode)) {
-			pr_err("inode %lu is not a directory",
+			pr_err("inode %lu is not a directory\n",
 			       next->d_inode->i_ino);
 			ret = -ENOBUFS;
 			goto error;
@@ -574,7 +574,7 @@ lookup_again:
 		} else if (!S_ISDIR(next->d_inode->i_mode) &&
 			   !S_ISREG(next->d_inode->i_mode)
 			   ) {
-			pr_err("inode %lu is not a file or directory",
+			pr_err("inode %lu is not a file or directory\n",
 			       next->d_inode->i_ino);
 			ret = -ENOBUFS;
 			goto error;
@@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 	ASSERT(subdir->d_inode);
 
 	if (!S_ISDIR(subdir->d_inode->i_mode)) {
-		pr_err("%s is not a directory", dirname);
+		pr_err("%s is not a directory\n", dirname);
 		ret = -EIO;
 		goto check_error;
 	}
@@ -779,7 +779,8 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 	    !subdir->d_inode->i_op->lookup ||
 	    !subdir->d_inode->i_op->mkdir ||
 	    !subdir->d_inode->i_op->create ||
-	    !subdir->d_inode->i_op->rename ||
+	    (!subdir->d_inode->i_op->rename &&
+	     !subdir->d_inode->i_op->rename2) ||
 	    !subdir->d_inode->i_op->rmdir ||
 	    !subdir->d_inode->i_op->unlink)
 		goto check_error;
@@ -795,13 +796,13 @@ check_error:
 mkdir_error:
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(subdir);
-	pr_err("mkdir %s failed with error %d", dirname, ret);
+	pr_err("mkdir %s failed with error %d\n", dirname, ret);
 	return ERR_PTR(ret);
 
 lookup_error:
 	mutex_unlock(&dir->d_inode->i_mutex);
 	ret = PTR_ERR(subdir);
-	pr_err("Lookup %s failed with error %d", dirname, ret);
+	pr_err("Lookup %s failed with error %d\n", dirname, ret);
 	return ERR_PTR(ret);
 
 nomem_d_alloc:
@@ -891,7 +892,7 @@ lookup_error:
 	if (ret == -EIO) {
 		cachefiles_io_error(cache, "Lookup failed");
 	} else if (ret != -ENOMEM) {
-		pr_err("Internal error: %d", ret);
+		pr_err("Internal error: %d\n", ret);
 		ret = -EIO;
 	}
 
@@ -950,7 +951,7 @@ error:
 	}
 
 	if (ret != -ENOMEM) {
-		pr_err("Internal error: %d", ret);
+		pr_err("Internal error: %d\n", ret);
 		ret = -EIO;
 	}
 
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 4b1fb5ca65b8..25e745b8eb1b 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -151,7 +151,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
 	struct cachefiles_one_read *monitor;
 	struct cachefiles_object *object;
 	struct fscache_retrieval *op;
-	struct pagevec pagevec;
 	int error, max;
 
 	op = container_of(_op, struct fscache_retrieval, op);
@@ -160,8 +159,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
 
 	_enter("{ino=%lu}", object->backer->d_inode->i_ino);
 
-	pagevec_init(&pagevec, 0);
-
 	max = 8;
 	spin_lock_irq(&object->work_lock);
 
@@ -396,7 +393,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 {
 	struct cachefiles_object *object;
 	struct cachefiles_cache *cache;
-	struct pagevec pagevec;
 	struct inode *inode;
 	sector_t block0, block;
 	unsigned shift;
@@ -427,8 +423,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	op->op.flags |= FSCACHE_OP_ASYNC;
 	op->op.processor = cachefiles_read_copier;
 
-	pagevec_init(&pagevec, 0);
-
 	/* we assume the absence or presence of the first block is a good
 	 * enough indication for the page as a whole
 	 * - TODO: don't use bmap() for this as it is _not_ actually good
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 1ad51ffbb275..acbc1f094fb1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
 	}
 
 	if (ret != -EEXIST) {
-		pr_err("Can't set xattr on %*.*s [%lu] (err %d)",
+		pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n",
 		       dentry->d_name.len, dentry->d_name.len,
 		       dentry->d_name.name, dentry->d_inode->i_ino,
 		       -ret);
@@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
 		if (ret == -ERANGE)
 			goto bad_type_length;
 
-		pr_err("Can't read xattr on %*.*s [%lu] (err %d)",
+		pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n",
 		       dentry->d_name.len, dentry->d_name.len,
 		       dentry->d_name.name, dentry->d_inode->i_ino,
 		       -ret);
@@ -85,14 +85,14 @@ error:
 	return ret;
 
 bad_type_length:
-	pr_err("Cache object %lu type xattr length incorrect",
+	pr_err("Cache object %lu type xattr length incorrect\n",
 	       dentry->d_inode->i_ino);
 	ret = -EIO;
 	goto error;
 
 bad_type:
 	xtype[2] = 0;
-	pr_err("Cache object %*.*s [%lu] type %s not %s",
+	pr_err("Cache object %*.*s [%lu] type %s not %s\n",
 	       dentry->d_name.len, dentry->d_name.len,
 	       dentry->d_name.name, dentry->d_inode->i_ino,
 	       xtype, type);
@@ -293,7 +293,7 @@ error:
 	return ret;
 
 bad_type_length:
-	pr_err("Cache object %lu xattr length incorrect",
+	pr_err("Cache object %lu xattr length incorrect\n",
 	       dentry->d_inode->i_ino);
 	ret = -EIO;
 	goto error;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b0fafa499505..002e0c173939 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "2.04"
+#define CIFS_VERSION   "2.05"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8a9fded7c135..36ca2045009b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -837,6 +837,7 @@ cifs_demultiplex_thread(void *p)
 	struct TCP_Server_Info *server = p;
 	unsigned int pdu_length;
 	char *buf = NULL;
+	struct task_struct *task_to_wake = NULL;
 	struct mid_q_entry *mid_entry;
 
 	current->flags |= PF_MEMALLOC;
@@ -927,7 +928,19 @@ cifs_demultiplex_thread(void *p)
 	if (server->smallbuf) /* no sense logging a debug message if NULL */
 		cifs_small_buf_release(server->smallbuf);
 
+	task_to_wake = xchg(&server->tsk, NULL);
 	clean_demultiplex_info(server);
+
+	/* if server->tsk was NULL then wait for a signal before exiting */
+	if (!task_to_wake) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		while (!signal_pending(current)) {
+			schedule();
+			set_current_state(TASK_INTERRUPTIBLE);
+		}
+		set_current_state(TASK_RUNNING);
+	}
+
 	module_put_and_exit(0);
 }
 
@@ -2050,6 +2063,8 @@ cifs_find_tcp_session(struct smb_vol *vol)
 static void
 cifs_put_tcp_session(struct TCP_Server_Info *server)
 {
+	struct task_struct *task;
+
 	spin_lock(&cifs_tcp_ses_lock);
 	if (--server->srv_count > 0) {
 		spin_unlock(&cifs_tcp_ses_lock);
@@ -2073,6 +2088,10 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 	kfree(server->session_key.response);
 	server->session_key.response = NULL;
 	server->session_key.len = 0;
+
+	task = xchg(&server->tsk, NULL);
+	if (task)
+		force_sig(SIGKILL, task);
 }
 
 static struct TCP_Server_Info *
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7c018a1c52f7..5f29354b072a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3568,15 +3568,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 				lru_cache_add_file(page);
 				unlock_page(page);
 				page_cache_release(page);
-				if (rc == -EAGAIN)
-					list_add_tail(&page->lru, &tmplist);
 			}
+			/* Fallback to the readpage in error/reconnect cases */
 			kref_put(&rdata->refcount, cifs_readdata_release);
-			if (rc == -EAGAIN) {
-				/* Re-add pages to the page_list and retry */
-				list_splice(&tmplist, page_list);
-				continue;
-			}
 			break;
 		}
 
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 68559fd557fb..5657416d3483 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -213,8 +213,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		goto out;
 
-	rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb,
-					fromName, buf, &bytes_written);
+	if (tcon->ses->server->ops->create_mf_symlink)
+		rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon,
+					cifs_sb, fromName, buf, &bytes_written);
+	else
+		rc = -EOPNOTSUPP;
+
 	if (rc)
 		goto out;
 
@@ -339,9 +343,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE))
+	if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+		rc = -ENOENT;
 		/* it's not a symlink */
 		goto out;
+	}
 
 	io_parms.netfid = fid.netfid;
 	io_parms.pid = current->tgid;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 6834b9c3bec1..b333ff60781d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -925,11 +925,23 @@ cifs_NTtimeToUnix(__le64 ntutc)
 	/* BB what about the timezone? BB */
 
 	/* Subtract the NTFS time offset, then convert to 1s intervals. */
-	u64 t;
+	s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
+
+	/*
+	 * Unfortunately can not use normal 64 bit division on 32 bit arch, but
+	 * the alternative, do_div, does not work with negative numbers so have
+	 * to special case them
+	 */
+	if (t < 0) {
+		t = -t;
+		ts.tv_nsec = (long)(do_div(t, 10000000) * 100);
+		ts.tv_nsec = -ts.tv_nsec;
+		ts.tv_sec = -t;
+	} else {
+		ts.tv_nsec = (long)do_div(t, 10000000) * 100;
+		ts.tv_sec = t;
+	}
 
-	t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
-	ts.tv_nsec = do_div(t, 10000000) * 100;
-	ts.tv_sec = t;
 	return ts;
 }
 
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 3a5e83317683..57db63ff88da 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -745,14 +745,6 @@ out:
 	sess_free_buffer(sess_data);
 }
 
-#else
-
-static void
-sess_auth_lanman(struct sess_data *sess_data)
-{
-	sess_data->result = -EOPNOTSUPP;
-	sess_data->func = NULL;
-}
 #endif
 
 static void
@@ -1103,15 +1095,6 @@ out:
 	ses->auth_key.response = NULL;
 }
 
-#else
-
-static void
-sess_auth_kerberos(struct sess_data *sess_data)
-{
-	cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
-	sess_data->result = -ENOSYS;
-	sess_data->func = NULL;
-}
 #endif /* ! CONFIG_CIFS_UPCALL */
 
 /*
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 1a6df4b03f67..52131d8cb4d5 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -586,7 +586,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
 		tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
 		if (tmprc == -EOPNOTSUPP)
 			*symlink = true;
-		else
+		else if (tmprc == 0)
 			CIFSSMBClose(xid, tcon, fid.netfid);
 	}
 
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index af59d03db492..8257a5a97cc0 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
 	{STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
 	"STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
 	{STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
+	{STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP,
+	"STATUS_REPARSE_NOT_HANDLED"},
 	{STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
 	"STATUS_DEVICE_REQUIRES_CLEANING"},
 	{STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
diff --git a/fs/dcache.c b/fs/dcache.c
index 7a5b51440afa..cb25a1a5e307 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2372,7 +2372,8 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
 }
 EXPORT_SYMBOL(dentry_update_name_case);
 
-static void switch_names(struct dentry *dentry, struct dentry *target)
+static void switch_names(struct dentry *dentry, struct dentry *target,
+			 bool exchange)
 {
 	if (dname_external(target)) {
 		if (dname_external(dentry)) {
@@ -2406,13 +2407,19 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
 			 */
 			unsigned int i;
 			BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
+			if (!exchange) {
+				memcpy(dentry->d_iname, target->d_name.name,
+						target->d_name.len + 1);
+				dentry->d_name.hash_len = target->d_name.hash_len;
+				return;
+			}
 			for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
 				swap(((long *) &dentry->d_iname)[i],
 				     ((long *) &target->d_iname)[i]);
 			}
 		}
 	}
-	swap(dentry->d_name.len, target->d_name.len);
+	swap(dentry->d_name.hash_len, target->d_name.hash_len);
 }
 
 static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
@@ -2442,25 +2449,29 @@ static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
 	}
 }
 
-static void dentry_unlock_parents_for_move(struct dentry *dentry,
-					struct dentry *target)
+static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
 {
 	if (target->d_parent != dentry->d_parent)
 		spin_unlock(&dentry->d_parent->d_lock);
 	if (target->d_parent != target)
 		spin_unlock(&target->d_parent->d_lock);
+	spin_unlock(&target->d_lock);
+	spin_unlock(&dentry->d_lock);
 }
 
 /*
  * When switching names, the actual string doesn't strictly have to
  * be preserved in the target - because we're dropping the target
  * anyway. As such, we can just do a simple memcpy() to copy over
- * the new name before we switch.
- *
- * Note that we have to be a lot more careful about getting the hash
- * switched - we have to switch the hash value properly even if it
- * then no longer matches the actual (corrupted) string of the target.
- * The hash value has to match the hash queue that the dentry is on..
+ * the new name before we switch, unless we are going to rehash
+ * it.  Note that if we *do* unhash the target, we are not allowed
+ * to rehash it without giving it a new name/hash key - whether
+ * we swap or overwrite the names here, resulting name won't match
+ * the reality in filesystem; it's only there for d_path() purposes.
+ * Note that all of this is happening under rename_lock, so the
+ * any hash lookup seeing it in the middle of manipulations will
+ * be discarded anyway.  So we do not care what happens to the hash
+ * key in that case.
  */
 /*
  * __d_move - move a dentry
@@ -2506,36 +2517,30 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 			   d_hash(dentry->d_parent, dentry->d_name.hash));
 	}
 
-	list_del(&dentry->d_u.d_child);
-	list_del(&target->d_u.d_child);
-
 	/* Switch the names.. */
-	switch_names(dentry, target);
-	swap(dentry->d_name.hash, target->d_name.hash);
+	switch_names(dentry, target, exchange);
 
-	/* ... and switch the parents */
+	/* ... and switch them in the tree */
 	if (IS_ROOT(dentry)) {
+		/* splicing a tree */
 		dentry->d_parent = target->d_parent;
 		target->d_parent = target;
-		INIT_LIST_HEAD(&target->d_u.d_child);
+		list_del_init(&target->d_u.d_child);
+		list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
 	} else {
+		/* swapping two dentries */
 		swap(dentry->d_parent, target->d_parent);
-
-		/* And add them back to the (new) parent lists */
-		list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
+		list_move(&target->d_u.d_child, &target->d_parent->d_subdirs);
+		list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
+		if (exchange)
+			fsnotify_d_move(target);
+		fsnotify_d_move(dentry);
 	}
 
-	list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
-
 	write_seqcount_end(&target->d_seq);
 	write_seqcount_end(&dentry->d_seq);
 
-	dentry_unlock_parents_for_move(dentry, target);
-	if (exchange)
-		fsnotify_d_move(target);
-	spin_unlock(&target->d_lock);
-	fsnotify_d_move(dentry);
-	spin_unlock(&dentry->d_lock);
+	dentry_unlock_for_move(dentry, target);
 }
 
 /*
@@ -2633,45 +2638,6 @@ out_err:
 	return ret;
 }
 
-/*
- * Prepare an anonymous dentry for life in the superblock's dentry tree as a
- * named dentry in place of the dentry to be replaced.
- * returns with anon->d_lock held!
- */
-static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
-{
-	struct dentry *dparent;
-
-	dentry_lock_for_move(anon, dentry);
-
-	write_seqcount_begin(&dentry->d_seq);
-	write_seqcount_begin_nested(&anon->d_seq, DENTRY_D_LOCK_NESTED);
-
-	dparent = dentry->d_parent;
-
-	switch_names(dentry, anon);
-	swap(dentry->d_name.hash, anon->d_name.hash);
-
-	dentry->d_parent = dentry;
-	list_del_init(&dentry->d_u.d_child);
-	anon->d_parent = dparent;
-	if (likely(!d_unhashed(anon))) {
-		hlist_bl_lock(&anon->d_sb->s_anon);
-		__hlist_bl_del(&anon->d_hash);
-		anon->d_hash.pprev = NULL;
-		hlist_bl_unlock(&anon->d_sb->s_anon);
-	}
-	list_move(&anon->d_u.d_child, &dparent->d_subdirs);
-
-	write_seqcount_end(&dentry->d_seq);
-	write_seqcount_end(&anon->d_seq);
-
-	dentry_unlock_parents_for_move(anon, dentry);
-	spin_unlock(&dentry->d_lock);
-
-	/* anon->d_lock still locked, returns locked */
-}
-
 /**
  * d_splice_alias - splice a disconnected dentry into the tree if one exists
  * @inode:  the inode which may have a disconnected dentry
@@ -2717,10 +2683,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 				return ERR_PTR(-EIO);
 			}
 			write_seqlock(&rename_lock);
-			__d_materialise_dentry(dentry, new);
+			__d_move(new, dentry, false);
 			write_sequnlock(&rename_lock);
-			_d_rehash(new);
-			spin_unlock(&new->d_lock);
 			spin_unlock(&inode->i_lock);
 			security_d_instantiate(new, inode);
 			iput(inode);
@@ -2780,7 +2744,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 			} else if (IS_ROOT(alias)) {
 				/* Is this an anonymous mountpoint that we
 				 * could splice into our tree? */
-				__d_materialise_dentry(dentry, alias);
+				__d_move(alias, dentry, false);
 				write_sequnlock(&rename_lock);
 				goto found;
 			} else {
@@ -2807,13 +2771,9 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 	actual = __d_instantiate_unique(dentry, inode);
 	if (!actual)
 		actual = dentry;
-	else
-		BUG_ON(!d_unhashed(actual));
 
-	spin_lock(&actual->d_lock);
+	d_rehash(actual);
 found:
-	_d_rehash(actual);
-	spin_unlock(&actual->d_lock);
 	spin_unlock(&inode->i_lock);
 out_nolock:
 	if (actual == dentry) {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index c3116404ab49..e181b6b2e297 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
 	ssize_t ret;
 
-	ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES,
+	ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
 				&sdio->from);
 
 	if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b88edc05c230..170dc41e8bf4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	ext2_rsv_window_add(sb, &sbi->s_rsv_window_head);
 
 	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-				ext2_count_free_blocks(sb));
+				ext2_count_free_blocks(sb), GFP_KERNEL);
 	if (!err) {
 		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext2_count_free_inodes(sb));
+				ext2_count_free_inodes(sb), GFP_KERNEL);
 	}
 	if (!err) {
 		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext2_count_dirs(sb));
+				ext2_count_dirs(sb), GFP_KERNEL);
 	}
 	if (err) {
 		ext2_msg(sb, KERN_ERR, "error: insufficient memory");
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 622e88249024..bb0fdacad058 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount2;
 	}
 	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext3_count_free_blocks(sb));
+			ext3_count_free_blocks(sb), GFP_KERNEL);
 	if (!err) {
 		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext3_count_free_inodes(sb));
+				ext3_count_free_inodes(sb), GFP_KERNEL);
 	}
 	if (!err) {
 		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext3_count_dirs(sb));
+				ext3_count_dirs(sb), GFP_KERNEL);
 	}
 	if (err) {
 		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0b28b36e7915..05c159218bc2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3892,7 +3892,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	/* Register extent status tree shrinker */
 	ext4_es_register_shrinker(sbi);
 
-	if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) {
+	err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
+	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount3;
 	}
@@ -4106,17 +4107,20 @@ no_journal:
 	block = ext4_count_free_clusters(sb);
 	ext4_free_blocks_count_set(sbi->s_es, 
 				   EXT4_C2B(sbi, block));
-	err = percpu_counter_init(&sbi->s_freeclusters_counter, block);
+	err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
+				  GFP_KERNEL);
 	if (!err) {
 		unsigned long freei = ext4_count_free_inodes(sb);
 		sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
-		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei);
+		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
+					  GFP_KERNEL);
 	}
 	if (!err)
 		err = percpu_counter_init(&sbi->s_dirs_counter,
-					  ext4_count_dirs(sb));
+					  ext4_count_dirs(sb), GFP_KERNEL);
 	if (!err)
-		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
+		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
+					  GFP_KERNEL);
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount6;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index ec3b7a5381fa..dd10a031c052 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,7 +72,22 @@ out:
 	return page;
 }
 
-static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+	bool readahead = false;
+	struct page *page;
+
+	page = find_get_page(META_MAPPING(sbi), index);
+	if (!page || (page && !PageUptodate(page)))
+		readahead = true;
+	f2fs_put_page(page, 0);
+
+	if (readahead)
+		ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+	return get_meta_page(sbi, index);
+}
+
+static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
 {
 	switch (type) {
 	case META_NAT:
@@ -82,6 +97,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
 	case META_SSA:
 	case META_CP:
 		return 0;
+	case META_POR:
+		return MAX_BLKADDR(sbi);
 	default:
 		BUG();
 	}
@@ -90,12 +107,12 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
 /*
  * Readahead CP/NAT/SIT/SSA pages
  */
-int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
 {
 	block_t prev_blk_addr = 0;
 	struct page *page;
-	int blkno = start;
-	int max_blks = get_max_meta_blks(sbi, type);
+	block_t blkno = start;
+	block_t max_blks = get_max_meta_blks(sbi, type);
 
 	struct f2fs_io_info fio = {
 		.type = META,
@@ -125,7 +142,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
 			break;
 		case META_SSA:
 		case META_CP:
-			/* get ssa/cp block addr */
+		case META_POR:
+			if (unlikely(blkno >= max_blks))
+				goto out;
+			if (unlikely(blkno < SEG0_BLKADDR(sbi)))
+				goto out;
 			blk_addr = blkno;
 			break;
 		default:
@@ -151,8 +172,7 @@ out:
 static int f2fs_write_meta_page(struct page *page,
 				struct writeback_control *wbc)
 {
-	struct inode *inode = page->mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 
 	trace_f2fs_writepage(page, META);
 
@@ -177,7 +197,7 @@ redirty_out:
 static int f2fs_write_meta_pages(struct address_space *mapping,
 				struct writeback_control *wbc)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
 	long diff, written;
 
 	trace_f2fs_writepages(mapping->host, wbc, META);
@@ -259,15 +279,12 @@ continue_unlock:
 
 static int f2fs_set_meta_page_dirty(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
-	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-
 	trace_f2fs_set_page_dirty(page, META);
 
 	SetPageUptodate(page);
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
-		inc_page_count(sbi, F2FS_DIRTY_META);
+		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
 		return 1;
 	}
 	return 0;
@@ -378,7 +395,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
 void release_orphan_inode(struct f2fs_sb_info *sbi)
 {
 	spin_lock(&sbi->ino_lock[ORPHAN_INO]);
-	f2fs_bug_on(sbi->n_orphans == 0);
+	f2fs_bug_on(sbi, sbi->n_orphans == 0);
 	sbi->n_orphans--;
 	spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
 }
@@ -398,7 +415,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 {
 	struct inode *inode = f2fs_iget(sbi->sb, ino);
-	f2fs_bug_on(IS_ERR(inode));
+	f2fs_bug_on(sbi, IS_ERR(inode));
 	clear_nlink(inode);
 
 	/* truncate all the data during iput */
@@ -459,7 +476,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 	list_for_each_entry(orphan, head, list) {
 		if (!page) {
 			page = find_get_page(META_MAPPING(sbi), start_blk++);
-			f2fs_bug_on(!page);
+			f2fs_bug_on(sbi, !page);
 			orphan_blk =
 				(struct f2fs_orphan_block *)page_address(page);
 			memset(orphan_blk, 0, sizeof(*orphan_blk));
@@ -619,7 +636,7 @@ fail_no_cp:
 
 static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
 	if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
 		return -EEXIST;
@@ -631,32 +648,38 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 	return 0;
 }
 
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+void update_dirty_page(struct inode *inode, struct page *page)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dir_inode_entry *new;
 	int ret = 0;
 
-	if (!S_ISDIR(inode->i_mode))
+	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
 		return;
 
+	if (!S_ISDIR(inode->i_mode)) {
+		inode_inc_dirty_pages(inode);
+		goto out;
+	}
+
 	new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
 	new->inode = inode;
 	INIT_LIST_HEAD(&new->list);
 
 	spin_lock(&sbi->dir_inode_lock);
 	ret = __add_dirty_inode(inode, new);
-	inode_inc_dirty_dents(inode);
-	SetPagePrivate(page);
+	inode_inc_dirty_pages(inode);
 	spin_unlock(&sbi->dir_inode_lock);
 
 	if (ret)
 		kmem_cache_free(inode_entry_slab, new);
+out:
+	SetPagePrivate(page);
 }
 
 void add_dirty_dir_inode(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dir_inode_entry *new =
 			f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
 	int ret = 0;
@@ -674,14 +697,14 @@ void add_dirty_dir_inode(struct inode *inode)
 
 void remove_dirty_dir_inode(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dir_inode_entry *entry;
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
 	spin_lock(&sbi->dir_inode_lock);
-	if (get_dirty_dents(inode) ||
+	if (get_dirty_pages(inode) ||
 			!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
 		spin_unlock(&sbi->dir_inode_lock);
 		return;
@@ -802,11 +825,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
 	finish_wait(&sbi->cp_wait, &wait);
 }
 
-static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
-	nid_t last_nid = 0;
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	nid_t last_nid = nm_i->next_scan_nid;
 	block_t start_blk;
 	struct page *cp_page;
 	unsigned int data_sum_blocks, orphan_blocks;
@@ -869,7 +893,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
 			orphan_blocks);
 
-	if (is_umount) {
+	if (cpc->reason == CP_UMOUNT) {
 		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
 		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
 				cp_payload_blks + data_sum_blocks +
@@ -886,6 +910,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	else
 		clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
 
+	if (sbi->need_fsck)
+		set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+
 	/* update SIT/NAT bitmap */
 	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
 	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
@@ -920,7 +947,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 
 	write_data_summaries(sbi, start_blk);
 	start_blk += data_sum_blocks;
-	if (is_umount) {
+	if (cpc->reason == CP_UMOUNT) {
 		write_node_summaries(sbi, start_blk);
 		start_blk += NR_CURSEG_NODE_TYPE;
 	}
@@ -960,23 +987,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 /*
  * We guarantee that this checkpoint procedure will not fail.
  */
-void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	unsigned long long ckpt_ver;
 
-	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
 
 	mutex_lock(&sbi->cp_mutex);
 
-	if (!sbi->s_dirty)
+	if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
 		goto out;
 	if (unlikely(f2fs_cp_error(sbi)))
 		goto out;
 	if (block_operations(sbi))
 		goto out;
 
-	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
 
 	f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	f2fs_submit_merged_bio(sbi, NODE, WRITE);
@@ -992,16 +1019,16 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 
 	/* write cached NAT/SIT entries to NAT/SIT area */
 	flush_nat_entries(sbi);
-	flush_sit_entries(sbi);
+	flush_sit_entries(sbi, cpc);
 
 	/* unlock all the fs_lock[] in do_checkpoint() */
-	do_checkpoint(sbi, is_umount);
+	do_checkpoint(sbi, cpc);
 
 	unblock_operations(sbi);
 	stat_inc_cp_count(sbi->stat_info);
 out:
 	mutex_unlock(&sbi->cp_mutex);
-	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
 }
 
 void init_ino_entry_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 76de83e25a89..8e58c4cc2cb9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -85,7 +85,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
 	bio = bio_alloc(GFP_NOIO, npages);
 
 	bio->bi_bdev = sbi->sb->s_bdev;
-	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
 	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
 	bio->bi_private = sbi;
 
@@ -193,7 +193,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
 		__submit_merged_bio(io);
 alloc_new:
 	if (io->bio == NULL) {
-		int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+		int bio_blocks = MAX_BIO_BLOCKS(sbi);
 
 		io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
 		io->fio = *fio;
@@ -236,7 +236,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
 
 int reserve_new_block(struct dnode_of_data *dn)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 		return -EPERM;
@@ -258,7 +258,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 	int err;
 
 	/* if inode_page exists, index should be zero */
-	f2fs_bug_on(!need_put && index);
+	f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index);
 
 	err = get_dnode_of_data(dn, index, ALLOC_NODE);
 	if (err)
@@ -321,7 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
 	block_t start_blkaddr, end_blkaddr;
 	int need_update = true;
 
-	f2fs_bug_on(blk_addr == NEW_ADDR);
+	f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
 							dn->ofs_in_node;
 
@@ -396,7 +396,6 @@ end_update:
 
 struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
 	struct page *page;
@@ -429,7 +428,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 		return page;
 	}
 
-	err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
+	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
 					sync ? READ_SYNC : READA);
 	if (err)
 		return ERR_PTR(err);
@@ -451,7 +450,6 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
  */
 struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
 	struct page *page;
@@ -490,7 +488,8 @@ repeat:
 		return page;
 	}
 
-	err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
+	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
+					dn.data_blkaddr, READ_SYNC);
 	if (err)
 		return ERR_PTR(err);
 
@@ -517,7 +516,6 @@ repeat:
 struct page *get_new_data_page(struct inode *inode,
 		struct page *ipage, pgoff_t index, bool new_i_size)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	struct dnode_of_data dn;
@@ -541,8 +539,8 @@ repeat:
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 		SetPageUptodate(page);
 	} else {
-		err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
-								READ_SYNC);
+		err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
+						dn.data_blkaddr, READ_SYNC);
 		if (err)
 			goto put_err;
 
@@ -573,10 +571,12 @@ put_err:
 
 static int __allocate_data_block(struct dnode_of_data *dn)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
 	struct f2fs_summary sum;
 	block_t new_blkaddr;
 	struct node_info ni;
+	pgoff_t fofs;
 	int type;
 
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
@@ -599,6 +599,12 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 	update_extent_cache(new_blkaddr, dn);
 	clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
 
+	/* update i_size */
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+							dn->ofs_in_node;
+	if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
+		i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
+
 	dn->data_blkaddr = new_blkaddr;
 	return 0;
 }
@@ -614,7 +620,6 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 static int __get_data_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create, bool fiemap)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	unsigned int blkbits = inode->i_sb->s_blocksize_bits;
 	unsigned maxblocks = bh_result->b_size >> blkbits;
 	struct dnode_of_data dn;
@@ -630,8 +635,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 		goto out;
 
 	if (create) {
-		f2fs_balance_fs(sbi);
-		f2fs_lock_op(sbi);
+		f2fs_balance_fs(F2FS_I_SB(inode));
+		f2fs_lock_op(F2FS_I_SB(inode));
 	}
 
 	/* When reading holes, we need its node page */
@@ -707,7 +712,7 @@ put_out:
 	f2fs_put_dnode(&dn);
 unlock_out:
 	if (create)
-		f2fs_unlock_op(sbi);
+		f2fs_unlock_op(F2FS_I_SB(inode));
 out:
 	trace_f2fs_get_data_block(inode, iblock, bh_result, err);
 	return err;
@@ -804,7 +809,7 @@ static int f2fs_write_data_page(struct page *page,
 					struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	loff_t i_size = i_size_read(inode);
 	const pgoff_t end_index = ((unsigned long long) i_size)
 							>> PAGE_CACHE_SHIFT;
@@ -846,7 +851,7 @@ write:
 	if (unlikely(f2fs_cp_error(sbi))) {
 		SetPageError(page);
 		unlock_page(page);
-		return 0;
+		goto out;
 	}
 
 	if (!wbc->for_reclaim)
@@ -866,7 +871,7 @@ done:
 
 	clear_cold_data(page);
 out:
-	inode_dec_dirty_dents(inode);
+	inode_dec_dirty_pages(inode);
 	unlock_page(page);
 	if (need_balance_fs)
 		f2fs_balance_fs(sbi);
@@ -892,7 +897,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	bool locked = false;
 	int ret;
 	long diff;
@@ -904,7 +909,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 		return 0;
 
 	if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
-			get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) &&
+			get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
 			available_free_memory(sbi, DIRTY_DENTS))
 		goto skip_write;
 
@@ -926,7 +931,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 	return ret;
 
 skip_write:
-	wbc->pages_skipped += get_dirty_dents(inode);
+	wbc->pages_skipped += get_dirty_pages(inode);
 	return 0;
 }
 
@@ -945,7 +950,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 		struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct page *page;
 	pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
 	struct dnode_of_data dn;
@@ -1047,7 +1052,10 @@ static int f2fs_write_end(struct file *file,
 
 	trace_f2fs_write_end(inode, pos, len, copied);
 
-	set_page_dirty(page);
+	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+		register_inmem_page(inode, page);
+	else
+		set_page_dirty(page);
 
 	if (pos + copied > i_size_read(inode)) {
 		i_size_write(inode, pos + copied);
@@ -1092,9 +1100,6 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 	if (check_direct_IO(inode, rw, iter, offset))
 		return 0;
 
-	/* clear fsync mark to recover these blocks */
-	fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
-
 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
 	err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
@@ -1110,8 +1115,12 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
 				      unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
+
+	if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+		return;
+
 	if (PageDirty(page))
-		inode_dec_dirty_dents(inode);
+		inode_dec_dirty_pages(inode);
 	ClearPagePrivate(page);
 }
 
@@ -1133,7 +1142,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
 
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
-		set_dirty_dir_page(inode, page);
+		update_dirty_page(inode, page);
 		return 1;
 	}
 	return 0;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fecebdbfd781..0a91ab813a9e 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -93,7 +93,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 	total_vblocks = 0;
 	blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
 	hblks_per_sec = blks_per_sec / 2;
-	for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
 		vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
 		dist = abs(vblocks - hblks_per_sec);
 		bimodal += dist * dist;
@@ -103,7 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 			ndirty++;
 		}
 	}
-	dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
+	dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
 	si->bimodal = bimodal / dist;
 	if (si->dirty_count)
 		si->avg_vblocks = total_vblocks / ndirty;
@@ -131,17 +131,17 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 
 	/* build sit */
 	si->base_mem += sizeof(struct sit_info);
-	si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
-	si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
-	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
+	si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
+	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
 	if (sbi->segs_per_sec > 1)
-		si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry);
+		si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
 	si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
 
 	/* build free segmap */
 	si->base_mem += sizeof(struct free_segmap_info);
-	si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
-	si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
+	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+	si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
 
 	/* build curseg */
 	si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -149,8 +149,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 
 	/* build dirty segmap */
 	si->base_mem += sizeof(struct dirty_seglist_info);
-	si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
-	si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
+	si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi));
+	si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
 
 	/* build nm */
 	si->base_mem += sizeof(struct f2fs_nm_info);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 155fb056b7f1..b54f87149c09 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -126,7 +126,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 		 * For the most part, it should be a bug when name_len is zero.
 		 * We stop here for figuring out where the bugs has occurred.
 		 */
-		f2fs_bug_on(!de->name_len);
+		f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len);
 
 		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
 	}
@@ -151,7 +151,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
 	bool room = false;
 	int max_slots = 0;
 
-	f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
+	f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
 
 	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
 	nblock = bucket_blocks(level);
@@ -284,10 +284,9 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
 
 int update_dent_inode(struct inode *inode, const struct qstr *name)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct page *page;
 
-	page = get_node_page(sbi, inode->i_ino);
+	page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
 	if (IS_ERR(page))
 		return PTR_ERR(page);
 
@@ -337,7 +336,6 @@ static int make_empty_dir(struct inode *inode,
 static struct page *init_inode_metadata(struct inode *inode,
 		struct inode *dir, const struct qstr *name)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
 	struct page *page;
 	int err;
 
@@ -360,7 +358,7 @@ static struct page *init_inode_metadata(struct inode *inode,
 		if (err)
 			goto put_error;
 	} else {
-		page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+		page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
 		if (IS_ERR(page))
 			return page;
 
@@ -381,7 +379,7 @@ static struct page *init_inode_metadata(struct inode *inode,
 		 * we should remove this inode from orphan list.
 		 */
 		if (inode->i_nlink == 0)
-			remove_orphan_inode(sbi, inode->i_ino);
+			remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
 		inc_nlink(inode);
 	}
 	return page;
@@ -571,8 +569,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 {
 	struct	f2fs_dentry_block *dentry_blk;
 	unsigned int bit_pos;
-	struct address_space *mapping = page->mapping;
-	struct inode *dir = mapping->host;
+	struct inode *dir = page->mapping->host;
 	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
 	int i;
 
@@ -594,7 +591,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 
 	if (inode) {
-		struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+		struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 
 		down_write(&F2FS_I(inode)->i_sem);
 
@@ -621,7 +618,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 		truncate_hole(dir, page->index, page->index + 1);
 		clear_page_dirty_for_io(page);
 		ClearPageUptodate(page);
-		inode_dec_dirty_dents(dir);
+		inode_dec_dirty_pages(dir);
 	}
 	f2fs_put_page(page, 1);
 }
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e921242186f6..8171e80b2ee9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -21,10 +21,16 @@
 #include <linux/sched.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
-#define f2fs_bug_on(condition)	BUG_ON(condition)
+#define f2fs_bug_on(sbi, condition)	BUG_ON(condition)
 #define f2fs_down_write(x, y)	down_write_nest_lock(x, y)
 #else
-#define f2fs_bug_on(condition)	WARN_ON(condition)
+#define f2fs_bug_on(sbi, condition)					\
+	do {								\
+		if (unlikely(condition)) {				\
+			WARN_ON(1);					\
+			sbi->need_fsck = true;				\
+		}							\
+	} while (0)
 #define f2fs_down_write(x, y)	down_write(x)
 #endif
 
@@ -90,6 +96,20 @@ enum {
 	SIT_BITMAP
 };
 
+enum {
+	CP_UMOUNT,
+	CP_SYNC,
+	CP_DISCARD,
+};
+
+struct cp_control {
+	int reason;
+	__u64 trim_start;
+	__u64 trim_end;
+	__u64 trim_minlen;
+	__u64 trimmed;
+};
+
 /*
  * For CP/NAT/SIT/SSA readahead
  */
@@ -97,7 +117,8 @@ enum {
 	META_CP,
 	META_NAT,
 	META_SIT,
-	META_SSA
+	META_SSA,
+	META_POR,
 };
 
 /* for the list of ino */
@@ -130,7 +151,9 @@ struct discard_entry {
 struct fsync_inode_entry {
 	struct list_head list;	/* list head */
 	struct inode *inode;	/* vfs inode pointer */
-	block_t blkaddr;	/* block address locating the last inode */
+	block_t blkaddr;	/* block address locating the last fsync */
+	block_t last_dentry;	/* block address locating the last dentry */
+	block_t last_inode;	/* block address locating the last inode */
 };
 
 #define nats_in_cursum(sum)		(le16_to_cpu(sum->n_nats))
@@ -141,6 +164,9 @@ struct fsync_inode_entry {
 #define sit_in_journal(sum, i)		(sum->sit_j.entries[i].se)
 #define segno_in_journal(sum, i)	(sum->sit_j.entries[i].segno)
 
+#define MAX_NAT_JENTRIES(sum)	(NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
+#define MAX_SIT_JENTRIES(sum)	(SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+
 static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
 {
 	int before = nats_in_cursum(rs);
@@ -155,11 +181,24 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
 	return before;
 }
 
+static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
+								int type)
+{
+	if (type == NAT_JOURNAL)
+		return size <= MAX_NAT_JENTRIES(sum);
+	return size <= MAX_SIT_JENTRIES(sum);
+}
+
 /*
  * ioctl commands
  */
-#define F2FS_IOC_GETFLAGS               FS_IOC_GETFLAGS
-#define F2FS_IOC_SETFLAGS               FS_IOC_SETFLAGS
+#define F2FS_IOC_GETFLAGS		FS_IOC_GETFLAGS
+#define F2FS_IOC_SETFLAGS		FS_IOC_SETFLAGS
+
+#define F2FS_IOCTL_MAGIC		0xf5
+#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
+#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)
+#define F2FS_IOC_START_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 3)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -222,13 +261,16 @@ struct f2fs_inode_info {
 	/* Use below internally in f2fs*/
 	unsigned long flags;		/* use to pass per-file flags */
 	struct rw_semaphore i_sem;	/* protect fi info */
-	atomic_t dirty_dents;		/* # of dirty dentry pages */
+	atomic_t dirty_pages;		/* # of dirty pages */
 	f2fs_hash_t chash;		/* hash value of given file name */
 	unsigned int clevel;		/* maximum level of given file name */
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
 	struct extent_info ext;		/* in-memory extent cache entry */
 	struct dir_inode_entry *dirty_dir;	/* the pointer of dirty dir */
+
+	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
+	struct mutex inmem_lock;	/* lock for inmemory pages */
 };
 
 static inline void get_extent_info(struct extent_info *ext,
@@ -260,11 +302,10 @@ struct f2fs_nm_info {
 
 	/* NAT cache management */
 	struct radix_tree_root nat_root;/* root of the nat entry cache */
+	struct radix_tree_root nat_set_root;/* root of the nat set cache */
 	rwlock_t nat_tree_lock;		/* protect nat_tree_lock */
-	unsigned int nat_cnt;		/* the # of cached nat entries */
 	struct list_head nat_entries;	/* cached nat entry list (clean) */
-	struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
-	struct list_head nat_entry_set;	/* nat entry set list */
+	unsigned int nat_cnt;		/* the # of cached nat entries */
 	unsigned int dirty_nat_cnt;	/* total num of nat entries in set */
 
 	/* free node ids management */
@@ -332,18 +373,16 @@ enum {
 };
 
 struct flush_cmd {
-	struct flush_cmd *next;
 	struct completion wait;
+	struct llist_node llnode;
 	int ret;
 };
 
 struct flush_cmd_control {
 	struct task_struct *f2fs_issue_flush;	/* flush thread */
 	wait_queue_head_t flush_wait_queue;	/* waiting queue for wake-up */
-	struct flush_cmd *issue_list;		/* list for command issue */
-	struct flush_cmd *dispatch_list;	/* list for command dispatch */
-	spinlock_t issue_lock;			/* for issue list lock */
-	struct flush_cmd *issue_tail;		/* list tail of issue list */
+	struct llist_head issue_list;		/* list for command issue */
+	struct llist_node *dispatch_list;	/* list for command dispatch */
 };
 
 struct f2fs_sm_info {
@@ -369,8 +408,11 @@ struct f2fs_sm_info {
 	int nr_discards;			/* # of discards in the list */
 	int max_discards;			/* max. discards to be issued */
 
+	struct list_head sit_entry_set;	/* sit entry set list */
+
 	unsigned int ipu_policy;	/* in-place-update policy */
 	unsigned int min_ipu_util;	/* in-place-update threshold */
+	unsigned int min_fsync_blocks;	/* threshold for fsync */
 
 	/* for flush command control */
 	struct flush_cmd_control *cmd_control_info;
@@ -434,6 +476,7 @@ struct f2fs_sb_info {
 	struct buffer_head *raw_super_buf;	/* buffer head of raw sb */
 	struct f2fs_super_block *raw_super;	/* raw super block pointer */
 	int s_dirty;				/* dirty flag for checkpoint */
+	bool need_fsck;				/* need fsck.f2fs to fix */
 
 	/* for node-related operations */
 	struct f2fs_nm_info *nm_info;		/* node manager */
@@ -539,6 +582,21 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
+{
+	return F2FS_SB(inode->i_sb);
+}
+
+static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
+{
+	return F2FS_I_SB(mapping->host);
+}
+
+static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
+{
+	return F2FS_M_SB(page->mapping);
+}
+
 static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
 {
 	return (struct f2fs_super_block *)(sbi->raw_super);
@@ -703,8 +761,8 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
 						blkcnt_t count)
 {
 	spin_lock(&sbi->stat_lock);
-	f2fs_bug_on(sbi->total_valid_block_count < (block_t) count);
-	f2fs_bug_on(inode->i_blocks < count);
+	f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
+	f2fs_bug_on(sbi, inode->i_blocks < count);
 	inode->i_blocks -= count;
 	sbi->total_valid_block_count -= (block_t)count;
 	spin_unlock(&sbi->stat_lock);
@@ -716,10 +774,11 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
 	F2FS_SET_SB_DIRT(sbi);
 }
 
-static inline void inode_inc_dirty_dents(struct inode *inode)
+static inline void inode_inc_dirty_pages(struct inode *inode)
 {
-	inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
-	atomic_inc(&F2FS_I(inode)->dirty_dents);
+	atomic_inc(&F2FS_I(inode)->dirty_pages);
+	if (S_ISDIR(inode->i_mode))
+		inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
 }
 
 static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -727,13 +786,15 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
 	atomic_dec(&sbi->nr_pages[count_type]);
 }
 
-static inline void inode_dec_dirty_dents(struct inode *inode)
+static inline void inode_dec_dirty_pages(struct inode *inode)
 {
-	if (!S_ISDIR(inode->i_mode))
+	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
 		return;
 
-	dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
-	atomic_dec(&F2FS_I(inode)->dirty_dents);
+	atomic_dec(&F2FS_I(inode)->dirty_pages);
+
+	if (S_ISDIR(inode->i_mode))
+		dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
 }
 
 static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -741,9 +802,9 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
 	return atomic_read(&sbi->nr_pages[count_type]);
 }
 
-static inline int get_dirty_dents(struct inode *inode)
+static inline int get_dirty_pages(struct inode *inode)
 {
-	return atomic_read(&F2FS_I(inode)->dirty_dents);
+	return atomic_read(&F2FS_I(inode)->dirty_pages);
 }
 
 static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
@@ -848,9 +909,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
 {
 	spin_lock(&sbi->stat_lock);
 
-	f2fs_bug_on(!sbi->total_valid_block_count);
-	f2fs_bug_on(!sbi->total_valid_node_count);
-	f2fs_bug_on(!inode->i_blocks);
+	f2fs_bug_on(sbi, !sbi->total_valid_block_count);
+	f2fs_bug_on(sbi, !sbi->total_valid_node_count);
+	f2fs_bug_on(sbi, !inode->i_blocks);
 
 	inode->i_blocks--;
 	sbi->total_valid_node_count--;
@@ -867,7 +928,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
 static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
 {
 	spin_lock(&sbi->stat_lock);
-	f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count);
+	f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
 	sbi->total_valid_inode_count++;
 	spin_unlock(&sbi->stat_lock);
 }
@@ -875,7 +936,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
 static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
 {
 	spin_lock(&sbi->stat_lock);
-	f2fs_bug_on(!sbi->total_valid_inode_count);
+	f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
 	sbi->total_valid_inode_count--;
 	spin_unlock(&sbi->stat_lock);
 }
@@ -891,7 +952,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
 		return;
 
 	if (unlock) {
-		f2fs_bug_on(!PageLocked(page));
+		f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
 		unlock_page(page);
 	}
 	page_cache_release(page);
@@ -998,7 +1059,9 @@ enum {
 	FI_INLINE_DATA,		/* used for inline data*/
 	FI_APPEND_WRITE,	/* inode has appended data */
 	FI_UPDATE_WRITE,	/* inode has in-place-update data */
-	FI_NEED_IPU,		/* used fo ipu for fdatasync */
+	FI_NEED_IPU,		/* used for ipu per file */
+	FI_ATOMIC_FILE,		/* indicate atomic file */
+	FI_VOLATILE_FILE,	/* indicate volatile file */
 };
 
 static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1085,6 +1148,16 @@ static inline int f2fs_has_inline_data(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
 }
 
+static inline bool f2fs_is_atomic_file(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
+}
+
+static inline bool f2fs_is_volatile_file(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
+}
+
 static inline void *inline_data_addr(struct page *page)
 {
 	struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1141,6 +1214,7 @@ void update_inode(struct inode *, struct page *);
 void update_inode_page(struct inode *);
 int f2fs_write_inode(struct inode *, struct writeback_control *);
 void f2fs_evict_inode(struct inode *);
+void handle_failed_inode(struct inode *);
 
 /*
  * namei.c
@@ -1188,9 +1262,9 @@ struct dnode_of_data;
 struct node_info;
 
 bool available_free_memory(struct f2fs_sb_info *, int);
-int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
-bool fsync_mark_done(struct f2fs_sb_info *, nid_t);
-void fsync_mark_clear(struct f2fs_sb_info *, nid_t);
+bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
+bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
 void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
 int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
 int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1221,6 +1295,8 @@ void destroy_node_manager_caches(void);
 /*
  * segment.c
  */
+void register_inmem_page(struct inode *, struct page *);
+void commit_inmem_pages(struct inode *, bool);
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
 int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1229,9 +1305,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
 void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *);
+void release_discard_addrs(struct f2fs_sb_info *);
 void discard_next_dnode(struct f2fs_sb_info *, block_t);
 int npages_for_summary_flush(struct f2fs_sb_info *);
 void allocate_new_segments(struct f2fs_sb_info *);
+int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
 struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
 void write_meta_page(struct f2fs_sb_info *, struct page *);
 void write_node_page(struct f2fs_sb_info *, struct page *,
@@ -1248,7 +1326,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t);
 void write_node_summaries(struct f2fs_sb_info *, block_t);
 int lookup_journal_in_cursum(struct f2fs_summary_block *,
 					int, unsigned int, int);
-void flush_sit_entries(struct f2fs_sb_info *);
+void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
 int build_segment_manager(struct f2fs_sb_info *);
 void destroy_segment_manager(struct f2fs_sb_info *);
 int __init create_segment_manager_caches(void);
@@ -1259,7 +1337,8 @@ void destroy_segment_manager_caches(void);
  */
 struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
-int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
+struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
 long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
 void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
 void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
@@ -1271,11 +1350,11 @@ void add_orphan_inode(struct f2fs_sb_info *, nid_t);
 void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
 void recover_orphan_inodes(struct f2fs_sb_info *);
 int get_valid_checkpoint(struct f2fs_sb_info *);
-void set_dirty_dir_page(struct inode *, struct page *);
+void update_dirty_page(struct inode *, struct page *);
 void add_dirty_dir_inode(struct inode *);
 void remove_dirty_dir_inode(struct inode *);
 void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, bool);
+void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
 void init_ino_entry_info(struct f2fs_sb_info *);
 int __init create_checkpoint_caches(void);
 void destroy_checkpoint_caches(void);
@@ -1359,12 +1438,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 #define stat_inc_inline_inode(inode)					\
 	do {								\
 		if (f2fs_has_inline_data(inode))			\
-			((F2FS_SB(inode->i_sb))->inline_inode++);	\
+			((F2FS_I_SB(inode))->inline_inode++);		\
 	} while (0)
 #define stat_dec_inline_inode(inode)					\
 	do {								\
 		if (f2fs_has_inline_data(inode))			\
-			((F2FS_SB(inode->i_sb))->inline_inode--);	\
+			((F2FS_I_SB(inode))->inline_inode--);		\
 	} while (0)
 
 #define stat_inc_seg_type(sbi, curseg)					\
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 060aee65aee8..8e68bb64f835 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 {
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vma->vm_file);
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dnode_of_data dn;
 	int err;
 
@@ -117,7 +117,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
 
 static inline bool need_do_checkpoint(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	bool need_cp = false;
 
 	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
@@ -138,7 +138,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct f2fs_inode_info *fi = F2FS_I(inode);
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	nid_t ino = inode->i_ino;
 	int ret = 0;
 	bool need_cp = false;
 	struct writeback_control wbc = {
@@ -153,12 +154,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	trace_f2fs_sync_file_enter(inode);
 
 	/* if fdatasync is triggered, let's do in-place-update */
-	if (datasync)
+	if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
 		set_inode_flag(fi, FI_NEED_IPU);
-
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (datasync)
-		clear_inode_flag(fi, FI_NEED_IPU);
+	clear_inode_flag(fi, FI_NEED_IPU);
+
 	if (ret) {
 		trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
 		return ret;
@@ -168,13 +168,22 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * if there is no written data, don't waste time to write recovery info.
 	 */
 	if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
-		!exist_written_data(sbi, inode->i_ino, APPEND_INO)) {
+			!exist_written_data(sbi, ino, APPEND_INO)) {
+		struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
+
+		/* But we need to avoid that there are some inode updates */
+		if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) {
+			f2fs_put_page(i, 0);
+			goto go_write;
+		}
+		f2fs_put_page(i, 0);
+
 		if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
-			exist_written_data(sbi, inode->i_ino, UPDATE_INO))
+				exist_written_data(sbi, ino, UPDATE_INO))
 			goto flush_out;
 		goto out;
 	}
-
+go_write:
 	/* guarantee free sections for fsync */
 	f2fs_balance_fs(sbi);
 
@@ -207,26 +216,28 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 			up_write(&fi->i_sem);
 		}
 	} else {
-		/* if there is no written node page, write its inode page */
-		while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
-			if (fsync_mark_done(sbi, inode->i_ino))
-				goto out;
+sync_nodes:
+		sync_node_pages(sbi, ino, &wbc);
+
+		if (need_inode_block_update(sbi, ino)) {
 			mark_inode_dirty_sync(inode);
 			ret = f2fs_write_inode(inode, NULL);
 			if (ret)
 				goto out;
+			goto sync_nodes;
 		}
-		ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
+
+		ret = wait_on_node_pages_writeback(sbi, ino);
 		if (ret)
 			goto out;
 
 		/* once recovery info is written, don't need to tack this */
-		remove_dirty_inode(sbi, inode->i_ino, APPEND_INO);
+		remove_dirty_inode(sbi, ino, APPEND_INO);
 		clear_inode_flag(fi, FI_APPEND_WRITE);
 flush_out:
-		remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+		remove_dirty_inode(sbi, ino, UPDATE_INO);
 		clear_inode_flag(fi, FI_UPDATE_WRITE);
-		ret = f2fs_issue_flush(F2FS_SB(inode->i_sb));
+		ret = f2fs_issue_flush(F2FS_I_SB(inode));
 	}
 out:
 	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -353,6 +364,8 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
 						maxbytes, i_size_read(inode));
 	case SEEK_DATA:
 	case SEEK_HOLE:
+		if (offset < 0)
+			return -ENXIO;
 		return f2fs_seek_block(file, offset, whence);
 	}
 
@@ -369,7 +382,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 {
 	int nr_free = 0, ofs = dn->ofs_in_node;
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct f2fs_node *raw_node;
 	__le32 *addr;
 
@@ -432,7 +445,7 @@ out:
 
 int truncate_blocks(struct inode *inode, u64 from, bool lock)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	unsigned int blocksize = inode->i_sb->s_blocksize;
 	struct dnode_of_data dn;
 	pgoff_t free_from;
@@ -463,7 +476,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
 	count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
 
 	count -= dn.ofs_in_node;
-	f2fs_bug_on(count < 0);
+	f2fs_bug_on(sbi, count < 0);
 
 	if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
 		truncate_data_blocks_range(&dn, count);
@@ -547,15 +560,22 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err)
 		return err;
 
-	if ((attr->ia_valid & ATTR_SIZE) &&
-			attr->ia_size != i_size_read(inode)) {
+	if (attr->ia_valid & ATTR_SIZE) {
 		err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
 		if (err)
 			return err;
 
-		truncate_setsize(inode, attr->ia_size);
-		f2fs_truncate(inode);
-		f2fs_balance_fs(F2FS_SB(inode->i_sb));
+		if (attr->ia_size != i_size_read(inode)) {
+			truncate_setsize(inode, attr->ia_size);
+			f2fs_truncate(inode);
+			f2fs_balance_fs(F2FS_I_SB(inode));
+		} else {
+			/*
+			 * giving a chance to truncate blocks past EOF which
+			 * are fallocated with FALLOC_FL_KEEP_SIZE.
+			 */
+			f2fs_truncate(inode);
+		}
 	}
 
 	__setattr_copy(inode, attr);
@@ -589,7 +609,7 @@ const struct inode_operations f2fs_file_inode_operations = {
 static void fill_zero(struct inode *inode, pgoff_t index,
 					loff_t start, loff_t len)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct page *page;
 
 	if (!len)
@@ -638,6 +658,13 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	loff_t off_start, off_end;
 	int ret = 0;
 
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	/* skip punching hole beyond i_size */
+	if (offset >= inode->i_size)
+		return ret;
+
 	ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
 	if (ret)
 		return ret;
@@ -661,7 +688,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		if (pg_start < pg_end) {
 			struct address_space *mapping = inode->i_mapping;
 			loff_t blk_start, blk_end;
-			struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+			struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
 			f2fs_balance_fs(sbi);
 
@@ -682,7 +709,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 static int expand_inode_data(struct inode *inode, loff_t offset,
 					loff_t len, int mode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	pgoff_t index, pg_start, pg_end;
 	loff_t new_size = i_size_read(inode);
 	loff_t off_start, off_end;
@@ -778,61 +805,157 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
 		return flags & F2FS_OTHER_FLMASK;
 }
 
-long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+	return put_user(flags, (int __user *)arg);
+}
+
+static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
-	unsigned int flags;
+	unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+	unsigned int oldflags;
 	int ret;
 
-	switch (cmd) {
-	case F2FS_IOC_GETFLAGS:
-		flags = fi->i_flags & FS_FL_USER_VISIBLE;
-		return put_user(flags, (int __user *) arg);
-	case F2FS_IOC_SETFLAGS:
-	{
-		unsigned int oldflags;
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
 
-		ret = mnt_want_write_file(filp);
-		if (ret)
-			return ret;
+	if (!inode_owner_or_capable(inode)) {
+		ret = -EACCES;
+		goto out;
+	}
 
-		if (!inode_owner_or_capable(inode)) {
-			ret = -EACCES;
-			goto out;
-		}
+	if (get_user(flags, (int __user *)arg)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	flags = f2fs_mask_flags(inode->i_mode, flags);
+
+	mutex_lock(&inode->i_mutex);
 
-		if (get_user(flags, (int __user *) arg)) {
-			ret = -EFAULT;
+	oldflags = fi->i_flags;
+
+	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+		if (!capable(CAP_LINUX_IMMUTABLE)) {
+			mutex_unlock(&inode->i_mutex);
+			ret = -EPERM;
 			goto out;
 		}
+	}
 
-		flags = f2fs_mask_flags(inode->i_mode, flags);
+	flags = flags & FS_FL_USER_MODIFIABLE;
+	flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
+	fi->i_flags = flags;
+	mutex_unlock(&inode->i_mutex);
 
-		mutex_lock(&inode->i_mutex);
+	f2fs_set_inode_flags(inode);
+	inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+out:
+	mnt_drop_write_file(filp);
+	return ret;
+}
 
-		oldflags = fi->i_flags;
+static int f2fs_ioc_start_atomic_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
-		if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				mutex_unlock(&inode->i_mutex);
-				ret = -EPERM;
-				goto out;
-			}
-		}
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
 
-		flags = flags & FS_FL_USER_MODIFIABLE;
-		flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
-		fi->i_flags = flags;
-		mutex_unlock(&inode->i_mutex);
+	f2fs_balance_fs(sbi);
 
-		f2fs_set_inode_flags(inode);
-		inode->i_ctime = CURRENT_TIME;
-		mark_inode_dirty(inode);
-out:
-		mnt_drop_write_file(filp);
+	set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+
+	return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
+}
+
+static int f2fs_ioc_commit_atomic_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	int ret;
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	if (f2fs_is_volatile_file(inode))
+		return 0;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
 		return ret;
-	}
+
+	if (f2fs_is_atomic_file(inode))
+		commit_inmem_pages(inode, false);
+
+	ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
+	mnt_drop_write_file(filp);
+	return ret;
+}
+
+static int f2fs_ioc_start_volatile_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	return 0;
+}
+
+static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct super_block *sb = inode->i_sb;
+	struct request_queue *q = bdev_get_queue(sb->s_bdev);
+	struct fstrim_range range;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+				sizeof(range)))
+		return -EFAULT;
+
+	range.minlen = max((unsigned int)range.minlen,
+				q->limits.discard_granularity);
+	ret = f2fs_trim_fs(F2FS_SB(sb), &range);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user((struct fstrim_range __user *)arg, &range,
+				sizeof(range)))
+		return -EFAULT;
+	return 0;
+}
+
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case F2FS_IOC_GETFLAGS:
+		return f2fs_ioc_getflags(filp, arg);
+	case F2FS_IOC_SETFLAGS:
+		return f2fs_ioc_setflags(filp, arg);
+	case F2FS_IOC_START_ATOMIC_WRITE:
+		return f2fs_ioc_start_atomic_write(filp);
+	case F2FS_IOC_COMMIT_ATOMIC_WRITE:
+		return f2fs_ioc_commit_atomic_write(filp);
+	case F2FS_IOC_START_VOLATILE_WRITE:
+		return f2fs_ioc_start_volatile_write(filp);
+	case FITRIM:
+		return f2fs_ioc_fitrim(filp, arg);
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 943a31db7cc3..2a8f4acdb86b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -193,7 +193,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
 	 * selected by background GC before.
 	 * Those segments guarantee they have small valid blocks.
 	 */
-	for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) {
+	for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
 		if (sec_usage_check(sbi, secno))
 			continue;
 		clear_bit(secno, dirty_i->victim_secmap);
@@ -263,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 	unsigned int secno, max_cost;
 	int nsearched = 0;
 
+	mutex_lock(&dirty_i->seglist_lock);
+
 	p.alloc_mode = alloc_mode;
 	select_policy(sbi, gc_type, type, &p);
 
 	p.min_segno = NULL_SEGNO;
 	p.min_cost = max_cost = get_max_cost(sbi, &p);
 
-	mutex_lock(&dirty_i->seglist_lock);
-
 	if (p.alloc_mode == LFS && gc_type == FG_GC) {
 		p.min_segno = check_bg_victims(sbi);
 		if (p.min_segno != NULL_SEGNO)
@@ -281,9 +281,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 		unsigned long cost;
 		unsigned int segno;
 
-		segno = find_next_bit(p.dirty_segmap,
-						TOTAL_SEGS(sbi), p.offset);
-		if (segno >= TOTAL_SEGS(sbi)) {
+		segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
+		if (segno >= MAIN_SEGS(sbi)) {
 			if (sbi->last_victim[p.gc_mode]) {
 				sbi->last_victim[p.gc_mode] = 0;
 				p.offset = 0;
@@ -423,6 +422,12 @@ next_step:
 		if (IS_ERR(node_page))
 			continue;
 
+		/* block may become invalid during get_node_page */
+		if (check_valid_map(sbi, segno, off) == 0) {
+			f2fs_put_page(node_page, 1);
+			continue;
+		}
+
 		/* set page dirty and write it */
 		if (gc_type == FG_GC) {
 			f2fs_wait_on_page_writeback(node_page, NODE);
@@ -531,7 +536,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
 		f2fs_wait_on_page_writeback(page, DATA);
 
 		if (clear_page_dirty_for_io(page))
-			inode_dec_dirty_dents(inode);
+			inode_dec_dirty_pages(inode);
 		set_cold_data(page);
 		do_write_data_page(page, &fio);
 		clear_cold_data(page);
@@ -688,6 +693,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
 	int gc_type = BG_GC;
 	int nfree = 0;
 	int ret = -1;
+	struct cp_control cpc = {
+		.reason = CP_SYNC,
+	};
 
 	INIT_LIST_HEAD(&ilist);
 gc_more:
@@ -698,7 +706,7 @@ gc_more:
 
 	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
 		gc_type = FG_GC;
-		write_checkpoint(sbi, false);
+		write_checkpoint(sbi, &cpc);
 	}
 
 	if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
@@ -723,7 +731,7 @@ gc_more:
 		goto gc_more;
 
 	if (gc_type == FG_GC)
-		write_checkpoint(sbi, false);
+		write_checkpoint(sbi, &cpc);
 stop:
 	mutex_unlock(&sbi->gc_mutex);
 
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3e8ecdf3742b..88036fd75797 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -15,11 +15,13 @@
 
 bool f2fs_may_inline(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	block_t nr_blocks;
 	loff_t i_size;
 
-	if (!test_opt(sbi, INLINE_DATA))
+	if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
+		return false;
+
+	if (f2fs_is_atomic_file(inode))
 		return false;
 
 	nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
@@ -35,7 +37,6 @@ bool f2fs_may_inline(struct inode *inode)
 
 int f2fs_read_inline_data(struct inode *inode, struct page *page)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct page *ipage;
 	void *src_addr, *dst_addr;
 
@@ -44,7 +45,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
 		goto out;
 	}
 
-	ipage = get_node_page(sbi, inode->i_ino);
+	ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
 	if (IS_ERR(ipage)) {
 		unlock_page(page);
 		return PTR_ERR(ipage);
@@ -73,7 +74,7 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
 	struct dnode_of_data dn;
 	void *src_addr, *dst_addr;
 	block_t new_blk_addr;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_io_info fio = {
 		.type = DATA,
 		.rw = WRITE_SYNC | REQ_PRIO,
@@ -189,13 +190,12 @@ int f2fs_write_inline_data(struct inode *inode,
 
 void truncate_inline_data(struct inode *inode, u64 from)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct page *ipage;
 
 	if (from >= MAX_INLINE_DATA)
 		return;
 
-	ipage = get_node_page(sbi, inode->i_ino);
+	ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
 	if (IS_ERR(ipage))
 		return;
 
@@ -209,7 +209,7 @@ void truncate_inline_data(struct inode *inode, u64 from)
 
 bool recover_inline_data(struct inode *inode, struct page *npage)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode *ri = NULL;
 	void *src_addr, *dst_addr;
 	struct page *ipage;
@@ -229,7 +229,7 @@ bool recover_inline_data(struct inode *inode, struct page *npage)
 			ri && (ri->i_inline & F2FS_INLINE_DATA)) {
 process_inline:
 		ipage = get_node_page(sbi, inode->i_ino);
-		f2fs_bug_on(IS_ERR(ipage));
+		f2fs_bug_on(sbi, IS_ERR(ipage));
 
 		f2fs_wait_on_page_writeback(ipage, NODE);
 
@@ -243,7 +243,7 @@ process_inline:
 
 	if (f2fs_has_inline_data(inode)) {
 		ipage = get_node_page(sbi, inode->i_ino);
-		f2fs_bug_on(IS_ERR(ipage));
+		f2fs_bug_on(sbi, IS_ERR(ipage));
 		f2fs_wait_on_page_writeback(ipage, NODE);
 		zero_user_segment(ipage, INLINE_DATA_OFFSET,
 				 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2c39999f3868..0deead4505e7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -69,7 +69,7 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
 
 static int do_read_inode(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct page *node_page;
 	struct f2fs_inode *ri;
@@ -218,7 +218,7 @@ void update_inode(struct inode *inode, struct page *node_page)
 
 void update_inode_page(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct page *node_page;
 retry:
 	node_page = get_node_page(sbi, inode->i_ino);
@@ -238,7 +238,7 @@ retry:
 
 int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
 			inode->i_ino == F2FS_META_INO(sbi))
@@ -266,9 +266,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
  */
 void f2fs_evict_inode(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
 
+	/* some remained atomic pages should discarded */
+	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+		commit_inmem_pages(inode, true);
+
 	trace_f2fs_evict_inode(inode);
 	truncate_inode_pages_final(&inode->i_data);
 
@@ -276,7 +280,7 @@ void f2fs_evict_inode(struct inode *inode)
 			inode->i_ino == F2FS_META_INO(sbi))
 		goto out_clear;
 
-	f2fs_bug_on(get_dirty_dents(inode));
+	f2fs_bug_on(sbi, get_dirty_pages(inode));
 	remove_dirty_dir_inode(inode);
 
 	if (inode->i_nlink || is_bad_inode(inode))
@@ -306,3 +310,26 @@ no_delete:
 out_clear:
 	clear_inode(inode);
 }
+
+/* caller should call f2fs_lock_op() */
+void handle_failed_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+	clear_nlink(inode);
+	make_bad_inode(inode);
+	unlock_new_inode(inode);
+
+	i_size_write(inode, 0);
+	if (F2FS_HAS_BLOCKS(inode))
+		f2fs_truncate(inode);
+
+	remove_inode_page(inode);
+	stat_dec_inline_inode(inode);
+
+	alloc_nid_failed(sbi, inode->i_ino);
+	f2fs_unlock_op(sbi);
+
+	/* iput will drop the inode object */
+	iput(inode);
+}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index ee103fd7283c..0d2526e5aa11 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -23,7 +23,7 @@
 
 static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	nid_t ino;
 	struct inode *inode;
 	bool nid_free = false;
@@ -102,7 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
 static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 						bool excl)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
 	nid_t ino = 0;
 	int err;
@@ -123,9 +123,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 
 	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
-	f2fs_unlock_op(sbi);
 	if (err)
 		goto out;
+	f2fs_unlock_op(sbi);
 
 	alloc_nid_done(sbi, ino);
 
@@ -133,9 +133,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	unlock_new_inode(inode);
 	return 0;
 out:
-	clear_nlink(inode);
-	iget_failed(inode);
-	alloc_nid_failed(sbi, ino);
+	handle_failed_inode(inode);
 	return err;
 }
 
@@ -143,7 +141,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
 		struct dentry *dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	int err;
 
 	f2fs_balance_fs(sbi);
@@ -154,15 +152,16 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
 	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
 	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
-	f2fs_unlock_op(sbi);
 	if (err)
 		goto out;
+	f2fs_unlock_op(sbi);
 
 	d_instantiate(dentry, inode);
 	return 0;
 out:
 	clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
 	iput(inode);
+	f2fs_unlock_op(sbi);
 	return err;
 }
 
@@ -203,7 +202,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 
 static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode = dentry->d_inode;
 	struct f2fs_dir_entry *de;
 	struct page *page;
@@ -237,7 +236,7 @@ fail:
 static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 					const char *symname)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
 	size_t symlen = strlen(symname) + 1;
 	int err;
@@ -253,9 +252,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 
 	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
-	f2fs_unlock_op(sbi);
 	if (err)
 		goto out;
+	f2fs_unlock_op(sbi);
 
 	err = page_symlink(inode, symname, symlen);
 	alloc_nid_done(sbi, inode->i_ino);
@@ -264,15 +263,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	unlock_new_inode(inode);
 	return err;
 out:
-	clear_nlink(inode);
-	iget_failed(inode);
-	alloc_nid_failed(sbi, inode->i_ino);
+	handle_failed_inode(inode);
 	return err;
 }
 
 static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
 	int err;
 
@@ -290,9 +287,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
 	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
-	f2fs_unlock_op(sbi);
 	if (err)
 		goto out_fail;
+	f2fs_unlock_op(sbi);
 
 	alloc_nid_done(sbi, inode->i_ino);
 
@@ -303,9 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
 out_fail:
 	clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
-	clear_nlink(inode);
-	iget_failed(inode);
-	alloc_nid_failed(sbi, inode->i_ino);
+	handle_failed_inode(inode);
 	return err;
 }
 
@@ -320,7 +315,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
 static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 				umode_t mode, dev_t rdev)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
 	int err = 0;
 
@@ -338,25 +333,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 
 	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
-	f2fs_unlock_op(sbi);
 	if (err)
 		goto out;
+	f2fs_unlock_op(sbi);
 
 	alloc_nid_done(sbi, inode->i_ino);
 	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
 	return 0;
 out:
-	clear_nlink(inode);
-	iget_failed(inode);
-	alloc_nid_failed(sbi, inode->i_ino);
+	handle_failed_inode(inode);
 	return err;
 }
 
 static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct page *old_dir_page;
@@ -480,8 +473,7 @@ out:
 static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 			     struct inode *new_dir, struct dentry *new_dentry)
 {
-	struct super_block *sb = old_dir->i_sb;
-	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct page *old_dir_page, *new_dir_page;
@@ -642,7 +634,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
 
 static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
 	int err;
 
@@ -678,10 +670,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 release_out:
 	release_orphan_inode(sbi);
 out:
-	f2fs_unlock_op(sbi);
-	clear_nlink(inode);
-	iget_failed(inode);
-	alloc_nid_failed(sbi, inode->i_ino);
+	handle_failed_inode(inode);
 	return err;
 }
 
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 45378196e19a..44b8afef43d9 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -54,7 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 static void clear_node_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
-	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
 	unsigned int long flags;
 
 	if (PageDirty(page)) {
@@ -65,7 +64,7 @@ static void clear_node_page_dirty(struct page *page)
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 
 		clear_page_dirty_for_io(page);
-		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
 	}
 	ClearPageUptodate(page);
 }
@@ -92,7 +91,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
 	/* get current nat block page with lock */
 	src_page = get_meta_page(sbi, src_off);
 	dst_page = grab_meta_page(sbi, dst_off);
-	f2fs_bug_on(PageDirty(src_page));
+	f2fs_bug_on(sbi, PageDirty(src_page));
 
 	src_addr = page_address(src_page);
 	dst_addr = page_address(dst_page);
@@ -124,44 +123,99 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
 	kmem_cache_free(nat_entry_slab, e);
 }
 
-int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+						struct nat_entry *ne)
+{
+	nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
+	struct nat_entry_set *head;
+
+	if (get_nat_flag(ne, IS_DIRTY))
+		return;
+retry:
+	head = radix_tree_lookup(&nm_i->nat_set_root, set);
+	if (!head) {
+		head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
+
+		INIT_LIST_HEAD(&head->entry_list);
+		INIT_LIST_HEAD(&head->set_list);
+		head->set = set;
+		head->entry_cnt = 0;
+
+		if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
+			cond_resched();
+			goto retry;
+		}
+	}
+	list_move_tail(&ne->list, &head->entry_list);
+	nm_i->dirty_nat_cnt++;
+	head->entry_cnt++;
+	set_nat_flag(ne, IS_DIRTY, true);
+}
+
+static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+						struct nat_entry *ne)
+{
+	nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
+	struct nat_entry_set *head;
+
+	head = radix_tree_lookup(&nm_i->nat_set_root, set);
+	if (head) {
+		list_move_tail(&ne->list, &nm_i->nat_entries);
+		set_nat_flag(ne, IS_DIRTY, false);
+		head->entry_cnt--;
+		nm_i->dirty_nat_cnt--;
+	}
+}
+
+static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
+		nid_t start, unsigned int nr, struct nat_entry_set **ep)
+{
+	return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
+							start, nr);
+}
+
+bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct nat_entry *e;
-	int is_cp = 1;
+	bool is_cp = true;
 
 	read_lock(&nm_i->nat_tree_lock);
 	e = __lookup_nat_cache(nm_i, nid);
-	if (e && !e->checkpointed)
-		is_cp = 0;
+	if (e && !get_nat_flag(e, IS_CHECKPOINTED))
+		is_cp = false;
 	read_unlock(&nm_i->nat_tree_lock);
 	return is_cp;
 }
 
-bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid)
+bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct nat_entry *e;
-	bool fsync_done = false;
+	bool fsynced = false;
 
 	read_lock(&nm_i->nat_tree_lock);
-	e = __lookup_nat_cache(nm_i, nid);
-	if (e)
-		fsync_done = e->fsync_done;
+	e = __lookup_nat_cache(nm_i, ino);
+	if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
+		fsynced = true;
 	read_unlock(&nm_i->nat_tree_lock);
-	return fsync_done;
+	return fsynced;
 }
 
-void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid)
+bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct nat_entry *e;
+	bool need_update = true;
 
-	write_lock(&nm_i->nat_tree_lock);
-	e = __lookup_nat_cache(nm_i, nid);
-	if (e)
-		e->fsync_done = false;
-	write_unlock(&nm_i->nat_tree_lock);
+	read_lock(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, ino);
+	if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
+			(get_nat_flag(e, IS_CHECKPOINTED) ||
+			 get_nat_flag(e, HAS_FSYNCED_INODE)))
+		need_update = false;
+	read_unlock(&nm_i->nat_tree_lock);
+	return need_update;
 }
 
 static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
@@ -177,7 +231,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
 	}
 	memset(new, 0, sizeof(struct nat_entry));
 	nat_set_nid(new, nid);
-	new->checkpointed = true;
+	nat_reset_flag(new);
 	list_add_tail(&new->list, &nm_i->nat_entries);
 	nm_i->nat_cnt++;
 	return new;
@@ -216,7 +270,7 @@ retry:
 			goto retry;
 		}
 		e->ni = *ni;
-		f2fs_bug_on(ni->blk_addr == NEW_ADDR);
+		f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
 	} else if (new_blkaddr == NEW_ADDR) {
 		/*
 		 * when nid is reallocated,
@@ -224,16 +278,16 @@ retry:
 		 * So, reinitialize it with new information.
 		 */
 		e->ni = *ni;
-		f2fs_bug_on(ni->blk_addr != NULL_ADDR);
+		f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
 	}
 
 	/* sanity check */
-	f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
-	f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
+	f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
+	f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
 			new_blkaddr == NULL_ADDR);
-	f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR &&
+	f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
 			new_blkaddr == NEW_ADDR);
-	f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR &&
+	f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
 			nat_get_blkaddr(e) != NULL_ADDR &&
 			new_blkaddr == NEW_ADDR);
 
@@ -245,12 +299,17 @@ retry:
 
 	/* change address */
 	nat_set_blkaddr(e, new_blkaddr);
+	if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
+		set_nat_flag(e, IS_CHECKPOINTED, false);
 	__set_nat_cache_dirty(nm_i, e);
 
 	/* update fsync_mark if its inode nat entry is still alive */
 	e = __lookup_nat_cache(nm_i, ni->ino);
-	if (e)
-		e->fsync_done = fsync_done;
+	if (e) {
+		if (fsync_done && ni->nid == ni->ino)
+			set_nat_flag(e, HAS_FSYNCED_INODE, true);
+		set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
+	}
 	write_unlock(&nm_i->nat_tree_lock);
 }
 
@@ -411,7 +470,7 @@ got:
  */
 int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct page *npage[4];
 	struct page *parent;
 	int offset[4];
@@ -504,15 +563,15 @@ release_out:
 
 static void truncate_node(struct dnode_of_data *dn)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct node_info ni;
 
 	get_node_info(sbi, dn->nid, &ni);
 	if (dn->inode->i_blocks == 0) {
-		f2fs_bug_on(ni.blk_addr != NULL_ADDR);
+		f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR);
 		goto invalidate;
 	}
-	f2fs_bug_on(ni.blk_addr == NULL_ADDR);
+	f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
 
 	/* Deallocate node address */
 	invalidate_blocks(sbi, ni.blk_addr);
@@ -540,14 +599,13 @@ invalidate:
 
 static int truncate_dnode(struct dnode_of_data *dn)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
 	struct page *page;
 
 	if (dn->nid == 0)
 		return 1;
 
 	/* get direct node */
-	page = get_node_page(sbi, dn->nid);
+	page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
 	if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
 		return 1;
 	else if (IS_ERR(page))
@@ -564,7 +622,6 @@ static int truncate_dnode(struct dnode_of_data *dn)
 static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
 						int ofs, int depth)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
 	struct dnode_of_data rdn = *dn;
 	struct page *page;
 	struct f2fs_node *rn;
@@ -578,7 +635,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
 
 	trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
 
-	page = get_node_page(sbi, dn->nid);
+	page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
 	if (IS_ERR(page)) {
 		trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
 		return PTR_ERR(page);
@@ -636,7 +693,6 @@ out_err:
 static int truncate_partial_nodes(struct dnode_of_data *dn,
 			struct f2fs_inode *ri, int *offset, int depth)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
 	struct page *pages[2];
 	nid_t nid[3];
 	nid_t child_nid;
@@ -651,7 +707,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
 	/* get indirect nodes in the path */
 	for (i = 0; i < idx + 1; i++) {
 		/* reference count'll be increased */
-		pages[i] = get_node_page(sbi, nid[i]);
+		pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
 		if (IS_ERR(pages[i])) {
 			err = PTR_ERR(pages[i]);
 			idx = i - 1;
@@ -696,7 +752,7 @@ fail:
  */
 int truncate_inode_blocks(struct inode *inode, pgoff_t from)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int err = 0, cont = 1;
 	int level, offset[4], noffset[4];
 	unsigned int nofs = 0;
@@ -792,7 +848,7 @@ fail:
 
 int truncate_xattr_node(struct inode *inode, struct page *page)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	nid_t nid = F2FS_I(inode)->i_xattr_nid;
 	struct dnode_of_data dn;
 	struct page *npage;
@@ -840,7 +896,8 @@ void remove_inode_page(struct inode *inode)
 		truncate_data_blocks_range(&dn, 1);
 
 	/* 0 is possible, after f2fs_new_inode() has failed */
-	f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
+	f2fs_bug_on(F2FS_I_SB(inode),
+			inode->i_blocks != 0 && inode->i_blocks != 1);
 
 	/* will put inode & node pages */
 	truncate_node(&dn);
@@ -860,7 +917,7 @@ struct page *new_inode_page(struct inode *inode)
 struct page *new_node_page(struct dnode_of_data *dn,
 				unsigned int ofs, struct page *ipage)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct node_info old_ni, new_ni;
 	struct page *page;
 	int err;
@@ -880,7 +937,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
 	get_node_info(sbi, dn->nid, &old_ni);
 
 	/* Reinitialize old_ni with new node page */
-	f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
+	f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
 	new_ni = old_ni;
 	new_ni.ino = dn->inode->i_ino;
 	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
@@ -918,7 +975,7 @@ fail:
  */
 static int read_node_page(struct page *page, int rw)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 	struct node_info ni;
 
 	get_node_info(sbi, page->index, &ni);
@@ -994,7 +1051,7 @@ got_it:
  */
 struct page *get_node_page_ra(struct page *parent, int start)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
 	struct blk_plug plug;
 	struct page *page;
 	int err, i, end;
@@ -1124,10 +1181,14 @@ continue_unlock:
 
 			/* called by fsync() */
 			if (ino && IS_DNODE(page)) {
-				int mark = !is_checkpointed_node(sbi, ino);
 				set_fsync_mark(page, 1);
-				if (IS_INODE(page))
-					set_dentry_mark(page, mark);
+				if (IS_INODE(page)) {
+					if (!is_checkpointed_node(sbi, ino) &&
+						!has_fsynced_inode(sbi, ino))
+						set_dentry_mark(page, 1);
+					else
+						set_dentry_mark(page, 0);
+				}
 				nwritten++;
 			} else {
 				set_fsync_mark(page, 0);
@@ -1206,7 +1267,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
 static int f2fs_write_node_page(struct page *page,
 				struct writeback_control *wbc)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 	nid_t nid;
 	block_t new_addr;
 	struct node_info ni;
@@ -1226,7 +1287,7 @@ static int f2fs_write_node_page(struct page *page,
 
 	/* get old block addr of this node page */
 	nid = nid_of_node(page);
-	f2fs_bug_on(page->index != nid);
+	f2fs_bug_on(sbi, page->index != nid);
 
 	get_node_info(sbi, nid, &ni);
 
@@ -1257,7 +1318,7 @@ redirty_out:
 static int f2fs_write_node_pages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
 	long diff;
 
 	trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -1282,15 +1343,12 @@ skip_write:
 
 static int f2fs_set_node_page_dirty(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
-	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-
 	trace_f2fs_set_page_dirty(page, NODE);
 
 	SetPageUptodate(page);
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
-		inc_page_count(sbi, F2FS_DIRTY_NODES);
+		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
 		SetPagePrivate(page);
 		return 1;
 	}
@@ -1301,9 +1359,8 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
 				      unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	if (PageDirty(page))
-		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
 	ClearPagePrivate(page);
 }
 
@@ -1356,7 +1413,8 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 		read_lock(&nm_i->nat_tree_lock);
 		ne = __lookup_nat_cache(nm_i, nid);
 		if (ne &&
-			(!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR))
+			(!get_nat_flag(ne, IS_CHECKPOINTED) ||
+				nat_get_blkaddr(ne) != NULL_ADDR))
 			allocated = true;
 		read_unlock(&nm_i->nat_tree_lock);
 		if (allocated)
@@ -1413,7 +1471,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
 			break;
 
 		blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
-		f2fs_bug_on(blk_addr == NEW_ADDR);
+		f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
 		if (blk_addr == NULL_ADDR) {
 			if (add_free_nid(sbi, start_nid, true) < 0)
 				break;
@@ -1483,12 +1541,12 @@ retry:
 
 	/* We should not use stale free nids created by build_free_nids */
 	if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
-		f2fs_bug_on(list_empty(&nm_i->free_nid_list));
+		f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
 		list_for_each_entry(i, &nm_i->free_nid_list, list)
 			if (i->state == NID_NEW)
 				break;
 
-		f2fs_bug_on(i->state != NID_NEW);
+		f2fs_bug_on(sbi, i->state != NID_NEW);
 		*nid = i->nid;
 		i->state = NID_ALLOC;
 		nm_i->fcnt--;
@@ -1514,7 +1572,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
 
 	spin_lock(&nm_i->free_nid_list_lock);
 	i = __lookup_free_nid_list(nm_i, nid);
-	f2fs_bug_on(!i || i->state != NID_ALLOC);
+	f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
 	__del_from_free_nid_list(nm_i, i);
 	spin_unlock(&nm_i->free_nid_list_lock);
 
@@ -1535,7 +1593,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
 
 	spin_lock(&nm_i->free_nid_list_lock);
 	i = __lookup_free_nid_list(nm_i, nid);
-	f2fs_bug_on(!i || i->state != NID_ALLOC);
+	f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
 	if (!available_free_memory(sbi, FREE_NIDS)) {
 		__del_from_free_nid_list(nm_i, i);
 		need_free = true;
@@ -1551,14 +1609,13 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
 
 void recover_inline_xattr(struct inode *inode, struct page *page)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	void *src_addr, *dst_addr;
 	size_t inline_size;
 	struct page *ipage;
 	struct f2fs_inode *ri;
 
-	ipage = get_node_page(sbi, inode->i_ino);
-	f2fs_bug_on(IS_ERR(ipage));
+	ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+	f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
 
 	ri = F2FS_INODE(page);
 	if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
@@ -1579,7 +1636,7 @@ update_inode:
 
 void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
 	nid_t new_xnid = nid_of_node(page);
 	struct node_info ni;
@@ -1590,7 +1647,7 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
 
 	/* Deallocate node address */
 	get_node_info(sbi, prev_xnid, &ni);
-	f2fs_bug_on(ni.blk_addr == NULL_ADDR);
+	f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
 	invalidate_blocks(sbi, ni.blk_addr);
 	dec_valid_node_count(sbi, inode);
 	set_node_addr(sbi, &ni, NULL_ADDR, false);
@@ -1598,7 +1655,7 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
 recover_xnid:
 	/* 2: allocate new xattr nid */
 	if (unlikely(!inc_valid_node_count(sbi, inode)))
-		f2fs_bug_on(1);
+		f2fs_bug_on(sbi, 1);
 
 	remove_free_nid(NM_I(sbi), new_xnid);
 	get_node_info(sbi, new_xnid, &ni);
@@ -1691,7 +1748,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
 	struct f2fs_summary *sum_entry;
 	struct inode *inode = sbi->sb->s_bdev->bd_inode;
 	block_t addr;
-	int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+	int bio_blocks = MAX_BIO_BLOCKS(sbi);
 	struct page *pages[bio_blocks];
 	int i, idx, last_offset, nrpages, err = 0;
 
@@ -1733,89 +1790,6 @@ skip:
 	return err;
 }
 
-static struct nat_entry_set *grab_nat_entry_set(void)
-{
-	struct nat_entry_set *nes =
-			f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
-
-	nes->entry_cnt = 0;
-	INIT_LIST_HEAD(&nes->set_list);
-	INIT_LIST_HEAD(&nes->entry_list);
-	return nes;
-}
-
-static void release_nat_entry_set(struct nat_entry_set *nes,
-						struct f2fs_nm_info *nm_i)
-{
-	f2fs_bug_on(!list_empty(&nes->entry_list));
-
-	nm_i->dirty_nat_cnt -= nes->entry_cnt;
-	list_del(&nes->set_list);
-	kmem_cache_free(nat_entry_set_slab, nes);
-}
-
-static void adjust_nat_entry_set(struct nat_entry_set *nes,
-						struct list_head *head)
-{
-	struct nat_entry_set *next = nes;
-
-	if (list_is_last(&nes->set_list, head))
-		return;
-
-	list_for_each_entry_continue(next, head, set_list)
-		if (nes->entry_cnt <= next->entry_cnt)
-			break;
-
-	list_move_tail(&nes->set_list, &next->set_list);
-}
-
-static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
-{
-	struct nat_entry_set *nes;
-	nid_t start_nid = START_NID(ne->ni.nid);
-
-	list_for_each_entry(nes, head, set_list) {
-		if (nes->start_nid == start_nid) {
-			list_move_tail(&ne->list, &nes->entry_list);
-			nes->entry_cnt++;
-			adjust_nat_entry_set(nes, head);
-			return;
-		}
-	}
-
-	nes = grab_nat_entry_set();
-
-	nes->start_nid = start_nid;
-	list_move_tail(&ne->list, &nes->entry_list);
-	nes->entry_cnt++;
-	list_add(&nes->set_list, head);
-}
-
-static void merge_nats_in_set(struct f2fs_sb_info *sbi)
-{
-	struct f2fs_nm_info *nm_i = NM_I(sbi);
-	struct list_head *dirty_list = &nm_i->dirty_nat_entries;
-	struct list_head *set_list = &nm_i->nat_entry_set;
-	struct nat_entry *ne, *tmp;
-
-	write_lock(&nm_i->nat_tree_lock);
-	list_for_each_entry_safe(ne, tmp, dirty_list, list) {
-		if (nat_get_blkaddr(ne) == NEW_ADDR)
-			continue;
-		add_nat_entry(ne, set_list);
-		nm_i->dirty_nat_cnt++;
-	}
-	write_unlock(&nm_i->nat_tree_lock);
-}
-
-static bool __has_cursum_space(struct f2fs_summary_block *sum, int size)
-{
-	if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES)
-		return true;
-	else
-		return false;
-}
-
 static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1850,99 +1824,130 @@ found:
 	mutex_unlock(&curseg->curseg_mutex);
 }
 
-/*
- * This function is called during the checkpointing process.
- */
-void flush_nat_entries(struct f2fs_sb_info *sbi)
+static void __adjust_nat_entry_set(struct nat_entry_set *nes,
+						struct list_head *head, int max)
 {
-	struct f2fs_nm_info *nm_i = NM_I(sbi);
-	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
-	struct f2fs_summary_block *sum = curseg->sum_blk;
-	struct nat_entry_set *nes, *tmp;
-	struct list_head *head = &nm_i->nat_entry_set;
-	bool to_journal = true;
+	struct nat_entry_set *cur;
 
-	/* merge nat entries of dirty list to nat entry set temporarily */
-	merge_nats_in_set(sbi);
+	if (nes->entry_cnt >= max)
+		goto add_out;
 
-	/*
-	 * if there are no enough space in journal to store dirty nat
-	 * entries, remove all entries from journal and merge them
-	 * into nat entry set.
-	 */
-	if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) {
-		remove_nats_in_journal(sbi);
-
-		/*
-		 * merge nat entries of dirty list to nat entry set temporarily
-		 */
-		merge_nats_in_set(sbi);
+	list_for_each_entry(cur, head, set_list) {
+		if (cur->entry_cnt >= nes->entry_cnt) {
+			list_add(&nes->set_list, cur->set_list.prev);
+			return;
+		}
 	}
+add_out:
+	list_add_tail(&nes->set_list, head);
+}
 
-	if (!nm_i->dirty_nat_cnt)
-		return;
+static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
+					struct nat_entry_set *set)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
+	bool to_journal = true;
+	struct f2fs_nat_block *nat_blk;
+	struct nat_entry *ne, *cur;
+	struct page *page = NULL;
 
 	/*
 	 * there are two steps to flush nat entries:
 	 * #1, flush nat entries to journal in current hot data summary block.
 	 * #2, flush nat entries to nat page.
 	 */
-	list_for_each_entry_safe(nes, tmp, head, set_list) {
-		struct f2fs_nat_block *nat_blk;
-		struct nat_entry *ne, *cur;
-		struct page *page;
-		nid_t start_nid = nes->start_nid;
+	if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
+		to_journal = false;
 
-		if (to_journal && !__has_cursum_space(sum, nes->entry_cnt))
-			to_journal = false;
+	if (to_journal) {
+		mutex_lock(&curseg->curseg_mutex);
+	} else {
+		page = get_next_nat_page(sbi, start_nid);
+		nat_blk = page_address(page);
+		f2fs_bug_on(sbi, !nat_blk);
+	}
+
+	/* flush dirty nats in nat entry set */
+	list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
+		struct f2fs_nat_entry *raw_ne;
+		nid_t nid = nat_get_nid(ne);
+		int offset;
+
+		if (nat_get_blkaddr(ne) == NEW_ADDR)
+			continue;
 
 		if (to_journal) {
-			mutex_lock(&curseg->curseg_mutex);
+			offset = lookup_journal_in_cursum(sum,
+							NAT_JOURNAL, nid, 1);
+			f2fs_bug_on(sbi, offset < 0);
+			raw_ne = &nat_in_journal(sum, offset);
+			nid_in_journal(sum, offset) = cpu_to_le32(nid);
 		} else {
-			page = get_next_nat_page(sbi, start_nid);
-			nat_blk = page_address(page);
-			f2fs_bug_on(!nat_blk);
+			raw_ne = &nat_blk->entries[nid - start_nid];
 		}
+		raw_nat_from_node_info(raw_ne, &ne->ni);
 
-		/* flush dirty nats in nat entry set */
-		list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
-			struct f2fs_nat_entry *raw_ne;
-			nid_t nid = nat_get_nid(ne);
-			int offset;
+		write_lock(&NM_I(sbi)->nat_tree_lock);
+		nat_reset_flag(ne);
+		__clear_nat_cache_dirty(NM_I(sbi), ne);
+		write_unlock(&NM_I(sbi)->nat_tree_lock);
 
-			if (to_journal) {
-				offset = lookup_journal_in_cursum(sum,
-							NAT_JOURNAL, nid, 1);
-				f2fs_bug_on(offset < 0);
-				raw_ne = &nat_in_journal(sum, offset);
-				nid_in_journal(sum, offset) = cpu_to_le32(nid);
-			} else {
-				raw_ne = &nat_blk->entries[nid - start_nid];
-			}
-			raw_nat_from_node_info(raw_ne, &ne->ni);
+		if (nat_get_blkaddr(ne) == NULL_ADDR)
+			add_free_nid(sbi, nid, false);
+	}
 
-			if (nat_get_blkaddr(ne) == NULL_ADDR &&
-				add_free_nid(sbi, nid, false) <= 0) {
-				write_lock(&nm_i->nat_tree_lock);
-				__del_from_nat_cache(nm_i, ne);
-				write_unlock(&nm_i->nat_tree_lock);
-			} else {
-				write_lock(&nm_i->nat_tree_lock);
-				__clear_nat_cache_dirty(nm_i, ne);
-				write_unlock(&nm_i->nat_tree_lock);
-			}
-		}
+	if (to_journal)
+		mutex_unlock(&curseg->curseg_mutex);
+	else
+		f2fs_put_page(page, 1);
 
-		if (to_journal)
-			mutex_unlock(&curseg->curseg_mutex);
-		else
-			f2fs_put_page(page, 1);
+	if (!set->entry_cnt) {
+		radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+		kmem_cache_free(nat_entry_set_slab, set);
+	}
+}
+
+/*
+ * This function is called during the checkpointing process.
+ */
+void flush_nat_entries(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	struct nat_entry_set *setvec[NATVEC_SIZE];
+	struct nat_entry_set *set, *tmp;
+	unsigned int found;
+	nid_t set_idx = 0;
+	LIST_HEAD(sets);
+
+	/*
+	 * if there are no enough space in journal to store dirty nat
+	 * entries, remove all entries from journal and merge them
+	 * into nat entry set.
+	 */
+	if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+		remove_nats_in_journal(sbi);
 
-		release_nat_entry_set(nes, nm_i);
+	if (!nm_i->dirty_nat_cnt)
+		return;
+
+	while ((found = __gang_lookup_nat_set(nm_i,
+					set_idx, NATVEC_SIZE, setvec))) {
+		unsigned idx;
+		set_idx = setvec[found - 1]->set + 1;
+		for (idx = 0; idx < found; idx++)
+			__adjust_nat_entry_set(setvec[idx], &sets,
+							MAX_NAT_JENTRIES(sum));
 	}
 
-	f2fs_bug_on(!list_empty(head));
-	f2fs_bug_on(nm_i->dirty_nat_cnt);
+	/* flush dirty nats in nat entry set */
+	list_for_each_entry_safe(set, tmp, &sets, set_list)
+		__flush_nat_entry_set(sbi, set);
+
+	f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
 }
 
 static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1969,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
 	INIT_LIST_HEAD(&nm_i->free_nid_list);
 	INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
+	INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
 	INIT_LIST_HEAD(&nm_i->nat_entries);
-	INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
-	INIT_LIST_HEAD(&nm_i->nat_entry_set);
 
 	mutex_init(&nm_i->build_lock);
 	spin_lock_init(&nm_i->free_nid_list_lock);
@@ -2020,14 +2024,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 	/* destroy free nid list */
 	spin_lock(&nm_i->free_nid_list_lock);
 	list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
-		f2fs_bug_on(i->state == NID_ALLOC);
+		f2fs_bug_on(sbi, i->state == NID_ALLOC);
 		__del_from_free_nid_list(nm_i, i);
 		nm_i->fcnt--;
 		spin_unlock(&nm_i->free_nid_list_lock);
 		kmem_cache_free(free_nid_slab, i);
 		spin_lock(&nm_i->free_nid_list_lock);
 	}
-	f2fs_bug_on(nm_i->fcnt);
+	f2fs_bug_on(sbi, nm_i->fcnt);
 	spin_unlock(&nm_i->free_nid_list_lock);
 
 	/* destroy nat cache */
@@ -2039,7 +2043,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 		for (idx = 0; idx < found; idx++)
 			__del_from_nat_cache(nm_i, natvec[idx]);
 	}
-	f2fs_bug_on(nm_i->nat_cnt);
+	f2fs_bug_on(sbi, nm_i->nat_cnt);
 	write_unlock(&nm_i->nat_tree_lock);
 
 	kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 8a116a407599..8d5e6e0dd840 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -39,10 +39,16 @@ struct node_info {
 	unsigned char version;	/* version of the node */
 };
 
+enum {
+	IS_CHECKPOINTED,	/* is it checkpointed before? */
+	HAS_FSYNCED_INODE,	/* is the inode fsynced before? */
+	HAS_LAST_FSYNC,		/* has the latest node fsync mark? */
+	IS_DIRTY,		/* this nat entry is dirty? */
+};
+
 struct nat_entry {
 	struct list_head list;	/* for clean or dirty nat list */
-	bool checkpointed;	/* whether it is checkpointed or not */
-	bool fsync_done;	/* whether the latest node has fsync mark */
+	unsigned char flag;	/* for node information bits */
 	struct node_info ni;	/* in-memory node information */
 };
 
@@ -55,18 +61,32 @@ struct nat_entry {
 #define nat_get_version(nat)		(nat->ni.version)
 #define nat_set_version(nat, v)		(nat->ni.version = v)
 
-#define __set_nat_cache_dirty(nm_i, ne)					\
-	do {								\
-		ne->checkpointed = false;				\
-		list_move_tail(&ne->list, &nm_i->dirty_nat_entries);	\
-	} while (0)
-#define __clear_nat_cache_dirty(nm_i, ne)				\
-	do {								\
-		ne->checkpointed = true;				\
-		list_move_tail(&ne->list, &nm_i->nat_entries);		\
-	} while (0)
 #define inc_node_version(version)	(++version)
 
+static inline void set_nat_flag(struct nat_entry *ne,
+				unsigned int type, bool set)
+{
+	unsigned char mask = 0x01 << type;
+	if (set)
+		ne->flag |= mask;
+	else
+		ne->flag &= ~mask;
+}
+
+static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
+{
+	unsigned char mask = 0x01 << type;
+	return ne->flag & mask;
+}
+
+static inline void nat_reset_flag(struct nat_entry *ne)
+{
+	/* these states can be set only after checkpoint was done */
+	set_nat_flag(ne, IS_CHECKPOINTED, true);
+	set_nat_flag(ne, HAS_FSYNCED_INODE, false);
+	set_nat_flag(ne, HAS_LAST_FSYNC, true);
+}
+
 static inline void node_info_from_raw_nat(struct node_info *ni,
 						struct f2fs_nat_entry *raw_ne)
 {
@@ -90,9 +110,9 @@ enum mem_type {
 };
 
 struct nat_entry_set {
-	struct list_head set_list;	/* link with all nat sets */
+	struct list_head set_list;	/* link with other nat sets */
 	struct list_head entry_list;	/* link with dirty nat entries */
-	nid_t start_nid;		/* start nid of nats in set */
+	nid_t set;			/* set number*/
 	unsigned int entry_cnt;		/* the # of nat entries in set */
 };
 
@@ -110,18 +130,19 @@ struct free_nid {
 	int state;		/* in use or not: NID_NEW or NID_ALLOC */
 };
 
-static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct free_nid *fnid;
 
-	if (nm_i->fcnt <= 0)
-		return -1;
 	spin_lock(&nm_i->free_nid_list_lock);
+	if (nm_i->fcnt <= 0) {
+		spin_unlock(&nm_i->free_nid_list_lock);
+		return;
+	}
 	fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
 	*nid = fnid->nid;
 	spin_unlock(&nm_i->free_nid_list_lock);
-	return 0;
 }
 
 /*
@@ -197,8 +218,7 @@ static inline void copy_node_footer(struct page *dst, struct page *src)
 
 static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
-	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
 	struct f2fs_node *rn = F2FS_NODE(page);
 
 	rn->footer.cp_ver = ckpt->checkpoint_ver;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 756c41cd2582..ebd013225788 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -14,6 +14,37 @@
 #include "node.h"
 #include "segment.h"
 
+/*
+ * Roll forward recovery scenarios.
+ *
+ * [Term] F: fsync_mark, D: dentry_mark
+ *
+ * 1. inode(x) | CP | inode(x) | dnode(F)
+ * -> Update the latest inode(x).
+ *
+ * 2. inode(x) | CP | inode(F) | dnode(F)
+ * -> No problem.
+ *
+ * 3. inode(x) | CP | dnode(F) | inode(x)
+ * -> Recover to the latest dnode(F), and drop the last inode(x)
+ *
+ * 4. inode(x) | CP | dnode(F) | inode(F)
+ * -> No problem.
+ *
+ * 5. CP | inode(x) | dnode(F)
+ * -> The inode(DF) was missing. Should drop this dnode(F).
+ *
+ * 6. CP | inode(DF) | dnode(F)
+ * -> No problem.
+ *
+ * 7. CP | dnode(F) | inode(DF)
+ * -> If f2fs_iget fails, then goto next to find inode(DF).
+ *
+ * 8. CP | dnode(F) | inode(x)
+ * -> If f2fs_iget fails, then goto next to find inode(DF).
+ *    But it will fail due to no inode(DF).
+ */
+
 static struct kmem_cache *fsync_entry_slab;
 
 bool space_for_roll_forward(struct f2fs_sb_info *sbi)
@@ -36,7 +67,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
 	return NULL;
 }
 
-static int recover_dentry(struct page *ipage, struct inode *inode)
+static int recover_dentry(struct inode *inode, struct page *ipage)
 {
 	struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
 	nid_t pino = le32_to_cpu(raw_inode->i_pino);
@@ -75,7 +106,7 @@ retry:
 				err = -EEXIST;
 			goto out_unmap_put;
 		}
-		err = acquire_orphan_inode(F2FS_SB(inode->i_sb));
+		err = acquire_orphan_inode(F2FS_I_SB(inode));
 		if (err) {
 			iput(einode);
 			goto out_unmap_put;
@@ -110,35 +141,28 @@ out:
 	return err;
 }
 
-static int recover_inode(struct inode *inode, struct page *node_page)
+static void recover_inode(struct inode *inode, struct page *page)
 {
-	struct f2fs_inode *raw_inode = F2FS_INODE(node_page);
-
-	if (!IS_INODE(node_page))
-		return 0;
+	struct f2fs_inode *raw = F2FS_INODE(page);
 
-	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
-	i_size_write(inode, le64_to_cpu(raw_inode->i_size));
-	inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
-	inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
-	inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
-	inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
-	inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
-	inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
-
-	if (is_dent_dnode(node_page))
-		return recover_dentry(node_page, inode);
+	inode->i_mode = le16_to_cpu(raw->i_mode);
+	i_size_write(inode, le64_to_cpu(raw->i_size));
+	inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
+	inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
+	inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
+	inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+	inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
+	inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
 
 	f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
-			ino_of_node(node_page), raw_inode->i_name);
-	return 0;
+			ino_of_node(page), F2FS_INODE(page)->i_name);
 }
 
 static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 {
 	unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
 	struct curseg_info *curseg;
-	struct page *page;
+	struct page *page = NULL;
 	block_t blkaddr;
 	int err = 0;
 
@@ -146,20 +170,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 	curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
 	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
-	/* read node page */
-	page = alloc_page(GFP_F2FS_ZERO);
-	if (!page)
-		return -ENOMEM;
-	lock_page(page);
-
 	while (1) {
 		struct fsync_inode_entry *entry;
 
-		err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
-		if (err)
-			return err;
+		if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+			return 0;
 
-		lock_page(page);
+		page = get_meta_page_ra(sbi, blkaddr);
 
 		if (cp_ver != cpver_of_node(page))
 			break;
@@ -180,33 +197,38 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 			}
 
 			/* add this fsync inode to the list */
-			entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
+			entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
 			if (!entry) {
 				err = -ENOMEM;
 				break;
 			}
-
+			/*
+			 * CP | dnode(F) | inode(DF)
+			 * For this case, we should not give up now.
+			 */
 			entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
 			if (IS_ERR(entry->inode)) {
 				err = PTR_ERR(entry->inode);
 				kmem_cache_free(fsync_entry_slab, entry);
+				if (err == -ENOENT)
+					goto next;
 				break;
 			}
 			list_add_tail(&entry->list, head);
 		}
 		entry->blkaddr = blkaddr;
 
-		err = recover_inode(entry->inode, page);
-		if (err && err != -ENOENT)
-			break;
+		if (IS_INODE(page)) {
+			entry->last_inode = blkaddr;
+			if (is_dent_dnode(page))
+				entry->last_dentry = blkaddr;
+		}
 next:
 		/* check next segment */
 		blkaddr = next_blkaddr_of_node(page);
+		f2fs_put_page(page, 1);
 	}
-
-	unlock_page(page);
-	__free_pages(page, 0);
-
+	f2fs_put_page(page, 1);
 	return err;
 }
 
@@ -279,16 +301,30 @@ got_it:
 	ino = ino_of_node(node_page);
 	f2fs_put_page(node_page, 1);
 
-	/* Deallocate previous index in the node page */
-	inode = f2fs_iget(sbi->sb, ino);
-	if (IS_ERR(inode))
-		return PTR_ERR(inode);
+	if (ino != dn->inode->i_ino) {
+		/* Deallocate previous index in the node page */
+		inode = f2fs_iget(sbi->sb, ino);
+		if (IS_ERR(inode))
+			return PTR_ERR(inode);
+	} else {
+		inode = dn->inode;
+	}
 
 	bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
-					le16_to_cpu(sum.ofs_in_node);
+			le16_to_cpu(sum.ofs_in_node);
 
-	truncate_hole(inode, bidx, bidx + 1);
-	iput(inode);
+	if (ino != dn->inode->i_ino) {
+		truncate_hole(inode, bidx, bidx + 1);
+		iput(inode);
+	} else {
+		struct dnode_of_data tdn;
+		set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0);
+		if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
+			return 0;
+		if (tdn.data_blkaddr != NULL_ADDR)
+			truncate_data_blocks_range(&tdn, 1);
+		f2fs_put_page(tdn.node_page, 1);
+	}
 	return 0;
 }
 
@@ -331,8 +367,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	f2fs_wait_on_page_writeback(dn.node_page, NODE);
 
 	get_node_info(sbi, dn.nid, &ni);
-	f2fs_bug_on(ni.ino != ino_of_node(page));
-	f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page));
+	f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
+	f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
 
 	for (; start < end; start++) {
 		block_t src, dest;
@@ -344,7 +380,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 			if (src == NULL_ADDR) {
 				err = reserve_new_block(&dn);
 				/* We should not get -ENOSPC */
-				f2fs_bug_on(err);
+				f2fs_bug_on(sbi, err);
 			}
 
 			/* Check the previous node page having this index */
@@ -386,7 +422,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
 {
 	unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
 	struct curseg_info *curseg;
-	struct page *page;
+	struct page *page = NULL;
 	int err = 0;
 	block_t blkaddr;
 
@@ -394,32 +430,41 @@ static int recover_data(struct f2fs_sb_info *sbi,
 	curseg = CURSEG_I(sbi, type);
 	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
-	/* read node page */
-	page = alloc_page(GFP_F2FS_ZERO);
-	if (!page)
-		return -ENOMEM;
-
-	lock_page(page);
-
 	while (1) {
 		struct fsync_inode_entry *entry;
 
-		err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
-		if (err)
-			return err;
+		if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+			break;
 
-		lock_page(page);
+		page = get_meta_page_ra(sbi, blkaddr);
 
-		if (cp_ver != cpver_of_node(page))
+		if (cp_ver != cpver_of_node(page)) {
+			f2fs_put_page(page, 1);
 			break;
+		}
 
 		entry = get_fsync_inode(head, ino_of_node(page));
 		if (!entry)
 			goto next;
-
+		/*
+		 * inode(x) | CP | inode(x) | dnode(F)
+		 * In this case, we can lose the latest inode(x).
+		 * So, call recover_inode for the inode update.
+		 */
+		if (entry->last_inode == blkaddr)
+			recover_inode(entry->inode, page);
+		if (entry->last_dentry == blkaddr) {
+			err = recover_dentry(entry->inode, page);
+			if (err) {
+				f2fs_put_page(page, 1);
+				break;
+			}
+		}
 		err = do_recover_data(sbi, entry->inode, page, blkaddr);
-		if (err)
+		if (err) {
+			f2fs_put_page(page, 1);
 			break;
+		}
 
 		if (entry->blkaddr == blkaddr) {
 			iput(entry->inode);
@@ -429,11 +474,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
 next:
 		/* check next segment */
 		blkaddr = next_blkaddr_of_node(page);
+		f2fs_put_page(page, 1);
 	}
-
-	unlock_page(page);
-	__free_pages(page, 0);
-
 	if (!err)
 		allocate_new_segments(sbi);
 	return err;
@@ -474,11 +516,15 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 	/* step #2: recover data */
 	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
 	if (!err)
-		f2fs_bug_on(!list_empty(&inode_list));
+		f2fs_bug_on(sbi, !list_empty(&inode_list));
 out:
 	destroy_fsync_dnodes(&inode_list);
 	kmem_cache_destroy(fsync_entry_slab);
 
+	/* truncate meta pages to be used by the recovery */
+	truncate_inode_pages_range(META_MAPPING(sbi),
+			MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+
 	if (err) {
 		truncate_inode_pages_final(NODE_MAPPING(sbi));
 		truncate_inode_pages_final(META_MAPPING(sbi));
@@ -494,8 +540,11 @@ out:
 		set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
 		mutex_unlock(&sbi->cp_mutex);
 	} else if (need_writecp) {
+		struct cp_control cpc = {
+			.reason = CP_SYNC,
+		};
 		mutex_unlock(&sbi->cp_mutex);
-		write_checkpoint(sbi, false);
+		write_checkpoint(sbi, &cpc);
 	} else {
 		mutex_unlock(&sbi->cp_mutex);
 	}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0aa337cd5bba..923cb76fdc46 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -25,6 +25,8 @@
 #define __reverse_ffz(x) __reverse_ffs(~(x))
 
 static struct kmem_cache *discard_entry_slab;
+static struct kmem_cache *sit_entry_set_slab;
+static struct kmem_cache *inmem_entry_slab;
 
 /*
  * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -172,6 +174,60 @@ found_middle:
 	return result + __reverse_ffz(tmp);
 }
 
+void register_inmem_page(struct inode *inode, struct page *page)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct inmem_pages *new;
+
+	new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
+
+	/* add atomic page indices to the list */
+	new->page = page;
+	INIT_LIST_HEAD(&new->list);
+
+	/* increase reference count with clean state */
+	mutex_lock(&fi->inmem_lock);
+	get_page(page);
+	list_add_tail(&new->list, &fi->inmem_pages);
+	mutex_unlock(&fi->inmem_lock);
+}
+
+void commit_inmem_pages(struct inode *inode, bool abort)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct inmem_pages *cur, *tmp;
+	bool submit_bio = false;
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = WRITE_SYNC,
+	};
+
+	f2fs_balance_fs(sbi);
+	f2fs_lock_op(sbi);
+
+	mutex_lock(&fi->inmem_lock);
+	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
+		lock_page(cur->page);
+		if (!abort && cur->page->mapping == inode->i_mapping) {
+			f2fs_wait_on_page_writeback(cur->page, DATA);
+			if (clear_page_dirty_for_io(cur->page))
+				inode_dec_dirty_pages(inode);
+			do_write_data_page(cur->page, &fio);
+			submit_bio = true;
+		}
+		f2fs_put_page(cur->page, 1);
+		list_del(&cur->list);
+		kmem_cache_free(inmem_entry_slab, cur);
+	}
+	if (submit_bio)
+		f2fs_submit_merged_bio(sbi, DATA, WRITE);
+	mutex_unlock(&fi->inmem_lock);
+
+	filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
+	f2fs_unlock_op(sbi);
+}
+
 /*
  * This function balances dirty node and dentry pages.
  * In addition, it controls garbage collection.
@@ -205,24 +261,20 @@ repeat:
 	if (kthread_should_stop())
 		return 0;
 
-	spin_lock(&fcc->issue_lock);
-	if (fcc->issue_list) {
-		fcc->dispatch_list = fcc->issue_list;
-		fcc->issue_list = fcc->issue_tail = NULL;
-	}
-	spin_unlock(&fcc->issue_lock);
-
-	if (fcc->dispatch_list) {
+	if (!llist_empty(&fcc->issue_list)) {
 		struct bio *bio = bio_alloc(GFP_NOIO, 0);
 		struct flush_cmd *cmd, *next;
 		int ret;
 
+		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
+		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
+
 		bio->bi_bdev = sbi->sb->s_bdev;
 		ret = submit_bio_wait(WRITE_FLUSH, bio);
 
-		for (cmd = fcc->dispatch_list; cmd; cmd = next) {
+		llist_for_each_entry_safe(cmd, next,
+					  fcc->dispatch_list, llnode) {
 			cmd->ret = ret;
-			next = cmd->next;
 			complete(&cmd->wait);
 		}
 		bio_put(bio);
@@ -230,7 +282,7 @@ repeat:
 	}
 
 	wait_event_interruptible(*q,
-			kthread_should_stop() || fcc->issue_list);
+		kthread_should_stop() || !llist_empty(&fcc->issue_list));
 	goto repeat;
 }
 
@@ -249,15 +301,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
 		return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
 
 	init_completion(&cmd.wait);
-	cmd.next = NULL;
 
-	spin_lock(&fcc->issue_lock);
-	if (fcc->issue_list)
-		fcc->issue_tail->next = &cmd;
-	else
-		fcc->issue_list = &cmd;
-	fcc->issue_tail = &cmd;
-	spin_unlock(&fcc->issue_lock);
+	llist_add(&cmd.llnode, &fcc->issue_list);
 
 	if (!fcc->dispatch_list)
 		wake_up(&fcc->flush_wait_queue);
@@ -276,8 +321,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
 	fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
 	if (!fcc)
 		return -ENOMEM;
-	spin_lock_init(&fcc->issue_lock);
 	init_waitqueue_head(&fcc->flush_wait_queue);
+	init_llist_head(&fcc->issue_list);
 	SM_I(sbi)->cmd_control_info = fcc;
 	fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
 				"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
@@ -317,6 +362,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
 		struct seg_entry *sentry = get_seg_entry(sbi, segno);
 		enum dirty_type t = sentry->type;
 
+		if (unlikely(t >= DIRTY)) {
+			f2fs_bug_on(sbi, 1);
+			return;
+		}
 		if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
 			dirty_i->nr_dirty[t]++;
 	}
@@ -376,8 +425,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
 static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
 				block_t blkstart, block_t blklen)
 {
-	sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart);
-	sector_t len = SECTOR_FROM_BLOCK(sbi, blklen);
+	sector_t start = SECTOR_FROM_BLOCK(blkstart);
+	sector_t len = SECTOR_FROM_BLOCK(blklen);
 	trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
 	return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
 }
@@ -392,22 +441,48 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
 	}
 }
 
-static void add_discard_addrs(struct f2fs_sb_info *sbi,
-			unsigned int segno, struct seg_entry *se)
+static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct list_head *head = &SM_I(sbi)->discard_list;
 	struct discard_entry *new;
 	int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
 	int max_blocks = sbi->blocks_per_seg;
+	struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
 	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
 	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
 	unsigned long dmap[entries];
 	unsigned int start = 0, end = -1;
+	bool force = (cpc->reason == CP_DISCARD);
 	int i;
 
-	if (!test_opt(sbi, DISCARD))
+	if (!force && !test_opt(sbi, DISCARD))
 		return;
 
+	if (force && !se->valid_blocks) {
+		struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+		/*
+		 * if this segment is registered in the prefree list, then
+		 * we should skip adding a discard candidate, and let the
+		 * checkpoint do that later.
+		 */
+		mutex_lock(&dirty_i->seglist_lock);
+		if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
+			mutex_unlock(&dirty_i->seglist_lock);
+			cpc->trimmed += sbi->blocks_per_seg;
+			return;
+		}
+		mutex_unlock(&dirty_i->seglist_lock);
+
+		new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
+		INIT_LIST_HEAD(&new->list);
+		new->blkaddr = START_BLOCK(sbi, cpc->trim_start);
+		new->len = sbi->blocks_per_seg;
+		list_add_tail(&new->list, head);
+		SM_I(sbi)->nr_discards += sbi->blocks_per_seg;
+		cpc->trimmed += sbi->blocks_per_seg;
+		return;
+	}
+
 	/* zero block will be discarded through the prefree list */
 	if (!se->valid_blocks || se->valid_blocks == max_blocks)
 		return;
@@ -416,23 +491,39 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi,
 	for (i = 0; i < entries; i++)
 		dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
 
-	while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
+	while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
 		start = __find_rev_next_bit(dmap, max_blocks, end + 1);
 		if (start >= max_blocks)
 			break;
 
 		end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
 
+		if (end - start < cpc->trim_minlen)
+			continue;
+
 		new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
 		INIT_LIST_HEAD(&new->list);
-		new->blkaddr = START_BLOCK(sbi, segno) + start;
+		new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
 		new->len = end - start;
+		cpc->trimmed += end - start;
 
 		list_add_tail(&new->list, head);
 		SM_I(sbi)->nr_discards += end - start;
 	}
 }
 
+void release_discard_addrs(struct f2fs_sb_info *sbi)
+{
+	struct list_head *head = &(SM_I(sbi)->discard_list);
+	struct discard_entry *entry, *this;
+
+	/* drop caches */
+	list_for_each_entry_safe(entry, this, head, list) {
+		list_del(&entry->list);
+		kmem_cache_free(discard_entry_slab, entry);
+	}
+}
+
 /*
  * Should call clear_prefree_segments after checkpoint is done.
  */
@@ -440,10 +531,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	unsigned int segno;
-	unsigned int total_segs = TOTAL_SEGS(sbi);
 
 	mutex_lock(&dirty_i->seglist_lock);
-	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs)
+	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
 		__set_test_and_free(sbi, segno);
 	mutex_unlock(&dirty_i->seglist_lock);
 }
@@ -454,17 +544,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
 	struct discard_entry *entry, *this;
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
-	unsigned int total_segs = TOTAL_SEGS(sbi);
 	unsigned int start = 0, end = -1;
 
 	mutex_lock(&dirty_i->seglist_lock);
 
 	while (1) {
 		int i;
-		start = find_next_bit(prefree_map, total_segs, end + 1);
-		if (start >= total_segs)
+		start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
+		if (start >= MAIN_SEGS(sbi))
 			break;
-		end = find_next_zero_bit(prefree_map, total_segs, start + 1);
+		end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
+								start + 1);
 
 		for (i = start; i < end; i++)
 			clear_bit(i, prefree_map);
@@ -488,11 +578,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
 	}
 }
 
-static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
+static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
 {
 	struct sit_info *sit_i = SIT_I(sbi);
-	if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap))
+
+	if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) {
 		sit_i->dirty_sentries++;
+		return false;
+	}
+
+	return true;
 }
 
 static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
@@ -516,7 +611,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
 	new_vblocks = se->valid_blocks + del;
 	offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
 
-	f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) ||
+	f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) ||
 				(new_vblocks > sbi->blocks_per_seg)));
 
 	se->valid_blocks = new_vblocks;
@@ -526,10 +621,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
 	/* Update valid block bitmap */
 	if (del > 0) {
 		if (f2fs_set_bit(offset, se->cur_valid_map))
-			BUG();
+			f2fs_bug_on(sbi, 1);
 	} else {
 		if (!f2fs_clear_bit(offset, se->cur_valid_map))
-			BUG();
+			f2fs_bug_on(sbi, 1);
 	}
 	if (!f2fs_test_bit(offset, se->ckpt_valid_map))
 		se->ckpt_valid_blocks += del;
@@ -558,7 +653,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
 	unsigned int segno = GET_SEGNO(sbi, addr);
 	struct sit_info *sit_i = SIT_I(sbi);
 
-	f2fs_bug_on(addr == NULL_ADDR);
+	f2fs_bug_on(sbi, addr == NULL_ADDR);
 	if (addr == NEW_ADDR)
 		return;
 
@@ -634,7 +729,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
 	unsigned int segno = curseg->segno + 1;
 	struct free_segmap_info *free_i = FREE_I(sbi);
 
-	if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
+	if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
 		return !test_bit(segno, free_i->free_segmap);
 	return 0;
 }
@@ -648,7 +743,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
 {
 	struct free_segmap_info *free_i = FREE_I(sbi);
 	unsigned int segno, secno, zoneno;
-	unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone;
+	unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
 	unsigned int hint = *newseg / sbi->segs_per_sec;
 	unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
 	unsigned int left_start = hint;
@@ -660,18 +755,18 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
 
 	if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
 		segno = find_next_zero_bit(free_i->free_segmap,
-					TOTAL_SEGS(sbi), *newseg + 1);
+					MAIN_SEGS(sbi), *newseg + 1);
 		if (segno - *newseg < sbi->segs_per_sec -
 					(*newseg % sbi->segs_per_sec))
 			goto got_it;
 	}
 find_other_zone:
-	secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint);
-	if (secno >= TOTAL_SECS(sbi)) {
+	secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
+	if (secno >= MAIN_SECS(sbi)) {
 		if (dir == ALLOC_RIGHT) {
 			secno = find_next_zero_bit(free_i->free_secmap,
-							TOTAL_SECS(sbi), 0);
-			f2fs_bug_on(secno >= TOTAL_SECS(sbi));
+							MAIN_SECS(sbi), 0);
+			f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
 		} else {
 			go_left = 1;
 			left_start = hint - 1;
@@ -686,8 +781,8 @@ find_other_zone:
 			continue;
 		}
 		left_start = find_next_zero_bit(free_i->free_secmap,
-							TOTAL_SECS(sbi), 0);
-		f2fs_bug_on(left_start >= TOTAL_SECS(sbi));
+							MAIN_SECS(sbi), 0);
+		f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
 		break;
 	}
 	secno = left_start;
@@ -726,7 +821,7 @@ skip_left:
 	}
 got_it:
 	/* set it as dirty segment in free segmap */
-	f2fs_bug_on(test_bit(segno, free_i->free_segmap));
+	f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
 	__set_inuse(sbi, segno);
 	*newseg = segno;
 	write_unlock(&free_i->segmap_lock);
@@ -898,6 +993,37 @@ static const struct segment_allocation default_salloc_ops = {
 	.allocate_segment = allocate_segment_by_default,
 };
 
+int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
+{
+	__u64 start = range->start >> sbi->log_blocksize;
+	__u64 end = start + (range->len >> sbi->log_blocksize) - 1;
+	unsigned int start_segno, end_segno;
+	struct cp_control cpc;
+
+	if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
+						range->len < sbi->blocksize)
+		return -EINVAL;
+
+	if (end <= MAIN_BLKADDR(sbi))
+		goto out;
+
+	/* start/end segment number in main_area */
+	start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
+	end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
+						GET_SEGNO(sbi, end);
+	cpc.reason = CP_DISCARD;
+	cpc.trim_start = start_segno;
+	cpc.trim_end = end_segno;
+	cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
+	cpc.trimmed = 0;
+
+	/* do checkpoint to issue discard commands safely */
+	write_checkpoint(sbi, &cpc);
+out:
+	range->len = cpc.trimmed << sbi->log_blocksize;
+	return 0;
+}
+
 static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -953,15 +1079,15 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
 
 static int __get_segment_type(struct page *page, enum page_type p_type)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
-	switch (sbi->active_logs) {
+	switch (F2FS_P_SB(page)->active_logs) {
 	case 2:
 		return __get_segment_type_2(page, p_type);
 	case 4:
 		return __get_segment_type_4(page, p_type);
 	}
 	/* NR_CURSEG_TYPE(6) logs by default */
-	f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE);
+	f2fs_bug_on(F2FS_P_SB(page),
+		F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE);
 	return __get_segment_type_6(page, p_type);
 }
 
@@ -1041,11 +1167,11 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
 void write_data_page(struct page *page, struct dnode_of_data *dn,
 		block_t *new_blkaddr, struct f2fs_io_info *fio)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct f2fs_summary sum;
 	struct node_info ni;
 
-	f2fs_bug_on(dn->data_blkaddr == NULL_ADDR);
+	f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
 
@@ -1055,9 +1181,7 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
 void rewrite_data_page(struct page *page, block_t old_blkaddr,
 					struct f2fs_io_info *fio)
 {
-	struct inode *inode = page->mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
+	f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
 }
 
 void recover_data_page(struct f2fs_sb_info *sbi,
@@ -1130,8 +1254,9 @@ out:
 void f2fs_wait_on_page_writeback(struct page *page,
 				enum page_type type)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
 	if (PageWriteback(page)) {
+		struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+
 		if (is_merged_page(sbi, page, type))
 			f2fs_submit_merged_bio(sbi, type, WRITE);
 		wait_on_page_writeback(page);
@@ -1400,7 +1525,7 @@ static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
 					unsigned int segno)
 {
 	struct sit_info *sit_i = SIT_I(sbi);
-	unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno);
+	unsigned int offset = SIT_BLOCK_OFFSET(segno);
 	block_t blk_addr = sit_i->sit_base_addr + offset;
 
 	check_seg_range(sbi, segno);
@@ -1426,7 +1551,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
 	/* get current sit block page without lock */
 	src_page = get_meta_page(sbi, src_off);
 	dst_page = grab_meta_page(sbi, dst_off);
-	f2fs_bug_on(PageDirty(src_page));
+	f2fs_bug_on(sbi, PageDirty(src_page));
 
 	src_addr = page_address(src_page);
 	dst_addr = page_address(dst_page);
@@ -1440,101 +1565,192 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
 	return dst_page;
 }
 
-static bool flush_sits_in_journal(struct f2fs_sb_info *sbi)
+static struct sit_entry_set *grab_sit_entry_set(void)
+{
+	struct sit_entry_set *ses =
+			f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
+
+	ses->entry_cnt = 0;
+	INIT_LIST_HEAD(&ses->set_list);
+	return ses;
+}
+
+static void release_sit_entry_set(struct sit_entry_set *ses)
+{
+	list_del(&ses->set_list);
+	kmem_cache_free(sit_entry_set_slab, ses);
+}
+
+static void adjust_sit_entry_set(struct sit_entry_set *ses,
+						struct list_head *head)
+{
+	struct sit_entry_set *next = ses;
+
+	if (list_is_last(&ses->set_list, head))
+		return;
+
+	list_for_each_entry_continue(next, head, set_list)
+		if (ses->entry_cnt <= next->entry_cnt)
+			break;
+
+	list_move_tail(&ses->set_list, &next->set_list);
+}
+
+static void add_sit_entry(unsigned int segno, struct list_head *head)
+{
+	struct sit_entry_set *ses;
+	unsigned int start_segno = START_SEGNO(segno);
+
+	list_for_each_entry(ses, head, set_list) {
+		if (ses->start_segno == start_segno) {
+			ses->entry_cnt++;
+			adjust_sit_entry_set(ses, head);
+			return;
+		}
+	}
+
+	ses = grab_sit_entry_set();
+
+	ses->start_segno = start_segno;
+	ses->entry_cnt++;
+	list_add(&ses->set_list, head);
+}
+
+static void add_sits_in_set(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_sm_info *sm_info = SM_I(sbi);
+	struct list_head *set_list = &sm_info->sit_entry_set;
+	unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap;
+	unsigned int segno;
+
+	for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi))
+		add_sit_entry(segno, set_list);
+}
+
+static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
 	struct f2fs_summary_block *sum = curseg->sum_blk;
 	int i;
 
-	/*
-	 * If the journal area in the current summary is full of sit entries,
-	 * all the sit entries will be flushed. Otherwise the sit entries
-	 * are not able to replace with newly hot sit entries.
-	 */
-	if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) {
-		for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
-			unsigned int segno;
-			segno = le32_to_cpu(segno_in_journal(sum, i));
-			__mark_sit_entry_dirty(sbi, segno);
-		}
-		update_sits_in_cursum(sum, -sits_in_cursum(sum));
-		return true;
+	for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
+		unsigned int segno;
+		bool dirtied;
+
+		segno = le32_to_cpu(segno_in_journal(sum, i));
+		dirtied = __mark_sit_entry_dirty(sbi, segno);
+
+		if (!dirtied)
+			add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
 	}
-	return false;
+	update_sits_in_cursum(sum, -sits_in_cursum(sum));
 }
 
 /*
  * CP calls this function, which flushes SIT entries including sit_journal,
  * and moves prefree segs to free segs.
  */
-void flush_sit_entries(struct f2fs_sb_info *sbi)
+void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct sit_info *sit_i = SIT_I(sbi);
 	unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
 	struct f2fs_summary_block *sum = curseg->sum_blk;
-	unsigned long nsegs = TOTAL_SEGS(sbi);
-	struct page *page = NULL;
-	struct f2fs_sit_block *raw_sit = NULL;
-	unsigned int start = 0, end = 0;
-	unsigned int segno;
-	bool flushed;
+	struct sit_entry_set *ses, *tmp;
+	struct list_head *head = &SM_I(sbi)->sit_entry_set;
+	bool to_journal = true;
+	struct seg_entry *se;
 
 	mutex_lock(&curseg->curseg_mutex);
 	mutex_lock(&sit_i->sentry_lock);
 
 	/*
-	 * "flushed" indicates whether sit entries in journal are flushed
-	 * to the SIT area or not.
+	 * add and account sit entries of dirty bitmap in sit entry
+	 * set temporarily
 	 */
-	flushed = flush_sits_in_journal(sbi);
+	add_sits_in_set(sbi);
 
-	for_each_set_bit(segno, bitmap, nsegs) {
-		struct seg_entry *se = get_seg_entry(sbi, segno);
-		int sit_offset, offset;
+	/*
+	 * if there are no enough space in journal to store dirty sit
+	 * entries, remove all entries from journal and add and account
+	 * them in sit entry set.
+	 */
+	if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
+		remove_sits_in_journal(sbi);
 
-		sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+	if (!sit_i->dirty_sentries)
+		goto out;
 
-		/* add discard candidates */
-		if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards)
-			add_discard_addrs(sbi, segno, se);
+	/*
+	 * there are two steps to flush sit entries:
+	 * #1, flush sit entries to journal in current cold data summary block.
+	 * #2, flush sit entries to sit page.
+	 */
+	list_for_each_entry_safe(ses, tmp, head, set_list) {
+		struct page *page;
+		struct f2fs_sit_block *raw_sit = NULL;
+		unsigned int start_segno = ses->start_segno;
+		unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
+						(unsigned long)MAIN_SEGS(sbi));
+		unsigned int segno = start_segno;
+
+		if (to_journal &&
+			!__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
+			to_journal = false;
+
+		if (!to_journal) {
+			page = get_next_sit_page(sbi, start_segno);
+			raw_sit = page_address(page);
+		}
 
-		if (flushed)
-			goto to_sit_page;
+		/* flush dirty sit entries in region of current sit set */
+		for_each_set_bit_from(segno, bitmap, end) {
+			int offset, sit_offset;
 
-		offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1);
-		if (offset >= 0) {
-			segno_in_journal(sum, offset) = cpu_to_le32(segno);
-			seg_info_to_raw_sit(se, &sit_in_journal(sum, offset));
-			goto flush_done;
-		}
-to_sit_page:
-		if (!page || (start > segno) || (segno > end)) {
-			if (page) {
-				f2fs_put_page(page, 1);
-				page = NULL;
+			se = get_seg_entry(sbi, segno);
+
+			/* add discard candidates */
+			if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
+				cpc->trim_start = segno;
+				add_discard_addrs(sbi, cpc);
 			}
 
-			start = START_SEGNO(sit_i, segno);
-			end = start + SIT_ENTRY_PER_BLOCK - 1;
+			if (to_journal) {
+				offset = lookup_journal_in_cursum(sum,
+							SIT_JOURNAL, segno, 1);
+				f2fs_bug_on(sbi, offset < 0);
+				segno_in_journal(sum, offset) =
+							cpu_to_le32(segno);
+				seg_info_to_raw_sit(se,
+						&sit_in_journal(sum, offset));
+			} else {
+				sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+				seg_info_to_raw_sit(se,
+						&raw_sit->entries[sit_offset]);
+			}
 
-			/* read sit block that will be updated */
-			page = get_next_sit_page(sbi, start);
-			raw_sit = page_address(page);
+			__clear_bit(segno, bitmap);
+			sit_i->dirty_sentries--;
+			ses->entry_cnt--;
 		}
 
-		/* udpate entry in SIT block */
-		seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]);
-flush_done:
-		__clear_bit(segno, bitmap);
-		sit_i->dirty_sentries--;
+		if (!to_journal)
+			f2fs_put_page(page, 1);
+
+		f2fs_bug_on(sbi, ses->entry_cnt);
+		release_sit_entry_set(ses);
+	}
+
+	f2fs_bug_on(sbi, !list_empty(head));
+	f2fs_bug_on(sbi, sit_i->dirty_sentries);
+out:
+	if (cpc->reason == CP_DISCARD) {
+		for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
+			add_discard_addrs(sbi, cpc);
 	}
 	mutex_unlock(&sit_i->sentry_lock);
 	mutex_unlock(&curseg->curseg_mutex);
 
-	/* writeout last modified SIT block */
-	f2fs_put_page(page, 1);
-
 	set_prefree_as_free_segments(sbi);
 }
 
@@ -1554,16 +1770,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 
 	SM_I(sbi)->sit_info = sit_i;
 
-	sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry));
+	sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
 	if (!sit_i->sentries)
 		return -ENOMEM;
 
-	bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
 	sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
 	if (!sit_i->dirty_sentries_bitmap)
 		return -ENOMEM;
 
-	for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+	for (start = 0; start < MAIN_SEGS(sbi); start++) {
 		sit_i->sentries[start].cur_valid_map
 			= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
 		sit_i->sentries[start].ckpt_valid_map
@@ -1574,7 +1790,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 	}
 
 	if (sbi->segs_per_sec > 1) {
-		sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) *
+		sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
 					sizeof(struct sec_entry));
 		if (!sit_i->sec_entries)
 			return -ENOMEM;
@@ -1609,7 +1825,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 
 static int build_free_segmap(struct f2fs_sb_info *sbi)
 {
-	struct f2fs_sm_info *sm_info = SM_I(sbi);
 	struct free_segmap_info *free_i;
 	unsigned int bitmap_size, sec_bitmap_size;
 
@@ -1620,12 +1835,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
 
 	SM_I(sbi)->free_info = free_i;
 
-	bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
 	free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
 	if (!free_i->free_segmap)
 		return -ENOMEM;
 
-	sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
+	sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
 	free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
 	if (!free_i->free_secmap)
 		return -ENOMEM;
@@ -1635,8 +1850,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
 	memset(free_i->free_secmap, 0xff, sec_bitmap_size);
 
 	/* init free segmap information */
-	free_i->start_segno =
-		(unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
+	free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
 	free_i->free_segments = 0;
 	free_i->free_sections = 0;
 	rwlock_init(&free_i->segmap_lock);
@@ -1673,7 +1887,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
 	int sit_blk_cnt = SIT_BLK_CNT(sbi);
 	unsigned int i, start, end;
 	unsigned int readed, start_blk = 0;
-	int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+	int nrpages = MAX_BIO_BLOCKS(sbi);
 
 	do {
 		readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
@@ -1681,7 +1895,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
 		start = start_blk * sit_i->sents_per_block;
 		end = (start_blk + readed) * sit_i->sents_per_block;
 
-		for (; start < end && start < TOTAL_SEGS(sbi); start++) {
+		for (; start < end && start < MAIN_SEGS(sbi); start++) {
 			struct seg_entry *se = &sit_i->sentries[start];
 			struct f2fs_sit_block *sit_blk;
 			struct f2fs_sit_entry sit;
@@ -1719,7 +1933,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
 	unsigned int start;
 	int type;
 
-	for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+	for (start = 0; start < MAIN_SEGS(sbi); start++) {
 		struct seg_entry *sentry = get_seg_entry(sbi, start);
 		if (!sentry->valid_blocks)
 			__set_free(sbi, start);
@@ -1736,18 +1950,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	struct free_segmap_info *free_i = FREE_I(sbi);
-	unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
+	unsigned int segno = 0, offset = 0;
 	unsigned short valid_blocks;
 
 	while (1) {
 		/* find dirty segment based on free segmap */
-		segno = find_next_inuse(free_i, total_segs, offset);
-		if (segno >= total_segs)
+		segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset);
+		if (segno >= MAIN_SEGS(sbi))
 			break;
 		offset = segno + 1;
 		valid_blocks = get_valid_blocks(sbi, segno, 0);
-		if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks)
+		if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
+			continue;
+		if (valid_blocks > sbi->blocks_per_seg) {
+			f2fs_bug_on(sbi, 1);
 			continue;
+		}
 		mutex_lock(&dirty_i->seglist_lock);
 		__locate_dirty_segment(sbi, segno, DIRTY);
 		mutex_unlock(&dirty_i->seglist_lock);
@@ -1757,7 +1975,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
 static int init_victim_secmap(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
+	unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
 
 	dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
 	if (!dirty_i->victim_secmap)
@@ -1778,7 +1996,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
 	SM_I(sbi)->dirty_info = dirty_i;
 	mutex_init(&dirty_i->seglist_lock);
 
-	bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
 
 	for (i = 0; i < NR_DIRTY_TYPE; i++) {
 		dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
@@ -1802,7 +2020,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
 
 	sit_i->min_mtime = LLONG_MAX;
 
-	for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
 		unsigned int i;
 		unsigned long long mtime = 0;
 
@@ -1840,13 +2058,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
 	sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
 	sm_info->rec_prefree_segments = sm_info->main_segments *
 					DEF_RECLAIM_PREFREE_SEGMENTS / 100;
-	sm_info->ipu_policy = F2FS_IPU_DISABLE;
+	sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
 	sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
+	sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
 
 	INIT_LIST_HEAD(&sm_info->discard_list);
 	sm_info->nr_discards = 0;
 	sm_info->max_discards = 0;
 
+	INIT_LIST_HEAD(&sm_info->sit_entry_set);
+
 	if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
 		err = create_flush_cmd_control(sbi);
 		if (err)
@@ -1942,7 +2163,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
 		return;
 
 	if (sit_i->sentries) {
-		for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+		for (start = 0; start < MAIN_SEGS(sbi); start++) {
 			kfree(sit_i->sentries[start].cur_valid_map);
 			kfree(sit_i->sentries[start].ckpt_valid_map);
 		}
@@ -1976,11 +2197,30 @@ int __init create_segment_manager_caches(void)
 	discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
 			sizeof(struct discard_entry));
 	if (!discard_entry_slab)
-		return -ENOMEM;
+		goto fail;
+
+	sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
+			sizeof(struct nat_entry_set));
+	if (!sit_entry_set_slab)
+		goto destory_discard_entry;
+
+	inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
+			sizeof(struct inmem_pages));
+	if (!inmem_entry_slab)
+		goto destroy_sit_entry_set;
 	return 0;
+
+destroy_sit_entry_set:
+	kmem_cache_destroy(sit_entry_set_slab);
+destory_discard_entry:
+	kmem_cache_destroy(discard_entry_slab);
+fail:
+	return -ENOMEM;
 }
 
 void destroy_segment_manager_caches(void)
 {
+	kmem_cache_destroy(sit_entry_set_slab);
 	kmem_cache_destroy(discard_entry_slab);
+	kmem_cache_destroy(inmem_entry_slab);
 }
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index ff483257283b..2495bec1c621 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -45,16 +45,26 @@
 	 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno /		\
 	  sbi->segs_per_sec))	\
 
-#define START_BLOCK(sbi, segno)						\
-	(SM_I(sbi)->seg0_blkaddr +					\
+#define MAIN_BLKADDR(sbi)	(SM_I(sbi)->main_blkaddr)
+#define SEG0_BLKADDR(sbi)	(SM_I(sbi)->seg0_blkaddr)
+
+#define MAIN_SEGS(sbi)	(SM_I(sbi)->main_segments)
+#define MAIN_SECS(sbi)	(sbi->total_sections)
+
+#define TOTAL_SEGS(sbi)	(SM_I(sbi)->segment_count)
+#define TOTAL_BLKS(sbi)	(TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
+
+#define MAX_BLKADDR(sbi)	(SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
+#define SEGMENT_SIZE(sbi)	(1ULL << (sbi->log_blocksize +		\
+					sbi->log_blocks_per_seg))
+
+#define START_BLOCK(sbi, segno)	(SEG0_BLKADDR(sbi) +			\
 	 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
+
 #define NEXT_FREE_BLKADDR(sbi, curseg)					\
 	(START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
 
-#define MAIN_BASE_BLOCK(sbi)	(SM_I(sbi)->main_blkaddr)
-
-#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr)				\
-	((blk_addr) - SM_I(sbi)->seg0_blkaddr)
+#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr)	((blk_addr) - SEG0_BLKADDR(sbi))
 #define GET_SEGNO_FROM_SEG0(sbi, blk_addr)				\
 	(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
 #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr)				\
@@ -77,23 +87,21 @@
 
 #define SIT_ENTRY_OFFSET(sit_i, segno)					\
 	(segno % sit_i->sents_per_block)
-#define SIT_BLOCK_OFFSET(sit_i, segno)					\
+#define SIT_BLOCK_OFFSET(segno)					\
 	(segno / SIT_ENTRY_PER_BLOCK)
-#define	START_SEGNO(sit_i, segno)		\
-	(SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
+#define	START_SEGNO(segno)		\
+	(SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
 #define SIT_BLK_CNT(sbi)			\
-	((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
+	((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
 #define f2fs_bitmap_size(nr)			\
 	(BITS_TO_LONGS(nr) * sizeof(unsigned long))
-#define TOTAL_SEGS(sbi)	(SM_I(sbi)->main_segments)
-#define TOTAL_SECS(sbi)	(sbi->total_sections)
 
-#define SECTOR_FROM_BLOCK(sbi, blk_addr)				\
-	(((sector_t)blk_addr) << (sbi)->log_sectors_per_block)
-#define SECTOR_TO_BLOCK(sbi, sectors)					\
-	(sectors >> (sbi)->log_sectors_per_block)
-#define MAX_BIO_BLOCKS(max_hw_blocks)					\
-	(min((int)max_hw_blocks, BIO_MAX_PAGES))
+#define SECTOR_FROM_BLOCK(blk_addr)					\
+	(((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
+#define SECTOR_TO_BLOCK(sectors)					\
+	(sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
+#define MAX_BIO_BLOCKS(sbi)						\
+	((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
 
 /*
  * indicate a block allocation direction: RIGHT and LEFT.
@@ -167,6 +175,11 @@ struct segment_allocation {
 	void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
 };
 
+struct inmem_pages {
+	struct list_head list;
+	struct page *page;
+};
+
 struct sit_info {
 	const struct segment_allocation *s_ops;
 
@@ -237,6 +250,12 @@ struct curseg_info {
 	unsigned int next_segno;		/* preallocated segment */
 };
 
+struct sit_entry_set {
+	struct list_head set_list;	/* link with all sit sets */
+	unsigned int start_segno;	/* start segno of sits in set */
+	unsigned int entry_cnt;		/* the # of sit entries in set */
+};
+
 /*
  * inline functions
  */
@@ -316,7 +335,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
 	clear_bit(segno, free_i->free_segmap);
 	free_i->free_segments++;
 
-	next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno);
+	next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno);
 	if (next >= start_segno + sbi->segs_per_sec) {
 		clear_bit(secno, free_i->free_secmap);
 		free_i->free_sections++;
@@ -430,8 +449,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
 
 static inline bool need_SSR(struct f2fs_sb_info *sbi)
 {
-	return (prefree_segments(sbi) / sbi->segs_per_sec)
-			+ free_sections(sbi) < overprovision_sections(sbi);
+	int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
+	int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+	return free_sections(sbi) <= (node_secs + 2 * dent_secs +
+						reserved_sections(sbi) + 1);
 }
 
 static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -466,48 +487,47 @@ static inline int utilization(struct f2fs_sb_info *sbi)
  * F2FS_IPU_UTIL - if FS utilization is over threashold,
  * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
  *                     threashold,
+ * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash
+ *                     storages. IPU will be triggered only if the # of dirty
+ *                     pages over min_fsync_blocks.
  * F2FS_IPUT_DISABLE - disable IPU. (=default option)
  */
 #define DEF_MIN_IPU_UTIL	70
+#define DEF_MIN_FSYNC_BLOCKS	8
 
 enum {
 	F2FS_IPU_FORCE,
 	F2FS_IPU_SSR,
 	F2FS_IPU_UTIL,
 	F2FS_IPU_SSR_UTIL,
-	F2FS_IPU_DISABLE,
+	F2FS_IPU_FSYNC,
 };
 
 static inline bool need_inplace_update(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	unsigned int policy = SM_I(sbi)->ipu_policy;
 
 	/* IPU can be done only for the user data */
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
 		return false;
 
-	/* this is only set during fdatasync */
-	if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
+	if (policy & (0x1 << F2FS_IPU_FORCE))
+		return true;
+	if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
+		return true;
+	if (policy & (0x1 << F2FS_IPU_UTIL) &&
+			utilization(sbi) > SM_I(sbi)->min_ipu_util)
+		return true;
+	if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) &&
+			utilization(sbi) > SM_I(sbi)->min_ipu_util)
 		return true;
 
-	switch (SM_I(sbi)->ipu_policy) {
-	case F2FS_IPU_FORCE:
+	/* this is only set during fdatasync */
+	if (policy & (0x1 << F2FS_IPU_FSYNC) &&
+			is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
 		return true;
-	case F2FS_IPU_SSR:
-		if (need_SSR(sbi))
-			return true;
-		break;
-	case F2FS_IPU_UTIL:
-		if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
-			return true;
-		break;
-	case F2FS_IPU_SSR_UTIL:
-		if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
-			return true;
-		break;
-	case F2FS_IPU_DISABLE:
-		break;
-	}
+
 	return false;
 }
 
@@ -534,18 +554,13 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
 #ifdef CONFIG_F2FS_CHECK_FS
 static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
 {
-	unsigned int end_segno = SM_I(sbi)->segment_count - 1;
-	BUG_ON(segno > end_segno);
+	BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
 }
 
 static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
 {
-	struct f2fs_sm_info *sm_info = SM_I(sbi);
-	block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
-	block_t start_addr = sm_info->seg0_blkaddr;
-	block_t end_addr = start_addr + total_blks - 1;
-	BUG_ON(blk_addr < start_addr);
-	BUG_ON(blk_addr > end_addr);
+	BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
+	BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
 }
 
 /*
@@ -554,8 +569,6 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
 static inline void check_block_count(struct f2fs_sb_info *sbi,
 		int segno, struct f2fs_sit_entry *raw_sit)
 {
-	struct f2fs_sm_info *sm_info = SM_I(sbi);
-	unsigned int end_segno = sm_info->segment_count - 1;
 	bool is_valid  = test_bit_le(0, raw_sit->valid_map) ? true : false;
 	int valid_blocks = 0;
 	int cur_pos = 0, next_pos;
@@ -564,7 +577,7 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
 	BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
 
 	/* check boundary of a given segment number */
-	BUG_ON(segno > end_segno);
+	BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
 
 	/* check bitmap with valid block count */
 	do {
@@ -583,16 +596,39 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
 	BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
 }
 #else
-#define check_seg_range(sbi, segno)
-#define verify_block_addr(sbi, blk_addr)
-#define check_block_count(sbi, segno, raw_sit)
+static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	if (segno > TOTAL_SEGS(sbi) - 1)
+		sbi->need_fsck = true;
+}
+
+static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
+{
+	if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
+		sbi->need_fsck = true;
+}
+
+/*
+ * Summary block is always treated as an invalid block
+ */
+static inline void check_block_count(struct f2fs_sb_info *sbi,
+		int segno, struct f2fs_sit_entry *raw_sit)
+{
+	/* check segment usage */
+	if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
+		sbi->need_fsck = true;
+
+	/* check boundary of a given segment number */
+	if (segno > TOTAL_SEGS(sbi) - 1)
+		sbi->need_fsck = true;
+}
 #endif
 
 static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
 						unsigned int start)
 {
 	struct sit_info *sit_i = SIT_I(sbi);
-	unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start);
+	unsigned int offset = SIT_BLOCK_OFFSET(start);
 	block_t blk_addr = sit_i->sit_base_addr + offset;
 
 	check_seg_range(sbi, start);
@@ -619,7 +655,7 @@ static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
 
 static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
 {
-	unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start);
+	unsigned int block_off = SIT_BLOCK_OFFSET(start);
 
 	if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
 		f2fs_clear_bit(block_off, sit_i->sit_bitmap);
@@ -666,7 +702,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
 {
 	struct block_device *bdev = sbi->sb->s_bdev;
 	struct request_queue *q = bdev_get_queue(bdev);
-	return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
+	return SECTOR_TO_BLOCK(queue_max_sectors(q));
 }
 
 /*
@@ -683,7 +719,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
 	else if (type == NODE)
 		return 3 * sbi->blocks_per_seg;
 	else if (type == META)
-		return MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+		return MAX_BIO_BLOCKS(sbi);
 	else
 		return 0;
 }
@@ -706,7 +742,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
 	else if (type == NODE)
 		desired = 3 * max_hw_blocks(sbi);
 	else
-		desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+		desired = MAX_BIO_BLOCKS(sbi);
 
 	wbc->nr_to_write = desired;
 	return desired - nr_to_write;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 41bdf511003d..41d6f700f4ee 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -190,6 +190,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -204,6 +205,7 @@ static struct attribute *f2fs_attrs[] = {
 	ATTR_LIST(max_small_discards),
 	ATTR_LIST(ipu_policy),
 	ATTR_LIST(min_ipu_util),
+	ATTR_LIST(min_fsync_blocks),
 	ATTR_LIST(max_victim_search),
 	ATTR_LIST(dir_level),
 	ATTR_LIST(ram_thresh),
@@ -366,11 +368,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
 
 	/* Initialize f2fs-specific inode info */
 	fi->vfs_inode.i_version = 1;
-	atomic_set(&fi->dirty_dents, 0);
+	atomic_set(&fi->dirty_pages, 0);
 	fi->i_current_depth = 1;
 	fi->i_advise = 0;
 	rwlock_init(&fi->ext.ext_lock);
 	init_rwsem(&fi->i_sem);
+	INIT_LIST_HEAD(&fi->inmem_pages);
+	mutex_init(&fi->inmem_lock);
 
 	set_inode_flag(fi, FI_NEW_INODE);
 
@@ -432,14 +436,19 @@ static void f2fs_put_super(struct super_block *sb)
 	stop_gc_thread(sbi);
 
 	/* We don't need to do checkpoint when it's clean */
-	if (sbi->s_dirty)
-		write_checkpoint(sbi, true);
+	if (sbi->s_dirty) {
+		struct cp_control cpc = {
+			.reason = CP_UMOUNT,
+		};
+		write_checkpoint(sbi, &cpc);
+	}
 
 	/*
 	 * normally superblock is clean, so we need to release this.
 	 * In addition, EIO will skip do checkpoint, we need this as well.
 	 */
 	release_dirty_inode(sbi);
+	release_discard_addrs(sbi);
 
 	iput(sbi->node_inode);
 	iput(sbi->meta_inode);
@@ -464,8 +473,11 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
 	trace_f2fs_sync_fs(sb, sync);
 
 	if (sync) {
+		struct cp_control cpc = {
+			.reason = CP_SYNC,
+		};
 		mutex_lock(&sbi->gc_mutex);
-		write_checkpoint(sbi, false);
+		write_checkpoint(sbi, &cpc);
 		mutex_unlock(&sbi->gc_mutex);
 	} else {
 		f2fs_balance_fs(sbi);
@@ -616,6 +628,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	org_mount_opt = sbi->mount_opt;
 	active_logs = sbi->active_logs;
 
+	sbi->mount_opt.opt = 0;
+	sbi->active_logs = NR_CURSEG_TYPE;
+
 	/* parse mount options */
 	err = parse_options(sb, data);
 	if (err)
@@ -786,14 +801,22 @@ static int sanity_check_raw_super(struct super_block *sb,
 		return 1;
 	}
 
-	if (le32_to_cpu(raw_super->log_sectorsize) !=
-					F2FS_LOG_SECTOR_SIZE) {
-		f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize");
+	/* Currently, support 512/1024/2048/4096 bytes sector size */
+	if (le32_to_cpu(raw_super->log_sectorsize) >
+				F2FS_MAX_LOG_SECTOR_SIZE ||
+		le32_to_cpu(raw_super->log_sectorsize) <
+				F2FS_MIN_LOG_SECTOR_SIZE) {
+		f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
+			le32_to_cpu(raw_super->log_sectorsize));
 		return 1;
 	}
-	if (le32_to_cpu(raw_super->log_sectors_per_block) !=
-					F2FS_LOG_SECTORS_PER_BLOCK) {
-		f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block");
+	if (le32_to_cpu(raw_super->log_sectors_per_block) +
+		le32_to_cpu(raw_super->log_sectorsize) !=
+			F2FS_MAX_LOG_SECTOR_SIZE) {
+		f2fs_msg(sb, KERN_INFO,
+			"Invalid log sectors per block(%u) log sectorsize(%u)",
+			le32_to_cpu(raw_super->log_sectors_per_block),
+			le32_to_cpu(raw_super->log_sectorsize));
 		return 1;
 	}
 	return 0;
@@ -849,6 +872,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 		atomic_set(&sbi->nr_pages[i], 0);
 
 	sbi->dir_level = DEF_DIR_LEVEL;
+	sbi->need_fsck = false;
 }
 
 /*
@@ -1082,6 +1106,9 @@ try_onemore:
 	if (err)
 		goto free_proc;
 
+	if (!retry)
+		sbi->need_fsck = true;
+
 	/* recover fsynced data */
 	if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
 		err = recover_fsync_data(sbi);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 728a5dc3dc16..deca8728117b 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -266,7 +266,7 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
 
 static void *read_all_xattrs(struct inode *inode, struct page *ipage)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_xattr_header *header;
 	size_t size = PAGE_SIZE, inline_size = 0;
 	void *txattr_addr;
@@ -325,7 +325,7 @@ fail:
 static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
 				void *txattr_addr, struct page *ipage)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	size_t inline_size = 0;
 	void *xattr_addr;
 	struct page *xpage;
@@ -373,7 +373,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
 			alloc_nid_failed(sbi, new_nid);
 			return PTR_ERR(xpage);
 		}
-		f2fs_bug_on(new_nid);
+		f2fs_bug_on(sbi, new_nid);
 		f2fs_wait_on_page_writeback(xpage, NODE);
 	} else {
 		struct dnode_of_data dn;
@@ -596,7 +596,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
 				const void *value, size_t size,
 				struct page *ipage, int flags)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int err;
 
 	/* this case is only from init_inode_metadata */
diff --git a/fs/file_table.c b/fs/file_table.c
index 385bfd31512a..0bab12b20460 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -331,5 +331,5 @@ void __init files_init(unsigned long mempages)
 
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
-	percpu_counter_init(&nr_files, 0);
+	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 } 
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index d3b4539f1651..da032daf0e0d 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -982,6 +982,7 @@ nomem:
 submit_op_failed:
 	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
 	spin_unlock(&cookie->lock);
+	fscache_unuse_cookie(object);
 	kfree(op);
 	_leave(" [EIO]");
 	return transit_to(KILL_OBJECT);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 85332b9d19d1..de33b3fccca6 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -44,6 +44,19 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
 EXPORT_SYMBOL(__fscache_wait_on_page_write);
 
 /*
+ * wait for a page to finish being written to the cache. Put a timeout here
+ * since we might be called recursively via parent fs.
+ */
+static
+bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
+
+	return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page),
+				  HZ);
+}
+
+/*
  * decide whether a page can be released, possibly by cancelling a store to it
  * - we're allowed to sleep if __GFP_WAIT is flagged
  */
@@ -115,7 +128,10 @@ page_busy:
 	}
 
 	fscache_stat(&fscache_n_store_vmscan_wait);
-	__fscache_wait_on_page_write(cookie, page);
+	if (!release_page_wait_timeout(cookie, page))
+		_debug("fscache writeout timeout page: %p{%lx}",
+			page, page->index);
+
 	gfp &= ~__GFP_WAIT;
 	goto try_again;
 }
@@ -182,7 +198,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 {
 	struct fscache_operation *op;
 	struct fscache_object *object;
-	bool wake_cookie;
+	bool wake_cookie = false;
 
 	_enter("%p", cookie);
 
@@ -212,15 +228,16 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 
 	__fscache_use_cookie(cookie);
 	if (fscache_submit_exclusive_op(object, op) < 0)
-		goto nobufs;
+		goto nobufs_dec;
 	spin_unlock(&cookie->lock);
 	fscache_stat(&fscache_n_attr_changed_ok);
 	fscache_put_operation(op);
 	_leave(" = 0");
 	return 0;
 
-nobufs:
+nobufs_dec:
 	wake_cookie = __fscache_unuse_cookie(cookie);
+nobufs:
 	spin_unlock(&cookie->lock);
 	kfree(op);
 	if (wake_cookie)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 912061ac4baf..caa8d95b24e8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1305,6 +1305,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
 		size_t start;
 		ssize_t ret = iov_iter_get_pages(ii,
 					&req->pages[req->num_pages],
+					*nbytesp - nbytes,
 					req->max_pages - req->num_pages,
 					&start);
 		if (ret < 0)
diff --git a/fs/internal.h b/fs/internal.h
index e325b4f9c799..b2623200107b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -35,6 +35,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
 #endif
 
 /*
+ * buffer.c
+ */
+extern void guard_bio_eod(int rw, struct bio *bio);
+
+/*
  * char_dev.c
  */
 extern void __init chrdev_init(void);
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index ca58d64374ca..9b320cc2a8cf 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,7 @@
 obj-$(CONFIG_LOCKD) += lockd.o
 
 lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
-	        svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
+	        svcshare.o svcproc.o svcsubs.o mon.o xdr.o
 lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
+lockd-objs-$(CONFIG_PROC_FS) += procfs.o
 lockd-objs		      := $(lockd-objs-y)
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index daa8e7514eae..9106f42c472c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
 
 	msg.rpc_proc = &clnt->cl_procinfo[proc];
 	status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+	if (status == -ECONNREFUSED) {
+		dprintk("lockd:	NSM upcall RPC failed, status=%d, forcing rebind\n",
+				status);
+		rpc_force_rebind(clnt);
+		status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+	}
 	if (status < 0)
 		dprintk("lockd: NSM upcall RPC failed, status=%d\n",
 				status);
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 5010b55628b4..097bfa3adb1c 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -11,7 +11,6 @@ struct lockd_net {
 
 	struct delayed_work grace_period_end;
 	struct lock_manager lockd_manager;
-	struct list_head grace_list;
 
 	spinlock_t nsm_clnt_lock;
 	unsigned int nsm_users;
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
new file mode 100644
index 000000000000..2a0a98480e39
--- /dev/null
+++ b/fs/lockd/procfs.c
@@ -0,0 +1,92 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+
+#include "netns.h"
+#include "procfs.h"
+
+/*
+ * We only allow strings that start with 'Y', 'y', or '1'.
+ */
+static ssize_t
+nlm_end_grace_write(struct file *file, const char __user *buf, size_t size,
+		    loff_t *pos)
+{
+	char *data;
+	struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+					   lockd_net_id);
+
+	if (size < 1)
+		return -EINVAL;
+
+	data = simple_transaction_get(file, buf, size);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	switch(data[0]) {
+	case 'Y':
+	case 'y':
+	case '1':
+		locks_end_grace(&ln->lockd_manager);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return size;
+}
+
+static ssize_t
+nlm_end_grace_read(struct file *file, char __user *buf, size_t size,
+		   loff_t *pos)
+{
+	struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+					   lockd_net_id);
+	char resp[3];
+
+	resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N';
+	resp[1] = '\n';
+	resp[2] = '\0';
+
+	return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp));
+}
+
+static const struct file_operations lockd_end_grace_operations = {
+	.write		= nlm_end_grace_write,
+	.read		= nlm_end_grace_read,
+	.llseek		= default_llseek,
+	.release	= simple_transaction_release,
+	.owner		= THIS_MODULE,
+};
+
+int __init
+lockd_create_procfs(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_mkdir("fs/lockd", NULL);
+	if (!entry)
+		return -ENOMEM;
+	entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry,
+				 &lockd_end_grace_operations);
+	if (!entry) {
+		remove_proc_entry("fs/lockd", NULL);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void __exit
+lockd_remove_procfs(void)
+{
+	remove_proc_entry("fs/lockd/nlm_end_grace", NULL);
+	remove_proc_entry("fs/lockd", NULL);
+}
diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h
new file mode 100644
index 000000000000..2257a1311027
--- /dev/null
+++ b/fs/lockd/procfs.h
@@ -0,0 +1,28 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+#ifndef _LOCKD_PROCFS_H
+#define _LOCKD_PROCFS_H
+
+#include <linux/kconfig.h>
+
+#if IS_ENABLED(CONFIG_PROC_FS)
+int lockd_create_procfs(void);
+void lockd_remove_procfs(void);
+#else
+static inline int
+lockd_create_procfs(void)
+{
+	return 0;
+}
+
+static inline void
+lockd_remove_procfs(void)
+{
+	return;
+}
+#endif /* IS_ENABLED(CONFIG_PROC_FS) */
+
+#endif /* _LOCKD_PROCFS_H */
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index ec9e082f9ecd..d1bb7ecfd201 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -36,6 +36,7 @@
 #include <linux/nfs.h>
 
 #include "netns.h"
+#include "procfs.h"
 
 #define NLMDBG_FACILITY		NLMDBG_SVC
 #define LOCKD_BUFSIZE		(1024 + NLMSVC_XDRSIZE)
@@ -304,13 +305,16 @@ static int lockd_start_svc(struct svc_serv *serv)
 	svc_sock_update_bufs(serv);
 	serv->sv_maxconn = nlm_max_connections;
 
-	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
+	nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
 		error = PTR_ERR(nlmsvc_task);
 		printk(KERN_WARNING
 			"lockd_up: kthread_run failed, error=%d\n", error);
 		goto out_task;
 	}
+	nlmsvc_rqst->rq_task = nlmsvc_task;
+	wake_up_process(nlmsvc_task);
+
 	dprintk("lockd_up: service started\n");
 	return 0;
 
@@ -581,7 +585,7 @@ static int lockd_init_net(struct net *net)
 	struct lockd_net *ln = net_generic(net, lockd_net_id);
 
 	INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
-	INIT_LIST_HEAD(&ln->grace_list);
+	INIT_LIST_HEAD(&ln->lockd_manager.list);
 	spin_lock_init(&ln->nsm_clnt_lock);
 	return 0;
 }
@@ -615,8 +619,15 @@ static int __init init_nlm(void)
 	err = register_pernet_subsys(&lockd_net_ops);
 	if (err)
 		goto err_pernet;
+
+	err = lockd_create_procfs();
+	if (err)
+		goto err_procfs;
+
 	return 0;
 
+err_procfs:
+	unregister_pernet_subsys(&lockd_net_ops);
 err_pernet:
 #ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(nlm_sysctl_table);
@@ -629,6 +640,7 @@ static void __exit exit_nlm(void)
 {
 	/* FIXME: delete all NLM clients */
 	nlm_shutdown_hosts();
+	lockd_remove_procfs();
 	unregister_pernet_subsys(&lockd_net_ops);
 #ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(nlm_sysctl_table);
diff --git a/fs/mpage.c b/fs/mpage.c
index 5f9ed622274f..3e79220babac 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -28,6 +28,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/cleancache.h>
+#include "internal.h"
 
 /*
  * I/O completion handler for multipage BIOs.
@@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err)
 static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
 	bio->bi_end_io = mpage_end_io;
+	guard_bio_eod(rw, bio);
 	submit_bio(rw, bio);
 	return NULL;
 }
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c020..3ca14c36d08b 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
 # Makefile for the pNFS block layout driver kernel module
 #
 obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
+
+blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index cbb1797149d5..5228f201d3d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/bio.h>		/* struct bio */
-#include <linux/buffer_head.h>	/* various write calls */
 #include <linux/prefetch.h>
 #include <linux/pagevec.h>
 
@@ -50,40 +49,16 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
 MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
 
-static void print_page(struct page *page)
+static bool is_hole(struct pnfs_block_extent *be)
 {
-	dprintk("PRINTPAGE page %p\n", page);
-	dprintk("	PagePrivate %d\n", PagePrivate(page));
-	dprintk("	PageUptodate %d\n", PageUptodate(page));
-	dprintk("	PageError %d\n", PageError(page));
-	dprintk("	PageDirty %d\n", PageDirty(page));
-	dprintk("	PageReferenced %d\n", PageReferenced(page));
-	dprintk("	PageLocked %d\n", PageLocked(page));
-	dprintk("	PageWriteback %d\n", PageWriteback(page));
-	dprintk("	PageMappedToDisk %d\n", PageMappedToDisk(page));
-	dprintk("\n");
-}
-
-/* Given the be associated with isect, determine if page data needs to be
- * initialized.
- */
-static int is_hole(struct pnfs_block_extent *be, sector_t isect)
-{
-	if (be->be_state == PNFS_BLOCK_NONE_DATA)
-		return 1;
-	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
-		return 0;
-	else
-		return !bl_is_sector_init(be->be_inval, isect);
-}
-
-/* Given the be associated with isect, determine if page data can be
- * written to disk.
- */
-static int is_writable(struct pnfs_block_extent *be, sector_t isect)
-{
-	return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
-		be->be_state == PNFS_BLOCK_INVALID_DATA);
+	switch (be->be_state) {
+	case PNFS_BLOCK_NONE_DATA:
+		return true;
+	case PNFS_BLOCK_INVALID_DATA:
+		return be->be_tag ? false : true;
+	default:
+		return false;
+	}
 }
 
 /* The data we are handed might be spread across several bios.  We need
@@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
  */
 struct parallel_io {
 	struct kref refcnt;
-	void (*pnfs_callback) (void *data, int num_se);
+	void (*pnfs_callback) (void *data);
 	void *data;
-	int bse_count;
 };
 
 static inline struct parallel_io *alloc_parallel(void *data)
@@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
 	if (rv) {
 		rv->data = data;
 		kref_init(&rv->refcnt);
-		rv->bse_count = 0;
 	}
 	return rv;
 }
@@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
 	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
 
 	dprintk("%s enter\n", __func__);
-	p->pnfs_callback(p->data, p->bse_count);
+	p->pnfs_callback(p->data);
 	kfree(p);
 }
 
@@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio)
 	return NULL;
 }
 
-static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
-				     struct pnfs_block_extent *be,
-				     void (*end_io)(struct bio *, int err),
-				     struct parallel_io *par)
+static struct bio *
+bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+		void (*end_io)(struct bio *, int err), struct parallel_io *par)
 {
 	struct bio *bio;
 
@@ -156,58 +128,64 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 	}
 
 	if (bio) {
-		bio->bi_iter.bi_sector = isect - be->be_f_offset +
-			be->be_v_offset;
-		bio->bi_bdev = be->be_mdev;
+		bio->bi_iter.bi_sector = disk_sector;
+		bio->bi_bdev = bdev;
 		bio->bi_end_io = end_io;
 		bio->bi_private = par;
 	}
 	return bio;
 }
 
-static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
-				      sector_t isect, struct page *page,
-				      struct pnfs_block_extent *be,
-				      void (*end_io)(struct bio *, int err),
-				      struct parallel_io *par,
-				      unsigned int offset, int len)
+static struct bio *
+do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
+		struct page *page, struct pnfs_block_dev_map *map,
+		struct pnfs_block_extent *be,
+		void (*end_io)(struct bio *, int err),
+		struct parallel_io *par, unsigned int offset, int *len)
 {
-	isect = isect + (offset >> SECTOR_SHIFT);
+	struct pnfs_block_dev *dev =
+		container_of(be->be_device, struct pnfs_block_dev, node);
+	u64 disk_addr, end;
+
 	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
-		npg, rw, (unsigned long long)isect, offset, len);
+		npg, rw, (unsigned long long)isect, offset, *len);
+
+	/* translate to device offset */
+	isect += be->be_v_offset;
+	isect -= be->be_f_offset;
+
+	/* translate to physical disk offset */
+	disk_addr = (u64)isect << SECTOR_SHIFT;
+	if (disk_addr < map->start || disk_addr >= map->start + map->len) {
+		if (!dev->map(dev, disk_addr, map))
+			return ERR_PTR(-EIO);
+		bio = bl_submit_bio(rw, bio);
+	}
+	disk_addr += map->disk_offset;
+	disk_addr -= map->start;
+
+	/* limit length to what the device mapping allows */
+	end = disk_addr + *len;
+	if (end >= map->start + map->len)
+		*len = map->start + map->len - disk_addr;
+
 retry:
 	if (!bio) {
-		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+		bio = bl_alloc_init_bio(npg, map->bdev,
+				disk_addr >> SECTOR_SHIFT, end_io, par);
 		if (!bio)
 			return ERR_PTR(-ENOMEM);
 	}
-	if (bio_add_page(bio, page, len, offset) < len) {
+	if (bio_add_page(bio, page, *len, offset) < *len) {
 		bio = bl_submit_bio(rw, bio);
 		goto retry;
 	}
 	return bio;
 }
 
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
-				      sector_t isect, struct page *page,
-				      struct pnfs_block_extent *be,
-				      void (*end_io)(struct bio *, int err),
-				      struct parallel_io *par)
-{
-	return do_add_page_to_bio(bio, npg, rw, isect, page, be,
-				  end_io, par, 0, PAGE_CACHE_SIZE);
-}
-
-/* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
-	struct bio_vec *bvec;
-	int i;
-
-	if (!err)
-		bio_for_each_segment_all(bvec, bio, i)
-			SetPageUptodate(bvec->bv_page);
 
 	if (err) {
 		struct nfs_pgio_header *header = par->data;
@@ -216,6 +194,7 @@ static void bl_end_io_read(struct bio *bio, int err)
 			header->pnfs_error = -EIO;
 		pnfs_set_lo_fail(header->lseg);
 	}
+
 	bio_put(bio);
 	put_parallel(par);
 }
@@ -231,7 +210,7 @@ static void bl_read_cleanup(struct work_struct *work)
 }
 
 static void
-bl_end_par_io_read(void *data, int unused)
+bl_end_par_io_read(void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
@@ -241,88 +220,78 @@ bl_end_par_io_read(void *data, int unused)
 }
 
 static enum pnfs_try_status
-bl_read_pagelist(struct nfs_pgio_header *hdr)
+bl_read_pagelist(struct nfs_pgio_header *header)
 {
-	struct nfs_pgio_header *header = hdr;
-	int i, hole;
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
 	struct bio *bio = NULL;
-	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+	struct pnfs_block_extent be;
 	sector_t isect, extent_length = 0;
 	struct parallel_io *par;
-	loff_t f_offset = hdr->args.offset;
-	size_t bytes_left = hdr->args.count;
+	loff_t f_offset = header->args.offset;
+	size_t bytes_left = header->args.count;
 	unsigned int pg_offset, pg_len;
-	struct page **pages = hdr->args.pages;
-	int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
+	struct page **pages = header->args.pages;
+	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	const bool is_dio = (header->dreq != NULL);
+	struct blk_plug plug;
+	int i;
 
 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
-		hdr->page_array.npages, f_offset,
-		(unsigned int)hdr->args.count);
+		header->page_array.npages, f_offset,
+		(unsigned int)header->args.count);
 
-	par = alloc_parallel(hdr);
+	par = alloc_parallel(header);
 	if (!par)
-		goto use_mds;
+		return PNFS_NOT_ATTEMPTED;
 	par->pnfs_callback = bl_end_par_io_read;
-	/* At this point, we can no longer jump to use_mds */
+
+	blk_start_plug(&plug);
 
 	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
 	/* Code assumes extents are page-aligned */
-	for (i = pg_index; i < hdr->page_array.npages; i++) {
-		if (!extent_length) {
+	for (i = pg_index; i < header->page_array.npages; i++) {
+		if (extent_length <= 0) {
 			/* We've used up the previous extent */
-			bl_put_extent(be);
-			bl_put_extent(cow_read);
 			bio = bl_submit_bio(READ, bio);
+
 			/* Get the next one */
-			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					     isect, &cow_read);
-			if (!be) {
+			if (!ext_tree_lookup(bl, isect, &be, false)) {
 				header->pnfs_error = -EIO;
 				goto out;
 			}
-			extent_length = be->be_length -
-				(isect - be->be_f_offset);
-			if (cow_read) {
-				sector_t cow_length = cow_read->be_length -
-					(isect - cow_read->be_f_offset);
-				extent_length = min(extent_length, cow_length);
-			}
+			extent_length = be.be_length - (isect - be.be_f_offset);
 		}
 
+		pg_offset = f_offset & ~PAGE_CACHE_MASK;
 		if (is_dio) {
-			pg_offset = f_offset & ~PAGE_CACHE_MASK;
 			if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
 				pg_len = PAGE_CACHE_SIZE - pg_offset;
 			else
 				pg_len = bytes_left;
-
-			f_offset += pg_len;
-			bytes_left -= pg_len;
-			isect += (pg_offset >> SECTOR_SHIFT);
 		} else {
-			pg_offset = 0;
+			BUG_ON(pg_offset != 0);
 			pg_len = PAGE_CACHE_SIZE;
 		}
 
-		hole = is_hole(be, isect);
-		if (hole && !cow_read) {
+		isect += (pg_offset >> SECTOR_SHIFT);
+		extent_length -= (pg_offset >> SECTOR_SHIFT);
+
+		if (is_hole(&be)) {
 			bio = bl_submit_bio(READ, bio);
 			/* Fill hole w/ zeroes w/o accessing device */
 			dprintk("%s Zeroing page for hole\n", __func__);
 			zero_user_segment(pages[i], pg_offset, pg_len);
-			print_page(pages[i]);
-			SetPageUptodate(pages[i]);
-		} else {
-			struct pnfs_block_extent *be_read;
 
-			be_read = (hole && cow_read) ? cow_read : be;
+			/* invalidate map */
+			map.start = NFS4_MAX_UINT64;
+		} else {
 			bio = do_add_page_to_bio(bio,
-						 hdr->page_array.npages - i,
+						 header->page_array.npages - i,
 						 READ,
-						 isect, pages[i], be_read,
+						 isect, pages[i], &map, &be,
 						 bl_end_io_read, par,
-						 pg_offset, pg_len);
+						 pg_offset, &pg_len);
 			if (IS_ERR(bio)) {
 				header->pnfs_error = PTR_ERR(bio);
 				bio = NULL;
@@ -330,75 +299,21 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 			}
 		}
 		isect += (pg_len >> SECTOR_SHIFT);
-		extent_length -= PAGE_CACHE_SECTORS;
+		extent_length -= (pg_len >> SECTOR_SHIFT);
+		f_offset += pg_len;
+		bytes_left -= pg_len;
 	}
 	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
-		hdr->res.eof = 1;
-		hdr->res.count = header->inode->i_size - hdr->args.offset;
+		header->res.eof = 1;
+		header->res.count = header->inode->i_size - header->args.offset;
 	} else {
-		hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset;
+		header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
 	}
 out:
-	bl_put_extent(be);
-	bl_put_extent(cow_read);
 	bl_submit_bio(READ, bio);
+	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
-
- use_mds:
-	dprintk("Giving up and using normal NFS\n");
-	return PNFS_NOT_ATTEMPTED;
-}
-
-static void mark_extents_written(struct pnfs_block_layout *bl,
-				 __u64 offset, __u32 count)
-{
-	sector_t isect, end;
-	struct pnfs_block_extent *be;
-	struct pnfs_block_short_extent *se;
-
-	dprintk("%s(%llu, %u)\n", __func__, offset, count);
-	if (count == 0)
-		return;
-	isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
-	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
-	end >>= SECTOR_SHIFT;
-	while (isect < end) {
-		sector_t len;
-		be = bl_find_get_extent(bl, isect, NULL);
-		BUG_ON(!be); /* FIXME */
-		len = min(end, be->be_f_offset + be->be_length) - isect;
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-			se = bl_pop_one_short_extent(be->be_inval);
-			BUG_ON(!se);
-			bl_mark_for_commit(be, isect, len, se);
-		}
-		isect += len;
-		bl_put_extent(be);
-	}
-}
-
-static void bl_end_io_write_zero(struct bio *bio, int err)
-{
-	struct parallel_io *par = bio->bi_private;
-	struct bio_vec *bvec;
-	int i;
-
-	bio_for_each_segment_all(bvec, bio, i) {
-		/* This is the zeroing page we added */
-		end_page_writeback(bvec->bv_page);
-		page_cache_release(bvec->bv_page);
-	}
-
-	if (unlikely(err)) {
-		struct nfs_pgio_header *header = par->data;
-
-		if (!header->pnfs_error)
-			header->pnfs_error = -EIO;
-		pnfs_set_lo_fail(header->lseg);
-	}
-	bio_put(bio);
-	put_parallel(par);
 }
 
 static void bl_end_io_write(struct bio *bio, int err)
@@ -421,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err)
  */
 static void bl_write_cleanup(struct work_struct *work)
 {
-	struct rpc_task *task;
-	struct nfs_pgio_header *hdr;
+	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+	struct nfs_pgio_header *hdr =
+			container_of(task, struct nfs_pgio_header, task);
+
 	dprintk("%s enter\n", __func__);
-	task = container_of(work, struct rpc_task, u.tk_work);
-	hdr = container_of(task, struct nfs_pgio_header, task);
+
 	if (likely(!hdr->pnfs_error)) {
-		/* Marks for LAYOUTCOMMIT */
-		mark_extents_written(BLK_LSEG2EXT(hdr->lseg),
-				     hdr->args.offset, hdr->args.count);
+		struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
+		u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
+		u64 end = (hdr->args.offset + hdr->args.count +
+			PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
+
+		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
+					(end - start) >> SECTOR_SHIFT);
 	}
+
 	pnfs_ld_write_done(hdr);
 }
 
 /* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data, int num_se)
+static void bl_end_par_io_write(void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
-	if (unlikely(hdr->pnfs_error)) {
-		bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
-					num_se);
-	}
-
 	hdr->task.tk_status = hdr->pnfs_error;
 	hdr->verf.committed = NFS_FILE_SYNC;
 	INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
 	schedule_work(&hdr->task.u.tk_work);
 }
 
-/* FIXME STUB - mark intersection of layout and page as bad, so is not
- * used again.
- */
-static void mark_bad_read(void)
-{
-	return;
-}
-
-/*
- * map_block:  map a requested I/0 block (isect) into an offset in the LVM
- * block_device
- */
-static void
-map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
-{
-	dprintk("%s enter be=%p\n", __func__, be);
-
-	set_buffer_mapped(bh);
-	bh->b_bdev = be->be_mdev;
-	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
-	    (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
-
-	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
-		__func__, (unsigned long long)isect, (long)bh->b_blocknr,
-		bh->b_size);
-	return;
-}
-
-static void
-bl_read_single_end_io(struct bio *bio, int error)
-{
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct page *page = bvec->bv_page;
-
-	/* Only one page in bvec */
-	unlock_page(page);
-}
-
-static int
-bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
-		    unsigned int offset, unsigned int len)
-{
-	struct bio *bio;
-	struct page *shadow_page;
-	sector_t isect;
-	char *kaddr, *kshadow_addr;
-	int ret = 0;
-
-	dprintk("%s: offset %u len %u\n", __func__, offset, len);
-
-	shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
-	if (shadow_page == NULL)
-		return -ENOMEM;
-
-	bio = bio_alloc(GFP_NOIO, 1);
-	if (bio == NULL)
-		return -ENOMEM;
-
-	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
-		(offset / SECTOR_SIZE);
-
-	bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
-	bio->bi_bdev = be->be_mdev;
-	bio->bi_end_io = bl_read_single_end_io;
-
-	lock_page(shadow_page);
-	if (bio_add_page(bio, shadow_page,
-			 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
-		unlock_page(shadow_page);
-		bio_put(bio);
-		return -EIO;
-	}
-
-	submit_bio(READ, bio);
-	wait_on_page_locked(shadow_page);
-	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
-		ret = -EIO;
-	} else {
-		kaddr = kmap_atomic(page);
-		kshadow_addr = kmap_atomic(shadow_page);
-		memcpy(kaddr + offset, kshadow_addr + offset, len);
-		kunmap_atomic(kshadow_addr);
-		kunmap_atomic(kaddr);
-	}
-	__free_page(shadow_page);
-	bio_put(bio);
-
-	return ret;
-}
-
-static int
-bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
-			  unsigned int dirty_offset, unsigned int dirty_len,
-			  bool full_page)
-{
-	int ret = 0;
-	unsigned int start, end;
-
-	if (full_page) {
-		start = 0;
-		end = PAGE_CACHE_SIZE;
-	} else {
-		start = round_down(dirty_offset, SECTOR_SIZE);
-		end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
-	}
-
-	dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
-	if (!be) {
-		zero_user_segments(page, start, dirty_offset,
-				   dirty_offset + dirty_len, end);
-		if (start == 0 && end == PAGE_CACHE_SIZE &&
-		    trylock_page(page)) {
-			SetPageUptodate(page);
-			unlock_page(page);
-		}
-		return ret;
-	}
-
-	if (start != dirty_offset)
-		ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
-
-	if (!ret && (dirty_offset + dirty_len < end))
-		ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
-					  end - dirty_offset - dirty_len);
-
-	return ret;
-}
-
-/* Given an unmapped page, zero it or read in page for COW, page is locked
- * by caller.
- */
-static int
-init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
-{
-	struct buffer_head *bh = NULL;
-	int ret = 0;
-	sector_t isect;
-
-	dprintk("%s enter, %p\n", __func__, page);
-	BUG_ON(PageUptodate(page));
-	if (!cow_read) {
-		zero_user_segment(page, 0, PAGE_SIZE);
-		SetPageUptodate(page);
-		goto cleanup;
-	}
-
-	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
-	if (!bh) {
-		ret = -ENOMEM;
-		goto cleanup;
-	}
-
-	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
-	map_block(bh, isect, cow_read);
-	if (!bh_uptodate_or_lock(bh))
-		ret = bh_submit_read(bh);
-	if (ret)
-		goto cleanup;
-	SetPageUptodate(page);
-
-cleanup:
-	if (bh)
-		free_buffer_head(bh);
-	if (ret) {
-		/* Need to mark layout with bad read...should now
-		 * just use nfs4 for reads and writes.
-		 */
-		mark_bad_read();
-	}
-	return ret;
-}
-
-/* Find or create a zeroing page marked being writeback.
- * Return ERR_PTR on error, NULL to indicate skip this page and page itself
- * to indicate write out.
- */
-static struct page *
-bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
-			struct pnfs_block_extent *cow_read)
-{
-	struct page *page;
-	int locked = 0;
-	page = find_get_page(inode->i_mapping, index);
-	if (page)
-		goto check_page;
-
-	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
-	if (unlikely(!page)) {
-		dprintk("%s oom\n", __func__);
-		return ERR_PTR(-ENOMEM);
-	}
-	locked = 1;
-
-check_page:
-	/* PageDirty: Other will write this out
-	 * PageWriteback: Other is writing this out
-	 * PageUptodate: It was read before
-	 */
-	if (PageDirty(page) || PageWriteback(page)) {
-		print_page(page);
-		if (locked)
-			unlock_page(page);
-		page_cache_release(page);
-		return NULL;
-	}
-
-	if (!locked) {
-		lock_page(page);
-		locked = 1;
-		goto check_page;
-	}
-	if (!PageUptodate(page)) {
-		/* New page, readin or zero it */
-		init_page_for_write(page, cow_read);
-	}
-	set_page_writeback(page);
-	unlock_page(page);
-
-	return page;
-}
-
 static enum pnfs_try_status
 bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 {
-	int i, ret, npg_zero, pg_index, last = 0;
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
 	struct bio *bio = NULL;
-	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
-	sector_t isect, last_isect = 0, extent_length = 0;
+	struct pnfs_block_extent be;
+	sector_t isect, extent_length = 0;
 	struct parallel_io *par = NULL;
 	loff_t offset = header->args.offset;
 	size_t count = header->args.count;
-	unsigned int pg_offset, pg_len, saved_len;
 	struct page **pages = header->args.pages;
-	struct page *page;
-	pgoff_t index;
-	u64 temp;
-	int npg_per_block =
-	    NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+	int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+	unsigned int pg_len;
+	struct blk_plug plug;
+	int i;
 
 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
 
-	if (header->dreq != NULL &&
-	    (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
-	     !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
-		dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
-		goto out_mds;
-	}
 	/* At this point, header->page_aray is a (sequential) list of nfs_pages.
 	 * We want to write each, and if there is an error set pnfs_error
 	 * to have it redone using nfs.
 	 */
 	par = alloc_parallel(header);
 	if (!par)
-		goto out_mds;
+		return PNFS_NOT_ATTEMPTED;
 	par->pnfs_callback = bl_end_par_io_write;
-	/* At this point, have to be more careful with error handling */
 
-	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
-	be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
-	if (!be || !is_writable(be, isect)) {
-		dprintk("%s no matching extents!\n", __func__);
-		goto out_mds;
-	}
+	blk_start_plug(&plug);
 
-	/* First page inside INVALID extent */
-	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-		if (likely(!bl_push_one_short_extent(be->be_inval)))
-			par->bse_count++;
-		else
-			goto out_mds;
-		temp = offset >> PAGE_CACHE_SHIFT;
-		npg_zero = do_div(temp, npg_per_block);
-		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
-				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
-		extent_length = be->be_length - (isect - be->be_f_offset);
-
-fill_invalid_ext:
-		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
-		for (;npg_zero > 0; npg_zero--) {
-			if (bl_is_sector_init(be->be_inval, isect)) {
-				dprintk("isect %llu already init\n",
-					(unsigned long long)isect);
-				goto next_page;
-			}
-			/* page ref released in bl_end_io_write_zero */
-			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
-			dprintk("%s zero %dth page: index %lu isect %llu\n",
-				__func__, npg_zero, index,
-				(unsigned long long)isect);
-			page = bl_find_get_zeroing_page(header->inode, index,
-							cow_read);
-			if (unlikely(IS_ERR(page))) {
-				header->pnfs_error = PTR_ERR(page);
-				goto out;
-			} else if (page == NULL)
-				goto next_page;
-
-			ret = bl_mark_sectors_init(be->be_inval, isect,
-						       PAGE_CACHE_SECTORS);
-			if (unlikely(ret)) {
-				dprintk("%s bl_mark_sectors_init fail %d\n",
-					__func__, ret);
-				end_page_writeback(page);
-				page_cache_release(page);
-				header->pnfs_error = ret;
-				goto out;
-			}
-			if (likely(!bl_push_one_short_extent(be->be_inval)))
-				par->bse_count++;
-			else {
-				end_page_writeback(page);
-				page_cache_release(page);
-				header->pnfs_error = -ENOMEM;
-				goto out;
-			}
-			/* FIXME: This should be done in bi_end_io */
-			mark_extents_written(BLK_LSEG2EXT(header->lseg),
-					     page->index << PAGE_CACHE_SHIFT,
-					     PAGE_CACHE_SIZE);
-
-			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
-						 isect, page, be,
-						 bl_end_io_write_zero, par);
-			if (IS_ERR(bio)) {
-				header->pnfs_error = PTR_ERR(bio);
-				bio = NULL;
-				goto out;
-			}
-next_page:
-			isect += PAGE_CACHE_SECTORS;
-			extent_length -= PAGE_CACHE_SECTORS;
-		}
-		if (last)
-			goto write_done;
-	}
-	bio = bl_submit_bio(WRITE, bio);
+	/* we always write out the whole page */
+	offset = offset & (loff_t)PAGE_CACHE_MASK;
+	isect = offset >> SECTOR_SHIFT;
 
-	/* Middle pages */
-	pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	for (i = pg_index; i < header->page_array.npages; i++) {
-		if (!extent_length) {
+		if (extent_length <= 0) {
 			/* We've used up the previous extent */
-			bl_put_extent(be);
-			bl_put_extent(cow_read);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
-			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					     isect, &cow_read);
-			if (!be || !is_writable(be, isect)) {
+			if (!ext_tree_lookup(bl, isect, &be, true)) {
 				header->pnfs_error = -EINVAL;
 				goto out;
 			}
-			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-				if (likely(!bl_push_one_short_extent(
-								be->be_inval)))
-					par->bse_count++;
-				else {
-					header->pnfs_error = -ENOMEM;
-					goto out;
-				}
-			}
-			extent_length = be->be_length -
-			    (isect - be->be_f_offset);
-		}
-
-		dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
-		pg_offset = offset & ~PAGE_CACHE_MASK;
-		if (pg_offset + count > PAGE_CACHE_SIZE)
-			pg_len = PAGE_CACHE_SIZE - pg_offset;
-		else
-			pg_len = count;
-
-		saved_len = pg_len;
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
-		    !bl_is_sector_init(be->be_inval, isect)) {
-			ret = bl_read_partial_page_sync(pages[i], cow_read,
-							pg_offset, pg_len, true);
-			if (ret) {
-				dprintk("%s bl_read_partial_page_sync fail %d\n",
-					__func__, ret);
-				header->pnfs_error = ret;
-				goto out;
-			}
-
-			ret = bl_mark_sectors_init(be->be_inval, isect,
-						       PAGE_CACHE_SECTORS);
-			if (unlikely(ret)) {
-				dprintk("%s bl_mark_sectors_init fail %d\n",
-					__func__, ret);
-				header->pnfs_error = ret;
-				goto out;
-			}
 
-			/* Expand to full page write */
-			pg_offset = 0;
-			pg_len = PAGE_CACHE_SIZE;
-		} else if  ((pg_offset & (SECTOR_SIZE - 1)) ||
-			    (pg_len & (SECTOR_SIZE - 1))){
-			/* ahh, nasty case. We have to do sync full sector
-			 * read-modify-write cycles.
-			 */
-			unsigned int saved_offset = pg_offset;
-			ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
-							pg_len, false);
-			pg_offset = round_down(pg_offset, SECTOR_SIZE);
-			pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
-				 - pg_offset;
+			extent_length = be.be_length - (isect - be.be_f_offset);
 		}
 
-
+		pg_len = PAGE_CACHE_SIZE;
 		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
-					 WRITE,
-					 isect, pages[i], be,
+					 WRITE, isect, pages[i], &map, &be,
 					 bl_end_io_write, par,
-					 pg_offset, pg_len);
+					 0, &pg_len);
 		if (IS_ERR(bio)) {
 			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
-		offset += saved_len;
-		count -= saved_len;
-		isect += PAGE_CACHE_SECTORS;
-		last_isect = isect;
-		extent_length -= PAGE_CACHE_SECTORS;
-	}
 
-	/* Last page inside INVALID extent */
-	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-		bio = bl_submit_bio(WRITE, bio);
-		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
-		npg_zero = npg_per_block - do_div(temp, npg_per_block);
-		if (npg_zero < npg_per_block) {
-			last = 1;
-			goto fill_invalid_ext;
-		}
+		offset += pg_len;
+		count -= pg_len;
+		isect += (pg_len >> SECTOR_SHIFT);
+		extent_length -= (pg_len >> SECTOR_SHIFT);
 	}
 
-write_done:
 	header->res.count = header->args.count;
 out:
-	bl_put_extent(be);
-	bl_put_extent(cow_read);
 	bl_submit_bio(WRITE, bio);
+	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
-out_mds:
-	bl_put_extent(be);
-	bl_put_extent(cow_read);
-	kfree(par);
-	return PNFS_NOT_ATTEMPTED;
-}
-
-/* FIXME - range ignored */
-static void
-release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
-{
-	int i;
-	struct pnfs_block_extent *be;
-
-	spin_lock(&bl->bl_ext_lock);
-	for (i = 0; i < EXTENT_LISTS; i++) {
-		while (!list_empty(&bl->bl_extents[i])) {
-			be = list_first_entry(&bl->bl_extents[i],
-					      struct pnfs_block_extent,
-					      be_node);
-			list_del(&be->be_node);
-			bl_put_extent(be);
-		}
-	}
-	spin_unlock(&bl->bl_ext_lock);
-}
-
-static void
-release_inval_marks(struct pnfs_inval_markings *marks)
-{
-	struct pnfs_inval_tracking *pos, *temp;
-	struct pnfs_block_short_extent *se, *stemp;
-
-	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
-		list_del(&pos->it_link);
-		kfree(pos);
-	}
-
-	list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
-		list_del(&se->bse_node);
-		kfree(se);
-	}
-	return;
 }
 
 static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
 {
 	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	int err;
 
 	dprintk("%s enter\n", __func__);
-	release_extents(bl, NULL);
-	release_inval_marks(&bl->bl_inval);
+
+	err = ext_tree_remove(bl, true, 0, LLONG_MAX);
+	WARN_ON(err);
+
 	kfree(bl);
 }
 
@@ -960,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
 	bl = kzalloc(sizeof(*bl), gfp_flags);
 	if (!bl)
 		return NULL;
+
+	bl->bl_ext_rw = RB_ROOT;
+	bl->bl_ext_ro = RB_ROOT;
 	spin_lock_init(&bl->bl_ext_lock);
-	INIT_LIST_HEAD(&bl->bl_extents[0]);
-	INIT_LIST_HEAD(&bl->bl_extents[1]);
-	INIT_LIST_HEAD(&bl->bl_commit);
-	INIT_LIST_HEAD(&bl->bl_committing);
-	bl->bl_count = 0;
-	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
-	BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+
 	return &bl->bl_layout;
 }
 
@@ -977,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
 	kfree(lseg);
 }
 
-/* We pretty much ignore lseg, and store all data layout wide, so we
- * can correctly merge.
- */
-static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
-						 struct nfs4_layoutget_res *lgr,
-						 gfp_t gfp_flags)
-{
-	struct pnfs_layout_segment *lseg;
-	int status;
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+	u32 mode;	/* R or RW */
+	u64 start;	/* Expected start of next non-COW extent */
+	u64 inval;	/* Start of INVAL coverage */
+	u64 cowread;	/* End of COW read coverage */
+};
 
-	dprintk("%s enter\n", __func__);
-	lseg = kzalloc(sizeof(*lseg), gfp_flags);
-	if (!lseg)
-		return ERR_PTR(-ENOMEM);
-	status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
-	if (status) {
-		/* We don't want to call the full-blown bl_free_lseg,
-		 * since on error extents were not touched.
-		 */
-		kfree(lseg);
-		return ERR_PTR(status);
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+			 struct layout_verification *lv)
+{
+	if (lv->mode == IOMODE_READ) {
+		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+		    be->be_state == PNFS_BLOCK_INVALID_DATA)
+			return -EIO;
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
 	}
-	return lseg;
+	/* lv->mode == IOMODE_RW */
+	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		if (lv->cowread > lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		lv->inval = lv->start;
+		return 0;
+	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+		if (be->be_f_offset > lv->start)
+			return -EIO;
+		if (be->be_f_offset < lv->inval)
+			return -EIO;
+		if (be->be_f_offset < lv->cowread)
+			return -EIO;
+		/* It looks like you might want to min this with lv->start,
+		 * but you really don't.
+		 */
+		lv->inval = lv->inval + be->be_length;
+		lv->cowread = be->be_f_offset + be->be_length;
+		return 0;
+	} else
+		return -EIO;
 }
 
-static void
-bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
-		       const struct nfs4_layoutcommit_args *arg)
+static int decode_sector_number(__be32 **rp, sector_t *sp)
 {
-	dprintk("%s enter\n", __func__);
-	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
+	uint64_t s;
+
+	*rp = xdr_decode_hyper(*rp, &s);
+	if (s & 0x1ff) {
+		printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
+		return -1;
+	}
+	*sp = s >> SECTOR_SHIFT;
+	return 0;
 }
 
-static void
-bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+static int
+bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
+		struct layout_verification *lv, struct list_head *extents,
+		gfp_t gfp_mask)
 {
-	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
+	struct pnfs_block_extent *be;
+	struct nfs4_deviceid id;
+	int error;
+	__be32 *p;
 
-	dprintk("%s enter\n", __func__);
-	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
-}
+	p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
+	if (!p)
+		return -EIO;
 
-static void free_blk_mountid(struct block_mount_id *mid)
-{
-	if (mid) {
-		struct pnfs_block_dev *dev, *tmp;
+	be = kzalloc(sizeof(*be), GFP_NOFS);
+	if (!be)
+		return -ENOMEM;
 
-		/* No need to take bm_lock as we are last user freeing bm_devlist */
-		list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
-			list_del(&dev->bm_node);
-			bl_free_block_dev(dev);
-		}
-		kfree(mid);
+	memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+	error = -EIO;
+	be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+						lo->plh_lc_cred, gfp_mask);
+	if (!be->be_device)
+		goto out_free_be;
+
+	/*
+	 * The next three values are read in as bytes, but stored in the
+	 * extent structure in 512-byte granularity.
+	 */
+	if (decode_sector_number(&p, &be->be_f_offset) < 0)
+		goto out_put_deviceid;
+	if (decode_sector_number(&p, &be->be_length) < 0)
+		goto out_put_deviceid;
+	if (decode_sector_number(&p, &be->be_v_offset) < 0)
+		goto out_put_deviceid;
+	be->be_state = be32_to_cpup(p++);
+
+	error = verify_extent(be, lv);
+	if (error) {
+		dprintk("%s: extent verification failed\n", __func__);
+		goto out_put_deviceid;
 	}
+
+	list_add_tail(&be->be_list, extents);
+	return 0;
+
+out_put_deviceid:
+	nfs4_put_deviceid_node(be->be_device);
+out_free_be:
+	kfree(be);
+	return error;
 }
 
-/* This is mostly copied from the filelayout_get_device_info function.
- * It seems much of this should be at the generic pnfs level.
- */
-static struct pnfs_block_dev *
-nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
-			struct nfs4_deviceid *d_id)
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
+		gfp_t gfp_mask)
 {
-	struct pnfs_device *dev;
-	struct pnfs_block_dev *rv;
-	u32 max_resp_sz;
-	int max_pages;
-	struct page **pages = NULL;
-	int i, rc;
+	struct layout_verification lv = {
+		.mode = lgr->range.iomode,
+		.start = lgr->range.offset >> SECTOR_SHIFT,
+		.inval = lgr->range.offset >> SECTOR_SHIFT,
+		.cowread = lgr->range.offset >> SECTOR_SHIFT,
+	};
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	struct pnfs_layout_segment *lseg;
+	struct xdr_buf buf;
+	struct xdr_stream xdr;
+	struct page *scratch;
+	int status, i;
+	uint32_t count;
+	__be32 *p;
+	LIST_HEAD(extents);
+
+	dprintk("---> %s\n", __func__);
+
+	lseg = kzalloc(sizeof(*lseg), gfp_mask);
+	if (!lseg)
+		return ERR_PTR(-ENOMEM);
+
+	status = -ENOMEM;
+	scratch = alloc_page(gfp_mask);
+	if (!scratch)
+		goto out;
+
+	xdr_init_decode_pages(&xdr, &buf,
+			lgr->layoutp->pages, lgr->layoutp->len);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+	status = -EIO;
+	p = xdr_inline_decode(&xdr, 4);
+	if (unlikely(!p))
+		goto out_free_scratch;
+
+	count = be32_to_cpup(p++);
+	dprintk("%s: number of extents %d\n", __func__, count);
 
 	/*
-	 * Use the session max response size as the basis for setting
-	 * GETDEVICEINFO's maxcount
+	 * Decode individual extents, putting them in temporary staging area
+	 * until whole layout is decoded to make error recovery easier.
 	 */
-	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
-	max_pages = nfs_page_array_len(0, max_resp_sz);
-	dprintk("%s max_resp_sz %u max_pages %d\n",
-		__func__, max_resp_sz, max_pages);
-
-	dev = kmalloc(sizeof(*dev), GFP_NOFS);
-	if (!dev) {
-		dprintk("%s kmalloc failed\n", __func__);
-		return ERR_PTR(-ENOMEM);
+	for (i = 0; i < count; i++) {
+		status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
+		if (status)
+			goto process_extents;
 	}
 
-	pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
-	if (pages == NULL) {
-		kfree(dev);
-		return ERR_PTR(-ENOMEM);
+	if (lgr->range.offset + lgr->range.length !=
+			lv.start << SECTOR_SHIFT) {
+		dprintk("%s Final length mismatch\n", __func__);
+		status = -EIO;
+		goto process_extents;
 	}
-	for (i = 0; i < max_pages; i++) {
-		pages[i] = alloc_page(GFP_NOFS);
-		if (!pages[i]) {
-			rv = ERR_PTR(-ENOMEM);
-			goto out_free;
-		}
+
+	if (lv.start < lv.cowread) {
+		dprintk("%s Final uncovered COW extent\n", __func__);
+		status = -EIO;
 	}
 
-	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
-	dev->layout_type = LAYOUT_BLOCK_VOLUME;
-	dev->pages = pages;
-	dev->pgbase = 0;
-	dev->pglen = PAGE_SIZE * max_pages;
-	dev->mincount = 0;
-	dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
-	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
-	rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
-	dprintk("%s getdevice info returns %d\n", __func__, rc);
-	if (rc) {
-		rv = ERR_PTR(rc);
-		goto out_free;
+process_extents:
+	while (!list_empty(&extents)) {
+		struct pnfs_block_extent *be =
+			list_first_entry(&extents, struct pnfs_block_extent,
+					 be_list);
+		list_del(&be->be_list);
+
+		if (!status)
+			status = ext_tree_insert(bl, be);
+
+		if (status) {
+			nfs4_put_deviceid_node(be->be_device);
+			kfree(be);
+		}
 	}
 
-	rv = nfs4_blk_decode_device(server, dev);
- out_free:
-	for (i = 0; i < max_pages; i++)
-		__free_page(pages[i]);
-	kfree(pages);
-	kfree(dev);
-	return rv;
+out_free_scratch:
+	__free_page(scratch);
+out:
+	dprintk("%s returns %d\n", __func__, status);
+	if (status) {
+		kfree(lseg);
+		return ERR_PTR(status);
+	}
+	return lseg;
 }
 
-static int
-bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+static void
+bl_return_range(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_range *range)
 {
-	struct block_mount_id *b_mt_id = NULL;
-	struct pnfs_devicelist *dlist = NULL;
-	struct pnfs_block_dev *bdev;
-	LIST_HEAD(block_disklist);
-	int status, i;
-
-	dprintk("%s enter\n", __func__);
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	sector_t offset = range->offset >> SECTOR_SHIFT, end;
 
-	if (server->pnfs_blksize == 0) {
-		dprintk("%s Server did not return blksize\n", __func__);
-		return -EINVAL;
-	}
-	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
-	if (!b_mt_id) {
-		status = -ENOMEM;
-		goto out_error;
-	}
-	/* Initialize nfs4 block layout mount id */
-	spin_lock_init(&b_mt_id->bm_lock);
-	INIT_LIST_HEAD(&b_mt_id->bm_devlist);
-
-	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
-	if (!dlist) {
-		status = -ENOMEM;
-		goto out_error;
+	if (range->offset % 8) {
+		dprintk("%s: offset %lld not block size aligned\n",
+			__func__, range->offset);
+		return;
 	}
-	dlist->eof = 0;
-	while (!dlist->eof) {
-		status = nfs4_proc_getdevicelist(server, fh, dlist);
-		if (status)
-			goto out_error;
-		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
-			__func__, dlist->num_devs, dlist->eof);
-		for (i = 0; i < dlist->num_devs; i++) {
-			bdev = nfs4_blk_get_deviceinfo(server, fh,
-						       &dlist->dev_id[i]);
-			if (IS_ERR(bdev)) {
-				status = PTR_ERR(bdev);
-				goto out_error;
-			}
-			spin_lock(&b_mt_id->bm_lock);
-			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
-			spin_unlock(&b_mt_id->bm_lock);
+
+	if (range->length != NFS4_MAX_UINT64) {
+		if (range->length % 8) {
+			dprintk("%s: length %lld not block size aligned\n",
+				__func__, range->length);
+			return;
 		}
-	}
-	dprintk("%s SUCCESS\n", __func__);
-	server->pnfs_ld_data = b_mt_id;
 
- out_return:
-	kfree(dlist);
-	return status;
+		end = offset + (range->length >> SECTOR_SHIFT);
+	} else {
+		end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
+	}
 
- out_error:
-	free_blk_mountid(b_mt_id);
-	goto out_return;
+	ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
 }
 
 static int
-bl_clear_layoutdriver(struct nfs_server *server)
+bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
+{
+	return ext_tree_prepare_commit(arg);
+}
+
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
 {
-	struct block_mount_id *b_mt_id = server->pnfs_ld_data;
+	ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
+}
 
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
 	dprintk("%s enter\n", __func__);
-	free_blk_mountid(b_mt_id);
-	dprintk("%s RETURNS\n", __func__);
+
+	if (server->pnfs_blksize == 0) {
+		dprintk("%s Server did not return blksize\n", __func__);
+		return -EINVAL;
+	}
+	if (server->pnfs_blksize > PAGE_SIZE) {
+		printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
+			__func__, server->pnfs_blksize);
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
 static bool
-is_aligned_req(struct nfs_page *req, unsigned int alignment)
+is_aligned_req(struct nfs_pageio_descriptor *pgio,
+		struct nfs_page *req, unsigned int alignment)
 {
-	return IS_ALIGNED(req->wb_offset, alignment) &&
-	       IS_ALIGNED(req->wb_bytes, alignment);
+	/*
+	 * Always accept buffered writes, higher layers take care of the
+	 * right alignment.
+	 */
+	if (pgio->pg_dreq == NULL)
+		return true;
+
+	if (!IS_ALIGNED(req->wb_offset, alignment))
+		return false;
+
+	if (IS_ALIGNED(req->wb_bytes, alignment))
+		return true;
+
+	if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
+		/*
+		 * If the write goes up to the inode size, just write
+		 * the full page.  Data past the inode size is
+		 * guaranteed to be zeroed by the higher level client
+		 * code, and this behaviour is mandated by RFC 5663
+		 * section 2.3.2.
+		 */
+		return true;
+	}
+
+	return false;
 }
 
 static void
 bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, SECTOR_SIZE))
+	if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
 		nfs_pageio_reset_read_mds(pgio);
-	else
-		pnfs_generic_pg_init_read(pgio, req);
+		return;
+	}
+
+	pnfs_generic_pg_init_read(pgio, req);
 }
 
 /*
@@ -1196,10 +796,8 @@ static size_t
 bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 		struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, SECTOR_SIZE))
+	if (!is_aligned_req(pgio, req, SECTOR_SIZE))
 		return 0;
-
 	return pnfs_generic_pg_test(pgio, prev, req);
 }
 
@@ -1229,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
 static void
 bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, PAGE_CACHE_SIZE)) {
+	u64 wb_size;
+
+	if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
 		nfs_pageio_reset_write_mds(pgio);
-	} else {
-		u64 wb_size;
-		if (pgio->pg_dreq == NULL)
-			wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
-						      req->wb_index);
-		else
-			wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
-
-		pnfs_generic_pg_init_write(pgio, req, wb_size);
+		return;
 	}
+
+	if (pgio->pg_dreq == NULL)
+		wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
+					      req->wb_index);
+	else
+		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+	pnfs_generic_pg_init_write(pgio, req, wb_size);
 }
 
 /*
@@ -1252,10 +851,8 @@ static size_t
 bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 		 struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, PAGE_CACHE_SIZE))
+	if (!is_aligned_req(pgio, req, PAGE_SIZE))
 		return 0;
-
 	return pnfs_generic_pg_test(pgio, prev, req);
 }
 
@@ -1275,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.id				= LAYOUT_BLOCK_VOLUME,
 	.name				= "LAYOUT_BLOCK_VOLUME",
 	.owner				= THIS_MODULE,
+	.flags				= PNFS_LAYOUTRET_ON_SETATTR |
+					  PNFS_READ_WHOLE_PAGE,
 	.read_pagelist			= bl_read_pagelist,
 	.write_pagelist			= bl_write_pagelist,
 	.alloc_layout_hdr		= bl_alloc_layout_hdr,
 	.free_layout_hdr		= bl_free_layout_hdr,
 	.alloc_lseg			= bl_alloc_lseg,
 	.free_lseg			= bl_free_lseg,
-	.encode_layoutcommit		= bl_encode_layoutcommit,
+	.return_range			= bl_return_range,
+	.prepare_layoutcommit		= bl_prepare_layoutcommit,
 	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
 	.set_layoutdriver		= bl_set_layoutdriver,
-	.clear_layoutdriver		= bl_clear_layoutdriver,
+	.alloc_deviceid_node		= bl_alloc_deviceid_node,
+	.free_deviceid_node		= bl_free_deviceid_node,
 	.pg_read_ops			= &bl_pg_read_ops,
 	.pg_write_ops			= &bl_pg_write_ops,
 };
 
-static const struct rpc_pipe_ops bl_upcall_ops = {
-	.upcall		= rpc_pipe_generic_upcall,
-	.downcall	= bl_pipe_downcall,
-	.destroy_msg	= bl_pipe_destroy_msg,
-};
-
-static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
-					    struct rpc_pipe *pipe)
-{
-	struct dentry *dir, *dentry;
-
-	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
-	if (dir == NULL)
-		return ERR_PTR(-ENOENT);
-	dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
-	dput(dir);
-	return dentry;
-}
-
-static void nfs4blocklayout_unregister_sb(struct super_block *sb,
-					  struct rpc_pipe *pipe)
-{
-	if (pipe->dentry)
-		rpc_unlink(pipe->dentry);
-}
-
-static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
-			   void *ptr)
-{
-	struct super_block *sb = ptr;
-	struct net *net = sb->s_fs_info;
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-	struct dentry *dentry;
-	int ret = 0;
-
-	if (!try_module_get(THIS_MODULE))
-		return 0;
-
-	if (nn->bl_device_pipe == NULL) {
-		module_put(THIS_MODULE);
-		return 0;
-	}
-
-	switch (event) {
-	case RPC_PIPEFS_MOUNT:
-		dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
-		if (IS_ERR(dentry)) {
-			ret = PTR_ERR(dentry);
-			break;
-		}
-		nn->bl_device_pipe->dentry = dentry;
-		break;
-	case RPC_PIPEFS_UMOUNT:
-		if (nn->bl_device_pipe->dentry)
-			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
-		break;
-	default:
-		ret = -ENOTSUPP;
-		break;
-	}
-	module_put(THIS_MODULE);
-	return ret;
-}
-
-static struct notifier_block nfs4blocklayout_block = {
-	.notifier_call = rpc_pipefs_event,
-};
-
-static struct dentry *nfs4blocklayout_register_net(struct net *net,
-						   struct rpc_pipe *pipe)
-{
-	struct super_block *pipefs_sb;
-	struct dentry *dentry;
-
-	pipefs_sb = rpc_get_sb_net(net);
-	if (!pipefs_sb)
-		return NULL;
-	dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
-	rpc_put_sb_net(net);
-	return dentry;
-}
-
-static void nfs4blocklayout_unregister_net(struct net *net,
-					   struct rpc_pipe *pipe)
-{
-	struct super_block *pipefs_sb;
-
-	pipefs_sb = rpc_get_sb_net(net);
-	if (pipefs_sb) {
-		nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
-		rpc_put_sb_net(net);
-	}
-}
-
-static int nfs4blocklayout_net_init(struct net *net)
-{
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-	struct dentry *dentry;
-
-	init_waitqueue_head(&nn->bl_wq);
-	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
-	if (IS_ERR(nn->bl_device_pipe))
-		return PTR_ERR(nn->bl_device_pipe);
-	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
-	if (IS_ERR(dentry)) {
-		rpc_destroy_pipe_data(nn->bl_device_pipe);
-		return PTR_ERR(dentry);
-	}
-	nn->bl_device_pipe->dentry = dentry;
-	return 0;
-}
-
-static void nfs4blocklayout_net_exit(struct net *net)
-{
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-
-	nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
-	rpc_destroy_pipe_data(nn->bl_device_pipe);
-	nn->bl_device_pipe = NULL;
-}
-
-static struct pernet_operations nfs4blocklayout_net_ops = {
-	.init = nfs4blocklayout_net_init,
-	.exit = nfs4blocklayout_net_exit,
-};
-
 static int __init nfs4blocklayout_init(void)
 {
 	int ret;
@@ -1424,20 +899,14 @@ static int __init nfs4blocklayout_init(void)
 	ret = pnfs_register_layoutdriver(&blocklayout_type);
 	if (ret)
 		goto out;
-
-	ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+	ret = bl_init_pipefs();
 	if (ret)
-		goto out_remove;
-	ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
-	if (ret)
-		goto out_notifier;
-out:
-	return ret;
+		goto out_unregister;
+	return 0;
 
-out_notifier:
-	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
-out_remove:
+out_unregister:
 	pnfs_unregister_layoutdriver(&blocklayout_type);
+out:
 	return ret;
 }
 
@@ -1446,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void)
 	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
 	       __func__);
 
-	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
-	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+	bl_cleanup_pipefs();
 	pnfs_unregister_layoutdriver(&blocklayout_type);
 }
 
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb020473..92dca9e90d8d 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,105 +44,112 @@
 #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
 #define SECTOR_SIZE (1 << SECTOR_SHIFT)
 
-struct block_mount_id {
-	spinlock_t			bm_lock;    /* protects list */
-	struct list_head		bm_devlist; /* holds pnfs_block_dev */
-};
+struct pnfs_block_dev;
 
-struct pnfs_block_dev {
-	struct list_head		bm_node;
-	struct nfs4_deviceid		bm_mdevid;    /* associated devid */
-	struct block_device		*bm_mdev;     /* meta device itself */
-	struct net			*net;
+enum pnfs_block_volume_type {
+	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
+	PNFS_BLOCK_VOLUME_SLICE		= 1,
+	PNFS_BLOCK_VOLUME_CONCAT	= 2,
+	PNFS_BLOCK_VOLUME_STRIPE	= 3,
 };
 
-enum exstate4 {
-	PNFS_BLOCK_READWRITE_DATA	= 0,
-	PNFS_BLOCK_READ_DATA		= 1,
-	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */
-	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
+#define PNFS_BLOCK_MAX_UUIDS	4
+#define PNFS_BLOCK_MAX_DEVICES	64
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN	128
+
+
+struct pnfs_block_volume {
+	enum pnfs_block_volume_type	type;
+	union {
+		struct {
+			int		len;
+			int		nr_sigs;
+			struct {
+				u64		offset;
+				u32		sig_len;
+				u8		sig[PNFS_BLOCK_UUID_LEN];
+			} sigs[PNFS_BLOCK_MAX_UUIDS];
+		} simple;
+		struct {
+			u64		start;
+			u64		len;
+			u32		volume;
+		} slice;
+		struct {
+			u32		volumes_count;
+			u32		volumes[PNFS_BLOCK_MAX_DEVICES];
+		} concat;
+		struct {
+			u64		chunk_size;
+			u32		volumes_count;
+			u32		volumes[PNFS_BLOCK_MAX_DEVICES];
+		} stripe;
+	};
 };
 
-#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
+struct pnfs_block_dev_map {
+	sector_t			start;
+	sector_t			len;
 
-struct my_tree {
-	sector_t		mtt_step_size;	/* Internal sector alignment */
-	struct list_head	mtt_stub; /* Should be a radix tree */
+	sector_t			disk_offset;
+	struct block_device		*bdev;
 };
 
-struct pnfs_inval_markings {
-	spinlock_t	im_lock;
-	struct my_tree	im_tree;	/* Sectors that need LAYOUTCOMMIT */
-	sector_t	im_block_size;	/* Server blocksize in sectors */
-	struct list_head im_extents;	/* Short extents for INVAL->RW conversion */
+struct pnfs_block_dev {
+	struct nfs4_deviceid_node	node;
+
+	u64				start;
+	u64				len;
+
+	u32				nr_children;
+	struct pnfs_block_dev		*children;
+	u64				chunk_size;
+
+	struct block_device		*bdev;
+	u64				disk_offset;
+
+	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
+			struct pnfs_block_dev_map *map);
 };
 
-struct pnfs_inval_tracking {
-	struct list_head it_link;
-	int		 it_sector;
-	int		 it_tags;
+enum exstate4 {
+	PNFS_BLOCK_READWRITE_DATA	= 0,
+	PNFS_BLOCK_READ_DATA		= 1,
+	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */
+	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
 };
 
 /* sector_t fields are all in 512-byte sectors */
 struct pnfs_block_extent {
-	struct kref	be_refcnt;
-	struct list_head be_node;	/* link into lseg list */
-	struct nfs4_deviceid be_devid;  /* FIXME: could use device cache instead */
-	struct block_device *be_mdev;
+	union {
+		struct rb_node	be_node;
+		struct list_head be_list;
+	};
+	struct nfs4_deviceid_node *be_device;
 	sector_t	be_f_offset;	/* the starting offset in the file */
 	sector_t	be_length;	/* the size of the extent */
 	sector_t	be_v_offset;	/* the starting offset in the volume */
 	enum exstate4	be_state;	/* the state of this extent */
-	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+#define EXTENT_WRITTEN		1
+#define EXTENT_COMMITTING	2
+	unsigned int	be_tag;
 };
 
-/* Shortened extent used by LAYOUTCOMMIT */
-struct pnfs_block_short_extent {
-	struct list_head bse_node;
-	struct nfs4_deviceid bse_devid;
-	struct block_device *bse_mdev;
-	sector_t	bse_f_offset;	/* the starting offset in the file */
-	sector_t	bse_length;	/* the size of the extent */
-};
-
-static inline void
-BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
-{
-	spin_lock_init(&marks->im_lock);
-	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
-	INIT_LIST_HEAD(&marks->im_extents);
-	marks->im_block_size = blocksize;
-	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
-					   blocksize);
-}
-
-enum extentclass4 {
-	RW_EXTENT       = 0, /* READWRTE and INVAL */
-	RO_EXTENT       = 1, /* READ and NONE */
-	EXTENT_LISTS    = 2,
-};
-
-static inline int bl_choose_list(enum exstate4 state)
-{
-	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
-		return RO_EXTENT;
-	else
-		return RW_EXTENT;
-}
+/* on the wire size of the extent */
+#define BL_EXTENT_SIZE	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
 
 struct pnfs_block_layout {
-	struct pnfs_layout_hdr bl_layout;
-	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+	struct pnfs_layout_hdr	bl_layout;
+	struct rb_root		bl_ext_rw;
+	struct rb_root		bl_ext_ro;
 	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
-	struct list_head	bl_extents[EXTENT_LISTS]; /* R and RW extents */
-	struct list_head	bl_commit;	/* Needs layout commit */
-	struct list_head	bl_committing;	/* Layout committing */
-	unsigned int		bl_count;	/* entries in bl_commit */
-	sector_t		bl_blocksize;  /* Server blocksize in sectors */
 };
 
-#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
-
 static inline struct pnfs_block_layout *
 BLK_LO2EXT(struct pnfs_layout_hdr *lo)
 {
@@ -171,41 +178,27 @@ struct bl_msg_hdr {
 #define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */
 #define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
 
-/* blocklayoutdev.c */
-ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-void nfs4_blkdev_put(struct block_device *bdev);
-struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
-						struct pnfs_device *dev);
-int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
-				struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
-
-/* blocklayoutdm.c */
-void bl_free_block_dev(struct pnfs_block_dev *bdev);
-
-/* extents.c */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
-		struct pnfs_block_extent **cow_read);
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
-			     sector_t offset, sector_t length);
-void bl_put_extent(struct pnfs_block_extent *be);
-struct pnfs_block_extent *bl_alloc_extent(void);
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
-int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-				   struct xdr_stream *xdr,
-				   const struct nfs4_layoutcommit_args *arg);
-void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-				   const struct nfs4_layoutcommit_args *arg,
-				   int status);
-int bl_add_merge_extent(struct pnfs_block_layout *bl,
-			 struct pnfs_block_extent *new);
-int bl_mark_for_commit(struct pnfs_block_extent *be,
-			sector_t offset, sector_t length,
-			struct pnfs_block_short_extent *new);
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
+/* dev.c */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+		struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
+/* extent_tree.c */
+int ext_tree_insert(struct pnfs_block_layout *bl,
+		struct pnfs_block_extent *new);
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
+		sector_t end);
+int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+		sector_t len);
+bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+		struct pnfs_block_extent *ret, bool rw);
+int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
+void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
+
+/* rpc_pipefs.c */
+dev_t bl_resolve_deviceid(struct nfs_server *server,
+		struct pnfs_block_volume *b, gfp_t gfp_mask);
+int __init bl_init_pipefs(void);
+void __exit bl_cleanup_pipefs(void);
 
 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 04303b5c9361..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- *  linux/fs/nfs/blocklayout/blocklayoutdev.c
- *
- *  Device operations for the pnfs nfs4 file layout driver.
- *
- *  Copyright (c) 2006 The Regents of the University of Michigan.
- *  All rights reserved.
- *
- *  Andy Adamson <andros@citi.umich.edu>
- *  Fred Isaman <iisaman@umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization.  if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose.  the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-#include <linux/module.h>
-#include <linux/buffer_head.h> /* __bread */
-
-#include <linux/genhd.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
-
-static int decode_sector_number(__be32 **rp, sector_t *sp)
-{
-	uint64_t s;
-
-	*rp = xdr_decode_hyper(*rp, &s);
-	if (s & 0x1ff) {
-		printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
-		return -1;
-	}
-	*sp = s >> SECTOR_SHIFT;
-	return 0;
-}
-
-/*
- * Release the block device
- */
-void nfs4_blkdev_put(struct block_device *bdev)
-{
-	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
-			MINOR(bdev->bd_dev));
-	blkdev_put(bdev, FMODE_READ);
-}
-
-ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
-			 size_t mlen)
-{
-	struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
-					 nfs_net_id);
-
-	if (mlen != sizeof (struct bl_dev_msg))
-		return -EINVAL;
-
-	if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
-		return -EFAULT;
-
-	wake_up(&nn->bl_wq);
-
-	return mlen;
-}
-
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
-{
-	struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
-
-	if (msg->errno >= 0)
-		return;
-	wake_up(bl_pipe_msg->bl_wq);
-}
-
-/*
- * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
- */
-struct pnfs_block_dev *
-nfs4_blk_decode_device(struct nfs_server *server,
-		       struct pnfs_device *dev)
-{
-	struct pnfs_block_dev *rv;
-	struct block_device *bd = NULL;
-	struct bl_pipe_msg bl_pipe_msg;
-	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
-	struct bl_msg_hdr bl_msg = {
-		.type = BL_DEVICE_MOUNT,
-		.totallen = dev->mincount,
-	};
-	uint8_t *dataptr;
-	DECLARE_WAITQUEUE(wq, current);
-	int offset, len, i, rc;
-	struct net *net = server->nfs_client->cl_net;
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-	struct bl_dev_msg *reply = &nn->bl_mount_reply;
-
-	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
-	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
-		dev->mincount);
-
-	bl_pipe_msg.bl_wq = &nn->bl_wq;
-	memset(msg, 0, sizeof(*msg));
-	msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
-	if (!msg->data) {
-		rv = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
-	dataptr = (uint8_t *) msg->data;
-	len = dev->mincount;
-	offset = sizeof(bl_msg);
-	for (i = 0; len > 0; i++) {
-		memcpy(&dataptr[offset], page_address(dev->pages[i]),
-				len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-		offset += PAGE_CACHE_SIZE;
-	}
-	msg->len = sizeof(bl_msg) + dev->mincount;
-
-	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
-	add_wait_queue(&nn->bl_wq, &wq);
-	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
-	if (rc < 0) {
-		remove_wait_queue(&nn->bl_wq, &wq);
-		rv = ERR_PTR(rc);
-		goto out;
-	}
-
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	schedule();
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&nn->bl_wq, &wq);
-
-	if (reply->status != BL_DEVICE_REQUEST_PROC) {
-		dprintk("%s failed to open device: %d\n",
-			__func__, reply->status);
-		rv = ERR_PTR(-EINVAL);
-		goto out;
-	}
-
-	bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
-			       FMODE_READ, NULL);
-	if (IS_ERR(bd)) {
-		dprintk("%s failed to open device : %ld\n", __func__,
-			PTR_ERR(bd));
-		rv = ERR_CAST(bd);
-		goto out;
-	}
-
-	rv = kzalloc(sizeof(*rv), GFP_NOFS);
-	if (!rv) {
-		rv = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	rv->bm_mdev = bd;
-	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
-	rv->net = net;
-	dprintk("%s Created device %s with bd_block_size %u\n",
-		__func__,
-		bd->bd_disk->disk_name,
-		bd->bd_block_size);
-
-out:
-	kfree(msg->data);
-	return rv;
-}
-
-/* Map deviceid returned by the server to constructed block_device */
-static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
-					    struct nfs4_deviceid *id)
-{
-	struct block_device *rv = NULL;
-	struct block_mount_id *mid;
-	struct pnfs_block_dev *dev;
-
-	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
-	mid = BLK_ID(lo);
-	spin_lock(&mid->bm_lock);
-	list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
-		if (memcmp(id->data, dev->bm_mdevid.data,
-			   NFS4_DEVICEID4_SIZE) == 0) {
-			rv = dev->bm_mdev;
-			goto out;
-		}
-	}
- out:
-	spin_unlock(&mid->bm_lock);
-	dprintk("%s returning %p\n", __func__, rv);
-	return rv;
-}
-
-/* Tracks info needed to ensure extents in layout obey constraints of spec */
-struct layout_verification {
-	u32 mode;	/* R or RW */
-	u64 start;	/* Expected start of next non-COW extent */
-	u64 inval;	/* Start of INVAL coverage */
-	u64 cowread;	/* End of COW read coverage */
-};
-
-/* Verify the extent meets the layout requirements of the pnfs-block draft,
- * section 2.3.1.
- */
-static int verify_extent(struct pnfs_block_extent *be,
-			 struct layout_verification *lv)
-{
-	if (lv->mode == IOMODE_READ) {
-		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
-		    be->be_state == PNFS_BLOCK_INVALID_DATA)
-			return -EIO;
-		if (be->be_f_offset != lv->start)
-			return -EIO;
-		lv->start += be->be_length;
-		return 0;
-	}
-	/* lv->mode == IOMODE_RW */
-	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
-		if (be->be_f_offset != lv->start)
-			return -EIO;
-		if (lv->cowread > lv->start)
-			return -EIO;
-		lv->start += be->be_length;
-		lv->inval = lv->start;
-		return 0;
-	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-		if (be->be_f_offset != lv->start)
-			return -EIO;
-		lv->start += be->be_length;
-		return 0;
-	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
-		if (be->be_f_offset > lv->start)
-			return -EIO;
-		if (be->be_f_offset < lv->inval)
-			return -EIO;
-		if (be->be_f_offset < lv->cowread)
-			return -EIO;
-		/* It looks like you might want to min this with lv->start,
-		 * but you really don't.
-		 */
-		lv->inval = lv->inval + be->be_length;
-		lv->cowread = be->be_f_offset + be->be_length;
-		return 0;
-	} else
-		return -EIO;
-}
-
-/* XDR decode pnfs_block_layout4 structure */
-int
-nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
-			   struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
-{
-	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
-	int i, status = -EIO;
-	uint32_t count;
-	struct pnfs_block_extent *be = NULL, *save;
-	struct xdr_stream stream;
-	struct xdr_buf buf;
-	struct page *scratch;
-	__be32 *p;
-	struct layout_verification lv = {
-		.mode = lgr->range.iomode,
-		.start = lgr->range.offset >> SECTOR_SHIFT,
-		.inval = lgr->range.offset >> SECTOR_SHIFT,
-		.cowread = lgr->range.offset >> SECTOR_SHIFT,
-	};
-	LIST_HEAD(extents);
-
-	dprintk("---> %s\n", __func__);
-
-	scratch = alloc_page(gfp_flags);
-	if (!scratch)
-		return -ENOMEM;
-
-	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
-	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
-
-	p = xdr_inline_decode(&stream, 4);
-	if (unlikely(!p))
-		goto out_err;
-
-	count = be32_to_cpup(p++);
-
-	dprintk("%s enter, number of extents %i\n", __func__, count);
-	p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
-	if (unlikely(!p))
-		goto out_err;
-
-	/* Decode individual extents, putting them in temporary
-	 * staging area until whole layout is decoded to make error
-	 * recovery easier.
-	 */
-	for (i = 0; i < count; i++) {
-		be = bl_alloc_extent();
-		if (!be) {
-			status = -ENOMEM;
-			goto out_err;
-		}
-		memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
-		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-		be->be_mdev = translate_devid(lo, &be->be_devid);
-		if (!be->be_mdev)
-			goto out_err;
-
-		/* The next three values are read in as bytes,
-		 * but stored as 512-byte sector lengths
-		 */
-		if (decode_sector_number(&p, &be->be_f_offset) < 0)
-			goto out_err;
-		if (decode_sector_number(&p, &be->be_length) < 0)
-			goto out_err;
-		if (decode_sector_number(&p, &be->be_v_offset) < 0)
-			goto out_err;
-		be->be_state = be32_to_cpup(p++);
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
-			be->be_inval = &bl->bl_inval;
-		if (verify_extent(be, &lv)) {
-			dprintk("%s verify failed\n", __func__);
-			goto out_err;
-		}
-		list_add_tail(&be->be_node, &extents);
-	}
-	if (lgr->range.offset + lgr->range.length !=
-			lv.start << SECTOR_SHIFT) {
-		dprintk("%s Final length mismatch\n", __func__);
-		be = NULL;
-		goto out_err;
-	}
-	if (lv.start < lv.cowread) {
-		dprintk("%s Final uncovered COW extent\n", __func__);
-		be = NULL;
-		goto out_err;
-	}
-	/* Extents decoded properly, now try to merge them in to
-	 * existing layout extents.
-	 */
-	spin_lock(&bl->bl_ext_lock);
-	list_for_each_entry_safe(be, save, &extents, be_node) {
-		list_del(&be->be_node);
-		status = bl_add_merge_extent(bl, be);
-		if (status) {
-			spin_unlock(&bl->bl_ext_lock);
-			/* This is a fairly catastrophic error, as the
-			 * entire layout extent lists are now corrupted.
-			 * We should have some way to distinguish this.
-			 */
-			be = NULL;
-			goto out_err;
-		}
-	}
-	spin_unlock(&bl->bl_ext_lock);
-	status = 0;
- out:
-	__free_page(scratch);
-	dprintk("%s returns %i\n", __func__, status);
-	return status;
-
- out_err:
-	bl_put_extent(be);
-	while (!list_empty(&extents)) {
-		be = list_first_entry(&extents, struct pnfs_block_extent,
-				      be_node);
-		list_del(&be->be_node);
-		bl_put_extent(be);
-	}
-	goto out;
-}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index 8999cfddd866..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- *  linux/fs/nfs/blocklayout/blocklayoutdm.c
- *
- *  Module for the NFSv4.1 pNFS block layout driver.
- *
- *  Copyright (c) 2007 The Regents of the University of Michigan.
- *  All rights reserved.
- *
- *  Fred Isaman <iisaman@umich.edu>
- *  Andy Adamson <andros@citi.umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization.  if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose.  the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include <linux/genhd.h> /* gendisk - used in a dprintk*/
-#include <linux/sched.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
-
-static void dev_remove(struct net *net, dev_t dev)
-{
-	struct bl_pipe_msg bl_pipe_msg;
-	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
-	struct bl_dev_msg bl_umount_request;
-	struct bl_msg_hdr bl_msg = {
-		.type = BL_DEVICE_UMOUNT,
-		.totallen = sizeof(bl_umount_request),
-	};
-	uint8_t *dataptr;
-	DECLARE_WAITQUEUE(wq, current);
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-
-	dprintk("Entering %s\n", __func__);
-
-	bl_pipe_msg.bl_wq = &nn->bl_wq;
-	memset(msg, 0, sizeof(*msg));
-	msg->len = sizeof(bl_msg) + bl_msg.totallen;
-	msg->data = kzalloc(msg->len, GFP_NOFS);
-	if (!msg->data)
-		goto out;
-
-	memset(&bl_umount_request, 0, sizeof(bl_umount_request));
-	bl_umount_request.major = MAJOR(dev);
-	bl_umount_request.minor = MINOR(dev);
-
-	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
-	dataptr = (uint8_t *) msg->data;
-	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
-
-	add_wait_queue(&nn->bl_wq, &wq);
-	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
-		remove_wait_queue(&nn->bl_wq, &wq);
-		goto out;
-	}
-
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	schedule();
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&nn->bl_wq, &wq);
-
-out:
-	kfree(msg->data);
-}
-
-/*
- * Release meta device
- */
-static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
-{
-	dprintk("%s Releasing\n", __func__);
-	nfs4_blkdev_put(bdev->bm_mdev);
-	dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
-}
-
-void bl_free_block_dev(struct pnfs_block_dev *bdev)
-{
-	if (bdev) {
-		if (bdev->bm_mdev) {
-			dprintk("%s Removing DM device: %d:%d\n",
-				__func__,
-				MAJOR(bdev->bm_mdev->bd_dev),
-				MINOR(bdev->bm_mdev->bd_dev));
-			nfs4_blk_metadev_release(bdev);
-		}
-		kfree(bdev);
-	}
-}
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000000..5aed4f98df41
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/blkdev.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
+
+static void
+bl_free_device(struct pnfs_block_dev *dev)
+{
+	if (dev->nr_children) {
+		int i;
+
+		for (i = 0; i < dev->nr_children; i++)
+			bl_free_device(&dev->children[i]);
+		kfree(dev->children);
+	} else {
+		if (dev->bdev)
+			blkdev_put(dev->bdev, FMODE_READ);
+	}
+}
+
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct pnfs_block_dev *dev =
+		container_of(d, struct pnfs_block_dev, node);
+
+	bl_free_device(dev);
+	kfree(dev);
+}
+
+static int
+nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *p;
+	int i;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (!p)
+		return -EIO;
+	b->type = be32_to_cpup(p++);
+
+	switch (b->type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->simple.nr_sigs = be32_to_cpup(p++);
+		if (!b->simple.nr_sigs) {
+			dprintk("no signature\n");
+			return -EIO;
+		}
+
+		b->simple.len = 4 + 4;
+		for (i = 0; i < b->simple.nr_sigs; i++) {
+			p = xdr_inline_decode(xdr, 8 + 4);
+			if (!p)
+				return -EIO;
+			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
+			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+
+			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
+			if (!p)
+				return -EIO;
+			memcpy(&b->simple.sigs[i].sig, p,
+				b->simple.sigs[i].sig_len);
+
+			b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+		}
+		break;
+	case PNFS_BLOCK_VOLUME_SLICE:
+		p = xdr_inline_decode(xdr, 8 + 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->slice.start);
+		p = xdr_decode_hyper(p, &b->slice.len);
+		b->slice.volume = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->concat.volumes_count = be32_to_cpup(p++);
+
+		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->concat.volumes_count; i++)
+			b->concat.volumes[i] = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		p = xdr_inline_decode(xdr, 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
+		b->stripe.volumes_count = be32_to_cpup(p++);
+
+		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->stripe.volumes_count; i++)
+			b->stripe.volumes[i] = be32_to_cpup(p++);
+		break;
+	default:
+		dprintk("unknown volume type!\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	map->start = dev->start;
+	map->len = dev->len;
+	map->disk_offset = dev->disk_offset;
+	map->bdev = dev->bdev;
+	return true;
+}
+
+static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	int i;
+
+	for (i = 0; i < dev->nr_children; i++) {
+		struct pnfs_block_dev *child = &dev->children[i];
+
+		if (child->start > offset ||
+		    child->start + child->len <= offset)
+			continue;
+
+		child->map(child, offset - child->start, map);
+		return true;
+	}
+
+	dprintk("%s: ran off loop!\n", __func__);
+	return false;
+}
+
+static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	struct pnfs_block_dev *child;
+	u64 chunk;
+	u32 chunk_idx;
+	u64 disk_offset;
+
+	chunk = div_u64(offset, dev->chunk_size);
+	div_u64_rem(chunk, dev->nr_children, &chunk_idx);
+
+	if (chunk_idx > dev->nr_children) {
+		dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
+			__func__, chunk_idx, offset, dev->chunk_size);
+		/* error, should not happen */
+		return false;
+	}
+
+	/* truncate offset to the beginning of the stripe */
+	offset = chunk * dev->chunk_size;
+
+	/* disk offset of the stripe */
+	disk_offset = div_u64(offset, dev->nr_children);
+
+	child = &dev->children[chunk_idx];
+	child->map(child, disk_offset, map);
+
+	map->start += offset;
+	map->disk_offset += disk_offset;
+	map->len = dev->chunk_size;
+	return true;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
+
+
+static int
+bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	dev_t dev;
+
+	dev = bl_resolve_deviceid(server, v, gfp_mask);
+	if (!dev)
+		return -EIO;
+
+	d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	if (IS_ERR(d->bdev)) {
+		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
+			MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
+		return PTR_ERR(d->bdev);
+	}
+
+
+	d->len = i_size_read(d->bdev->bd_inode);
+	d->map = bl_map_simple;
+
+	printk(KERN_INFO "pNFS: using block device %s\n",
+		d->bdev->bd_disk->disk_name);
+	return 0;
+}
+
+static int
+bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	int ret;
+
+	ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
+	if (ret)
+		return ret;
+
+	d->disk_offset = v->slice.start;
+	d->len = v->slice.len;
+	return 0;
+}
+
+static int
+bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	d->children = kcalloc(v->concat.volumes_count,
+			sizeof(struct pnfs_block_dev), GFP_KERNEL);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->concat.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->concat.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		d->children[i].start += len;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->map = bl_map_concat;
+	return 0;
+}
+
+static int
+bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	d->children = kcalloc(v->stripe.volumes_count,
+			sizeof(struct pnfs_block_dev), GFP_KERNEL);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->stripe.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->stripe.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->chunk_size = v->stripe.chunk_size;
+	d->map = bl_map_stripe;
+	return 0;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	switch (volumes[idx].type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		return bl_parse_simple(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_SLICE:
+		return bl_parse_slice(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		return bl_parse_concat(server, d, volumes, idx, gfp_mask);
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+	default:
+		dprintk("unsupported volume type: %d\n", volumes[idx].type);
+		return -EIO;
+	}
+}
+
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_mask)
+{
+	struct nfs4_deviceid_node *node = NULL;
+	struct pnfs_block_volume *volumes;
+	struct pnfs_block_dev *top;
+	struct xdr_stream xdr;
+	struct xdr_buf buf;
+	struct page *scratch;
+	int nr_volumes, ret, i;
+	__be32 *p;
+
+	scratch = alloc_page(gfp_mask);
+	if (!scratch)
+		goto out;
+
+	xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+	p = xdr_inline_decode(&xdr, sizeof(__be32));
+	if (!p)
+		goto out_free_scratch;
+	nr_volumes = be32_to_cpup(p++);
+
+	volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
+			  gfp_mask);
+	if (!volumes)
+		goto out_free_scratch;
+
+	for (i = 0; i < nr_volumes; i++) {
+		ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
+		if (ret < 0)
+			goto out_free_volumes;
+	}
+
+	top = kzalloc(sizeof(*top), gfp_mask);
+	if (!top)
+		goto out_free_volumes;
+
+	ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
+	if (ret) {
+		bl_free_device(top);
+		kfree(top);
+		goto out_free_volumes;
+	}
+
+	node = &top->node;
+	nfs4_init_deviceid_node(node, server, &pdev->dev_id);
+
+out_free_volumes:
+	kfree(volumes);
+out_free_scratch:
+	__free_page(scratch);
+out:
+	return node;
+}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000000..31d0b5e53dfd
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+
+#include <linux/vmalloc.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
+
+static inline struct pnfs_block_extent *
+ext_node(struct rb_node *node)
+{
+	return rb_entry(node, struct pnfs_block_extent, be_node);
+}
+
+static struct pnfs_block_extent *
+ext_tree_first(struct rb_root *root)
+{
+	struct rb_node *node = rb_first(root);
+	return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_prev(struct pnfs_block_extent *be)
+{
+	struct rb_node *node = rb_prev(&be->be_node);
+	return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_next(struct pnfs_block_extent *be)
+{
+	struct rb_node *node = rb_next(&be->be_node);
+	return node ? ext_node(node) : NULL;
+}
+
+static inline sector_t
+ext_f_end(struct pnfs_block_extent *be)
+{
+	return be->be_f_offset + be->be_length;
+}
+
+static struct pnfs_block_extent *
+__ext_tree_search(struct rb_root *root, sector_t start)
+{
+	struct rb_node *node = root->rb_node;
+	struct pnfs_block_extent *be = NULL;
+
+	while (node) {
+		be = ext_node(node);
+		if (start < be->be_f_offset)
+			node = node->rb_left;
+		else if (start >= ext_f_end(be))
+			node = node->rb_right;
+		else
+			return be;
+	}
+
+	if (be) {
+		if (start < be->be_f_offset)
+			return be;
+
+		if (start >= ext_f_end(be))
+			return ext_tree_next(be);
+	}
+
+	return NULL;
+}
+
+static bool
+ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
+{
+	if (be1->be_state != be2->be_state)
+		return false;
+	if (be1->be_device != be2->be_device)
+		return false;
+
+	if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
+		return false;
+
+	if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
+	    (be1->be_v_offset + be1->be_length != be2->be_v_offset))
+		return false;
+
+	if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
+	    be1->be_tag != be2->be_tag)
+		return false;
+
+	return true;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *left = ext_tree_prev(be);
+
+	if (left && ext_can_merge(left, be)) {
+		left->be_length += be->be_length;
+		rb_erase(&be->be_node, root);
+		nfs4_put_deviceid_node(be->be_device);
+		kfree(be);
+		return left;
+	}
+
+	return be;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
+{
+	struct pnfs_block_extent *right = ext_tree_next(be);
+
+	if (right && ext_can_merge(be, right)) {
+		be->be_length += right->be_length;
+		rb_erase(&right->be_node, root);
+		nfs4_put_deviceid_node(right->be_device);
+		kfree(right);
+	}
+
+	return be;
+}
+
+static void
+__ext_tree_insert(struct rb_root *root,
+		struct pnfs_block_extent *new, bool merge_ok)
+{
+	struct rb_node **p = &root->rb_node, *parent = NULL;
+	struct pnfs_block_extent *be;
+
+	while (*p) {
+		parent = *p;
+		be = ext_node(parent);
+
+		if (new->be_f_offset < be->be_f_offset) {
+			if (merge_ok && ext_can_merge(new, be)) {
+				be->be_f_offset = new->be_f_offset;
+				if (be->be_state != PNFS_BLOCK_NONE_DATA)
+					be->be_v_offset = new->be_v_offset;
+				be->be_length += new->be_length;
+				be = ext_try_to_merge_left(root, be);
+				goto free_new;
+			}
+			p = &(*p)->rb_left;
+		} else if (new->be_f_offset >= ext_f_end(be)) {
+			if (merge_ok && ext_can_merge(be, new)) {
+				be->be_length += new->be_length;
+				be = ext_try_to_merge_right(root, be);
+				goto free_new;
+			}
+			p = &(*p)->rb_right;
+		} else {
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->be_node, parent, p);
+	rb_insert_color(&new->be_node, root);
+	return;
+free_new:
+	nfs4_put_deviceid_node(new->be_device);
+	kfree(new);
+}
+
+static int
+__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+{
+	struct pnfs_block_extent *be;
+	sector_t len1 = 0, len2 = 0;
+	sector_t orig_v_offset;
+	sector_t orig_len;
+
+	be = __ext_tree_search(root, start);
+	if (!be)
+		return 0;
+	if (be->be_f_offset >= end)
+		return 0;
+
+	orig_v_offset = be->be_v_offset;
+	orig_len = be->be_length;
+
+	if (start > be->be_f_offset)
+		len1 = start - be->be_f_offset;
+	if (ext_f_end(be) > end)
+		len2 = ext_f_end(be) - end;
+
+	if (len2 > 0) {
+		if (len1 > 0) {
+			struct pnfs_block_extent *new;
+
+			new = kzalloc(sizeof(*new), GFP_ATOMIC);
+			if (!new)
+				return -ENOMEM;
+
+			be->be_length = len1;
+
+			new->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+				new->be_v_offset =
+					orig_v_offset + orig_len - len2;
+			}
+			new->be_length = len2;
+			new->be_state = be->be_state;
+			new->be_tag = be->be_tag;
+			new->be_device = nfs4_get_deviceid(be->be_device);
+
+			__ext_tree_insert(root, new, true);
+		} else {
+			be->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+				be->be_v_offset =
+					orig_v_offset + orig_len - len2;
+			}
+			be->be_length = len2;
+		}
+	} else {
+		if (len1 > 0) {
+			be->be_length = len1;
+			be = ext_tree_next(be);
+		}
+
+		while (be && ext_f_end(be) <= end) {
+			struct pnfs_block_extent *next = ext_tree_next(be);
+
+			rb_erase(&be->be_node, root);
+			nfs4_put_deviceid_node(be->be_device);
+			kfree(be);
+			be = next;
+		}
+
+		if (be && be->be_f_offset < end) {
+			len1 = ext_f_end(be) - end;
+			be->be_f_offset = end;
+			if (be->be_state != PNFS_BLOCK_NONE_DATA)
+				be->be_v_offset += be->be_length - len1;
+			be->be_length = len1;
+		}
+	}
+
+	return 0;
+}
+
+int
+ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
+{
+	struct pnfs_block_extent *be;
+	struct rb_root *root;
+	int err = 0;
+
+	switch (new->be_state) {
+	case PNFS_BLOCK_READWRITE_DATA:
+	case PNFS_BLOCK_INVALID_DATA:
+		root = &bl->bl_ext_rw;
+		break;
+	case PNFS_BLOCK_READ_DATA:
+	case PNFS_BLOCK_NONE_DATA:
+		root = &bl->bl_ext_ro;
+		break;
+	default:
+		dprintk("invalid extent type\n");
+		return -EINVAL;
+	}
+
+	spin_lock(&bl->bl_ext_lock);
+retry:
+	be = __ext_tree_search(root, new->be_f_offset);
+	if (!be || be->be_f_offset >= ext_f_end(new)) {
+		__ext_tree_insert(root, new, true);
+	} else if (new->be_f_offset >= be->be_f_offset) {
+		if (ext_f_end(new) <= ext_f_end(be)) {
+			nfs4_put_deviceid_node(new->be_device);
+			kfree(new);
+		} else {
+			sector_t new_len = ext_f_end(new) - ext_f_end(be);
+			sector_t diff = new->be_length - new_len;
+
+			new->be_f_offset += diff;
+			new->be_v_offset += diff;
+			new->be_length = new_len;
+			goto retry;
+		}
+	} else if (ext_f_end(new) <= ext_f_end(be)) {
+		new->be_length = be->be_f_offset - new->be_f_offset;
+		__ext_tree_insert(root, new, true);
+	} else {
+		struct pnfs_block_extent *split;
+		sector_t new_len = ext_f_end(new) - ext_f_end(be);
+		sector_t diff = new->be_length - new_len;
+
+		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
+		if (!split) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		split->be_length = be->be_f_offset - split->be_f_offset;
+		split->be_device = nfs4_get_deviceid(new->be_device);
+		__ext_tree_insert(root, split, true);
+
+		new->be_f_offset += diff;
+		new->be_v_offset += diff;
+		new->be_length = new_len;
+		goto retry;
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+static bool
+__ext_tree_lookup(struct rb_root *root, sector_t isect,
+		struct pnfs_block_extent *ret)
+{
+	struct rb_node *node;
+	struct pnfs_block_extent *be;
+
+	node = root->rb_node;
+	while (node) {
+		be = ext_node(node);
+		if (isect < be->be_f_offset)
+			node = node->rb_left;
+		else if (isect >= ext_f_end(be))
+			node = node->rb_right;
+		else {
+			*ret = *be;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+bool
+ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+	    struct pnfs_block_extent *ret, bool rw)
+{
+	bool found = false;
+
+	spin_lock(&bl->bl_ext_lock);
+	if (!rw)
+		found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
+	if (!found)
+		found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
+	spin_unlock(&bl->bl_ext_lock);
+
+	return found;
+}
+
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
+		sector_t start, sector_t end)
+{
+	int err, err2;
+
+	spin_lock(&bl->bl_ext_lock);
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (rw) {
+		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+		if (!err)
+			err = err2;
+	}
+	spin_unlock(&bl->bl_ext_lock);
+
+	return err;
+}
+
+static int
+ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
+		sector_t split)
+{
+	struct pnfs_block_extent *new;
+	sector_t orig_len = be->be_length;
+
+	new = kzalloc(sizeof(*new), GFP_ATOMIC);
+	if (!new)
+		return -ENOMEM;
+
+	be->be_length = split - be->be_f_offset;
+
+	new->be_f_offset = split;
+	if (be->be_state != PNFS_BLOCK_NONE_DATA)
+		new->be_v_offset = be->be_v_offset + be->be_length;
+	new->be_length = orig_len - be->be_length;
+	new->be_state = be->be_state;
+	new->be_tag = be->be_tag;
+	new->be_device = nfs4_get_deviceid(be->be_device);
+
+	__ext_tree_insert(root, new, false);
+	return 0;
+}
+
+int
+ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+		sector_t len)
+{
+	struct rb_root *root = &bl->bl_ext_rw;
+	sector_t end = start + len;
+	struct pnfs_block_extent *be;
+	int err = 0;
+
+	spin_lock(&bl->bl_ext_lock);
+	/*
+	 * First remove all COW extents or holes from written to range.
+	 */
+	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+	if (err)
+		goto out;
+
+	/*
+	 * Then mark all invalid extents in the range as written to.
+	 */
+	for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
+		if (be->be_f_offset >= end)
+			break;
+
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
+			continue;
+
+		if (be->be_f_offset < start) {
+			struct pnfs_block_extent *left = ext_tree_prev(be);
+
+			if (left && ext_can_merge(left, be)) {
+				sector_t diff = start - be->be_f_offset;
+
+				left->be_length += diff;
+
+				be->be_f_offset += diff;
+				be->be_v_offset += diff;
+				be->be_length -= diff;
+			} else {
+				err = ext_tree_split(root, be, start);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (ext_f_end(be) > end) {
+			struct pnfs_block_extent *right = ext_tree_next(be);
+
+			if (right && ext_can_merge(be, right)) {
+				sector_t diff = end - be->be_f_offset;
+
+				be->be_length -= diff;
+
+				right->be_f_offset -= diff;
+				right->be_v_offset -= diff;
+				right->be_length += diff;
+			} else {
+				err = ext_tree_split(root, be, end);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (be->be_f_offset >= start && ext_f_end(be) <= end) {
+			be->be_tag = EXTENT_WRITTEN;
+			be = ext_try_to_merge_left(root, be);
+			be = ext_try_to_merge_right(root, be);
+		}
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
+		size_t buffer_size)
+{
+	if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
+		int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
+
+		for (i = 0; i < nr_pages; i++)
+			put_page(arg->layoutupdate_pages[i]);
+		kfree(arg->layoutupdate_pages);
+	} else {
+		put_page(arg->layoutupdate_page);
+	}
+}
+
+static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+		size_t buffer_size, size_t *count)
+{
+	struct pnfs_block_extent *be;
+	int ret = 0;
+
+	spin_lock(&bl->bl_ext_lock);
+	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_WRITTEN)
+			continue;
+
+		(*count)++;
+		if (*count * BL_EXTENT_SIZE > buffer_size) {
+			/* keep counting.. */
+			ret = -ENOSPC;
+			continue;
+		}
+
+		p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+				NFS4_DEVICEID4_SIZE);
+		p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+		p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+		p = xdr_encode_hyper(p, 0LL);
+		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+
+		be->be_tag = EXTENT_COMMITTING;
+	}
+	spin_unlock(&bl->bl_ext_lock);
+
+	return ret;
+}
+
+int
+ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+	size_t count = 0, buffer_size = PAGE_SIZE;
+	__be32 *start_p;
+	int ret;
+
+	dprintk("%s enter\n", __func__);
+
+	arg->layoutupdate_page = alloc_page(GFP_NOFS);
+	if (!arg->layoutupdate_page)
+		return -ENOMEM;
+	start_p = page_address(arg->layoutupdate_page);
+	arg->layoutupdate_pages = &arg->layoutupdate_page;
+
+retry:
+	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+	if (unlikely(ret)) {
+		ext_tree_free_commitdata(arg, buffer_size);
+
+		buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+		count = 0;
+
+		arg->layoutupdate_pages =
+			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
+				sizeof(struct page *), GFP_NOFS);
+		if (!arg->layoutupdate_pages)
+			return -ENOMEM;
+
+		start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
+		if (!start_p) {
+			kfree(arg->layoutupdate_pages);
+			return -ENOMEM;
+		}
+
+		goto retry;
+	}
+
+	*start_p = cpu_to_be32(count);
+	arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+
+	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
+		__be32 *p = start_p;
+		int i = 0;
+
+		for (p = start_p;
+		     p < start_p + arg->layoutupdate_len;
+		     p += PAGE_SIZE) {
+			arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+		}
+	}
+
+	dprintk("%s found %zu ranges\n", __func__, count);
+	return 0;
+}
+
+void
+ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+	struct rb_root *root = &bl->bl_ext_rw;
+	struct pnfs_block_extent *be;
+
+	dprintk("%s status %d\n", __func__, status);
+
+	ext_tree_free_commitdata(arg, arg->layoutupdate_len);
+
+	spin_lock(&bl->bl_ext_lock);
+	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_COMMITTING)
+			continue;
+
+		if (status) {
+			/*
+			 * Mark as written and try again.
+			 *
+			 * XXX: some real error handling here wouldn't hurt..
+			 */
+			be->be_tag = EXTENT_WRITTEN;
+		} else {
+			be->be_state = PNFS_BLOCK_READWRITE_DATA;
+			be->be_tag = 0;
+		}
+
+		be = ext_try_to_merge_left(root, be);
+		be = ext_try_to_merge_right(root, be);
+	}
+	spin_unlock(&bl->bl_ext_lock);
+}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d0161442565..000000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
-/*
- *  linux/fs/nfs/blocklayout/blocklayout.h
- *
- *  Module for the NFSv4.1 pNFS block layout driver.
- *
- *  Copyright (c) 2006 The Regents of the University of Michigan.
- *  All rights reserved.
- *
- *  Andy Adamson <andros@citi.umich.edu>
- *  Fred Isaman <iisaman@umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization.  if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose.  the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include "blocklayout.h"
-#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
-
-/* Bit numbers */
-#define EXTENT_INITIALIZED 0
-#define EXTENT_WRITTEN     1
-#define EXTENT_IN_COMMIT   2
-#define INTERNAL_EXISTS    MY_MAX_TAGS
-#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1)
-
-/* Returns largest t<=s s.t. t%base==0 */
-static inline sector_t normalize(sector_t s, int base)
-{
-	sector_t tmp = s; /* Since do_div modifies its argument */
-	return s - sector_div(tmp, base);
-}
-
-static inline sector_t normalize_up(sector_t s, int base)
-{
-	return normalize(s + base - 1, base);
-}
-
-/* Complete stub using list while determine API wanted */
-
-/* Returns tags, or negative */
-static int32_t _find_entry(struct my_tree *tree, u64 s)
-{
-	struct pnfs_inval_tracking *pos;
-
-	dprintk("%s(%llu) enter\n", __func__, s);
-	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
-		if (pos->it_sector > s)
-			continue;
-		else if (pos->it_sector == s)
-			return pos->it_tags & INTERNAL_MASK;
-		else
-			break;
-	}
-	return -ENOENT;
-}
-
-static inline
-int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
-{
-	int32_t tags;
-
-	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
-	s = normalize(s, tree->mtt_step_size);
-	tags = _find_entry(tree, s);
-	if ((tags < 0) || !(tags & (1 << tag)))
-		return 0;
-	else
-		return 1;
-}
-
-/* Creates entry with tag, or if entry already exists, unions tag to it.
- * If storage is not NULL, newly created entry will use it.
- * Returns number of entries added, or negative on error.
- */
-static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
-		      struct pnfs_inval_tracking *storage)
-{
-	int found = 0;
-	struct pnfs_inval_tracking *pos;
-
-	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
-	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
-		if (pos->it_sector > s)
-			continue;
-		else if (pos->it_sector == s) {
-			found = 1;
-			break;
-		} else
-			break;
-	}
-	if (found) {
-		pos->it_tags |= (1 << tag);
-		return 0;
-	} else {
-		struct pnfs_inval_tracking *new;
-		new = storage;
-		new->it_sector = s;
-		new->it_tags = (1 << tag);
-		list_add(&new->it_link, &pos->it_link);
-		return 1;
-	}
-}
-
-/* XXXX Really want option to not create */
-/* Over range, unions tag with existing entries, else creates entry with tag */
-static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
-{
-	u64 i;
-
-	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
-	for (i = normalize(s, tree->mtt_step_size); i < s + length;
-	     i += tree->mtt_step_size)
-		if (_add_entry(tree, i, tag, NULL))
-			return -ENOMEM;
-	return 0;
-}
-
-/* Ensure that future operations on given range of tree will not malloc */
-static int _preload_range(struct pnfs_inval_markings *marks,
-		u64 offset, u64 length)
-{
-	u64 start, end, s;
-	int count, i, used = 0, status = -ENOMEM;
-	struct pnfs_inval_tracking **storage;
-	struct my_tree  *tree = &marks->im_tree;
-
-	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
-	start = normalize(offset, tree->mtt_step_size);
-	end = normalize_up(offset + length, tree->mtt_step_size);
-	count = (int)(end - start) / (int)tree->mtt_step_size;
-
-	/* Pre-malloc what memory we might need */
-	storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
-	if (!storage)
-		return -ENOMEM;
-	for (i = 0; i < count; i++) {
-		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
-				     GFP_NOFS);
-		if (!storage[i])
-			goto out_cleanup;
-	}
-
-	spin_lock_bh(&marks->im_lock);
-	for (s = start; s < end; s += tree->mtt_step_size)
-		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
-	spin_unlock_bh(&marks->im_lock);
-
-	status = 0;
-
- out_cleanup:
-	for (i = used; i < count; i++) {
-		if (!storage[i])
-			break;
-		kfree(storage[i]);
-	}
-	kfree(storage);
-	return status;
-}
-
-/* We are relying on page lock to serialize this */
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
-{
-	int rv;
-
-	spin_lock_bh(&marks->im_lock);
-	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
-	spin_unlock_bh(&marks->im_lock);
-	return rv;
-}
-
-/* Assume start, end already sector aligned */
-static int
-_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
-{
-	struct pnfs_inval_tracking *pos;
-	u64 expect = 0;
-
-	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
-	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
-		if (pos->it_sector >= end)
-			continue;
-		if (!expect) {
-			if ((pos->it_sector == end - tree->mtt_step_size) &&
-			    (pos->it_tags & (1 << tag))) {
-				expect = pos->it_sector - tree->mtt_step_size;
-				if (pos->it_sector < tree->mtt_step_size || expect < start)
-					return 1;
-				continue;
-			} else {
-				return 0;
-			}
-		}
-		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
-			return 0;
-		expect -= tree->mtt_step_size;
-		if (expect < start)
-			return 1;
-	}
-	return 0;
-}
-
-static int is_range_written(struct pnfs_inval_markings *marks,
-			    sector_t start, sector_t end)
-{
-	int rv;
-
-	spin_lock_bh(&marks->im_lock);
-	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
-	spin_unlock_bh(&marks->im_lock);
-	return rv;
-}
-
-/* Marks sectors in [offest, offset_length) as having been initialized.
- * All lengths are step-aligned, where step is min(pagesize, blocksize).
- * Currently assumes offset is page-aligned
- */
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
-			     sector_t offset, sector_t length)
-{
-	sector_t start, end;
-
-	dprintk("%s(offset=%llu,len=%llu) enter\n",
-		__func__, (u64)offset, (u64)length);
-
-	start = normalize(offset, marks->im_block_size);
-	end = normalize_up(offset + length, marks->im_block_size);
-	if (_preload_range(marks, start, end - start))
-		goto outerr;
-
-	spin_lock_bh(&marks->im_lock);
-	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
-		goto out_unlock;
-	spin_unlock_bh(&marks->im_lock);
-
-	return 0;
-
-out_unlock:
-	spin_unlock_bh(&marks->im_lock);
-outerr:
-	return -ENOMEM;
-}
-
-/* Marks sectors in [offest, offset+length) as having been written to disk.
- * All lengths should be block aligned.
- */
-static int mark_written_sectors(struct pnfs_inval_markings *marks,
-				sector_t offset, sector_t length)
-{
-	int status;
-
-	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
-		(u64)offset, (u64)length);
-	spin_lock_bh(&marks->im_lock);
-	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
-	spin_unlock_bh(&marks->im_lock);
-	return status;
-}
-
-static void print_short_extent(struct pnfs_block_short_extent *be)
-{
-	dprintk("PRINT SHORT EXTENT extent %p\n", be);
-	if (be) {
-		dprintk("        be_f_offset %llu\n", (u64)be->bse_f_offset);
-		dprintk("        be_length   %llu\n", (u64)be->bse_length);
-	}
-}
-
-static void print_clist(struct list_head *list, unsigned int count)
-{
-	struct pnfs_block_short_extent *be;
-	unsigned int i = 0;
-
-	ifdebug(FACILITY) {
-		printk(KERN_DEBUG "****************\n");
-		printk(KERN_DEBUG "Extent list looks like:\n");
-		list_for_each_entry(be, list, bse_node) {
-			i++;
-			print_short_extent(be);
-		}
-		if (i != count)
-			printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
-		printk(KERN_DEBUG "****************\n");
-	}
-}
-
-/* Note: In theory, we should do more checking that devid's match between
- * old and new, but if they don't, the lists are too corrupt to salvage anyway.
- */
-/* Note this is very similar to bl_add_merge_extent */
-static void add_to_commitlist(struct pnfs_block_layout *bl,
-			      struct pnfs_block_short_extent *new)
-{
-	struct list_head *clist = &bl->bl_commit;
-	struct pnfs_block_short_extent *old, *save;
-	sector_t end = new->bse_f_offset + new->bse_length;
-
-	dprintk("%s enter\n", __func__);
-	print_short_extent(new);
-	print_clist(clist, bl->bl_count);
-	bl->bl_count++;
-	/* Scan for proper place to insert, extending new to the left
-	 * as much as possible.
-	 */
-	list_for_each_entry_safe(old, save, clist, bse_node) {
-		if (new->bse_f_offset < old->bse_f_offset)
-			break;
-		if (end <= old->bse_f_offset + old->bse_length) {
-			/* Range is already in list */
-			bl->bl_count--;
-			kfree(new);
-			return;
-		} else if (new->bse_f_offset <=
-				old->bse_f_offset + old->bse_length) {
-			/* new overlaps or abuts existing be */
-			if (new->bse_mdev == old->bse_mdev) {
-				/* extend new to fully replace old */
-				new->bse_length += new->bse_f_offset -
-						old->bse_f_offset;
-				new->bse_f_offset = old->bse_f_offset;
-				list_del(&old->bse_node);
-				bl->bl_count--;
-				kfree(old);
-			}
-		}
-	}
-	/* Note that if we never hit the above break, old will not point to a
-	 * valid extent.  However, in that case &old->bse_node==list.
-	 */
-	list_add_tail(&new->bse_node, &old->bse_node);
-	/* Scan forward for overlaps.  If we find any, extend new and
-	 * remove the overlapped extent.
-	 */
-	old = list_prepare_entry(new, clist, bse_node);
-	list_for_each_entry_safe_continue(old, save, clist, bse_node) {
-		if (end < old->bse_f_offset)
-			break;
-		/* new overlaps or abuts old */
-		if (new->bse_mdev == old->bse_mdev) {
-			if (end < old->bse_f_offset + old->bse_length) {
-				/* extend new to fully cover old */
-				end = old->bse_f_offset + old->bse_length;
-				new->bse_length = end - new->bse_f_offset;
-			}
-			list_del(&old->bse_node);
-			bl->bl_count--;
-			kfree(old);
-		}
-	}
-	dprintk("%s: after merging\n", __func__);
-	print_clist(clist, bl->bl_count);
-}
-
-/* Note the range described by offset, length is guaranteed to be contained
- * within be.
- * new will be freed, either by this function or add_to_commitlist if they
- * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
- */
-int bl_mark_for_commit(struct pnfs_block_extent *be,
-		    sector_t offset, sector_t length,
-		    struct pnfs_block_short_extent *new)
-{
-	sector_t new_end, end = offset + length;
-	struct pnfs_block_layout *bl = container_of(be->be_inval,
-						    struct pnfs_block_layout,
-						    bl_inval);
-
-	mark_written_sectors(be->be_inval, offset, length);
-	/* We want to add the range to commit list, but it must be
-	 * block-normalized, and verified that the normalized range has
-	 * been entirely written to disk.
-	 */
-	new->bse_f_offset = offset;
-	offset = normalize(offset, bl->bl_blocksize);
-	if (offset < new->bse_f_offset) {
-		if (is_range_written(be->be_inval, offset, new->bse_f_offset))
-			new->bse_f_offset = offset;
-		else
-			new->bse_f_offset = offset + bl->bl_blocksize;
-	}
-	new_end = normalize_up(end, bl->bl_blocksize);
-	if (end < new_end) {
-		if (is_range_written(be->be_inval, end, new_end))
-			end = new_end;
-		else
-			end = new_end - bl->bl_blocksize;
-	}
-	if (end <= new->bse_f_offset) {
-		kfree(new);
-		return 0;
-	}
-	new->bse_length = end - new->bse_f_offset;
-	new->bse_devid = be->be_devid;
-	new->bse_mdev = be->be_mdev;
-
-	spin_lock(&bl->bl_ext_lock);
-	add_to_commitlist(bl, new);
-	spin_unlock(&bl->bl_ext_lock);
-	return 0;
-}
-
-static void print_bl_extent(struct pnfs_block_extent *be)
-{
-	dprintk("PRINT EXTENT extent %p\n", be);
-	if (be) {
-		dprintk("        be_f_offset %llu\n", (u64)be->be_f_offset);
-		dprintk("        be_length   %llu\n", (u64)be->be_length);
-		dprintk("        be_v_offset %llu\n", (u64)be->be_v_offset);
-		dprintk("        be_state    %d\n", be->be_state);
-	}
-}
-
-static void
-destroy_extent(struct kref *kref)
-{
-	struct pnfs_block_extent *be;
-
-	be = container_of(kref, struct pnfs_block_extent, be_refcnt);
-	dprintk("%s be=%p\n", __func__, be);
-	kfree(be);
-}
-
-void
-bl_put_extent(struct pnfs_block_extent *be)
-{
-	if (be) {
-		dprintk("%s enter %p (%i)\n", __func__, be,
-			atomic_read(&be->be_refcnt.refcount));
-		kref_put(&be->be_refcnt, destroy_extent);
-	}
-}
-
-struct pnfs_block_extent *bl_alloc_extent(void)
-{
-	struct pnfs_block_extent *be;
-
-	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
-	if (!be)
-		return NULL;
-	INIT_LIST_HEAD(&be->be_node);
-	kref_init(&be->be_refcnt);
-	be->be_inval = NULL;
-	return be;
-}
-
-static void print_elist(struct list_head *list)
-{
-	struct pnfs_block_extent *be;
-	dprintk("****************\n");
-	dprintk("Extent list looks like:\n");
-	list_for_each_entry(be, list, be_node) {
-		print_bl_extent(be);
-	}
-	dprintk("****************\n");
-}
-
-static inline int
-extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
-{
-	/* Note this assumes new->be_f_offset >= old->be_f_offset */
-	return (new->be_state == old->be_state) &&
-		((new->be_state == PNFS_BLOCK_NONE_DATA) ||
-		 ((new->be_v_offset - old->be_v_offset ==
-		   new->be_f_offset - old->be_f_offset) &&
-		  new->be_mdev == old->be_mdev));
-}
-
-/* Adds new to appropriate list in bl, modifying new and removing existing
- * extents as appropriate to deal with overlaps.
- *
- * See bl_find_get_extent for list constraints.
- *
- * Refcount on new is already set.  If end up not using it, or error out,
- * need to put the reference.
- *
- * bl->bl_ext_lock is held by caller.
- */
-int
-bl_add_merge_extent(struct pnfs_block_layout *bl,
-		     struct pnfs_block_extent *new)
-{
-	struct pnfs_block_extent *be, *tmp;
-	sector_t end = new->be_f_offset + new->be_length;
-	struct list_head *list;
-
-	dprintk("%s enter with be=%p\n", __func__, new);
-	print_bl_extent(new);
-	list = &bl->bl_extents[bl_choose_list(new->be_state)];
-	print_elist(list);
-
-	/* Scan for proper place to insert, extending new to the left
-	 * as much as possible.
-	 */
-	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
-		if (new->be_f_offset >= be->be_f_offset + be->be_length)
-			break;
-		if (new->be_f_offset >= be->be_f_offset) {
-			if (end <= be->be_f_offset + be->be_length) {
-				/* new is a subset of existing be*/
-				if (extents_consistent(be, new)) {
-					dprintk("%s: new is subset, ignoring\n",
-						__func__);
-					bl_put_extent(new);
-					return 0;
-				} else {
-					goto out_err;
-				}
-			} else {
-				/* |<--   be   -->|
-				 *          |<--   new   -->| */
-				if (extents_consistent(be, new)) {
-					/* extend new to fully replace be */
-					new->be_length += new->be_f_offset -
-						be->be_f_offset;
-					new->be_f_offset = be->be_f_offset;
-					new->be_v_offset = be->be_v_offset;
-					dprintk("%s: removing %p\n", __func__, be);
-					list_del(&be->be_node);
-					bl_put_extent(be);
-				} else {
-					goto out_err;
-				}
-			}
-		} else if (end >= be->be_f_offset + be->be_length) {
-			/* new extent overlap existing be */
-			if (extents_consistent(be, new)) {
-				/* extend new to fully replace be */
-				dprintk("%s: removing %p\n", __func__, be);
-				list_del(&be->be_node);
-				bl_put_extent(be);
-			} else {
-				goto out_err;
-			}
-		} else if (end > be->be_f_offset) {
-			/*           |<--   be   -->|
-			 *|<--   new   -->| */
-			if (extents_consistent(new, be)) {
-				/* extend new to fully replace be */
-				new->be_length += be->be_f_offset + be->be_length -
-					new->be_f_offset - new->be_length;
-				dprintk("%s: removing %p\n", __func__, be);
-				list_del(&be->be_node);
-				bl_put_extent(be);
-			} else {
-				goto out_err;
-			}
-		}
-	}
-	/* Note that if we never hit the above break, be will not point to a
-	 * valid extent.  However, in that case &be->be_node==list.
-	 */
-	list_add(&new->be_node, &be->be_node);
-	dprintk("%s: inserting new\n", __func__);
-	print_elist(list);
-	/* FIXME - The per-list consistency checks have all been done,
-	 * should now check cross-list consistency.
-	 */
-	return 0;
-
- out_err:
-	bl_put_extent(new);
-	return -EIO;
-}
-
-/* Returns extent, or NULL.  If a second READ extent exists, it is returned
- * in cow_read, if given.
- *
- * The extents are kept in two seperate ordered lists, one for READ and NONE,
- * one for READWRITE and INVALID.  Within each list, we assume:
- * 1. Extents are ordered by file offset.
- * 2. For any given isect, there is at most one extents that matches.
- */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
-	    struct pnfs_block_extent **cow_read)
-{
-	struct pnfs_block_extent *be, *cow, *ret;
-	int i;
-
-	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
-	cow = ret = NULL;
-	spin_lock(&bl->bl_ext_lock);
-	for (i = 0; i < EXTENT_LISTS; i++) {
-		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
-			if (isect >= be->be_f_offset + be->be_length)
-				break;
-			if (isect >= be->be_f_offset) {
-				/* We have found an extent */
-				dprintk("%s Get %p (%i)\n", __func__, be,
-					atomic_read(&be->be_refcnt.refcount));
-				kref_get(&be->be_refcnt);
-				if (!ret)
-					ret = be;
-				else if (be->be_state != PNFS_BLOCK_READ_DATA)
-					bl_put_extent(be);
-				else
-					cow = be;
-				break;
-			}
-		}
-		if (ret &&
-		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
-			break;
-	}
-	spin_unlock(&bl->bl_ext_lock);
-	if (cow_read)
-		*cow_read = cow;
-	print_bl_extent(ret);
-	return ret;
-}
-
-/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
-static struct pnfs_block_extent *
-bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
-{
-	struct pnfs_block_extent *be, *ret = NULL;
-	int i;
-
-	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
-	for (i = 0; i < EXTENT_LISTS; i++) {
-		if (ret)
-			break;
-		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
-			if (isect >= be->be_f_offset + be->be_length)
-				break;
-			if (isect >= be->be_f_offset) {
-				/* We have found an extent */
-				dprintk("%s Get %p (%i)\n", __func__, be,
-					atomic_read(&be->be_refcnt.refcount));
-				kref_get(&be->be_refcnt);
-				ret = be;
-				break;
-			}
-		}
-	}
-	print_bl_extent(ret);
-	return ret;
-}
-
-int
-encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-			       struct xdr_stream *xdr,
-			       const struct nfs4_layoutcommit_args *arg)
-{
-	struct pnfs_block_short_extent *lce, *save;
-	unsigned int count = 0;
-	__be32 *p, *xdr_start;
-
-	dprintk("%s enter\n", __func__);
-	/* BUG - creation of bl_commit is buggy - need to wait for
-	 * entire block to be marked WRITTEN before it can be added.
-	 */
-	spin_lock(&bl->bl_ext_lock);
-	/* Want to adjust for possible truncate */
-	/* We now want to adjust argument range */
-
-	/* XDR encode the ranges found */
-	xdr_start = xdr_reserve_space(xdr, 8);
-	if (!xdr_start)
-		goto out;
-	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
-		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
-		if (!p)
-			break;
-		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
-		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
-		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
-		p = xdr_encode_hyper(p, 0LL);
-		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
-		list_move_tail(&lce->bse_node, &bl->bl_committing);
-		bl->bl_count--;
-		count++;
-	}
-	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
-	xdr_start[1] = cpu_to_be32(count);
-out:
-	spin_unlock(&bl->bl_ext_lock);
-	dprintk("%s found %i ranges\n", __func__, count);
-	return 0;
-}
-
-/* Helper function to set_to_rw that initialize a new extent */
-static void
-_prep_new_extent(struct pnfs_block_extent *new,
-		 struct pnfs_block_extent *orig,
-		 sector_t offset, sector_t length, int state)
-{
-	kref_init(&new->be_refcnt);
-	/* don't need to INIT_LIST_HEAD(&new->be_node) */
-	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
-	new->be_mdev = orig->be_mdev;
-	new->be_f_offset = offset;
-	new->be_length = length;
-	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
-	new->be_state = state;
-	new->be_inval = orig->be_inval;
-}
-
-/* Tries to merge be with extent in front of it in list.
- * Frees storage if not used.
- */
-static struct pnfs_block_extent *
-_front_merge(struct pnfs_block_extent *be, struct list_head *head,
-	     struct pnfs_block_extent *storage)
-{
-	struct pnfs_block_extent *prev;
-
-	if (!storage)
-		goto no_merge;
-	if (&be->be_node == head || be->be_node.prev == head)
-		goto no_merge;
-	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
-	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
-	    !extents_consistent(prev, be))
-		goto no_merge;
-	_prep_new_extent(storage, prev, prev->be_f_offset,
-			 prev->be_length + be->be_length, prev->be_state);
-	list_replace(&prev->be_node, &storage->be_node);
-	bl_put_extent(prev);
-	list_del(&be->be_node);
-	bl_put_extent(be);
-	return storage;
-
- no_merge:
-	kfree(storage);
-	return be;
-}
-
-static u64
-set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
-{
-	u64 rv = offset + length;
-	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
-	struct pnfs_block_extent *children[3];
-	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
-	int i = 0, j;
-
-	dprintk("%s(%llu, %llu)\n", __func__, offset, length);
-	/* Create storage for up to three new extents e1, e2, e3 */
-	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
-	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
-	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
-	/* BUG - we are ignoring any failure */
-	if (!e1 || !e2 || !e3)
-		goto out_nosplit;
-
-	spin_lock(&bl->bl_ext_lock);
-	be = bl_find_get_extent_locked(bl, offset);
-	rv = be->be_f_offset + be->be_length;
-	if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
-		spin_unlock(&bl->bl_ext_lock);
-		goto out_nosplit;
-	}
-	/* Add e* to children, bumping e*'s krefs */
-	if (be->be_f_offset != offset) {
-		_prep_new_extent(e1, be, be->be_f_offset,
-				 offset - be->be_f_offset,
-				 PNFS_BLOCK_INVALID_DATA);
-		children[i++] = e1;
-		print_bl_extent(e1);
-	} else
-		merge1 = e1;
-	_prep_new_extent(e2, be, offset,
-			 min(length, be->be_f_offset + be->be_length - offset),
-			 PNFS_BLOCK_READWRITE_DATA);
-	children[i++] = e2;
-	print_bl_extent(e2);
-	if (offset + length < be->be_f_offset + be->be_length) {
-		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
-				 be->be_f_offset + be->be_length -
-				 offset - length,
-				 PNFS_BLOCK_INVALID_DATA);
-		children[i++] = e3;
-		print_bl_extent(e3);
-	} else
-		merge2 = e3;
-
-	/* Remove be from list, and insert the e* */
-	/* We don't get refs on e*, since this list is the base reference
-	 * set when init'ed.
-	 */
-	if (i < 3)
-		children[i] = NULL;
-	new = children[0];
-	list_replace(&be->be_node, &new->be_node);
-	bl_put_extent(be);
-	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
-	for (j = 1; j < i; j++) {
-		old = new;
-		new = children[j];
-		list_add(&new->be_node, &old->be_node);
-	}
-	if (merge2) {
-		/* This is a HACK, should just create a _back_merge function */
-		new = list_entry(new->be_node.next,
-				 struct pnfs_block_extent, be_node);
-		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
-	}
-	spin_unlock(&bl->bl_ext_lock);
-
-	/* Since we removed the base reference above, be is now scheduled for
-	 * destruction.
-	 */
-	bl_put_extent(be);
-	dprintk("%s returns %llu after split\n", __func__, rv);
-	return rv;
-
- out_nosplit:
-	kfree(e1);
-	kfree(e2);
-	kfree(e3);
-	dprintk("%s returns %llu without splitting\n", __func__, rv);
-	return rv;
-}
-
-void
-clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-			      const struct nfs4_layoutcommit_args *arg,
-			      int status)
-{
-	struct pnfs_block_short_extent *lce, *save;
-
-	dprintk("%s status %d\n", __func__, status);
-	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
-		if (likely(!status)) {
-			u64 offset = lce->bse_f_offset;
-			u64 end = offset + lce->bse_length;
-
-			do {
-				offset = set_to_rw(bl, offset, end - offset);
-			} while (offset < end);
-			list_del(&lce->bse_node);
-
-			kfree(lce);
-		} else {
-			list_del(&lce->bse_node);
-			spin_lock(&bl->bl_ext_lock);
-			add_to_commitlist(bl, lce);
-			spin_unlock(&bl->bl_ext_lock);
-		}
-	}
-}
-
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
-{
-	struct pnfs_block_short_extent *new;
-
-	new = kmalloc(sizeof(*new), GFP_NOFS);
-	if (unlikely(!new))
-		return -ENOMEM;
-
-	spin_lock_bh(&marks->im_lock);
-	list_add(&new->bse_node, &marks->im_extents);
-	spin_unlock_bh(&marks->im_lock);
-
-	return 0;
-}
-
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
-{
-	struct pnfs_block_short_extent *rv = NULL;
-
-	spin_lock_bh(&marks->im_lock);
-	if (!list_empty(&marks->im_extents)) {
-		rv = list_entry((&marks->im_extents)->next,
-				struct pnfs_block_short_extent, bse_node);
-		list_del_init(&rv->bse_node);
-	}
-	spin_unlock_bh(&marks->im_lock);
-
-	return rv;
-}
-
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
-{
-	struct pnfs_block_short_extent *se = NULL, *tmp;
-
-	if (num_to_free <= 0)
-		return;
-
-	spin_lock(&marks->im_lock);
-	list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
-		list_del(&se->bse_node);
-		kfree(se);
-		if (--num_to_free == 0)
-			break;
-	}
-	spin_unlock(&marks->im_lock);
-
-	BUG_ON(num_to_free > 0);
-}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000000..8d04bda2bd2e
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,285 @@
+/*
+ *  Copyright (c) 2006,2007 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+static void
+nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
+{
+	int i;
+
+	*p++ = cpu_to_be32(1);
+	*p++ = cpu_to_be32(b->type);
+	*p++ = cpu_to_be32(b->simple.nr_sigs);
+	for (i = 0; i < b->simple.nr_sigs; i++) {
+		p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
+		p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
+					 b->simple.sigs[i].sig_len);
+	}
+}
+
+dev_t
+bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
+		gfp_t gfp_mask)
+{
+	struct net *net = server->nfs_client->cl_net;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct bl_dev_msg *reply = &nn->bl_mount_reply;
+	struct bl_pipe_msg bl_pipe_msg;
+	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+	struct bl_msg_hdr *bl_msg;
+	DECLARE_WAITQUEUE(wq, current);
+	dev_t dev = 0;
+	int rc;
+
+	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+
+	bl_pipe_msg.bl_wq = &nn->bl_wq;
+
+	b->simple.len += 4;	/* single volume */
+	if (b->simple.len > PAGE_SIZE)
+		return -EIO;
+
+	memset(msg, 0, sizeof(*msg));
+	msg->len = sizeof(*bl_msg) + b->simple.len;
+	msg->data = kzalloc(msg->len, gfp_mask);
+	if (!msg->data)
+		goto out;
+
+	bl_msg = msg->data;
+	bl_msg->type = BL_DEVICE_MOUNT,
+	bl_msg->totallen = b->simple.len;
+	nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
+
+	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+	add_wait_queue(&nn->bl_wq, &wq);
+	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
+	if (rc < 0) {
+		remove_wait_queue(&nn->bl_wq, &wq);
+		goto out;
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&nn->bl_wq, &wq);
+
+	if (reply->status != BL_DEVICE_REQUEST_PROC) {
+		printk(KERN_WARNING "%s failed to decode device: %d\n",
+			__func__, reply->status);
+		goto out;
+	}
+
+	dev = MKDEV(reply->major, reply->minor);
+out:
+	kfree(msg->data);
+	return dev;
+}
+
+static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+			 size_t mlen)
+{
+	struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
+					 nfs_net_id);
+
+	if (mlen != sizeof (struct bl_dev_msg))
+		return -EINVAL;
+
+	if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
+		return -EFAULT;
+
+	wake_up(&nn->bl_wq);
+
+	return mlen;
+}
+
+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct bl_pipe_msg *bl_pipe_msg =
+		container_of(msg, struct bl_pipe_msg, msg);
+
+	if (msg->errno >= 0)
+		return;
+	wake_up(bl_pipe_msg->bl_wq);
+}
+
+static const struct rpc_pipe_ops bl_upcall_ops = {
+	.upcall		= rpc_pipe_generic_upcall,
+	.downcall	= bl_pipe_downcall,
+	.destroy_msg	= bl_pipe_destroy_msg,
+};
+
+static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+					    struct rpc_pipe *pipe)
+{
+	struct dentry *dir, *dentry;
+
+	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
+	if (dir == NULL)
+		return ERR_PTR(-ENOENT);
+	dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+	dput(dir);
+	return dentry;
+}
+
+static void nfs4blocklayout_unregister_sb(struct super_block *sb,
+					  struct rpc_pipe *pipe)
+{
+	if (pipe->dentry)
+		rpc_unlink(pipe->dentry);
+}
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+			   void *ptr)
+{
+	struct super_block *sb = ptr;
+	struct net *net = sb->s_fs_info;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct dentry *dentry;
+	int ret = 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return 0;
+
+	if (nn->bl_device_pipe == NULL) {
+		module_put(THIS_MODULE);
+		return 0;
+	}
+
+	switch (event) {
+	case RPC_PIPEFS_MOUNT:
+		dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
+		if (IS_ERR(dentry)) {
+			ret = PTR_ERR(dentry);
+			break;
+		}
+		nn->bl_device_pipe->dentry = dentry;
+		break;
+	case RPC_PIPEFS_UMOUNT:
+		if (nn->bl_device_pipe->dentry)
+			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+		break;
+	default:
+		ret = -ENOTSUPP;
+		break;
+	}
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+static struct notifier_block nfs4blocklayout_block = {
+	.notifier_call = rpc_pipefs_event,
+};
+
+static struct dentry *nfs4blocklayout_register_net(struct net *net,
+						   struct rpc_pipe *pipe)
+{
+	struct super_block *pipefs_sb;
+	struct dentry *dentry;
+
+	pipefs_sb = rpc_get_sb_net(net);
+	if (!pipefs_sb)
+		return NULL;
+	dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+	rpc_put_sb_net(net);
+	return dentry;
+}
+
+static void nfs4blocklayout_unregister_net(struct net *net,
+					   struct rpc_pipe *pipe)
+{
+	struct super_block *pipefs_sb;
+
+	pipefs_sb = rpc_get_sb_net(net);
+	if (pipefs_sb) {
+		nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+		rpc_put_sb_net(net);
+	}
+}
+
+static int nfs4blocklayout_net_init(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct dentry *dentry;
+
+	init_waitqueue_head(&nn->bl_wq);
+	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
+	if (IS_ERR(nn->bl_device_pipe))
+		return PTR_ERR(nn->bl_device_pipe);
+	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+	if (IS_ERR(dentry)) {
+		rpc_destroy_pipe_data(nn->bl_device_pipe);
+		return PTR_ERR(dentry);
+	}
+	nn->bl_device_pipe->dentry = dentry;
+	return 0;
+}
+
+static void nfs4blocklayout_net_exit(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
+	rpc_destroy_pipe_data(nn->bl_device_pipe);
+	nn->bl_device_pipe = NULL;
+}
+
+static struct pernet_operations nfs4blocklayout_net_ops = {
+	.init = nfs4blocklayout_net_init,
+	.exit = nfs4blocklayout_net_exit,
+};
+
+int __init bl_init_pipefs(void)
+{
+	int ret;
+
+	ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+	if (ret)
+		goto out;
+	ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
+	if (ret)
+		goto out_unregister_notifier;
+	return 0;
+
+out_unregister_notifier:
+	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+out:
+	return ret;
+}
+
+void __exit bl_cleanup_pipefs(void)
+{
+	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 54de482143cc..b8fb3a4ef649 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -235,7 +235,7 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 
 	cb_info->serv = serv;
 	cb_info->rqst = rqstp;
-	cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+	cb_info->task = kthread_create(callback_svc, cb_info->rqst,
 				    "nfsv4.%u-svc", minorversion);
 	if (IS_ERR(cb_info->task)) {
 		ret = PTR_ERR(cb_info->task);
@@ -244,6 +244,8 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 		cb_info->task = NULL;
 		return ret;
 	}
+	rqstp->rq_task = cb_info->task;
+	wake_up_process(cb_info->task);
 	dprintk("nfs_callback_up: service started\n");
 	return 0;
 }
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 41db5258e7a7..73466b934090 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 		goto out;
 
 	ino = lo->plh_inode;
+
+	spin_lock(&ino->i_lock);
+	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+	spin_unlock(&ino->i_lock);
+
+	pnfs_layoutcommit_inode(ino, false);
+
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 	    pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
-					&args->cbl_range))
+					&args->cbl_range)) {
 		rv = NFS4ERR_DELAY;
-	else
-		rv = NFS4ERR_NOMATCHING_LAYOUT;
-	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+		goto unlock;
+	}
+
+	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+			&args->cbl_range);
+	}
+unlock:
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
 	pnfs_put_layout_hdr(lo);
@@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
 		}
 
 	found:
-		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
-			dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
-				"deleting instead\n", __func__);
 		nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
 	}
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 6a4f3666e273..f9f4845db989 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1252,6 +1252,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
  * set up the iterator to start reading from the server list and return the first item
  */
 static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+				__acquires(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
@@ -1274,6 +1275,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
  * clean up after reading from the transports list
  */
 static void nfs_server_list_stop(struct seq_file *p, void *v)
+				__releases(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
@@ -1318,7 +1320,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
  */
 static int nfs_volume_list_open(struct inode *inode, struct file *file)
 {
-	return seq_open_net(inode, file, &nfs_server_list_ops,
+	return seq_open_net(inode, file, &nfs_volume_list_ops,
 			   sizeof(struct seq_net_private));
 }
 
@@ -1326,6 +1328,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
  * set up the iterator to start reading from the volume list and return the first item
  */
 static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+				__acquires(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
@@ -1348,6 +1351,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
  * clean up after reading from the transports list
  */
 static void nfs_volume_list_stop(struct seq_file *p, void *v)
+				__releases(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 65ef6e00deee..dda4b8667c02 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 	return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 /*
  * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
  * @dreq - direct request possibly spanning multiple servers
@@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
 	WARN_ON_ONCE(verfp->committed < 0);
 	return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
 }
-#endif
 
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
@@ -576,7 +574,6 @@ out:
 	return result;
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_pageio_descriptor desc;
@@ -700,17 +697,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
 }
 
-#else
-static void nfs_direct_write_schedule_work(struct work_struct *work)
-{
-}
-
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
-{
-	nfs_direct_complete(dreq, true);
-}
-#endif
-
 static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_direct_req *dreq = hdr->dreq;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 524dd80d1898..6920127c5eb7 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #include "nfstrace.h"
 
@@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
 	unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned int end = offset + len;
 
+	if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
+		if (!PageUptodate(page))
+			return 1;
+		return 0;
+	}
+
 	if ((file->f_mode & FMODE_READ) &&	/* open for read? */
 	    !PageUptodate(page) &&		/* Uptodate? */
 	    !PagePrivate(page) &&		/* i/o request already? */
@@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
-	/* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not
-	 * doing this memory reclaim for a fs-related allocation.
+	/* Always try to initiate a 'commit' if relevant, but only
+	 * wait for it if __GFP_WAIT is set.  Even then, only wait 1
+	 * second and only if the 'bdi' is not congested.
+	 * Waiting indefinitely can cause deadlocks when the NFS
+	 * server is on this machine, when a new TCP connection is
+	 * needed and in other rare cases.  There is no particular
+	 * need to wait extensively here.  A short wait has the
+	 * benefit that someone else can worry about the freezer.
 	 */
-	if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL &&
-	    !(current->flags & PF_FSTRANS)) {
-		int how = FLUSH_SYNC;
-
-		/* Don't let kswapd deadlock waiting for OOM RPC calls */
-		if (current_is_kswapd())
-			how = 0;
-		nfs_commit_inode(mapping->host, how);
+	if (mapping) {
+		struct nfs_server *nfss = NFS_SERVER(mapping->host);
+		nfs_commit_inode(mapping->host, 0);
+		if ((gfp & __GFP_WAIT) &&
+		    !bdi_write_congested(&nfss->backing_dev_info)) {
+			wait_on_page_bit_killable_timeout(page, PG_private,
+							  HZ);
+			if (PagePrivate(page))
+				set_bdi_congested(&nfss->backing_dev_info,
+						  BLK_RW_ASYNC);
+		}
 	}
 	/* If PagePrivate() is set, then the page is not freeable */
 	if (PagePrivate(page))
@@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page)
 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 						sector_t *span)
 {
+	int ret;
+	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
 	*span = sis->pages;
-	return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
+
+	rcu_read_lock();
+	ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static void nfs_swap_deactivate(struct file *file)
 {
-	xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
+	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
+	rcu_read_lock();
+	xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
+	rcu_read_unlock();
 }
 #endif
 
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 90978075f730..abc5056999d6 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -265,7 +265,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
 
 	if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
-	    hdr->res.verf->committed == NFS_FILE_SYNC)
+	    hdr->res.verf->committed != NFS_DATA_SYNC)
 		return;
 
 	pnfs_set_layoutcommit(hdr);
@@ -403,6 +403,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 		return -EAGAIN;
 	}
 
+	if (data->verf.committed == NFS_UNSTABLE)
+		pnfs_commit_set_layoutcommit(data);
+
 	return 0;
 }
 
@@ -646,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 	}
 
 	/* find and reference the deviceid */
-	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
-				   NFS_SERVER(lo->plh_inode)->nfs_client, id);
-	if (d == NULL) {
-		dsaddr = filelayout_get_device_info(lo->plh_inode, id,
-				lo->plh_lc_cred, gfp_flags);
-		if (dsaddr == NULL)
-			goto out;
-	} else
-		dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
+			lo->plh_lc_cred, gfp_flags);
+	if (d == NULL)
+		goto out;
+
+	dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
 	/* Found deviceid is unavailable */
 	if (filelayout_test_devid_unavailable(&dsaddr->id_node))
-			goto out_put;
+		goto out_put;
 
 	fl->dsaddr = dsaddr;
 
@@ -1368,6 +1368,17 @@ out:
 	cinfo->ds->ncommitting = 0;
 	return PNFS_ATTEMPTED;
 }
+static struct nfs4_deviceid_node *
+filelayout_alloc_deviceid_node(struct nfs_server *server,
+		struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr;
+
+	dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
+	if (!dsaddr)
+		return NULL;
+	return &dsaddr->id_node;
+}
 
 static void
 filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
@@ -1420,6 +1431,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.commit_pagelist	= filelayout_commit_pagelist,
 	.read_pagelist		= filelayout_read_pagelist,
 	.write_pagelist		= filelayout_write_pagelist,
+	.alloc_deviceid_node	= filelayout_alloc_deviceid_node,
 	.free_deviceid_node	= filelayout_free_deveiceid_node,
 };
 
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2219ea..7c9f800c49d7 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
 u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
 struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
 					u32 ds_idx);
+
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
+	struct pnfs_device *pdev, gfp_t gfp_flags);
 extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
-		struct rpc_cred *cred, gfp_t gfp_flags);
 
 #endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 8540516f4d71..9bb806a76d99 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
 }
 
 /* Decode opaque device data and return the result */
-static struct nfs4_file_layout_dsaddr*
-decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_flags)
 {
 	int i;
 	u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	dsaddr->stripe_indices = stripe_indices;
 	stripe_indices = NULL;
 	dsaddr->ds_num = num;
-	nfs4_init_deviceid_node(&dsaddr->id_node,
-				NFS_SERVER(ino)->pnfs_curr_ld,
-				NFS_SERVER(ino)->nfs_client,
-				&pdev->dev_id);
+	nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
 
 	INIT_LIST_HEAD(&dsaddrs);
 
@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 
 		mp_count = be32_to_cpup(p); /* multipath count */
 		for (j = 0; j < mp_count; j++) {
-			da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
+			da = decode_ds_addr(server->nfs_client->cl_net,
 					    &stream, gfp_flags);
 			if (da)
 				list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
 	return NULL;
 }
 
-/*
- * Decode the opaque device specified in 'dev' and add it to the cache of
- * available devices.
- */
-static struct nfs4_file_layout_dsaddr *
-decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
-{
-	struct nfs4_deviceid_node *d;
-	struct nfs4_file_layout_dsaddr *n, *new;
-
-	new = decode_device(inode, dev, gfp_flags);
-	if (!new) {
-		printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
-			__func__);
-		return NULL;
-	}
-
-	d = nfs4_insert_deviceid_node(&new->id_node);
-	n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
-	if (n != new) {
-		nfs4_fl_free_deviceid(new);
-		return n;
-	}
-
-	return new;
-}
-
-/*
- * Retrieve the information for dev_id, add it to the list
- * of available devices, and return it.
- */
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode,
-		struct nfs4_deviceid *dev_id,
-		struct rpc_cred *cred,
-		gfp_t gfp_flags)
-{
-	struct pnfs_device *pdev = NULL;
-	u32 max_resp_sz;
-	int max_pages;
-	struct page **pages = NULL;
-	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
-	int rc, i;
-	struct nfs_server *server = NFS_SERVER(inode);
-
-	/*
-	 * Use the session max response size as the basis for setting
-	 * GETDEVICEINFO's maxcount
-	 */
-	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
-	max_pages = nfs_page_array_len(0, max_resp_sz);
-	dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
-		__func__, inode, max_resp_sz, max_pages);
-
-	pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
-	if (pdev == NULL)
-		return NULL;
-
-	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
-	if (pages == NULL) {
-		kfree(pdev);
-		return NULL;
-	}
-	for (i = 0; i < max_pages; i++) {
-		pages[i] = alloc_page(gfp_flags);
-		if (!pages[i])
-			goto out_free;
-	}
-
-	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
-	pdev->layout_type = LAYOUT_NFSV4_1_FILES;
-	pdev->pages = pages;
-	pdev->pgbase = 0;
-	pdev->pglen = max_resp_sz;
-	pdev->mincount = 0;
-	pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
-	rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
-	dprintk("%s getdevice info returns %d\n", __func__, rc);
-	if (rc)
-		goto out_free;
-
-	/*
-	 * Found new device, need to decode it and then add it to the
-	 * list of known devices for this mountpoint.
-	 */
-	dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
-out_free:
-	for (i = 0; i < max_pages; i++)
-		__free_page(pages[i]);
-	kfree(pages);
-	kfree(pdev);
-	dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
-	return dsaddr;
-}
-
 void
 nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 7cf2c4699b08..777b055063f6 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
 	struct nfs_server_key *key = buffer;
 	uint16_t len = sizeof(struct nfs_server_key);
 
+	memset(key, 0, len);
 	key->nfsversion = clp->rpc_ops->version;
 	key->family = clp->cl_addr.ss_family;
 
-	memset(key, 0, len);
-
 	switch (clp->cl_addr.ss_family) {
 	case AF_INET:
 		key->port = sin->sin_port;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 577a36f0a510..141c9f4a40de 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		attr->ia_valid &= ~ATTR_MODE;
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode))
+		BUG_ON(!S_ISREG(inode->i_mode));
+
+		if (attr->ia_size == i_size_read(inode))
 			attr->ia_valid &= ~ATTR_SIZE;
 	}
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9056622d2230..14ae6f20a172 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -218,13 +218,6 @@ static inline void nfs_fs_proc_exit(void)
 int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
 #endif
 
-/* nfs3client.c */
-#if IS_ENABLED(CONFIG_NFS_V3)
-struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
-struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
-				     struct nfs_fattr *, rpc_authflavor_t);
-#endif
-
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
 extern struct svc_version nfs4_callback_version4;
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000000..333ae4068506
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2014 Anna Schumaker.
+ *
+ * NFSv3-specific filesystem definitions and declarations
+ */
+#ifndef __LINUX_FS_NFS_NFS3_FS_H
+#define __LINUX_FS_NFS_NFS3_FS_H
+
+/*
+ * nfs3acl.c
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
+extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl);
+extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
+extern const struct xattr_handler *nfs3_xattr_handlers[];
+#else
+static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl)
+{
+	return 0;
+}
+#define nfs3_listxattr NULL
+#endif /* CONFIG_NFS_V3_ACL */
+
+/* nfs3client.c */
+struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
+struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
+				     struct nfs_fattr *, rpc_authflavor_t);
+
+
+#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 24c6898159cc..658e586ca438 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -7,6 +7,7 @@
 #include <linux/nfsacl.h>
 
 #include "internal.h"
+#include "nfs3_fs.h"
 
 #define NFSDBG_FACILITY	NFSDBG_PROC
 
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b3fc65ef39ca..8c1b437c5403 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,6 +1,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include "internal.h"
+#include "nfs3_fs.h"
 
 #ifdef CONFIG_NFS_V3_ACL
 static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 809670eba52a..524f9f837408 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,6 +22,7 @@
 
 #include "iostat.h"
 #include "internal.h"
+#include "nfs3_fs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index d6a98949af19..6af29c2da352 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -4,6 +4,7 @@
 #include <linux/module.h>
 #include <linux/nfs_fs.h>
 #include "internal.h"
+#include "nfs3_fs.h"
 #include "nfs.h"
 
 static struct nfs_subversion nfs_v3 = {
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 53e435a95260..ffdb28d86cf8 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new,
 
 	spin_lock(&nn->nfs_client_lock);
 	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+		if (pos->rpc_ops != new->rpc_ops)
+			continue;
+
+		if (pos->cl_proto != new->cl_proto)
+			continue;
+
+		if (pos->cl_minorversion != new->cl_minorversion)
+			continue;
+
 		/* If "pos" isn't marked ready, we can't trust the
 		 * remaining fields in "pos" */
 		if (pos->cl_cons_state > NFS_CS_READY) {
@@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
 		if (pos->cl_cons_state != NFS_CS_READY)
 			continue;
 
-		if (pos->rpc_ops != new->rpc_ops)
-			continue;
-
-		if (pos->cl_proto != new->cl_proto)
-			continue;
-
-		if (pos->cl_minorversion != new->cl_minorversion)
-			continue;
-
 		if (pos->cl_clientid != new->cl_clientid)
 			continue;
 
@@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new,
 
 	spin_lock(&nn->nfs_client_lock);
 	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+		if (pos->rpc_ops != new->rpc_ops)
+			continue;
+
+		if (pos->cl_proto != new->cl_proto)
+			continue;
+
+		if (pos->cl_minorversion != new->cl_minorversion)
+			continue;
+
 		/* If "pos" isn't marked ready, we can't trust the
 		 * remaining fields in "pos", especially the client
 		 * ID and serverowner fields.  Wait for CREATE_SESSION
@@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
 		if (pos->cl_cons_state != NFS_CS_READY)
 			continue;
 
-		if (pos->rpc_ops != new->rpc_ops)
-			continue;
-
-		if (pos->cl_proto != new->cl_proto)
-			continue;
-
-		if (pos->cl_minorversion != new->cl_minorversion)
-			continue;
-
 		if (!nfs4_match_clientids(pos, new))
 			continue;
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7dd8aca31c29..5aa55c132aa2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,7 +77,7 @@ struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 	kunmap_atomic(start);
 }
 
+static long nfs4_update_delay(long *timeout)
+{
+	long ret;
+	if (!timeout)
+		return NFS4_POLL_RETRY_MAX;
+	if (*timeout <= 0)
+		*timeout = NFS4_POLL_RETRY_MIN;
+	if (*timeout > NFS4_POLL_RETRY_MAX)
+		*timeout = NFS4_POLL_RETRY_MAX;
+	ret = *timeout;
+	*timeout <<= 1;
+	return ret;
+}
+
 static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 {
 	int res = 0;
 
 	might_sleep();
 
-	if (*timeout <= 0)
-		*timeout = NFS4_POLL_RETRY_MIN;
-	if (*timeout > NFS4_POLL_RETRY_MAX)
-		*timeout = NFS4_POLL_RETRY_MAX;
-	freezable_schedule_timeout_killable_unsafe(*timeout);
+	freezable_schedule_timeout_killable_unsafe(
+		nfs4_update_delay(timeout));
 	if (fatal_signal_pending(current))
 		res = -ERESTARTSYS;
-	*timeout <<= 1;
 	return res;
 }
 
@@ -1307,15 +1317,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	int ret = -EAGAIN;
 
 	for (;;) {
+		spin_lock(&state->owner->so_lock);
 		if (can_open_cached(state, fmode, open_mode)) {
-			spin_lock(&state->owner->so_lock);
-			if (can_open_cached(state, fmode, open_mode)) {
-				update_open_stateflags(state, fmode);
-				spin_unlock(&state->owner->so_lock);
-				goto out_return_state;
-			}
+			update_open_stateflags(state, fmode);
 			spin_unlock(&state->owner->so_lock);
+			goto out_return_state;
 		}
+		spin_unlock(&state->owner->so_lock);
 		rcu_read_lock();
 		delegation = rcu_dereference(nfsi->delegation);
 		if (!can_open_delegated(delegation, fmode)) {
@@ -2226,9 +2234,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	ret = _nfs4_proc_open(opendata);
 	if (ret != 0) {
 		if (ret == -ENOENT) {
-			d_drop(opendata->dentry);
-			d_add(opendata->dentry, NULL);
-			nfs_set_verifier(opendata->dentry,
+			dentry = opendata->dentry;
+			if (dentry->d_inode)
+				d_delete(dentry);
+			else if (d_unhashed(dentry))
+				d_add(dentry, NULL);
+
+			nfs_set_verifier(dentry,
 					 nfs_save_change_attribute(opendata->dir->d_inode));
 		}
 		goto out;
@@ -2585,7 +2597,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 			if (calldata->arg.fmode == 0)
 				break;
 		default:
-			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
+			if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
 				rpc_restart_call_prepare(task);
 				goto out_release;
 			}
@@ -2614,23 +2626,23 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
 	is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
 	is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
-	/* Calculate the current open share mode */
-	calldata->arg.fmode = 0;
-	if (is_rdonly || is_rdwr)
-		calldata->arg.fmode |= FMODE_READ;
-	if (is_wronly || is_rdwr)
-		calldata->arg.fmode |= FMODE_WRITE;
 	/* Calculate the change in open mode */
+	calldata->arg.fmode = 0;
 	if (state->n_rdwr == 0) {
-		if (state->n_rdonly == 0) {
-			call_close |= is_rdonly || is_rdwr;
-			calldata->arg.fmode &= ~FMODE_READ;
-		}
-		if (state->n_wronly == 0) {
-			call_close |= is_wronly || is_rdwr;
-			calldata->arg.fmode &= ~FMODE_WRITE;
-		}
-	}
+		if (state->n_rdonly == 0)
+			call_close |= is_rdonly;
+		else if (is_rdonly)
+			calldata->arg.fmode |= FMODE_READ;
+		if (state->n_wronly == 0)
+			call_close |= is_wronly;
+		else if (is_wronly)
+			calldata->arg.fmode |= FMODE_WRITE;
+	} else if (is_rdwr)
+		calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
+
+	if (calldata->arg.fmode == 0)
+		call_close |= is_rdwr;
+
 	if (!nfs4_valid_open_stateid(state))
 		call_close = 0;
 	spin_unlock(&state->owner->so_lock);
@@ -3213,7 +3225,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	struct nfs4_label *label = NULL;
 	int status;
 
-	if (pnfs_ld_layoutret_on_setattr(inode))
+	if (pnfs_ld_layoutret_on_setattr(inode) &&
+	    sattr->ia_valid & ATTR_SIZE &&
+	    sattr->ia_size < i_size_read(inode))
 		pnfs_commit_and_return_layout(inode);
 
 	nfs_fattr_init(fattr);
@@ -3572,7 +3586,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 
 	if (!nfs4_sequence_done(task, &res->seq_res))
 		return 0;
-	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+	if (nfs4_async_handle_error(task, res->server, NULL,
+				    &data->timeout) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
 	return 1;
@@ -3605,7 +3620,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 
 	if (!nfs4_sequence_done(task, &res->seq_res))
 		return 0;
-	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+	if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
 		return 0;
 
 	update_changeattr(old_dir, &res->old_cinfo);
@@ -4109,7 +4124,8 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
 
 	trace_nfs4_read(hdr, task->tk_status);
 	if (nfs4_async_handle_error(task, server,
-				    hdr->args.context->state) == -EAGAIN) {
+				    hdr->args.context->state,
+				    NULL) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -4177,10 +4193,11 @@ static int nfs4_write_done_cb(struct rpc_task *task,
 			      struct nfs_pgio_header *hdr)
 {
 	struct inode *inode = hdr->inode;
-	
+
 	trace_nfs4_write(hdr, task->tk_status);
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
-				    hdr->args.context->state) == -EAGAIN) {
+				    hdr->args.context->state,
+				    NULL) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -4260,7 +4277,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
 	struct inode *inode = data->inode;
 
 	trace_nfs4_commit(data, task->tk_status);
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
+	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+				    NULL, NULL) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -4813,7 +4831,8 @@ out:
 
 
 static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
+			struct nfs4_state *state, long *timeout)
 {
 	struct nfs_client *clp = server->nfs_client;
 
@@ -4863,6 +4882,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
+			rpc_delay(task, nfs4_update_delay(timeout));
+			goto restart_call;
 		case -NFS4ERR_GRACE:
 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -5103,8 +5124,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 			pnfs_roc_set_barrier(data->inode, data->roc_barrier);
 		break;
 	default:
-		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
-				-EAGAIN) {
+		if (nfs4_async_handle_error(task, data->res.server,
+					    NULL, NULL) == -EAGAIN) {
 			rpc_restart_call_prepare(task);
 			return;
 		}
@@ -5368,7 +5389,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 		case -NFS4ERR_EXPIRED:
 			break;
 		default:
-			if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
+			if (nfs4_async_handle_error(task, calldata->server,
+						    NULL, NULL) == -EAGAIN)
 				rpc_restart_call_prepare(task);
 	}
 	nfs_release_seqid(calldata->arg.seqid);
@@ -5974,7 +5996,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
 		break;
 	case -NFS4ERR_LEASE_MOVED:
 	case -NFS4ERR_DELAY:
-		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
+		if (nfs4_async_handle_error(task, server,
+					    NULL, NULL) == -EAGAIN)
 			rpc_restart_call_prepare(task);
 	}
 }
@@ -7349,7 +7372,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
 	int ret = 0;
 
 	if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
-		return 0;
+		return -EAGAIN;
 	task = _nfs41_proc_sequence(clp, cred, false);
 	if (IS_ERR(task))
 		ret = PTR_ERR(task);
@@ -7579,14 +7602,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 		} else {
 			LIST_HEAD(head);
 
+			/*
+			 * Mark the bad layout state as invalid, then retry
+			 * with the current stateid.
+			 */
 			pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
 			spin_unlock(&inode->i_lock);
-			/* Mark the bad layout state as invalid, then
-			 * retry using the open stateid. */
 			pnfs_free_lseg_list(&head);
+	
+			task->tk_status = 0;
+			rpc_restart_call_prepare(task);
 		}
 	}
-	if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+	if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
 		rpc_restart_call_prepare(task);
 out:
 	dprintk("<-- %s\n", __func__);
@@ -7746,7 +7774,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 	case 0:
 		break;
 	case -NFS4ERR_DELAY:
-		if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN)
+		if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
 			break;
 		rpc_restart_call_prepare(task);
 		return;
@@ -7805,54 +7833,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
 	return status;
 }
 
-/*
- * Retrieve the list of Data Server devices from the MDS.
- */
-static int _nfs4_getdevicelist(struct nfs_server *server,
-				    const struct nfs_fh *fh,
-				    struct pnfs_devicelist *devlist)
-{
-	struct nfs4_getdevicelist_args args = {
-		.fh = fh,
-		.layoutclass = server->pnfs_curr_ld->id,
-	};
-	struct nfs4_getdevicelist_res res = {
-		.devlist = devlist,
-	};
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
-		.rpc_argp = &args,
-		.rpc_resp = &res,
-	};
-	int status;
-
-	dprintk("--> %s\n", __func__);
-	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
-				&res.seq_res, 0);
-	dprintk("<-- %s status=%d\n", __func__, status);
-	return status;
-}
-
-int nfs4_proc_getdevicelist(struct nfs_server *server,
-			    const struct nfs_fh *fh,
-			    struct pnfs_devicelist *devlist)
-{
-	struct nfs4_exception exception = { };
-	int err;
-
-	do {
-		err = nfs4_handle_exception(server,
-				_nfs4_getdevicelist(server, fh, devlist),
-				&exception);
-	} while (exception.retry);
-
-	dprintk("%s: err=%d, num_devs=%u\n", __func__,
-		err, devlist->num_devs);
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
-
 static int
 _nfs4_proc_getdeviceinfo(struct nfs_server *server,
 		struct pnfs_device *pdev,
@@ -7925,7 +7905,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
 	case 0:
 		break;
 	default:
-		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+		if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
 			rpc_restart_call_prepare(task);
 			return;
 		}
@@ -8221,7 +8201,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
 
 	switch (task->tk_status) {
 	case -NFS4ERR_DELAY:
-		if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN)
+		if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
 			rpc_restart_call_prepare(task);
 	}
 }
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1720d32ffa54..e1ba58c3d1ad 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work)
 			}
 			nfs_expire_all_delegations(clp);
 		} else {
+			int ret;
+
 			/* Queue an asynchronous RENEW. */
-			ops->sched_state_renewal(clp, cred, renew_flags);
+			ret = ops->sched_state_renewal(clp, cred, renew_flags);
 			put_rpccred(cred);
-			goto out_exp;
+			switch (ret) {
+			default:
+				goto out_exp;
+			case -EAGAIN:
+			case -ENOMEM:
+				break;
+			}
 		}
 	} else {
 		dprintk("%s: failed to call renewd. Reason: lease not expired \n",
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 22fe35104c0c..5194933ed419 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1705,7 +1705,8 @@ restart:
 			if (status < 0) {
 				set_bit(ops->owner_flag_bit, &sp->so_flags);
 				nfs4_put_state_owner(sp);
-				return nfs4_recovery_handle_error(clp, status);
+				status = nfs4_recovery_handle_error(clp, status);
+				return (status != 0) ? status : -EAGAIN;
 			}
 
 			nfs4_put_state_owner(sp);
@@ -1714,7 +1715,7 @@ restart:
 		spin_unlock(&clp->cl_lock);
 	}
 	rcu_read_unlock();
-	return status;
+	return 0;
 }
 
 static int nfs4_check_lease(struct nfs_client *clp)
@@ -1761,7 +1762,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
 		break;
 	case -NFS4ERR_STALE_CLIENTID:
 		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
-		nfs4_state_clear_reclaim_reboot(clp);
 		nfs4_state_start_reclaim_reboot(clp);
 		break;
 	case -NFS4ERR_CLID_INUSE:
@@ -2345,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			status = nfs4_check_lease(clp);
 			if (status < 0)
 				goto out_error;
+			continue;
 		}
 
 		if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
@@ -2366,14 +2367,11 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			section = "reclaim reboot";
 			status = nfs4_do_reclaim(clp,
 				clp->cl_mvops->reboot_recovery_ops);
-			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
-			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
-				continue;
-			nfs4_state_end_reclaim_reboot(clp);
-			if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+			if (status == -EAGAIN)
 				continue;
 			if (status < 0)
 				goto out_error;
+			nfs4_state_end_reclaim_reboot(clp);
 		}
 
 		/* Now recover expired state... */
@@ -2381,9 +2379,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			section = "reclaim nograce";
 			status = nfs4_do_reclaim(clp,
 				clp->cl_mvops->nograce_recovery_ops);
-			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
-			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
-			    test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+			if (status == -EAGAIN)
 				continue;
 			if (status < 0)
 				goto out_error;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e13b59d8d9aa..005d03c5d274 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int);
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
-#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
-				encode_verifier_maxsz)
-#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
-				2 /* nfs_cookie4 gdlr_cookie */ + \
-				decode_verifier_maxsz \
-				  /* verifier4 gdlr_verifier */ + \
-				1 /* gdlr_deviceid_list count */ + \
-				XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
-					    NFS4_DEVICEID4_SIZE) \
-				  /* gdlr_deviceid_list */ + \
-				1 /* bool gdlr_eof */)
-#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
-				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
+				XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+				1 /* layout type */ + \
+				1 /* maxcount */ + \
+				1 /* bitmap size */ + \
+				1 /* notification bitmap length */ + \
+				1 /* notification bitmap, word 0 */)
 #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
 				1 /* layout type */ + \
 				1 /* opaque devaddr4 length */ + \
 				  /* devaddr4 payload is read into page */ \
 				1 /* notification bitmap length */ + \
-				1 /* notification bitmap */)
+				1 /* notification bitmap, word 0 */)
 #define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
 				encode_stateid_maxsz)
 #define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
@@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int);
 				2 /* last byte written */ + \
 				1 /* nt_timechanged (false) */ + \
 				1 /* layoutupdate4 layout type */ + \
-				1 /* NULL filelayout layoutupdate4 payload */)
+				1 /* layoutupdate4 opaqueue len */)
+				  /* the actual content of layoutupdate4 should
+				     be allocated by drivers and spliced in
+				     using xdr_write_pages */
 #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
 #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
 				encode_stateid_maxsz + \
@@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
 					 decode_sequence_maxsz + \
 					 decode_reclaim_complete_maxsz)
-#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
-				encode_sequence_maxsz + \
-				encode_putfh_maxsz + \
-				encode_getdevicelist_maxsz)
-#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
-				decode_sequence_maxsz + \
-				decode_putfh_maxsz + \
-				decode_getdevicelist_maxsz)
 #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
 				encode_sequence_maxsz +\
 				encode_getdeviceinfo_maxsz)
@@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr,
 
 #ifdef CONFIG_NFS_V4_1
 static void
-encode_getdevicelist(struct xdr_stream *xdr,
-		     const struct nfs4_getdevicelist_args *args,
-		     struct compound_hdr *hdr)
-{
-	__be32 *p;
-	nfs4_verifier dummy = {
-		.data = "dummmmmy",
-	};
-
-	encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
-	p = reserve_space(xdr, 16);
-	*p++ = cpu_to_be32(args->layoutclass);
-	*p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
-	xdr_encode_hyper(p, 0ULL);                          /* cookie */
-	encode_nfs4_verifier(xdr, &dummy);
-}
-
-static void
 encode_getdeviceinfo(struct xdr_stream *xdr,
 		     const struct nfs4_getdeviceinfo_args *args,
 		     struct compound_hdr *hdr)
@@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
 	__be32 *p;
 
 	encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
-	p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);
+	p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
 	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
 				    NFS4_DEVICEID4_SIZE);
 	*p++ = cpu_to_be32(args->pdev->layout_type);
 	*p++ = cpu_to_be32(args->pdev->maxcount);	/* gdia_maxcount */
-	*p++ = cpu_to_be32(0);				/* bitmap length 0 */
+
+	p = reserve_space(xdr, 4 + 4);
+	*p++ = cpu_to_be32(1);			/* bitmap length */
+	*p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
 }
 
 static void
@@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr,
 static int
 encode_layoutcommit(struct xdr_stream *xdr,
 		    struct inode *inode,
-		    const struct nfs4_layoutcommit_args *args,
+		    struct nfs4_layoutcommit_args *args,
 		    struct compound_hdr *hdr)
 {
 	__be32 *p;
@@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
 	*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
 
-	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
 		NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
 			NFS_I(inode)->layout, xdr, args);
-	else
-		encode_uint32(xdr, 0); /* no layout-type payload */
+	} else {
+		encode_uint32(xdr, args->layoutupdate_len);
+		if (args->layoutupdate_pages) {
+			xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+					args->layoutupdate_len);
+		}
+	}
 
 	return 0;
 }
@@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
 }
 
 /*
- * Encode GETDEVICELIST request
- */
-static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
-				       struct xdr_stream *xdr,
-				       struct nfs4_getdevicelist_args *args)
-{
-	struct compound_hdr hdr = {
-		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
-	};
-
-	encode_compound_hdr(xdr, req, &hdr);
-	encode_sequence(xdr, &args->seq_args, &hdr);
-	encode_putfh(xdr, args->fh, &hdr);
-	encode_getdevicelist(xdr, args, &hdr);
-	encode_nops(&hdr);
-}
-
-/*
  * Encode GETDEVICEINFO request
  */
 static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -5765,54 +5726,6 @@ out_overflow:
 }
 
 #if defined(CONFIG_NFS_V4_1)
-/*
- * TODO: Need to handle case when EOF != true;
- */
-static int decode_getdevicelist(struct xdr_stream *xdr,
-				struct pnfs_devicelist *res)
-{
-	__be32 *p;
-	int status, i;
-	nfs4_verifier verftemp;
-
-	status = decode_op_hdr(xdr, OP_GETDEVICELIST);
-	if (status)
-		return status;
-
-	p = xdr_inline_decode(xdr, 8 + 8 + 4);
-	if (unlikely(!p))
-		goto out_overflow;
-
-	/* TODO: Skip cookie for now */
-	p += 2;
-
-	/* Read verifier */
-	p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE);
-
-	res->num_devs = be32_to_cpup(p);
-
-	dprintk("%s: num_dev %d\n", __func__, res->num_devs);
-
-	if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
-		printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
-				__func__, res->num_devs);
-		return -EIO;
-	}
-
-	p = xdr_inline_decode(xdr,
-			      res->num_devs * NFS4_DEVICEID4_SIZE + 4);
-	if (unlikely(!p))
-		goto out_overflow;
-	for (i = 0; i < res->num_devs; i++)
-		p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
-					    NFS4_DEVICEID4_SIZE);
-	res->eof = be32_to_cpup(p);
-	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
-}
-
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
 				struct pnfs_device *pdev)
 {
@@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 		p = xdr_inline_decode(xdr, 4 * len);
 		if (unlikely(!p))
 			goto out_overflow;
-		for (i = 0; i < len; i++, p++) {
-			if (be32_to_cpup(p)) {
-				dprintk("%s: notifications not supported\n",
+
+		if (be32_to_cpup(p++) &
+		    ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
+			dprintk("%s: unsupported notification\n",
+				__func__);
+		}
+
+		for (i = 1; i < len; i++) {
+			if (be32_to_cpup(p++)) {
+				dprintk("%s: unsupported notification\n",
 					__func__);
 				return -EIO;
 			}
@@ -7097,32 +7017,6 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
 }
 
 /*
- * Decode GETDEVICELIST response
- */
-static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
-				      struct xdr_stream *xdr,
-				      struct nfs4_getdevicelist_res *res)
-{
-	struct compound_hdr hdr;
-	int status;
-
-	dprintk("encoding getdevicelist!\n");
-
-	status = decode_compound_hdr(xdr, &hdr);
-	if (status != 0)
-		goto out;
-	status = decode_sequence(xdr, &res->seq_res, rqstp);
-	if (status != 0)
-		goto out;
-	status = decode_putfh(xdr);
-	if (status != 0)
-		goto out;
-	status = decode_getdevicelist(xdr, res->devlist);
-out:
-	return status;
-}
-
-/*
  * Decode GETDEVINFO response
  */
 static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -7490,7 +7384,6 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(SECINFO_NO_NAME,	enc_secinfo_no_name,	dec_secinfo_no_name),
 	PROC(TEST_STATEID,	enc_test_stateid,	dec_test_stateid),
 	PROC(FREE_STATEID,	enc_free_stateid,	dec_free_stateid),
-	PROC(GETDEVICELIST,	enc_getdevicelist,	dec_getdevicelist),
 	PROC(BIND_CONN_TO_SESSION,
 			enc_bind_conn_to_session, dec_bind_conn_to_session),
 	PROC(DESTROY_CLIENTID,	enc_destroy_clientid,	dec_destroy_clientid),
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ae05278b3761..c6e4bda63000 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
 	kfree(de);
 }
 
-static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
-	const struct nfs4_deviceid *d_id)
-{
-	struct nfs4_deviceid_node *d;
-	struct objio_dev_ent *de;
-
-	d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
-	if (!d)
-		return NULL;
-
-	de = container_of(d, struct objio_dev_ent, id_node);
-	return de;
-}
-
-static struct objio_dev_ent *
-_dev_list_add(const struct nfs_server *nfss,
-	const struct nfs4_deviceid *d_id, struct osd_dev *od,
-	gfp_t gfp_flags)
-{
-	struct nfs4_deviceid_node *d;
-	struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
-	struct objio_dev_ent *n;
-
-	if (!de) {
-		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
-		return NULL;
-	}
-
-	dprintk("%s: Adding od=%p\n", __func__, od);
-	nfs4_init_deviceid_node(&de->id_node,
-				nfss->pnfs_curr_ld,
-				nfss->nfs_client,
-				d_id);
-	de->od.od = od;
-
-	d = nfs4_insert_deviceid_node(&de->id_node);
-	n = container_of(d, struct objio_dev_ent, id_node);
-	if (n != de) {
-		dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
-		objio_free_deviceid_node(&de->id_node);
-		de = n;
-	}
-
-	return de;
-}
-
 struct objio_segment {
 	struct pnfs_layout_segment lseg;
 
@@ -130,29 +84,24 @@ struct objio_state {
 
 /* Send and wait for a get_device_info of devices in the layout,
    then look them up with the osd_initiator library */
-static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
-	struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
-	gfp_t gfp_flags)
+struct nfs4_deviceid_node *
+objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+			gfp_t gfp_flags)
 {
 	struct pnfs_osd_deviceaddr *deviceaddr;
-	struct objio_dev_ent *ode;
+	struct objio_dev_ent *ode = NULL;
 	struct osd_dev *od;
 	struct osd_dev_info odi;
 	bool retry_flag = true;
+	__be32 *p;
 	int err;
 
-	ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
-	if (ode) {
-		objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
-		return 0;
-	}
+	deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
+	if (!deviceaddr)
+		return NULL;
 
-	err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
-	if (unlikely(err)) {
-		dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
-			__func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
-		return err;
-	}
+	p = page_address(pdev->pages[0]);
+	pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
 
 	odi.systemid_len = deviceaddr->oda_systemid.len;
 	if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
 		goto out;
 	}
 
-	ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
-			    gfp_flags);
-	objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
 	dprintk("Adding new dev_id(%llx:%llx)\n",
-		_DEVID_LO(d_id), _DEVID_HI(d_id));
+		_DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
+
+	ode = kzalloc(sizeof(*ode), gfp_flags);
+	if (!ode) {
+		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+		goto out;
+	}
+
+	nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
+	kfree(deviceaddr);
+
+	ode->od.od = od;
+	return &ode->id_node;
+
 out:
-	objlayout_put_deviceinfo(deviceaddr);
-	return err;
+	kfree(deviceaddr);
+	return NULL;
 }
 
 static void copy_single_comp(struct ore_components *oc, unsigned c,
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 	struct xdr_stream *xdr,
 	gfp_t gfp_flags)
 {
+	struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
 	struct objio_segment *objio_seg;
 	struct pnfs_osd_xdr_decode_layout_iter iter;
 	struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 	objio_seg->oc.first_dev = layout.olo_comps_index;
 	cur_comp = 0;
 	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+		struct nfs4_deviceid_node *d;
+		struct objio_dev_ent *ode;
+
 		copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
-		err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
-					   &src_comp.oc_object_id.oid_device_id,
-					   gfp_flags);
-		if (err)
+
+		d = nfs4_find_get_deviceid(server,
+				&src_comp.oc_object_id.oid_device_id,
+				pnfslay->plh_lc_cred, gfp_flags);
+		if (!d) {
+			err = -ENXIO;
 			goto err;
-		++cur_comp;
+		}
+
+		ode = container_of(d, struct objio_dev_ent, id_node);
+		objio_seg->oc.ods[cur_comp++] = &ode->od;
 	}
 	/* pnfs_osd_xdr_decode_layout_comp returns false on error */
 	if (unlikely(err))
@@ -653,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 	.flags                   = PNFS_LAYOUTRET_ON_SETATTR |
 				   PNFS_LAYOUTRET_ON_ERROR,
 
+	.max_deviceinfo_size	 = PAGE_SIZE,
 	.owner		       	 = THIS_MODULE,
 	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
 	.free_layout_hdr         = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 697a16d11fac..c89357c7a914 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -574,76 +574,6 @@ loop_done:
 	dprintk("%s: Return\n", __func__);
 }
 
-
-/*
- * Get Device Info API for io engines
- */
-struct objlayout_deviceinfo {
-	struct page *page;
-	struct pnfs_osd_deviceaddr da; /* This must be last */
-};
-
-/* Initialize and call nfs_getdeviceinfo, then decode and return a
- * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
- * should be called.
- */
-int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
-	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
-	gfp_t gfp_flags)
-{
-	struct objlayout_deviceinfo *odi;
-	struct pnfs_device pd;
-	struct page *page, **pages;
-	u32 *p;
-	int err;
-
-	page = alloc_page(gfp_flags);
-	if (!page)
-		return -ENOMEM;
-
-	pages = &page;
-	pd.pages = pages;
-
-	memcpy(&pd.dev_id, d_id, sizeof(*d_id));
-	pd.layout_type = LAYOUT_OSD2_OBJECTS;
-	pd.pages = &page;
-	pd.pgbase = 0;
-	pd.pglen = PAGE_SIZE;
-	pd.mincount = 0;
-	pd.maxcount = PAGE_SIZE;
-
-	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
-			pnfslay->plh_lc_cred);
-	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
-	if (err)
-		goto err_out;
-
-	p = page_address(page);
-	odi = kzalloc(sizeof(*odi), gfp_flags);
-	if (!odi) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
-	odi->page = page;
-	*deviceaddr = &odi->da;
-	return 0;
-
-err_out:
-	__free_page(page);
-	return err;
-}
-
-void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
-{
-	struct objlayout_deviceinfo *odi = container_of(deviceaddr,
-						struct objlayout_deviceinfo,
-						da);
-
-	__free_page(odi->page);
-	kfree(odi);
-}
-
 enum {
 	OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
 	OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index fd13f1d2f136..3a0828d57339 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
 extern void objlayout_write_done(struct objlayout_io_res *oir,
 				 ssize_t status, bool sync);
 
-extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
-	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
-	gfp_t gfp_flags);
-extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
-
 /*
  * exported generic objects function vectors
  */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index be7cbce6e4c7..94e16ec88312 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -481,6 +481,14 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 		return 0;
 	}
 
+	/*
+	 * Limit the request size so that we can still allocate a page array
+	 * for it without upsetting the slab allocator.
+	 */
+	if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
+			sizeof(struct page) > PAGE_SIZE)
+		return 0;
+
 	return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
 }
 EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3851debf8a2..76de7f568119 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -594,6 +594,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
 		dprintk("%s freeing layout for inode %lu\n", __func__,
 			lo->plh_inode->i_ino);
 		inode = lo->plh_inode;
+
+		pnfs_layoutcommit_inode(inode, false);
+
 		spin_lock(&inode->i_lock);
 		list_del_init(&lo->plh_bulk_destroy);
 		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
@@ -682,17 +685,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
 	return (s32)(s1 - s2) > 0;
 }
 
-static void
-pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
-		const nfs4_stateid *new,
-		struct list_head *free_me_list)
-{
-	if (nfs4_stateid_match_other(&lo->plh_stateid, new))
-		return;
-	/* Layout is new! Kill existing layout segments */
-	pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
-}
-
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -749,7 +741,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 		status = -EAGAIN;
 	} else if (!nfs4_valid_open_stateid(open_state)) {
 		status = -EBADF;
-	} else if (list_empty(&lo->plh_segs)) {
+	} else if (list_empty(&lo->plh_segs) ||
+		   test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
 		int seq;
 
 		do {
@@ -864,6 +857,16 @@ _pnfs_return_layout(struct inode *ino)
 	empty = list_empty(&lo->plh_segs);
 	pnfs_clear_layoutcommit(ino, &tmp_list);
 	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+
+	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+		struct pnfs_layout_range range = {
+			.iomode		= IOMODE_ANY,
+			.offset		= 0,
+			.length		= NFS4_MAX_UINT64,
+		};
+		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
+	}
+
 	/* Don't send a LAYOUTRETURN if list was initially empty */
 	if (empty) {
 		spin_unlock(&ino->i_lock);
@@ -871,6 +874,8 @@ _pnfs_return_layout(struct inode *ino)
 		dprintk("NFS: %s no layout segments to return\n", __func__);
 		goto out;
 	}
+
+	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 	lo->plh_block_lgets++;
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
@@ -1358,25 +1363,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		goto out;
 	}
 
+	init_lseg(lo, lseg);
+	lseg->pls_range = res->range;
+
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
 		dprintk("%s forget reply due to recall\n", __func__);
 		goto out_forget_reply;
 	}
 
-	if (pnfs_layoutgets_blocked(lo, 1) ||
-	    pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+	if (pnfs_layoutgets_blocked(lo, 1)) {
 		dprintk("%s forget reply due to state\n", __func__);
 		goto out_forget_reply;
 	}
 
-	/* Check that the new stateid matches the old stateid */
-	pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
-	/* Done processing layoutget. Set the layout stateid */
-	pnfs_set_layout_stateid(lo, &res->stateid, false);
+	if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+		/* existing state ID, make sure the sequence number matches. */
+		if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+			dprintk("%s forget reply due to sequence\n", __func__);
+			goto out_forget_reply;
+		}
+		pnfs_set_layout_stateid(lo, &res->stateid, false);
+	} else {
+		/*
+		 * We got an entirely new state ID.  Mark all segments for the
+		 * inode invalid, and don't bother validating the stateid
+		 * sequence number.
+		 */
+		pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+
+		nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
+		lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
+	}
+
+	clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 
-	init_lseg(lo, lseg);
-	lseg->pls_range = res->range;
 	pnfs_get_lseg(lseg);
 	pnfs_layout_insert_lseg(lo, lseg);
 
@@ -1797,6 +1818,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
 }
 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
 
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
+{
+	struct inode *inode = data->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	bool mark_as_dirty = false;
+
+	spin_lock(&inode->i_lock);
+	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+		mark_as_dirty = true;
+		dprintk("%s: Set layoutcommit for inode %lu ",
+			__func__, inode->i_ino);
+	}
+	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
+		/* references matched in nfs4_layoutcommit_release */
+		pnfs_get_lseg(data->lseg);
+	}
+	if (data->lwb > nfsi->layout->plh_lwb)
+		nfsi->layout->plh_lwb = data->lwb;
+	spin_unlock(&inode->i_lock);
+	dprintk("%s: lseg %p end_pos %llu\n",
+		__func__, data->lseg, nfsi->layout->plh_lwb);
+
+	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+	if (mark_as_dirty)
+		mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
+
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 {
 	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
@@ -1817,6 +1867,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 int
 pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 	struct nfs4_layoutcommit_data *data;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t end_pos;
@@ -1867,6 +1918,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	data->args.lastbytewritten = end_pos - 1;
 	data->res.server = NFS_SERVER(inode);
 
+	if (ld->prepare_layoutcommit) {
+		status = ld->prepare_layoutcommit(&data->args);
+		if (status) {
+			spin_lock(&inode->i_lock);
+			if (end_pos < nfsi->layout->plh_lwb)
+				nfsi->layout->plh_lwb = end_pos;
+			spin_unlock(&inode->i_lock);
+			put_rpccred(data->cred);
+			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+			goto clear_layoutcommitting;
+		}
+	}
+
+
 	status = nfs4_proc_layoutcommit(data, sync);
 out:
 	if (status)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index aca3dff5dae6..693ce42ec683 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,12 +65,15 @@ enum {
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
 	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
 	NFS_LAYOUT_RETURN,		/* Return this layout ASAP */
+	NFS_LAYOUT_INVALID_STID,	/* layout stateid id is invalid */
 };
 
 enum layoutdriver_policy_flags {
-	/* Should the pNFS client commit and return the layout upon a setattr */
+	/* Should the pNFS client commit and return the layout upon truncate to
+	 * a smaller size */
 	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 0,
 	PNFS_LAYOUTRET_ON_ERROR		= 1 << 1,
+	PNFS_READ_WHOLE_PAGE		= 1 << 2,
 };
 
 struct nfs4_deviceid_node;
@@ -82,6 +85,7 @@ struct pnfs_layoutdriver_type {
 	const char *name;
 	struct module *owner;
 	unsigned flags;
+	unsigned max_deviceinfo_size;
 
 	int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
 	int (*clear_layoutdriver) (struct nfs_server *);
@@ -92,6 +96,9 @@ struct pnfs_layoutdriver_type {
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 
+	void (*return_range) (struct pnfs_layout_hdr *lo,
+			      struct pnfs_layout_range *range);
+
 	/* test for nfs page cache coalescing */
 	const struct nfs_pageio_ops *pg_read_ops;
 	const struct nfs_pageio_ops *pg_write_ops;
@@ -121,14 +128,17 @@ struct pnfs_layoutdriver_type {
 	enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
 
 	void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+	struct nfs4_deviceid_node * (*alloc_deviceid_node)
+			(struct nfs_server *server, struct pnfs_device *pdev,
+			gfp_t gfp_flags);
 
 	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
 				     struct xdr_stream *xdr,
 				     const struct nfs4_layoutreturn_args *args);
 
 	void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
-
-	void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+	int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
+	void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
 				     struct xdr_stream *xdr,
 				     const struct nfs4_layoutcommit_args *args);
 };
@@ -171,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 
 /* nfs4proc.c */
-extern int nfs4_proc_getdevicelist(struct nfs_server *server,
-				   const struct nfs_fh *fh,
-				   struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 				   struct pnfs_device *dev,
 				   struct rpc_cred *cred);
@@ -219,6 +226,7 @@ void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
 void pnfs_set_layoutcommit(struct nfs_pgio_header *);
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
@@ -255,11 +263,12 @@ struct nfs4_deviceid_node {
 	atomic_t			ref;
 };
 
-struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+		const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		gfp_t gfp_mask);
 void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
-void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
-			     const struct pnfs_layoutdriver_type *,
-			     const struct nfs_client *,
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
 			     const struct nfs4_deviceid *);
 struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
 bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
@@ -267,6 +276,13 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
 bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
 void nfs4_deviceid_purge_client(const struct nfs_client *);
 
+static inline struct nfs4_deviceid_node *
+nfs4_get_deviceid(struct nfs4_deviceid_node *d)
+{
+	atomic_inc(&d->ref);
+	return d;
+}
+
 static inline struct pnfs_layout_segment *
 pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {
@@ -368,6 +384,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
 }
 
 static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+	if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+		return false;
+	return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
+}
+
+static inline bool
 pnfs_layoutcommit_outstanding(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -443,6 +467,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
 }
 
 static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+	return false;
+}
+
+static inline bool
 pnfs_roc(struct inode *ino)
 {
 	return false;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209bd9408..aa2ec0015183 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
  */
 
 #include <linux/export.h>
+#include <linux/nfs_fs.h>
+#include "nfs4session.h"
+#include "internal.h"
 #include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS
@@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
 	return NULL;
 }
 
+static struct nfs4_deviceid_node *
+nfs4_get_device_info(struct nfs_server *server,
+		const struct nfs4_deviceid *dev_id,
+		struct rpc_cred *cred, gfp_t gfp_flags)
+{
+	struct nfs4_deviceid_node *d = NULL;
+	struct pnfs_device *pdev = NULL;
+	struct page **pages = NULL;
+	u32 max_resp_sz;
+	int max_pages;
+	int rc, i;
+
+	/*
+	 * Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	if (server->pnfs_curr_ld->max_deviceinfo_size &&
+	    server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
+		max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
+	max_pages = nfs_page_array_len(0, max_resp_sz);
+	dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
+		__func__, server, max_resp_sz, max_pages);
+
+	pdev = kzalloc(sizeof(*pdev), gfp_flags);
+	if (!pdev)
+		return NULL;
+
+	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+	if (!pages)
+		goto out_free_pdev;
+
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(gfp_flags);
+		if (!pages[i])
+			goto out_free_pages;
+	}
+
+	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+	pdev->layout_type = server->pnfs_curr_ld->id;
+	pdev->pages = pages;
+	pdev->pgbase = 0;
+	pdev->pglen = max_resp_sz;
+	pdev->mincount = 0;
+	pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+
+	rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc)
+		goto out_free_pages;
+
+	/*
+	 * Found new device, need to decode it and then add it to the
+	 * list of known devices for this mountpoint.
+	 */
+	d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
+			gfp_flags);
+
+out_free_pages:
+	for (i = 0; i < max_pages; i++)
+		__free_page(pages[i]);
+	kfree(pages);
+out_free_pdev:
+	kfree(pdev);
+	dprintk("<-- %s d %p\n", __func__, d);
+	return d;
+}
+
 /*
  * Lookup a deviceid in cache and get a reference count on it if found
  *
@@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
  * @id deviceid to look up
  */
 static struct nfs4_deviceid_node *
-_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
-		   const struct nfs_client *clp, const struct nfs4_deviceid *id,
-		   long hash)
+__nfs4_find_get_deviceid(struct nfs_server *server,
+		const struct nfs4_deviceid *id, long hash)
 {
 	struct nfs4_deviceid_node *d;
 
 	rcu_read_lock();
-	d = _lookup_deviceid(ld, clp, id, hash);
+	d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
+			hash);
 	if (d != NULL)
 		atomic_inc(&d->ref);
 	rcu_read_unlock();
@@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
 }
 
 struct nfs4_deviceid_node *
-nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
-		       const struct nfs_client *clp, const struct nfs4_deviceid *id)
+nfs4_find_get_deviceid(struct nfs_server *server,
+		const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		gfp_t gfp_mask)
 {
-	return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+	long hash = nfs4_deviceid_hash(id);
+	struct nfs4_deviceid_node *d, *new;
+
+	d = __nfs4_find_get_deviceid(server, id, hash);
+	if (d)
+		return d;
+
+	new = nfs4_get_device_info(server, id, cred, gfp_mask);
+	if (!new)
+		return new;
+
+	spin_lock(&nfs4_deviceid_lock);
+	d = __nfs4_find_get_deviceid(server, id, hash);
+	if (d) {
+		spin_unlock(&nfs4_deviceid_lock);
+		server->pnfs_curr_ld->free_deviceid_node(new);
+		return d;
+	}
+	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+	atomic_inc(&new->ref);
+	spin_unlock(&nfs4_deviceid_lock);
+
+	return new;
 }
 EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
 
@@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
 EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
 
 void
-nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
-			const struct pnfs_layoutdriver_type *ld,
-			const struct nfs_client *nfs_client,
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
 			const struct nfs4_deviceid *id)
 {
 	INIT_HLIST_NODE(&d->node);
 	INIT_HLIST_NODE(&d->tmpnode);
-	d->ld = ld;
-	d->nfs_client = nfs_client;
+	d->ld = server->pnfs_curr_ld;
+	d->nfs_client = server->nfs_client;
 	d->flags = 0;
 	d->deviceid = *id;
 	atomic_set(&d->ref, 1);
@@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
 EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
 
 /*
- * Uniquely initialize and insert a deviceid node into cache
- *
- * @new new deviceid node
- *      Note that the caller must set up the following members:
- *        new->ld
- *        new->nfs_client
- *        new->deviceid
- *
- * @ret the inserted node, if none found, otherwise, the found entry.
- */
-struct nfs4_deviceid_node *
-nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
-{
-	struct nfs4_deviceid_node *d;
-	long hash;
-
-	spin_lock(&nfs4_deviceid_lock);
-	hash = nfs4_deviceid_hash(&new->deviceid);
-	d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
-	if (d) {
-		spin_unlock(&nfs4_deviceid_lock);
-		return d;
-	}
-
-	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
-	spin_unlock(&nfs4_deviceid_lock);
-	atomic_inc(&new->ref);
-
-	return new;
-}
-EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
-
-/*
  * Dereference a deviceid node and delete it when its reference count drops
  * to zero.
  *
@@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
 	}
 	rcu_read_unlock();
 }
-
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e4499d5b51e8..31a11b0e885d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2065,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options,
 		return NFS_TEXT_DATA;
 	}
 
-#if !IS_ENABLED(CONFIG_NFS_V3)
-	if (args->version == 3)
-		goto out_v3_not_compiled;
-#endif /* !CONFIG_NFS_V3 */
-
 	return 0;
 
 out_no_data:
@@ -2085,12 +2080,6 @@ out_no_sec:
 	dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
 	return -EINVAL;
 
-#if !IS_ENABLED(CONFIG_NFS_V3)
-out_v3_not_compiled:
-	dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
-	return -EPROTONOSUPPORT;
-#endif /* !CONFIG_NFS_V3 */
-
 out_nomem:
 	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
 	return -ENOMEM;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 175d5d073ccf..12493846a2d3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -49,6 +49,9 @@ static const struct nfs_rw_ops nfs_rw_write_ops;
 static void nfs_clear_request_commit(struct nfs_page *req);
 static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
 				      struct inode *inode);
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						struct page *page);
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -95,38 +98,6 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
 }
 
 /*
- * nfs_page_search_commits_for_head_request_locked
- *
- * Search through commit lists on @inode for the head request for @page.
- * Must be called while holding the inode (which is cinfo) lock.
- *
- * Returns the head request if found, or NULL if not found.
- */
-static struct nfs_page *
-nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
-						struct page *page)
-{
-	struct nfs_page *freq, *t;
-	struct nfs_commit_info cinfo;
-	struct inode *inode = &nfsi->vfs_inode;
-
-	nfs_init_cinfo_from_inode(&cinfo, inode);
-
-	/* search through pnfs commit lists */
-	freq = pnfs_search_commit_reqs(inode, &cinfo, page);
-	if (freq)
-		return freq->wb_head;
-
-	/* Linearly search the commit list for the correct request */
-	list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
-		if (freq->wb_page == page)
-			return freq->wb_head;
-	}
-
-	return NULL;
-}
-
-/*
  * nfs_page_find_head_request_locked - find head request associated with @page
  *
  * must be called while holding the inode lock.
@@ -271,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req)
 
 static int wb_priority(struct writeback_control *wbc)
 {
+	int ret = 0;
 	if (wbc->for_reclaim)
 		return FLUSH_HIGHPRI | FLUSH_STABLE;
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		ret = FLUSH_COND_STABLE;
 	if (wbc->for_kupdate || wbc->for_background)
-		return FLUSH_LOWPRI | FLUSH_COND_STABLE;
-	return FLUSH_COND_STABLE;
+		ret |= FLUSH_LOWPRI;
+	return ret;
 }
 
 /*
@@ -731,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 		if (likely(!PageSwapCache(head->wb_page))) {
 			set_page_private(head->wb_page, 0);
 			ClearPagePrivate(head->wb_page);
+			smp_mb__after_atomic();
+			wake_up_page(head->wb_page, PG_private);
 			clear_bit(PG_MAPPED, &head->wb_flags);
 		}
 		nfsi->npages--;
@@ -749,7 +725,38 @@ nfs_mark_request_dirty(struct nfs_page *req)
 	__set_page_dirty_nobuffers(req->wb_page);
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through commit lists on @inode for the head request for @page.
+ * Must be called while holding the inode (which is cinfo) lock.
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						struct page *page)
+{
+	struct nfs_page *freq, *t;
+	struct nfs_commit_info cinfo;
+	struct inode *inode = &nfsi->vfs_inode;
+
+	nfs_init_cinfo_from_inode(&cinfo, inode);
+
+	/* search through pnfs commit lists */
+	freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+	if (freq)
+		return freq->wb_head;
+
+	/* Linearly search the commit list for the correct request */
+	list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+		if (freq->wb_page == page)
+			return freq->wb_head;
+	}
+
+	return NULL;
+}
+
 /**
  * nfs_request_add_commit_list - add request to a commit list
  * @req: pointer to a struct nfs_page
@@ -867,36 +874,6 @@ int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 	return hdr->verf.committed != NFS_FILE_SYNC;
 }
 
-#else
-static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
-				      struct inode *inode)
-{
-}
-
-void nfs_init_cinfo(struct nfs_commit_info *cinfo,
-		    struct inode *inode,
-		    struct nfs_direct_req *dreq)
-{
-}
-
-void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
-			struct nfs_commit_info *cinfo)
-{
-}
-
-static void
-nfs_clear_request_commit(struct nfs_page *req)
-{
-}
-
-int nfs_write_need_commit(struct nfs_pgio_header *hdr)
-{
-	return 0;
-}
-
-#endif
-
 static void nfs_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_commit_info cinfo;
@@ -932,7 +909,6 @@ out:
 	hdr->release(hdr);
 }
 
-#if  IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 unsigned long
 nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 {
@@ -989,19 +965,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 	return ret;
 }
 
-#else
-unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
-{
-	return 0;
-}
-
-int nfs_scan_commit(struct inode *inode, struct list_head *dst,
-		    struct nfs_commit_info *cinfo)
-{
-	return 0;
-}
-#endif
-
 /*
  * Search for an existing write request, and attempt to update
  * it to reflect a new dirty region on a given page.
@@ -1394,7 +1357,6 @@ static int nfs_writeback_done(struct rpc_task *task,
 		return status;
 	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 	if (hdr->res.verf->committed < hdr->args.stable &&
 	    task->tk_status >= 0) {
 		/* We tried a write call, but the server did not
@@ -1416,7 +1378,6 @@ static int nfs_writeback_done(struct rpc_task *task,
 			complain = jiffies + 300 * HZ;
 		}
 	}
-#endif
 
 	/* Deal with the suid/sgid bit corner case */
 	if (nfs_should_remove_suid(inode))
@@ -1469,7 +1430,6 @@ static void nfs_writeback_result(struct rpc_task *task,
 }
 
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
 {
 	int ret;
@@ -1538,6 +1498,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 }
 EXPORT_SYMBOL_GPL(nfs_initiate_commit);
 
+static loff_t nfs_get_lwb(struct list_head *head)
+{
+	loff_t lwb = 0;
+	struct nfs_page *req;
+
+	list_for_each_entry(req, head, wb_list)
+		if (lwb < (req_offset(req) + req->wb_bytes))
+			lwb = req_offset(req) + req->wb_bytes;
+
+	return lwb;
+}
+
 /*
  * Set up the argument/result storage required for the RPC call.
  */
@@ -1557,6 +1529,9 @@ void nfs_init_commit(struct nfs_commit_data *data,
 	data->inode	  = inode;
 	data->cred	  = first->wb_context->cred;
 	data->lseg	  = lseg; /* reference transferred */
+	/* only set lwb for pnfs commit */
+	if (lseg)
+		data->lwb = nfs_get_lwb(&data->pages);
 	data->mds_ops     = &nfs_commit_ops;
 	data->completion_ops = cinfo->completion_ops;
 	data->dreq	  = cinfo->dreq;
@@ -1636,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 	struct nfs_page	*req;
 	int status = data->task.tk_status;
 	struct nfs_commit_info cinfo;
+	struct nfs_server *nfss;
 
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
@@ -1669,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 	next:
 		nfs_unlock_and_release_request(req);
 	}
+	nfss = NFS_SERVER(data->inode);
+	if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
+		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+
 	nfs_init_cinfo(&cinfo, data->inode, data->dreq);
 	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
 		nfs_commit_clear_lock(NFS_I(data->inode));
@@ -1778,12 +1758,6 @@ out_mark_dirty:
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return ret;
 }
-#else
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
-{
-	return 0;
-}
-#endif
 
 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index f689ed82af3a..d153ca3ea577 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -3,5 +3,6 @@
 #
 
 obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
-
 nfs_acl-objs := nfsacl.o
+
+obj-$(CONFIG_GRACE_PERIOD) += grace.o
diff --git a/fs/lockd/grace.c b/fs/nfs_common/grace.c
index 6d1ee7204c88..ae6e58ea4de5 100644
--- a/fs/lockd/grace.c
+++ b/fs/nfs_common/grace.c
@@ -1,17 +1,20 @@
 /*
  * Common code for control of lockd and nfsv4 grace periods.
+ *
+ * Transplanted from lockd code
  */
 
 #include <linux/module.h>
-#include <linux/lockd/bind.h>
 #include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/fs.h>
 
-#include "netns.h"
-
+static int grace_net_id;
 static DEFINE_SPINLOCK(grace_lock);
 
 /**
  * locks_start_grace
+ * @net: net namespace that this lock manager belongs to
  * @lm: who this grace period is for
  *
  * A grace period is a period during which locks should not be given
@@ -21,18 +24,20 @@ static DEFINE_SPINLOCK(grace_lock);
  *
  * This function is called to start a grace period.
  */
-void locks_start_grace(struct net *net, struct lock_manager *lm)
+void
+locks_start_grace(struct net *net, struct lock_manager *lm)
 {
-	struct lockd_net *ln = net_generic(net, lockd_net_id);
+	struct list_head *grace_list = net_generic(net, grace_net_id);
 
 	spin_lock(&grace_lock);
-	list_add(&lm->list, &ln->grace_list);
+	list_add(&lm->list, grace_list);
 	spin_unlock(&grace_lock);
 }
 EXPORT_SYMBOL_GPL(locks_start_grace);
 
 /**
  * locks_end_grace
+ * @net: net namespace that this lock manager belongs to
  * @lm: who this grace period is for
  *
  * Call this function to state that the given lock manager is ready to
@@ -41,7 +46,8 @@ EXPORT_SYMBOL_GPL(locks_start_grace);
  * Note that callers count on it being safe to call this more than once,
  * and the second call should be a no-op.
  */
-void locks_end_grace(struct lock_manager *lm)
+void
+locks_end_grace(struct lock_manager *lm)
 {
 	spin_lock(&grace_lock);
 	list_del_init(&lm->list);
@@ -56,10 +62,52 @@ EXPORT_SYMBOL_GPL(locks_end_grace);
  * to answer ordinary lock requests, and when they should accept only
  * lock reclaims.
  */
-int locks_in_grace(struct net *net)
+int
+locks_in_grace(struct net *net)
 {
-	struct lockd_net *ln = net_generic(net, lockd_net_id);
+	struct list_head *grace_list = net_generic(net, grace_net_id);
 
-	return !list_empty(&ln->grace_list);
+	return !list_empty(grace_list);
 }
 EXPORT_SYMBOL_GPL(locks_in_grace);
+
+static int __net_init
+grace_init_net(struct net *net)
+{
+	struct list_head *grace_list = net_generic(net, grace_net_id);
+
+	INIT_LIST_HEAD(grace_list);
+	return 0;
+}
+
+static void __net_exit
+grace_exit_net(struct net *net)
+{
+	struct list_head *grace_list = net_generic(net, grace_net_id);
+
+	BUG_ON(!list_empty(grace_list));
+}
+
+static struct pernet_operations grace_net_ops = {
+	.init = grace_init_net,
+	.exit = grace_exit_net,
+	.id   = &grace_net_id,
+	.size = sizeof(struct list_head),
+};
+
+static int __init
+init_grace(void)
+{
+	return register_pernet_subsys(&grace_net_ops);
+}
+
+static void __exit
+exit_grace(void)
+{
+	unregister_pernet_subsys(&grace_net_ops);
+}
+
+MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>");
+MODULE_LICENSE("GPL");
+module_init(init_grace)
+module_exit(exit_grace)
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f994e750e0d1..73395156bdb4 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -71,6 +71,7 @@ config NFSD_V4
 	select FS_POSIX_ACL
 	select SUNRPC_GSS
 	select CRYPTO
+	select GRACE_PERIOD
 	help
 	  This option enables support in your system's NFS server for
 	  version 4 of the NFS protocol (RFC 3530).
@@ -94,9 +95,6 @@ config NFSD_V4_SECURITY_LABEL
 	If you do not wish to enable fine-grained security labels SELinux or
 	Smack policies on NFSv4 files, say N.
 
-	WARNING: there is still a chance of backwards-incompatible protocol changes.
-	For now we recommend "Y" only for developers and testers.
-
 config NFSD_FAULT_INJECTION
 	bool "NFS server manual fault injection"
 	depends on NFSD_V4 && DEBUG_KERNEL
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index b582f9ab6b2a..dd96a3830004 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -18,7 +18,6 @@
  * is much larger than a sockaddr_in6.
  */
 struct svc_cacherep {
-	struct hlist_node	c_hash;
 	struct list_head	c_lru;
 
 	unsigned char		c_state,	/* unused, inprog, done */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 72ffd7cce3c3..30a739d896ff 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1145,6 +1145,7 @@ static struct flags {
 	{ NFSEXP_ALLSQUASH, {"all_squash", ""}},
 	{ NFSEXP_ASYNC, {"async", "sync"}},
 	{ NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}},
+	{ NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}},
 	{ NFSEXP_NOHIDE, {"nohide", ""}},
 	{ NFSEXP_CROSSMOUNT, {"crossmnt", ""}},
 	{ NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index fa2525b2e9d7..12f2aab4f614 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -223,11 +223,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 	newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
 	attr   = &argp->attrs;
 
-	/* Get the directory inode */
-	nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE);
-	if (nfserr)
-		RETURN_STATUS(nfserr);
-
 	/* Unfudge the mode bits */
 	attr->ia_mode &= ~S_IFMT;
 	if (!(attr->ia_valid & ATTR_MODE)) { 
@@ -471,6 +466,14 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
 	resp->buflen = resp->count;
 	resp->rqstp = rqstp;
 	offset = argp->cookie;
+
+	nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP);
+	if (nfserr)
+		RETURN_STATUS(nfserr);
+
+	if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS)
+		RETURN_STATUS(nfserr_notsupp);
+
 	nfserr = nfsd_readdir(rqstp, &resp->fh,
 				     &offset,
 				     &resp->common,
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e0be57b0f79b..ed2b1151b171 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -49,12 +49,6 @@ static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
 
 /* Index of predefined Linux callback client operations */
 
-enum {
-	NFSPROC4_CLNT_CB_NULL = 0,
-	NFSPROC4_CLNT_CB_RECALL,
-	NFSPROC4_CLNT_CB_SEQUENCE,
-};
-
 struct nfs4_cb_compound_hdr {
 	/* args */
 	u32		ident;	/* minorversion 0 only */
@@ -494,7 +488,7 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
 static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
 				   const struct nfsd4_callback *cb)
 {
-	const struct nfs4_delegation *args = cb->cb_op;
+	const struct nfs4_delegation *dp = cb_to_delegation(cb);
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = cb->cb_clp->cl_cb_ident,
 		.minorversion = cb->cb_minorversion,
@@ -502,7 +496,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
 
 	encode_cb_compound4args(xdr, &hdr);
 	encode_cb_sequence4args(xdr, cb, &hdr);
-	encode_cb_recall4args(xdr, args, &hdr);
+	encode_cb_recall4args(xdr, dp, &hdr);
 	encode_cb_nops(&hdr);
 }
 
@@ -746,27 +740,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
 
 static struct workqueue_struct *callback_wq;
 
-static void run_nfsd4_cb(struct nfsd4_callback *cb)
-{
-	queue_work(callback_wq, &cb->cb_work);
-}
-
-static void do_probe_callback(struct nfs4_client *clp)
-{
-	struct nfsd4_callback *cb = &clp->cl_cb_null;
-
-	cb->cb_op = NULL;
-	cb->cb_clp = clp;
-
-	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
-	cb->cb_msg.rpc_argp = NULL;
-	cb->cb_msg.rpc_resp = NULL;
-
-	cb->cb_ops = &nfsd4_cb_probe_ops;
-
-	run_nfsd4_cb(cb);
-}
-
 /*
  * Poke the callback thread to process any updates to the callback
  * parameters, and send a null probe.
@@ -775,7 +748,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp)
 {
 	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
 	set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
-	do_probe_callback(clp);
+	nfsd4_run_cb(&clp->cl_cb_null);
 }
 
 void nfsd4_probe_callback_sync(struct nfs4_client *clp)
@@ -847,23 +820,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 		rpc_wake_up_next(&clp->cl_cb_waitq);
 		dprintk("%s: freed slot, new seqid=%d\n", __func__,
 			clp->cl_cb_session->se_cb_seq_nr);
-
-		/* We're done looking into the sequence information */
-		task->tk_msg.rpc_resp = NULL;
 	}
-}
-
-
-static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
-{
-	struct nfsd4_callback *cb = calldata;
-	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = cb->cb_clp;
-	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
-
-	nfsd4_cb_done(task, calldata);
 
-	if (current_rpc_client != task->tk_client) {
+	if (clp->cl_cb_client != task->tk_client) {
 		/* We're shutting down or changing cl_cb_client; leave
 		 * it to nfsd4_process_cb_update to restart the call if
 		 * necessary. */
@@ -872,47 +831,42 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 
 	if (cb->cb_done)
 		return;
-	switch (task->tk_status) {
+
+	switch (cb->cb_ops->done(cb, task)) {
 	case 0:
-		cb->cb_done = true;
+		task->tk_status = 0;
+		rpc_restart_call_prepare(task);
 		return;
-	case -EBADHANDLE:
-	case -NFS4ERR_BAD_STATEID:
-		/* Race: client probably got cb_recall
-		 * before open reply granting delegation */
+	case 1:
 		break;
-	default:
+	case -1:
 		/* Network partition? */
 		nfsd4_mark_cb_down(clp, task->tk_status);
+		break;
+	default:
+		BUG();
 	}
-	if (dp->dl_retries--) {
-		rpc_delay(task, 2*HZ);
-		task->tk_status = 0;
-		rpc_restart_call_prepare(task);
-		return;
-	}
-	nfsd4_mark_cb_down(clp, task->tk_status);
 	cb->cb_done = true;
 }
 
-static void nfsd4_cb_recall_release(void *calldata)
+static void nfsd4_cb_release(void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_client *clp = cb->cb_clp;
-	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 
 	if (cb->cb_done) {
 		spin_lock(&clp->cl_lock);
 		list_del(&cb->cb_per_client);
 		spin_unlock(&clp->cl_lock);
-		nfs4_put_stid(&dp->dl_stid);
+
+		cb->cb_ops->release(cb);
 	}
 }
 
-static const struct rpc_call_ops nfsd4_cb_recall_ops = {
+static const struct rpc_call_ops nfsd4_cb_ops = {
 	.rpc_call_prepare = nfsd4_cb_prepare,
-	.rpc_call_done = nfsd4_cb_recall_done,
-	.rpc_release = nfsd4_cb_recall_release,
+	.rpc_call_done = nfsd4_cb_done,
+	.rpc_release = nfsd4_cb_release,
 };
 
 int nfsd4_create_callback_queue(void)
@@ -937,16 +891,10 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
 	 * instead, nfsd4_run_cb_null() will detect the killed
 	 * client, destroy the rpc client, and stop:
 	 */
-	do_probe_callback(clp);
+	nfsd4_run_cb(&clp->cl_cb_null);
 	flush_workqueue(callback_wq);
 }
 
-static void nfsd4_release_cb(struct nfsd4_callback *cb)
-{
-	if (cb->cb_ops->rpc_release)
-		cb->cb_ops->rpc_release(cb);
-}
-
 /* requires cl_lock: */
 static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
 {
@@ -1009,63 +957,49 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 	}
 	/* Yay, the callback channel's back! Restart any callbacks: */
 	list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
-		run_nfsd4_cb(cb);
+		queue_work(callback_wq, &cb->cb_work);
 }
 
 static void
-nfsd4_run_callback_rpc(struct nfsd4_callback *cb)
+nfsd4_run_cb_work(struct work_struct *work)
 {
+	struct nfsd4_callback *cb =
+		container_of(work, struct nfsd4_callback, cb_work);
 	struct nfs4_client *clp = cb->cb_clp;
 	struct rpc_clnt *clnt;
 
+	if (cb->cb_ops && cb->cb_ops->prepare)
+		cb->cb_ops->prepare(cb);
+
 	if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
 		nfsd4_process_cb_update(cb);
 
 	clnt = clp->cl_cb_client;
 	if (!clnt) {
 		/* Callback channel broken, or client killed; give up: */
-		nfsd4_release_cb(cb);
+		if (cb->cb_ops && cb->cb_ops->release)
+			cb->cb_ops->release(cb);
 		return;
 	}
 	cb->cb_msg.rpc_cred = clp->cl_cb_cred;
 	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
-			cb->cb_ops, cb);
+			cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
 }
 
-void
-nfsd4_run_cb_null(struct work_struct *w)
+void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+		struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
 {
-	struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
-							cb_work);
-	nfsd4_run_callback_rpc(cb);
-}
-
-void
-nfsd4_run_cb_recall(struct work_struct *w)
-{
-	struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
-							cb_work);
-
-	nfsd4_prepare_cb_recall(cb->cb_op);
-	nfsd4_run_callback_rpc(cb);
-}
-
-void nfsd4_cb_recall(struct nfs4_delegation *dp)
-{
-	struct nfsd4_callback *cb = &dp->dl_recall;
-	struct nfs4_client *clp = dp->dl_stid.sc_client;
-
-	dp->dl_retries = 1;
-	cb->cb_op = dp;
 	cb->cb_clp = clp;
-	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
+	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
 	cb->cb_msg.rpc_argp = cb;
 	cb->cb_msg.rpc_resp = cb;
-
-	cb->cb_ops = &nfsd4_cb_recall_ops;
-
+	cb->cb_ops = ops;
+	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
 	INIT_LIST_HEAD(&cb->cb_per_client);
 	cb->cb_done = true;
+}
 
-	run_nfsd4_cb(&dp->dl_recall);
+void nfsd4_run_cb(struct nfsd4_callback *cb)
+{
+	queue_work(callback_wq, &cb->cb_work);
 }
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a0ab0a847d69..e1b3d3d472da 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -215,7 +215,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
 	memset(&ent, 0, sizeof(ent));
 
 	/* Authentication name */
-	if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+	len = qword_get(&buf, buf1, PAGE_SIZE);
+	if (len <= 0 || len >= IDMAP_NAMESZ)
 		goto out;
 	memcpy(ent.authname, buf1, sizeof(ent.authname));
 
@@ -245,12 +246,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
 	/* Name */
 	error = -EINVAL;
 	len = qword_get(&buf, buf1, PAGE_SIZE);
-	if (len < 0)
+	if (len < 0 || len >= IDMAP_NAMESZ)
 		goto out;
 	if (len == 0)
 		set_bit(CACHE_NEGATIVE, &ent.h.flags);
-	else if (len >= IDMAP_NAMESZ)
-		goto out;
 	else
 		memcpy(ent.name, buf1, sizeof(ent.name));
 	error = -ENOMEM;
@@ -259,15 +258,12 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
 		goto out;
 
 	cache_put(&res->h, cd);
-
 	error = 0;
 out:
 	kfree(buf1);
-
 	return error;
 }
 
-
 static struct ent *
 idtoname_lookup(struct cache_detail *cd, struct ent *item)
 {
@@ -368,7 +364,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
 {
 	struct ent ent, *res;
 	char *buf1;
-	int error = -EINVAL;
+	int len, error = -EINVAL;
 
 	if (buf[buflen - 1] != '\n')
 		return (-EINVAL);
@@ -381,7 +377,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
 	memset(&ent, 0, sizeof(ent));
 
 	/* Authentication name */
-	if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+	len = qword_get(&buf, buf1, PAGE_SIZE);
+	if (len <= 0 || len >= IDMAP_NAMESZ)
 		goto out;
 	memcpy(ent.authname, buf1, sizeof(ent.authname));
 
@@ -392,8 +389,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
 		IDMAP_TYPE_USER : IDMAP_TYPE_GROUP;
 
 	/* Name */
-	error = qword_get(&buf, buf1, PAGE_SIZE);
-	if (error <= 0 || error >= IDMAP_NAMESZ)
+	len = qword_get(&buf, buf1, PAGE_SIZE);
+	if (len <= 0 || len >= IDMAP_NAMESZ)
 		goto out;
 	memcpy(ent.name, buf1, sizeof(ent.name));
 
@@ -421,7 +418,6 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
 	error = 0;
 out:
 	kfree(buf1);
-
 	return (error);
 }
 
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5e0dc528a0e8..cdeb3cfd6f32 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1013,6 +1013,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status;
 }
 
+static __be32
+nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		struct nfsd4_seek *seek)
+{
+	int whence;
+	__be32 status;
+	struct file *file;
+
+	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+					    &seek->seek_stateid,
+					    RD_STATE, &file);
+	if (status) {
+		dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
+		return status;
+	}
+
+	switch (seek->seek_whence) {
+	case NFS4_CONTENT_DATA:
+		whence = SEEK_DATA;
+		break;
+	case NFS4_CONTENT_HOLE:
+		whence = SEEK_HOLE;
+		break;
+	default:
+		status = nfserr_union_notsupp;
+		goto out;
+	}
+
+	/*
+	 * Note:  This call does change file->f_pos, but nothing in NFSD
+	 *        should ever file->f_pos.
+	 */
+	seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
+	if (seek->seek_pos < 0)
+		status = nfserrno(seek->seek_pos);
+	else if (seek->seek_pos >= i_size_read(file_inode(file)))
+		seek->seek_eof = true;
+
+out:
+	fput(file);
+	return status;
+}
+
 /* This routine never returns NFS_OK!  If there are no other errors, it
  * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
  * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
@@ -1881,6 +1924,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
 		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
+
+	/* NFSv4.2 operations */
+	[OP_SEEK] = {
+		.op_func = (nfsd4op_func)nfsd4_seek,
+		.op_name = "OP_SEEK",
+	},
 };
 
 int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c271f42604a..ea95a2bc21b5 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops {
 	void (*create)(struct nfs4_client *);
 	void (*remove)(struct nfs4_client *);
 	int (*check)(struct nfs4_client *);
-	void (*grace_done)(struct nfsd_net *, time_t);
+	void (*grace_done)(struct nfsd_net *);
 };
 
 /* Globals */
@@ -188,7 +188,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 
 	status = mnt_want_write_file(nn->rec_file);
 	if (status)
-		return;
+		goto out_creds;
 
 	dir = nn->rec_file->f_path.dentry;
 	/* lock the parent */
@@ -228,6 +228,7 @@ out_unlock:
 				user_recovery_dirname);
 	}
 	mnt_drop_write_file(nn->rec_file);
+out_creds:
 	nfs4_reset_creds(original_cred);
 }
 
@@ -392,7 +393,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 }
 
 static void
-nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time)
+nfsd4_recdir_purge_old(struct nfsd_net *nn)
 {
 	int status;
 
@@ -479,6 +480,16 @@ nfsd4_init_recdir(struct net *net)
 	return status;
 }
 
+static void
+nfsd4_shutdown_recdir(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (!nn->rec_file)
+		return;
+	fput(nn->rec_file);
+	nn->rec_file = NULL;
+}
 
 static int
 nfs4_legacy_state_init(struct net *net)
@@ -512,10 +523,13 @@ nfsd4_load_reboot_recovery_data(struct net *net)
 	int status;
 
 	status = nfsd4_init_recdir(net);
-	if (!status)
-		status = nfsd4_recdir_load(net);
 	if (status)
-		printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+		return status;
+
+	status = nfsd4_recdir_load(net);
+	if (status)
+		nfsd4_shutdown_recdir(net);
+
 	return status;
 }
 
@@ -546,21 +560,12 @@ err:
 }
 
 static void
-nfsd4_shutdown_recdir(struct nfsd_net *nn)
-{
-	if (!nn->rec_file)
-		return;
-	fput(nn->rec_file);
-	nn->rec_file = NULL;
-}
-
-static void
 nfsd4_legacy_tracking_exit(struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	nfs4_release_reclaim(nn);
-	nfsd4_shutdown_recdir(nn);
+	nfsd4_shutdown_recdir(net);
 	nfs4_legacy_state_shutdown(net);
 }
 
@@ -1016,7 +1021,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
 }
 
 static void
-nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
+nfsd4_cld_grace_done(struct nfsd_net *nn)
 {
 	int ret;
 	struct cld_upcall *cup;
@@ -1029,7 +1034,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
 	}
 
 	cup->cu_msg.cm_cmd = Cld_GraceDone;
-	cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time;
+	cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
 	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
 	if (!ret)
 		ret = cup->cu_msg.cm_status;
@@ -1062,6 +1067,8 @@ MODULE_PARM_DESC(cltrack_legacy_disable,
 
 #define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
 #define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
+#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION="
+#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START="
 
 static char *
 nfsd4_cltrack_legacy_topdir(void)
@@ -1126,10 +1133,60 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
 	return result;
 }
 
+static char *
+nfsd4_cltrack_client_has_session(struct nfs4_client *clp)
+{
+	int copied;
+	size_t len;
+	char *result;
+
+	/* prefix + Y/N character + terminating NULL */
+	len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1;
+
+	result = kmalloc(len, GFP_KERNEL);
+	if (!result)
+		return result;
+
+	copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c",
+				clp->cl_minorversion ? 'Y' : 'N');
+	if (copied >= len) {
+		/* just return nothing if output was truncated */
+		kfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
+static char *
+nfsd4_cltrack_grace_start(time_t grace_start)
+{
+	int copied;
+	size_t len;
+	char *result;
+
+	/* prefix + max width of int64_t string + terminating NULL */
+	len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1;
+
+	result = kmalloc(len, GFP_KERNEL);
+	if (!result)
+		return result;
+
+	copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld",
+				grace_start);
+	if (copied >= len) {
+		/* just return nothing if output was truncated */
+		kfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
 static int
-nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
+nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
 {
-	char *envp[2];
+	char *envp[3];
 	char *argv[4];
 	int ret;
 
@@ -1140,10 +1197,12 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
 
 	dprintk("%s: cmd: %s\n", __func__, cmd);
 	dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
-	dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)");
+	dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)");
+	dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)");
 
-	envp[0] = legacy;
-	envp[1] = NULL;
+	envp[0] = env0;
+	envp[1] = env1;
+	envp[2] = NULL;
 
 	argv[0] = (char *)cltrack_prog;
 	argv[1] = cmd;
@@ -1187,28 +1246,78 @@ bin_to_hex_dup(const unsigned char *src, int srclen)
 }
 
 static int
-nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
+nfsd4_umh_cltrack_init(struct net *net)
 {
+	int ret;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+
 	/* XXX: The usermode helper s not working in container yet. */
 	if (net != &init_net) {
 		WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
 			"tracking in a container!\n");
 		return -EINVAL;
 	}
-	return nfsd4_umh_cltrack_upcall("init", NULL, NULL);
+
+	ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
+	kfree(grace_start);
+	return ret;
+}
+
+static void
+nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
+{
+	wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK,
+			 TASK_UNINTERRUPTIBLE);
+}
+
+static void
+nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
+{
+	smp_mb__before_atomic();
+	clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK);
 }
 
 static void
 nfsd4_umh_cltrack_create(struct nfs4_client *clp)
 {
-	char *hexid;
+	char *hexid, *has_session, *grace_start;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	/*
+	 * With v4.0 clients, there's little difference in outcome between a
+	 * create and check operation, and we can end up calling into this
+	 * function multiple times per client (once for each openowner). So,
+	 * for v4.0 clients skip upcalling once the client has been recorded
+	 * on stable storage.
+	 *
+	 * For v4.1+ clients, the outcome of the two operations is different,
+	 * so we must ensure that we upcall for the create operation. v4.1+
+	 * clients call this on RECLAIM_COMPLETE though, so we should only end
+	 * up doing a single create upcall per client.
+	 */
+	if (clp->cl_minorversion == 0 &&
+	    test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
 
 	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
 	if (!hexid) {
 		dprintk("%s: can't allocate memory for upcall!\n", __func__);
 		return;
 	}
-	nfsd4_umh_cltrack_upcall("create", hexid, NULL);
+
+	has_session = nfsd4_cltrack_client_has_session(clp);
+	grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+
+	nfsd4_cltrack_upcall_lock(clp);
+	if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start))
+		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	nfsd4_cltrack_upcall_unlock(clp);
+
+	kfree(has_session);
+	kfree(grace_start);
 	kfree(hexid);
 }
 
@@ -1217,12 +1326,21 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
 {
 	char *hexid;
 
+	if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
 	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
 	if (!hexid) {
 		dprintk("%s: can't allocate memory for upcall!\n", __func__);
 		return;
 	}
-	nfsd4_umh_cltrack_upcall("remove", hexid, NULL);
+
+	nfsd4_cltrack_upcall_lock(clp);
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
+	    nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
+		clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	nfsd4_cltrack_upcall_unlock(clp);
+
 	kfree(hexid);
 }
 
@@ -1230,30 +1348,45 @@ static int
 nfsd4_umh_cltrack_check(struct nfs4_client *clp)
 {
 	int ret;
-	char *hexid, *legacy;
+	char *hexid, *has_session, *legacy;
+
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
 
 	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
 	if (!hexid) {
 		dprintk("%s: can't allocate memory for upcall!\n", __func__);
 		return -ENOMEM;
 	}
+
+	has_session = nfsd4_cltrack_client_has_session(clp);
 	legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
-	ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy);
+
+	nfsd4_cltrack_upcall_lock(clp);
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
+		ret = 0;
+	} else {
+		ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy);
+		if (ret == 0)
+			set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+	nfsd4_cltrack_upcall_unlock(clp);
+	kfree(has_session);
 	kfree(legacy);
 	kfree(hexid);
+
 	return ret;
 }
 
 static void
-nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn,
-				time_t boot_time)
+nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
 {
 	char *legacy;
 	char timestr[22]; /* FIXME: better way to determine max size? */
 
-	sprintf(timestr, "%ld", boot_time);
+	sprintf(timestr, "%ld", nn->boot_time);
 	legacy = nfsd4_cltrack_legacy_topdir();
-	nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy);
+	nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL);
 	kfree(legacy);
 }
 
@@ -1356,10 +1489,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
 }
 
 void
-nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time)
+nfsd4_record_grace_done(struct nfsd_net *nn)
 {
 	if (nn->client_tracking_ops)
-		nn->client_tracking_ops->grace_done(nn, boot_time);
+		nn->client_tracking_ops->grace_done(nn);
 }
 
 static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e80a59e7e91..5c0cac173068 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -96,6 +96,8 @@ static struct kmem_cache *deleg_slab;
 
 static void free_session(struct nfsd4_session *);
 
+static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+
 static bool is_session_dead(struct nfsd4_session *ses)
 {
 	return ses->se_flags & NFS4_SESSION_DEAD;
@@ -645,7 +647,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
 	INIT_LIST_HEAD(&dp->dl_perclnt);
 	INIT_LIST_HEAD(&dp->dl_recall_lru);
 	dp->dl_type = NFS4_OPEN_DELEGATE_READ;
-	INIT_WORK(&dp->dl_recall.cb_work, nfsd4_run_cb_recall);
+	dp->dl_retries = 1;
+	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
+		      &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
 	return dp;
 out_dec:
 	atomic_long_dec(&num_delegations);
@@ -673,15 +677,20 @@ nfs4_put_stid(struct nfs4_stid *s)
 
 static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 {
-	lockdep_assert_held(&state_lock);
+	struct file *filp = NULL;
+	struct file_lock *fl;
 
-	if (!fp->fi_lease)
-		return;
-	if (atomic_dec_and_test(&fp->fi_delegees)) {
-		vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
+	spin_lock(&fp->fi_lock);
+	if (fp->fi_lease && atomic_dec_and_test(&fp->fi_delegees)) {
+		swap(filp, fp->fi_deleg_file);
+		fl = fp->fi_lease;
 		fp->fi_lease = NULL;
-		fput(fp->fi_deleg_file);
-		fp->fi_deleg_file = NULL;
+	}
+	spin_unlock(&fp->fi_lock);
+
+	if (filp) {
+		vfs_setlease(filp, F_UNLCK, &fl);
+		fput(filp);
 	}
 }
 
@@ -717,8 +726,6 @@ unhash_delegation_locked(struct nfs4_delegation *dp)
 	list_del_init(&dp->dl_recall_lru);
 	list_del_init(&dp->dl_perfile);
 	spin_unlock(&fp->fi_lock);
-	if (fp)
-		nfs4_put_deleg_lease(fp);
 }
 
 static void destroy_delegation(struct nfs4_delegation *dp)
@@ -726,6 +733,7 @@ static void destroy_delegation(struct nfs4_delegation *dp)
 	spin_lock(&state_lock);
 	unhash_delegation_locked(dp);
 	spin_unlock(&state_lock);
+	nfs4_put_deleg_lease(dp->dl_stid.sc_file);
 	nfs4_put_stid(&dp->dl_stid);
 }
 
@@ -735,6 +743,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
 
 	WARN_ON(!list_empty(&dp->dl_recall_lru));
 
+	nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+
 	if (clp->cl_minorversion == 0)
 		nfs4_put_stid(&dp->dl_stid);
 	else {
@@ -1635,6 +1645,7 @@ __destroy_client(struct nfs4_client *clp)
 	while (!list_empty(&reaplist)) {
 		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
 		list_del_init(&dp->dl_recall_lru);
+		nfs4_put_deleg_lease(dp->dl_stid.sc_file);
 		nfs4_put_stid(&dp->dl_stid);
 	}
 	while (!list_empty(&clp->cl_revoked)) {
@@ -1862,7 +1873,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
 		free_client(clp);
 		return NULL;
 	}
-	INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_run_cb_null);
+	nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
 	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	copy_verf(clp, verf);
@@ -3349,8 +3360,9 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 	return ret;
 }
 
-void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
+static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
 {
+	struct nfs4_delegation *dp = cb_to_delegation(cb);
 	struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net,
 					  nfsd_net_id);
 
@@ -3371,6 +3383,43 @@ void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
 	spin_unlock(&state_lock);
 }
 
+static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
+		struct rpc_task *task)
+{
+	struct nfs4_delegation *dp = cb_to_delegation(cb);
+
+	switch (task->tk_status) {
+	case 0:
+		return 1;
+	case -EBADHANDLE:
+	case -NFS4ERR_BAD_STATEID:
+		/*
+		 * Race: client probably got cb_recall before open reply
+		 * granting delegation.
+		 */
+		if (dp->dl_retries--) {
+			rpc_delay(task, 2 * HZ);
+			return 0;
+		}
+		/*FALLTHRU*/
+	default:
+		return -1;
+	}
+}
+
+static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
+{
+	struct nfs4_delegation *dp = cb_to_delegation(cb);
+
+	nfs4_put_stid(&dp->dl_stid);
+}
+
+static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
+	.prepare	= nfsd4_cb_recall_prepare,
+	.done		= nfsd4_cb_recall_done,
+	.release	= nfsd4_cb_recall_release,
+};
+
 static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 {
 	/*
@@ -3381,7 +3430,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 	 * it's safe to take a reference.
 	 */
 	atomic_inc(&dp->dl_stid.sc_count);
-	nfsd4_cb_recall(dp);
+	nfsd4_run_cb(&dp->dl_recall);
 }
 
 /* Called from break_lease() with i_lock held. */
@@ -3759,7 +3808,6 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
 	fl = locks_alloc_lock();
 	if (!fl)
 		return NULL;
-	locks_init_lock(fl);
 	fl->fl_lmops = &nfsd_lease_mng_ops;
 	fl->fl_flags = FL_DELEG;
 	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
@@ -4107,7 +4155,7 @@ out:
 	return status;
 }
 
-static void
+void
 nfsd4_end_grace(struct nfsd_net *nn)
 {
 	/* do nothing if grace period already ended */
@@ -4116,14 +4164,28 @@ nfsd4_end_grace(struct nfsd_net *nn)
 
 	dprintk("NFSD: end of grace period\n");
 	nn->grace_ended = true;
-	nfsd4_record_grace_done(nn, nn->boot_time);
+	/*
+	 * If the server goes down again right now, an NFSv4
+	 * client will still be allowed to reclaim after it comes back up,
+	 * even if it hasn't yet had a chance to reclaim state this time.
+	 *
+	 */
+	nfsd4_record_grace_done(nn);
+	/*
+	 * At this point, NFSv4 clients can still reclaim.  But if the
+	 * server crashes, any that have not yet reclaimed will be out
+	 * of luck on the next boot.
+	 *
+	 * (NFSv4.1+ clients are considered to have reclaimed once they
+	 * call RECLAIM_COMPLETE.  NFSv4.0 clients are considered to
+	 * have reclaimed after their first OPEN.)
+	 */
 	locks_end_grace(&nn->nfsd4_manager);
 	/*
-	 * Now that every NFSv4 client has had the chance to recover and
-	 * to see the (possibly new, possibly shorter) lease time, we
-	 * can safely set the next grace time to the current lease time:
+	 * At this point, and once lockd and/or any other containers
+	 * exit their grace period, further reclaims will fail and
+	 * regular locking can resume.
 	 */
-	nn->nfsd4_grace = nn->nfsd4_lease;
 }
 
 static time_t
@@ -5210,7 +5272,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 
 	fp = lock_stp->st_stid.sc_file;
-	locks_init_lock(file_lock);
 	switch (lock->lk_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
@@ -5354,7 +5415,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = nfserr_jukebox;
 		goto out;
 	}
-	locks_init_lock(file_lock);
+
 	switch (lockt->lt_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
@@ -5432,7 +5493,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = nfserr_jukebox;
 		goto fput;
 	}
-	locks_init_lock(file_lock);
+
 	file_lock->fl_type = F_UNLCK;
 	file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
 	file_lock->fl_pid = current->tgid;
@@ -5645,6 +5706,9 @@ nfs4_check_open_reclaim(clientid_t *clid,
 	if (status)
 		return nfserr_reclaim_bad;
 
+	if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
+		return nfserr_no_grace;
+
 	if (nfsd4_client_record_check(cstate->clp))
 		return nfserr_reclaim_bad;
 
@@ -6342,10 +6406,10 @@ nfs4_state_start_net(struct net *net)
 	ret = nfs4_state_create_net(net);
 	if (ret)
 		return ret;
-	nfsd4_client_tracking_init(net);
 	nn->boot_time = get_seconds();
-	locks_start_grace(net, &nn->nfsd4_manager);
 	nn->grace_ended = false;
+	locks_start_grace(net, &nn->nfsd4_manager);
+	nfsd4_client_tracking_init(net);
 	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
 	       nn->nfsd4_grace, net);
 	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
@@ -6402,6 +6466,7 @@ nfs4_state_shutdown_net(struct net *net)
 	list_for_each_safe(pos, next, &reaplist) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
 		list_del_init(&dp->dl_recall_lru);
+		nfs4_put_deleg_lease(dp->dl_stid.sc_file);
 		nfs4_put_stid(&dp->dl_stid);
 	}
 
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e94457c33ad6..eeea7a90eb87 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -31,13 +31,6 @@
  *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * TODO: Neil Brown made the following observation:  We currently
- * initially reserve NFSD_BUFSIZE space on the transmit queue and
- * never release any of that until the request is complete.
- * It would be good to calculate a new maximum response size while
- * decoding the COMPOUND, and call svc_reserve with this number
- * at the end of nfs4svc_decode_compoundargs.
  */
 
 #include <linux/slab.h>
@@ -1521,6 +1514,22 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
 }
 
 static __be32
+nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+{
+	DECODE_HEAD;
+
+	status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
+	if (status)
+		return status;
+
+	READ_BUF(8 + 4);
+	p = xdr_decode_hyper(p, &seek->seek_offset);
+	seek->seek_whence = be32_to_cpup(p);
+
+	DECODE_TAIL;
+}
+
+static __be32
 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
 {
 	return nfs_ok;
@@ -1593,6 +1602,20 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_WANT_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_DESTROY_CLIENTID]	= (nfsd4_dec)nfsd4_decode_destroy_clientid,
 	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_reclaim_complete,
+
+	/* new operations for NFSv4.2 */
+	[OP_ALLOCATE]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_COPY]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_COPY_NOTIFY]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DEALLOCATE]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTERROR]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTSTATS]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OFFLOAD_CANCEL]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_seek,
+	[OP_WRITE_SAME]		= (nfsd4_dec)nfsd4_decode_notsupp,
 };
 
 static inline bool
@@ -1670,6 +1693,14 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 			readbytes += nfsd4_max_reply(argp->rqstp, op);
 		} else
 			max_reply += nfsd4_max_reply(argp->rqstp, op);
+		/*
+		 * OP_LOCK may return a conflicting lock.  (Special case
+		 * because it will just skip encoding this if it runs
+		 * out of xdr buffer space, and it is the only operation
+		 * that behaves this way.)
+		 */
+		if (op->opnum == OP_LOCK)
+			max_reply += NFS4_OPAQUE_LIMIT;
 
 		if (op->status) {
 			argp->opcnt = i+1;
@@ -3104,7 +3135,8 @@ static __be32 nfsd4_encode_splice_read(
 
 	buf->page_len = maxcount;
 	buf->len += maxcount;
-	xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE;
+	xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
+							/ PAGE_SIZE;
 
 	/* Use rest of head for padding and remaining ops: */
 	buf->tail[0].iov_base = xdr->p;
@@ -3763,6 +3795,22 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
 }
 
 static __be32
+nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  struct nfsd4_seek *seek)
+{
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(&resp->xdr, 4 + 8);
+	*p++ = cpu_to_be32(seek->seek_eof);
+	p = xdr_encode_hyper(p, seek->seek_pos);
+
+	return nfserr;
+}
+
+static __be32
 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
 {
 	return nfserr;
@@ -3834,6 +3882,20 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_WANT_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_DESTROY_CLIENTID]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_RECLAIM_COMPLETE]	= (nfsd4_enc)nfsd4_encode_noop,
+
+	/* NFSv4.2 operations */
+	[OP_ALLOCATE]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_COPY]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_COPY_NOTIFY]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_DEALLOCATE]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTERROR]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTSTATS]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_OFFLOAD_CANCEL]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_seek,
+	[OP_WRITE_SAME]		= (nfsd4_enc)nfsd4_encode_noop,
 };
 
 /*
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index ff9567633245..122f69185ef5 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -27,8 +27,12 @@
  */
 #define TARGET_BUCKET_SIZE	64
 
-static struct hlist_head *	cache_hash;
-static struct list_head 	lru_head;
+struct nfsd_drc_bucket {
+	struct list_head lru_head;
+	spinlock_t cache_lock;
+};
+
+static struct nfsd_drc_bucket	*drc_hashtbl;
 static struct kmem_cache	*drc_slab;
 
 /* max number of entries allowed in the cache */
@@ -36,6 +40,7 @@ static unsigned int		max_drc_entries;
 
 /* number of significant bits in the hash value */
 static unsigned int		maskbits;
+static unsigned int		drc_hashsize;
 
 /*
  * Stats and other tracking of on the duplicate reply cache. All of these and
@@ -43,7 +48,7 @@ static unsigned int		maskbits;
  */
 
 /* total number of entries */
-static unsigned int		num_drc_entries;
+static atomic_t			num_drc_entries;
 
 /* cache misses due only to checksum comparison failures */
 static unsigned int		payload_misses;
@@ -75,7 +80,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
  * A cache entry is "single use" if c_state == RC_INPROG
  * Otherwise, it when accessing _prev or _next, the lock must be held.
  */
-static DEFINE_SPINLOCK(cache_lock);
 static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
 
 /*
@@ -116,6 +120,12 @@ nfsd_hashsize(unsigned int limit)
 	return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
 }
 
+static u32
+nfsd_cache_hash(__be32 xid)
+{
+	return hash_32(be32_to_cpu(xid), maskbits);
+}
+
 static struct svc_cacherep *
 nfsd_reply_cache_alloc(void)
 {
@@ -126,7 +136,6 @@ nfsd_reply_cache_alloc(void)
 		rp->c_state = RC_UNUSED;
 		rp->c_type = RC_NOCACHE;
 		INIT_LIST_HEAD(&rp->c_lru);
-		INIT_HLIST_NODE(&rp->c_hash);
 	}
 	return rp;
 }
@@ -138,29 +147,27 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
 		drc_mem_usage -= rp->c_replvec.iov_len;
 		kfree(rp->c_replvec.iov_base);
 	}
-	if (!hlist_unhashed(&rp->c_hash))
-		hlist_del(&rp->c_hash);
 	list_del(&rp->c_lru);
-	--num_drc_entries;
+	atomic_dec(&num_drc_entries);
 	drc_mem_usage -= sizeof(*rp);
 	kmem_cache_free(drc_slab, rp);
 }
 
 static void
-nfsd_reply_cache_free(struct svc_cacherep *rp)
+nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
 {
-	spin_lock(&cache_lock);
+	spin_lock(&b->cache_lock);
 	nfsd_reply_cache_free_locked(rp);
-	spin_unlock(&cache_lock);
+	spin_unlock(&b->cache_lock);
 }
 
 int nfsd_reply_cache_init(void)
 {
 	unsigned int hashsize;
+	unsigned int i;
 
-	INIT_LIST_HEAD(&lru_head);
 	max_drc_entries = nfsd_cache_size_limit();
-	num_drc_entries = 0;
+	atomic_set(&num_drc_entries, 0);
 	hashsize = nfsd_hashsize(max_drc_entries);
 	maskbits = ilog2(hashsize);
 
@@ -170,9 +177,14 @@ int nfsd_reply_cache_init(void)
 	if (!drc_slab)
 		goto out_nomem;
 
-	cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL);
-	if (!cache_hash)
+	drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
+	if (!drc_hashtbl)
 		goto out_nomem;
+	for (i = 0; i < hashsize; i++) {
+		INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
+		spin_lock_init(&drc_hashtbl[i].cache_lock);
+	}
+	drc_hashsize = hashsize;
 
 	return 0;
 out_nomem:
@@ -184,17 +196,22 @@ out_nomem:
 void nfsd_reply_cache_shutdown(void)
 {
 	struct svc_cacherep	*rp;
+	unsigned int i;
 
 	unregister_shrinker(&nfsd_reply_cache_shrinker);
 	cancel_delayed_work_sync(&cache_cleaner);
 
-	while (!list_empty(&lru_head)) {
-		rp = list_entry(lru_head.next, struct svc_cacherep, c_lru);
-		nfsd_reply_cache_free_locked(rp);
+	for (i = 0; i < drc_hashsize; i++) {
+		struct list_head *head = &drc_hashtbl[i].lru_head;
+		while (!list_empty(head)) {
+			rp = list_first_entry(head, struct svc_cacherep, c_lru);
+			nfsd_reply_cache_free_locked(rp);
+		}
 	}
 
-	kfree (cache_hash);
-	cache_hash = NULL;
+	kfree (drc_hashtbl);
+	drc_hashtbl = NULL;
+	drc_hashsize = 0;
 
 	if (drc_slab) {
 		kmem_cache_destroy(drc_slab);
@@ -207,61 +224,63 @@ void nfsd_reply_cache_shutdown(void)
  * not already scheduled.
  */
 static void
-lru_put_end(struct svc_cacherep *rp)
+lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
 {
 	rp->c_timestamp = jiffies;
-	list_move_tail(&rp->c_lru, &lru_head);
+	list_move_tail(&rp->c_lru, &b->lru_head);
 	schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
 }
 
-/*
- * Move a cache entry from one hash list to another
- */
-static void
-hash_refile(struct svc_cacherep *rp)
-{
-	hlist_del_init(&rp->c_hash);
-	/*
-	 * No point in byte swapping c_xid since we're just using it to pick
-	 * a hash bucket.
-	 */
-	hlist_add_head(&rp->c_hash, cache_hash +
-			hash_32((__force u32)rp->c_xid, maskbits));
-}
-
-/*
- * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
- * Also prune the oldest ones when the total exceeds the max number of entries.
- */
 static long
-prune_cache_entries(void)
+prune_bucket(struct nfsd_drc_bucket *b)
 {
 	struct svc_cacherep *rp, *tmp;
 	long freed = 0;
 
-	list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
+	list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
 		/*
 		 * Don't free entries attached to calls that are still
 		 * in-progress, but do keep scanning the list.
 		 */
 		if (rp->c_state == RC_INPROG)
 			continue;
-		if (num_drc_entries <= max_drc_entries &&
+		if (atomic_read(&num_drc_entries) <= max_drc_entries &&
 		    time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
 			break;
 		nfsd_reply_cache_free_locked(rp);
 		freed++;
 	}
+	return freed;
+}
+
+/*
+ * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
+ * Also prune the oldest ones when the total exceeds the max number of entries.
+ */
+static long
+prune_cache_entries(void)
+{
+	unsigned int i;
+	long freed = 0;
+	bool cancel = true;
+
+	for (i = 0; i < drc_hashsize; i++) {
+		struct nfsd_drc_bucket *b = &drc_hashtbl[i];
+
+		if (list_empty(&b->lru_head))
+			continue;
+		spin_lock(&b->cache_lock);
+		freed += prune_bucket(b);
+		if (!list_empty(&b->lru_head))
+			cancel = false;
+		spin_unlock(&b->cache_lock);
+	}
 
 	/*
-	 * Conditionally rearm the job. If we cleaned out the list, then
-	 * cancel any pending run (since there won't be any work to do).
-	 * Otherwise, we rearm the job or modify the existing one to run in
-	 * RC_EXPIRE since we just ran the pruner.
+	 * Conditionally rearm the job to run in RC_EXPIRE since we just
+	 * ran the pruner.
 	 */
-	if (list_empty(&lru_head))
-		cancel_delayed_work(&cache_cleaner);
-	else
+	if (!cancel)
 		mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
 	return freed;
 }
@@ -269,32 +288,19 @@ prune_cache_entries(void)
 static void
 cache_cleaner_func(struct work_struct *unused)
 {
-	spin_lock(&cache_lock);
 	prune_cache_entries();
-	spin_unlock(&cache_lock);
 }
 
 static unsigned long
 nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-	unsigned long num;
-
-	spin_lock(&cache_lock);
-	num = num_drc_entries;
-	spin_unlock(&cache_lock);
-
-	return num;
+	return atomic_read(&num_drc_entries);
 }
 
 static unsigned long
 nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	unsigned long freed;
-
-	spin_lock(&cache_lock);
-	freed = prune_cache_entries();
-	spin_unlock(&cache_lock);
-	return freed;
+	return prune_cache_entries();
 }
 /*
  * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
@@ -332,20 +338,24 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
 static bool
 nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
 {
-	/* Check RPC header info first */
-	if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc ||
-	    rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers ||
-	    rqstp->rq_arg.len != rp->c_len ||
-	    !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
-	    rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+	/* Check RPC XID first */
+	if (rqstp->rq_xid != rp->c_xid)
 		return false;
-
 	/* compare checksum of NFS data */
 	if (csum != rp->c_csum) {
 		++payload_misses;
 		return false;
 	}
 
+	/* Other discriminators */
+	if (rqstp->rq_proc != rp->c_proc ||
+	    rqstp->rq_prot != rp->c_prot ||
+	    rqstp->rq_vers != rp->c_vers ||
+	    rqstp->rq_arg.len != rp->c_len ||
+	    !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
+	    rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+		return false;
+
 	return true;
 }
 
@@ -355,18 +365,14 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
  * NULL on failure.
  */
 static struct svc_cacherep *
-nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
+nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
+		__wsum csum)
 {
 	struct svc_cacherep	*rp, *ret = NULL;
-	struct hlist_head 	*rh;
+	struct list_head 	*rh = &b->lru_head;
 	unsigned int		entries = 0;
 
-	/*
-	 * No point in byte swapping rq_xid since we're just using it to pick
-	 * a hash bucket.
-	 */
-	rh = &cache_hash[hash_32((__force u32)rqstp->rq_xid, maskbits)];
-	hlist_for_each_entry(rp, rh, c_hash) {
+	list_for_each_entry(rp, rh, c_lru) {
 		++entries;
 		if (nfsd_cache_match(rqstp, csum, rp)) {
 			ret = rp;
@@ -377,11 +383,12 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
 	/* tally hash chain length stats */
 	if (entries > longest_chain) {
 		longest_chain = entries;
-		longest_chain_cachesize = num_drc_entries;
+		longest_chain_cachesize = atomic_read(&num_drc_entries);
 	} else if (entries == longest_chain) {
 		/* prefer to keep the smallest cachesize possible here */
-		longest_chain_cachesize = min(longest_chain_cachesize,
-						num_drc_entries);
+		longest_chain_cachesize = min_t(unsigned int,
+				longest_chain_cachesize,
+				atomic_read(&num_drc_entries));
 	}
 
 	return ret;
@@ -403,6 +410,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 				vers = rqstp->rq_vers,
 				proc = rqstp->rq_proc;
 	__wsum			csum;
+	u32 hash = nfsd_cache_hash(xid);
+	struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
 	unsigned long		age;
 	int type = rqstp->rq_cachetype;
 	int rtn = RC_DOIT;
@@ -420,16 +429,16 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 	 * preallocate an entry.
 	 */
 	rp = nfsd_reply_cache_alloc();
-	spin_lock(&cache_lock);
+	spin_lock(&b->cache_lock);
 	if (likely(rp)) {
-		++num_drc_entries;
+		atomic_inc(&num_drc_entries);
 		drc_mem_usage += sizeof(*rp);
 	}
 
 	/* go ahead and prune the cache */
-	prune_cache_entries();
+	prune_bucket(b);
 
-	found = nfsd_cache_search(rqstp, csum);
+	found = nfsd_cache_search(b, rqstp, csum);
 	if (found) {
 		if (likely(rp))
 			nfsd_reply_cache_free_locked(rp);
@@ -454,8 +463,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 	rp->c_len = rqstp->rq_arg.len;
 	rp->c_csum = csum;
 
-	hash_refile(rp);
-	lru_put_end(rp);
+	lru_put_end(b, rp);
 
 	/* release any buffer */
 	if (rp->c_type == RC_REPLBUFF) {
@@ -465,14 +473,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 	}
 	rp->c_type = RC_NOCACHE;
  out:
-	spin_unlock(&cache_lock);
+	spin_unlock(&b->cache_lock);
 	return rtn;
 
 found_entry:
 	nfsdstats.rchits++;
 	/* We found a matching entry which is either in progress or done. */
 	age = jiffies - rp->c_timestamp;
-	lru_put_end(rp);
+	lru_put_end(b, rp);
 
 	rtn = RC_DROPIT;
 	/* Request being processed or excessive rexmits */
@@ -527,18 +535,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 {
 	struct svc_cacherep *rp = rqstp->rq_cacherep;
 	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
+	u32		hash;
+	struct nfsd_drc_bucket *b;
 	int		len;
 	size_t		bufsize = 0;
 
 	if (!rp)
 		return;
 
+	hash = nfsd_cache_hash(rp->c_xid);
+	b = &drc_hashtbl[hash];
+
 	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
 	len >>= 2;
 
 	/* Don't cache excessive amounts of data and XDR failures */
 	if (!statp || len > (256 >> 2)) {
-		nfsd_reply_cache_free(rp);
+		nfsd_reply_cache_free(b, rp);
 		return;
 	}
 
@@ -553,23 +566,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 		bufsize = len << 2;
 		cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
 		if (!cachv->iov_base) {
-			nfsd_reply_cache_free(rp);
+			nfsd_reply_cache_free(b, rp);
 			return;
 		}
 		cachv->iov_len = bufsize;
 		memcpy(cachv->iov_base, statp, bufsize);
 		break;
 	case RC_NOCACHE:
-		nfsd_reply_cache_free(rp);
+		nfsd_reply_cache_free(b, rp);
 		return;
 	}
-	spin_lock(&cache_lock);
+	spin_lock(&b->cache_lock);
 	drc_mem_usage += bufsize;
-	lru_put_end(rp);
+	lru_put_end(b, rp);
 	rp->c_secure = rqstp->rq_secure;
 	rp->c_type = cachetype;
 	rp->c_state = RC_DONE;
-	spin_unlock(&cache_lock);
+	spin_unlock(&b->cache_lock);
 	return;
 }
 
@@ -600,9 +613,9 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
  */
 static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 {
-	spin_lock(&cache_lock);
 	seq_printf(m, "max entries:           %u\n", max_drc_entries);
-	seq_printf(m, "num entries:           %u\n", num_drc_entries);
+	seq_printf(m, "num entries:           %u\n",
+			atomic_read(&num_drc_entries));
 	seq_printf(m, "hash buckets:          %u\n", 1 << maskbits);
 	seq_printf(m, "mem usage:             %u\n", drc_mem_usage);
 	seq_printf(m, "cache hits:            %u\n", nfsdstats.rchits);
@@ -611,7 +624,6 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, "payload misses:        %u\n", payload_misses);
 	seq_printf(m, "longest chain len:     %u\n", longest_chain);
 	seq_printf(m, "cachesize at longest:  %u\n", longest_chain_cachesize);
-	spin_unlock(&cache_lock);
 	return 0;
 }
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4e042105fb6e..ca73ca79a0ee 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -49,6 +49,7 @@ enum {
 	NFSD_Leasetime,
 	NFSD_Gracetime,
 	NFSD_RecoveryDir,
+	NFSD_V4EndGrace,
 #endif
 };
 
@@ -68,6 +69,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
 #endif
 
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
@@ -84,6 +86,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Leasetime] = write_leasetime,
 	[NFSD_Gracetime] = write_gracetime,
 	[NFSD_RecoveryDir] = write_recoverydir,
+	[NFSD_V4EndGrace] = write_v4_end_grace,
 #endif
 };
 
@@ -1077,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 	return rv;
 }
 
+/**
+ * write_v4_end_grace - release grace period for nfsd's v4.x lock manager
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * OR
+ *
+ * Input:
+ * 			buf:		any value
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *			passed-in buffer filled with "Y" or "N" with a newline
+ *			and NULL-terminated C string. This indicates whether
+ *			the grace period has ended in the current net
+ *			namespace. Return code is the size in bytes of the
+ *			string. Writing a string that starts with 'Y', 'y', or
+ *			'1' to the file will end the grace period for nfsd's v4
+ *			lock manager.
+ */
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
+{
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (size > 0) {
+		switch(buf[0]) {
+		case 'Y':
+		case 'y':
+		case '1':
+			nfsd4_end_grace(nn);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
+			 nn->grace_ended ? 'Y' : 'N');
+}
+
 #endif
 
 /*----------------------------------------------------------------------------*/
@@ -1110,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
 #endif
 		/* last one */ {""}
 	};
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 847daf37e566..747f3b95bd11 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -251,7 +251,7 @@ void		nfsd_lockd_shutdown(void);
 #define nfserr_deleg_revoked		cpu_to_be32(NFS4ERR_DELEG_REVOKED)
 #define nfserr_partner_notsupp		cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
 #define nfserr_partner_no_auth		cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
-#define nfserr_metadata_notsupp		cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
+#define nfserr_union_notsupp		cpu_to_be32(NFS4ERR_UNION_NOTSUPP)
 #define nfserr_offload_denied		cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
 #define nfserr_wrong_lfs		cpu_to_be32(NFS4ERR_WRONG_LFS)
 #define nfserr_badlabel		cpu_to_be32(NFS4ERR_BADLABEL)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index e883a5868be6..88026fc6a981 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -209,8 +209,10 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 		 * fix that case easily.
 		 */
 		struct cred *new = prepare_creds();
-		if (!new)
-			return nfserrno(-ENOMEM);
+		if (!new) {
+			error =  nfserrno(-ENOMEM);
+			goto out;
+		}
 		new->cap_effective =
 			cap_raise_nfsd_set(new->cap_effective,
 					   new->cap_permitted);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4a89e00d7461..0a47c6a6b301 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -62,16 +62,21 @@ typedef struct {
 	(s)->si_generation
 
 struct nfsd4_callback {
-	void *cb_op;
 	struct nfs4_client *cb_clp;
 	struct list_head cb_per_client;
 	u32 cb_minorversion;
 	struct rpc_message cb_msg;
-	const struct rpc_call_ops *cb_ops;
+	struct nfsd4_callback_ops *cb_ops;
 	struct work_struct cb_work;
 	bool cb_done;
 };
 
+struct nfsd4_callback_ops {
+	void (*prepare)(struct nfsd4_callback *);
+	int (*done)(struct nfsd4_callback *, struct rpc_task *);
+	void (*release)(struct nfsd4_callback *);
+};
+
 /*
  * A core object that represents a "common" stateid. These are generally
  * embedded within the different (more specific) stateid objects and contain
@@ -127,6 +132,9 @@ struct nfs4_delegation {
 	struct nfsd4_callback	dl_recall;
 };
 
+#define cb_to_delegation(cb) \
+	container_of(cb, struct nfs4_delegation, dl_recall)
+
 /* client delegation callback info */
 struct nfs4_cb_conn {
 	/* SETCLIENTID info */
@@ -306,6 +314,7 @@ struct nfs4_client {
 #define NFSD4_CLIENT_STABLE		(2)	/* client on stable storage */
 #define NFSD4_CLIENT_RECLAIM_COMPLETE	(3)	/* reclaim_complete done */
 #define NFSD4_CLIENT_CONFIRMED		(4)	/* client is confirmed */
+#define NFSD4_CLIENT_UPCALL_LOCK	(5)	/* upcall serialization */
 #define NFSD4_CLIENT_CB_FLAG_MASK	(1 << NFSD4_CLIENT_CB_UPDATE | \
 					 1 << NFSD4_CLIENT_CB_KILL)
 	unsigned long		cl_flags;
@@ -517,6 +526,13 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
 #define RD_STATE	        0x00000010
 #define WR_STATE	        0x00000020
 
+enum nfsd4_cb_op {
+	NFSPROC4_CLNT_CB_NULL = 0,
+	NFSPROC4_CLNT_CB_RECALL,
+	NFSPROC4_CLNT_CB_SEQUENCE,
+};
+
+
 struct nfsd4_compound_state;
 struct nfsd_net;
 
@@ -531,12 +547,12 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
 		struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
 extern int set_callback_cred(void);
-void nfsd4_run_cb_null(struct work_struct *w);
-void nfsd4_run_cb_recall(struct work_struct *w);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
-extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+		struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+extern void nfsd4_run_cb(struct nfsd4_callback *cb);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
@@ -545,13 +561,16 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
 							struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
 
+/* grace period management */
+void nfsd4_end_grace(struct nfsd_net *nn);
+
 /* nfs4recover operations */
 extern int nfsd4_client_tracking_init(struct net *net);
 extern void nfsd4_client_tracking_exit(struct net *net);
 extern void nfsd4_client_record_create(struct nfs4_client *clp);
 extern void nfsd4_client_record_remove(struct nfs4_client *clp);
 extern int nfsd4_client_record_check(struct nfs4_client *clp);
-extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn);
 
 /* nfs fault injection functions */
 #ifdef CONFIG_NFSD_FAULT_INJECTION
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f501a9b5c9df..965cffd17a0c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -445,6 +445,16 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		if (err)
 			goto out;
 		size_change = 1;
+
+		/*
+		 * RFC5661, Section 18.30.4:
+		 *   Changing the size of a file with SETATTR indirectly
+		 *   changes the time_modify and change attributes.
+		 *
+		 * (and similar for the older RFCs)
+		 */
+		if (iap->ia_size != i_size_read(inode))
+			iap->ia_valid |= ATTR_MTIME;
 	}
 
 	iap->ia_valid |= ATTR_CTIME;
@@ -649,6 +659,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 {
 	struct path	path;
 	struct inode	*inode;
+	struct file	*file;
 	int		flags = O_RDONLY|O_LARGEFILE;
 	__be32		err;
 	int		host_err = 0;
@@ -703,19 +714,25 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 		else
 			flags = O_WRONLY|O_LARGEFILE;
 	}
-	*filp = dentry_open(&path, flags, current_cred());
-	if (IS_ERR(*filp)) {
-		host_err = PTR_ERR(*filp);
-		*filp = NULL;
-	} else {
-		host_err = ima_file_check(*filp, may_flags);
 
-		if (may_flags & NFSD_MAY_64BIT_COOKIE)
-			(*filp)->f_mode |= FMODE_64BITHASH;
-		else
-			(*filp)->f_mode |= FMODE_32BITHASH;
+	file = dentry_open(&path, flags, current_cred());
+	if (IS_ERR(file)) {
+		host_err = PTR_ERR(file);
+		goto out_nfserr;
 	}
 
+	host_err = ima_file_check(file, may_flags);
+	if (host_err) {
+		nfsd_close(file);
+		goto out_nfserr;
+	}
+
+	if (may_flags & NFSD_MAY_64BIT_COOKIE)
+		file->f_mode |= FMODE_64BITHASH;
+	else
+		file->f_mode |= FMODE_32BITHASH;
+
+	*filp = file;
 out_nfserr:
 	err = nfserrno(host_err);
 out:
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 465e7799742a..5720e9457f33 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,17 @@ struct nfsd4_reclaim_complete {
 	u32 rca_one_fs;
 };
 
+struct nfsd4_seek {
+	/* request */
+	stateid_t	seek_stateid;
+	loff_t		seek_offset;
+	u32		seek_whence;
+
+	/* response */
+	u32		seek_eof;
+	loff_t		seek_pos;
+};
+
 struct nfsd4_op {
 	int					opnum;
 	__be32					status;
@@ -473,6 +484,9 @@ struct nfsd4_op {
 		struct nfsd4_reclaim_complete	reclaim_complete;
 		struct nfsd4_test_stateid	test_stateid;
 		struct nfsd4_free_stateid	free_stateid;
+
+		/* NFSv4.2 */
+		struct nfsd4_seek		seek;
 	} u;
 	struct nfs4_replay *			replay;
 };
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6252b173a465..d071e7f23de2 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -24,6 +24,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
 #include <linux/mpage.h>
+#include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/aio.h>
 #include "nilfs.h"
@@ -219,10 +220,10 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
 
 static int nilfs_set_page_dirty(struct page *page)
 {
+	struct inode *inode = page->mapping->host;
 	int ret = __set_page_dirty_nobuffers(page);
 
 	if (page_has_buffers(page)) {
-		struct inode *inode = page->mapping->host;
 		unsigned nr_dirty = 0;
 		struct buffer_head *bh, *head;
 
@@ -245,6 +246,10 @@ static int nilfs_set_page_dirty(struct page *page)
 
 		if (nr_dirty)
 			nilfs_set_file_dirty(inode, nr_dirty);
+	} else if (ret) {
+		unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+		nilfs_set_file_dirty(inode, nr_dirty);
 	}
 	return ret;
 }
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b13992a41bd9..c991616acca9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -78,7 +78,7 @@ static int create_fd(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	client_fd = get_unused_fd();
+	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
 	if (client_fd < 0)
 		return client_fd;
 
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 85e7d2b431d9..9c0898c4cfe1 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -23,9 +23,6 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 				      struct fsnotify_group *group, struct vfsmount *mnt,
 				      int allow_dups);
 
-/* final kfree of a group */
-extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
-
 /* vfsmount specific destruction of a mark */
 extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
 /* inode specific destruction of a mark */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index ad1995980456..d16b62cb2854 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -31,7 +31,7 @@
 /*
  * Final freeing of a group
  */
-void fsnotify_final_destroy_group(struct fsnotify_group *group)
+static void fsnotify_final_destroy_group(struct fsnotify_group *group)
 {
 	if (group->ops->free_group_priv)
 		group->ops->free_group_priv(group);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 0f88bc0b4e6c..7d888d77d59a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,8 +165,10 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 	/* ideally the idr is empty and we won't hit the BUG in the callback */
 	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_destroy(&group->inotify_data.idr);
-	atomic_dec(&group->inotify_data.user->inotify_devs);
-	free_uid(group->inotify_data.user);
+	if (group->inotify_data.user) {
+		atomic_dec(&group->inotify_data.user->inotify_devs);
+		free_uid(group->inotify_data.user);
+	}
 }
 
 static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index dd6103cc93c1..825a54e8f490 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -112,7 +112,7 @@ void __ntfs_error(const char *function, const struct super_block *sb,
 /* If 1, output debug messages, and if 0, don't. */
 int debug_msgs = 0;
 
-void __ntfs_debug (const char *file, int line, const char *function,
+void __ntfs_debug(const char *file, int line, const char *function,
 		const char *fmt, ...)
 {
 	struct va_format vaf;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f5ec1ce7a532..643faa44f22b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
  * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -410,7 +410,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 	BUG_ON(!nr_pages);
 	err = nr = 0;
 	do {
-		pages[nr] = find_lock_page(mapping, index);
+		pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
+				FGP_ACCESSED);
 		if (!pages[nr]) {
 			if (!*cached_page) {
 				*cached_page = page_cache_alloc(mapping);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6c3296e546c3..9e1e112074fb 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3208,7 +3208,7 @@ static void __exit exit_ntfs_fs(void)
 }
 
 MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
 MODULE_VERSION(NTFS_VERSION);
 MODULE_LICENSE("GPL");
 #ifdef DEBUG
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4a231a166cf8..1ef547e49373 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1481,8 +1481,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
 	handle_t *handle;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
 
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
 	page = find_or_create_page(mapping, 0, GFP_NOFS);
 	if (!page) {
+		ocfs2_commit_trans(osb, handle);
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
@@ -1494,13 +1502,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
 	wc->w_pages[0] = wc->w_target_page = page;
 	wc->w_num_pages = 1;
 
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 73039295d0d1..d13385448168 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num)
 }
 EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
 
+int o2hb_check_node_heartbeating_no_sem(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long flags;
+
+	spin_lock_irqsave(&o2hb_live_lock, flags);
+	o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+	spin_unlock_irqrestore(&o2hb_live_lock, flags);
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem);
+
 int o2hb_check_node_heartbeating_from_callback(u8 node_num)
 {
 	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 00ad8e8fea51..3ef5137dc362 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map,
 void o2hb_exit(void);
 int o2hb_init(void);
 int o2hb_check_node_heartbeating(u8 node_num);
+int o2hb_check_node_heartbeating_no_sem(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
 void o2hb_stop_all_regions(void);
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 73ba81928bce..27d1242c8383 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -185,29 +185,13 @@ static const struct seq_operations nst_seq_ops = {
 static int nst_fop_open(struct inode *inode, struct file *file)
 {
 	struct o2net_send_tracking *dummy_nst;
-	struct seq_file *seq;
-	int ret;
 
-	dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
-	if (dummy_nst == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	dummy_nst->st_task = NULL;
-
-	ret = seq_open(file, &nst_seq_ops);
-	if (ret)
-		goto out;
-
-	seq = file->private_data;
-	seq->private = dummy_nst;
+	dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst));
+	if (!dummy_nst)
+		return -ENOMEM;
 	o2net_debug_add_nst(dummy_nst);
 
-	dummy_nst = NULL;
-
-out:
-	kfree(dummy_nst);
-	return ret;
+	return 0;
 }
 
 static int nst_fop_release(struct inode *inode, struct file *file)
@@ -412,33 +396,27 @@ static const struct seq_operations sc_seq_ops = {
 	.show = sc_seq_show,
 };
 
-static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
+static int sc_common_open(struct file *file, int ctxt)
 {
+	struct o2net_sock_debug *sd;
 	struct o2net_sock_container *dummy_sc;
-	struct seq_file *seq;
-	int ret;
 
-	dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
-	if (dummy_sc == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	dummy_sc->sc_page = NULL;
+	dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL);
+	if (!dummy_sc)
+		return -ENOMEM;
 
-	ret = seq_open(file, &sc_seq_ops);
-	if (ret)
-		goto out;
+	sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd));
+	if (!sd) {
+		kfree(dummy_sc);
+		return -ENOMEM;
+	}
 
-	seq = file->private_data;
-	seq->private = sd;
+	sd->dbg_ctxt = ctxt;
 	sd->dbg_sock = dummy_sc;
-	o2net_debug_add_sc(dummy_sc);
 
-	dummy_sc = NULL;
+	o2net_debug_add_sc(dummy_sc);
 
-out:
-	kfree(dummy_sc);
-	return ret;
+	return 0;
 }
 
 static int sc_fop_release(struct inode *inode, struct file *file)
@@ -453,16 +431,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
 
 static int stats_fop_open(struct inode *inode, struct file *file)
 {
-	struct o2net_sock_debug *sd;
-
-	sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
-	if (sd == NULL)
-		return -ENOMEM;
-
-	sd->dbg_ctxt = SHOW_SOCK_STATS;
-	sd->dbg_sock = NULL;
-
-	return sc_common_open(file, sd);
+	return sc_common_open(file, SHOW_SOCK_STATS);
 }
 
 static const struct file_operations stats_seq_fops = {
@@ -474,16 +443,7 @@ static const struct file_operations stats_seq_fops = {
 
 static int sc_fop_open(struct inode *inode, struct file *file)
 {
-	struct o2net_sock_debug *sd;
-
-	sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
-	if (sd == NULL)
-		return -ENOMEM;
-
-	sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
-	sd->dbg_sock = NULL;
-
-	return sc_common_open(file, sd);
+	return sc_common_open(file, SHOW_SOCK_CONTAINERS);
 }
 
 static const struct file_operations sc_seq_fops = {
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index ea34952f9496..97de0fbd9f78 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -536,7 +536,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 	if (nn->nn_persistent_error || nn->nn_sc_valid)
 		wake_up(&nn->nn_sc_wq);
 
-	if (!was_err && nn->nn_persistent_error) {
+	if (was_valid && !was_err && nn->nn_persistent_error) {
 		o2quo_conn_err(o2net_num_from_nn(nn));
 		queue_delayed_work(o2net_wq, &nn->nn_still_up,
 				   msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
@@ -1601,7 +1601,15 @@ static void o2net_start_connect(struct work_struct *work)
 	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
 	int ret = 0, stop;
 	unsigned int timeout;
+	unsigned int noio_flag;
 
+	/*
+	 * sock_create allocates the sock with GFP_KERNEL. We must set
+	 * per-process flag PF_MEMALLOC_NOIO so that all allocations done
+	 * by this process are done as if GFP_NOIO was specified. So we
+	 * are not reentering filesystem while doing memory reclaim.
+	 */
+	noio_flag = memalloc_noio_save();
 	/* if we're greater we initiate tx, otherwise we accept */
 	if (o2nm_this_node() <= o2net_num_from_nn(nn))
 		goto out;
@@ -1710,6 +1718,7 @@ out:
 	if (mynode)
 		o2nm_node_put(mynode);
 
+	memalloc_noio_restore(noio_flag);
 	return;
 }
 
@@ -1721,7 +1730,8 @@ static void o2net_connect_expired(struct work_struct *work)
 	spin_lock(&nn->nn_lock);
 	if (!nn->nn_sc_valid) {
 		printk(KERN_NOTICE "o2net: No connection established with "
-		       "node %u after %u.%u seconds, giving up.\n",
+		       "node %u after %u.%u seconds, check network and"
+		       " cluster configuration.\n",
 		     o2net_num_from_nn(nn),
 		     o2net_idle_timeout() / 1000,
 		     o2net_idle_timeout() % 1000);
@@ -1835,6 +1845,15 @@ static int o2net_accept_one(struct socket *sock, int *more)
 	struct o2nm_node *local_node = NULL;
 	struct o2net_sock_container *sc = NULL;
 	struct o2net_node *nn;
+	unsigned int noio_flag;
+
+	/*
+	 * sock_create_lite allocates the sock with GFP_KERNEL. We must set
+	 * per-process flag PF_MEMALLOC_NOIO so that all allocations done
+	 * by this process are done as if GFP_NOIO was specified. So we
+	 * are not reentering filesystem while doing memory reclaim.
+	 */
+	noio_flag = memalloc_noio_save();
 
 	BUG_ON(sock == NULL);
 	*more = 0;
@@ -1951,6 +1970,8 @@ out:
 		o2nm_node_put(local_node);
 	if (sc)
 		sc_put(sc);
+
+	memalloc_noio_restore(noio_flag);
 	return ret;
 }
 
@@ -2146,17 +2167,13 @@ int o2net_init(void)
 	o2quo_init();
 
 	if (o2net_debugfs_init())
-		return -ENOMEM;
+		goto out;
 
 	o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
 	o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
 	o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
-	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
-		kfree(o2net_hand);
-		kfree(o2net_keep_req);
-		kfree(o2net_keep_resp);
-		return -ENOMEM;
-	}
+	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp)
+		goto out;
 
 	o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
 	o2net_hand->connector_id = cpu_to_be64(1);
@@ -2181,6 +2198,14 @@ int o2net_init(void)
 	}
 
 	return 0;
+
+out:
+	kfree(o2net_hand);
+	kfree(o2net_keep_req);
+	kfree(o2net_keep_resp);
+
+	o2quo_exit();
+	return -ENOMEM;
 }
 
 void o2net_exit(void)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 18f13c2e4a10..149eb556b8c6 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -647,41 +647,30 @@ static const struct seq_operations debug_lockres_ops = {
 static int debug_lockres_open(struct inode *inode, struct file *file)
 {
 	struct dlm_ctxt *dlm = inode->i_private;
-	int ret = -ENOMEM;
-	struct seq_file *seq;
-	struct debug_lockres *dl = NULL;
+	struct debug_lockres *dl;
+	void *buf;
 
-	dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
-	if (!dl) {
-		mlog_errno(ret);
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
 		goto bail;
-	}
 
-	dl->dl_len = PAGE_SIZE;
-	dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
-	if (!dl->dl_buf) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl));
+	if (!dl)
+		goto bailfree;
 
-	ret = seq_open(file, &debug_lockres_ops);
-	if (ret) {
-		mlog_errno(ret);
-		goto bail;
-	}
-
-	seq = file->private_data;
-	seq->private = dl;
+	dl->dl_len = PAGE_SIZE;
+	dl->dl_buf = buf;
 
 	dlm_grab(dlm);
 	dl->dl_ctxt = dlm;
 
 	return 0;
+
+bailfree:
+	kfree(buf);
 bail:
-	if (dl)
-		kfree(dl->dl_buf);
-	kfree(dl);
-	return ret;
+	mlog_errno(-ENOMEM);
+	return -ENOMEM;
 }
 
 static int debug_lockres_release(struct inode *inode, struct file *file)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3fcf205ee900..02d315fef432 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
 	 * to back off and try again.  This gives heartbeat a chance
 	 * to catch up.
 	 */
-	if (!o2hb_check_node_heartbeating(query->node_idx)) {
+	if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) {
 		mlog(0, "node %u is not in our live map yet\n",
 		     query->node_idx);
 
@@ -1975,24 +1975,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
 	dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
 	if (!dlm) {
-		mlog_errno(-ENOMEM);
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
 	dlm->name = kstrdup(domain, GFP_KERNEL);
 	if (dlm->name == NULL) {
-		mlog_errno(-ENOMEM);
-		kfree(dlm);
-		dlm = NULL;
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
 	dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
 	if (!dlm->lockres_hash) {
-		mlog_errno(-ENOMEM);
-		kfree(dlm->name);
-		kfree(dlm);
-		dlm = NULL;
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
@@ -2002,11 +2000,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	dlm->master_hash = (struct hlist_head **)
 				dlm_alloc_pagevec(DLM_HASH_PAGES);
 	if (!dlm->master_hash) {
-		mlog_errno(-ENOMEM);
-		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
-		kfree(dlm->name);
-		kfree(dlm);
-		dlm = NULL;
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
@@ -2017,14 +2012,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	dlm->node_num = o2nm_this_node();
 
 	ret = dlm_create_debugfs_subroot(dlm);
-	if (ret < 0) {
-		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
-		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
-		kfree(dlm->name);
-		kfree(dlm);
-		dlm = NULL;
+	if (ret < 0)
 		goto leave;
-	}
 
 	spin_lock_init(&dlm->spinlock);
 	spin_lock_init(&dlm->master_lock);
@@ -2085,6 +2074,19 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		  atomic_read(&dlm->dlm_refs.refcount));
 
 leave:
+	if (ret < 0 && dlm) {
+		if (dlm->master_hash)
+			dlm_free_pagevec((void **)dlm->master_hash,
+					DLM_HASH_PAGES);
+
+		if (dlm->lockres_hash)
+			dlm_free_pagevec((void **)dlm->lockres_hash,
+					DLM_HASH_PAGES);
+
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+	}
 	return dlm;
 }
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3ec906ef5d9a..215e41abf101 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -625,9 +625,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 	return res;
 
 error:
-	if (res && res->lockname.name)
-		kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
-
 	if (res)
 		kmem_cache_free(dlm_lockres_cache, res);
 	return NULL;
@@ -655,12 +652,9 @@ void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
 	clear_bit(bit, res->refmap);
 }
 
-
-void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 				   struct dlm_lock_resource *res)
 {
-	assert_spin_locked(&res->spinlock);
-
 	res->inflight_locks++;
 
 	mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
@@ -668,6 +662,13 @@ void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 	     __builtin_return_address(0));
 }
 
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+				   struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+	__dlm_lockres_grab_inflight_ref(dlm, res);
+}
+
 void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
 				   struct dlm_lock_resource *res)
 {
@@ -894,10 +895,8 @@ lookup:
 	/* finally add the lockres to its hash bucket */
 	__dlm_insert_lockres(dlm, res);
 
-	/* Grab inflight ref to pin the resource */
-	spin_lock(&res->spinlock);
-	dlm_lockres_grab_inflight_ref(dlm, res);
-	spin_unlock(&res->spinlock);
+	/* since this lockres is new it doesn't not require the spinlock */
+	__dlm_lockres_grab_inflight_ref(dlm, res);
 
 	/* get an extra ref on the mle in case this is a BLOCK
 	 * if so, the creator of the BLOCK may try to put the last
@@ -2037,6 +2036,10 @@ kill:
 	     "and killing the other node now!  This node is OK and can continue.\n");
 	__dlm_print_one_lock_resource(res);
 	spin_unlock(&res->spinlock);
+	spin_lock(&dlm->master_lock);
+	if (mle)
+		__dlm_put_mle(mle);
+	spin_unlock(&dlm->master_lock);
 	spin_unlock(&dlm->spinlock);
 	*ret_data = (void *)res;
 	dlm_put(dlm);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 45067faf5695..3365839d2971 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1710,9 +1710,12 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 				BUG();
 			} else
 				__dlm_lockres_grab_inflight_worker(dlm, res);
-		} else /* put.. incase we are not the master */
+			spin_unlock(&res->spinlock);
+		} else {
+			/* put.. incase we are not the master */
+			spin_unlock(&res->spinlock);
 			dlm_lockres_put(res);
-		spin_unlock(&res->spinlock);
+		}
 	}
 	spin_unlock(&dlm->spinlock);
 
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 52cfe99ae056..21262f2b1654 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2892,37 +2892,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
 
 static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
 {
-	int ret;
 	struct ocfs2_dlm_seq_priv *priv;
-	struct seq_file *seq;
 	struct ocfs2_super *osb;
 
-	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
+	priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
 	if (!priv) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
 	}
+
 	osb = inode->i_private;
 	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
 	priv->p_dlm_debug = osb->osb_dlm_debug;
 	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
 
-	ret = seq_open(file, &ocfs2_dlm_seq_ops);
-	if (ret) {
-		kfree(priv);
-		mlog_errno(ret);
-		goto out;
-	}
-
-	seq = file->private_data;
-	seq->private = priv;
-
 	ocfs2_add_lockres_tracking(&priv->p_iter_res,
 				   priv->p_dlm_debug);
 
-out:
-	return ret;
+	return 0;
 }
 
 static const struct file_operations ocfs2_dlm_debug_fops = {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2930e231f3f9..682732f3f0d8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
-	handle_t *handle = NULL;
+	handle_t *handle;
 	int ret = 0;
 	unsigned zero_from, zero_to, block_start, block_end;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
 	BUG_ON(abs_from & (inode->i_blkbits - 1));
 
+	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
 	page = find_or_create_page(mapping, index, GFP_NOFS);
 	if (!page) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
-		goto out;
+		goto out_commit_trans;
 	}
 
 	/* Get the offsets within the page that we want to zero */
@@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 			goto out_unlock;
 		}
 
-		if (!handle) {
-			handle = ocfs2_zero_start_ordered_transaction(inode,
-								      di_bh);
-			if (IS_ERR(handle)) {
-				ret = PTR_ERR(handle);
-				handle = NULL;
-				break;
-			}
-		}
 
 		/* must not update i_size! */
 		ret = block_commit_write(page, block_start + 1,
@@ -824,27 +821,29 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 			ret = 0;
 	}
 
+	/*
+	 * fs-writeback will release the dirty pages without page lock
+	 * whose offset are over inode size, the release happens at
+	 * block_write_full_page().
+	 */
+	i_size_write(inode, abs_to);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	di->i_size = cpu_to_le64((u64)i_size_read(inode));
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	di->i_mtime_nsec = di->i_ctime_nsec;
 	if (handle) {
-		/*
-		 * fs-writeback will release the dirty pages without page lock
-		 * whose offset are over inode size, the release happens at
-		 * block_write_full_page().
-		 */
-		i_size_write(inode, abs_to);
-		inode->i_blocks = ocfs2_inode_sector_count(inode);
-		di->i_size = cpu_to_le64((u64)i_size_read(inode));
-		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-		di->i_mtime_nsec = di->i_ctime_nsec;
 		ocfs2_journal_dirty(handle, di_bh);
 		ocfs2_update_inode_fsync_trans(handle, inode, 1);
-		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 	}
 
 out_unlock:
 	unlock_page(page);
 	page_cache_release(page);
+out_commit_trans:
+	if (handle)
+		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	return ret;
 }
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a6c991c0fc98..a9b76de46047 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -162,7 +162,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
 {
 	int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
 
-	return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
+	return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
 }
 
 /* Validate that a bh contains a valid inode */
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 6219aaadeb08..74caffeeee1d 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -404,7 +404,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
 	 * 'vict_blkno' was out of the valid range.
 	 */
 	if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
-	    (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
+	    (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
 				bits_per_unit))) {
 		ret = -EINVAL;
 		goto out;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 13a8537d8e8b..720aa389e0ea 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -591,7 +591,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
 		 */
 		ocfs2_control_this_node = -1;
 		running_proto.pv_major = 0;
-		running_proto.pv_major = 0;
+		running_proto.pv_minor = 0;
 	}
 
 out:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ddb662b32447..4142546aedae 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2532,6 +2532,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	kfree(osb->journal);
 	kfree(osb->local_alloc_copy);
 	kfree(osb->uuid_str);
+	kfree(osb->vol_label);
 	ocfs2_put_dlm_debug(osb->osb_dlm_debug);
 	memset(osb, 0, sizeof(struct ocfs2_super));
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index baf852b648ad..950100e326a1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -376,37 +376,6 @@ static const struct file_operations proc_lstats_operations = {
 
 #endif
 
-#ifdef CONFIG_CGROUPS
-static int cgroup_open(struct inode *inode, struct file *file)
-{
-	struct pid *pid = PROC_I(inode)->pid;
-	return single_open(file, proc_cgroup_show, pid);
-}
-
-static const struct file_operations proc_cgroup_operations = {
-	.open		= cgroup_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-#endif
-
-#ifdef CONFIG_PROC_PID_CPUSET
-
-static int cpuset_open(struct inode *inode, struct file *file)
-{
-	struct pid *pid = PROC_I(inode)->pid;
-	return single_open(file, proc_cpuset_show, pid);
-}
-
-static const struct file_operations proc_cpuset_operations = {
-	.open		= cpuset_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-#endif
-
 static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 			  struct pid *pid, struct task_struct *task)
 {
@@ -632,29 +601,35 @@ static const struct file_operations proc_single_file_operations = {
 	.release	= single_release,
 };
 
-static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
+
+struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
 {
-	struct task_struct *task = get_proc_task(file_inode(file));
-	struct mm_struct *mm;
+	struct task_struct *task = get_proc_task(inode);
+	struct mm_struct *mm = ERR_PTR(-ESRCH);
 
-	if (!task)
-		return -ESRCH;
+	if (task) {
+		mm = mm_access(task, mode);
+		put_task_struct(task);
 
-	mm = mm_access(task, mode);
-	put_task_struct(task);
+		if (!IS_ERR_OR_NULL(mm)) {
+			/* ensure this mm_struct can't be freed */
+			atomic_inc(&mm->mm_count);
+			/* but do not pin its memory */
+			mmput(mm);
+		}
+	}
+
+	return mm;
+}
+
+static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
+{
+	struct mm_struct *mm = proc_mem_open(inode, mode);
 
 	if (IS_ERR(mm))
 		return PTR_ERR(mm);
 
-	if (mm) {
-		/* ensure this mm_struct can't be freed */
-		atomic_inc(&mm->mm_count);
-		/* but do not pin its memory */
-		mmput(mm);
-	}
-
 	file->private_data = mm;
-
 	return 0;
 }
 
@@ -2573,10 +2548,10 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("latency",  S_IRUGO, proc_lstats_operations),
 #endif
 #ifdef CONFIG_PROC_PID_CPUSET
-	REG("cpuset",     S_IRUGO, proc_cpuset_operations),
+	ONE("cpuset",     S_IRUGO, proc_cpuset_show),
 #endif
 #ifdef CONFIG_CGROUPS
-	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
+	ONE("cgroup",  S_IRUGO, proc_cgroup_show),
 #endif
 	ONE("oom_score",  S_IRUGO, proc_oom_score),
 	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
@@ -2919,10 +2894,10 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("latency",  S_IRUGO, proc_lstats_operations),
 #endif
 #ifdef CONFIG_PROC_PID_CPUSET
-	REG("cpuset",    S_IRUGO, proc_cpuset_operations),
+	ONE("cpuset",    S_IRUGO, proc_cpuset_show),
 #endif
 #ifdef CONFIG_CGROUPS
-	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
+	ONE("cgroup",  S_IRUGO, proc_cgroup_show),
 #endif
 	ONE("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7da13e49128a..aa7a0ee182e1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -268,8 +268,9 @@ extern int proc_remount(struct super_block *, int *, char *);
  * task_[no]mmu.c
  */
 struct proc_maps_private {
-	struct pid *pid;
+	struct inode *inode;
 	struct task_struct *task;
+	struct mm_struct *mm;
 #ifdef CONFIG_MMU
 	struct vm_area_struct *tail_vma;
 #endif
@@ -278,6 +279,8 @@ struct proc_maps_private {
 #endif
 };
 
+struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
+
 extern const struct file_operations proc_pid_maps_operations;
 extern const struct file_operations proc_tid_maps_operations;
 extern const struct file_operations proc_pid_numa_maps_operations;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6df8d0722c97..91a4e6426321 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -610,8 +610,10 @@ static void __init proc_kcore_text_init(void)
 struct kcore_list kcore_modules;
 static void __init add_modules_range(void)
 {
-	kclist_add(&kcore_modules, (void *)MODULES_VADDR,
+	if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
+		kclist_add(&kcore_modules, (void *)MODULES_VADDR,
 			MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
+	}
 }
 #else
 static void __init add_modules_range(void)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e647c55275d9..1e3187da1fed 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page)
 	if (PageBuddy(page))
 		u |= 1 << KPF_BUDDY;
 
+	if (PageBalloon(page))
+		u |= 1 << KPF_BALLOON;
+
 	u |= kpf_copy_bit(k, KPF_LOCKED,	PG_locked);
 
 	u |= kpf_copy_bit(k, KPF_SLAB,		PG_slab);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dfc791c42d64..b7a7dc963a35 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -87,32 +87,14 @@ unsigned long task_statm(struct mm_struct *mm,
 
 #ifdef CONFIG_NUMA
 /*
- * These functions are for numa_maps but called in generic **maps seq_file
- * ->start(), ->stop() ops.
- *
- * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
- * Each mempolicy object is controlled by reference counting. The problem here
- * is how to avoid accessing dead mempolicy object.
- *
- * Because we're holding mmap_sem while reading seq_file, it's safe to access
- * each vma's mempolicy, no vma objects will never drop refs to mempolicy.
- *
- * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
- * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
- * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
- * gurantee the task never exits under us. But taking task_lock() around
- * get_vma_plicy() causes lock order problem.
- *
- * To access task->mempolicy without lock, we hold a reference count of an
- * object pointed by task->mempolicy and remember it. This will guarantee
- * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
+ * Save get_task_policy() for show_numa_map().
  */
 static void hold_task_mempolicy(struct proc_maps_private *priv)
 {
 	struct task_struct *task = priv->task;
 
 	task_lock(task);
-	priv->task_mempolicy = task->mempolicy;
+	priv->task_mempolicy = get_task_policy(task);
 	mpol_get(priv->task_mempolicy);
 	task_unlock(task);
 }
@@ -129,124 +111,154 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
 }
 #endif
 
-static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
+static void vma_stop(struct proc_maps_private *priv)
 {
-	if (vma && vma != priv->tail_vma) {
-		struct mm_struct *mm = vma->vm_mm;
-		release_task_mempolicy(priv);
-		up_read(&mm->mmap_sem);
-		mmput(mm);
-	}
+	struct mm_struct *mm = priv->mm;
+
+	release_task_mempolicy(priv);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+}
+
+static struct vm_area_struct *
+m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
+{
+	if (vma == priv->tail_vma)
+		return NULL;
+	return vma->vm_next ?: priv->tail_vma;
+}
+
+static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
+{
+	if (m->count < m->size)	/* vma is copied successfully */
+		m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
 }
 
-static void *m_start(struct seq_file *m, loff_t *pos)
+static void *m_start(struct seq_file *m, loff_t *ppos)
 {
 	struct proc_maps_private *priv = m->private;
 	unsigned long last_addr = m->version;
 	struct mm_struct *mm;
-	struct vm_area_struct *vma, *tail_vma = NULL;
-	loff_t l = *pos;
-
-	/* Clear the per syscall fields in priv */
-	priv->task = NULL;
-	priv->tail_vma = NULL;
-
-	/*
-	 * We remember last_addr rather than next_addr to hit with
-	 * vmacache most of the time. We have zero last_addr at
-	 * the beginning and also after lseek. We will have -1 last_addr
-	 * after the end of the vmas.
-	 */
+	struct vm_area_struct *vma;
+	unsigned int pos = *ppos;
 
+	/* See m_cache_vma(). Zero at the start or after lseek. */
 	if (last_addr == -1UL)
 		return NULL;
 
-	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+	priv->task = get_proc_task(priv->inode);
 	if (!priv->task)
 		return ERR_PTR(-ESRCH);
 
-	mm = mm_access(priv->task, PTRACE_MODE_READ);
-	if (!mm || IS_ERR(mm))
-		return mm;
-	down_read(&mm->mmap_sem);
+	mm = priv->mm;
+	if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+		return NULL;
 
-	tail_vma = get_gate_vma(priv->task->mm);
-	priv->tail_vma = tail_vma;
+	down_read(&mm->mmap_sem);
 	hold_task_mempolicy(priv);
-	/* Start with last addr hint */
-	vma = find_vma(mm, last_addr);
-	if (last_addr && vma) {
-		vma = vma->vm_next;
-		goto out;
+	priv->tail_vma = get_gate_vma(mm);
+
+	if (last_addr) {
+		vma = find_vma(mm, last_addr);
+		if (vma && (vma = m_next_vma(priv, vma)))
+			return vma;
 	}
 
-	/*
-	 * Check the vma index is within the range and do
-	 * sequential scan until m_index.
-	 */
-	vma = NULL;
-	if ((unsigned long)l < mm->map_count) {
-		vma = mm->mmap;
-		while (l-- && vma)
+	m->version = 0;
+	if (pos < mm->map_count) {
+		for (vma = mm->mmap; pos; pos--) {
+			m->version = vma->vm_start;
 			vma = vma->vm_next;
-		goto out;
+		}
+		return vma;
 	}
 
-	if (l != mm->map_count)
-		tail_vma = NULL; /* After gate vma */
-
-out:
-	if (vma)
-		return vma;
+	/* we do not bother to update m->version in this case */
+	if (pos == mm->map_count && priv->tail_vma)
+		return priv->tail_vma;
 
-	release_task_mempolicy(priv);
-	/* End of vmas has been reached */
-	m->version = (tail_vma != NULL)? 0: -1UL;
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	return tail_vma;
+	vma_stop(priv);
+	return NULL;
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
-	struct vm_area_struct *vma = v;
-	struct vm_area_struct *tail_vma = priv->tail_vma;
+	struct vm_area_struct *next;
 
 	(*pos)++;
-	if (vma && (vma != tail_vma) && vma->vm_next)
-		return vma->vm_next;
-	vma_stop(priv, vma);
-	return (vma != tail_vma)? tail_vma: NULL;
+	next = m_next_vma(priv, v);
+	if (!next)
+		vma_stop(priv);
+	return next;
 }
 
 static void m_stop(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
-	struct vm_area_struct *vma = v;
 
-	if (!IS_ERR(vma))
-		vma_stop(priv, vma);
-	if (priv->task)
+	if (!IS_ERR_OR_NULL(v))
+		vma_stop(priv);
+	if (priv->task) {
 		put_task_struct(priv->task);
+		priv->task = NULL;
+	}
+}
+
+static int proc_maps_open(struct inode *inode, struct file *file,
+			const struct seq_operations *ops, int psize)
+{
+	struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
+
+	if (!priv)
+		return -ENOMEM;
+
+	priv->inode = inode;
+	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR(priv->mm)) {
+		int err = PTR_ERR(priv->mm);
+
+		seq_release_private(inode, file);
+		return err;
+	}
+
+	return 0;
+}
+
+static int proc_map_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct proc_maps_private *priv = seq->private;
+
+	if (priv->mm)
+		mmdrop(priv->mm);
+
+	return seq_release_private(inode, file);
 }
 
 static int do_maps_open(struct inode *inode, struct file *file,
 			const struct seq_operations *ops)
 {
-	struct proc_maps_private *priv;
-	int ret = -ENOMEM;
-	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-	if (priv) {
-		priv->pid = proc_pid(inode);
-		ret = seq_open(file, ops);
-		if (!ret) {
-			struct seq_file *m = file->private_data;
-			m->private = priv;
-		} else {
-			kfree(priv);
-		}
+	return proc_maps_open(inode, file, ops,
+				sizeof(struct proc_maps_private));
+}
+
+static pid_t pid_of_stack(struct proc_maps_private *priv,
+				struct vm_area_struct *vma, bool is_pid)
+{
+	struct inode *inode = priv->inode;
+	struct task_struct *task;
+	pid_t ret = 0;
+
+	rcu_read_lock();
+	task = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (task) {
+		task = task_of_stack(task, vma, is_pid);
+		if (task)
+			ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
 	}
+	rcu_read_unlock();
+
 	return ret;
 }
 
@@ -256,7 +268,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	struct mm_struct *mm = vma->vm_mm;
 	struct file *file = vma->vm_file;
 	struct proc_maps_private *priv = m->private;
-	struct task_struct *task = priv->task;
 	vm_flags_t flags = vma->vm_flags;
 	unsigned long ino = 0;
 	unsigned long long pgoff = 0;
@@ -321,8 +332,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 			goto done;
 		}
 
-		tid = vm_is_stack(task, vma, is_pid);
-
+		tid = pid_of_stack(priv, vma, is_pid);
 		if (tid != 0) {
 			/*
 			 * Thread stack in /proc/PID/task/TID/maps or
@@ -349,15 +359,8 @@ done:
 
 static int show_map(struct seq_file *m, void *v, int is_pid)
 {
-	struct vm_area_struct *vma = v;
-	struct proc_maps_private *priv = m->private;
-	struct task_struct *task = priv->task;
-
-	show_map_vma(m, vma, is_pid);
-
-	if (m->count < m->size)  /* vma is copied successfully */
-		m->version = (vma != get_gate_vma(task->mm))
-			? vma->vm_start : 0;
+	show_map_vma(m, v, is_pid);
+	m_cache_vma(m, v);
 	return 0;
 }
 
@@ -399,14 +402,14 @@ const struct file_operations proc_pid_maps_operations = {
 	.open		= pid_maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release_private,
+	.release	= proc_map_release,
 };
 
 const struct file_operations proc_tid_maps_operations = {
 	.open		= tid_maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release_private,
+	.release	= proc_map_release,
 };
 
 /*
@@ -583,8 +586,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
-	struct proc_maps_private *priv = m->private;
-	struct task_struct *task = priv->task;
 	struct vm_area_struct *vma = v;
 	struct mem_size_stats mss;
 	struct mm_walk smaps_walk = {
@@ -637,10 +638,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 				mss.nonlinear >> 10);
 
 	show_smap_vma_flags(m, vma);
-
-	if (m->count < m->size)  /* vma is copied successfully */
-		m->version = (vma != get_gate_vma(task->mm))
-			? vma->vm_start : 0;
+	m_cache_vma(m, vma);
 	return 0;
 }
 
@@ -682,14 +680,14 @@ const struct file_operations proc_pid_smaps_operations = {
 	.open		= pid_smaps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release_private,
+	.release	= proc_map_release,
 };
 
 const struct file_operations proc_tid_smaps_operations = {
 	.open		= tid_smaps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release_private,
+	.release	= proc_map_release,
 };
 
 /*
@@ -931,23 +929,32 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 	while (addr < end) {
 		struct vm_area_struct *vma = find_vma(walk->mm, addr);
 		pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
-		unsigned long vm_end;
-
-		if (!vma) {
-			vm_end = end;
-		} else {
-			vm_end = min(end, vma->vm_end);
-			if (vma->vm_flags & VM_SOFTDIRTY)
-				pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+		/* End of address space hole, which we mark as non-present. */
+		unsigned long hole_end;
+
+		if (vma)
+			hole_end = min(end, vma->vm_start);
+		else
+			hole_end = end;
+
+		for (; addr < hole_end; addr += PAGE_SIZE) {
+			err = add_to_pagemap(addr, &pme, pm);
+			if (err)
+				goto out;
 		}
 
-		for (; addr < vm_end; addr += PAGE_SIZE) {
+		if (!vma)
+			break;
+
+		/* Addresses in the VMA. */
+		if (vma->vm_flags & VM_SOFTDIRTY)
+			pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
 			err = add_to_pagemap(addr, &pme, pm);
 			if (err)
 				goto out;
 		}
 	}
-
 out:
 	return err;
 }
@@ -1020,7 +1027,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	spinlock_t *ptl;
 	pte_t *pte;
 	int err = 0;
-	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
@@ -1034,6 +1040,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 		for (; addr != end; addr += PAGE_SIZE) {
 			unsigned long offset;
+			pagemap_entry_t pme;
 
 			offset = (addr & ~PAGEMAP_WALK_MASK) >>
 					PAGE_SHIFT;
@@ -1048,32 +1055,51 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 	if (pmd_trans_unstable(pmd))
 		return 0;
-	for (; addr != end; addr += PAGE_SIZE) {
-		int flags2;
-
-		/* check to see if we've left 'vma' behind
-		 * and need a new, higher one */
-		if (vma && (addr >= vma->vm_end)) {
-			vma = find_vma(walk->mm, addr);
-			if (vma && (vma->vm_flags & VM_SOFTDIRTY))
-				flags2 = __PM_SOFT_DIRTY;
-			else
-				flags2 = 0;
-			pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
+
+	while (1) {
+		/* End of address space hole, which we mark as non-present. */
+		unsigned long hole_end;
+
+		if (vma)
+			hole_end = min(end, vma->vm_start);
+		else
+			hole_end = end;
+
+		for (; addr < hole_end; addr += PAGE_SIZE) {
+			pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+
+			err = add_to_pagemap(addr, &pme, pm);
+			if (err)
+				return err;
 		}
 
-		/* check that 'vma' actually covers this address,
-		 * and that it isn't a huge page vma */
-		if (vma && (vma->vm_start <= addr) &&
-		    !is_vm_hugetlb_page(vma)) {
+		if (!vma || vma->vm_start >= end)
+			break;
+		/*
+		 * We can't possibly be in a hugetlb VMA. In general,
+		 * for a mm_walk with a pmd_entry and a hugetlb_entry,
+		 * the pmd_entry can only be called on addresses in a
+		 * hugetlb if the walk starts in a non-hugetlb VMA and
+		 * spans a hugepage VMA. Since pagemap_read walks are
+		 * PMD-sized and PMD-aligned, this will never be true.
+		 */
+		BUG_ON(is_vm_hugetlb_page(vma));
+
+		/* Addresses in the VMA. */
+		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
+			pagemap_entry_t pme;
 			pte = pte_offset_map(pmd, addr);
 			pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
-			/* unmap before userspace copy */
 			pte_unmap(pte);
+			err = add_to_pagemap(addr, &pme, pm);
+			if (err)
+				return err;
 		}
-		err = add_to_pagemap(addr, &pme, pm);
-		if (err)
-			return err;
+
+		if (addr == end)
+			break;
+
+		vma = find_vma(walk->mm, addr);
 	}
 
 	cond_resched();
@@ -1406,7 +1432,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md = &numa_priv->md;
 	struct file *file = vma->vm_file;
-	struct task_struct *task = proc_priv->task;
 	struct mm_struct *mm = vma->vm_mm;
 	struct mm_walk walk = {};
 	struct mempolicy *pol;
@@ -1426,9 +1451,13 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	walk.private = md;
 	walk.mm = mm;
 
-	pol = get_vma_policy(task, vma, vma->vm_start);
-	mpol_to_str(buffer, sizeof(buffer), pol);
-	mpol_cond_put(pol);
+	pol = __get_vma_policy(vma, vma->vm_start);
+	if (pol) {
+		mpol_to_str(buffer, sizeof(buffer), pol);
+		mpol_cond_put(pol);
+	} else {
+		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
+	}
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
@@ -1438,7 +1467,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_puts(m, " heap");
 	} else {
-		pid_t tid = vm_is_stack(task, vma, is_pid);
+		pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
 		if (tid != 0) {
 			/*
 			 * Thread stack in /proc/PID/task/TID/maps or
@@ -1486,9 +1515,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 			seq_printf(m, " N%d=%lu", nid, md->node[nid]);
 out:
 	seq_putc(m, '\n');
-
-	if (m->count < m->size)
-		m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
+	m_cache_vma(m, vma);
 	return 0;
 }
 
@@ -1519,20 +1546,8 @@ static const struct seq_operations proc_tid_numa_maps_op = {
 static int numa_maps_open(struct inode *inode, struct file *file,
 			  const struct seq_operations *ops)
 {
-	struct numa_maps_private *priv;
-	int ret = -ENOMEM;
-	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-	if (priv) {
-		priv->proc_maps.pid = proc_pid(inode);
-		ret = seq_open(file, ops);
-		if (!ret) {
-			struct seq_file *m = file->private_data;
-			m->private = priv;
-		} else {
-			kfree(priv);
-		}
-	}
-	return ret;
+	return proc_maps_open(inode, file, ops,
+				sizeof(struct numa_maps_private));
 }
 
 static int pid_numa_maps_open(struct inode *inode, struct file *file)
@@ -1549,13 +1564,13 @@ const struct file_operations proc_pid_numa_maps_operations = {
 	.open		= pid_numa_maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release_private,
+	.release	= proc_map_release,
 };
 
 const struct file_operations proc_tid_numa_maps_operations = {
 	.open		= tid_numa_maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release_private,
+	.release	= proc_map_release,
 };
 #endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 678455d2d683..599ec2e20104 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,6 +123,25 @@ unsigned long task_statm(struct mm_struct *mm,
 	return size;
 }
 
+static pid_t pid_of_stack(struct proc_maps_private *priv,
+				struct vm_area_struct *vma, bool is_pid)
+{
+	struct inode *inode = priv->inode;
+	struct task_struct *task;
+	pid_t ret = 0;
+
+	rcu_read_lock();
+	task = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (task) {
+		task = task_of_stack(task, vma, is_pid);
+		if (task)
+			ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /*
  * display a single VMA to a sequenced file
  */
@@ -163,7 +182,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 		seq_pad(m, ' ');
 		seq_path(m, &file->f_path, "");
 	} else if (mm) {
-		pid_t tid = vm_is_stack(priv->task, vma, is_pid);
+		pid_t tid = pid_of_stack(priv, vma, is_pid);
 
 		if (tid != 0) {
 			seq_pad(m, ' ');
@@ -212,22 +231,22 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	loff_t n = *pos;
 
 	/* pin the task and mm whilst we play with them */
-	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+	priv->task = get_proc_task(priv->inode);
 	if (!priv->task)
 		return ERR_PTR(-ESRCH);
 
-	mm = mm_access(priv->task, PTRACE_MODE_READ);
-	if (!mm || IS_ERR(mm)) {
-		put_task_struct(priv->task);
-		priv->task = NULL;
-		return mm;
-	}
-	down_read(&mm->mmap_sem);
+	mm = priv->mm;
+	if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+		return NULL;
 
+	down_read(&mm->mmap_sem);
 	/* start from the Nth VMA */
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
 		if (n-- == 0)
 			return p;
+
+	up_read(&mm->mmap_sem);
+	mmput(mm);
 	return NULL;
 }
 
@@ -235,11 +254,13 @@ static void m_stop(struct seq_file *m, void *_vml)
 {
 	struct proc_maps_private *priv = m->private;
 
+	if (!IS_ERR_OR_NULL(_vml)) {
+		up_read(&priv->mm->mmap_sem);
+		mmput(priv->mm);
+	}
 	if (priv->task) {
-		struct mm_struct *mm = priv->task->mm;
-		up_read(&mm->mmap_sem);
-		mmput(mm);
 		put_task_struct(priv->task);
+		priv->task = NULL;
 	}
 }
 
@@ -269,20 +290,33 @@ static int maps_open(struct inode *inode, struct file *file,
 		     const struct seq_operations *ops)
 {
 	struct proc_maps_private *priv;
-	int ret = -ENOMEM;
-
-	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-	if (priv) {
-		priv->pid = proc_pid(inode);
-		ret = seq_open(file, ops);
-		if (!ret) {
-			struct seq_file *m = file->private_data;
-			m->private = priv;
-		} else {
-			kfree(priv);
-		}
+
+	priv = __seq_open_private(file, ops, sizeof(*priv));
+	if (!priv)
+		return -ENOMEM;
+
+	priv->inode = inode;
+	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR(priv->mm)) {
+		int err = PTR_ERR(priv->mm);
+
+		seq_release_private(inode, file);
+		return err;
 	}
-	return ret;
+
+	return 0;
+}
+
+
+static int map_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct proc_maps_private *priv = seq->private;
+
+	if (priv->mm)
+		mmdrop(priv->mm);
+
+	return seq_release_private(inode, file);
 }
 
 static int pid_maps_open(struct inode *inode, struct file *file)
@@ -299,13 +333,13 @@ const struct file_operations proc_pid_maps_operations = {
 	.open		= pid_maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release_private,
+	.release	= map_release,
 };
 
 const struct file_operations proc_tid_maps_operations = {
 	.open		= tid_maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release_private,
+	.release	= map_release,
 };
 
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index f2d0eee9d1f1..8b663b2d9562 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2725,7 +2725,7 @@ static int __init dquot_init(void)
 		panic("Cannot create dquot hash table");
 
 	for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
-		ret = percpu_counter_init(&dqstats.counter[i], 0);
+		ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
 		if (ret)
 			panic("Cannot create dquot stat counters");
 	}
diff --git a/fs/stack.c b/fs/stack.c
index 5b5388250e29..a54e33ed10f1 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -44,7 +44,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
 	 * include/linux/fs.h).  We don't necessarily hold i_mutex when this
 	 * is called, so take i_lock for that case.
 	 *
-	 * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the
+	 * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
 	 * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
 	 * for that case too, and do both at once by combining the tests.
 	 *
diff --git a/fs/super.c b/fs/super.c
index b9a214d2fe98..1b836107acee 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -175,7 +175,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		goto fail;
 
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-		if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0)
+		if (percpu_counter_init(&s->s_writers.counter[i], 0,
+					GFP_KERNEL) < 0)
 			goto fail;
 		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
 				 &type->s_writers_key[i], 0);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 80c350216ea8..b46ffa94372a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -333,8 +333,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 		spin_lock_irq(&ctx->wqh.lock);
 		if (!timerfd_canceled(ctx)) {
 			ctx->ticks = ticks;
-			if (ticks)
-				wake_up_locked(&ctx->wqh);
+			wake_up_locked(&ctx->wqh);
 		} else
 			ret = -ECANCELED;
 		spin_unlock_irq(&ctx->wqh.lock);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index a9cc75ffa925..7caa01652888 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -298,7 +298,10 @@ cg_found:
 	ufsi->i_oeftflag = 0;
 	ufsi->i_dir_start_lookup = 0;
 	memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		err = -EIO;
+		goto failed;
+	}
 	mark_inode_dirty(inode);
 
 	if (uspi->fs_magic == UFS2_MAGIC) {
@@ -337,6 +340,7 @@ cg_found:
 fail_remove_inode:
 	unlock_ufs(sb);
 	clear_nlink(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	UFSD("EXIT (FAILED): err %d\n", err);
 	return ERR_PTR(err);
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 2df62a73f20c..fd65deb4b5f0 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -38,10 +38,12 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
 	int err = ufs_add_link(dentry, inode);
 	if (!err) {
+		unlock_new_inode(inode);
 		d_instantiate(dentry, inode);
 		return 0;
 	}
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return err;
 }
@@ -155,6 +157,7 @@ out_notlocked:
 
 out_fail:
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	goto out;
 }
@@ -210,6 +213,7 @@ out:
 out_fail:
 	inode_dec_link_count(inode);
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput (inode);
 	inode_dec_link_count(dir);
 	unlock_ufs(dir->i_sb);