From 94387fb1aa16ee853d00f959373132a181b0196b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 22 Jul 2007 17:09:05 -0400
Subject: NFS: Add the helper nfs_vm_page_mkwrite

This is needed in order to set up a proper nfs_page request for mmapped
files.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 579cf8a7d4a7..f2270fff4962 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -33,6 +33,7 @@
 #include <asm/system.h>
 
 #include "delegation.h"
+#include "internal.h"
 #include "iostat.h"
 
 #define NFSDBG_FACILITY		NFSDBG_FILE
@@ -55,6 +56,8 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
 
+static struct vm_operations_struct nfs_file_vm_ops;
+
 const struct file_operations nfs_file_operations = {
 	.llseek		= nfs_file_llseek,
 	.read		= do_sync_read,
@@ -257,8 +260,11 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
 	status = nfs_revalidate_mapping(inode, file->f_mapping);
-	if (!status)
-		status = generic_file_mmap(file, vma);
+	if (!status) {
+		vma->vm_ops = &nfs_file_vm_ops;
+		vma->vm_flags |= VM_CAN_NONLINEAR;
+		file_accessed(file);
+	}
 	return status;
 }
 
@@ -346,6 +352,31 @@ const struct address_space_operations nfs_file_aops = {
 	.launder_page = nfs_launder_page,
 };
 
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct file *filp = vma->vm_file;
+	unsigned pagelen;
+	int ret = -EINVAL;
+
+	lock_page(page);
+	if (page->mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
+		goto out_unlock;
+	pagelen = nfs_page_length(page);
+	if (pagelen == 0)
+		goto out_unlock;
+	ret = nfs_prepare_write(filp, page, 0, pagelen);
+	if (!ret)
+		ret = nfs_commit_write(filp, page, 0, pagelen);
+out_unlock:
+	unlock_page(page);
+	return ret;
+}
+
+static struct vm_operations_struct nfs_file_vm_ops = {
+	.fault = filemap_fault,
+	.page_mkwrite = nfs_vm_page_mkwrite,
+};
+
 static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos)
 {
-- 
cgit v1.2.3


From 9cccef95052c7169040c3577e17d4f6fa230cc28 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 22 Jul 2007 17:09:05 -0400
Subject: NFS: Clean up write code...

The addition of nfs_page_mkwrite means that We should no longer need to
create requests inside nfs_writepage()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c            |  2 +-
 fs/nfs/write.c           | 68 +++---------------------------------------------
 include/linux/nfs_fs.h   |  1 -
 include/linux/nfs_page.h |  1 -
 4 files changed, 5 insertions(+), 67 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f2270fff4962..07f43791f696 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -339,7 +339,7 @@ static int nfs_launder_page(struct page *page)
 const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
 	.readpages = nfs_readpages,
-	.set_page_dirty = nfs_set_page_dirty,
+	.set_page_dirty = __set_page_dirty_nobuffers,
 	.writepage = nfs_writepage,
 	.writepages = nfs_writepages,
 	.prepare_write = nfs_prepare_write,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0d7a77cc394b..0c346d79fb34 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -243,10 +243,7 @@ static void nfs_end_page_writeback(struct page *page)
 
 /*
  * Find an associated nfs write request, and prepare to flush it out
- * Returns 1 if there was no write request, or if the request was
- * already tagged by nfs_set_page_dirty.Returns 0 if the request
- * was not tagged.
- * May also return an error if the user signalled nfs_wait_on_request().
+ * May return an error if the user signalled nfs_wait_on_request().
  */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 				struct page *page)
@@ -261,7 +258,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 		req = nfs_page_find_request_locked(page);
 		if (req == NULL) {
 			spin_unlock(&inode->i_lock);
-			return 1;
+			return 0;
 		}
 		if (nfs_lock_request_dontget(req))
 			break;
@@ -282,7 +279,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 		spin_unlock(&inode->i_lock);
 		nfs_unlock_request(req);
 		nfs_pageio_complete(pgio);
-		return 1;
+		return 0;
 	}
 	if (nfs_set_page_writeback(page) != 0) {
 		spin_unlock(&inode->i_lock);
@@ -290,10 +287,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 	}
 	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
 			NFS_PAGE_TAG_LOCKED);
-	ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
 	spin_unlock(&inode->i_lock);
 	nfs_pageio_add_request(pgio, req);
-	return ret;
+	return 0;
 }
 
 /*
@@ -302,9 +298,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
 {
 	struct nfs_pageio_descriptor mypgio, *pgio;
-	struct nfs_open_context *ctx;
 	struct inode *inode = page->mapping->host;
-	unsigned offset;
 	int err;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
@@ -320,28 +314,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
 	nfs_pageio_cond_complete(pgio, page->index);
 
 	err = nfs_page_async_flush(pgio, page);
-	if (err <= 0)
-		goto out;
-	err = 0;
-	offset = nfs_page_length(page);
-	if (!offset)
-		goto out;
 
-	nfs_pageio_cond_complete(pgio, page->index);
-
-	ctx = nfs_find_open_context(inode, NULL, FMODE_WRITE);
-	if (ctx == NULL) {
-		err = -EBADF;
-		goto out;
-	}
-	err = nfs_writepage_setup(ctx, page, 0, offset);
-	put_nfs_open_context(ctx);
-	if (err != 0)
-		goto out;
-	err = nfs_page_async_flush(pgio, page);
-	if (err > 0)
-		err = 0;
-out:
 	if (!wbc->for_writepages)
 		nfs_pageio_complete(pgio);
 	return err;
@@ -395,8 +368,6 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	}
 	SetPagePrivate(req->wb_page);
 	set_page_private(req->wb_page, (unsigned long)req);
-	if (PageDirty(req->wb_page))
-		set_bit(PG_NEED_FLUSH, &req->wb_flags);
 	nfsi->npages++;
 	kref_get(&req->wb_kref);
 	return 0;
@@ -416,8 +387,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	set_page_private(req->wb_page, 0);
 	ClearPagePrivate(req->wb_page);
 	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
-	if (test_and_clear_bit(PG_NEED_FLUSH, &req->wb_flags))
-		__set_page_dirty_nobuffers(req->wb_page);
 	nfsi->npages--;
 	if (!nfsi->npages) {
 		spin_unlock(&inode->i_lock);
@@ -1477,35 +1446,6 @@ int nfs_wb_page(struct inode *inode, struct page* page)
 	return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
 }
 
-int nfs_set_page_dirty(struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-	struct inode *inode;
-	struct nfs_page *req;
-	int ret;
-
-	if (!mapping)
-		goto out_raced;
-	inode = mapping->host;
-	if (!inode)
-		goto out_raced;
-	spin_lock(&inode->i_lock);
-	req = nfs_page_find_request_locked(page);
-	if (req != NULL) {
-		/* Mark any existing write requests for flushing */
-		ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags);
-		spin_unlock(&inode->i_lock);
-		nfs_release_request(req);
-		return ret;
-	}
-	ret = __set_page_dirty_nobuffers(page);
-	spin_unlock(&inode->i_lock);
-	return ret;
-out_raced:
-	return !TestSetPageDirty(page);
-}
-
-
 int __init nfs_init_writepagecache(void)
 {
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 7250eeadd7b5..32bd5d38cdfd 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -420,7 +420,6 @@ extern int  nfs_flush_incompatible(struct file *file, struct page *page);
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
 extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
 extern void nfs_writedata_release(void *);
-extern int nfs_set_page_dirty(struct page *);
 
 /*
  * Try to write back everything synchronously (but check the
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 78e60798d10e..30dbcc185e69 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -30,7 +30,6 @@
 #define PG_BUSY			0
 #define PG_NEED_COMMIT		1
 #define PG_NEED_RESCHED		2
-#define PG_NEED_FLUSH		3
 
 struct nfs_inode;
 struct nfs_page {
-- 
cgit v1.2.3


From f758c885199611504ff681e3ba66c410b4e9e995 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 22 Jul 2007 19:27:32 -0400
Subject: NFS: Clean up nfs_writepages()

Just call write_cache_pages directly instead of hacking the writeback
control structure in order to find out if we were called from writepages()
or directly from the VM.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 57 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0c346d79fb34..667ff27ef235 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -292,41 +292,51 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 	return 0;
 }
 
-/*
- * Write an mmapped page to the server.
- */
-static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
 {
-	struct nfs_pageio_descriptor mypgio, *pgio;
 	struct inode *inode = page->mapping->host;
-	int err;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
-	if (wbc->for_writepages)
-		pgio = wbc->fs_private;
-	else {
-		nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
-		pgio = &mypgio;
-	}
-
 	nfs_pageio_cond_complete(pgio, page->index);
+	return nfs_page_async_flush(pgio, page);
+}
 
-	err = nfs_page_async_flush(pgio, page);
+/*
+ * Write an mmapped page to the server.
+ */
+static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+{
+	struct nfs_pageio_descriptor pgio;
+	int err;
 
-	if (!wbc->for_writepages)
-		nfs_pageio_complete(pgio);
-	return err;
+	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc));
+	err = nfs_do_writepage(page, wbc, &pgio);
+	nfs_pageio_complete(&pgio);
+	if (err < 0)
+		return err;
+	if (pgio.pg_error < 0)
+		return pgio.pg_error;
+	return 0;
 }
 
 int nfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	int err;
+	int ret;
 
-	err = nfs_writepage_locked(page, wbc);
+	ret = nfs_writepage_locked(page, wbc);
 	unlock_page(page);
-	return err; 
+	return ret;
+}
+
+static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
+{
+	int ret;
+
+	ret = nfs_do_writepage(page, wbc, data);
+	unlock_page(page);
+	return ret;
 }
 
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
@@ -338,12 +348,11 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
 	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
-	wbc->fs_private = &pgio;
-	err = generic_writepages(mapping, wbc);
+	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
 	nfs_pageio_complete(&pgio);
-	if (err)
+	if (err < 0)
 		return err;
-	if (pgio.pg_error)
+	if (pgio.pg_error < 0)
 		return pgio.pg_error;
 	return 0;
 }
-- 
cgit v1.2.3


From 90e9a3f9b0a14198a8ae5a0a5c13ad30f0b8b40d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 22 Jul 2007 19:27:46 -0400
Subject: VFS: Remove writeback_control->fs_private

The only user of this field was NFS.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/writeback.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b4af6bcb7b7a..a111d3de2a57 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -61,8 +61,6 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
-
-	void *fs_private;		/* For use by ->writepages() */
 };
 
 /*
-- 
cgit v1.2.3


From ed90ef51a33f572fa7d00c8b05f7457be727e74f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 20 Jul 2007 13:13:28 -0400
Subject: NFS: Clean up NFS writeback flush code

The only user of nfs_sync_mapping_range() is nfs_getattr(), which uses it
to flush out the entire inode without sending a commit. We therefore
replace nfs_sync_mapping_range with a more appropriate helper.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |  2 +-
 fs/nfs/write.c         | 41 ++++++++++++++---------------------------
 include/linux/nfs_fs.h |  2 +-
 3 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 71a49c3acabd..119fefef13f3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -431,7 +431,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 
 	/* Flush out writes to the server in order to update c/mtime */
 	if (S_ISREG(inode->i_mode))
-		nfs_sync_mapping_range(inode->i_mapping, 0, 0, FLUSH_NOCOMMIT);
+		nfs_wb_nocommit(inode);
 
 	/*
 	 * We may force a getattr if the user cares about atime.
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 667ff27ef235..75adb8e78db9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1325,12 +1325,8 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
 	return ret;
 }
 
-/*
- * flush the inode to disk.
- */
-int nfs_wb_all(struct inode *inode)
+static int nfs_write_mapping(struct address_space *mapping, int how)
 {
-	struct address_space *mapping = inode->i_mapping;
 	struct writeback_control wbc = {
 		.bdi = mapping->backing_dev_info,
 		.sync_mode = WB_SYNC_ALL,
@@ -1343,35 +1339,26 @@ int nfs_wb_all(struct inode *inode)
 	ret = nfs_writepages(mapping, &wbc);
 	if (ret < 0)
 		goto out;
-	ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
-	if (ret >= 0)
-		return 0;
+	ret = nfs_sync_mapping_wait(mapping, &wbc, how);
+	if (ret < 0)
+		goto out;
+	return 0;
 out:
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	return ret;
 }
 
-int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, loff_t range_end, int how)
+/*
+ * flush the inode to disk.
+ */
+int nfs_wb_all(struct inode *inode)
 {
-	struct writeback_control wbc = {
-		.bdi = mapping->backing_dev_info,
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = LONG_MAX,
-		.range_start = range_start,
-		.range_end = range_end,
-		.for_writepages = 1,
-	};
-	int ret;
+	return nfs_write_mapping(inode->i_mapping, 0);
+}
 
-	ret = nfs_writepages(mapping, &wbc);
-	if (ret < 0)
-		goto out;
-	ret = nfs_sync_mapping_wait(mapping, &wbc, how);
-	if (ret >= 0)
-		return 0;
-out:
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-	return ret;
+int nfs_wb_nocommit(struct inode *inode)
+{
+	return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
 }
 
 int nfs_wb_page_cancel(struct inode *inode, struct page *page)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 32bd5d38cdfd..15eec27cb60b 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -426,8 +426,8 @@ extern void nfs_writedata_release(void *);
  * return value!)
  */
 extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int);
-extern int nfs_sync_mapping_range(struct address_space *, loff_t, loff_t, int);
 extern int nfs_wb_all(struct inode *inode);
+extern int nfs_wb_nocommit(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_priority(struct inode *inode, struct page* page, int how);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
-- 
cgit v1.2.3


From 34901f70d119d88126e7390351b8c780646628e1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 25 Jul 2007 14:09:54 -0400
Subject: NFS: Writeback optimisation

Schedule writes using WB_SYNC_NONE first, then come back for a second pass
using WB_SYNC_ALL.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 75adb8e78db9..b3c5f5db73a4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1325,21 +1325,14 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
 	return ret;
 }
 
-static int nfs_write_mapping(struct address_space *mapping, int how)
+static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
 {
-	struct writeback_control wbc = {
-		.bdi = mapping->backing_dev_info,
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = LONG_MAX,
-		.for_writepages = 1,
-		.range_cyclic = 1,
-	};
 	int ret;
 
-	ret = nfs_writepages(mapping, &wbc);
+	ret = nfs_writepages(mapping, wbc);
 	if (ret < 0)
 		goto out;
-	ret = nfs_sync_mapping_wait(mapping, &wbc, how);
+	ret = nfs_sync_mapping_wait(mapping, wbc, how);
 	if (ret < 0)
 		goto out;
 	return 0;
@@ -1348,6 +1341,25 @@ out:
 	return ret;
 }
 
+/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */
+static int nfs_write_mapping(struct address_space *mapping, int how)
+{
+	struct writeback_control wbc = {
+		.bdi = mapping->backing_dev_info,
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = LONG_MAX,
+		.for_writepages = 1,
+		.range_cyclic = 1,
+	};
+	int ret;
+
+	ret = __nfs_write_mapping(mapping, &wbc, how);
+	if (ret < 0)
+		return ret;
+	wbc.sync_mode = WB_SYNC_ALL;
+	return __nfs_write_mapping(mapping, &wbc, how);
+}
+
 /*
  * flush the inode to disk.
  */
-- 
cgit v1.2.3


From 7b159fc18d417980f57aef64cab3417ee6af70f8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 25 Jul 2007 14:09:54 -0400
Subject: NFS: Fall back to synchronous writes when a background write
 errors...

This helps prevent huge queues of background writes from building up
whenever the server runs out of disk or quota space, or if someone changes
the file access modes behind our backs.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c          | 64 ++++++++++++++++++++++++++++++++++----------------
 fs/nfs/write.c         | 13 +++++++---
 include/linux/nfs_fs.h |  3 +++
 3 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 07f43791f696..5595b32c0915 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -176,6 +176,31 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 	return remote_llseek(filp, offset, origin);
 }
 
+/*
+ * Helper for nfs_file_flush() and nfs_fsync()
+ *
+ * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
+ * disk, but it retrieves and clears ctx->error after synching, despite
+ * the two being set at the same time in nfs_context_set_write_error().
+ * This is because the former is used to notify the _next_ call to
+ * nfs_file_write() that a write error occured, and hence cause it to
+ * fall back to doing a synchronous write.
+ */
+static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
+{
+	int have_error, status;
+	int ret = 0;
+
+	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	status = nfs_wb_all(inode);
+	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	if (have_error)
+		ret = xchg(&ctx->error, 0);
+	if (!ret)
+		ret = status;
+	return ret;
+}
+
 /*
  * Flush all dirty pages, and check for write errors.
  *
@@ -192,16 +217,11 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 	if ((file->f_mode & FMODE_WRITE) == 0)
 		return 0;
 	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
-	lock_kernel();
+
 	/* Ensure that data+attribute caches are up to date after close() */
-	status = nfs_wb_all(inode);
-	if (!status) {
-		status = ctx->error;
-		ctx->error = 0;
-		if (!status)
-			nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	}
-	unlock_kernel();
+	status = nfs_do_fsync(ctx, inode);
+	if (!status)
+		nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	return status;
 }
 
@@ -278,19 +298,11 @@ nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
 	struct inode *inode = dentry->d_inode;
-	int status;
 
 	dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
 
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
-	lock_kernel();
-	status = nfs_wb_all(inode);
-	if (!status) {
-		status = ctx->error;
-		ctx->error = 0;
-	}
-	unlock_kernel();
-	return status;
+	return nfs_do_fsync(ctx, inode);
 }
 
 /*
@@ -377,6 +389,18 @@ static struct vm_operations_struct nfs_file_vm_ops = {
 	.page_mkwrite = nfs_vm_page_mkwrite,
 };
 
+static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+{
+	struct nfs_open_context *ctx;
+
+	if (IS_SYNC(inode) || (filp->f_flags & O_SYNC))
+		return 1;
+	ctx = filp->private_data;
+	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
+		return 1;
+	return 0;
+}
+
 static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos)
 {
@@ -413,8 +437,8 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
 	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
 	/* Return error values for O_SYNC and IS_SYNC() */
-	if (result >= 0 && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_SYNC))) {
-		int err = nfs_fsync(iocb->ki_filp, dentry, 1);
+	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
+		int err = nfs_do_fsync(iocb->ki_filp->private_data, inode);
 		if (err < 0)
 			result = err;
 	}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b3c5f5db73a4..fb396ea5accf 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -110,6 +110,13 @@ void nfs_writedata_release(void *wdata)
 	nfs_writedata_free(wdata);
 }
 
+static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
+{
+	ctx->error = error;
+	smp_wmb();
+	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+}
+
 static struct nfs_page *nfs_page_find_request_locked(struct page *page)
 {
 	struct nfs_page *req = NULL;
@@ -945,7 +952,7 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 
 	if (task->tk_status < 0) {
 		nfs_set_pageerror(page);
-		req->wb_context->error = task->tk_status;
+		nfs_context_set_write_error(req->wb_context, task->tk_status);
 		dprintk(", error = %d\n", task->tk_status);
 		goto out;
 	}
@@ -1008,7 +1015,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
 
 		if (task->tk_status < 0) {
 			nfs_set_pageerror(page);
-			req->wb_context->error = task->tk_status;
+			nfs_context_set_write_error(req->wb_context, task->tk_status);
 			dprintk(", error = %d\n", task->tk_status);
 			goto remove_request;
 		}
@@ -1222,7 +1229,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 			req->wb_bytes,
 			(long long)req_offset(req));
 		if (task->tk_status < 0) {
-			req->wb_context->error = task->tk_status;
+			nfs_context_set_write_error(req->wb_context, task->tk_status);
 			nfs_inode_remove_request(req);
 			dprintk(", error = %d\n", task->tk_status);
 			goto next;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 15eec27cb60b..1d1343fd2ddd 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -77,6 +77,9 @@ struct nfs_open_context {
 	struct nfs4_state *state;
 	fl_owner_t lockowner;
 	int mode;
+
+	unsigned long flags;
+#define NFS_CONTEXT_ERROR_WRITE		(0)
 	int error;
 
 	struct list_head list;
-- 
cgit v1.2.3


From 50e437d522a6cc34a882b2f740297f1b6b4c3af3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 7 Jun 2007 22:44:34 -0400
Subject: SUNRPC: Convert rpc_pipefs to use the generic filesystem notification
 hooks

This will allow rpc.gssd to use inotify instead of dnotify in order to
locate new rpc upcall pipes.

This also requires the exporting of __audit_inode_child(), which is used by
fsnotify_create() and fsnotify_mkdir(). Ccing David Woodhouse.

Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 kernel/auditsc.c      | 1 +
 net/sunrpc/rpc_pipe.c | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 04f3ffb8d9d4..0ae703c157ba 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1525,6 +1525,7 @@ add_names:
 			context->names[idx].ino = (unsigned long)-1;
 	}
 }
+EXPORT_SYMBOL_GPL(__audit_inode_child);
 
 /**
  * auditsc_get_stamp - get local copies of audit_context values
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 669e12a4ed18..ae83ac84f63b 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -14,7 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/kernel.h>
 
 #include <asm/ioctls.h>
@@ -585,6 +585,7 @@ rpc_populate(struct dentry *parent,
 		if (S_ISDIR(mode))
 			inc_nlink(dir);
 		d_add(dentry, inode);
+		fsnotify_create(dir, dentry);
 	}
 	mutex_unlock(&dir->i_mutex);
 	return 0;
@@ -606,7 +607,7 @@ __rpc_mkdir(struct inode *dir, struct dentry *dentry)
 	inode->i_ino = iunique(dir->i_sb, 100);
 	d_instantiate(dentry, inode);
 	inc_nlink(dir);
-	inode_dir_notify(dir, DN_CREATE);
+	fsnotify_mkdir(dir, dentry);
 	return 0;
 out_err:
 	printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n",
@@ -748,7 +749,7 @@ rpc_mkpipe(struct dentry *parent, const char *name, void *private, struct rpc_pi
 	rpci->flags = flags;
 	rpci->ops = ops;
 	rpci->nkern_readwriters = 1;
-	inode_dir_notify(dir, DN_CREATE);
+	fsnotify_create(dir, dentry);
 	dget(dentry);
 out:
 	mutex_unlock(&dir->i_mutex);
-- 
cgit v1.2.3


From 4e769b934e7638038e232c05b64f644e7269a90f Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Fri, 3 Aug 2007 15:07:10 -0400
Subject: 64 bit ino support for NFS client

Hi.

Attached is a patch to modify the NFS client code to support
64 bit ino's, as appropriate for the system and the NFS
protocol version.

The code basically just expand the NFS interfaces for routines
which handle ino's from using ino_t to u64 and then uses the
fileid in the nfs_inode instead of i_ino in the inode.  The
code paths that were updated are in the getattr method and
the readdir methods.

This should be no real change on 64 bit platforms.  Since
the ino_t is an unsigned long, it would already be 64 bits
wide.

    Thanx...

           ps

Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c      | 15 ++++++++-------
 fs/nfs/inode.c    |  4 +++-
 fs/nfs/nfs4proc.c |  4 ++--
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e4a04d16b8b0..dd02db43cbe6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -407,7 +407,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct file	*file = desc->file;
 	struct nfs_entry *entry = desc->entry;
 	struct dentry	*dentry = NULL;
-	unsigned long	fileid;
+	u64		fileid;
 	int		loop_count = 0,
 			res;
 
@@ -418,7 +418,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 		unsigned d_type = DT_UNKNOWN;
 		/* Note: entry->prev_cookie contains the cookie for
 		 *	 retrieving the current dirent on the server */
-		fileid = nfs_fileid_to_ino_t(entry->ino);
+		fileid = entry->ino;
 
 		/* Get a dentry if we have one */
 		if (dentry != NULL)
@@ -428,7 +428,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 		/* Use readdirplus info */
 		if (dentry != NULL && dentry->d_inode != NULL) {
 			d_type = dt_type(dentry->d_inode);
-			fileid = dentry->d_inode->i_ino;
+			fileid = NFS_FILEID(dentry->d_inode);
 		}
 
 		res = filldir(dirent, entry->name, entry->len, 
@@ -1350,9 +1350,9 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 {
 	static unsigned int sillycounter;
-	const int      i_inosize  = sizeof(dir->i_ino)*2;
+	const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
 	const int      countersize = sizeof(sillycounter)*2;
-	const int      slen       = sizeof(".nfs") + i_inosize + countersize - 1;
+	const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
 	char           silly[slen+1];
 	struct qstr    qsilly;
 	struct dentry *sdentry;
@@ -1370,8 +1370,9 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
 		goto out;
 
-	sprintf(silly, ".nfs%*.*lx",
-		i_inosize, i_inosize, dentry->d_inode->i_ino);
+	sprintf(silly, ".nfs%*.*Lx",
+		fileidsize, fileidsize,
+		(unsigned long long)NFS_FILEID(dentry->d_inode));
 
 	/* Return delegation in anticipation of the rename */
 	nfs_inode_return_delegation(dentry->d_inode);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 119fefef13f3..3ad938cecd73 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -450,8 +450,10 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 		err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	else
 		err = nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	if (!err)
+	if (!err) {
 		generic_fillattr(inode, stat);
+		stat->ino = NFS_FILEID(inode);
+	}
 	return err;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4b90e17555a9..d856e9f5913b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -177,7 +177,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 		*p++ = xdr_one;                         /* bitmap length */
 		*p++ = htonl(FATTR4_WORD0_FILEID);             /* bitmap */
 		*p++ = htonl(8);              /* attribute buffer length */
-		p = xdr_encode_hyper(p, dentry->d_inode->i_ino);
+		p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode));
 	}
 	
 	*p++ = xdr_one;                                  /* next */
@@ -189,7 +189,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 	*p++ = xdr_one;                         /* bitmap length */
 	*p++ = htonl(FATTR4_WORD0_FILEID);             /* bitmap */
 	*p++ = htonl(8);              /* attribute buffer length */
-	p = xdr_encode_hyper(p, dentry->d_parent->d_inode->i_ino);
+	p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode));
 
 	readdir->pgbase = (char *)p - (char *)start;
 	readdir->count -= readdir->pgbase;
-- 
cgit v1.2.3


From c7e15961115028b99f6142266b5fb08acca0e8dd Mon Sep 17 00:00:00 2001
From: Fabio Olive Leite <fleite@redhat.com>
Date: Thu, 26 Jul 2007 22:59:00 -0300
Subject: Re: [NFS] [PATCH] Attribute timeout handling and wrapping u32 jiffies

I would like to discuss the idea that the current checks for attribute
timeout using time_after are inadequate for 32bit architectures, since
time_after works correctly only when the two timestamps being compared
are within 2^31 jiffies of each other. The signed overflow caused by
comparing values more than 2^31 jiffies apart will flip the result,
causing incorrect assumptions of validity.

2^31 jiffies is a fairly large period of time (~25 days) when compared
to the lifetime of most kernel data structures, but for long lived NFS
mounts that can sit idle for months (think that for some reason autofs
cannot be used), it is easy to compare inode attribute timestamps with
very disparate or even bogus values (as in when jiffies have wrapped
many times, where the comparison doesn't even make sense).

Currently the code tests for attribute timeout by simply adding the
desired amount of jiffies to the stored timestamp and comparing that
with the current timestamp of obtained attribute data with time_after.
This is incorrect, as it returns true for the desired timeout period
and another full 2^31 range of jiffies.

In testing with artificial jumps (several small jumps, not one big
crank) of the jiffies I was able to reproduce a problem found in a
server with very long lived NFS mounts, where attributes would not be
refreshed even after touching files and directories in the server:

Initial uptime:
03:42:01 up 6 min, 0 users, load average: 0.01, 0.12, 0.07

NFS volume is mounted and time is advanced:
03:38:09 up 25 days, 2 min, 0 users, load average: 1.22, 1.05, 1.08

# ls -l /local/A/foo/bar /nfs/A/foo/bar
-rw-r--r--  1 root root 0 Dec 17 03:38 /local/A/foo/bar
-rw-r--r--  1 root root 0 Nov 22 00:36 /nfs/A/foo/bar

# touch /local/A/foo/bar

# ls -l /local/A/foo/bar /nfs/A/foo/bar
-rw-r--r--  1 root root 0 Dec 17 03:47 /local/A/foo/bar
-rw-r--r--  1 root root 0 Nov 22 00:36 /nfs/A/foo/bar

We can see the local mtime is updated, but the NFS mount still shows
the old value. The patch below makes it work:

Initial setup...
07:11:02 up 25 days, 1 min,  0 users,  load average: 0.15, 0.03, 0.04

# ls -l /local/A/foo/bar /nfs/A/foo/bar
-rw-r--r--  1 root root 0 Jan 11 07:11 /local/A/foo/bar
-rw-r--r--  1 root root 0 Jan 11 07:11 /nfs/A/foo/bar

# touch /local/A/foo/bar

# ls -l /local/A/foo/bar /nfs/A/foo/bar
-rw-r--r--  1 root root 0 Jan 11 07:14 /local/A/foo/bar
-rw-r--r--  1 root root 0 Jan 11 07:14 /nfs/A/foo/bar

Signed-off-by: Fabio Olive Leite <fleite@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c            | 2 +-
 fs/nfs/inode.c          | 4 ++--
 include/linux/jiffies.h | 4 ++++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index dd02db43cbe6..6e0aa04451dd 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1855,7 +1855,7 @@ int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs
 	cache = nfs_access_search_rbtree(inode, cred);
 	if (cache == NULL)
 		goto out;
-	if (time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)))
+	if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)))
 		goto out_stale;
 	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 3ad938cecd73..7c8ca175d87b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -656,7 +656,7 @@ int nfs_attribute_timeout(struct inode *inode)
 
 	if (nfs_have_delegation(inode, FMODE_READ))
 		return 0;
-	return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo);
+	return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
 }
 
 /**
@@ -1055,7 +1055,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
-	} else if (time_after(now, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) {
+	} else if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
 		if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
 			nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index c080f61fb024..f1c87ad24fea 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -115,6 +115,10 @@ static inline u64 get_jiffies_64(void)
 	 ((long)(a) - (long)(b) >= 0))
 #define time_before_eq(a,b)	time_after_eq(b,a)
 
+#define time_in_range(a,b,c) \
+	(time_after_eq(a,b) && \
+	 time_before_eq(a,c))
+
 /* Same as above, but does so with platform independent 64bit types.
  * These must be used when utilizing jiffies_64 (i.e. return value of
  * get_jiffies_64() */
-- 
cgit v1.2.3


From f8cf3678f4885d66475b19b10ec2e9918ff2ca1c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 3 Aug 2007 16:20:32 +0200
Subject: [NFS] [PATCH] nfs: tiny makefile cleanup

no need to set up foo-objs these days.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index b55cb236cf74..df0f41e09885 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -16,4 +16,3 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 			   nfs4namespace.o
 nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
-nfs-objs		:= $(nfs-y)
-- 
cgit v1.2.3


From ddc01c0813dc07ca7a2bd32c143a9b54a64915ce Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 30 Jul 2007 08:47:38 -0400
Subject: [NFS] [PATCH] NFS: show addr=ipaddr in /proc/mounts rather than

A minor thing, but useful when working with a server with multiple
addrs. This looks like it might also be necessary if Miklos' effort
to eliminate /etc/mtab ever comes to fruition.

When displaying mount options in /proc/mounts, the kernel prints
"addr=hostname". This info is redundant since we already have the
hostname displayed as part of the "device" section of the mount. This
patch changes it to display the IP address to which the socket is
connected.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b878528b64c1..42b8c671da35 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -506,8 +506,8 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 
 	nfs_show_mount_options(m, nfss, 0);
 
-	seq_puts(m, ",addr=");
-	seq_escape(m, nfss->nfs_client->cl_hostname, " \t\n\\");
+	seq_printf(m, ",addr="NIPQUAD_FMT,
+		NIPQUAD(nfss->nfs_client->cl_addr.sin_addr));
 
 	return 0;
 }
-- 
cgit v1.2.3


From adc24df81d6903d2d6280d77e936aa5e1670d7a1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:56:31 -0400
Subject: SUNRPC: Fix a signed v. unsigned comparison in rpcbind's XDR routines

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index d1740dbab991..a9d2cdc8e32c 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -490,10 +490,11 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
 			       unsigned short *portp)
 {
 	char *addr;
-	int addr_len, c, i, f, first, val;
+	u32 addr_len;
+	int c, i, f, first, val;
 
 	*portp = 0;
-	addr_len = (unsigned int) ntohl(*p++);
+	addr_len = ntohl(*p++);
 	if (addr_len > RPCB_MAXADDRLEN)			/* sanity */
 		return -EINVAL;
 
-- 
cgit v1.2.3


From b595bb15061567441546be1af883b256bcdfff9c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:56:42 -0400
Subject: SUNRPC: Fix a signed v. unsigned comparison in net/sunrpc/xprtsock.c

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4ae7eed7f617..56b226797ae3 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -523,7 +523,8 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	struct rpc_xprt *xprt = req->rq_xprt;
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	struct xdr_buf *xdr = &req->rq_snd_buf;
-	int status, retry = 0;
+	int status;
+	unsigned int retry = 0;
 
 	xs_encode_tcp_record_marker(&req->rq_snd_buf);
 
-- 
cgit v1.2.3


From bda243df2f5beebce92bae22bc01960544783984 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:56:52 -0400
Subject: SUNRPC: Use standard macros for printing IP addresses

include/linux/kernel.h gives us some nice macros for formatting IP
addresses.  Use them.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 56b226797ae3..015f29edb619 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -267,7 +267,7 @@ static void xs_format_peer_addresses(struct rpc_xprt *xprt)
 
 	buf = kzalloc(20, GFP_KERNEL);
 	if (buf) {
-		snprintf(buf, 20, "%u.%u.%u.%u",
+		snprintf(buf, 20, NIPQUAD_FMT,
 				NIPQUAD(addr->sin_addr.s_addr));
 	}
 	xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
@@ -286,7 +286,7 @@ static void xs_format_peer_addresses(struct rpc_xprt *xprt)
 
 	buf = kzalloc(48, GFP_KERNEL);
 	if (buf) {
-		snprintf(buf, 48, "addr=%u.%u.%u.%u port=%u proto=%s",
+		snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s",
 			NIPQUAD(addr->sin_addr.s_addr),
 			ntohs(addr->sin_port),
 			xprt->prot == IPPROTO_UDP ? "udp" : "tcp");
-- 
cgit v1.2.3


From 0c43b3d81cca46ab2469f8802f8bd68b49f1b2a5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:56:57 -0400
Subject: SUNRPC: Free address buffers in a loop

Use more generic logic to free buffers holding formatted addresses.  This
makes it less likely a bug will be introduced when adding additional buffer
types in xs_format_peer_address().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 015f29edb619..b3f40b8dbcba 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -279,10 +279,14 @@ static void xs_format_peer_addresses(struct rpc_xprt *xprt)
 	}
 	xprt->address_strings[RPC_DISPLAY_PORT] = buf;
 
-	if (xprt->prot == IPPROTO_UDP)
-		xprt->address_strings[RPC_DISPLAY_PROTO] = "udp";
-	else
-		xprt->address_strings[RPC_DISPLAY_PROTO] = "tcp";
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		if (xprt->prot == IPPROTO_UDP)
+			snprintf(buf, 8, "udp");
+		else
+			snprintf(buf, 8, "tcp");
+	}
+	xprt->address_strings[RPC_DISPLAY_PROTO] = buf;
 
 	buf = kzalloc(48, GFP_KERNEL);
 	if (buf) {
@@ -296,9 +300,10 @@ static void xs_format_peer_addresses(struct rpc_xprt *xprt)
 
 static void xs_free_peer_addresses(struct rpc_xprt *xprt)
 {
-	kfree(xprt->address_strings[RPC_DISPLAY_ADDR]);
-	kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
-	kfree(xprt->address_strings[RPC_DISPLAY_ALL]);
+	int i;
+
+	for (i = 0; i < RPC_DISPLAY_MAX; i++)
+		kfree(xprt->address_strings[i]);
 }
 
 #define XS_SENDMSG_FLAGS	(MSG_DONTWAIT | MSG_NOSIGNAL)
-- 
cgit v1.2.3


From fbfe3cc677c1a62ca6472abf24d03d4bf9f03a55 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:02 -0400
Subject: SUNRPC: Add hex-formatted address support to rpc_peeraddr2str()

Add support for the NFS client's need to export volume information
with IP addresses formatted in hex instead of decimal.

This isn't used yet, but subsequent patches (not in this series) will
change the NFS client to use this functionality.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h |  2 ++
 net/sunrpc/xprtsock.c       | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index d11cedd14f0f..4596c26f97aa 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -53,6 +53,8 @@ enum rpc_display_format_t {
 	RPC_DISPLAY_PORT,
 	RPC_DISPLAY_PROTO,
 	RPC_DISPLAY_ALL,
+	RPC_DISPLAY_HEX_ADDR,
+	RPC_DISPLAY_HEX_PORT,
 	RPC_DISPLAY_MAX,
 };
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b3f40b8dbcba..8a684d873111 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -296,6 +296,20 @@ static void xs_format_peer_addresses(struct rpc_xprt *xprt)
 			xprt->prot == IPPROTO_UDP ? "udp" : "tcp");
 	}
 	xprt->address_strings[RPC_DISPLAY_ALL] = buf;
+
+	buf = kzalloc(10, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 10, "%02x%02x%02x%02x",
+				NIPQUAD(addr->sin_addr.s_addr));
+	}
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 8, "%4hx",
+				ntohs(addr->sin_port));
+	}
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
 }
 
 static void xs_free_peer_addresses(struct rpc_xprt *xprt)
-- 
cgit v1.2.3


From ba10f2c23471b2ef106eb0c71ead4e9862766b8d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:07 -0400
Subject: SUNRPC: Rename xs_format_peer_addresses

Prepare to add an IPv6 version of xs_format_peer_addresses by renaming it
to xs_format_ipv4_peer_addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8a684d873111..45fac56b40f9 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -260,7 +260,7 @@ struct sock_xprt {
 #define TCP_RCV_COPY_XID	(1UL << 2)
 #define TCP_RCV_COPY_DATA	(1UL << 3)
 
-static void xs_format_peer_addresses(struct rpc_xprt *xprt)
+static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr;
 	char *buf;
@@ -1600,7 +1600,7 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
 	else
 		xprt_set_timeout(&xprt->timeout, 5, 5 * HZ);
 
-	xs_format_peer_addresses(xprt);
+	xs_format_ipv4_peer_addresses(xprt);
 	dprintk("RPC:       set up transport to address %s\n",
 			xprt->address_strings[RPC_DISPLAY_ALL]);
 
@@ -1642,7 +1642,7 @@ struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
 	else
 		xprt_set_timeout(&xprt->timeout, 2, 60 * HZ);
 
-	xs_format_peer_addresses(xprt);
+	xs_format_ipv4_peer_addresses(xprt);
 	dprintk("RPC:       set up transport to address %s\n",
 			xprt->address_strings[RPC_DISPLAY_ALL]);
 
-- 
cgit v1.2.3


From 4b6473fba4e832ee1d15737bc38779501c349a61 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:12 -0400
Subject: SUNRPC: add a function to format IPv6 addresses

Clone xs_format_ipv4_peer_addresses into an IPv6 version.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 45fac56b40f9..23e5865d984c 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -312,6 +312,58 @@ static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
 	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
 }
 
+static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
+{
+	struct sockaddr_in6 *addr = (struct sockaddr_in6 *) &xprt->addr;
+	char *buf;
+
+	buf = kzalloc(40, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 40, NIP6_FMT,
+				NIP6(addr->sin6_addr));
+	}
+	xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 8, "%u",
+				ntohs(addr->sin6_port));
+	}
+	xprt->address_strings[RPC_DISPLAY_PORT] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		if (xprt->prot == IPPROTO_UDP)
+			snprintf(buf, 8, "udp");
+		else
+			snprintf(buf, 8, "tcp");
+	}
+	xprt->address_strings[RPC_DISPLAY_PROTO] = buf;
+
+	buf = kzalloc(64, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 64, "addr="NIP6_FMT" port=%u proto=%s",
+				NIP6(addr->sin6_addr),
+				ntohs(addr->sin6_port),
+				xprt->prot == IPPROTO_UDP ? "udp" : "tcp");
+	}
+	xprt->address_strings[RPC_DISPLAY_ALL] = buf;
+
+	buf = kzalloc(36, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 36, NIP6_SEQFMT,
+				NIP6(addr->sin6_addr));
+	}
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 8, "%4hx",
+				ntohs(addr->sin6_port));
+	}
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
+}
+
 static void xs_free_peer_addresses(struct rpc_xprt *xprt)
 {
 	int i;
-- 
cgit v1.2.3


From d5b6443014940eb83788161c69b17cb7a1ffaaed Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:18 -0400
Subject: SUNRPC: add support for IPv6 to the kernel's rpcbind client

Prepare for adding IPv6 support to the RPC client by adding IPv6
capabilities to rpcbind.  Note that this is support on the query side
only; registering IPv6 addresses with the local portmapper will come
later.

Note we have to take care not to fall back to using version 2 of the
rpcbind protocol if we're dealing with IPv6 address.  Version 2 doesn't
support IPv6 at all.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index a9d2cdc8e32c..7db75e177e67 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -16,6 +16,8 @@
 
 #include <linux/types.h>
 #include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/in6.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 
@@ -137,10 +139,13 @@ struct rpcbind_args {
 static struct rpc_procinfo rpcb_procedures2[];
 static struct rpc_procinfo rpcb_procedures3[];
 
-static struct rpcb_info {
+struct rpcb_info {
 	int			rpc_vers;
 	struct rpc_procinfo *	rpc_proc;
-} rpcb_next_version[];
+};
+
+static struct rpcb_info rpcb_next_version[];
+static struct rpcb_info rpcb_next_version6[];
 
 static void rpcb_getport_prepare(struct rpc_task *task, void *calldata)
 {
@@ -190,7 +195,17 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
 				   RPC_CLNT_CREATE_INTR),
 	};
 
-	((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT);
+	switch (srvaddr->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT);
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT);
+		break;
+	default:
+		return NULL;
+	}
+
 	if (!privileged)
 		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 	return rpc_create(&args);
@@ -316,6 +331,7 @@ void rpcb_getport_async(struct rpc_task *task)
 	struct rpc_task	*child;
 	struct sockaddr addr;
 	int status;
+	struct rpcb_info *info;
 
 	dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
 		task->tk_pid, __FUNCTION__,
@@ -343,14 +359,30 @@ void rpcb_getport_async(struct rpc_task *task)
 		goto bailout_nofree;
 	}
 
-	if (rpcb_next_version[xprt->bind_index].rpc_proc == NULL) {
+	rpc_peeraddr(clnt, (void *)&addr, sizeof(addr));
+
+	/* Don't ever use rpcbind v2 for AF_INET6 requests */
+	switch (addr.sa_family) {
+	case AF_INET:
+		info = rpcb_next_version;
+		break;
+	case AF_INET6:
+		info = rpcb_next_version6;
+		break;
+	default:
+		status = -EAFNOSUPPORT;
+		dprintk("RPC: %5u %s: bad address family\n",
+				task->tk_pid, __FUNCTION__);
+		goto bailout_nofree;
+	}
+	if (info[xprt->bind_index].rpc_proc == NULL) {
 		xprt->bind_index = 0;
 		status = -EACCES;	/* tell caller to try again later */
 		dprintk("RPC: %5u %s: no more getport versions available\n",
 			task->tk_pid, __FUNCTION__);
 		goto bailout_nofree;
 	}
-	bind_version = rpcb_next_version[xprt->bind_index].rpc_vers;
+	bind_version = info[xprt->bind_index].rpc_vers;
 
 	dprintk("RPC: %5u %s: trying rpcbind version %u\n",
 		task->tk_pid, __FUNCTION__, bind_version);
@@ -373,7 +405,6 @@ void rpcb_getport_async(struct rpc_task *task)
 			sizeof(map->r_addr));
 	map->r_owner = RPCB_OWNER_STRING;	/* ignored for GETADDR */
 
-	rpc_peeraddr(clnt, (void *)&addr, sizeof(addr));
 	rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, bind_version, 0);
 	if (IS_ERR(rpcb_clnt)) {
 		status = PTR_ERR(rpcb_clnt);
@@ -594,6 +625,14 @@ static struct rpcb_info rpcb_next_version[] = {
 	{ 0, NULL },
 };
 
+static struct rpcb_info rpcb_next_version6[] = {
+#ifdef CONFIG_SUNRPC_BIND34
+	{ 4, &rpcb_procedures4[RPCBPROC_GETVERSADDR] },
+	{ 3, &rpcb_procedures3[RPCBPROC_GETADDR] },
+#endif
+	{ 0, NULL },
+};
+
 static struct rpc_version rpcb_version2 = {
 	.number		= 2,
 	.nrprocs	= RPCB_HIGHPROC_2,
-- 
cgit v1.2.3


From 20612005c51a5ba1bb17902276b9216825958724 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:23 -0400
Subject: SUNRPC: Introduce support for setting the port number in IPv6
 addresses

We could clone xs_set_port, but this is easier overall.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 23e5865d984c..a2dddec5325e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1211,11 +1211,20 @@ static unsigned short xs_get_random_port(void)
  */
 static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
 {
-	struct sockaddr_in *sap = (struct sockaddr_in *) &xprt->addr;
+	struct sockaddr *addr = (struct sockaddr *) &xprt->addr;
 
 	dprintk("RPC:       setting port for xprt %p to %u\n", xprt, port);
 
-	sap->sin_port = htons(port);
+	switch (addr->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)addr)->sin_port = htons(port);
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
+		break;
+	default:
+		BUG();
+	}
 }
 
 static int xs_bind(struct sock_xprt *transport, struct socket *sock)
-- 
cgit v1.2.3


From 7dc753f0391ad94868609376f37be4833671b57d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:28 -0400
Subject: SUNRPC: Rename xs_bind() to prepare for IPv6-specific bind method

Prepare for introduction of IPv6-specific socket bind function.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index a2dddec5325e..cd7c18b24527 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1227,7 +1227,7 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
 	}
 }
 
-static int xs_bind(struct sock_xprt *transport, struct socket *sock)
+static int xs_bind4(struct sock_xprt *transport, struct socket *sock)
 {
 	struct sockaddr_in myaddr = {
 		.sin_family = AF_INET,
@@ -1255,8 +1255,9 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
 		else
 			port--;
 	} while (err == -EADDRINUSE && port != transport->port);
-	dprintk("RPC:       xs_bind "NIPQUAD_FMT":%u: %s (%d)\n",
-		NIPQUAD(myaddr.sin_addr), port, err ? "failed" : "ok", err);
+	dprintk("RPC:       %s "NIPQUAD_FMT":%u: %s (%d)\n",
+			__FUNCTION__, NIPQUAD(myaddr.sin_addr),
+			port, err ? "failed" : "ok", err);
 	return err;
 }
 
@@ -1315,7 +1316,7 @@ static void xs_udp_connect_worker(struct work_struct *work)
 	}
 	xs_reclassify_socket(sock);
 
-	if (xs_bind(transport, sock)) {
+	if (xs_bind4(transport, sock)) {
 		sock_release(sock);
 		goto out;
 	}
@@ -1402,7 +1403,7 @@ static void xs_tcp_connect_worker(struct work_struct *work)
 		}
 		xs_reclassify_socket(sock);
 
-		if (xs_bind(transport, sock)) {
+		if (xs_bind4(transport, sock)) {
 			sock_release(sock);
 			goto out;
 		}
-- 
cgit v1.2.3


From 90058d37c30ffce0e033ea3dcc6a539111483a58 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:33 -0400
Subject: SUNRPC: create an IPv6-savvy mechanism for binding to a reserved port

Clone xs_bindresvport into two functions, one that can handle IPv4
addresses, and one that can handle IPv6 addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index cd7c18b24527..8295ae28391a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1261,6 +1261,39 @@ static int xs_bind4(struct sock_xprt *transport, struct socket *sock)
 	return err;
 }
 
+static int xs_bind6(struct sock_xprt *transport, struct socket *sock)
+{
+	struct sockaddr_in6 myaddr = {
+		.sin6_family = AF_INET6,
+	};
+	struct sockaddr_in6 *sa;
+	int err;
+	unsigned short port = transport->port;
+
+	if (!transport->xprt.resvport)
+		port = 0;
+	sa = (struct sockaddr_in6 *)&transport->addr;
+	myaddr.sin6_addr = sa->sin6_addr;
+	do {
+		myaddr.sin6_port = htons(port);
+		err = kernel_bind(sock, (struct sockaddr *) &myaddr,
+						sizeof(myaddr));
+		if (!transport->xprt.resvport)
+			break;
+		if (err == 0) {
+			transport->port = port;
+			break;
+		}
+		if (port <= xprt_min_resvport)
+			port = xprt_max_resvport;
+		else
+			port--;
+	} while (err == -EADDRINUSE && port != transport->port);
+	dprintk("RPC:       xs_bind6 "NIP6_FMT":%u: %s (%d)\n",
+		NIP6(myaddr.sin6_addr), port, err ? "failed" : "ok", err);
+	return err;
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key xs_key[2];
 static struct lock_class_key xs_slock_key[2];
-- 
cgit v1.2.3


From 16be2d20d999cb65daebfdaf0e560225e28fcb9d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:38 -0400
Subject: SUNRPC: Refactor a part of socket connect logic into a helper
 function

Finishing a socket connect is the same for IPv4 and IPv6, so split it out
into a helper.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 131 +++++++++++++++++++++++++++-----------------------
 1 file changed, 72 insertions(+), 59 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8295ae28391a..e8a5edf59912 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1323,6 +1323,35 @@ static inline void xs_reclassify_socket(struct socket *sock)
 }
 #endif
 
+static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+	if (!transport->inet) {
+		struct sock *sk = sock->sk;
+
+		write_lock_bh(&sk->sk_callback_lock);
+
+		sk->sk_user_data = xprt;
+		transport->old_data_ready = sk->sk_data_ready;
+		transport->old_state_change = sk->sk_state_change;
+		transport->old_write_space = sk->sk_write_space;
+		sk->sk_data_ready = xs_udp_data_ready;
+		sk->sk_write_space = xs_udp_write_space;
+		sk->sk_no_check = UDP_CSUM_NORCV;
+		sk->sk_allocation = GFP_ATOMIC;
+
+		xprt_set_connected(xprt);
+
+		/* Reset to new socket */
+		transport->sock = sock;
+		transport->inet = sk;
+
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+	xs_udp_do_set_buffer_size(xprt);
+}
+
 /**
  * xs_udp_connect_worker - set up a UDP socket
  * @work: RPC transport to connect
@@ -1357,29 +1386,7 @@ static void xs_udp_connect_worker(struct work_struct *work)
 	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
 			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
 
-	if (!transport->inet) {
-		struct sock *sk = sock->sk;
-
-		write_lock_bh(&sk->sk_callback_lock);
-
-		sk->sk_user_data = xprt;
-		transport->old_data_ready = sk->sk_data_ready;
-		transport->old_state_change = sk->sk_state_change;
-		transport->old_write_space = sk->sk_write_space;
-		sk->sk_data_ready = xs_udp_data_ready;
-		sk->sk_write_space = xs_udp_write_space;
-		sk->sk_no_check = UDP_CSUM_NORCV;
-		sk->sk_allocation = GFP_ATOMIC;
-
-		xprt_set_connected(xprt);
-
-		/* Reset to new socket */
-		transport->sock = sock;
-		transport->inet = sk;
-
-		write_unlock_bh(&sk->sk_callback_lock);
-	}
-	xs_udp_do_set_buffer_size(xprt);
+	xs_udp_finish_connecting(xprt, sock);
 	status = 0;
 out:
 	xprt_wake_pending_tasks(xprt, status);
@@ -1410,42 +1417,9 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt)
 				result);
 }
 
-/**
- * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint
- * @work: RPC transport to connect
- *
- * Invoked by a work queue tasklet.
- */
-static void xs_tcp_connect_worker(struct work_struct *work)
+static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
-	struct sock_xprt *transport =
-		container_of(work, struct sock_xprt, connect_worker.work);
-	struct rpc_xprt *xprt = &transport->xprt;
-	struct socket *sock = transport->sock;
-	int err, status = -EIO;
-
-	if (xprt->shutdown || !xprt_bound(xprt))
-		goto out;
-
-	if (!sock) {
-		/* start from scratch */
-		if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
-			dprintk("RPC:       can't create TCP transport "
-					"socket (%d).\n", -err);
-			goto out;
-		}
-		xs_reclassify_socket(sock);
-
-		if (xs_bind4(transport, sock)) {
-			sock_release(sock);
-			goto out;
-		}
-	} else
-		/* "close" the socket, preserving the local port */
-		xs_tcp_reuse_connection(xprt);
-
-	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
-			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 
 	if (!transport->inet) {
 		struct sock *sk = sock->sk;
@@ -1479,8 +1453,47 @@ static void xs_tcp_connect_worker(struct work_struct *work)
 	/* Tell the socket layer to start connecting... */
 	xprt->stat.connect_count++;
 	xprt->stat.connect_start = jiffies;
-	status = kernel_connect(sock, (struct sockaddr *) &xprt->addr,
+	return kernel_connect(sock, (struct sockaddr *) &xprt->addr,
 			xprt->addrlen, O_NONBLOCK);
+}
+
+/**
+ * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint
+ * @work: RPC transport to connect
+ *
+ * Invoked by a work queue tasklet.
+ */
+static void xs_tcp_connect_worker(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct rpc_xprt *xprt = &transport->xprt;
+	struct socket *sock = transport->sock;
+	int err, status = -EIO;
+
+	if (xprt->shutdown || !xprt_bound(xprt))
+		goto out;
+
+	if (!sock) {
+		/* start from scratch */
+		if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
+			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
+			goto out;
+		}
+		xs_reclassify_socket(sock);
+
+		if (xs_bind4(transport, sock) < 0) {
+			sock_release(sock);
+			goto out;
+		}
+	} else
+		/* "close" the socket, preserving the local port */
+		xs_tcp_reuse_connection(xprt);
+
+	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
+			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
+
+	status = xs_tcp_finish_connecting(xprt, sock);
 	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
 			xprt, -status, xprt_connected(xprt),
 			sock->sk->sk_state);
-- 
cgit v1.2.3


From 9c3d72de28eed3e882becd7054da2118f8a73131 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:43 -0400
Subject: SUNRPC: Rename IPv4 connect workers

Prepare for introduction of IPv6 versions of same.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e8a5edf59912..a0c26b9ebb34 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1353,12 +1353,12 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 }
 
 /**
- * xs_udp_connect_worker - set up a UDP socket
+ * xs_udp_connect_worker4 - set up a UDP socket
  * @work: RPC transport to connect
  *
  * Invoked by a work queue tasklet.
  */
-static void xs_udp_connect_worker(struct work_struct *work)
+static void xs_udp_connect_worker4(struct work_struct *work)
 {
 	struct sock_xprt *transport =
 		container_of(work, struct sock_xprt, connect_worker.work);
@@ -1458,12 +1458,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 }
 
 /**
- * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint
+ * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
  * @work: RPC transport to connect
  *
  * Invoked by a work queue tasklet.
  */
-static void xs_tcp_connect_worker(struct work_struct *work)
+static void xs_tcp_connect_worker4(struct work_struct *work)
 {
 	struct sock_xprt *transport =
 		container_of(work, struct sock_xprt, connect_worker.work);
@@ -1695,7 +1695,7 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
 	/* XXX: header size can vary due to auth type, IPv6, etc. */
 	xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
 
-	INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_connect_worker);
+	INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_connect_worker4);
 	xprt->bind_timeout = XS_BIND_TO;
 	xprt->connect_timeout = XS_UDP_CONN_TO;
 	xprt->reestablish_timeout = XS_UDP_REEST_TO;
@@ -1737,7 +1737,7 @@ struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
 	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
-	INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker);
+	INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
 	xprt->bind_timeout = XS_BIND_TO;
 	xprt->connect_timeout = XS_TCP_CONN_TO;
 	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
-- 
cgit v1.2.3


From 68e220bd5c9fc52d8029275cd42e08f573ce3600 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:48 -0400
Subject: SUNRPC: create connect workers for IPv6

Clone separate connect worker functions for connecting AF_INET6 sockets.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index a0c26b9ebb34..cc4db1763aa4 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1393,6 +1393,47 @@ out:
 	xprt_clear_connecting(xprt);
 }
 
+/**
+ * xs_udp_connect_worker6 - set up a UDP socket
+ * @work: RPC transport to connect
+ *
+ * Invoked by a work queue tasklet.
+ */
+static void xs_udp_connect_worker6(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct rpc_xprt *xprt = &transport->xprt;
+	struct socket *sock = transport->sock;
+	int err, status = -EIO;
+
+	if (xprt->shutdown || !xprt_bound(xprt))
+		goto out;
+
+	/* Start by resetting any existing state */
+	xs_close(xprt);
+
+	if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
+		dprintk("RPC:       can't create UDP transport socket (%d).\n", -err);
+		goto out;
+	}
+	xs_reclassify_socket(sock);
+
+	if (xs_bind6(transport, sock) < 0) {
+		sock_release(sock);
+		goto out;
+	}
+
+	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
+			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
+
+	xs_udp_finish_connecting(xprt, sock);
+	status = 0;
+out:
+	xprt_wake_pending_tasks(xprt, status);
+	xprt_clear_connecting(xprt);
+}
+
 /*
  * We need to preserve the port number so the reply cache on the server can
  * find our cached RPC replies when we get around to reconnecting.
@@ -1518,6 +1559,66 @@ out_clear:
 	xprt_clear_connecting(xprt);
 }
 
+/**
+ * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
+ * @work: RPC transport to connect
+ *
+ * Invoked by a work queue tasklet.
+ */
+static void xs_tcp_connect_worker6(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct rpc_xprt *xprt = &transport->xprt;
+	struct socket *sock = transport->sock;
+	int err, status = -EIO;
+
+	if (xprt->shutdown || !xprt_bound(xprt))
+		goto out;
+
+	if (!sock) {
+		/* start from scratch */
+		if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
+			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
+			goto out;
+		}
+		xs_reclassify_socket(sock);
+
+		if (xs_bind6(transport, sock) < 0) {
+			sock_release(sock);
+			goto out;
+		}
+	} else
+		/* "close" the socket, preserving the local port */
+		xs_tcp_reuse_connection(xprt);
+
+	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
+			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
+
+	status = xs_tcp_finish_connecting(xprt, sock);
+	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
+			xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
+	if (status < 0) {
+		switch (status) {
+			case -EINPROGRESS:
+			case -EALREADY:
+				goto out_clear;
+			case -ECONNREFUSED:
+			case -ECONNRESET:
+				/* retry with existing socket, after a delay */
+				break;
+			default:
+				/* get rid of existing socket, and retry */
+				xs_close(xprt);
+				break;
+		}
+	}
+out:
+	xprt_wake_pending_tasks(xprt, status);
+out_clear:
+	xprt_clear_connecting(xprt);
+}
+
 /**
  * xs_connect - connect a socket to a remote endpoint
  * @task: address of RPC task that manages state of connect request
-- 
cgit v1.2.3


From 8f9d5b1a2e717fb9e0c4d2c60a224ecce905bd23 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:53 -0400
Subject: SUNRPC: Add IPv6 address support to net/sunrpc/xprtsock.c

Finalize support for setting up RPC client transports to remote RPC
services addressed via IPv6.

Based on work done by Gilles Quillard at Bull Open Source.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 57 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index cc4db1763aa4..d9e65765f6c4 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -13,6 +13,9 @@
  *  (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no>
  *
  * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com>
+ *
+ * IPv6 support contributed by Gilles Quillard, Bull Open Source, 2005.
+ *   <gilles.quillard@bull.net>
  */
 
 #include <linux/types.h>
@@ -1780,6 +1783,7 @@ static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args, unsigned
  */
 struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
 {
+	struct sockaddr *addr = args->dstaddr;
 	struct rpc_xprt *xprt;
 	struct sock_xprt *transport;
 
@@ -1788,15 +1792,11 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
 		return xprt;
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
-	if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0)
-		xprt_set_bound(xprt);
-
 	xprt->prot = IPPROTO_UDP;
 	xprt->tsh_size = 0;
 	/* XXX: header size can vary due to auth type, IPv6, etc. */
 	xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
 
-	INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_connect_worker4);
 	xprt->bind_timeout = XS_BIND_TO;
 	xprt->connect_timeout = XS_UDP_CONN_TO;
 	xprt->reestablish_timeout = XS_UDP_REEST_TO;
@@ -1809,7 +1809,28 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
 	else
 		xprt_set_timeout(&xprt->timeout, 5, 5 * HZ);
 
-	xs_format_ipv4_peer_addresses(xprt);
+	switch (addr->sa_family) {
+	case AF_INET:
+		if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker,
+					xs_udp_connect_worker4);
+		xs_format_ipv4_peer_addresses(xprt);
+		break;
+	case AF_INET6:
+		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker,
+					xs_udp_connect_worker6);
+		xs_format_ipv6_peer_addresses(xprt);
+		break;
+	default:
+		kfree(xprt);
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+
 	dprintk("RPC:       set up transport to address %s\n",
 			xprt->address_strings[RPC_DISPLAY_ALL]);
 
@@ -1823,6 +1844,7 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
  */
 struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
 {
+	struct sockaddr *addr = args->dstaddr;
 	struct rpc_xprt *xprt;
 	struct sock_xprt *transport;
 
@@ -1831,14 +1853,10 @@ struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
 		return xprt;
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
-	if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0)
-		xprt_set_bound(xprt);
-
 	xprt->prot = IPPROTO_TCP;
 	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
-	INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
 	xprt->bind_timeout = XS_BIND_TO;
 	xprt->connect_timeout = XS_TCP_CONN_TO;
 	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
@@ -1851,7 +1869,26 @@ struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
 	else
 		xprt_set_timeout(&xprt->timeout, 2, 60 * HZ);
 
-	xs_format_ipv4_peer_addresses(xprt);
+	switch (addr->sa_family) {
+	case AF_INET:
+		if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
+		xs_format_ipv4_peer_addresses(xprt);
+		break;
+	case AF_INET6:
+		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6);
+		xs_format_ipv6_peer_addresses(xprt);
+		break;
+	default:
+		kfree(xprt);
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+
 	dprintk("RPC:       set up transport to address %s\n",
 			xprt->address_strings[RPC_DISPLAY_ALL]);
 
-- 
cgit v1.2.3


From 95392c593e13fa7546857425971f87e4ded6e0c1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:57:58 -0400
Subject: SUNRPC: Add a helper for extracting the address using the correct
 type

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index d9e65765f6c4..fcbfae82cb12 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -263,9 +263,24 @@ struct sock_xprt {
 #define TCP_RCV_COPY_XID	(1UL << 2)
 #define TCP_RCV_COPY_DATA	(1UL << 3)
 
+static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr *) &xprt->addr;
+}
+
+static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr_in *) &xprt->addr;
+}
+
+static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr_in6 *) &xprt->addr;
+}
+
 static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
 {
-	struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr;
+	struct sockaddr_in *addr = xs_addr_in(xprt);
 	char *buf;
 
 	buf = kzalloc(20, GFP_KERNEL);
@@ -317,7 +332,7 @@ static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
 
 static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
 {
-	struct sockaddr_in6 *addr = (struct sockaddr_in6 *) &xprt->addr;
+	struct sockaddr_in6 *addr = xs_addr_in6(xprt);
 	char *buf;
 
 	buf = kzalloc(40, GFP_KERNEL);
@@ -537,7 +552,7 @@ static int xs_udp_send_request(struct rpc_task *task)
 
 	req->rq_xtime = jiffies;
 	status = xs_sendpages(transport->sock,
-			      (struct sockaddr *) &xprt->addr,
+			      xs_addr(xprt),
 			      xprt->addrlen, xdr,
 			      req->rq_bytes_sent);
 
@@ -1214,7 +1229,7 @@ static unsigned short xs_get_random_port(void)
  */
 static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
 {
-	struct sockaddr *addr = (struct sockaddr *) &xprt->addr;
+	struct sockaddr *addr = xs_addr(xprt);
 
 	dprintk("RPC:       setting port for xprt %p to %u\n", xprt, port);
 
@@ -1497,8 +1512,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 	/* Tell the socket layer to start connecting... */
 	xprt->stat.connect_count++;
 	xprt->stat.connect_start = jiffies;
-	return kernel_connect(sock, (struct sockaddr *) &xprt->addr,
-			xprt->addrlen, O_NONBLOCK);
+	return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
 }
 
 /**
-- 
cgit v1.2.3


From 8945ee5e27156ef9708bc3a11da87ba689aa38b6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 6 Aug 2007 11:58:04 -0400
Subject: SUNRPC: Split xs_reclassify_socket into an IPv4 and IPv6 version

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index fcbfae82cb12..01121a4f0851 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1316,27 +1316,29 @@ static int xs_bind6(struct sock_xprt *transport, struct socket *sock)
 static struct lock_class_key xs_key[2];
 static struct lock_class_key xs_slock_key[2];
 
-static inline void xs_reclassify_socket(struct socket *sock)
+static inline void xs_reclassify_socket4(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
+
 	BUG_ON(sk->sk_lock.owner != NULL);
-	switch (sk->sk_family) {
-	case AF_INET:
-		sock_lock_init_class_and_name(sk, "slock-AF_INET-NFS",
-			&xs_slock_key[0], "sk_lock-AF_INET-NFS", &xs_key[0]);
-		break;
+	sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC",
+		&xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]);
+}
 
-	case AF_INET6:
-		sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFS",
-			&xs_slock_key[1], "sk_lock-AF_INET6-NFS", &xs_key[1]);
-		break;
+static inline void xs_reclassify_socket6(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
 
-	default:
-		BUG();
-	}
+	BUG_ON(sk->sk_lock.owner != NULL);
+	sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC",
+		&xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]);
 }
 #else
-static inline void xs_reclassify_socket(struct socket *sock)
+static inline void xs_reclassify_socket4(struct socket *sock)
+{
+}
+
+static inline void xs_reclassify_socket6(struct socket *sock)
 {
 }
 #endif
@@ -1394,7 +1396,7 @@ static void xs_udp_connect_worker4(struct work_struct *work)
 		dprintk("RPC:       can't create UDP transport socket (%d).\n", -err);
 		goto out;
 	}
-	xs_reclassify_socket(sock);
+	xs_reclassify_socket4(sock);
 
 	if (xs_bind4(transport, sock)) {
 		sock_release(sock);
@@ -1435,7 +1437,7 @@ static void xs_udp_connect_worker6(struct work_struct *work)
 		dprintk("RPC:       can't create UDP transport socket (%d).\n", -err);
 		goto out;
 	}
-	xs_reclassify_socket(sock);
+	xs_reclassify_socket6(sock);
 
 	if (xs_bind6(transport, sock) < 0) {
 		sock_release(sock);
@@ -1538,7 +1540,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work)
 			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
 			goto out;
 		}
-		xs_reclassify_socket(sock);
+		xs_reclassify_socket4(sock);
 
 		if (xs_bind4(transport, sock) < 0) {
 			sock_release(sock);
@@ -1599,7 +1601,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work)
 			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
 			goto out;
 		}
-		xs_reclassify_socket(sock);
+		xs_reclassify_socket6(sock);
 
 		if (xs_bind6(transport, sock) < 0) {
 			sock_release(sock);
-- 
cgit v1.2.3


From 756805e7a76bcd2aae07fe31786fe453375e60b1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 16 Aug 2007 16:03:26 -0400
Subject: SUNRPC: Add support for formatted universal addresses

"Universal addresses" are a string representation of an IP address and
port.  They are described fully in RFC 3530, section 2.2.  Add support
for generating them in the RPC client's socket transport module.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/xprt.h |  1 +
 net/sunrpc/xprtsock.c       | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 4596c26f97aa..902a9c0a2d9a 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -55,6 +55,7 @@ enum rpc_display_format_t {
 	RPC_DISPLAY_ALL,
 	RPC_DISPLAY_HEX_ADDR,
 	RPC_DISPLAY_HEX_PORT,
+	RPC_DISPLAY_UNIVERSAL_ADDR,
 	RPC_DISPLAY_MAX,
 };
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 01121a4f0851..7a154e4b70f5 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -328,6 +328,15 @@ static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
 				ntohs(addr->sin_port));
 	}
 	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
+
+	buf = kzalloc(30, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 30, NIPQUAD_FMT".%u.%u",
+				NIPQUAD(addr->sin_addr.s_addr),
+				ntohs(addr->sin_port) >> 8,
+				ntohs(addr->sin_port) & 0xff);
+	}
+	xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
 }
 
 static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
@@ -380,6 +389,15 @@ static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
 				ntohs(addr->sin6_port));
 	}
 	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
+
+	buf = kzalloc(50, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 50, NIP6_FMT".%u.%u",
+				NIP6(addr->sin6_addr),
+				ntohs(addr->sin6_port) >> 8,
+				ntohs(addr->sin6_port) & 0xff);
+	}
+	xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
 }
 
 static void xs_free_peer_addresses(struct rpc_xprt *xprt)
-- 
cgit v1.2.3


From 143b6c4008a7928de7e139c3a77a90e4cad8db2c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 16 Aug 2007 16:03:31 -0400
Subject: SUNRPC: Fix generation of universal addresses for

Fix some problems with rpcbind v3 and v4 queries from the in-kernel rpcbind
client:

1.  The r_addr argument must be a full universal address, not just an IP
address, and

2.  The universal address in r_addr is the address of the remote rpcbind
server, not the RPC service being requested

This addresses bugzilla.kernel.org report 8891 for 2.6.23-rc and greater.

In addition, if the rpcbind client is unable to start the rpcbind request,
make sure not to leak the xprt.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/rpcb_clnt.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 7db75e177e67..0bb6709018de 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -387,6 +387,15 @@ void rpcb_getport_async(struct rpc_task *task)
 	dprintk("RPC: %5u %s: trying rpcbind version %u\n",
 		task->tk_pid, __FUNCTION__, bind_version);
 
+	rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot,
+				bind_version, 0);
+	if (IS_ERR(rpcb_clnt)) {
+		status = PTR_ERR(rpcb_clnt);
+		dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
+			task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt));
+		goto bailout_nofree;
+	}
+
 	map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC);
 	if (!map) {
 		status = -ENOMEM;
@@ -401,25 +410,18 @@ void rpcb_getport_async(struct rpc_task *task)
 	map->r_xprt = xprt_get(xprt);
 	map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCB_NETID_TCP :
 						   RPCB_NETID_UDP;
-	memcpy(&map->r_addr, rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR),
-			sizeof(map->r_addr));
+	memcpy(&map->r_addr,
+	       rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR),
+	       sizeof(map->r_addr));
 	map->r_owner = RPCB_OWNER_STRING;	/* ignored for GETADDR */
 
-	rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, bind_version, 0);
-	if (IS_ERR(rpcb_clnt)) {
-		status = PTR_ERR(rpcb_clnt);
-		dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
-			task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt));
-		goto bailout;
-	}
-
 	child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map);
 	rpc_release_client(rpcb_clnt);
 	if (IS_ERR(child)) {
 		status = -EIO;
 		dprintk("RPC: %5u %s: rpc_run_task failed\n",
 			task->tk_pid, __FUNCTION__);
-		goto bailout_nofree;
+		goto bailout;
 	}
 	rpc_put_task(child);
 
-- 
cgit v1.2.3


From 5d34da3af923e0f950a89f160540d2506ca046ce Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:03 -0400
Subject: SUNRPC: Only one dprintk is needed during client creation

Remove one of two identical dprintk's that occur when an RPC client is
created.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 52429b1ffcc1..c796e2fd2708 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -269,9 +269,6 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
 		xprt->resvport = 0;
 
-	dprintk("RPC:       creating %s client for %s (xprt %p)\n",
-			args->program->name, args->servername, xprt);
-
 	clnt = rpc_new_client(xprt, args->servername, args->program,
 				args->version, args->authflavor);
 	if (IS_ERR(clnt))
-- 
cgit v1.2.3


From 89eb21c35b61b5157940e1b78c2c6d0529d11c63 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:09 -0400
Subject: SUNRPC: fix a signed v. unsigned comparison nit in
 rpc_bind_new_program
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

/home/cel/linux/net/sunrpc/clnt.c: In function ‘rpc_bind_new_program’:
/home/cel/linux/net/sunrpc/clnt.c:445: warning:
	comparison between signed and unsigned

RPC version numbers are u32, not int.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h | 2 +-
 net/sunrpc/clnt.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index c0d9d14983b3..d9d5c5ad826c 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -117,7 +117,7 @@ struct rpc_create_args {
 
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
-				struct rpc_program *, int);
+				struct rpc_program *, u32);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index c796e2fd2708..c3d571abcded 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -436,7 +436,7 @@ rpc_release_client(struct rpc_clnt *clnt)
  */
 struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
 				      struct rpc_program *program,
-				      int vers)
+				      u32 vers)
 {
 	struct rpc_clnt *clnt;
 	struct rpc_version *version;
-- 
cgit v1.2.3


From 6d0aa06afd62a868d83c6021335622a316469527 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:15 -0400
Subject: SUNRPC: Use correct argument type in memcpy()

Noticed by Tom Talpey <tmt@netapp.com>:

OBTW, there's a nit on that memcpy, too. The r_addr is an array, so

memcpy(&map->r_addr

is passing the address of the array as a char **. It's the same as
map->r_addr, but technically the wrong type.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 0bb6709018de..f88ab90b8d34 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -410,7 +410,7 @@ void rpcb_getport_async(struct rpc_task *task)
 	map->r_xprt = xprt_get(xprt);
 	map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCB_NETID_TCP :
 						   RPCB_NETID_UDP;
-	memcpy(&map->r_addr,
+	memcpy(map->r_addr,
 	       rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR),
 	       sizeof(map->r_addr));
 	map->r_owner = RPCB_OWNER_STRING;	/* ignored for GETADDR */
-- 
cgit v1.2.3


From 06b8d2552d50f802a3277137a565febcd59ef037 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:20 -0400
Subject: SUNRPC: Make sure server name is reasonable before trying to print it

Check the length of the passed-in server name before trying to print it in
the log.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index c3d571abcded..0357819ffdb0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -127,7 +127,14 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
 	struct rpc_clnt		*clnt = NULL;
 	struct rpc_auth		*auth;
 	int err;
-	int len;
+	size_t len;
+
+	/* sanity check the name before trying to print it */
+	err = -EINVAL;
+	len = strlen(servname);
+	if (len > RPC_MAXNETNAMELEN)
+		goto out_no_rpciod;
+	len++;
 
 	dprintk("RPC:       creating %s client for %s (xprt %p)\n",
 			program->name, servname, xprt);
@@ -148,7 +155,6 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
 	clnt->cl_parent = clnt;
 
 	clnt->cl_server = clnt->cl_inline_name;
-	len = strlen(servname) + 1;
 	if (len > sizeof(clnt->cl_inline_name)) {
 		char *buf = kmalloc(len, GFP_KERNEL);
 		if (buf != 0)
-- 
cgit v1.2.3


From d66968f207b6402fd12c20145cb31dbe3608979c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:25 -0400
Subject: SUNRPC: Clean up in rpc_show_tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

/home/cel/linux/net/sunrpc/clnt.c: In function ‘rpc_show_tasks’:
/home/cel/linux/net/sunrpc/clnt.c:1538: warning:
	signed and unsigned type in conditional expression

This points out another case where a conditional expression returns a
signed value in one arm and an unsigned value in the other.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0357819ffdb0..7a7cd3b7acce 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1526,13 +1526,18 @@ void rpc_show_tasks(void)
 		spin_lock(&clnt->cl_lock);
 		list_for_each_entry(t, &clnt->cl_tasks, tk_task) {
 			const char *rpc_waitq = "none";
+			int proc;
+
+			if (t->tk_msg.rpc_proc)
+				proc = t->tk_msg.rpc_proc->p_proc;
+			else
+				proc = -1;
 
 			if (RPC_IS_QUEUED(t))
 				rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
 
 			printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n",
-				t->tk_pid,
-				(t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
+				t->tk_pid, proc,
 				t->tk_flags, t->tk_status,
 				t->tk_client,
 				(t->tk_client ? t->tk_client->cl_prog : 0),
-- 
cgit v1.2.3


From e65fe3976f594603ed7b1b4a99d3e9b867f573ea Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:31 -0400
Subject: SUNRPC: Make rpcb_decode_getaddr more picky about universal addresses

Add better sanity checking of server replies to the GETVERSADDR reply
decoder.  Change the error return code: EIO is what other XDR decoding
routines return if there is a failure while decoding.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index f88ab90b8d34..34738c5cf312 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -528,12 +528,19 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
 
 	*portp = 0;
 	addr_len = ntohl(*p++);
-	if (addr_len > RPCB_MAXADDRLEN)			/* sanity */
-		return -EINVAL;
-
-	dprintk("RPC:       rpcb_decode_getaddr returned string: '%s'\n",
-			(char *) p);
 
+	/*
+	 * Simple sanity check.  The smallest possible universal
+	 * address is an IPv4 address string containing 11 bytes.
+	 */
+	if (addr_len < 11 || addr_len > RPCB_MAXADDRLEN)
+		goto out_err;
+
+	/*
+	 * Start at the end and walk backwards until the first dot
+	 * is encountered.  When the second dot is found, we have
+	 * both parts of the port number.
+	 */
 	addr = (char *)p;
 	val = 0;
 	first = 1;
@@ -555,8 +562,19 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
 		}
 	}
 
+	/*
+	 * Simple sanity check.  If we never saw a dot in the reply,
+	 * then this was probably just garbage.
+	 */
+	if (first)
+		goto out_err;
+
 	dprintk("RPC:       rpcb_decode_getaddr port=%u\n", *portp);
 	return 0;
+
+out_err:
+	dprintk("RPC:       rpcbind server returned malformed reply\n");
+	return -EIO;
 }
 
 #define RPCB_program_sz		(1u)
-- 
cgit v1.2.3


From 4784cb51a3f66d401f8a08810231aa7dc8f44e43 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:36 -0400
Subject: SUNRPC: Retry bad rpcbind replies

When a server returns a bad rpcbind reply, make rpcbind client recovery logic
retry with an older protocol version.  Older versions are more likely to work
correctly.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 34738c5cf312..a0c9f6905517 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -446,6 +446,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data)
 	struct rpc_xprt *xprt = map->r_xprt;
 	int status = child->tk_status;
 
+	/* Garbage reply: retry with a lesser rpcbind version */
+	if (status == -EIO)
+		status = -EPROTONOSUPPORT;
+
 	/* rpcbind server doesn't support this rpcbind protocol version */
 	if (status == -EPROTONOSUPPORT)
 		xprt->bind_index++;
-- 
cgit v1.2.3


From 2429cbf6a1566b8e92436d615387e4250feab46b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:41 -0400
Subject: SUNRPC: Add a new error code for retry waiting for another binder

Add new error code processing to the kernel's rpcbind client and to
call_bind_status() to distinguish two cases:

Case 1: the remote has replied that the program/version tuple is not
registered (returns -EACCES)

Case 2: another process is already in the middle of binding on this
transport (now returns -EAGAIN)

This change allows more specific retry processing for each of these two
cases.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c      | 6 ++++++
 net/sunrpc/rpcb_clnt.c | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7a7cd3b7acce..e53d71827132 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -950,6 +950,12 @@ call_bind_status(struct rpc_task *task)
 	}
 
 	switch (task->tk_status) {
+	case -EAGAIN:
+		dprintk("RPC: %5u rpcbind waiting for another request "
+				"to finish\n", task->tk_pid);
+		/* avoid busy-waiting here -- could be a network outage. */
+		rpc_delay(task, 5*HZ);
+		goto retry_timeout;
 	case -EACCES:
 		dprintk("RPC: %5u remote rpcbind: RPC program/version "
 				"unavailable\n", task->tk_pid);
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index a0c9f6905517..9b06a217ab86 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -341,7 +341,7 @@ void rpcb_getport_async(struct rpc_task *task)
 	BUG_ON(clnt->cl_parent != clnt);
 
 	if (xprt_test_and_set_binding(xprt)) {
-		status = -EACCES;		/* tell caller to check again */
+		status = -EAGAIN;	/* tell caller to check again */
 		dprintk("RPC: %5u %s: waiting for another binder\n",
 			task->tk_pid, __FUNCTION__);
 		goto bailout_nowake;
-- 
cgit v1.2.3


From 906462af4c707ba0238f3579fdb2b594c4ea29c3 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:47 -0400
Subject: SUNRPC: Split another new rpcbind retry error code from EACCES

Add more new error code processing to the kernel's rpcbind client
and to call_bind_status() to distinguish two cases:

Case 1: the remote has replied that the program/version tuple is not
registered (returns EACCES)

Case 2: retry with a lesser rpcbind version (rpcb now returns EPFNOSUPPORT)

This change allows more specific error processing for each of these two
cases.  We now fail case 2 instead of retrying... it's a server
configuration error not to support even rpcbind version 2.  And don't
expose this new error code to user land -- convert it to EIO before
failing the RPC.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c      | 4 ++--
 net/sunrpc/rpcb_clnt.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e53d71827132..04f6e15457e0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -940,7 +940,7 @@ call_bind(struct rpc_task *task)
 static void
 call_bind_status(struct rpc_task *task)
 {
-	int status = -EACCES;
+	int status = -EIO;
 
 	if (task->tk_status >= 0) {
 		dprint_status(task);
@@ -966,6 +966,7 @@ call_bind_status(struct rpc_task *task)
 				task->tk_pid);
 		goto retry_timeout;
 	case -EPFNOSUPPORT:
+		/* server doesn't support any rpcbind version we know of */
 		dprintk("RPC: %5u remote rpcbind service unavailable\n",
 				task->tk_pid);
 		break;
@@ -978,7 +979,6 @@ call_bind_status(struct rpc_task *task)
 	default:
 		dprintk("RPC: %5u unrecognized rpcbind error (%d)\n",
 				task->tk_pid, -task->tk_status);
-		status = -EIO;
 	}
 
 	rpc_exit(task, status);
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 9b06a217ab86..d7b9f02489e6 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -377,7 +377,7 @@ void rpcb_getport_async(struct rpc_task *task)
 	}
 	if (info[xprt->bind_index].rpc_proc == NULL) {
 		xprt->bind_index = 0;
-		status = -EACCES;	/* tell caller to try again later */
+		status = -EPFNOSUPPORT;
 		dprintk("RPC: %5u %s: no more getport versions available\n",
 			task->tk_pid, __FUNCTION__);
 		goto bailout_nofree;
-- 
cgit v1.2.3


From b79dc8ced1412e7056f3969bef40a30cc75ee530 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:52 -0400
Subject: SUNRPC: RPC bind failures should be permanent for NULL requests

The purpose of an RPC ping (a NULL request) is to determine whether the
remote end is operating and supports the RPC program and version of the
request.

If we do an RPC bind and the remote's rpcbind service says "this
program or service isn't supported" then we have our answer already,
and we should give up immediately.

This is good for the kernel mount client, as it will cause the request
to fail, and then allow an immediate retry with different options.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 04f6e15457e0..b19bacf42564 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -959,6 +959,11 @@ call_bind_status(struct rpc_task *task)
 	case -EACCES:
 		dprintk("RPC: %5u remote rpcbind: RPC program/version "
 				"unavailable\n", task->tk_pid);
+		/* fail immediately if this is an RPC ping */
+		if (task->tk_msg.rpc_proc->p_proc == 0) {
+			status = -EOPNOTSUPP;
+			break;
+		}
 		rpc_delay(task, 3*HZ);
 		goto retry_timeout;
 	case -ETIMEDOUT:
-- 
cgit v1.2.3


From efd8340bb19c26a43e77c92fee9283b1f5777204 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:00:58 -0400
Subject: NFS: Kernel mount client should use async bind

Simplify the in-kernel mount client by using autobind instead of an
explicit call to rpc_getport_sync.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 42b8c671da35..5085f53be2fe 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1027,15 +1027,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 		sin = args->mount_server.address;
 	else
 		sin = args->nfs_server.address;
-	if (args->mount_server.port == 0) {
-		status = rpcb_getport_sync(&sin,
-					   args->mount_server.program,
-					   args->mount_server.version,
-					   args->mount_server.protocol);
-		if (status < 0)
-			goto out_err;
-		sin.sin_port = htons(status);
-	} else
+	if (args->mount_server.port != 0)
 		sin.sin_port = htons(args->mount_server.port);
 
 	/*
@@ -1049,14 +1041,11 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 			   args->mount_server.version,
 			   args->mount_server.protocol,
 			   root_fh);
-	if (status < 0)
-		goto out_err;
-
-	return status;
+	if (status == 0)
+		return 0;
 
-out_err:
-	dfprintk(MOUNT, "NFS: unable to contact server on host "
-		 NIPQUAD_FMT "\n", NIPQUAD(sin.sin_addr.s_addr));
+	dfprintk(MOUNT, "NFS: unable to mount server " NIPQUAD_FMT
+			", error %d\n", NIPQUAD(sin.sin_addr.s_addr), status);
 	return status;
 }
 
-- 
cgit v1.2.3


From aad700073557c7932ef9f81c19a5e0647f8a6850 Mon Sep 17 00:00:00 2001
From: James Lentini <jlentini@netapp.com>
Date: Mon, 24 Sep 2007 17:32:49 -0400
Subject: [NFS] [PATCH] NFS: initialize default port in kernel mount client

If no mount server port number is specified, the previous change to the
kernel mount client inadvertently allows the NFS server's port number to be
the used as the mount server's port number. If the user specifies an NFS
server port (-o port=x), the mount will fail.

The fix below sets the mount server's port to 0 if no mount server
port is specified by the user.

Signed-off-by: James Lentini <jlentini@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5085f53be2fe..094d2f832c3c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1027,8 +1027,10 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 		sin = args->mount_server.address;
 	else
 		sin = args->nfs_server.address;
-	if (args->mount_server.port != 0)
-		sin.sin_port = htons(args->mount_server.port);
+	/*
+	 * autobind will be used if mount_server.port == 0
+	 */
+	sin.sin_port = htons(args->mount_server.port);
 
 	/*
 	 * Now ask the mount server to map our export path
-- 
cgit v1.2.3


From 0ac83779fa5bffb90e32a97abc61f1840af31ee9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:01:04 -0400
Subject: NFS: Add new 'mountaddr=' mount option

I got the 'mounthost=' option wrong - it shouldn't look for an address
value, but rather a hostname value.  However, the in-kernel mount client
and NFS client cannot resolve a hostname by themselves; they rely on
user-land to pass in the resolved address.

Create a new mount option that does take an address so that the mount
program's address can be passed in.  The mount hostname is now ignored
by the kernel.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 094d2f832c3c..a955821b8499 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -73,6 +73,7 @@ struct nfs_parsed_mount_data {
 
 	struct {
 		struct sockaddr_in	address;
+		char			*hostname;
 		unsigned int		program;
 		unsigned int		version;
 		unsigned short		port;
@@ -116,7 +117,7 @@ enum {
 
 	/* Mount options that take string arguments */
 	Opt_sec, Opt_proto, Opt_mountproto,
-	Opt_addr, Opt_mounthost, Opt_clientaddr,
+	Opt_addr, Opt_mountaddr, Opt_clientaddr,
 
 	/* Mount options that are ignored */
 	Opt_userspace, Opt_deprecated,
@@ -175,7 +176,8 @@ static match_table_t nfs_mount_option_tokens = {
 	{ Opt_mountproto, "mountproto=%s" },
 	{ Opt_addr, "addr=%s" },
 	{ Opt_clientaddr, "clientaddr=%s" },
-	{ Opt_mounthost, "mounthost=%s" },
+	{ Opt_userspace, "mounthost=%s" },
+	{ Opt_mountaddr, "mountaddr=%s" },
 
 	{ Opt_err, NULL }
 };
@@ -961,7 +963,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			mnt->client_address = string;
 			break;
-		case Opt_mounthost:
+		case Opt_mountaddr:
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
-- 
cgit v1.2.3


From fe82a183ca3c9188945c4ebeebc2ca45abfa24e5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:01:10 -0400
Subject: NFS: Convert printk's to dprintk's in fs/nfs/nfs?xdr.c

Due to recent edict to replace or remove printk's that can be triggered en
masse by remote misbehavior.  Left a few that only occur just before a BUG.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c | 18 +++++++++---------
 fs/nfs/nfs3xdr.c | 23 ++++++++++++-----------
 fs/nfs/nfs4xdr.c | 43 +++++++++++++++++++++----------------------
 3 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c5fce7567200..1c570948fc16 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -271,7 +271,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 	res->eof = 0;
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READ reply header overflowed:"
+		dprintk("NFS: READ reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
@@ -281,7 +281,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
-		printk(KERN_WARNING "NFS: server cheating in read reply: "
+		dprintk("NFS: server cheating in read reply: "
 			"count %d > recvd %d\n", count, recvd);
 		count = recvd;
 	}
@@ -431,7 +431,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READDIR reply header overflowed:"
+		dprintk("NFS: READDIR reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
@@ -454,7 +454,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 		len = ntohl(*p++);
 		p += XDR_QUADLEN(len) + 1;	/* name plus cookie */
 		if (len > NFS2_MAXNAMLEN) {
-			printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)!\n",
+			dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
 						len);
 			goto err_unmap;
 		}
@@ -471,7 +471,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	entry[0] = entry[1] = 0;
 	/* truncate listing ? */
 	if (!nr) {
-		printk(KERN_NOTICE "NFS: readdir reply truncated!\n");
+		dprintk("NFS: readdir reply truncated!\n");
 		entry[1] = 1;
 	}
 	goto out;
@@ -583,12 +583,12 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	/* Convert length of symlink */
 	len = ntohl(*p++);
 	if (len >= rcvbuf->page_len || len <= 0) {
-		dprintk(KERN_WARNING "nfs: server returned giant symlink!\n");
+		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
 	}
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READLINK reply header overflowed:"
+		dprintk("NFS: READLINK reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
@@ -597,7 +597,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	}
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (recvd < len) {
-		printk(KERN_WARNING "NFS: server cheating in readlink reply: "
+		dprintk("NFS: server cheating in readlink reply: "
 				"count %u > recvd %u\n", len, recvd);
 		return -EIO;
 	}
@@ -695,7 +695,7 @@ nfs_stat_to_errno(int stat)
 		if (nfs_errtbl[i].stat == stat)
 			return nfs_errtbl[i].errno;
 	}
-	printk(KERN_ERR "nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
+	dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
 	return nfs_errtbl[i].errno;
 }
 
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d9e08f0cf2a0..8a225fbe9ee2 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -524,7 +524,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READDIR reply header overflowed:"
+		dprintk("NFS: READDIR reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
@@ -547,7 +547,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 		len = ntohl(*p++);		/* string length */
 		p += XDR_QUADLEN(len) + 2;	/* name + cookie */
 		if (len > NFS3_MAXNAMLEN) {
-			printk(KERN_WARNING "NFS: giant filename in readdir (len %x)!\n",
+			dprintk("NFS: giant filename in readdir (len %x)!\n",
 						len);
 			goto err_unmap;
 		}
@@ -567,7 +567,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 					goto short_pkt;
 				len = ntohl(*p++);
 				if (len > NFS3_FHSIZE) {
-					printk(KERN_WARNING "NFS: giant filehandle in "
+					dprintk("NFS: giant filehandle in "
 						"readdir (len %x)!\n", len);
 					goto err_unmap;
 				}
@@ -588,7 +588,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 	entry[0] = entry[1] = 0;
 	/* truncate listing ? */
 	if (!nr) {
-		printk(KERN_NOTICE "NFS: readdir reply truncated!\n");
+		dprintk("NFS: readdir reply truncated!\n");
 		entry[1] = 1;
 	}
 	goto out;
@@ -826,22 +826,23 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 	/* Convert length of symlink */
 	len = ntohl(*p++);
 	if (len >= rcvbuf->page_len || len <= 0) {
-		dprintk(KERN_WARNING "nfs: server returned giant symlink!\n");
+		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
 	}
 
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READLINK reply header overflowed:"
+		dprintk("NFS: READLINK reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
+		dprintk("NFS: READLINK header is short. "
+			"iovec will be shifted.\n");
 		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
 	}
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (recvd < len) {
-		printk(KERN_WARNING "NFS: server cheating in readlink reply: "
+		dprintk("NFS: server cheating in readlink reply: "
 				"count %u > recvd %u\n", len, recvd);
 		return -EIO;
 	}
@@ -876,13 +877,13 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 	ocount   = ntohl(*p++);
 
 	if (ocount != count) {
-		printk(KERN_WARNING "NFS: READ count doesn't match RPC opaque count.\n");
+		dprintk("NFS: READ count doesn't match RPC opaque count.\n");
 		return -errno_NFSERR_IO;
 	}
 
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
-		printk(KERN_WARNING "NFS: READ reply header overflowed:"
+		dprintk("NFS: READ reply header overflowed:"
 				"length %d > %Zu\n", hdrlen, iov->iov_len);
        		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
@@ -892,7 +893,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
-		printk(KERN_WARNING "NFS: server cheating in read reply: "
+		dprintk("NFS: server cheating in read reply: "
 			"count %d > recvd %d\n", count, recvd);
 		count = recvd;
 		res->eof = 0;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index badd73b7ca12..4dbbf44727ea 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -562,7 +562,6 @@ struct compound_hdr {
 
 #define RESERVE_SPACE(nbytes)	do {				\
 	p = xdr_reserve_space(xdr, nbytes);			\
-	if (!p) printk("RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \
 	BUG_ON(!p);						\
 } while (0)
 
@@ -628,8 +627,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
 	if (iap->ia_valid & ATTR_UID) {
 		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
 		if (owner_namelen < 0) {
-			printk(KERN_WARNING "nfs: couldn't resolve uid %d to string\n",
-			       iap->ia_uid);
+			dprintk("nfs: couldn't resolve uid %d to string\n",
+					iap->ia_uid);
 			/* XXX */
 			strcpy(owner_name, "nobody");
 			owner_namelen = sizeof("nobody") - 1;
@@ -640,8 +639,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
 	if (iap->ia_valid & ATTR_GID) {
 		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
 		if (owner_grouplen < 0) {
-			printk(KERN_WARNING "nfs4: couldn't resolve gid %d to string\n",
-			       iap->ia_gid);
+			dprintk("nfs: couldn't resolve gid %d to string\n",
+					iap->ia_gid);
 			strcpy(owner_group, "nobody");
 			owner_grouplen = sizeof("nobody") - 1;
 			/* goto out; */
@@ -711,7 +710,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
 	 * Now we backfill the bitmap and the attribute buffer length.
 	 */
 	if (len != ((char *)p - (char *)q) + 4) {
-		printk ("encode_attr: Attr length calculation error! %u != %Zu\n",
+		printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n",
 				len, ((char *)p - (char *)q) + 4);
 		BUG();
 	}
@@ -2180,9 +2179,9 @@ out:
 #define READ_BUF(nbytes)  do { \
 	p = xdr_inline_decode(xdr, nbytes); \
 	if (unlikely(!p)) { \
-		printk(KERN_INFO "%s: prematurely hit end of receive" \
+		dprintk("nfs: %s: prematurely hit end of receive" \
 				" buffer\n", __FUNCTION__); \
-		printk(KERN_INFO "%s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
+		dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
 				__FUNCTION__, xdr->p, nbytes, xdr->end); \
 		return -EIO; \
 	} \
@@ -2223,9 +2222,8 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	READ_BUF(8);
 	READ32(opnum);
 	if (opnum != expected) {
-		printk(KERN_NOTICE
-				"nfs4_decode_op_hdr: Server returned operation"
-			       	" %d but we issued a request for %d\n",
+		dprintk("nfs: Server returned operation"
+			" %d but we issued a request for %d\n",
 				opnum, expected);
 		return -EIO;
 	}
@@ -2758,7 +2756,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 				dprintk("%s: nfs_map_name_to_uid failed!\n",
 						__FUNCTION__);
 		} else
-			printk(KERN_WARNING "%s: name too long (%u)!\n",
+			dprintk("%s: name too long (%u)!\n",
 					__FUNCTION__, len);
 		bitmap[1] &= ~FATTR4_WORD1_OWNER;
 	}
@@ -2783,7 +2781,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 				dprintk("%s: nfs_map_group_to_gid failed!\n",
 						__FUNCTION__);
 		} else
-			printk(KERN_WARNING "%s: name too long (%u)!\n",
+			dprintk("%s: name too long (%u)!\n",
 					__FUNCTION__, len);
 		bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
 	}
@@ -2950,7 +2948,8 @@ static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrl
 	unsigned int nwords = xdr->p - savep;
 
 	if (unlikely(attrwords != nwords)) {
-		printk(KERN_WARNING "%s: server returned incorrect attribute length: %u %c %u\n",
+		dprintk("%s: server returned incorrect attribute length: "
+			"%u %c %u\n",
 				__FUNCTION__,
 				attrwords << 2,
 				(attrwords < nwords) ? '<' : '>',
@@ -3451,7 +3450,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
-		printk(KERN_WARNING "NFS: server cheating in read reply: "
+		dprintk("NFS: server cheating in read reply: "
 				"count %u > recvd %u\n", count, recvd);
 		count = recvd;
 		eof = 0;
@@ -3500,7 +3499,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 		p += 2;			/* cookie */
 		len = ntohl(*p++);	/* filename length */
 		if (len > NFS4_MAXNAMLEN) {
-			printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len);
+			dprintk("NFS: giant filename in readdir (len 0x%x)\n",
+					len);
 			goto err_unmap;
 		}
 		xlen = XDR_QUADLEN(len);
@@ -3528,7 +3528,7 @@ short_pkt:
 	entry[0] = entry[1] = 0;
 	/* truncate listing ? */
 	if (!nr) {
-		printk(KERN_NOTICE "NFS: readdir reply truncated!\n");
+		dprintk("NFS: readdir reply truncated!\n");
 		entry[1] = 1;
 	}
 	goto out;
@@ -3554,13 +3554,13 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	READ_BUF(4);
 	READ32(len);
 	if (len >= rcvbuf->page_len || len <= 0) {
-		dprintk(KERN_WARNING "nfs: server returned giant symlink!\n");
+		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
 	}
 	hdrlen = (char *) xdr->p - (char *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (recvd < len) {
-		printk(KERN_WARNING "NFS: server cheating in readlink reply: "
+		dprintk("NFS: server cheating in readlink reply: "
 				"count %u > recvd %u\n", len, recvd);
 		return -EIO;
 	}
@@ -3643,7 +3643,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 		hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
 		recvd = req->rq_rcv_buf.len - hdrlen;
 		if (attrlen > recvd) {
-			printk(KERN_WARNING "NFS: server cheating in getattr"
+			dprintk("NFS: server cheating in getattr"
 					" acl reply: attrlen %u > recvd %u\n",
 					attrlen, recvd);
 			return -EINVAL;
@@ -3688,8 +3688,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	READ_BUF(8);
 	READ32(opnum);
 	if (opnum != OP_SETCLIENTID) {
-		printk(KERN_NOTICE
-				"nfs4_decode_setclientid: Server returned operation"
+		dprintk("nfs: decode_setclientid: Server returned operation"
 			       	" %d\n", opnum);
 		return -EIO;
 	}
-- 
cgit v1.2.3


From e159a08b6ab14e255536fddae75d448395295c6f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:01:15 -0400
Subject: LOCKD: Convert printk's to dprintk's in lockd XDR routines

Due to recent edict to remove or replace printk's that might flood the
system log.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/xdr.c  | 8 ++++----
 fs/lockd/xdr4.c | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 5316e307a49d..633653bff944 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -62,8 +62,9 @@ static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c)
 	}
 	else 
 	{
-		printk(KERN_NOTICE
-			"lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN);
+		dprintk("lockd: bad cookie size %d (only cookies under "
+			"%d bytes are supported.)\n",
+				len, NLM_MAXCOOKIELEN);
 		return NULL;
 	}
 	return p;
@@ -84,8 +85,7 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
 	unsigned int	len;
 
 	if ((len = ntohl(*p++)) != NFS2_FHSIZE) {
-		printk(KERN_NOTICE
-			"lockd: bad fhandle size %d (should be %d)\n",
+		dprintk("lockd: bad fhandle size %d (should be %d)\n",
 			len, NFS2_FHSIZE);
 		return NULL;
 	}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 846fc1d639dd..43ff9397e6c6 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -64,8 +64,9 @@ nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c)
 	}
 	else 
 	{
-		printk(KERN_NOTICE
-			"lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN);
+		dprintk("lockd: bad cookie size %d (only cookies under "
+			"%d bytes are supported.)\n",
+				len, NLM_MAXCOOKIELEN);
 		return NULL;
 	}
 	return p;
@@ -86,8 +87,7 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
 	memset(f->data, 0, sizeof(f->data));
 	f->size = ntohl(*p++);
 	if (f->size > NFS_MAXFHSIZE) {
-		printk(KERN_NOTICE
-			"lockd: bad fhandle size %d (should be <=%d)\n",
+		dprintk("lockd: bad fhandle size %d (should be <=%d)\n",
 			f->size, NFS_MAXFHSIZE);
 		return NULL;
 	}
-- 
cgit v1.2.3


From 817cb9d43d4c330f9fc023d96e5beaa1abe8c4b7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 11 Sep 2007 18:01:20 -0400
Subject: NFSD: Convert printk's to dprintk's in NFSD's nfs4xdr

Due to recent edict to remove or replace printk's that can flood the system
log.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/nfs4xdr.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 8ef0964179bc..163a76c21ac5 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -102,7 +102,8 @@ check_filename(char *str, int len, __be32 err)
 out:						\
 	return status;				\
 xdr_error:					\
-	printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__);	\
+	dprintk("NFSD: xdr error (%s:%d)\n",	\
+			__FILE__, __LINE__);	\
 	status = nfserr_bad_xdr;		\
 	goto out
 
@@ -124,7 +125,8 @@ xdr_error:					\
 	if (!(x = (p==argp->tmp || p == argp->tmpp) ? \
  		savemem(argp, p, nbytes) :	\
  		(char *)p)) {			\
-		printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \
+		dprintk("NFSD: xdr error (%s:%d)\n", \
+				__FILE__, __LINE__); \
 		goto xdr_error;			\
 		}				\
 	p += XDR_QUADLEN(nbytes);		\
@@ -140,7 +142,8 @@ xdr_error:					\
 		p = argp->p;			\
 		argp->p += XDR_QUADLEN(nbytes);	\
 	} else if (!(p = read_buf(argp, nbytes))) { \
-		printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \
+		dprintk("NFSD: xdr error (%s:%d)\n", \
+				__FILE__, __LINE__); \
 		goto xdr_error;			\
 	}					\
 } while (0)
@@ -948,7 +951,8 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
 	 */
 	avail = (char*)argp->end - (char*)argp->p;
 	if (avail + argp->pagelen < write->wr_buflen) {
-		printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); 
+		dprintk("NFSD: xdr error (%s:%d)\n",
+				__FILE__, __LINE__);
 		goto xdr_error;
 	}
 	argp->rqstp->rq_vec[0].iov_base = p;
@@ -1019,7 +1023,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 		argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
 		if (!argp->ops) {
 			argp->ops = argp->iops;
-			printk(KERN_INFO "nfsd: couldn't allocate room for COMPOUND\n");
+			dprintk("nfsd: couldn't allocate room for COMPOUND\n");
 			goto xdr_error;
 		}
 	}
@@ -1326,7 +1330,7 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
 	path = exp->ex_path;
 
 	if (strncmp(path, rootpath, strlen(rootpath))) {
-		printk("nfsd: fs_locations failed;"
+		dprintk("nfsd: fs_locations failed;"
 			"%s is not contained in %s\n", path, rootpath);
 		*stat = nfserr_notsupp;
 		return NULL;
-- 
cgit v1.2.3


From 4f40ee4a02a2d017b714d5b2faaf5c25bf9eae47 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:42:38 -0400
Subject: SUNRPC: move per-transport rpcbind netid's

Move the TCP/UDP rpcbind netid's from the rpcbind client to a global header.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/msg_prot.h | 13 +++++++++++++
 net/sunrpc/rpcb_clnt.c          | 26 +++-----------------------
 2 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
index 784d4c3ef651..c4beb5775111 100644
--- a/include/linux/sunrpc/msg_prot.h
+++ b/include/linux/sunrpc/msg_prot.h
@@ -138,6 +138,19 @@ typedef __be32	rpc_fraghdr;
 #define RPC_MAX_HEADER_WITH_AUTH \
 	(RPC_CALLHDRSIZE + 2*(2+RPC_MAX_AUTH_SIZE/4))
 
+/*
+ * RFC1833/RFC3530 rpcbind (v3+) well-known netid's.
+ */
+#define RPCBIND_NETID_UDP	"udp"
+#define RPCBIND_NETID_TCP	"tcp"
+#define RPCBIND_NETID_UDP6	"udp6"
+#define RPCBIND_NETID_TCP6	"tcp6"
+
+/*
+ * Note that RFC 1833 does not put any size restrictions on the
+ * netid string, but all currently defined netid's fit in 4 bytes.
+ */
+#define RPCBIND_MAXNETIDLEN	(4u)
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_MSGPROT_H_ */
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index d7b9f02489e6..b028a0ecd593 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -92,26 +92,6 @@ enum {
  */
 #define RPCB_MAXADDRLEN		(128u)
 
-/*
- * r_netid
- *
- * Quoting RFC 3530, section 2.2:
- *
- * For TCP over IPv4 the value of r_netid is the string "tcp".  For UDP
- * over IPv4 the value of r_netid is the string "udp".
- *
- * ...
- *
- * For TCP over IPv6 the value of r_netid is the string "tcp6".  For UDP
- * over IPv6 the value of r_netid is the string "udp6".
- */
-#define RPCB_NETID_UDP	"\165\144\160"		/* "udp" */
-#define RPCB_NETID_TCP	"\164\143\160"		/* "tcp" */
-#define RPCB_NETID_UDP6	"\165\144\160\066"	/* "udp6" */
-#define RPCB_NETID_TCP6	"\164\143\160\066"	/* "tcp6" */
-
-#define RPCB_MAXNETIDLEN	(4u)
-
 /*
  * r_owner
  *
@@ -408,8 +388,8 @@ void rpcb_getport_async(struct rpc_task *task)
 	map->r_prot = xprt->prot;
 	map->r_port = 0;
 	map->r_xprt = xprt_get(xprt);
-	map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCB_NETID_TCP :
-						   RPCB_NETID_UDP;
+	map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCBIND_NETID_TCP :
+						     RPCBIND_NETID_UDP;
 	memcpy(map->r_addr,
 	       rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR),
 	       sizeof(map->r_addr));
@@ -587,7 +567,7 @@ out_err:
 #define RPCB_port_sz		(1u)
 #define RPCB_boolean_sz		(1u)
 
-#define RPCB_netid_sz		(1+XDR_QUADLEN(RPCB_MAXNETIDLEN))
+#define RPCB_netid_sz		(1+XDR_QUADLEN(RPCBIND_MAXNETIDLEN))
 #define RPCB_addr_sz		(1+XDR_QUADLEN(RPCB_MAXADDRLEN))
 #define RPCB_ownerstring_sz	(1+XDR_QUADLEN(RPCB_MAXOWNERLEN))
 
-- 
cgit v1.2.3


From 4417c8c41a51a2ae95b2a2fa2811640b368c4151 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:43:05 -0400
Subject: SUNRPC: export per-transport rpcbind netid's

The rpcbind (v3+) netid is provided by each RPC client transport. This fixes
an omission in IPv6 rpcbind client support, and enables future extension.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h | 1 +
 net/sunrpc/rpcb_clnt.c      | 3 +--
 net/sunrpc/xprtsock.c       | 8 ++++++++
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 902a9c0a2d9a..513b0657e14d 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -56,6 +56,7 @@ enum rpc_display_format_t {
 	RPC_DISPLAY_HEX_ADDR,
 	RPC_DISPLAY_HEX_PORT,
 	RPC_DISPLAY_UNIVERSAL_ADDR,
+	RPC_DISPLAY_NETID,
 	RPC_DISPLAY_MAX,
 };
 
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index b028a0ecd593..6f0af08a51dc 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -388,8 +388,7 @@ void rpcb_getport_async(struct rpc_task *task)
 	map->r_prot = xprt->prot;
 	map->r_port = 0;
 	map->r_xprt = xprt_get(xprt);
-	map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCBIND_NETID_TCP :
-						     RPCBIND_NETID_UDP;
+	map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID);
 	memcpy(map->r_addr,
 	       rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR),
 	       sizeof(map->r_addr));
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7a154e4b70f5..5bdce8fa2d56 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -337,6 +337,10 @@ static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
 				ntohs(addr->sin_port) & 0xff);
 	}
 	xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
+
+	xprt->address_strings[RPC_DISPLAY_NETID] =
+		kstrdup(xprt->prot == IPPROTO_UDP ?
+			RPCBIND_NETID_UDP : RPCBIND_NETID_TCP, GFP_KERNEL);
 }
 
 static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
@@ -398,6 +402,10 @@ static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
 				ntohs(addr->sin6_port) & 0xff);
 	}
 	xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
+
+	xprt->address_strings[RPC_DISPLAY_NETID] =
+		kstrdup(xprt->prot == IPPROTO_UDP ?
+			RPCBIND_NETID_UDP6 : RPCBIND_NETID_TCP6, GFP_KERNEL);
 }
 
 static void xs_free_peer_addresses(struct rpc_xprt *xprt)
-- 
cgit v1.2.3


From 6b18eaa0821a559c5e2b7ed4b90f8aca5a8e6228 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:43:29 -0400
Subject: NFS: move nfs_parsed_mount_data structure definition

In preparation for rearranging the nfs mount argument passing, make the
nfs_parsed_mount_data struct visible across nfs kernel files.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h | 33 +++++++++++++++++++++++++++++++++
 fs/nfs/super.c    | 31 -------------------------------
 2 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 76cf55d57101..9c2d0fbb9f89 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -27,6 +27,39 @@ struct nfs_clone_mount {
 	rpc_authflavor_t authflavor;
 };
 
+/*
+ * In-kernel mount arguments
+ */
+struct nfs_parsed_mount_data {
+	int			flags;
+	int			rsize, wsize;
+	int			timeo, retrans;
+	int			acregmin, acregmax,
+				acdirmin, acdirmax;
+	int			namlen;
+	unsigned int		bsize;
+	unsigned int		auth_flavor_len;
+	rpc_authflavor_t	auth_flavors[1];
+	char			*client_address;
+
+	struct {
+		struct sockaddr_in	address;
+		char			*hostname;
+		unsigned int		program;
+		unsigned int		version;
+		unsigned short		port;
+		int			protocol;
+	} mount_server;
+
+	struct {
+		struct sockaddr_in	address;
+		char			*hostname;
+		char			*export_path;
+		unsigned int		program;
+		int			protocol;
+	} nfs_server;
+};
+
 /* client.c */
 extern struct rpc_program nfs_program;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a955821b8499..2fd28c4df1d4 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -58,37 +58,6 @@
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
-
-struct nfs_parsed_mount_data {
-	int			flags;
-	int			rsize, wsize;
-	int			timeo, retrans;
-	int			acregmin, acregmax,
-				acdirmin, acdirmax;
-	int			namlen;
-	unsigned int		bsize;
-	unsigned int		auth_flavor_len;
-	rpc_authflavor_t	auth_flavors[1];
-	char			*client_address;
-
-	struct {
-		struct sockaddr_in	address;
-		char			*hostname;
-		unsigned int		program;
-		unsigned int		version;
-		unsigned short		port;
-		int			protocol;
-	} mount_server;
-
-	struct {
-		struct sockaddr_in	address;
-		char			*hostname;
-		char			*export_path;
-		unsigned int		program;
-		int			protocol;
-	} nfs_server;
-};
-
 enum {
 	/* Mount options that take no arguments */
 	Opt_soft, Opt_hard,
-- 
cgit v1.2.3


From 2283f8d6ed21ea2221df4cc329314b93f35351b0 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:43:56 -0400
Subject: NFS: use in-kernel mount argument structure for nfsv[23] mounts

The user-visible nfs_mount_data does not contain sufficient data to
describe new mount options, and also is now a legacy structure. Replace
it with the internal nfs_parsed_mount_data for nfsv[23] in-kernel use.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c   |  18 ++++----
 fs/nfs/internal.h |   6 +--
 fs/nfs/super.c    | 121 +++++++++++++++++++++++++-----------------------------
 3 files changed, 70 insertions(+), 75 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a204484072f3..f51eabff3cd0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -501,9 +501,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
 /*
  * Initialise an NFS2 or NFS3 client
  */
-static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *data)
+static int nfs_init_client(struct nfs_client *clp,
+			   const struct nfs_parsed_mount_data *data)
 {
-	int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
 	int error;
 
 	if (clp->cl_cons_state == NFS_CS_READY) {
@@ -522,8 +522,8 @@ static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *
 	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
 	 * - RFC 2623, sec 2.3.2
 	 */
-	error = nfs_create_rpc_client(clp, proto, data->timeo, data->retrans,
-					RPC_AUTH_UNIX, 0);
+	error = nfs_create_rpc_client(clp, data->nfs_server.protocol,
+				data->timeo, data->retrans, RPC_AUTH_UNIX, 0);
 	if (error < 0)
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -538,7 +538,8 @@ error:
 /*
  * Create a version 2 or 3 client
  */
-static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_data *data)
+static int nfs_init_server(struct nfs_server *server,
+			   const struct nfs_parsed_mount_data *data)
 {
 	struct nfs_client *clp;
 	int error, nfsvers = 2;
@@ -551,7 +552,8 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
 #endif
 
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(data->hostname, &data->addr, nfsvers);
+	clp = nfs_get_client(data->nfs_server.hostname,
+				&data->nfs_server.address, nfsvers);
 	if (IS_ERR(clp)) {
 		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
 		return PTR_ERR(clp);
@@ -581,7 +583,7 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
 	if (error < 0)
 		goto error;
 
-	error = nfs_init_server_rpcclient(server, data->pseudoflavor);
+	error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
 	if (error < 0)
 		goto error;
 
@@ -760,7 +762,7 @@ void nfs_free_server(struct nfs_server *server)
  * Create a version 2 or 3 volume record
  * - keyed on server and FSID
  */
-struct nfs_server *nfs_create_server(const struct nfs_mount_data *data,
+struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 				     struct nfs_fh *mntfh)
 {
 	struct nfs_server *server;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9c2d0fbb9f89..d28e54e4dee4 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,7 +5,6 @@
 #include <linux/mount.h>
 
 struct nfs_string;
-struct nfs_mount_data;
 struct nfs4_mount_data;
 
 /* Maximum number of readahead requests
@@ -65,8 +64,9 @@ extern struct rpc_program nfs_program;
 
 extern void nfs_put_client(struct nfs_client *);
 extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
-extern struct nfs_server *nfs_create_server(const struct nfs_mount_data *,
-					    struct nfs_fh *);
+extern struct nfs_server *nfs_create_server(
+					const struct nfs_parsed_mount_data *,
+					struct nfs_fh *);
 extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *,
 					     const char *,
 					     const struct sockaddr_in *,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2fd28c4df1d4..36a595a63536 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1041,15 +1041,31 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
  * XXX: as far as I can tell, changing the NFS program number is not
  *      supported in the NFS client.
  */
-static int nfs_validate_mount_data(struct nfs_mount_data **options,
+static int nfs_validate_mount_data(void *options,
+				   struct nfs_parsed_mount_data *args,
 				   struct nfs_fh *mntfh,
 				   const char *dev_name)
 {
-	struct nfs_mount_data *data = *options;
+	struct nfs_mount_data *data = (struct nfs_mount_data *)options;
 
 	if (data == NULL)
 		goto out_no_data;
 
+	memset(args, 0, sizeof(*args));
+	args->flags		= (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
+	args->rsize		= NFS_MAX_FILE_IO_SIZE;
+	args->wsize		= NFS_MAX_FILE_IO_SIZE;
+	args->timeo		= 600;
+	args->retrans		= 2;
+	args->acregmin		= 3;
+	args->acregmax		= 60;
+	args->acdirmin		= 30;
+	args->acdirmax		= 60;
+	args->mount_server.protocol = IPPROTO_UDP;
+	args->mount_server.program = NFS_MNT_PROGRAM;
+	args->nfs_server.protocol = IPPROTO_TCP;
+	args->nfs_server.program = NFS_PROGRAM;
+
 	switch (data->version) {
 	case 1:
 		data->namlen = 0;
@@ -1078,90 +1094,67 @@ static int nfs_validate_mount_data(struct nfs_mount_data **options,
 		if (mntfh->size < sizeof(mntfh->data))
 			memset(mntfh->data + mntfh->size, 0,
 			       sizeof(mntfh->data) - mntfh->size);
+		/*
+		 * Translate to nfs_parsed_mount_data, which nfs_fill_super
+		 * can deal with.
+		 */
+		args->flags		= data->flags;
+		args->rsize		= data->rsize;
+		args->wsize		= data->wsize;
+		args->flags		= data->flags;
+		args->timeo		= data->timeo;
+		args->retrans		= data->retrans;
+		args->acregmin		= data->acregmin;
+		args->acregmax		= data->acregmax;
+		args->acdirmin		= data->acdirmin;
+		args->acdirmax		= data->acdirmax;
+		args->nfs_server.address = data->addr;
+		if (!(data->flags & NFS_MOUNT_TCP))
+			args->nfs_server.protocol = IPPROTO_UDP;
+		/* N.B. caller will free nfs_server.hostname in all cases */
+		args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
+		args->namlen		= data->namlen;
+		args->bsize		= data->bsize;
+		args->auth_flavors[0]	= data->pseudoflavor;
 		break;
 	default: {
 		unsigned int len;
 		char *c;
 		int status;
-		struct nfs_parsed_mount_data args = {
-			.flags		= (NFS_MOUNT_VER3 | NFS_MOUNT_TCP),
-			.rsize		= NFS_MAX_FILE_IO_SIZE,
-			.wsize		= NFS_MAX_FILE_IO_SIZE,
-			.timeo		= 600,
-			.retrans	= 2,
-			.acregmin	= 3,
-			.acregmax	= 60,
-			.acdirmin	= 30,
-			.acdirmax	= 60,
-			.mount_server.protocol = IPPROTO_UDP,
-			.mount_server.program = NFS_MNT_PROGRAM,
-			.nfs_server.protocol = IPPROTO_TCP,
-			.nfs_server.program = NFS_PROGRAM,
-		};
 
-		if (nfs_parse_mount_options((char *) *options, &args) == 0)
+		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
-		data = kzalloc(sizeof(*data), GFP_KERNEL);
-		if (data == NULL)
-			return -ENOMEM;
-
-		/*
-		 * NB: after this point, caller will free "data"
-		 * if we return an error
-		 */
-		*options = data;
-
 		c = strchr(dev_name, ':');
 		if (c == NULL)
 			return -EINVAL;
 		len = c - dev_name;
-		if (len > sizeof(data->hostname))
-			return -ENAMETOOLONG;
-		strncpy(data->hostname, dev_name, len);
-		args.nfs_server.hostname = data->hostname;
+		/* N.B. caller will free nfs_server.hostname in all cases */
+		args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
 
 		c++;
 		if (strlen(c) > NFS_MAXPATHLEN)
 			return -ENAMETOOLONG;
-		args.nfs_server.export_path = c;
+		args->nfs_server.export_path = c;
 
-		status = nfs_try_mount(&args, mntfh);
+		status = nfs_try_mount(args, mntfh);
 		if (status)
 			return status;
 
-		/*
-		 * Translate to nfs_mount_data, which nfs_fill_super
-		 * can deal with.
-		 */
-		data->version		= 6;
-		data->flags		= args.flags;
-		data->rsize		= args.rsize;
-		data->wsize		= args.wsize;
-		data->timeo		= args.timeo;
-		data->retrans		= args.retrans;
-		data->acregmin		= args.acregmin;
-		data->acregmax		= args.acregmax;
-		data->acdirmin		= args.acdirmin;
-		data->acdirmax		= args.acdirmax;
-		data->addr		= args.nfs_server.address;
-		data->namlen		= args.namlen;
-		data->bsize		= args.bsize;
-		data->pseudoflavor	= args.auth_flavors[0];
-
 		break;
 		}
 	}
 
-	if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
-		data->pseudoflavor = RPC_AUTH_UNIX;
+	if (!(args->flags & NFS_MOUNT_SECFLAVOUR))
+		args->auth_flavors[0] = RPC_AUTH_UNIX;
 
 #ifndef CONFIG_NFS_V3
-	if (data->flags & NFS_MOUNT_VER3)
+	if (args->flags & NFS_MOUNT_VER3)
 		goto out_v3_not_compiled;
 #endif /* !CONFIG_NFS_V3 */
 
-	if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
+	if (!nfs_verify_server_address((struct sockaddr *)
+						&args->nfs_server.address))
 		goto out_no_address;
 
 	return 0;
@@ -1220,7 +1213,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
 /*
  * Finish setting up an NFS2/3 superblock
  */
-static void nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data)
+static void nfs_fill_super(struct super_block *sb,
+			   struct nfs_parsed_mount_data *data)
 {
 	struct nfs_server *server = NFS_SB(sb);
 
@@ -1341,7 +1335,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	struct nfs_server *server = NULL;
 	struct super_block *s;
 	struct nfs_fh mntfh;
-	struct nfs_mount_data *data = raw_data;
+	struct nfs_parsed_mount_data data;
 	struct dentry *mntroot;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
@@ -1350,12 +1344,12 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	int error;
 
 	/* Validate the mount data */
-	error = nfs_validate_mount_data(&data, &mntfh, dev_name);
+	error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name);
 	if (error < 0)
 		goto out;
 
 	/* Get a volume representation */
-	server = nfs_create_server(data, &mntfh);
+	server = nfs_create_server(&data, &mntfh);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out;
@@ -1379,7 +1373,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		nfs_fill_super(s, data);
+		nfs_fill_super(s, &data);
 	}
 
 	mntroot = nfs_get_root(s, &mntfh);
@@ -1394,8 +1388,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	error = 0;
 
 out:
-	if (data != raw_data)
-		kfree(data);
+	kfree(data.nfs_server.hostname);
 	return error;
 
 out_err_nosb:
-- 
cgit v1.2.3


From 91ea40b9c6303ddab5c84f078f96b29084b45817 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:44:33 -0400
Subject: NFS: use in-kernel mount argument structure for nfsv4 mounts

The user-visible nfs4_mount_data does not contain sufficient data to
describe new mount options, and also is now a legacy structure. Replace
it with the internal nfs_parsed_mount_data for nfsv4 in-kernel use.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c   |  24 ++++-----
 fs/nfs/internal.h |  11 ++--
 fs/nfs/super.c    | 146 ++++++++++++++++++++++++------------------------------
 3 files changed, 81 insertions(+), 100 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f51eabff3cd0..2edcbca1a161 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -908,7 +908,7 @@ error:
  * Create a version 4 volume record
  */
 static int nfs4_init_server(struct nfs_server *server,
-		const struct nfs4_mount_data *data, rpc_authflavor_t authflavour)
+		const struct nfs_parsed_mount_data *data)
 {
 	int error;
 
@@ -928,7 +928,7 @@ static int nfs4_init_server(struct nfs_server *server,
 	server->acdirmin = data->acdirmin * HZ;
 	server->acdirmax = data->acdirmax * HZ;
 
-	error = nfs_init_server_rpcclient(server, authflavour);
+	error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
 
 	/* Done */
 	dprintk("<-- nfs4_init_server() = %d\n", error);
@@ -939,12 +939,7 @@ static int nfs4_init_server(struct nfs_server *server,
  * Create a version 4 volume record
  * - keyed on server and FSID
  */
-struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
-				      const char *hostname,
-				      const struct sockaddr_in *addr,
-				      const char *mntpath,
-				      const char *ip_addr,
-				      rpc_authflavor_t authflavour,
+struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 				      struct nfs_fh *mntfh)
 {
 	struct nfs_fattr fattr;
@@ -958,13 +953,18 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
 		return ERR_PTR(-ENOMEM);
 
 	/* Get a client record */
-	error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour,
-			data->proto, data->timeo, data->retrans);
+	error = nfs4_set_client(server,
+			data->nfs_server.hostname,
+			&data->nfs_server.address,
+			data->client_address,
+			data->auth_flavors[0],
+			data->nfs_server.protocol,
+			data->timeo, data->retrans);
 	if (error < 0)
 		goto error;
 
 	/* set up the general RPC client */
-	error = nfs4_init_server(server, data, authflavour);
+	error = nfs4_init_server(server, data);
 	if (error < 0)
 		goto error;
 
@@ -973,7 +973,7 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
 
 	/* Probe the root fh to retrieve its FSID */
-	error = nfs4_path_walk(server, mntfh, mntpath);
+	error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
 	if (error < 0)
 		goto error;
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d28e54e4dee4..f3acf48412be 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,7 +5,6 @@
 #include <linux/mount.h>
 
 struct nfs_string;
-struct nfs4_mount_data;
 
 /* Maximum number of readahead requests
  * FIXME: this should really be a sysctl so that users may tune it to suit
@@ -67,13 +66,9 @@ extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
 extern struct nfs_server *nfs_create_server(
 					const struct nfs_parsed_mount_data *,
 					struct nfs_fh *);
-extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *,
-					     const char *,
-					     const struct sockaddr_in *,
-					     const char *,
-					     const char *,
-					     rpc_authflavor_t,
-					     struct nfs_fh *);
+extern struct nfs_server *nfs4_create_server(
+					const struct nfs_parsed_mount_data *,
+					struct nfs_fh *);
 extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
 						      struct nfs_fh *);
 extern void nfs_free_server(struct nfs_server *server);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 36a595a63536..63742bbcb1ce 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1514,38 +1514,49 @@ static void nfs4_fill_super(struct super_block *sb)
 /*
  * Validate NFSv4 mount options
  */
-static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
-				    const char *dev_name,
-				    struct sockaddr_in *addr,
-				    rpc_authflavor_t *authflavour,
-				    char **hostname,
-				    char **mntpath,
-				    char **ip_addr)
+static int nfs4_validate_mount_data(void *options,
+				    struct nfs_parsed_mount_data *args,
+				    const char *dev_name)
 {
-	struct nfs4_mount_data *data = *options;
+	struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
 	char *c;
 
 	if (data == NULL)
 		goto out_no_data;
 
+	memset(args, 0, sizeof(*args));
+	args->rsize		= NFS_MAX_FILE_IO_SIZE;
+	args->wsize		= NFS_MAX_FILE_IO_SIZE;
+	args->timeo		= 600;
+	args->retrans		= 2;
+	args->acregmin		= 3;
+	args->acregmax		= 60;
+	args->acdirmin		= 30;
+	args->acdirmax		= 60;
+	args->nfs_server.protocol = IPPROTO_TCP;
+
 	switch (data->version) {
 	case 1:
-		if (data->host_addrlen != sizeof(*addr))
+		if (data->host_addrlen != sizeof(args->nfs_server.address))
 			goto out_no_address;
-		if (copy_from_user(addr, data->host_addr, sizeof(*addr)))
+		if (copy_from_user(&args->nfs_server.address,
+				   data->host_addr,
+				   sizeof(&args->nfs_server.address)))
 			return -EFAULT;
-		if (addr->sin_port == 0)
-			addr->sin_port = htons(NFS_PORT);
-		if (!nfs_verify_server_address((struct sockaddr *) addr))
+		if (args->nfs_server.address.sin_port == 0)
+			args->nfs_server.address.sin_port = htons(NFS_PORT);
+		if (!nfs_verify_server_address((struct sockaddr *)
+						&args->nfs_server.address))
 			goto out_no_address;
 
 		switch (data->auth_flavourlen) {
 		case 0:
-			*authflavour = RPC_AUTH_UNIX;
+			args->auth_flavors[0] = RPC_AUTH_UNIX;
 			break;
 		case 1:
-			if (copy_from_user(authflavour, data->auth_flavours,
-					   sizeof(*authflavour)))
+			if (copy_from_user(args->auth_flavors,
+					   data->auth_flavours,
+					   sizeof(args->auth_flavors)))
 				return -EFAULT;
 			break;
 		default:
@@ -1555,74 +1566,56 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
 		c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
 		if (IS_ERR(c))
 			return PTR_ERR(c);
-		*hostname = c;
+		args->nfs_server.hostname = c;
 
 		c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
 		if (IS_ERR(c))
 			return PTR_ERR(c);
-		*mntpath = c;
-		dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *mntpath);
+		args->nfs_server.export_path = c;
+		dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
 
 		c = strndup_user(data->client_addr.data, 16);
 		if (IS_ERR(c))
 			return PTR_ERR(c);
-		*ip_addr = c;
+		args->client_address = c;
+
+		/*
+		 * Translate to nfs_parsed_mount_data, which nfs4_fill_super
+		 * can deal with.
+		 */
+
+		args->flags	= data->flags & NFS4_MOUNT_FLAGMASK;
+		args->rsize	= data->rsize;
+		args->wsize	= data->wsize;
+		args->timeo	= data->timeo;
+		args->retrans	= data->retrans;
+		args->acregmin	= data->acregmin;
+		args->acregmax	= data->acregmax;
+		args->acdirmin	= data->acdirmin;
+		args->acdirmax	= data->acdirmax;
+		args->nfs_server.protocol = data->proto;
 
 		break;
 	default: {
 		unsigned int len;
-		struct nfs_parsed_mount_data args = {
-			.rsize		= NFS_MAX_FILE_IO_SIZE,
-			.wsize		= NFS_MAX_FILE_IO_SIZE,
-			.timeo		= 600,
-			.retrans	= 2,
-			.acregmin	= 3,
-			.acregmax	= 60,
-			.acdirmin	= 30,
-			.acdirmax	= 60,
-			.nfs_server.protocol = IPPROTO_TCP,
-		};
-
-		if (nfs_parse_mount_options((char *) *options, &args) == 0)
+
+		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
 		if (!nfs_verify_server_address((struct sockaddr *)
-						&args.nfs_server.address))
+						&args->nfs_server.address))
 			return -EINVAL;
-		*addr = args.nfs_server.address;
 
-		switch (args.auth_flavor_len) {
+		switch (args->auth_flavor_len) {
 		case 0:
-			*authflavour = RPC_AUTH_UNIX;
+			args->auth_flavors[0] = RPC_AUTH_UNIX;
 			break;
 		case 1:
-			*authflavour = (rpc_authflavor_t) args.auth_flavors[0];
 			break;
 		default:
 			goto out_inval_auth;
 		}
 
-		/*
-		 * Translate to nfs4_mount_data, which nfs4_fill_super
-		 * can deal with.
-		 */
-		data = kzalloc(sizeof(*data), GFP_KERNEL);
-		if (data == NULL)
-			return -ENOMEM;
-		*options = data;
-
-		data->version	= 1;
-		data->flags	= args.flags & NFS4_MOUNT_FLAGMASK;
-		data->rsize	= args.rsize;
-		data->wsize	= args.wsize;
-		data->timeo	= args.timeo;
-		data->retrans	= args.retrans;
-		data->acregmin	= args.acregmin;
-		data->acregmax	= args.acregmax;
-		data->acdirmin	= args.acdirmin;
-		data->acdirmax	= args.acdirmax;
-		data->proto	= args.nfs_server.protocol;
-
 		/*
 		 * Split "dev_name" into "hostname:mntpath".
 		 */
@@ -1633,27 +1626,25 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
 		len = c - dev_name;
 		if (len > NFS4_MAXNAMLEN)
 			return -ENAMETOOLONG;
-		*hostname = kzalloc(len, GFP_KERNEL);
-		if (*hostname == NULL)
+		args->nfs_server.hostname = kzalloc(len, GFP_KERNEL);
+		if (args->nfs_server.hostname == NULL)
 			return -ENOMEM;
-		strncpy(*hostname, dev_name, len - 1);
+		strncpy(args->nfs_server.hostname, dev_name, len - 1);
 
 		c++;			/* step over the ':' */
 		len = strlen(c);
 		if (len > NFS4_MAXPATHLEN)
 			return -ENAMETOOLONG;
-		*mntpath = kzalloc(len + 1, GFP_KERNEL);
-		if (*mntpath == NULL)
+		args->nfs_server.export_path = kzalloc(len + 1, GFP_KERNEL);
+		if (args->nfs_server.export_path == NULL)
 			return -ENOMEM;
-		strncpy(*mntpath, c, len);
+		strncpy(args->nfs_server.export_path, c, len);
 
-		dprintk("MNTPATH: %s\n", *mntpath);
+		dprintk("MNTPATH: %s\n", args->nfs_server.export_path);
 
-		if (args.client_address == NULL)
+		if (args->client_address == NULL)
 			goto out_no_client_address;
 
-		*ip_addr = args.client_address;
-
 		break;
 		}
 	}
@@ -1684,14 +1675,11 @@ out_no_client_address:
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
-	struct nfs4_mount_data *data = raw_data;
+	struct nfs_parsed_mount_data data;
 	struct super_block *s;
 	struct nfs_server *server;
-	struct sockaddr_in addr;
-	rpc_authflavor_t authflavour;
 	struct nfs_fh mntfh;
 	struct dentry *mntroot;
-	char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
 		.mntflags = flags,
@@ -1699,14 +1687,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	int error;
 
 	/* Validate the mount data */
-	error = nfs4_validate_mount_data(&data, dev_name, &addr, &authflavour,
-					 &hostname, &mntpath, &ip_addr);
+	error = nfs4_validate_mount_data(raw_data, &data, dev_name);
 	if (error < 0)
 		goto out;
 
 	/* Get a volume representation */
-	server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr,
-				    authflavour, &mntfh);
+	server = nfs4_create_server(&data, &mntfh);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out;
@@ -1745,9 +1731,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	error = 0;
 
 out:
-	kfree(ip_addr);
-	kfree(mntpath);
-	kfree(hostname);
+	kfree(data.client_address);
+	kfree(data.nfs_server.export_path);
+	kfree(data.nfs_server.hostname);
 	return error;
 
 out_free:
-- 
cgit v1.2.3


From 20c71f5e0f954b00d75009542db2c1f844d94a1e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 20 Sep 2007 20:23:51 -0400
Subject: NFSv4: Fix a bug in nfs4_validate_mount_data()

The previous patch introduced a bug when copying the server address.

Also clarify a copy into the auth_flavours array: currently the two
size calculations are equivalent, but we may decide to change the size
of auth_flavors[] at some point.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 63742bbcb1ce..3804aadc18c4 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1541,7 +1541,7 @@ static int nfs4_validate_mount_data(void *options,
 			goto out_no_address;
 		if (copy_from_user(&args->nfs_server.address,
 				   data->host_addr,
-				   sizeof(&args->nfs_server.address)))
+				   sizeof(args->nfs_server.address)))
 			return -EFAULT;
 		if (args->nfs_server.address.sin_port == 0)
 			args->nfs_server.address.sin_port = htons(NFS_PORT);
@@ -1554,9 +1554,9 @@ static int nfs4_validate_mount_data(void *options,
 			args->auth_flavors[0] = RPC_AUTH_UNIX;
 			break;
 		case 1:
-			if (copy_from_user(args->auth_flavors,
+			if (copy_from_user(&args->auth_flavors[0],
 					   data->auth_flavours,
-					   sizeof(args->auth_flavors)))
+					   sizeof(args->auth_flavors[0])))
 				return -EFAULT;
 			break;
 		default:
-- 
cgit v1.2.3


From 4f22ccc3460ef65e9899ec271d36fc4ef795c68d Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:44:58 -0400
Subject: SUNRPC: mark bulk read/write data in xdrbuf

Adds a flag word to the xdrbuf struct which indicates any bulk
disposition of the data. This enables RPC transport providers to
marshal it efficiently/appropriately, and may enable other
optimizations.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c           | 2 ++
 fs/nfs/nfs3xdr.c           | 2 ++
 fs/nfs/nfs4xdr.c           | 2 ++
 include/linux/sunrpc/xdr.h | 5 ++++-
 net/sunrpc/clnt.c          | 1 +
 5 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 1c570948fc16..668ab96c7b59 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -251,6 +251,7 @@ nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2;
 	xdr_inline_pages(&req->rq_rcv_buf, replen,
 			 args->pages, args->pgbase, count);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	return 0;
 }
 
@@ -313,6 +314,7 @@ nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
 
 	/* Copy the page array */
 	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
+	sndbuf->flags |= XDRBUF_WRITE;
 	return 0;
 }
 
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 8a225fbe9ee2..616d3267b7e7 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -346,6 +346,7 @@ nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2;
 	xdr_inline_pages(&req->rq_rcv_buf, replen,
 			 args->pages, args->pgbase, count);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	return 0;
 }
 
@@ -367,6 +368,7 @@ nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
 
 	/* Copy the page array */
 	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
+	sndbuf->flags |= XDRBUF_WRITE;
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4dbbf44727ea..5f353d4686b6 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1856,6 +1856,7 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
 	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2;
 	xdr_inline_pages(&req->rq_rcv_buf, replen,
 			 args->pages, args->pgbase, args->count);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
 out:
 	return status;
 }
@@ -1932,6 +1933,7 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
 	status = encode_write(&xdr, args);
 	if (status)
 		goto out;
+	req->rq_snd_buf.flags |= XDRBUF_WRITE;
 	status = encode_getfattr(&xdr, args->bitmask);
 out:
 	return status;
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index c6b53d181bfa..0751c9464d0f 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -70,7 +70,10 @@ struct xdr_buf {
 
 	struct page **	pages;		/* Array of contiguous pages */
 	unsigned int	page_base,	/* Start of page data */
-			page_len;	/* Length of page data */
+			page_len,	/* Length of page data */
+			flags;		/* Flags for data disposition */
+#define XDRBUF_READ		0x01		/* target of file read */
+#define XDRBUF_WRITE		0x02		/* source of file write */
 
 	unsigned int	buflen,		/* Total length of storage buffer */
 			len;		/* Length of XDR encoded message */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b19bacf42564..19c129ed6467 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -874,6 +874,7 @@ rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
 	buf->head[0].iov_len = len;
 	buf->tail[0].iov_len = 0;
 	buf->page_len = 0;
+	buf->flags = 0;
 	buf->len = 0;
 	buf->buflen = len;
 }
-- 
cgit v1.2.3


From 1244480976d357447aeddd3f44977586bfa0462b Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:45:36 -0400
Subject: SUNRPC: add EXPORT_SYMBOL_GPL for generic transport functions

SUNRPC: add EXPORT_SYMBOL_GPL for generic transport functions

As a preface to allowing arbitrary transport modules to be loaded
dynamically, add EXPORT_SYMBOL_GPL for all generic transport functions
that a transport implementation might want to use.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Acked-by: Tom Talpey <tmt@netapp.com>

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c |  1 +
 net/sunrpc/sched.c     |  2 ++
 net/sunrpc/socklib.c   |  3 +++
 net/sunrpc/timer.c     |  4 ++++
 net/sunrpc/xprt.c      | 15 +++++++++++++++
 5 files changed, 25 insertions(+)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 6f0af08a51dc..09a2a8fde093 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -415,6 +415,7 @@ bailout_nofree:
 bailout_nowake:
 	task->tk_status = status;
 }
+EXPORT_SYMBOL_GPL(rpcb_getport_async);
 
 /*
  * Rpcbind child task calls this callback via tk_exit.
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 954d7ec86c7e..3c773c53e12e 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -777,6 +777,7 @@ void *rpc_malloc(struct rpc_task *task, size_t size)
 			task->tk_pid, size, buf);
 	return &buf->data;
 }
+EXPORT_SYMBOL_GPL(rpc_malloc);
 
 /**
  * rpc_free - free buffer allocated via rpc_malloc
@@ -802,6 +803,7 @@ void rpc_free(void *buffer)
 	else
 		kfree(buf);
 }
+EXPORT_SYMBOL_GPL(rpc_free);
 
 /*
  * Creation and deletion of RPC task structures
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 1d377d1ab7f4..97ac45f034d6 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -34,6 +34,7 @@ size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
 	desc->offset += len;
 	return len;
 }
+EXPORT_SYMBOL_GPL(xdr_skb_read_bits);
 
 /**
  * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer
@@ -137,6 +138,7 @@ copy_tail:
 out:
 	return copied;
 }
+EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb);
 
 /**
  * csum_partial_copy_to_xdr - checksum and copy data
@@ -179,3 +181,4 @@ no_checksum:
 		return -1;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
diff --git a/net/sunrpc/timer.c b/net/sunrpc/timer.c
index 8142fdb8a930..31becbf09263 100644
--- a/net/sunrpc/timer.c
+++ b/net/sunrpc/timer.c
@@ -17,6 +17,7 @@
 
 #include <linux/types.h>
 #include <linux/unistd.h>
+#include <linux/module.h>
 
 #include <linux/sunrpc/clnt.h>
 
@@ -40,6 +41,7 @@ rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo)
 		rt->ntimeouts[i] = 0;
 	}
 }
+EXPORT_SYMBOL_GPL(rpc_init_rtt);
 
 /*
  * NB: When computing the smoothed RTT and standard deviation,
@@ -75,6 +77,7 @@ rpc_update_rtt(struct rpc_rtt *rt, unsigned timer, long m)
 	if (*sdrtt < RPC_RTO_MIN)
 		*sdrtt = RPC_RTO_MIN;
 }
+EXPORT_SYMBOL_GPL(rpc_update_rtt);
 
 /*
  * Estimate rto for an nfs rpc sent via. an unreliable datagram.
@@ -103,3 +106,4 @@ rpc_calc_rto(struct rpc_rtt *rt, unsigned timer)
 
 	return res;
 }
+EXPORT_SYMBOL_GPL(rpc_calc_rto);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c8c2edccad7e..bc13616e7fdc 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -118,6 +118,7 @@ out_sleep:
 		rpc_sleep_on(&xprt->sending, task, NULL, NULL);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
 
 static void xprt_clear_locked(struct rpc_xprt *xprt)
 {
@@ -167,6 +168,7 @@ out_sleep:
 		rpc_sleep_on(&xprt->sending, task, NULL, NULL);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
 
 static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
 {
@@ -246,6 +248,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
 		__xprt_lock_write_next(xprt);
 	}
 }
+EXPORT_SYMBOL_GPL(xprt_release_xprt);
 
 /**
  * xprt_release_xprt_cong - allow other requests to use a transport
@@ -262,6 +265,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
 		__xprt_lock_write_next_cong(xprt);
 	}
 }
+EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
 
 static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
 {
@@ -314,6 +318,7 @@ void xprt_release_rqst_cong(struct rpc_task *task)
 {
 	__xprt_put_cong(task->tk_xprt, task->tk_rqstp);
 }
+EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
 
 /**
  * xprt_adjust_cwnd - adjust transport congestion window
@@ -345,6 +350,7 @@ void xprt_adjust_cwnd(struct rpc_task *task, int result)
 	xprt->cwnd = cwnd;
 	__xprt_put_cong(xprt, req);
 }
+EXPORT_SYMBOL_GPL(xprt_adjust_cwnd);
 
 /**
  * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue
@@ -359,6 +365,7 @@ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status)
 	else
 		rpc_wake_up(&xprt->pending);
 }
+EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
 
 /**
  * xprt_wait_for_buffer_space - wait for transport output buffer to clear
@@ -373,6 +380,7 @@ void xprt_wait_for_buffer_space(struct rpc_task *task)
 	task->tk_timeout = req->rq_timeout;
 	rpc_sleep_on(&xprt->pending, task, NULL, NULL);
 }
+EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
 
 /**
  * xprt_write_space - wake the task waiting for transport output buffer space
@@ -393,6 +401,7 @@ void xprt_write_space(struct rpc_xprt *xprt)
 	}
 	spin_unlock_bh(&xprt->transport_lock);
 }
+EXPORT_SYMBOL_GPL(xprt_write_space);
 
 /**
  * xprt_set_retrans_timeout_def - set a request's retransmit timeout
@@ -406,6 +415,7 @@ void xprt_set_retrans_timeout_def(struct rpc_task *task)
 {
 	task->tk_timeout = task->tk_rqstp->rq_timeout;
 }
+EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
 
 /*
  * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout
@@ -425,6 +435,7 @@ void xprt_set_retrans_timeout_rtt(struct rpc_task *task)
 	if (task->tk_timeout > max_timeout || task->tk_timeout == 0)
 		task->tk_timeout = max_timeout;
 }
+EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt);
 
 static void xprt_reset_majortimeo(struct rpc_rqst *req)
 {
@@ -500,6 +511,7 @@ void xprt_disconnect(struct rpc_xprt *xprt)
 	xprt_wake_pending_tasks(xprt, -ENOTCONN);
 	spin_unlock_bh(&xprt->transport_lock);
 }
+EXPORT_SYMBOL_GPL(xprt_disconnect);
 
 static void
 xprt_init_autodisconnect(unsigned long data)
@@ -610,6 +622,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
 	xprt->stat.bad_xids++;
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
 
 /**
  * xprt_update_rtt - update an RPC client's RTT state after receiving a reply
@@ -629,6 +642,7 @@ void xprt_update_rtt(struct rpc_task *task)
 		rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
 	}
 }
+EXPORT_SYMBOL_GPL(xprt_update_rtt);
 
 /**
  * xprt_complete_rqst - called when reply processing is complete
@@ -653,6 +667,7 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
 	req->rq_received = req->rq_private_buf.len = copied;
 	rpc_wake_up_task(task);
 }
+EXPORT_SYMBOL_GPL(xprt_complete_rqst);
 
 static void xprt_timer(struct rpc_task *task)
 {
-- 
cgit v1.2.3


From 81c098af3da7981902e9f8163aeccc2467c4ba6d Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:46:00 -0400
Subject: SUNRPC: Provide a new API for registering transport implementations

To allow transport capabilities to be loaded dynamically, provide an API
for registering and unregistering the transports with the RPC client.
Eventually xprt_create_transport() will be changed to search the list of
registered transports when initializing a fresh transport.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h | 11 +++++++
 net/sunrpc/xprt.c           | 75 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 513b0657e14d..7b6b137eca97 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -208,6 +208,15 @@ struct rpc_xprtsock_create {
 	struct rpc_timeout *	timeout;	/* optional timeout parameters */
 };
 
+struct xprt_class {
+	struct list_head	list;
+	unsigned short		family;
+	int			protocol;
+	struct rpc_xprt *	(*setup)(struct rpc_xprtsock_create *);
+	struct module		*owner;
+	char			name[32];
+};
+
 /*
  * Transport operations used by ULPs
  */
@@ -239,6 +248,8 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
 /*
  * Transport switch helper functions
  */
+int			xprt_register_transport(struct xprt_class *type);
+int			xprt_unregister_transport(struct xprt_class *type);
 void			xprt_set_retrans_timeout_def(struct rpc_task *task);
 void			xprt_set_retrans_timeout_rtt(struct rpc_task *task);
 void			xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index bc13616e7fdc..520859622af5 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -62,6 +62,9 @@ static inline void	do_xprt_reserve(struct rpc_task *);
 static void	xprt_connect_status(struct rpc_task *task);
 static int      __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
 
+static spinlock_t xprt_list_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(xprt_list);
+
 /*
  * The transport code maintains an estimate on the maximum number of out-
  * standing RPC requests, using a smoothed version of the congestion
@@ -80,6 +83,78 @@ static int      __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
 
 #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
 
+/**
+ * xprt_register_transport - register a transport implementation
+ * @transport: transport to register
+ *
+ * If a transport implementation is loaded as a kernel module, it can
+ * call this interface to make itself known to the RPC client.
+ *
+ * Returns:
+ * 0:		transport successfully registered
+ * -EEXIST:	transport already registered
+ * -EINVAL:	transport module being unloaded
+ */
+int xprt_register_transport(struct xprt_class *transport)
+{
+	struct xprt_class *t;
+	int result;
+
+	result = -EEXIST;
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(t, &xprt_list, list) {
+		/* don't register the same transport class twice */
+		if (t == transport)
+			goto out;
+	}
+
+	result = -EINVAL;
+	if (try_module_get(THIS_MODULE)) {
+		list_add_tail(&transport->list, &xprt_list);
+		printk(KERN_INFO "RPC: Registered %s transport module.\n",
+			transport->name);
+		result = 0;
+	}
+
+out:
+	spin_unlock(&xprt_list_lock);
+	return result;
+}
+EXPORT_SYMBOL_GPL(xprt_register_transport);
+
+/**
+ * xprt_unregister_transport - unregister a transport implementation
+ * transport: transport to unregister
+ *
+ * Returns:
+ * 0:		transport successfully unregistered
+ * -ENOENT:	transport never registered
+ */
+int xprt_unregister_transport(struct xprt_class *transport)
+{
+	struct xprt_class *t;
+	int result;
+
+	result = 0;
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(t, &xprt_list, list) {
+		if (t == transport) {
+			printk(KERN_INFO
+				"RPC: Unregistered %s transport module.\n",
+				transport->name);
+			list_del_init(&transport->list);
+			module_put(THIS_MODULE);
+			goto out;
+		}
+	}
+	result = -ENOENT;
+
+out:
+	spin_unlock(&xprt_list_lock);
+	return result;
+}
+EXPORT_SYMBOL_GPL(xprt_unregister_transport);
+
 /**
  * xprt_reserve_xprt - serialize write access to transports
  * @task: task that is requesting access to the transport
-- 
cgit v1.2.3


From bc25571e21e8bd053554209f5b1b228ad71e6b99 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:46:39 -0400
Subject: SUNRPC: Finish API to load RPC transport implementations dynamically

Allow RPC client transport implementations to be loaded as needed, or
as they become available from distributors or third-party vendors.

Note that we leave the IP sockets implementation in sunrpc.o
permanently, as IP functionality is always available in any
kernel that runs NFS.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprt.c     | 26 +++++++++++++++-----------
 net/sunrpc/xprtsock.c | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 520859622af5..7f8c60b84396 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -983,19 +983,23 @@ struct rpc_xprt *xprt_create_transport(struct rpc_xprtsock_create *args)
 {
 	struct rpc_xprt	*xprt;
 	struct rpc_rqst	*req;
+	struct xprt_class *t;
 
-	switch (args->proto) {
-	case IPPROTO_UDP:
-		xprt = xs_setup_udp(args);
-		break;
-	case IPPROTO_TCP:
-		xprt = xs_setup_tcp(args);
-		break;
-	default:
-		printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n",
-				args->proto);
-		return ERR_PTR(-EIO);
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(t, &xprt_list, list) {
+		if ((t->family == args->dstaddr->sa_family) &&
+		    (t->protocol == args->proto)) {
+			spin_unlock(&xprt_list_lock);
+			goto found;
+		}
 	}
+	spin_unlock(&xprt_list_lock);
+	printk(KERN_ERR "RPC: transport (%u/%d) not supported\n",
+			args->dstaddr->sa_family, args->proto);
+	return ERR_PTR(-EIO);
+
+found:
+	xprt = t->setup(args);
 	if (IS_ERR(xprt)) {
 		dprintk("RPC:       xprt_create_transport: failed, %ld\n",
 				-PTR_ERR(xprt));
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 5bdce8fa2d56..3bc9c921c82f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -20,6 +20,7 @@
 
 #include <linux/types.h>
 #include <linux/slab.h>
+#include <linux/module.h>
 #include <linux/capability.h>
 #include <linux/pagemap.h>
 #include <linux/errno.h>
@@ -777,6 +778,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
 	xs_free_peer_addresses(xprt);
 	kfree(xprt->slot);
 	kfree(xprt);
+	module_put(THIS_MODULE);
 }
 
 static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
@@ -1782,7 +1784,8 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.print_stats		= xs_tcp_print_stats,
 };
 
-static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args, unsigned int slot_table_size)
+static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args,
+				      unsigned int slot_table_size)
 {
 	struct rpc_xprt *xprt;
 	struct sock_xprt *new;
@@ -1876,7 +1879,12 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
 	dprintk("RPC:       set up transport to address %s\n",
 			xprt->address_strings[RPC_DISPLAY_ALL]);
 
-	return xprt;
+	if (try_module_get(THIS_MODULE))
+		return xprt;
+
+	kfree(xprt->slot);
+	kfree(xprt);
+	return ERR_PTR(-EINVAL);
 }
 
 /**
@@ -1934,11 +1942,34 @@ struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
 	dprintk("RPC:       set up transport to address %s\n",
 			xprt->address_strings[RPC_DISPLAY_ALL]);
 
-	return xprt;
+	if (try_module_get(THIS_MODULE))
+		return xprt;
+
+	kfree(xprt->slot);
+	kfree(xprt);
+	return ERR_PTR(-EINVAL);
 }
 
+static struct xprt_class	xs_udp_transport = {
+	.list		= LIST_HEAD_INIT(xs_udp_transport.list),
+	.name		= "udp",
+	.owner		= THIS_MODULE,
+	.family		= AF_INET,
+	.protocol	= IPPROTO_UDP,
+	.setup		= xs_setup_udp,
+};
+
+static struct xprt_class	xs_tcp_transport = {
+	.list		= LIST_HEAD_INIT(xs_tcp_transport.list),
+	.name		= "tcp",
+	.owner		= THIS_MODULE,
+	.family		= AF_INET,
+	.protocol	= IPPROTO_TCP,
+	.setup		= xs_setup_tcp,
+};
+
 /**
- * init_socket_xprt - set up xprtsock's sysctls
+ * init_socket_xprt - set up xprtsock's sysctls, register with RPC client
  *
  */
 int init_socket_xprt(void)
@@ -1948,11 +1979,14 @@ int init_socket_xprt(void)
 		sunrpc_table_header = register_sysctl_table(sunrpc_table);
 #endif
 
+	xprt_register_transport(&xs_udp_transport);
+	xprt_register_transport(&xs_tcp_transport);
+
 	return 0;
 }
 
 /**
- * cleanup_socket_xprt - remove xprtsock's sysctls
+ * cleanup_socket_xprt - remove xprtsock's sysctls, unregister
  *
  */
 void cleanup_socket_xprt(void)
@@ -1963,4 +1997,7 @@ void cleanup_socket_xprt(void)
 		sunrpc_table_header = NULL;
 	}
 #endif
+
+	xprt_unregister_transport(&xs_udp_transport);
+	xprt_unregister_transport(&xs_tcp_transport);
 }
-- 
cgit v1.2.3


From 3c341b0b925eee01daae2c594b81e673f659d7cd Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:47:07 -0400
Subject: SUNRPC: rename the rpc_xprtsock_create structure

To prepare for including non-sockets-based RPC transports, change the
overly suggestive name of the transport creation arguments struct.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h | 10 +++++-----
 net/sunrpc/clnt.c           |  2 +-
 net/sunrpc/xprt.c           |  2 +-
 net/sunrpc/xprtsock.c       |  6 +++---
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 7b6b137eca97..6992ff02d737 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -200,7 +200,7 @@ struct rpc_xprt {
 	char *			address_strings[RPC_DISPLAY_MAX];
 };
 
-struct rpc_xprtsock_create {
+struct xprt_create {
 	int			proto;		/* IPPROTO_UDP or IPPROTO_TCP */
 	struct sockaddr *	srcaddr;	/* optional local address */
 	struct sockaddr *	dstaddr;	/* remote peer address */
@@ -212,7 +212,7 @@ struct xprt_class {
 	struct list_head	list;
 	unsigned short		family;
 	int			protocol;
-	struct rpc_xprt *	(*setup)(struct rpc_xprtsock_create *);
+	struct rpc_xprt *	(*setup)(struct xprt_create *);
 	struct module		*owner;
 	char			name[32];
 };
@@ -225,7 +225,7 @@ void			xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long
 /*
  * Generic internal transport functions
  */
-struct rpc_xprt *	xprt_create_transport(struct rpc_xprtsock_create *args);
+struct rpc_xprt		*xprt_create_transport(struct xprt_create *args);
 void			xprt_connect(struct rpc_task *task);
 void			xprt_reserve(struct rpc_task *task);
 int			xprt_reserve_xprt(struct rpc_task *task);
@@ -265,8 +265,8 @@ void			xprt_disconnect(struct rpc_xprt *xprt);
 /*
  * Socket transport setup operations
  */
-struct rpc_xprt *	xs_setup_udp(struct rpc_xprtsock_create *args);
-struct rpc_xprt *	xs_setup_tcp(struct rpc_xprtsock_create *args);
+struct rpc_xprt		*xs_setup_udp(struct xprt_create *args);
+struct rpc_xprt		*xs_setup_tcp(struct xprt_create *args);
 int			init_socket_xprt(void);
 void			cleanup_socket_xprt(void);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 19c129ed6467..e86958c61a20 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -240,7 +240,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 {
 	struct rpc_xprt *xprt;
 	struct rpc_clnt *clnt;
-	struct rpc_xprtsock_create xprtargs = {
+	struct xprt_create xprtargs = {
 		.proto = args->protocol,
 		.srcaddr = args->saddress,
 		.dstaddr = args->address,
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 7f8c60b84396..473b48ff4523 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -979,7 +979,7 @@ void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long i
  * @args: rpc transport creation arguments
  *
  */
-struct rpc_xprt *xprt_create_transport(struct rpc_xprtsock_create *args)
+struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
 {
 	struct rpc_xprt	*xprt;
 	struct rpc_rqst	*req;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 3bc9c921c82f..4f1a57b28a49 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1784,7 +1784,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.print_stats		= xs_tcp_print_stats,
 };
 
-static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args,
+static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
 				      unsigned int slot_table_size)
 {
 	struct rpc_xprt *xprt;
@@ -1826,7 +1826,7 @@ static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args,
  * @args: rpc transport creation arguments
  *
  */
-struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
+struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 {
 	struct sockaddr *addr = args->dstaddr;
 	struct rpc_xprt *xprt;
@@ -1892,7 +1892,7 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
  * @args: rpc transport creation arguments
  *
  */
-struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
+struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 {
 	struct sockaddr *addr = args->dstaddr;
 	struct rpc_xprt *xprt;
-- 
cgit v1.2.3


From 49c36fcc441baf6a4d698e3645d1adf28edaf57b Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:47:31 -0400
Subject: SUNRPC: rearrange RPC sockets definitions

To prepare for including non-sockets-based RPC transports, move the
sockets-dependent definitions into their own file.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h     | 22 ----------------------
 include/linux/sunrpc/xprtsock.h | 40 ++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/sunrpc_syms.c        |  2 +-
 net/sunrpc/xprtsock.c           |  1 +
 4 files changed, 42 insertions(+), 23 deletions(-)
 create mode 100644 include/linux/sunrpc/xprtsock.h

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 6992ff02d737..d7b8fcd312cc 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -19,24 +19,10 @@
 
 #ifdef __KERNEL__
 
-extern unsigned int xprt_udp_slot_table_entries;
-extern unsigned int xprt_tcp_slot_table_entries;
-
 #define RPC_MIN_SLOT_TABLE	(2U)
 #define RPC_DEF_SLOT_TABLE	(16U)
 #define RPC_MAX_SLOT_TABLE	(128U)
 
-/*
- * Parameters for choosing a free port
- */
-extern unsigned int xprt_min_resvport;
-extern unsigned int xprt_max_resvport;
-
-#define RPC_MIN_RESVPORT	(1U)
-#define RPC_MAX_RESVPORT	(65535U)
-#define RPC_DEF_MIN_RESVPORT	(665U)
-#define RPC_DEF_MAX_RESVPORT	(1023U)
-
 /*
  * This describes a timeout strategy
  */
@@ -262,14 +248,6 @@ void			xprt_complete_rqst(struct rpc_task *task, int copied);
 void			xprt_release_rqst_cong(struct rpc_task *task);
 void			xprt_disconnect(struct rpc_xprt *xprt);
 
-/*
- * Socket transport setup operations
- */
-struct rpc_xprt		*xs_setup_udp(struct xprt_create *args);
-struct rpc_xprt		*xs_setup_tcp(struct xprt_create *args);
-int			init_socket_xprt(void);
-void			cleanup_socket_xprt(void);
-
 /*
  * Reserved bit positions in xprt->state
  */
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
new file mode 100644
index 000000000000..9bde77061e4f
--- /dev/null
+++ b/include/linux/sunrpc/xprtsock.h
@@ -0,0 +1,40 @@
+/*
+ *  linux/include/linux/sunrpc/xprtsock.h
+ *
+ *  Declarations for the RPC transport socket provider.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRTSOCK_H
+#define _LINUX_SUNRPC_XPRTSOCK_H
+
+#ifdef __KERNEL__
+
+/*
+ * Socket transport setup operations
+ */
+struct rpc_xprt *xs_setup_udp(struct xprt_create *args);
+struct rpc_xprt *xs_setup_tcp(struct xprt_create *args);
+
+int		init_socket_xprt(void);
+void		cleanup_socket_xprt(void);
+
+/*
+ * RPC slot table sizes for UDP, TCP transports
+ */
+extern unsigned int xprt_udp_slot_table_entries;
+extern unsigned int xprt_tcp_slot_table_entries;
+
+/*
+ * Parameters for choosing a free port
+ */
+extern unsigned int xprt_min_resvport;
+extern unsigned int xprt_max_resvport;
+
+#define RPC_MIN_RESVPORT	(1U)
+#define RPC_MAX_RESVPORT	(65535U)
+#define RPC_DEF_MIN_RESVPORT	(665U)
+#define RPC_DEF_MAX_RESVPORT	(1023U)
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_SUNRPC_XPRTSOCK_H */
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 384c4ad5ab86..33d89e842c85 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -20,7 +20,7 @@
 #include <linux/sunrpc/auth.h>
 #include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
-
+#include <linux/sunrpc/xprtsock.h>
 
 /* RPC scheduler */
 EXPORT_SYMBOL(rpc_execute);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4f1a57b28a49..192a06e3b8c5 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -32,6 +32,7 @@
 #include <linux/tcp.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/xprtsock.h>
 #include <linux/file.h>
 
 #include <net/sock.h>
-- 
cgit v1.2.3


From 4fa016eb248cac875541fa199af550a8aefa0e90 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:47:57 -0400
Subject: NFS/SUNRPC: support transport protocol naming

To prepare for including non-sockets-based RPC transports, select
RPC transports by an identifier (to be used in following patches).

Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h     |  5 ++---
 include/linux/sunrpc/xprtsock.h | 11 +++++++++++
 net/sunrpc/clnt.c               |  2 +-
 net/sunrpc/xprt.c               |  8 +++-----
 net/sunrpc/xprtsock.c           |  6 ++----
 5 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index d7b8fcd312cc..30b17b3bc1a9 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -187,7 +187,7 @@ struct rpc_xprt {
 };
 
 struct xprt_create {
-	int			proto;		/* IPPROTO_UDP or IPPROTO_TCP */
+	int			ident;		/* XPRT_TRANSPORT identifier */
 	struct sockaddr *	srcaddr;	/* optional local address */
 	struct sockaddr *	dstaddr;	/* remote peer address */
 	size_t			addrlen;
@@ -196,8 +196,7 @@ struct xprt_create {
 
 struct xprt_class {
 	struct list_head	list;
-	unsigned short		family;
-	int			protocol;
+	int			ident;		/* XPRT_TRANSPORT identifier */
 	struct rpc_xprt *	(*setup)(struct xprt_create *);
 	struct module		*owner;
 	char			name[32];
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index 9bde77061e4f..2c6c2c2783d8 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -18,6 +18,17 @@ struct rpc_xprt *xs_setup_tcp(struct xprt_create *args);
 int		init_socket_xprt(void);
 void		cleanup_socket_xprt(void);
 
+/*
+ * RPC transport identifiers for UDP, TCP
+ *
+ * To preserve compatibility with the historical use of raw IP protocol
+ * id's for transport selection, these are specified with the previous
+ * values. No such restriction exists for new transports, except that
+ * they may not collide with these values (17 and 6, respectively).
+ */
+#define XPRT_TRANSPORT_UDP	IPPROTO_UDP
+#define XPRT_TRANSPORT_TCP	IPPROTO_TCP
+
 /*
  * RPC slot table sizes for UDP, TCP transports
  */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e86958c61a20..6cdf53c489b7 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -241,7 +241,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	struct rpc_xprt *xprt;
 	struct rpc_clnt *clnt;
 	struct xprt_create xprtargs = {
-		.proto = args->protocol,
+		.ident = args->protocol,
 		.srcaddr = args->saddress,
 		.dstaddr = args->address,
 		.addrlen = args->addrsize,
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 473b48ff4523..282a9a2ec90c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -104,7 +104,7 @@ int xprt_register_transport(struct xprt_class *transport)
 	spin_lock(&xprt_list_lock);
 	list_for_each_entry(t, &xprt_list, list) {
 		/* don't register the same transport class twice */
-		if (t == transport)
+		if (t->ident == transport->ident)
 			goto out;
 	}
 
@@ -987,15 +987,13 @@ struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
 
 	spin_lock(&xprt_list_lock);
 	list_for_each_entry(t, &xprt_list, list) {
-		if ((t->family == args->dstaddr->sa_family) &&
-		    (t->protocol == args->proto)) {
+		if (t->ident == args->ident) {
 			spin_unlock(&xprt_list_lock);
 			goto found;
 		}
 	}
 	spin_unlock(&xprt_list_lock);
-	printk(KERN_ERR "RPC: transport (%u/%d) not supported\n",
-			args->dstaddr->sa_family, args->proto);
+	printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident);
 	return ERR_PTR(-EIO);
 
 found:
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 192a06e3b8c5..b81494a97a5e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1955,8 +1955,7 @@ static struct xprt_class	xs_udp_transport = {
 	.list		= LIST_HEAD_INIT(xs_udp_transport.list),
 	.name		= "udp",
 	.owner		= THIS_MODULE,
-	.family		= AF_INET,
-	.protocol	= IPPROTO_UDP,
+	.ident		= IPPROTO_UDP,
 	.setup		= xs_setup_udp,
 };
 
@@ -1964,8 +1963,7 @@ static struct xprt_class	xs_tcp_transport = {
 	.list		= LIST_HEAD_INIT(xs_tcp_transport.list),
 	.name		= "tcp",
 	.owner		= THIS_MODULE,
-	.family		= AF_INET,
-	.protocol	= IPPROTO_TCP,
+	.ident		= IPPROTO_TCP,
 	.setup		= xs_setup_tcp,
 };
 
-- 
cgit v1.2.3


From 0896a725a1c5fdc8773a4d1ab0b73059507f5925 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:48:23 -0400
Subject: NFS/SUNRPC: use transport protocol naming

Instead of an { address family, raw IP protocol number }-tuple, use the
newly-defined RPC identifier when creating clients in the upper layers.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c         |  3 ++-
 fs/nfs/client.c        |  5 +++--
 fs/nfs/nfsroot.c       |  3 ++-
 fs/nfs/super.c         | 21 +++++++++++----------
 net/sunrpc/rpcb_clnt.c |  3 ++-
 5 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3353ed8421a7..908b23fadd05 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
 #include <linux/utsname.h>
 #include <linux/kernel.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprtsock.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/sm_inter.h>
@@ -132,7 +133,7 @@ nsm_create(void)
 		.sin_port	= 0,
 	};
 	struct rpc_create_args args = {
-		.protocol	= IPPROTO_UDP,
+		.protocol	= XPRT_TRANSPORT_UDP,
 		.address	= (struct sockaddr *)&sin,
 		.addrsize	= sizeof(sin),
 		.servername	= "localhost",
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2edcbca1a161..6f6755529788 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -23,6 +23,7 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
@@ -340,7 +341,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 		to->to_retries = 2;
 
 	switch (proto) {
-	case IPPROTO_TCP:
+	case XPRT_TRANSPORT_TCP:
 		if (!to->to_initval)
 			to->to_initval = 60 * HZ;
 		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
@@ -349,7 +350,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
 		to->to_exponential = 0;
 		break;
-	case IPPROTO_UDP:
+	case XPRT_TRANSPORT_UDP:
 	default:
 		if (!to->to_initval)
 			to->to_initval = 11 * HZ / 10;
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 3490322d1145..e87b44ee9ac9 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -76,6 +76,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprtsock.h>
 #include <linux/nfs.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
@@ -491,7 +492,7 @@ static int __init root_nfs_get_handle(void)
 	struct sockaddr_in sin;
 	int status;
 	int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
-					IPPROTO_TCP : IPPROTO_UDP;
+					XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP;
 	int version = (nfs_data.flags & NFS_MOUNT_VER3) ?
 					NFS_MNT3_VERSION : NFS_MNT_VERSION;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3804aadc18c4..b47e87e6c406 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -33,6 +33,7 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
@@ -669,13 +670,13 @@ static int nfs_parse_mount_options(char *raw,
 			break;
 		case Opt_udp:
 			mnt->flags &= ~NFS_MOUNT_TCP;
-			mnt->nfs_server.protocol = IPPROTO_UDP;
+			mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
 			mnt->timeo = 7;
 			mnt->retrans = 5;
 			break;
 		case Opt_tcp:
 			mnt->flags |= NFS_MOUNT_TCP;
-			mnt->nfs_server.protocol = IPPROTO_TCP;
+			mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 			mnt->timeo = 600;
 			mnt->retrans = 2;
 			break;
@@ -884,13 +885,13 @@ static int nfs_parse_mount_options(char *raw,
 			switch (token) {
 			case Opt_xprt_udp:
 				mnt->flags &= ~NFS_MOUNT_TCP;
-				mnt->nfs_server.protocol = IPPROTO_UDP;
+				mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
 				mnt->timeo = 7;
 				mnt->retrans = 5;
 				break;
 			case Opt_xprt_tcp:
 				mnt->flags |= NFS_MOUNT_TCP;
-				mnt->nfs_server.protocol = IPPROTO_TCP;
+				mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 				mnt->timeo = 600;
 				mnt->retrans = 2;
 				break;
@@ -908,10 +909,10 @@ static int nfs_parse_mount_options(char *raw,
 
 			switch (token) {
 			case Opt_xprt_udp:
-				mnt->mount_server.protocol = IPPROTO_UDP;
+				mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
 				break;
 			case Opt_xprt_tcp:
-				mnt->mount_server.protocol = IPPROTO_TCP;
+				mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
 				break;
 			default:
 				goto out_unrec_xprt;
@@ -1061,9 +1062,9 @@ static int nfs_validate_mount_data(void *options,
 	args->acregmax		= 60;
 	args->acdirmin		= 30;
 	args->acdirmax		= 60;
-	args->mount_server.protocol = IPPROTO_UDP;
+	args->mount_server.protocol = XPRT_TRANSPORT_UDP;
 	args->mount_server.program = NFS_MNT_PROGRAM;
-	args->nfs_server.protocol = IPPROTO_TCP;
+	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 	args->nfs_server.program = NFS_PROGRAM;
 
 	switch (data->version) {
@@ -1110,7 +1111,7 @@ static int nfs_validate_mount_data(void *options,
 		args->acdirmax		= data->acdirmax;
 		args->nfs_server.address = data->addr;
 		if (!(data->flags & NFS_MOUNT_TCP))
-			args->nfs_server.protocol = IPPROTO_UDP;
+			args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
 		/* N.B. caller will free nfs_server.hostname in all cases */
 		args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
 		args->namlen		= data->namlen;
@@ -1533,7 +1534,7 @@ static int nfs4_validate_mount_data(void *options,
 	args->acregmax		= 60;
 	args->acdirmin		= 30;
 	args->acdirmax		= 60;
-	args->nfs_server.protocol = IPPROTO_TCP;
+	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 
 	switch (data->version) {
 	case 1:
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 09a2a8fde093..8db3d70c23e3 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -23,6 +23,7 @@
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/xprtsock.h>
 
 #ifdef RPC_DEBUG
 # define RPCDBG_FACILITY	RPCDBG_BIND
@@ -229,7 +230,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
 			prog, vers, prot, port);
 
 	rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin,
-					IPPROTO_UDP, 2, 1);
+					XPRT_TRANSPORT_UDP, 2, 1);
 	if (IS_ERR(rpcb_clnt))
 		return PTR_ERR(rpcb_clnt);
 
-- 
cgit v1.2.3


From 56928edd5afb51d684c38c0bed56594e93ffe4c7 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:48:47 -0400
Subject: NFS - print accurate transport protocol

Use the per-transport strings to display the transport protocol accurately.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b47e87e6c406..347c36341e53 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -432,8 +432,6 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	};
 	const struct proc_nfs_info *nfs_infop;
 	struct nfs_client *clp = nfss->nfs_client;
-	char buf[12];
-	const char *proto;
 
 	seq_printf(m, ",vers=%d", clp->rpc_ops->version);
 	seq_printf(m, ",rsize=%d", nfss->rsize);
@@ -452,18 +450,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		else
 			seq_puts(m, nfs_infop->nostr);
 	}
-	switch (nfss->client->cl_xprt->prot) {
-		case IPPROTO_TCP:
-			proto = "tcp";
-			break;
-		case IPPROTO_UDP:
-			proto = "udp";
-			break;
-		default:
-			snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
-			proto = buf;
-	}
-	seq_printf(m, ",proto=%s", proto);
+	seq_printf(m, ",proto=%s",
+		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
 	seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ);
 	seq_printf(m, ",retrans=%u", clp->retrans_count);
 	seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
-- 
cgit v1.2.3


From c3a57ed7471a17b07844d531534d970b84b69faf Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:49:15 -0400
Subject: RPCRDMA: Kconfig and header file with rpcrdma protocol definitions

This file implements the configuration target, protocol template and
constants for the rpcrdma transport framing, for use by the xprtrdma
rpc transport implementation.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/Kconfig                      |   8 +++
 include/linux/sunrpc/rpc_rdma.h | 116 ++++++++++++++++++++++++++++++++++++++++
 include/linux/sunrpc/xprtrdma.h |  85 +++++++++++++++++++++++++++++
 3 files changed, 209 insertions(+)
 create mode 100644 include/linux/sunrpc/rpc_rdma.h
 create mode 100644 include/linux/sunrpc/xprtrdma.h

diff --git a/fs/Kconfig b/fs/Kconfig
index f9eed6d79066..b9808bba5722 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1728,6 +1728,14 @@ config SUNRPC
 config SUNRPC_GSS
 	tristate
 
+config SUNRPC_XPRT_RDMA
+	tristate "RDMA transport for sunrpc (EXPERIMENTAL)"
+	depends on SUNRPC && EXPERIMENTAL
+	default m
+	help
+	  Adds a client RPC transport for supporting kernel NFS over RDMA
+	  mounts, including Infiniband and iWARP. Experimental.
+
 config SUNRPC_BIND34
 	bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h
new file mode 100644
index 000000000000..0013a0d8dc6b
--- /dev/null
+++ b/include/linux/sunrpc/rpc_rdma.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_RPC_RDMA_H
+#define _LINUX_SUNRPC_RPC_RDMA_H
+
+struct rpcrdma_segment {
+	uint32_t rs_handle;	/* Registered memory handle */
+	uint32_t rs_length;	/* Length of the chunk in bytes */
+	uint64_t rs_offset;	/* Chunk virtual address or offset */
+};
+
+/*
+ * read chunk(s), encoded as a linked list.
+ */
+struct rpcrdma_read_chunk {
+	uint32_t rc_discrim;	/* 1 indicates presence */
+	uint32_t rc_position;	/* Position in XDR stream */
+	struct rpcrdma_segment rc_target;
+};
+
+/*
+ * write chunk, and reply chunk.
+ */
+struct rpcrdma_write_chunk {
+	struct rpcrdma_segment wc_target;
+};
+
+/*
+ * write chunk(s), encoded as a counted array.
+ */
+struct rpcrdma_write_array {
+	uint32_t wc_discrim;	/* 1 indicates presence */
+	uint32_t wc_nchunks;	/* Array count */
+	struct rpcrdma_write_chunk wc_array[0];
+};
+
+struct rpcrdma_msg {
+	uint32_t rm_xid;	/* Mirrors the RPC header xid */
+	uint32_t rm_vers;	/* Version of this protocol */
+	uint32_t rm_credit;	/* Buffers requested/granted */
+	uint32_t rm_type;	/* Type of message (enum rpcrdma_proc) */
+	union {
+
+		struct {			/* no chunks */
+			uint32_t rm_empty[3];	/* 3 empty chunk lists */
+		} rm_nochunks;
+
+		struct {			/* no chunks and padded */
+			uint32_t rm_align;	/* Padding alignment */
+			uint32_t rm_thresh;	/* Padding threshold */
+			uint32_t rm_pempty[3];	/* 3 empty chunk lists */
+		} rm_padded;
+
+		uint32_t rm_chunks[0];	/* read, write and reply chunks */
+
+	} rm_body;
+};
+
+#define RPCRDMA_HDRLEN_MIN	28
+
+enum rpcrdma_errcode {
+	ERR_VERS = 1,
+	ERR_CHUNK = 2
+};
+
+struct rpcrdma_err_vers {
+	uint32_t rdma_vers_low;	/* Version range supported by peer */
+	uint32_t rdma_vers_high;
+};
+
+enum rpcrdma_proc {
+	RDMA_MSG = 0,		/* An RPC call or reply msg */
+	RDMA_NOMSG = 1,		/* An RPC call or reply msg - separate body */
+	RDMA_MSGP = 2,		/* An RPC call or reply msg with padding */
+	RDMA_DONE = 3,		/* Client signals reply completion */
+	RDMA_ERROR = 4		/* An RPC RDMA encoding error */
+};
+
+#endif				/* _LINUX_SUNRPC_RPC_RDMA_H */
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
new file mode 100644
index 000000000000..4de56b1d372b
--- /dev/null
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRTRDMA_H
+#define _LINUX_SUNRPC_XPRTRDMA_H
+
+/*
+ * RPC transport identifier for RDMA
+ */
+#define XPRT_TRANSPORT_RDMA	256
+
+/*
+ * rpcbind (v3+) RDMA netid.
+ */
+#define RPCBIND_NETID_RDMA	"rdma"
+
+/*
+ * Constants. Max RPC/NFS header is big enough to account for
+ * additional marshaling buffers passed down by Linux client.
+ *
+ * RDMA header is currently fixed max size, and is big enough for a
+ * fully-chunked NFS message (read chunks are the largest). Note only
+ * a single chunk type per message is supported currently.
+ */
+#define RPCRDMA_MIN_SLOT_TABLE	(2U)
+#define RPCRDMA_DEF_SLOT_TABLE	(32U)
+#define RPCRDMA_MAX_SLOT_TABLE	(256U)
+
+#define RPCRDMA_DEF_INLINE  (1024)	/* default inline max */
+
+#define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
+
+#define RDMA_RESOLVE_TIMEOUT	(5*HZ)	/* TBD 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */
+
+/* memory registration strategies */
+#define RPCRDMA_PERSISTENT_REGISTRATION (1)
+
+enum rpcrdma_memreg {
+	RPCRDMA_BOUNCEBUFFERS = 0,
+	RPCRDMA_REGISTER,
+	RPCRDMA_MEMWINDOWS,
+	RPCRDMA_MEMWINDOWS_ASYNC,
+	RPCRDMA_MTHCAFMR,
+	RPCRDMA_ALLPHYSICAL,
+	RPCRDMA_LAST
+};
+
+#endif /* _LINUX_SUNRPC_XPRTRDMA_H */
-- 
cgit v1.2.3


From 2cf7ff7a37cc149bd59c4f3bad432f686a4619c8 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:49:41 -0400
Subject: NFS: support RDMA mounts

Adds hooks to the string-based NFS mount to support an "rdma" protocol option.

Signed-off-by: Tom Talpey <tmt@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c |  2 ++
 fs/nfs/super.c  | 21 +++++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 6f6755529788..a532ee12740a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -24,6 +24,7 @@
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
 #include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
@@ -342,6 +343,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 
 	switch (proto) {
 	case XPRT_TRANSPORT_TCP:
+	case XPRT_TRANSPORT_RDMA:
 		if (!to->to_initval)
 			to->to_initval = 60 * HZ;
 		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 347c36341e53..11ab7ff6e087 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -34,6 +34,7 @@
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
 #include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
@@ -68,7 +69,7 @@ enum {
 	Opt_ac, Opt_noac,
 	Opt_lock, Opt_nolock,
 	Opt_v2, Opt_v3,
-	Opt_udp, Opt_tcp,
+	Opt_udp, Opt_tcp, Opt_rdma,
 	Opt_acl, Opt_noacl,
 	Opt_rdirplus, Opt_nordirplus,
 	Opt_sharecache, Opt_nosharecache,
@@ -114,6 +115,7 @@ static match_table_t nfs_mount_option_tokens = {
 	{ Opt_v3, "v3" },
 	{ Opt_udp, "udp" },
 	{ Opt_tcp, "tcp" },
+	{ Opt_rdma, "rdma" },
 	{ Opt_acl, "acl" },
 	{ Opt_noacl, "noacl" },
 	{ Opt_rdirplus, "rdirplus" },
@@ -153,7 +155,7 @@ static match_table_t nfs_mount_option_tokens = {
 };
 
 enum {
-	Opt_xprt_udp, Opt_xprt_tcp,
+	Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma,
 
 	Opt_xprt_err
 };
@@ -161,6 +163,7 @@ enum {
 static match_table_t nfs_xprt_protocol_tokens = {
 	{ Opt_xprt_udp, "udp" },
 	{ Opt_xprt_tcp, "tcp" },
+	{ Opt_xprt_rdma, "rdma" },
 
 	{ Opt_xprt_err, NULL }
 };
@@ -668,6 +671,12 @@ static int nfs_parse_mount_options(char *raw,
 			mnt->timeo = 600;
 			mnt->retrans = 2;
 			break;
+		case Opt_rdma:
+			mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
+			mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+			mnt->timeo = 600;
+			mnt->retrans = 2;
+			break;
 		case Opt_acl:
 			mnt->flags &= ~NFS_MOUNT_NOACL;
 			break;
@@ -883,6 +892,13 @@ static int nfs_parse_mount_options(char *raw,
 				mnt->timeo = 600;
 				mnt->retrans = 2;
 				break;
+			case Opt_xprt_rdma:
+				/* vector side protocols to TCP */
+				mnt->flags |= NFS_MOUNT_TCP;
+				mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+				mnt->timeo = 600;
+				mnt->retrans = 2;
+				break;
 			default:
 				goto out_unrec_xprt;
 			}
@@ -902,6 +918,7 @@ static int nfs_parse_mount_options(char *raw,
 			case Opt_xprt_tcp:
 				mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
 				break;
+			case Opt_xprt_rdma: /* not used for side protocols */
 			default:
 				goto out_unrec_xprt;
 			}
-- 
cgit v1.2.3


From f58851e6b0f148fb4b2a1c6f70beb2f125863c0f Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:50:12 -0400
Subject: RPCRDMA: rpc rdma transport switch

This implements the configuration and building of the core transport
switch implementation of the rpcrdma transport. Stubs are provided for
the rpcrdma protocol handling, and the infiniband/iwarp verbs interface.
These are provided in following patches.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/debug.h    |   5 +
 net/sunrpc/Makefile             |   1 +
 net/sunrpc/xprtrdma/Makefile    |   3 +
 net/sunrpc/xprtrdma/rpc_rdma.c  |   9 +
 net/sunrpc/xprtrdma/transport.c | 800 ++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/verbs.c     |  37 ++
 net/sunrpc/xprtrdma/xprt_rdma.h | 330 +++++++++++++++++
 7 files changed, 1185 insertions(+)
 create mode 100644 net/sunrpc/xprtrdma/Makefile
 create mode 100644 net/sunrpc/xprtrdma/rpc_rdma.c
 create mode 100644 net/sunrpc/xprtrdma/transport.c
 create mode 100644 net/sunrpc/xprtrdma/verbs.c
 create mode 100644 net/sunrpc/xprtrdma/xprt_rdma.h

diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index 3912cf16361e..3347c72b848a 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -88,6 +88,11 @@ enum {
 	CTL_SLOTTABLE_TCP,
 	CTL_MIN_RESVPORT,
 	CTL_MAX_RESVPORT,
+	CTL_SLOTTABLE_RDMA,
+	CTL_RDMA_MAXINLINEREAD,
+	CTL_RDMA_MAXINLINEWRITE,
+	CTL_RDMA_WRITEPADDING,
+	CTL_RDMA_MEMREG,
 };
 
 #endif /* _LINUX_SUNRPC_DEBUG_H_ */
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 8ebfc4db7f51..5c69a725e530 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -5,6 +5,7 @@
 
 obj-$(CONFIG_SUNRPC) += sunrpc.o
 obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
 
 sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
 	    auth.o auth_null.o auth_unix.o \
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
new file mode 100644
index 000000000000..264f0feeb513
--- /dev/null
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
+
+xprtrdma-y := transport.o rpc_rdma.o verbs.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
new file mode 100644
index 000000000000..b0587f3a5d77
--- /dev/null
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -0,0 +1,9 @@
+/*
+ * Placeholders for subsequent patches
+ */
+
+#include "xprt_rdma.h"
+
+void rpcrdma_conn_func(struct rpcrdma_ep *a) { }
+void rpcrdma_reply_handler(struct rpcrdma_rep *a) { }
+int rpcrdma_marshal_req(struct rpc_rqst *a) { return EINVAL; }
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
new file mode 100644
index 000000000000..dc55cc974c90
--- /dev/null
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -0,0 +1,800 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * transport.c
+ *
+ * This file contains the top-level implementation of an RPC RDMA
+ * transport.
+ *
+ * Naming convention: functions beginning with xprt_ are part of the
+ * transport switch. All others are RPC RDMA internal.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+
+#include "xprt_rdma.h"
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
+MODULE_AUTHOR("Network Appliance, Inc.");
+
+/*
+ * tunables
+ */
+
+static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
+static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
+static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
+static unsigned int xprt_rdma_inline_write_padding;
+#if !RPCRDMA_PERSISTENT_REGISTRATION
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */
+#else
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
+#endif
+
+#ifdef RPC_DEBUG
+
+static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
+static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int zero;
+static unsigned int max_padding = PAGE_SIZE;
+static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
+static unsigned int max_memreg = RPCRDMA_LAST - 1;
+
+static struct ctl_table_header *sunrpc_table_header;
+
+static ctl_table xr_tunables_table[] = {
+	{
+		.ctl_name	= CTL_SLOTTABLE_RDMA,
+		.procname	= "rdma_slot_table_entries",
+		.data		= &xprt_rdma_slot_table_entries,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_slot_table_size,
+		.extra2		= &max_slot_table_size
+	},
+	{
+		.ctl_name	= CTL_RDMA_MAXINLINEREAD,
+		.procname	= "rdma_max_inline_read",
+		.data		= &xprt_rdma_max_inline_read,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= CTL_RDMA_MAXINLINEWRITE,
+		.procname	= "rdma_max_inline_write",
+		.data		= &xprt_rdma_max_inline_write,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= CTL_RDMA_WRITEPADDING,
+		.procname	= "rdma_inline_write_padding",
+		.data		= &xprt_rdma_inline_write_padding,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &max_padding,
+	},
+	{
+		.ctl_name	= CTL_RDMA_MEMREG,
+		.procname	= "rdma_memreg_strategy",
+		.data		= &xprt_rdma_memreg_strategy,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_memreg,
+		.extra2		= &max_memreg,
+	},
+	{
+		.ctl_name = 0,
+	},
+};
+
+static ctl_table sunrpc_table[] = {
+	{
+		.ctl_name	= CTL_SUNRPC,
+		.procname	= "sunrpc",
+		.mode		= 0555,
+		.child		= xr_tunables_table
+	},
+	{
+		.ctl_name = 0,
+	},
+};
+
+#endif
+
+static struct rpc_xprt_ops xprt_rdma_procs;	/* forward reference */
+
+static void
+xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)
+					&rpcx_to_rdmad(xprt).addr;
+	char *buf;
+
+	buf = kzalloc(20, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 20, NIPQUAD_FMT, NIPQUAD(addr->sin_addr.s_addr));
+	xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 8, "%u", ntohs(addr->sin_port));
+	xprt->address_strings[RPC_DISPLAY_PORT] = buf;
+
+	xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
+
+	buf = kzalloc(48, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s",
+			NIPQUAD(addr->sin_addr.s_addr),
+			ntohs(addr->sin_port), "rdma");
+	xprt->address_strings[RPC_DISPLAY_ALL] = buf;
+
+	buf = kzalloc(10, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 10, "%02x%02x%02x%02x",
+			NIPQUAD(addr->sin_addr.s_addr));
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 8, "%4hx", ntohs(addr->sin_port));
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
+
+	buf = kzalloc(30, GFP_KERNEL);
+	if (buf)
+		snprintf(buf, 30, NIPQUAD_FMT".%u.%u",
+			NIPQUAD(addr->sin_addr.s_addr),
+			ntohs(addr->sin_port) >> 8,
+			ntohs(addr->sin_port) & 0xff);
+	xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
+
+	/* netid */
+	xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
+}
+
+static void
+xprt_rdma_free_addresses(struct rpc_xprt *xprt)
+{
+	kfree(xprt->address_strings[RPC_DISPLAY_ADDR]);
+	kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
+	kfree(xprt->address_strings[RPC_DISPLAY_ALL]);
+	kfree(xprt->address_strings[RPC_DISPLAY_HEX_ADDR]);
+	kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
+	kfree(xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR]);
+}
+
+static void
+xprt_rdma_connect_worker(struct work_struct *work)
+{
+	struct rpcrdma_xprt *r_xprt =
+		container_of(work, struct rpcrdma_xprt, rdma_connect.work);
+	struct rpc_xprt *xprt = &r_xprt->xprt;
+	int rc = 0;
+
+	if (!xprt->shutdown) {
+		xprt_clear_connected(xprt);
+
+		dprintk("RPC:       %s: %sconnect\n", __func__,
+				r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
+		rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+		if (rc)
+			goto out;
+	}
+	goto out_clear;
+
+out:
+	xprt_wake_pending_tasks(xprt, rc);
+
+out_clear:
+	dprintk("RPC:       %s: exit\n", __func__);
+	xprt_clear_connecting(xprt);
+}
+
+/*
+ * xprt_rdma_destroy
+ *
+ * Destroy the xprt.
+ * Free all memory associated with the object, including its own.
+ * NOTE: none of the *destroy methods free memory for their top-level
+ * objects, even though they may have allocated it (they do free
+ * private memory). It's up to the caller to handle it. In this
+ * case (RDMA transport), all structure memory is inlined with the
+ * struct rpcrdma_xprt.
+ */
+static void
+xprt_rdma_destroy(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	int rc;
+
+	dprintk("RPC:       %s: called\n", __func__);
+
+	cancel_delayed_work(&r_xprt->rdma_connect);
+	flush_scheduled_work();
+
+	xprt_clear_connected(xprt);
+
+	rpcrdma_buffer_destroy(&r_xprt->rx_buf);
+	rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+	if (rc)
+		dprintk("RPC:       %s: rpcrdma_ep_destroy returned %i\n",
+			__func__, rc);
+	rpcrdma_ia_close(&r_xprt->rx_ia);
+
+	xprt_rdma_free_addresses(xprt);
+
+	kfree(xprt->slot);
+	xprt->slot = NULL;
+	kfree(xprt);
+
+	dprintk("RPC:       %s: returning\n", __func__);
+
+	module_put(THIS_MODULE);
+}
+
+/**
+ * xprt_setup_rdma - Set up transport to use RDMA
+ *
+ * @args: rpc transport arguments
+ */
+static struct rpc_xprt *
+xprt_setup_rdma(struct xprt_create *args)
+{
+	struct rpcrdma_create_data_internal cdata;
+	struct rpc_xprt *xprt;
+	struct rpcrdma_xprt *new_xprt;
+	struct rpcrdma_ep *new_ep;
+	struct sockaddr_in *sin;
+	int rc;
+
+	if (args->addrlen > sizeof(xprt->addr)) {
+		dprintk("RPC:       %s: address too large\n", __func__);
+		return ERR_PTR(-EBADF);
+	}
+
+	xprt = kzalloc(sizeof(struct rpcrdma_xprt), GFP_KERNEL);
+	if (xprt == NULL) {
+		dprintk("RPC:       %s: couldn't allocate rpcrdma_xprt\n",
+			__func__);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	xprt->max_reqs = xprt_rdma_slot_table_entries;
+	xprt->slot = kcalloc(xprt->max_reqs,
+				sizeof(struct rpc_rqst), GFP_KERNEL);
+	if (xprt->slot == NULL) {
+		kfree(xprt);
+		dprintk("RPC:       %s: couldn't allocate %d slots\n",
+			__func__, xprt->max_reqs);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* 60 second timeout, no retries */
+	xprt_set_timeout(&xprt->timeout, 0, 60UL * HZ);
+	xprt->bind_timeout = (60U * HZ);
+	xprt->connect_timeout = (60U * HZ);
+	xprt->reestablish_timeout = (5U * HZ);
+	xprt->idle_timeout = (5U * 60 * HZ);
+
+	xprt->resvport = 0;		/* privileged port not needed */
+	xprt->tsh_size = 0;		/* RPC-RDMA handles framing */
+	xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
+	xprt->ops = &xprt_rdma_procs;
+
+	/*
+	 * Set up RDMA-specific connect data.
+	 */
+
+	/* Put server RDMA address in local cdata */
+	memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+
+	/* Ensure xprt->addr holds valid server TCP (not RDMA)
+	 * address, for any side protocols which peek at it */
+	xprt->prot = IPPROTO_TCP;
+	xprt->addrlen = args->addrlen;
+	memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+
+	sin = (struct sockaddr_in *)&cdata.addr;
+	if (ntohs(sin->sin_port) != 0)
+		xprt_set_bound(xprt);
+
+	dprintk("RPC:       %s: %u.%u.%u.%u:%u\n", __func__,
+			NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port));
+
+	/* Set max requests */
+	cdata.max_requests = xprt->max_reqs;
+
+	/* Set some length limits */
+	cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
+	cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
+
+	cdata.inline_wsize = xprt_rdma_max_inline_write;
+	if (cdata.inline_wsize > cdata.wsize)
+		cdata.inline_wsize = cdata.wsize;
+
+	cdata.inline_rsize = xprt_rdma_max_inline_read;
+	if (cdata.inline_rsize > cdata.rsize)
+		cdata.inline_rsize = cdata.rsize;
+
+	cdata.padding = xprt_rdma_inline_write_padding;
+
+	/*
+	 * Create new transport instance, which includes initialized
+	 *  o ia
+	 *  o endpoint
+	 *  o buffers
+	 */
+
+	new_xprt = rpcx_to_rdmax(xprt);
+
+	rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
+				xprt_rdma_memreg_strategy);
+	if (rc)
+		goto out1;
+
+	/*
+	 * initialize and create ep
+	 */
+	new_xprt->rx_data = cdata;
+	new_ep = &new_xprt->rx_ep;
+	new_ep->rep_remote_addr = cdata.addr;
+
+	rc = rpcrdma_ep_create(&new_xprt->rx_ep,
+				&new_xprt->rx_ia, &new_xprt->rx_data);
+	if (rc)
+		goto out2;
+
+	/*
+	 * Allocate pre-registered send and receive buffers for headers and
+	 * any inline data. Also specify any padding which will be provided
+	 * from a preregistered zero buffer.
+	 */
+	rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia,
+				&new_xprt->rx_data);
+	if (rc)
+		goto out3;
+
+	/*
+	 * Register a callback for connection events. This is necessary because
+	 * connection loss notification is async. We also catch connection loss
+	 * when reaping receives.
+	 */
+	INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker);
+	new_ep->rep_func = rpcrdma_conn_func;
+	new_ep->rep_xprt = xprt;
+
+	xprt_rdma_format_addresses(xprt);
+
+	if (!try_module_get(THIS_MODULE))
+		goto out4;
+
+	return xprt;
+
+out4:
+	xprt_rdma_free_addresses(xprt);
+	rc = -EINVAL;
+out3:
+	(void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+out2:
+	rpcrdma_ia_close(&new_xprt->rx_ia);
+out1:
+	kfree(xprt->slot);
+	kfree(xprt);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Close a connection, during shutdown or timeout/reconnect
+ */
+static void
+xprt_rdma_close(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	dprintk("RPC:       %s: closing\n", __func__);
+	xprt_disconnect(xprt);
+	(void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+}
+
+static void
+xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
+{
+	struct sockaddr_in *sap;
+
+	sap = (struct sockaddr_in *)&xprt->addr;
+	sap->sin_port = htons(port);
+	sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr;
+	sap->sin_port = htons(port);
+	dprintk("RPC:       %s: %u\n", __func__, port);
+}
+
+static void
+xprt_rdma_connect(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	if (!xprt_test_and_set_connecting(xprt)) {
+		if (r_xprt->rx_ep.rep_connected != 0) {
+			/* Reconnect */
+			schedule_delayed_work(&r_xprt->rdma_connect,
+				xprt->reestablish_timeout);
+		} else {
+			schedule_delayed_work(&r_xprt->rdma_connect, 0);
+			if (!RPC_IS_ASYNC(task))
+				flush_scheduled_work();
+		}
+	}
+}
+
+static int
+xprt_rdma_reserve_xprt(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
+
+	/* == RPC_CWNDSCALE @ init, but *after* setup */
+	if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
+		r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
+		dprintk("RPC:       %s: cwndscale %lu\n", __func__,
+			r_xprt->rx_buf.rb_cwndscale);
+		BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
+	}
+	xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
+	return xprt_reserve_xprt_cong(task);
+}
+
+/*
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
+ * sequence. For this reason, the recv buffers are attached to send
+ * buffers for portions of the RPC. Note that the RPC layer allocates
+ * both send and receive buffers in the same call. We may register
+ * the receive buffer portion when using reply chunks.
+ */
+static void *
+xprt_rdma_allocate(struct rpc_task *task, size_t size)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpcrdma_req *req, *nreq;
+
+	req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
+	BUG_ON(NULL == req);
+
+	if (size > req->rl_size) {
+		dprintk("RPC:       %s: size %zd too large for buffer[%zd]: "
+			"prog %d vers %d proc %d\n",
+			__func__, size, req->rl_size,
+			task->tk_client->cl_prog, task->tk_client->cl_vers,
+			task->tk_msg.rpc_proc->p_proc);
+		/*
+		 * Outgoing length shortage. Our inline write max must have
+		 * been configured to perform direct i/o.
+		 *
+		 * This is therefore a large metadata operation, and the
+		 * allocate call was made on the maximum possible message,
+		 * e.g. containing long filename(s) or symlink data. In
+		 * fact, while these metadata operations *might* carry
+		 * large outgoing payloads, they rarely *do*. However, we
+		 * have to commit to the request here, so reallocate and
+		 * register it now. The data path will never require this
+		 * reallocation.
+		 *
+		 * If the allocation or registration fails, the RPC framework
+		 * will (doggedly) retry.
+		 */
+		if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
+				RPCRDMA_BOUNCEBUFFERS) {
+			/* forced to "pure inline" */
+			dprintk("RPC:       %s: too much data (%zd) for inline "
+					"(r/w max %d/%d)\n", __func__, size,
+					rpcx_to_rdmad(xprt).inline_rsize,
+					rpcx_to_rdmad(xprt).inline_wsize);
+			size = req->rl_size;
+			rpc_exit(task, -EIO);		/* fail the operation */
+			rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
+			goto out;
+		}
+		if (task->tk_flags & RPC_TASK_SWAPPER)
+			nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
+		else
+			nreq = kmalloc(sizeof *req + size, GFP_NOFS);
+		if (nreq == NULL)
+			goto outfail;
+
+		if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia,
+				nreq->rl_base, size + sizeof(struct rpcrdma_req)
+				- offsetof(struct rpcrdma_req, rl_base),
+				&nreq->rl_handle, &nreq->rl_iov)) {
+			kfree(nreq);
+			goto outfail;
+		}
+		rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size;
+		nreq->rl_size = size;
+		nreq->rl_niovs = 0;
+		nreq->rl_nchunks = 0;
+		nreq->rl_buffer = (struct rpcrdma_buffer *)req;
+		nreq->rl_reply = req->rl_reply;
+		memcpy(nreq->rl_segments,
+			req->rl_segments, sizeof nreq->rl_segments);
+		/* flag the swap with an unused field */
+		nreq->rl_iov.length = 0;
+		req->rl_reply = NULL;
+		req = nreq;
+	}
+	dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
+out:
+	return req->rl_xdr_buf;
+
+outfail:
+	rpcrdma_buffer_put(req);
+	rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
+	return NULL;
+}
+
+/*
+ * This function returns all RDMA resources to the pool.
+ */
+static void
+xprt_rdma_free(void *buffer)
+{
+	struct rpcrdma_req *req;
+	struct rpcrdma_xprt *r_xprt;
+	struct rpcrdma_rep *rep;
+	int i;
+
+	if (buffer == NULL)
+		return;
+
+	req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]);
+	r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
+	rep = req->rl_reply;
+
+	dprintk("RPC:       %s: called on 0x%p%s\n",
+		__func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
+
+	/*
+	 * Finish the deregistration. When using mw bind, this was
+	 * begun in rpcrdma_reply_handler(). In all other modes, we
+	 * do it here, in thread context. The process is considered
+	 * complete when the rr_func vector becomes NULL - this
+	 * was put in place during rpcrdma_reply_handler() - the wait
+	 * call below will not block if the dereg is "done". If
+	 * interrupted, our framework will clean up.
+	 */
+	for (i = 0; req->rl_nchunks;) {
+		--req->rl_nchunks;
+		i += rpcrdma_deregister_external(
+			&req->rl_segments[i], r_xprt, NULL);
+	}
+
+	if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
+		rep->rr_func = NULL;	/* abandon the callback */
+		req->rl_reply = NULL;
+	}
+
+	if (req->rl_iov.length == 0) {	/* see allocate above */
+		struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
+		oreq->rl_reply = req->rl_reply;
+		(void) rpcrdma_deregister_internal(&r_xprt->rx_ia,
+						   req->rl_handle,
+						   &req->rl_iov);
+		kfree(req);
+		req = oreq;
+	}
+
+	/* Put back request+reply buffers */
+	rpcrdma_buffer_put(req);
+}
+
+/*
+ * send_request invokes the meat of RPC RDMA. It must do the following:
+ *  1.  Marshal the RPC request into an RPC RDMA request, which means
+ *	putting a header in front of data, and creating IOVs for RDMA
+ *	from those in the request.
+ *  2.  In marshaling, detect opportunities for RDMA, and use them.
+ *  3.  Post a recv message to set up asynch completion, then send
+ *	the request (rpcrdma_ep_post).
+ *  4.  No partial sends are possible in the RPC-RDMA protocol (as in UDP).
+ */
+
+static int
+xprt_rdma_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	/* marshal the send itself */
+	if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
+		r_xprt->rx_stats.failed_marshal_count++;
+		dprintk("RPC:       %s: rpcrdma_marshal_req failed\n",
+			__func__);
+		return -EIO;
+	}
+
+	if (req->rl_reply == NULL) 		/* e.g. reconnection */
+		rpcrdma_recv_buffer_get(req);
+
+	if (req->rl_reply) {
+		req->rl_reply->rr_func = rpcrdma_reply_handler;
+		/* this need only be done once, but... */
+		req->rl_reply->rr_xprt = xprt;
+	}
+
+	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) {
+		xprt_disconnect(xprt);
+		return -ENOTCONN;	/* implies disconnect */
+	}
+
+	rqst->rq_bytes_sent = 0;
+	return 0;
+}
+
+static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	long idle_time = 0;
+
+	if (xprt_connected(xprt))
+		idle_time = (long)(jiffies - xprt->last_used) / HZ;
+
+	seq_printf(seq,
+	  "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
+	  "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
+
+	   0,	/* need a local port? */
+	   xprt->stat.bind_count,
+	   xprt->stat.connect_count,
+	   xprt->stat.connect_time,
+	   idle_time,
+	   xprt->stat.sends,
+	   xprt->stat.recvs,
+	   xprt->stat.bad_xids,
+	   xprt->stat.req_u,
+	   xprt->stat.bklog_u,
+
+	   r_xprt->rx_stats.read_chunk_count,
+	   r_xprt->rx_stats.write_chunk_count,
+	   r_xprt->rx_stats.reply_chunk_count,
+	   r_xprt->rx_stats.total_rdma_request,
+	   r_xprt->rx_stats.total_rdma_reply,
+	   r_xprt->rx_stats.pullup_copy_count,
+	   r_xprt->rx_stats.fixup_copy_count,
+	   r_xprt->rx_stats.hardway_register_count,
+	   r_xprt->rx_stats.failed_marshal_count,
+	   r_xprt->rx_stats.bad_reply_count);
+}
+
+/*
+ * Plumbing for rpc transport switch and kernel module
+ */
+
+static struct rpc_xprt_ops xprt_rdma_procs = {
+	.reserve_xprt		= xprt_rdma_reserve_xprt,
+	.release_xprt		= xprt_release_xprt_cong, /* sunrpc/xprt.c */
+	.release_request	= xprt_release_rqst_cong,       /* ditto */
+	.set_retrans_timeout	= xprt_set_retrans_timeout_def, /* ditto */
+	.rpcbind		= rpcb_getport_async,	/* sunrpc/rpcb_clnt.c */
+	.set_port		= xprt_rdma_set_port,
+	.connect		= xprt_rdma_connect,
+	.buf_alloc		= xprt_rdma_allocate,
+	.buf_free		= xprt_rdma_free,
+	.send_request		= xprt_rdma_send_request,
+	.close			= xprt_rdma_close,
+	.destroy		= xprt_rdma_destroy,
+	.print_stats		= xprt_rdma_print_stats
+};
+
+static struct xprt_class xprt_rdma = {
+	.list			= LIST_HEAD_INIT(xprt_rdma.list),
+	.name			= "rdma",
+	.owner			= THIS_MODULE,
+	.ident			= XPRT_TRANSPORT_RDMA,
+	.setup			= xprt_setup_rdma,
+};
+
+static void __exit xprt_rdma_cleanup(void)
+{
+	int rc;
+
+	dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+#ifdef RPC_DEBUG
+	if (sunrpc_table_header) {
+		unregister_sysctl_table(sunrpc_table_header);
+		sunrpc_table_header = NULL;
+	}
+#endif
+	rc = xprt_unregister_transport(&xprt_rdma);
+	if (rc)
+		dprintk("RPC:       %s: xprt_unregister returned %i\n",
+			__func__, rc);
+}
+
+static int __init xprt_rdma_init(void)
+{
+	int rc;
+
+	rc = xprt_register_transport(&xprt_rdma);
+
+	if (rc)
+		return rc;
+
+	dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n");
+
+	dprintk(KERN_INFO "Defaults:\n");
+	dprintk(KERN_INFO "\tSlots %d\n"
+		"\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
+		xprt_rdma_slot_table_entries,
+		xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
+	dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n",
+		xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
+
+#ifdef RPC_DEBUG
+	if (!sunrpc_table_header)
+		sunrpc_table_header = register_sysctl_table(sunrpc_table);
+#endif
+	return 0;
+}
+
+module_init(xprt_rdma_init);
+module_exit(xprt_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
new file mode 100644
index 000000000000..0baf53381987
--- /dev/null
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -0,0 +1,37 @@
+/*
+ * Placeholders for subsequent patches
+ */
+
+#include "xprt_rdma.h"
+
+int rpcrdma_ia_open(struct rpcrdma_xprt *a, struct sockaddr *b, int c)
+{ return EINVAL; }
+void rpcrdma_ia_close(struct rpcrdma_ia *a) { }
+int rpcrdma_ep_create(struct rpcrdma_ep *a, struct rpcrdma_ia *b,
+struct rpcrdma_create_data_internal *c) { return EINVAL; }
+int rpcrdma_ep_destroy(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
+{ return EINVAL; }
+int rpcrdma_ep_connect(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
+{ return EINVAL; }
+int rpcrdma_ep_disconnect(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
+{ return EINVAL; }
+int rpcrdma_ep_post(struct rpcrdma_ia *a, struct rpcrdma_ep *b,
+struct rpcrdma_req *c) { return EINVAL; }
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *a, struct rpcrdma_ep *b,
+struct rpcrdma_rep *c) { return EINVAL; }
+int rpcrdma_buffer_create(struct rpcrdma_buffer *a, struct rpcrdma_ep *b,
+struct rpcrdma_ia *c, struct rpcrdma_create_data_internal *d) { return EINVAL; }
+void rpcrdma_buffer_destroy(struct rpcrdma_buffer *a) { }
+struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *a)
+{ return NULL; }
+void rpcrdma_buffer_put(struct rpcrdma_req *a) { }
+void rpcrdma_recv_buffer_get(struct rpcrdma_req *a) { }
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *a) { }
+int rpcrdma_register_internal(struct rpcrdma_ia *a, void *b, int c,
+struct ib_mr **d, struct ib_sge *e) { return EINVAL; }
+int rpcrdma_deregister_internal(struct rpcrdma_ia *a, struct ib_mr *b,
+struct ib_sge *c) { return EINVAL; }
+int rpcrdma_register_external(struct rpcrdma_mr_seg *a, int b, int c,
+struct rpcrdma_xprt *d) { return EINVAL; }
+int rpcrdma_deregister_external(struct rpcrdma_mr_seg *a,
+struct rpcrdma_xprt *b, void *c) { return EINVAL; }
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
new file mode 100644
index 000000000000..2427822f8bd4
--- /dev/null
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
+#define _LINUX_SUNRPC_XPRT_RDMA_H
+
+#include <linux/wait.h> 		/* wait_queue_head_t, etc */
+#include <linux/spinlock.h> 		/* spinlock_t, etc */
+#include <asm/atomic.h>			/* atomic_t, etc */
+
+#include <rdma/rdma_cm.h>		/* RDMA connection api */
+#include <rdma/ib_verbs.h>		/* RDMA verbs api */
+
+#include <linux/sunrpc/clnt.h> 		/* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma.h> 	/* RPC/RDMA protocol */
+#include <linux/sunrpc/xprtrdma.h> 	/* xprt parameters */
+
+/*
+ * Interface Adapter -- one per transport instance
+ */
+struct rpcrdma_ia {
+	struct rdma_cm_id 	*ri_id;
+	struct ib_pd		*ri_pd;
+	struct ib_mr		*ri_bind_mem;
+	struct completion	ri_done;
+	int			ri_async_rc;
+	enum rpcrdma_memreg	ri_memreg_strategy;
+};
+
+/*
+ * RDMA Endpoint -- one per transport instance
+ */
+
+struct rpcrdma_ep {
+	atomic_t		rep_cqcount;
+	int			rep_cqinit;
+	int			rep_connected;
+	struct rpcrdma_ia	*rep_ia;
+	struct ib_cq		*rep_cq;
+	struct ib_qp_init_attr	rep_attr;
+	wait_queue_head_t 	rep_connect_wait;
+	struct ib_sge		rep_pad;	/* holds zeroed pad */
+	struct ib_mr		*rep_pad_mr;	/* holds zeroed pad */
+	void			(*rep_func)(struct rpcrdma_ep *);
+	struct rpc_xprt		*rep_xprt;	/* for rep_func */
+	struct rdma_conn_param	rep_remote_cma;
+	struct sockaddr_storage	rep_remote_addr;
+};
+
+#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
+#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+
+/*
+ * struct rpcrdma_rep -- this structure encapsulates state required to recv
+ * and complete a reply, asychronously. It needs several pieces of
+ * state:
+ *   o recv buffer (posted to provider)
+ *   o ib_sge (also donated to provider)
+ *   o status of reply (length, success or not)
+ *   o bookkeeping state to get run by tasklet (list, etc)
+ *
+ * These are allocated during initialization, per-transport instance;
+ * however, the tasklet execution list itself is global, as it should
+ * always be pretty short.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ */
+
+/* temporary static scatter/gather max */
+#define RPCRDMA_MAX_DATA_SEGS	(8)	/* max scatter/gather */
+#define RPCRDMA_MAX_SEGS 	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+#define MAX_RPCRDMAHDR	(\
+	/* max supported RPC/RDMA header */ \
+	sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
+	(sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
+
+struct rpcrdma_buffer;
+
+struct rpcrdma_rep {
+	unsigned int	rr_len;		/* actual received reply length */
+	struct rpcrdma_buffer *rr_buffer; /* home base for this structure */
+	struct rpc_xprt	*rr_xprt;	/* needed for request/reply matching */
+	void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
+	struct list_head rr_list;	/* tasklet list */
+	wait_queue_head_t rr_unbind;	/* optional unbind wait */
+	struct ib_sge	rr_iov;		/* for posting */
+	struct ib_mr	*rr_handle;	/* handle for mem in rr_iov */
+	char	rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
+};
+
+/*
+ * struct rpcrdma_req -- structure central to the request/reply sequence.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ *
+ * It includes pre-registered buffer memory for send AND recv.
+ * The recv buffer, however, is not owned by this structure, and
+ * is "donated" to the hardware when a recv is posted. When a
+ * reply is handled, the recv buffer used is given back to the
+ * struct rpcrdma_req associated with the request.
+ *
+ * In addition to the basic memory, this structure includes an array
+ * of iovs for send operations. The reason is that the iovs passed to
+ * ib_post_{send,recv} must not be modified until the work request
+ * completes.
+ *
+ * NOTES:
+ *   o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
+ *     marshal. The number needed varies depending on the iov lists that
+ *     are passed to us, the memory registration mode we are in, and if
+ *     physical addressing is used, the layout.
+ */
+
+struct rpcrdma_mr_seg {		/* chunk descriptors */
+	union {				/* chunk memory handles */
+		struct ib_mr	*rl_mr;		/* if registered directly */
+		struct rpcrdma_mw {		/* if registered from region */
+			union {
+				struct ib_mw	*mw;
+				struct ib_fmr	*fmr;
+			} r;
+			struct list_head mw_list;
+		} *rl_mw;
+	} mr_chunk;
+	u64		mr_base;	/* registration result */
+	u32		mr_rkey;	/* registration result */
+	u32		mr_len;		/* length of chunk or segment */
+	int		mr_nsegs;	/* number of segments in chunk or 0 */
+	enum dma_data_direction	mr_dir;	/* segment mapping direction */
+	dma_addr_t	mr_dma;		/* segment mapping address */
+	size_t		mr_dmalen;	/* segment mapping length */
+	struct page	*mr_page;	/* owning page, if any */
+	char		*mr_offset;	/* kva if no page, else offset */
+};
+
+struct rpcrdma_req {
+	size_t 		rl_size;	/* actual length of buffer */
+	unsigned int	rl_niovs;	/* 0, 2 or 4 */
+	unsigned int	rl_nchunks;	/* non-zero if chunks */
+	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
+	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
+	struct ib_sge	rl_send_iov[4];	/* for active requests */
+	struct ib_sge	rl_iov;		/* for posting */
+	struct ib_mr	*rl_handle;	/* handle for mem in rl_iov */
+	char		rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */
+	__u32 		rl_xdr_buf[0];	/* start of returned rpc rq_buffer */
+};
+#define rpcr_to_rdmar(r) \
+	container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])
+
+/*
+ * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
+ * inline requests/replies, and client/server credits.
+ *
+ * One of these is associated with a transport instance
+ */
+struct rpcrdma_buffer {
+	spinlock_t	rb_lock;	/* protects indexes */
+	atomic_t	rb_credits;	/* most recent server credits */
+	unsigned long	rb_cwndscale;	/* cached framework rpc_cwndscale */
+	int		rb_max_requests;/* client max requests */
+	struct list_head rb_mws;	/* optional memory windows/fmrs */
+	int		rb_send_index;
+	struct rpcrdma_req	**rb_send_bufs;
+	int		rb_recv_index;
+	struct rpcrdma_rep	**rb_recv_bufs;
+	char		*rb_pool;
+};
+#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
+
+/*
+ * Internal structure for transport instance creation. This
+ * exists primarily for modularity.
+ *
+ * This data should be set with mount options
+ */
+struct rpcrdma_create_data_internal {
+	struct sockaddr_storage	addr;	/* RDMA server address */
+	unsigned int	max_requests;	/* max requests (slots) in flight */
+	unsigned int	rsize;		/* mount rsize - max read hdr+data */
+	unsigned int	wsize;		/* mount wsize - max write hdr+data */
+	unsigned int	inline_rsize;	/* max non-rdma read data payload */
+	unsigned int	inline_wsize;	/* max non-rdma write data payload */
+	unsigned int	padding;	/* non-rdma write header padding */
+};
+
+#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
+	(rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize)
+
+#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
+	(rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize)
+
+#define RPCRDMA_INLINE_PAD_VALUE(rq)\
+	rpcx_to_rdmad(rq->rq_task->tk_xprt).padding
+
+/*
+ * Statistics for RPCRDMA
+ */
+struct rpcrdma_stats {
+	unsigned long		read_chunk_count;
+	unsigned long		write_chunk_count;
+	unsigned long		reply_chunk_count;
+
+	unsigned long long	total_rdma_request;
+	unsigned long long	total_rdma_reply;
+
+	unsigned long long	pullup_copy_count;
+	unsigned long long	fixup_copy_count;
+	unsigned long		hardway_register_count;
+	unsigned long		failed_marshal_count;
+	unsigned long		bad_reply_count;
+};
+
+/*
+ * RPCRDMA transport -- encapsulates the structures above for
+ * integration with RPC.
+ *
+ * The contained structures are embedded, not pointers,
+ * for convenience. This structure need not be visible externally.
+ *
+ * It is allocated and initialized during mount, and released
+ * during unmount.
+ */
+struct rpcrdma_xprt {
+	struct rpc_xprt		xprt;
+	struct rpcrdma_ia	rx_ia;
+	struct rpcrdma_ep	rx_ep;
+	struct rpcrdma_buffer	rx_buf;
+	struct rpcrdma_create_data_internal rx_data;
+	struct delayed_work	rdma_connect;
+	struct rpcrdma_stats	rx_stats;
+};
+
+#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
+#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
+
+/*
+ * Interface Adapter calls - xprtrdma/verbs.c
+ */
+int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
+void rpcrdma_ia_close(struct rpcrdma_ia *);
+
+/*
+ * Endpoint calls - xprtrdma/verbs.c
+ */
+int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
+				struct rpcrdma_create_data_internal *);
+int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+
+int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
+				struct rpcrdma_req *);
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
+				struct rpcrdma_rep *);
+
+/*
+ * Buffer calls - xprtrdma/verbs.c
+ */
+int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
+				struct rpcrdma_ia *,
+				struct rpcrdma_create_data_internal *);
+void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
+
+struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
+void rpcrdma_buffer_put(struct rpcrdma_req *);
+void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+
+int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
+				struct ib_mr **, struct ib_sge *);
+int rpcrdma_deregister_internal(struct rpcrdma_ia *,
+				struct ib_mr *, struct ib_sge *);
+
+int rpcrdma_register_external(struct rpcrdma_mr_seg *,
+				int, int, struct rpcrdma_xprt *);
+int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
+				struct rpcrdma_xprt *, void *);
+
+/*
+ * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
+ */
+void rpcrdma_conn_func(struct rpcrdma_ep *);
+void rpcrdma_reply_handler(struct rpcrdma_rep *);
+
+/*
+ * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
+ */
+int rpcrdma_marshal_req(struct rpc_rqst *);
+
+#endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */
-- 
cgit v1.2.3


From e96018280cb36210f4c69663561825114a57e7e1 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:50:42 -0400
Subject: RPCRDMA: rpc rdma protocol implementation

This implements the marshaling and unmarshaling of the rpcrdma transport
headers. Connection management is also addressed.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 867 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 863 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index b0587f3a5d77..12db63580427 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1,9 +1,868 @@
 /*
- * Placeholders for subsequent patches
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * rpc_rdma.c
+ *
+ * This file contains the guts of the RPC RDMA protocol, and
+ * does marshaling/unmarshaling, etc. It is also where interfacing
+ * to the Linux RPC framework lives.
  */
 
 #include "xprt_rdma.h"
 
-void rpcrdma_conn_func(struct rpcrdma_ep *a) { }
-void rpcrdma_reply_handler(struct rpcrdma_rep *a) { }
-int rpcrdma_marshal_req(struct rpc_rqst *a) { return EINVAL; }
+#include <linux/highmem.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,
+	rpcrdma_readch,
+	rpcrdma_areadch,
+	rpcrdma_writech,
+	rpcrdma_replych
+};
+
+#ifdef RPC_DEBUG
+static const char transfertypes[][12] = {
+	"pure inline",	/* no chunks */
+	" read chunk",	/* some argument via rdma read */
+	"*read chunk",	/* entire request via rdma read */
+	"write chunk",	/* some result via rdma write */
+	"reply chunk"	/* entire reply via rdma write */
+};
+#endif
+
+/*
+ * Chunk assembly from upper layer xdr_buf.
+ *
+ * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+ * elements. Segments are then coalesced when registered, if possible
+ * within the selected memreg mode.
+ *
+ * Note, this routine is never called if the connection's memory
+ * registration strategy is 0 (bounce buffers).
+ */
+
+static int
+rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, int pos,
+	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+{
+	int len, n = 0, p;
+
+	if (pos == 0 && xdrbuf->head[0].iov_len) {
+		seg[n].mr_page = NULL;
+		seg[n].mr_offset = xdrbuf->head[0].iov_base;
+		seg[n].mr_len = xdrbuf->head[0].iov_len;
+		pos += xdrbuf->head[0].iov_len;
+		++n;
+	}
+
+	if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) {
+		if (n == nsegs)
+			return 0;
+		seg[n].mr_page = xdrbuf->pages[0];
+		seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base;
+		seg[n].mr_len = min_t(u32,
+			PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len);
+		len = xdrbuf->page_len - seg[n].mr_len;
+		pos += len;
+		++n;
+		p = 1;
+		while (len > 0) {
+			if (n == nsegs)
+				return 0;
+			seg[n].mr_page = xdrbuf->pages[p];
+			seg[n].mr_offset = NULL;
+			seg[n].mr_len = min_t(u32, PAGE_SIZE, len);
+			len -= seg[n].mr_len;
+			++n;
+			++p;
+		}
+	}
+
+	if (pos < xdrbuf->len && xdrbuf->tail[0].iov_len) {
+		if (n == nsegs)
+			return 0;
+		seg[n].mr_page = NULL;
+		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
+		seg[n].mr_len = xdrbuf->tail[0].iov_len;
+		pos += xdrbuf->tail[0].iov_len;
+		++n;
+	}
+
+	if (pos < xdrbuf->len)
+		dprintk("RPC:       %s: marshaled only %d of %d\n",
+				__func__, pos, xdrbuf->len);
+
+	return n;
+}
+
+/*
+ * Create read/write chunk lists, and reply chunks, for RDMA
+ *
+ *   Assume check against THRESHOLD has been done, and chunks are required.
+ *   Assume only encoding one list entry for read|write chunks. The NFSv3
+ *     protocol is simple enough to allow this as it only has a single "bulk
+ *     result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
+ *     RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
+ *
+ * When used for a single reply chunk (which is a special write
+ * chunk used for the entire reply, rather than just the data), it
+ * is used primarily for READDIR and READLINK which would otherwise
+ * be severely size-limited by a small rdma inline read max. The server
+ * response will come back as an RDMA Write, followed by a message
+ * of type RDMA_NOMSG carrying the xid and length. As a result, reply
+ * chunks do not provide data alignment, however they do not require
+ * "fixup" (moving the response to the upper layer buffer) either.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ *  Read chunklist (a linked list):
+ *   N elements, position P (same P for all chunks of same arg!):
+ *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
+ *
+ *  Write chunklist (a list of (one) counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO - 0
+ *
+ *  Reply chunk (a counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO
+ */
+
+static unsigned int
+rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+		struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+{
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt);
+	int nsegs, nchunks = 0;
+	int pos;
+	struct rpcrdma_mr_seg *seg = req->rl_segments;
+	struct rpcrdma_read_chunk *cur_rchunk = NULL;
+	struct rpcrdma_write_array *warray = NULL;
+	struct rpcrdma_write_chunk *cur_wchunk = NULL;
+	u32 *iptr = headerp->rm_body.rm_chunks;
+
+	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
+		/* a read chunk - server will RDMA Read our memory */
+		cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
+	} else {
+		/* a write or reply chunk - server will RDMA Write our memory */
+		*iptr++ = xdr_zero;	/* encode a NULL read chunk list */
+		if (type == rpcrdma_replych)
+			*iptr++ = xdr_zero;	/* a NULL write chunk list */
+		warray = (struct rpcrdma_write_array *) iptr;
+		cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
+	}
+
+	if (type == rpcrdma_replych || type == rpcrdma_areadch)
+		pos = 0;
+	else
+		pos = target->head[0].iov_len;
+
+	nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+	if (nsegs == 0)
+		return 0;
+
+	do {
+		/* bind/register the memory, then build chunk from result. */
+		int n = rpcrdma_register_external(seg, nsegs,
+						cur_wchunk != NULL, r_xprt);
+		if (n <= 0)
+			goto out;
+		if (cur_rchunk) {	/* read */
+			cur_rchunk->rc_discrim = xdr_one;
+			/* all read chunks have the same "position" */
+			cur_rchunk->rc_position = htonl(pos);
+			cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey);
+			cur_rchunk->rc_target.rs_length = htonl(seg->mr_len);
+			xdr_encode_hyper(
+					(u32 *)&cur_rchunk->rc_target.rs_offset,
+					seg->mr_base);
+			dprintk("RPC:       %s: read chunk "
+				"elem %d@0x%llx:0x%x pos %d (%s)\n", __func__,
+				seg->mr_len, seg->mr_base, seg->mr_rkey, pos,
+				n < nsegs ? "more" : "last");
+			cur_rchunk++;
+			r_xprt->rx_stats.read_chunk_count++;
+		} else {		/* write/reply */
+			cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey);
+			cur_wchunk->wc_target.rs_length = htonl(seg->mr_len);
+			xdr_encode_hyper(
+					(u32 *)&cur_wchunk->wc_target.rs_offset,
+					seg->mr_base);
+			dprintk("RPC:       %s: %s chunk "
+				"elem %d@0x%llx:0x%x (%s)\n", __func__,
+				(type == rpcrdma_replych) ? "reply" : "write",
+				seg->mr_len, seg->mr_base, seg->mr_rkey,
+				n < nsegs ? "more" : "last");
+			cur_wchunk++;
+			if (type == rpcrdma_replych)
+				r_xprt->rx_stats.reply_chunk_count++;
+			else
+				r_xprt->rx_stats.write_chunk_count++;
+			r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+		}
+		nchunks++;
+		seg   += n;
+		nsegs -= n;
+	} while (nsegs);
+
+	/* success. all failures return above */
+	req->rl_nchunks = nchunks;
+
+	BUG_ON(nchunks == 0);
+
+	/*
+	 * finish off header. If write, marshal discrim and nchunks.
+	 */
+	if (cur_rchunk) {
+		iptr = (u32 *) cur_rchunk;
+		*iptr++ = xdr_zero;	/* finish the read chunk list */
+		*iptr++ = xdr_zero;	/* encode a NULL write chunk list */
+		*iptr++ = xdr_zero;	/* encode a NULL reply chunk */
+	} else {
+		warray->wc_discrim = xdr_one;
+		warray->wc_nchunks = htonl(nchunks);
+		iptr = (u32 *) cur_wchunk;
+		if (type == rpcrdma_writech) {
+			*iptr++ = xdr_zero; /* finish the write chunk list */
+			*iptr++ = xdr_zero; /* encode a NULL reply chunk */
+		}
+	}
+
+	/*
+	 * Return header size.
+	 */
+	return (unsigned char *)iptr - (unsigned char *)headerp;
+
+out:
+	for (pos = 0; nchunks--;)
+		pos += rpcrdma_deregister_external(
+				&req->rl_segments[pos], r_xprt, NULL);
+	return 0;
+}
+
+/*
+ * Copy write data inline.
+ * This function is used for "small" requests. Data which is passed
+ * to RPC via iovecs (or page list) is copied directly into the
+ * pre-registered memory buffer for this request. For small amounts
+ * of data, this is efficient. The cutoff value is tunable.
+ */
+static int
+rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+{
+	int i, npages, curlen;
+	int copy_len;
+	unsigned char *srcp, *destp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+
+	destp = rqst->rq_svec[0].iov_base;
+	curlen = rqst->rq_svec[0].iov_len;
+	destp += curlen;
+	/*
+	 * Do optional padding where it makes sense. Alignment of write
+	 * payload can help the server, if our setting is accurate.
+	 */
+	pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
+	if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
+		pad = 0;	/* don't pad this request */
+
+	dprintk("RPC:       %s: pad %d destp 0x%p len %d hdrlen %d\n",
+		__func__, pad, destp, rqst->rq_slen, curlen);
+
+	copy_len = rqst->rq_snd_buf.page_len;
+	r_xprt->rx_stats.pullup_copy_count += copy_len;
+	npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT;
+	for (i = 0; copy_len && i < npages; i++) {
+		if (i == 0)
+			curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base;
+		else
+			curlen = PAGE_SIZE;
+		if (curlen > copy_len)
+			curlen = copy_len;
+		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
+			__func__, i, destp, copy_len, curlen);
+		srcp = kmap_atomic(rqst->rq_snd_buf.pages[i],
+					KM_SKB_SUNRPC_DATA);
+		if (i == 0)
+			memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen);
+		else
+			memcpy(destp, srcp, curlen);
+		kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA);
+		rqst->rq_svec[0].iov_len += curlen;
+		destp += curlen;
+		copy_len -= curlen;
+	}
+	if (rqst->rq_snd_buf.tail[0].iov_len) {
+		curlen = rqst->rq_snd_buf.tail[0].iov_len;
+		if (destp != rqst->rq_snd_buf.tail[0].iov_base) {
+			memcpy(destp,
+				rqst->rq_snd_buf.tail[0].iov_base, curlen);
+			r_xprt->rx_stats.pullup_copy_count += curlen;
+		}
+		dprintk("RPC:       %s: tail destp 0x%p len %d curlen %d\n",
+			__func__, destp, copy_len, curlen);
+		rqst->rq_svec[0].iov_len += curlen;
+	}
+	/* header now contains entire send message */
+	return pad;
+}
+
+/*
+ * Marshal a request: the primary job of this routine is to choose
+ * the transfer modes. See comments below.
+ *
+ * Uses multiple RDMA IOVs for a request:
+ *  [0] -- RPC RDMA header, which uses memory from the *start* of the
+ *         preregistered buffer that already holds the RPC data in
+ *         its middle.
+ *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
+ *  [2] -- optional padding.
+ *  [3] -- if padded, header only in [1] and data here.
+ */
+
+int
+rpcrdma_marshal_req(struct rpc_rqst *rqst)
+{
+	struct rpc_xprt *xprt = rqst->rq_task->tk_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	char *base;
+	size_t hdrlen, rpclen, padlen;
+	enum rpcrdma_chunktype rtype, wtype;
+	struct rpcrdma_msg *headerp;
+
+	/*
+	 * rpclen gets amount of data in first buffer, which is the
+	 * pre-registered buffer.
+	 */
+	base = rqst->rq_svec[0].iov_base;
+	rpclen = rqst->rq_svec[0].iov_len;
+
+	/* build RDMA header in private area at front */
+	headerp = (struct rpcrdma_msg *) req->rl_base;
+	/* don't htonl XID, it's already done in request */
+	headerp->rm_xid = rqst->rq_xid;
+	headerp->rm_vers = xdr_one;
+	headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests);
+	headerp->rm_type = __constant_htonl(RDMA_MSG);
+
+	/*
+	 * Chunks needed for results?
+	 *
+	 * o If the expected result is under the inline threshold, all ops
+	 *   return as inline (but see later).
+	 * o Large non-read ops return as a single reply chunk.
+	 * o Large read ops return data as write chunk(s), header as inline.
+	 *
+	 * Note: the NFS code sending down multiple result segments implies
+	 * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
+	 */
+
+	/*
+	 * This code can handle read chunks, write chunks OR reply
+	 * chunks -- only one type. If the request is too big to fit
+	 * inline, then we will choose read chunks. If the request is
+	 * a READ, then use write chunks to separate the file data
+	 * into pages; otherwise use reply chunks.
+	 */
+	if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
+		wtype = rpcrdma_noch;
+	else if (rqst->rq_rcv_buf.page_len == 0)
+		wtype = rpcrdma_replych;
+	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+		wtype = rpcrdma_writech;
+	else
+		wtype = rpcrdma_replych;
+
+	/*
+	 * Chunks needed for arguments?
+	 *
+	 * o If the total request is under the inline threshold, all ops
+	 *   are sent as inline.
+	 * o Large non-write ops are sent with the entire message as a
+	 *   single read chunk (protocol 0-position special case).
+	 * o Large write ops transmit data as read chunk(s), header as
+	 *   inline.
+	 *
+	 * Note: the NFS code sending down multiple argument segments
+	 * implies the op is a write.
+	 * TBD check NFSv4 setacl
+	 */
+	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+		rtype = rpcrdma_noch;
+	else if (rqst->rq_snd_buf.page_len == 0)
+		rtype = rpcrdma_areadch;
+	else
+		rtype = rpcrdma_readch;
+
+	/* The following simplification is not true forever */
+	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+		wtype = rpcrdma_noch;
+	BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
+
+	if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
+	    (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
+		/* forced to "pure inline"? */
+		dprintk("RPC:       %s: too much data (%d/%d) for inline\n",
+			__func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
+		return -1;
+	}
+
+	hdrlen = 28; /*sizeof *headerp;*/
+	padlen = 0;
+
+	/*
+	 * Pull up any extra send data into the preregistered buffer.
+	 * When padding is in use and applies to the transfer, insert
+	 * it and change the message type.
+	 */
+	if (rtype == rpcrdma_noch) {
+
+		padlen = rpcrdma_inline_pullup(rqst,
+						RPCRDMA_INLINE_PAD_VALUE(rqst));
+
+		if (padlen) {
+			headerp->rm_type = __constant_htonl(RDMA_MSGP);
+			headerp->rm_body.rm_padded.rm_align =
+				htonl(RPCRDMA_INLINE_PAD_VALUE(rqst));
+			headerp->rm_body.rm_padded.rm_thresh =
+				__constant_htonl(RPCRDMA_INLINE_PAD_THRESH);
+			headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
+			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
+			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
+			hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
+			BUG_ON(wtype != rpcrdma_noch);
+
+		} else {
+			headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+			headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+			headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+			/* new length after pullup */
+			rpclen = rqst->rq_svec[0].iov_len;
+			/*
+			 * Currently we try to not actually use read inline.
+			 * Reply chunks have the desirable property that
+			 * they land, packed, directly in the target buffers
+			 * without headers, so they require no fixup. The
+			 * additional RDMA Write op sends the same amount
+			 * of data, streams on-the-wire and adds no overhead
+			 * on receive. Therefore, we request a reply chunk
+			 * for non-writes wherever feasible and efficient.
+			 */
+			if (wtype == rpcrdma_noch &&
+			    r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
+				wtype = rpcrdma_replych;
+		}
+	}
+
+	/*
+	 * Marshal chunks. This routine will return the header length
+	 * consumed by marshaling.
+	 */
+	if (rtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst,
+					&rqst->rq_snd_buf, headerp, rtype);
+		wtype = rtype;	/* simplify dprintk */
+
+	} else if (wtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst,
+					&rqst->rq_rcv_buf, headerp, wtype);
+	}
+
+	if (hdrlen == 0)
+		return -1;
+
+	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd\n"
+		"                   headerp 0x%p base 0x%p lkey 0x%x\n",
+		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+		headerp, base, req->rl_iov.lkey);
+
+	/*
+	 * initialize send_iov's - normally only two: rdma chunk header and
+	 * single preregistered RPC header buffer, but if padding is present,
+	 * then use a preregistered (and zeroed) pad buffer between the RPC
+	 * header and any write data. In all non-rdma cases, any following
+	 * data has been copied into the RPC header buffer.
+	 */
+	req->rl_send_iov[0].addr = req->rl_iov.addr;
+	req->rl_send_iov[0].length = hdrlen;
+	req->rl_send_iov[0].lkey = req->rl_iov.lkey;
+
+	req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base);
+	req->rl_send_iov[1].length = rpclen;
+	req->rl_send_iov[1].lkey = req->rl_iov.lkey;
+
+	req->rl_niovs = 2;
+
+	if (padlen) {
+		struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+
+		req->rl_send_iov[2].addr = ep->rep_pad.addr;
+		req->rl_send_iov[2].length = padlen;
+		req->rl_send_iov[2].lkey = ep->rep_pad.lkey;
+
+		req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
+		req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
+		req->rl_send_iov[3].lkey = req->rl_iov.lkey;
+
+		req->rl_niovs = 4;
+	}
+
+	return 0;
+}
+
+/*
+ * Chase down a received write or reply chunklist to get length
+ * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
+ */
+static int
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, int max, int wrchunk, u32 **iptrp)
+{
+	unsigned int i, total_len;
+	struct rpcrdma_write_chunk *cur_wchunk;
+
+	i = ntohl(**iptrp);	/* get array count */
+	if (i > max)
+		return -1;
+	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
+	total_len = 0;
+	while (i--) {
+		struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
+		ifdebug(FACILITY) {
+			u64 off;
+			xdr_decode_hyper((u32 *)&seg->rs_offset, &off);
+			dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
+				__func__,
+				ntohl(seg->rs_length),
+				off,
+				ntohl(seg->rs_handle));
+		}
+		total_len += ntohl(seg->rs_length);
+		++cur_wchunk;
+	}
+	/* check and adjust for properly terminated write chunk */
+	if (wrchunk) {
+		u32 *w = (u32 *) cur_wchunk;
+		if (*w++ != xdr_zero)
+			return -1;
+		cur_wchunk = (struct rpcrdma_write_chunk *) w;
+	}
+	if ((char *) cur_wchunk > rep->rr_base + rep->rr_len)
+		return -1;
+
+	*iptrp = (u32 *) cur_wchunk;
+	return total_len;
+}
+
+/*
+ * Scatter inline received data back into provided iov's.
+ */
+static void
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
+{
+	int i, npages, curlen, olen;
+	char *destp;
+
+	curlen = rqst->rq_rcv_buf.head[0].iov_len;
+	if (curlen > copy_len) {	/* write chunk header fixup */
+		curlen = copy_len;
+		rqst->rq_rcv_buf.head[0].iov_len = curlen;
+	}
+
+	dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
+		__func__, srcp, copy_len, curlen);
+
+	/* Shift pointer for first receive segment only */
+	rqst->rq_rcv_buf.head[0].iov_base = srcp;
+	srcp += curlen;
+	copy_len -= curlen;
+
+	olen = copy_len;
+	i = 0;
+	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
+	if (copy_len && rqst->rq_rcv_buf.page_len) {
+		npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base +
+			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
+		for (; i < npages; i++) {
+			if (i == 0)
+				curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base;
+			else
+				curlen = PAGE_SIZE;
+			if (curlen > copy_len)
+				curlen = copy_len;
+			dprintk("RPC:       %s: page %d"
+				" srcp 0x%p len %d curlen %d\n",
+				__func__, i, srcp, copy_len, curlen);
+			destp = kmap_atomic(rqst->rq_rcv_buf.pages[i],
+						KM_SKB_SUNRPC_DATA);
+			if (i == 0)
+				memcpy(destp + rqst->rq_rcv_buf.page_base,
+						srcp, curlen);
+			else
+				memcpy(destp, srcp, curlen);
+			flush_dcache_page(rqst->rq_rcv_buf.pages[i]);
+			kunmap_atomic(destp, KM_SKB_SUNRPC_DATA);
+			srcp += curlen;
+			copy_len -= curlen;
+			if (copy_len == 0)
+				break;
+		}
+		rqst->rq_rcv_buf.page_len = olen - copy_len;
+	} else
+		rqst->rq_rcv_buf.page_len = 0;
+
+	if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
+		curlen = copy_len;
+		if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
+			curlen = rqst->rq_rcv_buf.tail[0].iov_len;
+		if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
+			memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
+		dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
+			__func__, srcp, copy_len, curlen);
+		rqst->rq_rcv_buf.tail[0].iov_len = curlen;
+		copy_len -= curlen; ++i;
+	} else
+		rqst->rq_rcv_buf.tail[0].iov_len = 0;
+
+	if (copy_len)
+		dprintk("RPC:       %s: %d bytes in"
+			" %d extra segments (%d lost)\n",
+			__func__, olen, i, copy_len);
+
+	/* TBD avoid a warning from call_decode() */
+	rqst->rq_private_buf = rqst->rq_rcv_buf;
+}
+
+/*
+ * This function is called when an async event is posted to
+ * the connection which changes the connection state. All it
+ * does at this point is mark the connection up/down, the rpc
+ * timers do the rest.
+ */
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+	struct rpc_xprt *xprt = ep->rep_xprt;
+
+	spin_lock_bh(&xprt->transport_lock);
+	if (ep->rep_connected > 0) {
+		if (!xprt_test_and_set_connected(xprt))
+			xprt_wake_pending_tasks(xprt, 0);
+	} else {
+		if (xprt_test_and_clear_connected(xprt))
+			xprt_wake_pending_tasks(xprt, ep->rep_connected);
+	}
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
+/*
+ * This function is called when memory window unbind which we are waiting
+ * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+ */
+static void
+rpcrdma_unbind_func(struct rpcrdma_rep *rep)
+{
+	wake_up(&rep->rr_unbind);
+}
+
+/*
+ * Called as a tasklet to do req/reply match and complete a request
+ * Errors must result in the RPC task either being awakened, or
+ * allowed to timeout, to discover the errors at that time.
+ */
+void
+rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_msg *headerp;
+	struct rpcrdma_req *req;
+	struct rpc_rqst *rqst;
+	struct rpc_xprt *xprt = rep->rr_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	u32 *iptr;
+	int i, rdmalen, status;
+
+	/* Check status. If bad, signal disconnect and return rep to pool */
+	if (rep->rr_len == ~0U) {
+		rpcrdma_recv_buffer_put(rep);
+		if (r_xprt->rx_ep.rep_connected == 1) {
+			r_xprt->rx_ep.rep_connected = -EIO;
+			rpcrdma_conn_func(&r_xprt->rx_ep);
+		}
+		return;
+	}
+	if (rep->rr_len < 28) {
+		dprintk("RPC:       %s: short/invalid reply\n", __func__);
+		goto repost;
+	}
+	headerp = (struct rpcrdma_msg *) rep->rr_base;
+	if (headerp->rm_vers != xdr_one) {
+		dprintk("RPC:       %s: invalid version %d\n",
+			__func__, ntohl(headerp->rm_vers));
+		goto repost;
+	}
+
+	/* Get XID and try for a match. */
+	spin_lock(&xprt->transport_lock);
+	rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
+	if (rqst == NULL) {
+		spin_unlock(&xprt->transport_lock);
+		dprintk("RPC:       %s: reply 0x%p failed "
+			"to match any request xid 0x%08x len %d\n",
+			__func__, rep, headerp->rm_xid, rep->rr_len);
+repost:
+		r_xprt->rx_stats.bad_reply_count++;
+		rep->rr_func = rpcrdma_reply_handler;
+		if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+			rpcrdma_recv_buffer_put(rep);
+
+		return;
+	}
+
+	/* get request object */
+	req = rpcr_to_rdmar(rqst);
+
+	dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
+		"                   RPC request 0x%p xid 0x%08x\n",
+			__func__, rep, req, rqst, headerp->rm_xid);
+
+	BUG_ON(!req || req->rl_reply);
+
+	/* from here on, the reply is no longer an orphan */
+	req->rl_reply = rep;
+
+	/* check for expected message types */
+	/* The order of some of these tests is important. */
+	switch (headerp->rm_type) {
+	case __constant_htonl(RDMA_MSG):
+		/* never expect read chunks */
+		/* never expect reply chunks (two ways to check) */
+		/* never expect write chunks without having offered RDMA */
+		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+		    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
+		     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
+		    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
+		     req->rl_nchunks == 0))
+			goto badheader;
+		if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
+			/* count any expected write chunks in read reply */
+			/* start at write chunk array count */
+			iptr = &headerp->rm_body.rm_chunks[2];
+			rdmalen = rpcrdma_count_chunks(rep,
+						req->rl_nchunks, 1, &iptr);
+			/* check for validity, and no reply chunk after */
+			if (rdmalen < 0 || *iptr++ != xdr_zero)
+				goto badheader;
+			rep->rr_len -=
+			    ((unsigned char *)iptr - (unsigned char *)headerp);
+			status = rep->rr_len + rdmalen;
+			r_xprt->rx_stats.total_rdma_reply += rdmalen;
+		} else {
+			/* else ordinary inline */
+			iptr = (u32 *)((unsigned char *)headerp + 28);
+			rep->rr_len -= 28; /*sizeof *headerp;*/
+			status = rep->rr_len;
+		}
+		/* Fix up the rpc results for upper layer */
+		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
+		break;
+
+	case __constant_htonl(RDMA_NOMSG):
+		/* never expect read or write chunks, always reply chunks */
+		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
+		    headerp->rm_body.rm_chunks[2] != xdr_one ||
+		    req->rl_nchunks == 0)
+			goto badheader;
+		iptr = (u32 *)((unsigned char *)headerp + 28);
+		rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+		if (rdmalen < 0)
+			goto badheader;
+		r_xprt->rx_stats.total_rdma_reply += rdmalen;
+		/* Reply chunk buffer already is the reply vector - no fixup. */
+		status = rdmalen;
+		break;
+
+badheader:
+	default:
+		dprintk("%s: invalid rpcrdma reply header (type %d):"
+				" chunks[012] == %d %d %d"
+				" expected chunks <= %d\n",
+				__func__, ntohl(headerp->rm_type),
+				headerp->rm_body.rm_chunks[0],
+				headerp->rm_body.rm_chunks[1],
+				headerp->rm_body.rm_chunks[2],
+				req->rl_nchunks);
+		status = -EIO;
+		r_xprt->rx_stats.bad_reply_count++;
+		break;
+	}
+
+	/* If using mw bind, start the deregister process now. */
+	/* (Note: if mr_free(), cannot perform it here, in tasklet context) */
+	if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
+	case RPCRDMA_MEMWINDOWS:
+		for (i = 0; req->rl_nchunks-- > 1;)
+			i += rpcrdma_deregister_external(
+				&req->rl_segments[i], r_xprt, NULL);
+		/* Optionally wait (not here) for unbinds to complete */
+		rep->rr_func = rpcrdma_unbind_func;
+		(void) rpcrdma_deregister_external(&req->rl_segments[i],
+						   r_xprt, rep);
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+		for (i = 0; req->rl_nchunks--;)
+			i += rpcrdma_deregister_external(&req->rl_segments[i],
+							 r_xprt, NULL);
+		break;
+	default:
+		break;
+	}
+
+	dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+			__func__, xprt, rqst, status);
+	xprt_complete_rqst(rqst->rq_task, status);
+	spin_unlock(&xprt->transport_lock);
+}
-- 
cgit v1.2.3


From c56c65fb67d6461f6059dd83b1750a1973a91185 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Mon, 10 Sep 2007 13:51:18 -0400
Subject: RPCRDMA: rpc rdma verbs interface implementation

This implements the interface from rpcrdma to the RDMA verbs interface
supported by Infniband and iWARP.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: James Lentini <jlentini@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 1653 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 1621 insertions(+), 32 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 0baf53381987..9ec8ca4f6028 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1,37 +1,1626 @@
 /*
- * Placeholders for subsequent patches
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/*
+ * verbs.c
+ *
+ * Encapsulates the major functions managing:
+ *  o adapters
+ *  o endpoints
+ *  o connections
+ *  o buffer memory
+ */
+
+#include <linux/pci.h>	/* for Tavor hack below */
+
 #include "xprt_rdma.h"
 
-int rpcrdma_ia_open(struct rpcrdma_xprt *a, struct sockaddr *b, int c)
-{ return EINVAL; }
-void rpcrdma_ia_close(struct rpcrdma_ia *a) { }
-int rpcrdma_ep_create(struct rpcrdma_ep *a, struct rpcrdma_ia *b,
-struct rpcrdma_create_data_internal *c) { return EINVAL; }
-int rpcrdma_ep_destroy(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
-{ return EINVAL; }
-int rpcrdma_ep_connect(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
-{ return EINVAL; }
-int rpcrdma_ep_disconnect(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
-{ return EINVAL; }
-int rpcrdma_ep_post(struct rpcrdma_ia *a, struct rpcrdma_ep *b,
-struct rpcrdma_req *c) { return EINVAL; }
-int rpcrdma_ep_post_recv(struct rpcrdma_ia *a, struct rpcrdma_ep *b,
-struct rpcrdma_rep *c) { return EINVAL; }
-int rpcrdma_buffer_create(struct rpcrdma_buffer *a, struct rpcrdma_ep *b,
-struct rpcrdma_ia *c, struct rpcrdma_create_data_internal *d) { return EINVAL; }
-void rpcrdma_buffer_destroy(struct rpcrdma_buffer *a) { }
-struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *a)
-{ return NULL; }
-void rpcrdma_buffer_put(struct rpcrdma_req *a) { }
-void rpcrdma_recv_buffer_get(struct rpcrdma_req *a) { }
-void rpcrdma_recv_buffer_put(struct rpcrdma_rep *a) { }
-int rpcrdma_register_internal(struct rpcrdma_ia *a, void *b, int c,
-struct ib_mr **d, struct ib_sge *e) { return EINVAL; }
-int rpcrdma_deregister_internal(struct rpcrdma_ia *a, struct ib_mr *b,
-struct ib_sge *c) { return EINVAL; }
-int rpcrdma_register_external(struct rpcrdma_mr_seg *a, int b, int c,
-struct rpcrdma_xprt *d) { return EINVAL; }
-int rpcrdma_deregister_external(struct rpcrdma_mr_seg *a,
-struct rpcrdma_xprt *b, void *c) { return EINVAL; }
+/*
+ * Globals/Macros
+ */
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+/*
+ * internal functions
+ */
+
+/*
+ * handle replies in tasklet context, using a single, global list
+ * rdma tasklet function -- just turn around and call the func
+ * for all replies on the list
+ */
+
+static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
+static LIST_HEAD(rpcrdma_tasklets_g);
+
+static void
+rpcrdma_run_tasklet(unsigned long data)
+{
+	struct rpcrdma_rep *rep;
+	void (*func)(struct rpcrdma_rep *);
+	unsigned long flags;
+
+	data = data;
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	while (!list_empty(&rpcrdma_tasklets_g)) {
+		rep = list_entry(rpcrdma_tasklets_g.next,
+				 struct rpcrdma_rep, rr_list);
+		list_del(&rep->rr_list);
+		func = rep->rr_func;
+		rep->rr_func = NULL;
+		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+
+		if (func)
+			func(rep);
+		else
+			rpcrdma_recv_buffer_put(rep);
+
+		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	}
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+}
+
+static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
+
+static inline void
+rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+	tasklet_schedule(&rpcrdma_tasklet_g);
+}
+
+static void
+rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
+{
+	struct rpcrdma_ep *ep = context;
+
+	dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
+		__func__, event->event, event->device->name, context);
+	if (ep->rep_connected == 1) {
+		ep->rep_connected = -EIO;
+		ep->rep_func(ep);
+		wake_up_all(&ep->rep_connect_wait);
+	}
+}
+
+static void
+rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
+{
+	struct rpcrdma_ep *ep = context;
+
+	dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
+		__func__, event->event, event->device->name, context);
+	if (ep->rep_connected == 1) {
+		ep->rep_connected = -EIO;
+		ep->rep_func(ep);
+		wake_up_all(&ep->rep_connect_wait);
+	}
+}
+
+static inline
+void rpcrdma_event_process(struct ib_wc *wc)
+{
+	struct rpcrdma_rep *rep =
+			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;
+
+	dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
+		__func__, rep, wc->status, wc->opcode, wc->byte_len);
+
+	if (!rep) /* send or bind completion that we don't care about */
+		return;
+
+	if (IB_WC_SUCCESS != wc->status) {
+		dprintk("RPC:       %s: %s WC status %X, connection lost\n",
+			__func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
+			 wc->status);
+		rep->rr_len = ~0U;
+		rpcrdma_schedule_tasklet(rep);
+		return;
+	}
+
+	switch (wc->opcode) {
+	case IB_WC_RECV:
+		rep->rr_len = wc->byte_len;
+		ib_dma_sync_single_for_cpu(
+			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
+		/* Keep (only) the most recent credits, after check validity */
+		if (rep->rr_len >= 16) {
+			struct rpcrdma_msg *p =
+					(struct rpcrdma_msg *) rep->rr_base;
+			unsigned int credits = ntohl(p->rm_credit);
+			if (credits == 0) {
+				dprintk("RPC:       %s: server"
+					" dropped credits to 0!\n", __func__);
+				/* don't deadlock */
+				credits = 1;
+			} else if (credits > rep->rr_buffer->rb_max_requests) {
+				dprintk("RPC:       %s: server"
+					" over-crediting: %d (%d)\n",
+					__func__, credits,
+					rep->rr_buffer->rb_max_requests);
+				credits = rep->rr_buffer->rb_max_requests;
+			}
+			atomic_set(&rep->rr_buffer->rb_credits, credits);
+		}
+		/* fall through */
+	case IB_WC_BIND_MW:
+		rpcrdma_schedule_tasklet(rep);
+		break;
+	default:
+		dprintk("RPC:       %s: unexpected WC event %X\n",
+			__func__, wc->opcode);
+		break;
+	}
+}
+
+static inline int
+rpcrdma_cq_poll(struct ib_cq *cq)
+{
+	struct ib_wc wc;
+	int rc;
+
+	for (;;) {
+		rc = ib_poll_cq(cq, 1, &wc);
+		if (rc < 0) {
+			dprintk("RPC:       %s: ib_poll_cq failed %i\n",
+				__func__, rc);
+			return rc;
+		}
+		if (rc == 0)
+			break;
+
+		rpcrdma_event_process(&wc);
+	}
+
+	return 0;
+}
+
+/*
+ * rpcrdma_cq_event_upcall
+ *
+ * This upcall handles recv, send, bind and unbind events.
+ * It is reentrant but processes single events in order to maintain
+ * ordering of receives to keep server credits.
+ *
+ * It is the responsibility of the scheduled tasklet to return
+ * recv buffers to the pool. NOTE: this affects synchronization of
+ * connection shutdown. That is, the structures required for
+ * the completion of the reply handler must remain intact until
+ * all memory has been reclaimed.
+ *
+ * Note that send events are suppressed and do not result in an upcall.
+ */
+static void
+rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
+{
+	int rc;
+
+	rc = rpcrdma_cq_poll(cq);
+	if (rc)
+		return;
+
+	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (rc) {
+		dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
+			__func__, rc);
+		return;
+	}
+
+	rpcrdma_cq_poll(cq);
+}
+
+#ifdef RPC_DEBUG
+static const char * const conn[] = {
+	"address resolved",
+	"address error",
+	"route resolved",
+	"route error",
+	"connect request",
+	"connect response",
+	"connect error",
+	"unreachable",
+	"rejected",
+	"established",
+	"disconnected",
+	"device removal"
+};
+#endif
+
+static int
+rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct rpcrdma_xprt *xprt = id->context;
+	struct rpcrdma_ia *ia = &xprt->rx_ia;
+	struct rpcrdma_ep *ep = &xprt->rx_ep;
+	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
+	struct ib_qp_attr attr;
+	struct ib_qp_init_attr iattr;
+	int connstate = 0;
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		ia->ri_async_rc = -EHOSTUNREACH;
+		dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
+			__func__, ep);
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		ia->ri_async_rc = -ENETUNREACH;
+		dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
+			__func__, ep);
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ESTABLISHED:
+		connstate = 1;
+		ib_query_qp(ia->ri_id->qp, &attr,
+			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
+			&iattr);
+		dprintk("RPC:       %s: %d responder resources"
+			" (%d initiator)\n",
+			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
+		goto connected;
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		connstate = -ENOTCONN;
+		goto connected;
+	case RDMA_CM_EVENT_UNREACHABLE:
+		connstate = -ENETDOWN;
+		goto connected;
+	case RDMA_CM_EVENT_REJECTED:
+		connstate = -ECONNREFUSED;
+		goto connected;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		connstate = -ECONNABORTED;
+		goto connected;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		connstate = -ENODEV;
+connected:
+		dprintk("RPC:       %s: %s: %u.%u.%u.%u:%u"
+			" (ep 0x%p event 0x%x)\n",
+			__func__,
+			(event->event <= 11) ? conn[event->event] :
+						"unknown connection error",
+			NIPQUAD(addr->sin_addr.s_addr),
+			ntohs(addr->sin_port),
+			ep, event->event);
+		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
+		dprintk("RPC:       %s: %sconnected\n",
+					__func__, connstate > 0 ? "" : "dis");
+		ep->rep_connected = connstate;
+		ep->rep_func(ep);
+		wake_up_all(&ep->rep_connect_wait);
+		break;
+	default:
+		ia->ri_async_rc = -EINVAL;
+		dprintk("RPC:       %s: unexpected CM event %X\n",
+			__func__, event->event);
+		complete(&ia->ri_done);
+		break;
+	}
+
+	return 0;
+}
+
+static struct rdma_cm_id *
+rpcrdma_create_id(struct rpcrdma_xprt *xprt,
+			struct rpcrdma_ia *ia, struct sockaddr *addr)
+{
+	struct rdma_cm_id *id;
+	int rc;
+
+	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
+	if (IS_ERR(id)) {
+		rc = PTR_ERR(id);
+		dprintk("RPC:       %s: rdma_create_id() failed %i\n",
+			__func__, rc);
+		return id;
+	}
+
+	ia->ri_async_rc = 0;
+	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+	wait_for_completion(&ia->ri_done);
+	rc = ia->ri_async_rc;
+	if (rc)
+		goto out;
+
+	ia->ri_async_rc = 0;
+	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+	wait_for_completion(&ia->ri_done);
+	rc = ia->ri_async_rc;
+	if (rc)
+		goto out;
+
+	return id;
+
+out:
+	rdma_destroy_id(id);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Drain any cq, prior to teardown.
+ */
+static void
+rpcrdma_clean_cq(struct ib_cq *cq)
+{
+	struct ib_wc wc;
+	int count = 0;
+
+	while (1 == ib_poll_cq(cq, 1, &wc))
+		++count;
+
+	if (count)
+		dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
+			__func__, count, wc.opcode);
+}
+
+/*
+ * Exported functions.
+ */
+
+/*
+ * Open and initialize an Interface Adapter.
+ *  o initializes fields of struct rpcrdma_ia, including
+ *    interface and provider attributes and protection zone.
+ */
+int
+rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+{
+	int rc;
+	struct rpcrdma_ia *ia = &xprt->rx_ia;
+
+	init_completion(&ia->ri_done);
+
+	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
+	if (IS_ERR(ia->ri_id)) {
+		rc = PTR_ERR(ia->ri_id);
+		goto out1;
+	}
+
+	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
+	if (IS_ERR(ia->ri_pd)) {
+		rc = PTR_ERR(ia->ri_pd);
+		dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	/*
+	 * Optionally obtain an underlying physical identity mapping in
+	 * order to do a memory window-based bind. This base registration
+	 * is protected from remote access - that is enabled only by binding
+	 * for the specific bytes targeted during each RPC operation, and
+	 * revoked after the corresponding completion similar to a storage
+	 * adapter.
+	 */
+	if (memreg > RPCRDMA_REGISTER) {
+		int mem_priv = IB_ACCESS_LOCAL_WRITE;
+		switch (memreg) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+		case RPCRDMA_ALLPHYSICAL:
+			mem_priv |= IB_ACCESS_REMOTE_WRITE;
+			mem_priv |= IB_ACCESS_REMOTE_READ;
+			break;
+#endif
+		case RPCRDMA_MEMWINDOWS_ASYNC:
+		case RPCRDMA_MEMWINDOWS:
+			mem_priv |= IB_ACCESS_MW_BIND;
+			break;
+		default:
+			break;
+		}
+		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
+		if (IS_ERR(ia->ri_bind_mem)) {
+			printk(KERN_ALERT "%s: ib_get_dma_mr for "
+				"phys register failed with %lX\n\t"
+				"Will continue with degraded performance\n",
+				__func__, PTR_ERR(ia->ri_bind_mem));
+			memreg = RPCRDMA_REGISTER;
+			ia->ri_bind_mem = NULL;
+		}
+	}
+
+	/* Else will do memory reg/dereg for each chunk */
+	ia->ri_memreg_strategy = memreg;
+
+	return 0;
+out2:
+	rdma_destroy_id(ia->ri_id);
+out1:
+	return rc;
+}
+
+/*
+ * Clean up/close an IA.
+ *   o if event handles and PD have been initialized, free them.
+ *   o close the IA
+ */
+void
+rpcrdma_ia_close(struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	dprintk("RPC:       %s: entering\n", __func__);
+	if (ia->ri_bind_mem != NULL) {
+		rc = ib_dereg_mr(ia->ri_bind_mem);
+		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
+			__func__, rc);
+	}
+	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
+		rdma_destroy_qp(ia->ri_id);
+	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
+		rc = ib_dealloc_pd(ia->ri_pd);
+		dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
+			__func__, rc);
+	}
+	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
+		rdma_destroy_id(ia->ri_id);
+}
+
+/*
+ * Create unconnected endpoint.
+ */
+int
+rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+				struct rpcrdma_create_data_internal *cdata)
+{
+	struct ib_device_attr devattr;
+	int rc;
+
+	rc = ib_query_device(ia->ri_id->device, &devattr);
+	if (rc) {
+		dprintk("RPC:       %s: ib_query_device failed %d\n",
+			__func__, rc);
+		return rc;
+	}
+
+	/* check provider's send/recv wr limits */
+	if (cdata->max_requests > devattr.max_qp_wr)
+		cdata->max_requests = devattr.max_qp_wr;
+
+	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
+	ep->rep_attr.qp_context = ep;
+	/* send_cq and recv_cq initialized below */
+	ep->rep_attr.srq = NULL;
+	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		/* Add room for mw_binds+unbinds - overkill! */
+		ep->rep_attr.cap.max_send_wr++;
+		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
+		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+			return -EINVAL;
+		break;
+	default:
+		break;
+	}
+	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
+	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+	ep->rep_attr.cap.max_recv_sge = 1;
+	ep->rep_attr.cap.max_inline_data = 0;
+	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	ep->rep_attr.qp_type = IB_QPT_RC;
+	ep->rep_attr.port_num = ~0;
+
+	dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
+		"iovs: send %d recv %d\n",
+		__func__,
+		ep->rep_attr.cap.max_send_wr,
+		ep->rep_attr.cap.max_recv_wr,
+		ep->rep_attr.cap.max_send_sge,
+		ep->rep_attr.cap.max_recv_sge);
+
+	/* set trigger for requesting send completion */
+	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
+		break;
+	default:
+		break;
+	}
+	if (ep->rep_cqinit <= 2)
+		ep->rep_cqinit = 0;
+	INIT_CQCOUNT(ep);
+	ep->rep_ia = ia;
+	init_waitqueue_head(&ep->rep_connect_wait);
+
+	/*
+	 * Create a single cq for receive dto and mw_bind (only ever
+	 * care about unbind, really). Send completions are suppressed.
+	 * Use single threaded tasklet upcalls to maintain ordering.
+	 */
+	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
+				  rpcrdma_cq_async_error_upcall, NULL,
+				  ep->rep_attr.cap.max_recv_wr +
+				  ep->rep_attr.cap.max_send_wr + 1, 0);
+	if (IS_ERR(ep->rep_cq)) {
+		rc = PTR_ERR(ep->rep_cq);
+		dprintk("RPC:       %s: ib_create_cq failed: %i\n",
+			__func__, rc);
+		goto out1;
+	}
+
+	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
+	if (rc) {
+		dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	ep->rep_attr.send_cq = ep->rep_cq;
+	ep->rep_attr.recv_cq = ep->rep_cq;
+
+	/* Initialize cma parameters */
+
+	/* RPC/RDMA does not use private data */
+	ep->rep_remote_cma.private_data = NULL;
+	ep->rep_remote_cma.private_data_len = 0;
+
+	/* Client offers RDMA Read but does not initiate */
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_BOUNCEBUFFERS:
+		ep->rep_remote_cma.responder_resources = 0;
+		break;
+	case RPCRDMA_MTHCAFMR:
+	case RPCRDMA_REGISTER:
+		ep->rep_remote_cma.responder_resources = cdata->max_requests *
+				(RPCRDMA_MAX_DATA_SEGS / 8);
+		break;
+	case RPCRDMA_MEMWINDOWS:
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+#if RPCRDMA_PERSISTENT_REGISTRATION
+	case RPCRDMA_ALLPHYSICAL:
+#endif
+		ep->rep_remote_cma.responder_resources = cdata->max_requests *
+				(RPCRDMA_MAX_DATA_SEGS / 2);
+		break;
+	default:
+		break;
+	}
+	if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
+		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
+	ep->rep_remote_cma.initiator_depth = 0;
+
+	ep->rep_remote_cma.retry_count = 7;
+	ep->rep_remote_cma.flow_control = 0;
+	ep->rep_remote_cma.rnr_retry_count = 0;
+
+	return 0;
+
+out2:
+	if (ib_destroy_cq(ep->rep_cq))
+		;
+out1:
+	return rc;
+}
+
+/*
+ * rpcrdma_ep_destroy
+ *
+ * Disconnect and destroy endpoint. After this, the only
+ * valid operations on the ep are to free it (if dynamically
+ * allocated) or re-create it.
+ *
+ * The caller's error handling must be sure to not leak the endpoint
+ * if this function fails.
+ */
+int
+rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	dprintk("RPC:       %s: entering, connected is %d\n",
+		__func__, ep->rep_connected);
+
+	if (ia->ri_id->qp) {
+		rc = rpcrdma_ep_disconnect(ep, ia);
+		if (rc)
+			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
+				" returned %i\n", __func__, rc);
+	}
+
+	ep->rep_func = NULL;
+
+	/* padding - could be done in rpcrdma_buffer_destroy... */
+	if (ep->rep_pad_mr) {
+		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
+		ep->rep_pad_mr = NULL;
+	}
+
+	if (ia->ri_id->qp) {
+		rdma_destroy_qp(ia->ri_id);
+		ia->ri_id->qp = NULL;
+	}
+
+	rpcrdma_clean_cq(ep->rep_cq);
+	rc = ib_destroy_cq(ep->rep_cq);
+	if (rc)
+		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+			__func__, rc);
+
+	return rc;
+}
+
+/*
+ * Connect unconnected endpoint.
+ */
+int
+rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	struct rdma_cm_id *id;
+	int rc = 0;
+	int retry_count = 0;
+	int reconnect = (ep->rep_connected != 0);
+
+	if (reconnect) {
+		struct rpcrdma_xprt *xprt;
+retry:
+		rc = rpcrdma_ep_disconnect(ep, ia);
+		if (rc && rc != -ENOTCONN)
+			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
+				" status %i\n", __func__, rc);
+		rpcrdma_clean_cq(ep->rep_cq);
+
+		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+		id = rpcrdma_create_id(xprt, ia,
+				(struct sockaddr *)&xprt->rx_data.addr);
+		if (IS_ERR(id)) {
+			rc = PTR_ERR(id);
+			goto out;
+		}
+		/* TEMP TEMP TEMP - fail if new device:
+		 * Deregister/remarshal *all* requests!
+		 * Close and recreate adapter, pd, etc!
+		 * Re-determine all attributes still sane!
+		 * More stuff I haven't thought of!
+		 * Rrrgh!
+		 */
+		if (ia->ri_id->device != id->device) {
+			printk("RPC:       %s: can't reconnect on "
+				"different device!\n", __func__);
+			rdma_destroy_id(id);
+			rc = -ENETDOWN;
+			goto out;
+		}
+		/* END TEMP */
+		rdma_destroy_id(ia->ri_id);
+		ia->ri_id = id;
+	}
+
+	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_create_qp failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+
+/* XXX Tavor device performs badly with 2K MTU! */
+if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
+	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
+	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
+	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
+	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
+		struct ib_qp_attr attr = {
+			.path_mtu = IB_MTU_1024
+		};
+		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
+	}
+}
+
+	/* Theoretically a client initiator_depth > 0 is not needed,
+	 * but many peers fail to complete the connection unless they
+	 * == responder_resources! */
+	if (ep->rep_remote_cma.initiator_depth !=
+				ep->rep_remote_cma.responder_resources)
+		ep->rep_remote_cma.initiator_depth =
+			ep->rep_remote_cma.responder_resources;
+
+	ep->rep_connected = 0;
+
+	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_connect() failed with %i\n",
+				__func__, rc);
+		goto out;
+	}
+
+	if (reconnect)
+		return 0;
+
+	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
+
+	/*
+	 * Check state. A non-peer reject indicates no listener
+	 * (ECONNREFUSED), which may be a transient state. All
+	 * others indicate a transport condition which has already
+	 * undergone a best-effort.
+	 */
+	if (ep->rep_connected == -ECONNREFUSED
+	    && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
+		dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
+		goto retry;
+	}
+	if (ep->rep_connected <= 0) {
+		/* Sometimes, the only way to reliably connect to remote
+		 * CMs is to use same nonzero values for ORD and IRD. */
+		ep->rep_remote_cma.initiator_depth =
+					ep->rep_remote_cma.responder_resources;
+		if (ep->rep_remote_cma.initiator_depth == 0)
+			++ep->rep_remote_cma.initiator_depth;
+		if (ep->rep_remote_cma.responder_resources == 0)
+			++ep->rep_remote_cma.responder_resources;
+		if (retry_count++ == 0)
+			goto retry;
+		rc = ep->rep_connected;
+	} else {
+		dprintk("RPC:       %s: connected\n", __func__);
+	}
+
+out:
+	if (rc)
+		ep->rep_connected = rc;
+	return rc;
+}
+
+/*
+ * rpcrdma_ep_disconnect
+ *
+ * This is separate from destroy to facilitate the ability
+ * to reconnect without recreating the endpoint.
+ *
+ * This call is not reentrant, and must not be made in parallel
+ * on the same endpoint.
+ */
+int
+rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	rpcrdma_clean_cq(ep->rep_cq);
+	rc = rdma_disconnect(ia->ri_id);
+	if (!rc) {
+		/* returns without wait if not connected */
+		wait_event_interruptible(ep->rep_connect_wait,
+							ep->rep_connected != 1);
+		dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
+			(ep->rep_connected == 1) ? "still " : "dis");
+	} else {
+		dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
+		ep->rep_connected = rc;
+	}
+	return rc;
+}
+
+/*
+ * Initialize buffer memory
+ */
+int
+rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
+{
+	char *p;
+	size_t len;
+	int i, rc;
+
+	buf->rb_max_requests = cdata->max_requests;
+	spin_lock_init(&buf->rb_lock);
+	atomic_set(&buf->rb_credits, 1);
+
+	/* Need to allocate:
+	 *   1.  arrays for send and recv pointers
+	 *   2.  arrays of struct rpcrdma_req to fill in pointers
+	 *   3.  array of struct rpcrdma_rep for replies
+	 *   4.  padding, if any
+	 *   5.  mw's, if any
+	 * Send/recv buffers in req/rep need to be registered
+	 */
+
+	len = buf->rb_max_requests *
+		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
+	len += cdata->padding;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MTHCAFMR:
+		/* TBD we are perhaps overallocating here */
+		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+				sizeof(struct rpcrdma_mw);
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+				sizeof(struct rpcrdma_mw);
+		break;
+	default:
+		break;
+	}
+
+	/* allocate 1, 4 and 5 in one shot */
+	p = kzalloc(len, GFP_KERNEL);
+	if (p == NULL) {
+		dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
+			__func__, len);
+		rc = -ENOMEM;
+		goto out;
+	}
+	buf->rb_pool = p;	/* for freeing it later */
+
+	buf->rb_send_bufs = (struct rpcrdma_req **) p;
+	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
+	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
+	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
+
+	/*
+	 * Register the zeroed pad buffer, if any.
+	 */
+	if (cdata->padding) {
+		rc = rpcrdma_register_internal(ia, p, cdata->padding,
+					    &ep->rep_pad_mr, &ep->rep_pad);
+		if (rc)
+			goto out;
+	}
+	p += cdata->padding;
+
+	/*
+	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
+	 * We "cycle" the mw's in order to minimize rkey reuse,
+	 * and also reduce unbind-to-bind collision.
+	 */
+	INIT_LIST_HEAD(&buf->rb_mws);
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MTHCAFMR:
+		{
+		struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
+		struct ib_fmr_attr fa = {
+			RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
+		};
+		/* TBD we are perhaps overallocating here */
+		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
+				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
+				&fa);
+			if (IS_ERR(r->r.fmr)) {
+				rc = PTR_ERR(r->r.fmr);
+				dprintk("RPC:       %s: ib_alloc_fmr"
+					" failed %i\n", __func__, rc);
+				goto out;
+			}
+			list_add(&r->mw_list, &buf->rb_mws);
+			++r;
+		}
+		}
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		{
+		struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
+		/* Allocate one extra request's worth, for full cycling */
+		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+			r->r.mw = ib_alloc_mw(ia->ri_pd);
+			if (IS_ERR(r->r.mw)) {
+				rc = PTR_ERR(r->r.mw);
+				dprintk("RPC:       %s: ib_alloc_mw"
+					" failed %i\n", __func__, rc);
+				goto out;
+			}
+			list_add(&r->mw_list, &buf->rb_mws);
+			++r;
+		}
+		}
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Allocate/init the request/reply buffers. Doing this
+	 * using kmalloc for now -- one for each buf.
+	 */
+	for (i = 0; i < buf->rb_max_requests; i++) {
+		struct rpcrdma_req *req;
+		struct rpcrdma_rep *rep;
+
+		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
+		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
+		/* Typical ~2400b, so rounding up saves work later */
+		if (len < 4096)
+			len = 4096;
+		req = kmalloc(len, GFP_KERNEL);
+		if (req == NULL) {
+			dprintk("RPC:       %s: request buffer %d alloc"
+				" failed\n", __func__, i);
+			rc = -ENOMEM;
+			goto out;
+		}
+		memset(req, 0, sizeof(struct rpcrdma_req));
+		buf->rb_send_bufs[i] = req;
+		buf->rb_send_bufs[i]->rl_buffer = buf;
+
+		rc = rpcrdma_register_internal(ia, req->rl_base,
+				len - offsetof(struct rpcrdma_req, rl_base),
+				&buf->rb_send_bufs[i]->rl_handle,
+				&buf->rb_send_bufs[i]->rl_iov);
+		if (rc)
+			goto out;
+
+		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
+
+		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
+		rep = kmalloc(len, GFP_KERNEL);
+		if (rep == NULL) {
+			dprintk("RPC:       %s: reply buffer %d alloc failed\n",
+				__func__, i);
+			rc = -ENOMEM;
+			goto out;
+		}
+		memset(rep, 0, sizeof(struct rpcrdma_rep));
+		buf->rb_recv_bufs[i] = rep;
+		buf->rb_recv_bufs[i]->rr_buffer = buf;
+		init_waitqueue_head(&rep->rr_unbind);
+
+		rc = rpcrdma_register_internal(ia, rep->rr_base,
+				len - offsetof(struct rpcrdma_rep, rr_base),
+				&buf->rb_recv_bufs[i]->rr_handle,
+				&buf->rb_recv_bufs[i]->rr_iov);
+		if (rc)
+			goto out;
+
+	}
+	dprintk("RPC:       %s: max_requests %d\n",
+		__func__, buf->rb_max_requests);
+	/* done */
+	return 0;
+out:
+	rpcrdma_buffer_destroy(buf);
+	return rc;
+}
+
+/*
+ * Unregister and destroy buffer memory. Need to deal with
+ * partial initialization, so it's callable from failed create.
+ * Must be called before destroying endpoint, as registrations
+ * reference it.
+ */
+void
+rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+{
+	int rc, i;
+	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+
+	/* clean up in reverse order from create
+	 *   1.  recv mr memory (mr free, then kfree)
+	 *   1a. bind mw memory
+	 *   2.  send mr memory (mr free, then kfree)
+	 *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
+	 *   4.  arrays
+	 */
+	dprintk("RPC:       %s: entering\n", __func__);
+
+	for (i = 0; i < buf->rb_max_requests; i++) {
+		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
+			rpcrdma_deregister_internal(ia,
+					buf->rb_recv_bufs[i]->rr_handle,
+					&buf->rb_recv_bufs[i]->rr_iov);
+			kfree(buf->rb_recv_bufs[i]);
+		}
+		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
+			while (!list_empty(&buf->rb_mws)) {
+				struct rpcrdma_mw *r;
+				r = list_entry(buf->rb_mws.next,
+					struct rpcrdma_mw, mw_list);
+				list_del(&r->mw_list);
+				switch (ia->ri_memreg_strategy) {
+				case RPCRDMA_MTHCAFMR:
+					rc = ib_dealloc_fmr(r->r.fmr);
+					if (rc)
+						dprintk("RPC:       %s:"
+							" ib_dealloc_fmr"
+							" failed %i\n",
+							__func__, rc);
+					break;
+				case RPCRDMA_MEMWINDOWS_ASYNC:
+				case RPCRDMA_MEMWINDOWS:
+					rc = ib_dealloc_mw(r->r.mw);
+					if (rc)
+						dprintk("RPC:       %s:"
+							" ib_dealloc_mw"
+							" failed %i\n",
+							__func__, rc);
+					break;
+				default:
+					break;
+				}
+			}
+			rpcrdma_deregister_internal(ia,
+					buf->rb_send_bufs[i]->rl_handle,
+					&buf->rb_send_bufs[i]->rl_iov);
+			kfree(buf->rb_send_bufs[i]);
+		}
+	}
+
+	kfree(buf->rb_pool);
+}
+
+/*
+ * Get a set of request/reply buffers.
+ *
+ * Reply buffer (if needed) is attached to send buffer upon return.
+ * Rule:
+ *    rb_send_index and rb_recv_index MUST always be pointing to the
+ *    *next* available buffer (non-NULL). They are incremented after
+ *    removing buffers, and decremented *before* returning them.
+ */
+struct rpcrdma_req *
+rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
+{
+	struct rpcrdma_req *req;
+	unsigned long flags;
+
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	if (buffers->rb_send_index == buffers->rb_max_requests) {
+		spin_unlock_irqrestore(&buffers->rb_lock, flags);
+		dprintk("RPC:       %s: out of request buffers\n", __func__);
+		return ((struct rpcrdma_req *)NULL);
+	}
+
+	req = buffers->rb_send_bufs[buffers->rb_send_index];
+	if (buffers->rb_send_index < buffers->rb_recv_index) {
+		dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
+			__func__,
+			buffers->rb_recv_index - buffers->rb_send_index);
+		req->rl_reply = NULL;
+	} else {
+		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
+		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
+	}
+	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
+	if (!list_empty(&buffers->rb_mws)) {
+		int i = RPCRDMA_MAX_SEGS - 1;
+		do {
+			struct rpcrdma_mw *r;
+			r = list_entry(buffers->rb_mws.next,
+					struct rpcrdma_mw, mw_list);
+			list_del(&r->mw_list);
+			req->rl_segments[i].mr_chunk.rl_mw = r;
+		} while (--i >= 0);
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+	return req;
+}
+
+/*
+ * Put request/reply buffers back into pool.
+ * Pre-decrement counter/array index.
+ */
+void
+rpcrdma_buffer_put(struct rpcrdma_req *req)
+{
+	struct rpcrdma_buffer *buffers = req->rl_buffer;
+	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+	int i;
+	unsigned long flags;
+
+	BUG_ON(req->rl_nchunks != 0);
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
+	req->rl_niovs = 0;
+	if (req->rl_reply) {
+		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
+		init_waitqueue_head(&req->rl_reply->rr_unbind);
+		req->rl_reply->rr_func = NULL;
+		req->rl_reply = NULL;
+	}
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MTHCAFMR:
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		/*
+		 * Cycle mw's back in reverse order, and "spin" them.
+		 * This delays and scrambles reuse as much as possible.
+		 */
+		i = 1;
+		do {
+			struct rpcrdma_mw **mw;
+			mw = &req->rl_segments[i].mr_chunk.rl_mw;
+			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
+			*mw = NULL;
+		} while (++i < RPCRDMA_MAX_SEGS);
+		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
+					&buffers->rb_mws);
+		req->rl_segments[0].mr_chunk.rl_mw = NULL;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Recover reply buffers from pool.
+ * This happens when recovering from error conditions.
+ * Post-increment counter/array index.
+ */
+void
+rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
+{
+	struct rpcrdma_buffer *buffers = req->rl_buffer;
+	unsigned long flags;
+
+	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
+		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	if (buffers->rb_recv_index < buffers->rb_max_requests) {
+		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
+		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Put reply buffers back into pool when not attached to
+ * request. This happens in error conditions, and when
+ * aborting unbinds. Pre-decrement counter/array index.
+ */
+void
+rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_buffer *buffers = rep->rr_buffer;
+	unsigned long flags;
+
+	rep->rr_func = NULL;
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Wrappers for internal-use kmalloc memory registration, used by buffer code.
+ */
+
+int
+rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
+				struct ib_mr **mrp, struct ib_sge *iov)
+{
+	struct ib_phys_buf ipb;
+	struct ib_mr *mr;
+	int rc;
+
+	/*
+	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
+	 */
+	iov->addr = ib_dma_map_single(ia->ri_id->device,
+			va, len, DMA_BIDIRECTIONAL);
+	iov->length = len;
+
+	if (ia->ri_bind_mem != NULL) {
+		*mrp = NULL;
+		iov->lkey = ia->ri_bind_mem->lkey;
+		return 0;
+	}
+
+	ipb.addr = iov->addr;
+	ipb.size = iov->length;
+	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
+			IB_ACCESS_LOCAL_WRITE, &iov->addr);
+
+	dprintk("RPC:       %s: phys convert: 0x%llx "
+			"registered 0x%llx length %d\n",
+			__func__, ipb.addr, iov->addr, len);
+
+	if (IS_ERR(mr)) {
+		*mrp = NULL;
+		rc = PTR_ERR(mr);
+		dprintk("RPC:       %s: failed with %i\n", __func__, rc);
+	} else {
+		*mrp = mr;
+		iov->lkey = mr->lkey;
+		rc = 0;
+	}
+
+	return rc;
+}
+
+int
+rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
+				struct ib_mr *mr, struct ib_sge *iov)
+{
+	int rc;
+
+	ib_dma_unmap_single(ia->ri_id->device,
+			iov->addr, iov->length, DMA_BIDIRECTIONAL);
+
+	if (NULL == mr)
+		return 0;
+
+	rc = ib_dereg_mr(mr);
+	if (rc)
+		dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
+	return rc;
+}
+
+/*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+static void
+rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
+{
+	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	seg->mr_dmalen = seg->mr_len;
+	if (seg->mr_page)
+		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
+				seg->mr_page, offset_in_page(seg->mr_offset),
+				seg->mr_dmalen, seg->mr_dir);
+	else
+		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
+				seg->mr_offset,
+				seg->mr_dmalen, seg->mr_dir);
+}
+
+static void
+rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
+{
+	if (seg->mr_page)
+		ib_dma_unmap_page(ia->ri_id->device,
+				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+	else
+		ib_dma_unmap_single(ia->ri_id->device,
+				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+}
+
+int
+rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+				  IB_ACCESS_REMOTE_READ);
+	struct rpcrdma_mr_seg *seg1 = seg;
+	int i;
+	int rc = 0;
+
+	switch (ia->ri_memreg_strategy) {
+
+#if RPCRDMA_PERSISTENT_REGISTRATION
+	case RPCRDMA_ALLPHYSICAL:
+		rpcrdma_map_one(ia, seg, writing);
+		seg->mr_rkey = ia->ri_bind_mem->rkey;
+		seg->mr_base = seg->mr_dma;
+		seg->mr_nsegs = 1;
+		nsegs = 1;
+		break;
+#endif
+
+	/* Registration using fast memory registration */
+	case RPCRDMA_MTHCAFMR:
+		{
+		u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+		int len, pageoff = offset_in_page(seg->mr_offset);
+		seg1->mr_offset -= pageoff;	/* start of page */
+		seg1->mr_len += pageoff;
+		len = -pageoff;
+		if (nsegs > RPCRDMA_MAX_DATA_SEGS)
+			nsegs = RPCRDMA_MAX_DATA_SEGS;
+		for (i = 0; i < nsegs;) {
+			rpcrdma_map_one(ia, seg, writing);
+			physaddrs[i] = seg->mr_dma;
+			len += seg->mr_len;
+			++seg;
+			++i;
+			/* Check for holes */
+			if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+			    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+				break;
+		}
+		nsegs = i;
+		rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
+					physaddrs, nsegs, seg1->mr_dma);
+		if (rc) {
+			dprintk("RPC:       %s: failed ib_map_phys_fmr "
+				"%u@0x%llx+%i (%d)... status %i\n", __func__,
+				len, (unsigned long long)seg1->mr_dma,
+				pageoff, nsegs, rc);
+			while (nsegs--)
+				rpcrdma_unmap_one(ia, --seg);
+		} else {
+			seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
+			seg1->mr_base = seg1->mr_dma + pageoff;
+			seg1->mr_nsegs = nsegs;
+			seg1->mr_len = len;
+		}
+		}
+		break;
+
+	/* Registration using memory windows */
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		{
+		struct ib_mw_bind param;
+		rpcrdma_map_one(ia, seg, writing);
+		param.mr = ia->ri_bind_mem;
+		param.wr_id = 0ULL;	/* no send cookie */
+		param.addr = seg->mr_dma;
+		param.length = seg->mr_len;
+		param.send_flags = 0;
+		param.mw_access_flags = mem_priv;
+
+		DECR_CQCOUNT(&r_xprt->rx_ep);
+		rc = ib_bind_mw(ia->ri_id->qp,
+					seg->mr_chunk.rl_mw->r.mw, &param);
+		if (rc) {
+			dprintk("RPC:       %s: failed ib_bind_mw "
+				"%u@0x%llx status %i\n",
+				__func__, seg->mr_len,
+				(unsigned long long)seg->mr_dma, rc);
+			rpcrdma_unmap_one(ia, seg);
+		} else {
+			seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
+			seg->mr_base = param.addr;
+			seg->mr_nsegs = 1;
+			nsegs = 1;
+		}
+		}
+		break;
+
+	/* Default registration each time */
+	default:
+		{
+		struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
+		int len = 0;
+		if (nsegs > RPCRDMA_MAX_DATA_SEGS)
+			nsegs = RPCRDMA_MAX_DATA_SEGS;
+		for (i = 0; i < nsegs;) {
+			rpcrdma_map_one(ia, seg, writing);
+			ipb[i].addr = seg->mr_dma;
+			ipb[i].size = seg->mr_len;
+			len += seg->mr_len;
+			++seg;
+			++i;
+			/* Check for holes */
+			if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+			    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+				break;
+		}
+		nsegs = i;
+		seg1->mr_base = seg1->mr_dma;
+		seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
+					ipb, nsegs, mem_priv, &seg1->mr_base);
+		if (IS_ERR(seg1->mr_chunk.rl_mr)) {
+			rc = PTR_ERR(seg1->mr_chunk.rl_mr);
+			dprintk("RPC:       %s: failed ib_reg_phys_mr "
+				"%u@0x%llx (%d)... status %i\n",
+				__func__, len,
+				(unsigned long long)seg1->mr_dma, nsegs, rc);
+			while (nsegs--)
+				rpcrdma_unmap_one(ia, --seg);
+		} else {
+			seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
+			seg1->mr_nsegs = nsegs;
+			seg1->mr_len = len;
+		}
+		}
+		break;
+	}
+	if (rc)
+		return -1;
+
+	return nsegs;
+}
+
+int
+rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+		struct rpcrdma_xprt *r_xprt, void *r)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_mr_seg *seg1 = seg;
+	int nsegs = seg->mr_nsegs, rc;
+
+	switch (ia->ri_memreg_strategy) {
+
+#if RPCRDMA_PERSISTENT_REGISTRATION
+	case RPCRDMA_ALLPHYSICAL:
+		BUG_ON(nsegs != 1);
+		rpcrdma_unmap_one(ia, seg);
+		rc = 0;
+		break;
+#endif
+
+	case RPCRDMA_MTHCAFMR:
+		{
+		LIST_HEAD(l);
+		list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
+		rc = ib_unmap_fmr(&l);
+		while (seg1->mr_nsegs--)
+			rpcrdma_unmap_one(ia, seg++);
+		}
+		if (rc)
+			dprintk("RPC:       %s: failed ib_unmap_fmr,"
+				" status %i\n", __func__, rc);
+		break;
+
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		{
+		struct ib_mw_bind param;
+		BUG_ON(nsegs != 1);
+		param.mr = ia->ri_bind_mem;
+		param.addr = 0ULL;	/* unbind */
+		param.length = 0;
+		param.mw_access_flags = 0;
+		if (r) {
+			param.wr_id = (u64) (unsigned long) r;
+			param.send_flags = IB_SEND_SIGNALED;
+			INIT_CQCOUNT(&r_xprt->rx_ep);
+		} else {
+			param.wr_id = 0ULL;
+			param.send_flags = 0;
+			DECR_CQCOUNT(&r_xprt->rx_ep);
+		}
+		rc = ib_bind_mw(ia->ri_id->qp,
+				seg->mr_chunk.rl_mw->r.mw, &param);
+		rpcrdma_unmap_one(ia, seg);
+		}
+		if (rc)
+			dprintk("RPC:       %s: failed ib_(un)bind_mw,"
+				" status %i\n", __func__, rc);
+		else
+			r = NULL;	/* will upcall on completion */
+		break;
+
+	default:
+		rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
+		seg1->mr_chunk.rl_mr = NULL;
+		while (seg1->mr_nsegs--)
+			rpcrdma_unmap_one(ia, seg++);
+		if (rc)
+			dprintk("RPC:       %s: failed ib_dereg_mr,"
+				" status %i\n", __func__, rc);
+		break;
+	}
+	if (r) {
+		struct rpcrdma_rep *rep = r;
+		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
+		rep->rr_func = NULL;
+		func(rep);	/* dereg done, callback now */
+	}
+	return nsegs;
+}
+
+/*
+ * Prepost any receive buffer, then post send.
+ *
+ * Receive buffer is donated to hardware, reclaimed upon recv completion.
+ */
+int
+rpcrdma_ep_post(struct rpcrdma_ia *ia,
+		struct rpcrdma_ep *ep,
+		struct rpcrdma_req *req)
+{
+	struct ib_send_wr send_wr, *send_wr_fail;
+	struct rpcrdma_rep *rep = req->rl_reply;
+	int rc;
+
+	if (rep) {
+		rc = rpcrdma_ep_post_recv(ia, ep, rep);
+		if (rc)
+			goto out;
+		req->rl_reply = NULL;
+	}
+
+	send_wr.next = NULL;
+	send_wr.wr_id = 0ULL;	/* no send cookie */
+	send_wr.sg_list = req->rl_send_iov;
+	send_wr.num_sge = req->rl_niovs;
+	send_wr.opcode = IB_WR_SEND;
+	send_wr.imm_data = 0;
+	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
+		ib_dma_sync_single_for_device(ia->ri_id->device,
+			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
+			DMA_TO_DEVICE);
+	ib_dma_sync_single_for_device(ia->ri_id->device,
+		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
+		DMA_TO_DEVICE);
+	ib_dma_sync_single_for_device(ia->ri_id->device,
+		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
+		DMA_TO_DEVICE);
+
+	if (DECR_CQCOUNT(ep) > 0)
+		send_wr.send_flags = 0;
+	else { /* Provider must take a send completion every now and then */
+		INIT_CQCOUNT(ep);
+		send_wr.send_flags = IB_SEND_SIGNALED;
+	}
+
+	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+	if (rc)
+		dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
+			rc);
+out:
+	return rc;
+}
+
+/*
+ * (Re)post a receive buffer.
+ */
+int
+rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
+		     struct rpcrdma_ep *ep,
+		     struct rpcrdma_rep *rep)
+{
+	struct ib_recv_wr recv_wr, *recv_wr_fail;
+	int rc;
+
+	recv_wr.next = NULL;
+	recv_wr.wr_id = (u64) (unsigned long) rep;
+	recv_wr.sg_list = &rep->rr_iov;
+	recv_wr.num_sge = 1;
+
+	ib_dma_sync_single_for_cpu(ia->ri_id->device,
+		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
+
+	DECR_CQCOUNT(ep);
+	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+
+	if (rc)
+		dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
+			rc);
+	return rc;
+}
-- 
cgit v1.2.3


From 113632d00acb569420b14eb7575833ac7e2eb311 Mon Sep 17 00:00:00 2001
From: "\\\"Talpey, Thomas\\" <Thomas.Talpey@netapp.com>
Date: Thu, 20 Sep 2007 17:37:58 -0400
Subject: SUNRPC: Add RDMA dependency to SUNRPC_XPRT_RDMA

Add a dependency on RDMA before enabling SUNRPC_XPRT_RDMA
Yes, "INFINIBAND" also turns on iWARP and other RDMA support.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index b9808bba5722..860ea8d1e475 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1730,7 +1730,7 @@ config SUNRPC_GSS
 
 config SUNRPC_XPRT_RDMA
 	tristate "RDMA transport for sunrpc (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
+	depends on SUNRPC && INFINIBAND && EXPERIMENTAL
 	default m
 	help
 	  Adds a client RPC transport for supporting kernel NFS over RDMA
-- 
cgit v1.2.3


From 6e88e0618cb1e354a44cc49a996df4dd89511039 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 24 Sep 2007 15:39:50 -0400
Subject: NFS: Verify server address before invoking in-kernel mount client

Re-order mount option sanity checking slightly to ensure we have a valid
server address *before* trying to do the mountd RPC call.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 11ab7ff6e087..13a3ab30b143 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1100,6 +1100,10 @@ static int nfs_validate_mount_data(void *options,
 		if (mntfh->size < sizeof(mntfh->data))
 			memset(mntfh->data + mntfh->size, 0,
 			       sizeof(mntfh->data) - mntfh->size);
+
+		if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
+			goto out_no_address;
+
 		/*
 		 * Translate to nfs_parsed_mount_data, which nfs_fill_super
 		 * can deal with.
@@ -1131,6 +1135,10 @@ static int nfs_validate_mount_data(void *options,
 		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
+		if (!nfs_verify_server_address((struct sockaddr *)
+						&args->nfs_server.address))
+			goto out_no_address;
+
 		c = strchr(dev_name, ':');
 		if (c == NULL)
 			return -EINVAL;
@@ -1159,10 +1167,6 @@ static int nfs_validate_mount_data(void *options,
 		goto out_v3_not_compiled;
 #endif /* !CONFIG_NFS_V3 */
 
-	if (!nfs_verify_server_address((struct sockaddr *)
-						&args->nfs_server.address))
-		goto out_no_address;
-
 	return 0;
 
 out_no_data:
-- 
cgit v1.2.3


From bcf35617a7c3474ad12892dfbb089a572e5c06d2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 24 Sep 2007 15:39:55 -0400
Subject: NFS: Show "nointr" mount option

The default "intr" setting is different for NFS and NFSv4.  To avoid
confusion on this issue, don't hide the "nointr" option in /proc/mounts.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 13a3ab30b143..fa517ae9207f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -424,7 +424,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		const char *nostr;
 	} nfs_info[] = {
 		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
-		{ NFS_MOUNT_INTR, ",intr", "" },
+		{ NFS_MOUNT_INTR, ",intr", ",nointr" },
 		{ NFS_MOUNT_NOCTO, ",nocto", "" },
 		{ NFS_MOUNT_NOAC, ",noac", "" },
 		{ NFS_MOUNT_NONLM, ",nolock", "" },
-- 
cgit v1.2.3


From 1321d8d971028e796978f6a48d195c09158b3bcd Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 24 Sep 2007 15:40:00 -0400
Subject: SUNRPC: Fix bytes-per-op accounting for RPC over UDP

NFS performance metrics reported zero bytes sent per op when mounting with
UDP.  The UDP socket transport wasn't properly counting the number of bytes
sent.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b81494a97a5e..e97e4ca43ef3 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -587,6 +587,7 @@ static int xs_udp_send_request(struct rpc_task *task)
 	dprintk("RPC:       xs_udp_send_request(%u) = %d\n",
 			xdr->len - req->rq_bytes_sent, status);
 
+	task->tk_bytes_sent += status;
 	if (likely(status >= (int) req->rq_slen))
 		return 0;
 
-- 
cgit v1.2.3


From 92f6c178250170222f6d80c8ae725400765aa7a4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 24 Sep 2007 15:40:06 -0400
Subject: NFS: Don't call nfs_renew_times() in nfs_dentry_iput()

Negative dentries need to be reverified after an asynchronous unlink.

Quoth Trond:

"Unfortunately I don't think that we can avoid revalidating the
resulting negative dentry since the UNLINK call is asynchronous,
and so the new verifier on the directory will only be known a
posteriori."

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6e0aa04451dd..439346abda62 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -872,8 +872,6 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 		nfs_complete_unlink(dentry, inode);
 		unlock_kernel();
 	}
-	/* When creating a negative dentry, we want to renew d_time */
-	nfs_renew_times(dentry);
 	iput(inode);
 }
 
-- 
cgit v1.2.3


From 77a55a1fe8f26f7d022986a599b68002e21d968a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 24 Sep 2007 15:40:11 -0400
Subject: NFS: Eliminate nfs_renew_times()

The nfs_renew_times() function plants the current time in jiffies in
dentry->d_time.  But a call to nfs_renew_times() is always followed by
another call that overwrites dentry->d_time.  Get rid of the
nfs_renew_times() calls.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 439346abda62..78144daeb70f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -671,15 +671,6 @@ static void nfs_refresh_verifier(struct dentry * dentry, unsigned long verf)
 	nfs_set_verifier(dentry, verf);
 }
 
-/*
- * Whenever an NFS operation succeeds, we know that the dentry
- * is valid, so we update the revalidation timestamp.
- */
-static inline void nfs_renew_times(struct dentry * dentry)
-{
-	dentry->d_time = jiffies;
-}
-
 /*
  * Return the intent data that applies to this particular path component
  *
@@ -803,7 +794,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
 		goto out_bad;
 
-	nfs_renew_times(dentry);
 	nfs_refresh_verifier(dentry, verifier);
  out_valid:
 	unlock_kernel();
@@ -966,7 +956,6 @@ no_entry:
 		dput(parent);
 		dentry = res;
 	}
-	nfs_renew_times(dentry);
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out_unlock:
 	unlock_kernel();
@@ -1061,7 +1050,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 		}
 	} else if (res != NULL)
 		dentry = res;
-	nfs_renew_times(dentry);
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out:
 	return res;
@@ -1181,11 +1169,9 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 		dentry = alias;
 	}
 
-	nfs_renew_times(dentry);
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	return dentry;
 out_renew:
-	nfs_renew_times(dentry);
 	nfs_refresh_verifier(dentry, nfs_save_change_attribute(dir));
 	return dentry;
 }
@@ -1252,7 +1238,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	nfs_end_data_update(dir);
 	if (error != 0)
 		goto out_err;
-	nfs_renew_times(dentry);
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 	return 0;
@@ -1286,7 +1271,6 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	nfs_end_data_update(dir);
 	if (status != 0)
 		goto out_err;
-	nfs_renew_times(dentry);
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 	return 0;
@@ -1316,7 +1300,6 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	nfs_end_data_update(dir);
 	if (error != 0)
 		goto out_err;
-	nfs_renew_times(dentry);
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 	return 0;
@@ -1409,7 +1392,6 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 				dir, &qsilly);
 	nfs_end_data_update(dir);
 	if (!error) {
-		nfs_renew_times(dentry);
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 		d_move(dentry, sdentry);
 		error = nfs_async_unlink(dir, dentry);
@@ -1492,7 +1474,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
 	spin_unlock(&dcache_lock);
 	error = nfs_safe_remove(dentry);
 	if (!error) {
-		nfs_renew_times(dentry);
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	} else if (need_rehash)
 		d_rehash(dentry);
@@ -1714,7 +1695,6 @@ out:
 		d_rehash(rehash);
 	if (!error) {
 		d_move(old_dentry, new_dentry);
-		nfs_renew_times(new_dentry);
 		nfs_refresh_verifier(new_dentry, nfs_save_change_attribute(new_dir));
 	}
 
-- 
cgit v1.2.3


From 8fb559f87fee7f71dbf9a595095ad7d8e84c55e7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 24 Sep 2007 15:40:16 -0400
Subject: NFS: Eliminate nfs_refresh_verifier()

nfs_set_verifier() and nfs_refresh_verifier() do exactly the same thing, so
replace one with the other.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 78144daeb70f..dde545925b4e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -666,11 +666,6 @@ static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
 	dentry->d_time = verf;
 }
 
-static void nfs_refresh_verifier(struct dentry * dentry, unsigned long verf)
-{
-	nfs_set_verifier(dentry, verf);
-}
-
 /*
  * Return the intent data that applies to this particular path component
  *
@@ -794,7 +789,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
 		goto out_bad;
 
-	nfs_refresh_verifier(dentry, verifier);
+	nfs_set_verifier(dentry, verifier);
  out_valid:
 	unlock_kernel();
 	dput(parent);
@@ -1093,7 +1088,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	verifier = nfs_save_change_attribute(dir);
 	ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
 	if (!ret)
-		nfs_refresh_verifier(dentry, verifier);
+		nfs_set_verifier(dentry, verifier);
 	unlock_kernel();
 out:
 	dput(parent);
@@ -1169,10 +1164,8 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 		dentry = alias;
 	}
 
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-	return dentry;
 out_renew:
-	nfs_refresh_verifier(dentry, nfs_save_change_attribute(dir));
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	return dentry;
 }
 
@@ -1695,7 +1688,8 @@ out:
 		d_rehash(rehash);
 	if (!error) {
 		d_move(old_dentry, new_dentry);
-		nfs_refresh_verifier(new_dentry, nfs_save_change_attribute(new_dir));
+		nfs_set_verifier(new_dentry,
+					nfs_save_change_attribute(new_dir));
 	}
 
 	/* new dentry created? */
-- 
cgit v1.2.3


From c03025d55540bd648f2546659090140ecc835572 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 10 Aug 2007 17:44:28 -0400
Subject: NFS: Add a helper to extract the nfs_open_context from a struct file

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1d1343fd2ddd..51e4a77030f1 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -331,6 +331,11 @@ extern const struct inode_operations nfs3_file_inode_operations;
 extern const struct file_operations nfs_file_operations;
 extern const struct address_space_operations nfs_file_aops;
 
+static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
+{
+	return filp->private_data;
+}
+
 static inline struct rpc_cred *nfs_file_cred(struct file *file)
 {
 	if (file != NULL) {
-- 
cgit v1.2.3


From cd3758e37ddea66fccca7d93c4b601e8a2e51926 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 10 Aug 2007 17:44:32 -0400
Subject: NFS: Replace file->private_data with calls to nfs_file_open_context()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c    | 2 +-
 fs/nfs/dir.c           | 4 ++--
 fs/nfs/direct.c        | 4 ++--
 fs/nfs/file.c          | 8 ++++----
 fs/nfs/inode.c         | 2 +-
 fs/nfs/nfs4proc.c      | 8 ++++----
 fs/nfs/nfs4state.c     | 2 +-
 fs/nfs/read.c          | 6 ++----
 fs/nfs/write.c         | 4 ++--
 include/linux/nfs_fs.h | 8 ++------
 10 files changed, 21 insertions(+), 27 deletions(-)

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index c55a761c22bb..7a1b6e869f9c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -52,7 +52,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
 	for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
-		if ((struct nfs_open_context *)fl->fl_file->private_data != ctx)
+		if (nfs_file_open_context(fl->fl_file) != ctx)
 			continue;
 		status = nfs4_lock_delegation_recall(state, fl);
 		if (status >= 0)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index dde545925b4e..b332c527d95d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -558,7 +558,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	memset(desc, 0, sizeof(*desc));
 
 	desc->file = filp;
-	desc->dir_cookie = &((struct nfs_open_context *)filp->private_data)->dir_cookie;
+	desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie;
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
 	desc->plus = NFS_USE_READDIRPLUS(inode);
 
@@ -623,7 +623,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
 	}
 	if (offset != filp->f_pos) {
 		filp->f_pos = offset;
-		((struct nfs_open_context *)filp->private_data)->dir_cookie = 0;
+		nfs_file_open_context(filp)->dir_cookie = 0;
 	}
 out:
 	mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index fcf4d384610e..28c8e1b65db0 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -368,7 +368,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 		return -ENOMEM;
 
 	dreq->inode = inode;
-	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
@@ -718,7 +718,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 		sync = FLUSH_STABLE;
 
 	dreq->inode = inode;
-	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5595b32c0915..c664bb921425 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -208,7 +208,7 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
 static int
 nfs_file_flush(struct file *file, fl_owner_t id)
 {
-	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode	*inode = file->f_path.dentry->d_inode;
 	int		status;
 
@@ -296,7 +296,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 static int
 nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode *inode = dentry->d_inode;
 
 	dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
@@ -395,7 +395,7 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
 
 	if (IS_SYNC(inode) || (filp->f_flags & O_SYNC))
 		return 1;
-	ctx = filp->private_data;
+	ctx = nfs_file_open_context(filp);
 	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
 		return 1;
 	return 0;
@@ -438,7 +438,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
 	/* Return error values for O_SYNC and IS_SYNC() */
 	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
-		int err = nfs_do_fsync(iocb->ki_filp->private_data, inode);
+		int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
 		if (err < 0)
 			result = err;
 	}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7c8ca175d87b..45633f9ad851 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -538,7 +538,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
 static void nfs_file_clear_open_context(struct file *filp)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data;
+	struct nfs_open_context *ctx = nfs_file_open_context(filp);
 
 	if (ctx) {
 		filp->private_data = NULL;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d856e9f5913b..2919271a983a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1390,7 +1390,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
 	filp = lookup_instantiate_filp(nd, path->dentry, NULL);
 	if (!IS_ERR(filp)) {
 		struct nfs_open_context *ctx;
-		ctx = (struct nfs_open_context *)filp->private_data;
+		ctx = nfs_file_open_context(filp);
 		ctx->state = state;
 		return 0;
 	}
@@ -3303,7 +3303,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	status = -ENOMEM;
 	if (seqid == NULL)
 		goto out;
-	task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid);
+	task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
 	status = PTR_ERR(task);
 	if (IS_ERR(task))
 		goto out;
@@ -3447,7 +3447,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 	int ret;
 
 	dprintk("%s: begin!\n", __FUNCTION__);
-	data = nfs4_alloc_lockdata(fl, fl->fl_file->private_data,
+	data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
 			fl->fl_u.nfs4_fl.owner);
 	if (data == NULL)
 		return -ENOMEM;
@@ -3573,7 +3573,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 	int status;
 
 	/* verify open state */
-	ctx = (struct nfs_open_context *)filp->private_data;
+	ctx = nfs_file_open_context(filp);
 	state = ctx->state;
 
 	if (request->fl_start < 0 || request->fl_end < 0)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3e4adf8c8312..bfb36261cecb 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -774,7 +774,7 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s
 	for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
-		if (((struct nfs_open_context *)fl->fl_file->private_data)->state != state)
+		if (nfs_file_open_context(fl->fl_file)->state != state)
 			continue;
 		status = ops->recover_lock(state, fl);
 		if (status >= 0)
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 19e05633f4e3..d6e62d7afc72 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -497,8 +497,7 @@ int nfs_readpage(struct file *file, struct page *page)
 		if (ctx == NULL)
 			goto out_unlock;
 	} else
-		ctx = get_nfs_open_context((struct nfs_open_context *)
-				file->private_data);
+		ctx = get_nfs_open_context(nfs_file_open_context(file));
 
 	error = nfs_readpage_async(ctx, inode, page);
 
@@ -576,8 +575,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 		if (desc.ctx == NULL)
 			return -EBADF;
 	} else
-		desc.ctx = get_nfs_open_context((struct nfs_open_context *)
-				filp->private_data);
+		desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index fb396ea5accf..3e9e268b887e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -667,7 +667,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 
 int nfs_flush_incompatible(struct file *file, struct page *page)
 {
-	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct nfs_page	*req;
 	int do_flush, status;
 	/*
@@ -701,7 +701,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 int nfs_updatepage(struct file *file, struct page *page,
 		unsigned int offset, unsigned int count)
 {
-	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode	*inode = page->mapping->host;
 	int		status = 0;
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 51e4a77030f1..5c200fd8c652 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -338,12 +338,8 @@ static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
 
 static inline struct rpc_cred *nfs_file_cred(struct file *file)
 {
-	if (file != NULL) {
-		struct nfs_open_context *ctx;
-
-		ctx = (struct nfs_open_context*)file->private_data;
-		return ctx->cred;
-	}
+	if (file != NULL)
+		return nfs_file_open_context(file)->cred;
 	return NULL;
 }
 
-- 
cgit v1.2.3


From af22f94ae02ab9dd4fd7fe628c8434a59cc293be Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 10 Aug 2007 17:45:10 -0400
Subject: NFSv4: Simplify _nfs4_do_access()

Currently, _nfs4_do_access() is just a copy of nfs_do_access() with added
conversion of the open flags into an access mask. This patch merges the
duplicate functionality.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 22 ++++++++++++++++++++--
 fs/nfs/nfs4proc.c      | 35 ++---------------------------------
 include/linux/nfs_fs.h |  5 ++---
 3 files changed, 24 insertions(+), 38 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b332c527d95d..2b5e611352c5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1815,7 +1815,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
 	return NULL;
 }
 
-int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_access_entry *cache;
@@ -1882,7 +1882,7 @@ found:
 	nfs_access_free_entry(entry);
 }
 
-void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 {
 	struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
 	if (cache == NULL)
@@ -1930,6 +1930,24 @@ out:
 	return -EACCES;
 }
 
+static int nfs_open_permission_mask(int openflags)
+{
+	int mask = 0;
+
+	if (openflags & FMODE_READ)
+		mask |= MAY_READ;
+	if (openflags & FMODE_WRITE)
+		mask |= MAY_WRITE;
+	if (openflags & FMODE_EXEC)
+		mask |= MAY_EXEC;
+	return mask;
+}
+
+int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
+{
+	return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
+}
+
 int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
 	struct rpc_cred *cred;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2919271a983a..5aa0dd1e1bdf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -65,7 +65,6 @@ static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *)
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
 static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
 static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
-static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags);
 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 
@@ -454,7 +453,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 		memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
 		rcu_read_unlock();
 		lock_kernel();
-		ret = _nfs4_do_access(state->inode, state->owner->so_cred, open_mode);
+		ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
 		unlock_kernel();
 		if (ret != 0)
 			goto out;
@@ -948,36 +947,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	return 0;
 }
 
-static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags)
-{
-	struct nfs_access_entry cache;
-	int mask = 0;
-	int status;
-
-	if (openflags & FMODE_READ)
-		mask |= MAY_READ;
-	if (openflags & FMODE_WRITE)
-		mask |= MAY_WRITE;
-	if (openflags & FMODE_EXEC)
-		mask |= MAY_EXEC;
-	status = nfs_access_get_cached(inode, cred, &cache);
-	if (status == 0)
-		goto out;
-
-	/* Be clever: ask server to check for all possible rights */
-	cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
-	cache.cred = cred;
-	cache.jiffies = jiffies;
-	status = _nfs4_proc_access(inode, &cache);
-	if (status != 0)
-		return status;
-	nfs_access_add_cache(inode, &cache);
-out:
-	if ((cache.mask & mask) == mask)
-		return 0;
-	return -EACCES;
-}
-
 static int nfs4_recover_expired_lease(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
@@ -1381,7 +1350,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
 
 	/* If the open_intent is for execute, we have an extra check to make */
 	if (nd->intent.open.flags & FMODE_EXEC) {
-		ret = _nfs4_do_access(state->inode,
+		ret = nfs_may_open(state->inode,
 				state->owner->so_cred,
 				nd->intent.open.flags);
 		if (ret < 0)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 5c200fd8c652..5b42fef0baf0 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -292,9 +292,6 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int nfs_permission(struct inode *, int, struct nameidata *);
-extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *);
-extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
-extern void nfs_access_zap_cache(struct inode *inode);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
@@ -382,6 +379,8 @@ extern const struct file_operations nfs_dir_operations;
 extern struct dentry_operations nfs_dentry_operations;
 
 extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr);
+extern int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags);
+extern void nfs_access_zap_cache(struct inode *inode);
 
 /*
  * linux/fs/nfs/symlink.c
-- 
cgit v1.2.3


From 76b32999dfff6e59252a8af17a5671a4cf3bcf9b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 10 Aug 2007 17:45:11 -0400
Subject: NFSv4: Make NFSv4 ACCESS calls return attributes too...

It doesn't really make sense to cache an access call without also
revalidating the attributes.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 11 +++++++++--
 fs/nfs/nfs4xdr.c        | 27 ++++++++++++++++++++-------
 include/linux/nfs_xdr.h |  3 +++
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5aa0dd1e1bdf..0e366a31f63b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -62,7 +62,6 @@ struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
-static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
 static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
 static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
@@ -1726,10 +1725,16 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
 
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 {
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr fattr;
 	struct nfs4_accessargs args = {
 		.fh = NFS_FH(inode),
+		.bitmask = server->attr_bitmask,
+	};
+	struct nfs4_accessres res = {
+		.server = server,
+		.fattr = &fattr,
 	};
-	struct nfs4_accessres res = { 0 };
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
 		.rpc_argp = &args,
@@ -1755,6 +1760,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 		if (mode & MAY_EXEC)
 			args.access |= NFS4_ACCESS_EXECUTE;
 	}
+	nfs_fattr_init(&fattr);
 	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	if (!status) {
 		entry->mask = 0;
@@ -1764,6 +1770,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 			entry->mask |= MAY_WRITE;
 		if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
 			entry->mask |= MAY_EXEC;
+		nfs_refresh_inode(inode, &fattr);
 	}
 	return status;
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5f353d4686b6..51dd3804866f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -376,10 +376,12 @@ static int nfs4_stat_to_errno(int);
 				decode_locku_maxsz)
 #define NFS4_enc_access_sz	(compound_encode_hdr_maxsz + \
 				encode_putfh_maxsz + \
-				encode_access_maxsz)
+				encode_access_maxsz + \
+				encode_getattr_maxsz)
 #define NFS4_dec_access_sz	(compound_decode_hdr_maxsz + \
 				decode_putfh_maxsz + \
-				decode_access_maxsz)
+				decode_access_maxsz + \
+				decode_getattr_maxsz)
 #define NFS4_enc_getattr_sz	(compound_encode_hdr_maxsz + \
 				encode_putfh_maxsz + \
 				encode_getattr_maxsz)
@@ -1375,14 +1377,20 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 2,
+		.nops = 3,
 	};
 	int status;
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_compound_hdr(&xdr, &hdr);
-	if ((status = encode_putfh(&xdr, args->fh)) == 0)
-		status = encode_access(&xdr, args->access);
+	status = encode_putfh(&xdr, args->fh);
+	if (status != 0)
+		goto out;
+	status = encode_access(&xdr, args->access);
+	if (status != 0)
+		goto out;
+	status = encode_getfattr(&xdr, args->bitmask);
+out:
 	return status;
 }
 
@@ -3784,8 +3792,13 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
 	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
 		goto out;
-	if ((status = decode_putfh(&xdr)) == 0)
-		status = decode_access(&xdr, res);
+	status = decode_putfh(&xdr);
+	if (status != 0)
+		goto out;
+	status = decode_access(&xdr, res);
+	if (status != 0)
+		goto out;
+	decode_getfattr(&xdr, res->fattr, res->server);
 out:
 	return status;
 }
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index cf74a4db84a5..03032017ffaa 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -538,10 +538,13 @@ typedef u64 clientid4;
 
 struct nfs4_accessargs {
 	const struct nfs_fh *		fh;
+	const u32 *			bitmask;
 	u32				access;
 };
 
 struct nfs4_accessres {
+	const struct nfs_server *	server;
+	struct nfs_fattr *		fattr;
 	u32				supported;
 	u32				access;
 };
-- 
cgit v1.2.3


From f2115dc9877d480392e48e3c83bc8cbb4b418fee Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 15 Aug 2007 12:49:17 -0400
Subject: NFS: Fix over-conservative attribute invalidation in
 nfs_update_inode()

We should always be declaring the attribute cache as valid after having
updated it.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 45633f9ad851..721e511f8ba9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -978,8 +978,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	/* Are we racing with known updates of the metadata on the server? */
 	data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
-	if (data_stable)
-		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME);
+	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
+			| NFS_INO_REVAL_PAGECACHE);
 
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
@@ -1060,12 +1060,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
 	}
+	invalid &= ~NFS_INO_INVALID_ATTR;
 	/* Don't invalidate the data if we were to blame */
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
 				|| S_ISLNK(inode->i_mode)))
 		invalid &= ~NFS_INO_INVALID_DATA;
-	if (data_stable)
-		invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE);
 	if (!nfs_have_delegation(inode, FMODE_READ) ||
 			(nfsi->cache_validity & NFS_INO_REVAL_FORCED))
 		nfsi->cache_validity |= invalid;
-- 
cgit v1.2.3


From 68e8a70d3cae23716f6b2b3872eba10eccea148c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 15 Aug 2007 12:59:12 -0400
Subject: NFS: nfs_post_op_update_inode() should call nfs_refresh_inode()

Ensure that we don't clobber the results from a more recent getattr call...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 721e511f8ba9..0d98074d0766 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -915,14 +915,14 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int status = 0;
 
-	spin_lock(&inode->i_lock);
 	if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
+		spin_lock(&inode->i_lock);
 		nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		spin_unlock(&inode->i_lock);
 		goto out;
 	}
-	status = nfs_update_inode(inode, fattr);
+	status = nfs_refresh_inode(inode, fattr);
 out:
-	spin_unlock(&inode->i_lock);
 	return status;
 }
 
-- 
cgit v1.2.3


From 7957c1418f4b6c66e28d4ac3c4d7a8c19d526c48 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Sep 2007 14:20:12 -0400
Subject: NFS: fix nfs_verify_change_attribute

We always want to check that the verifier and directory
cache_change_attribute match. This also allows us to remove the 'wraparound
hack' for the cache_change_attribute. If we're only checking for equality,
then we don't care about wraparound issues.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 4 ----
 include/linux/nfs_fs.h | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0d98074d0766..ed035a81eea2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -972,10 +972,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	nfsi->read_cache_jiffies = fattr->time_start;
 	nfsi->last_updated = now;
 
-	/* Fix a wraparound issue with nfsi->cache_change_attribute */
-	if (time_before(now, nfsi->cache_change_attribute))
-		nfsi->cache_change_attribute = now - 600*HZ;
-
 	/* Are we racing with known updates of the metadata on the server? */
 	data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
 	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 5b42fef0baf0..f45161363be2 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -277,7 +277,7 @@ static inline long nfs_save_change_attribute(struct inode *inode)
 static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long chattr)
 {
 	return !nfs_caches_unstable(inode)
-		&& time_after_eq(chattr, NFS_I(inode)->cache_change_attribute);
+		&& chattr == NFS_I(inode)->cache_change_attribute;
 }
 
 /*
-- 
cgit v1.2.3


From 6ecc5e8fcad7ad64d68c098249359831331bd299 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Sep 2007 14:20:33 -0400
Subject: NFS: Fix dcache revalidation bugs

We don't need to force a dentry lookup just because we're making changes to
the directory.

Don't update nfsi->cache_change_attribute in nfs_end_data_update: that
overrides the NFSv3/v4 weak consistency checking that tells us our update
was the only one, and that tells us the dcache is still valid.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c   | 10 +++-------
 fs/nfs/inode.c |  1 -
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2b5e611352c5..6518b098e625 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -650,15 +650,11 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
  */
 static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 {
-	unsigned long verf;
-
 	if (IS_ROOT(dentry))
 		return 1;
-	verf = dentry->d_time;
-	if (nfs_caches_unstable(dir)
-			|| verf != NFS_I(dir)->cache_change_attribute)
-		return 0;
-	return 1;
+	if (dentry->d_time == NFS_I(dir)->cache_change_attribute)
+		return 1;
+	return 0;
 }
 
 static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ed035a81eea2..cca1ab2922bf 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -785,7 +785,6 @@ void nfs_end_data_update(struct inode *inode)
 		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 		spin_unlock(&inode->i_lock);
 	}
-	nfsi->cache_change_attribute = jiffies;
 	atomic_dec(&nfsi->data_updates);
 }
 
-- 
cgit v1.2.3


From e323ea46d95d7f8c789effd1194dfc120284dbbd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 30 Sep 2007 17:03:25 -0400
Subject: NFS: nfs_wcc_update_inode: directory caches are always invalidated

We must ensure that the readdir data is always invalidated whether or not
the weak cache consistency data update succeeds.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index cca1ab2922bf..f1f6639f52b5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -801,6 +801,8 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		}
 		if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
 			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+			if (S_ISDIR(inode->i_mode))
+				nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 			nfsi->cache_change_attribute = now;
 		}
 		if (inode->i_size == fattr->pre_size && nfsi->npages == 0) {
-- 
cgit v1.2.3


From 17cadc95372e28024be0874e67329c1862912c5d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 27 Sep 2007 10:07:31 -0400
Subject: NFS: Don't force a dcache revalidation if nfs_wcc_update_inode
 succeeds

The reason is that if the weak cache consistency update was successful,
then we know that our client must be the only one that changed the
directory, and we've already updated the dcache to reflect the change.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 11 +++--------
 fs/nfs/nfs4proc.c      |  7 +++++--
 include/linux/nfs_fs.h |  4 +++-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f1f6639f52b5..7e73edc1751f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -791,24 +791,18 @@ void nfs_end_data_update(struct inode *inode)
 static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	unsigned long now = jiffies;
 
 	/* If we have atomic WCC data, we may update some attributes */
 	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
-		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
+		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
 			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-			nfsi->cache_change_attribute = now;
-		}
 		if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
 			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 			if (S_ISDIR(inode->i_mode))
 				nfsi->cache_validity |= NFS_INO_INVALID_DATA;
-			nfsi->cache_change_attribute = now;
 		}
-		if (inode->i_size == fattr->pre_size && nfsi->npages == 0) {
+		if (inode->i_size == fattr->pre_size && nfsi->npages == 0)
 			inode->i_size = fattr->size;
-			nfsi->cache_change_attribute = now;
-		}
 	}
 }
 
@@ -919,6 +913,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
 		spin_lock(&inode->i_lock);
 		nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		nfsi->cache_change_attribute = jiffies;
 		spin_unlock(&inode->i_lock);
 		goto out;
 	}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0e366a31f63b..871c102d9bdf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -208,9 +208,12 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 	struct nfs_inode *nfsi = NFS_I(dir);
 
 	spin_lock(&dir->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
-	if (cinfo->before == nfsi->change_attr && cinfo->atomic)
+	if (cinfo->after != nfsi->change_attr) {
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
+		if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
+			nfsi->cache_change_attribute = jiffies;
 		nfsi->change_attr = cinfo->after;
+	}
 	spin_unlock(&dir->i_lock);
 }
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f45161363be2..fd2c5c8158cf 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -235,8 +235,10 @@ static inline void nfs_mark_for_revalidate(struct inode *inode)
 
 	spin_lock(&inode->i_lock);
 	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS;
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode)) {
 		nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
+		nfsi->cache_change_attribute = jiffies;
+	}
 	spin_unlock(&inode->i_lock);
 }
 
-- 
cgit v1.2.3


From 47aabaa7e45385fee4a535a6f6e523ff944e1684 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 27 Sep 2007 15:57:24 -0400
Subject: NFSv4: Don't use ctime/mtime for determining when to invalidate the
 caches

In NFSv4 we should only be looking at the change attribute.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 47 ++++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7e73edc1751f..cd57f795229e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -976,6 +976,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
 
+	/* More cache consistency checks */
+	if (!(fattr->valid & NFS_ATTR_FATTR_V4)) {
+		/* NFSv2/v3: Check if the mtime agrees */
+		if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
+			dprintk("NFS: mtime change on server for file %s/%ld\n",
+					inode->i_sb->s_id, inode->i_ino);
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+			nfsi->cache_change_attribute = now;
+		}
+		/* If ctime has changed we should definitely clear access+acl caches */
+		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
+			invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			nfsi->cache_change_attribute = now;
+		}
+	} else if (nfsi->change_attr != fattr->change_attr) {
+		dprintk("NFS: change_attr change on server for file %s/%ld\n",
+				inode->i_sb->s_id, inode->i_ino);
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		nfsi->cache_change_attribute = now;
+	}
+
 	/* Check if our cached file size is stale */
  	new_isize = nfs_size_to_loff_t(fattr->size);
 	cur_isize = i_size_read(inode);
@@ -997,22 +1018,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				inode->i_sb->s_id, inode->i_ino);
 	}
 
-	/* Check if the mtime agrees */
-	if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
-		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-		dprintk("NFS: mtime change on server for file %s/%ld\n",
-				inode->i_sb->s_id, inode->i_ino);
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
-		nfsi->cache_change_attribute = now;
-	}
 
-	/* If ctime has changed we should definitely clear access+acl caches */
-	if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
-		invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-		nfsi->cache_change_attribute = now;
-	}
+	memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+	memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
 	memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
+	nfsi->change_attr = fattr->change_attr;
 
 	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
 	    inode->i_uid != fattr->uid ||
@@ -1033,15 +1043,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  		inode->i_blocks = fattr->du.nfs2.blocks;
  	}
 
-	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
-			nfsi->change_attr != fattr->change_attr) {
-		dprintk("NFS: change_attr change on server for file %s/%ld\n",
-				inode->i_sb->s_id, inode->i_ino);
-		nfsi->change_attr = fattr->change_attr;
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-		nfsi->cache_change_attribute = now;
-	}
-
 	/* Update attrtimeo value if we're out of the unstable period */
 	if (invalid & NFS_INO_INVALID_ATTR) {
 		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
-- 
cgit v1.2.3


From 57fa76f2da05d0fee597b26bbc1f05242252beab Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 30 Sep 2007 18:01:13 -0400
Subject: NFS: Don't use readdirplus data if the page cache is invalid

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6518b098e625..9f8ec3c3e6a7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1110,6 +1110,7 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 		.len = entry->len,
 	};
 	struct inode *inode;
+	unsigned long verf = nfs_save_change_attribute(dir);
 
 	switch (name.len) {
 		case 2:
@@ -1120,6 +1121,14 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 			if (name.name[0] == '.')
 				return dget(parent);
 	}
+
+	spin_lock(&dir->i_lock);
+	if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) {
+		spin_unlock(&dir->i_lock);
+		return NULL;
+	}
+	spin_unlock(&dir->i_lock);
+
 	name.hash = full_name_hash(name.name, name.len);
 	dentry = d_lookup(parent, &name);
 	if (dentry != NULL) {
@@ -1161,7 +1170,7 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 	}
 
 out_renew:
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+	nfs_set_verifier(dentry, verf);
 	return dentry;
 }
 
-- 
cgit v1.2.3


From c4812998398d9cbce8646494704c52297359ede0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Sep 2007 17:11:45 -0400
Subject: NFS: Fix atime revalidation in readdir()

NFSv3 will correctly update atime on a readdir call, so there is no need to
set the NFS_INO_INVALID_ATIME flag unless the call to nfs_refresh_inode()
fails.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 6 ------
 fs/nfs/inode.c         | 7 +++++++
 fs/nfs/nfs3proc.c      | 3 +++
 fs/nfs/nfs4proc.c      | 3 +++
 fs/nfs/proc.c          | 2 ++
 include/linux/nfs_fs.h | 1 +
 6 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9f8ec3c3e6a7..07df192e23a0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -200,9 +200,6 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	desc->timestamp = timestamp;
 	desc->timestamp_valid = 1;
 	SetPageUptodate(page);
-	spin_lock(&inode->i_lock);
-	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
-	spin_unlock(&inode->i_lock);
 	/* Ensure consistent page alignment of the data.
 	 * Note: assumes we have exclusive access to this mapping either
 	 *	 through inode->i_mutex or some other mechanism.
@@ -490,9 +487,6 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 						page,
 						NFS_SERVER(inode)->dtsize,
 						desc->plus);
-	spin_lock(&inode->i_lock);
-	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
-	spin_unlock(&inode->i_lock);
 	desc->page = page;
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
 	if (desc->error >= 0) {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index cd57f795229e..e37faa3a7bac 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -156,6 +156,13 @@ static void nfs_zap_acl_cache(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 }
 
+void nfs_invalidate_atime(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+	spin_unlock(&inode->i_lock);
+}
+
 /*
  * Invalidate, but do not unhash, the inode.
  * NB: must be called with inode->i_lock held!
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c7ca5d70870b..0ae263cdedc6 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -607,6 +607,9 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 
 	nfs_fattr_init(&dir_attr);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+
+	nfs_invalidate_atime(dir);
+
 	nfs_refresh_inode(dir, &dir_attr);
 	dprintk("NFS reply readdir: %d\n", status);
 	return status;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 871c102d9bdf..9c27a6ed1a62 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2197,6 +2197,9 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	if (status == 0)
 		memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
+
+	nfs_invalidate_atime(dir);
+
 	dprintk("%s: returns %d\n", __FUNCTION__, status);
 	return status;
 }
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 845cdde1d8b7..cfc4130f2aec 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -476,6 +476,8 @@ nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 	dprintk("NFS call  readdir %d\n", (unsigned int)cookie);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 
+	nfs_invalidate_atime(dir);
+
 	dprintk("NFS reply readdir: %d\n", status);
 	return status;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index fd2c5c8158cf..63850a884902 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -288,6 +288,7 @@ static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long
 extern int nfs_sync_mapping(struct address_space *mapping);
 extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping);
 extern void nfs_zap_caches(struct inode *);
+extern void nfs_invalidate_atime(struct inode *);
 extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
 				struct nfs_fattr *);
 extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
-- 
cgit v1.2.3


From 8850df999cd16aa141098e2e8be04a590276f3cc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Sep 2007 17:20:07 -0400
Subject: NFS: Fix atime revalidation in read()

NFSv3 will correctly update atime on a read() call, so there is no need to
set the NFS_INO_INVALID_ATIME flag unless the call to nfs_refresh_inode()
fails.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3proc.c | 6 +++---
 fs/nfs/nfs4proc.c | 2 ++
 fs/nfs/proc.c     | 1 +
 fs/nfs/read.c     | 3 ---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 0ae263cdedc6..fc6b1193a631 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -727,9 +727,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
-	/* Call back common NFS readpage processing */
-	if (task->tk_status >= 0)
-		nfs_refresh_inode(data->inode, &data->fattr);
+
+	nfs_invalidate_atime(data->inode);
+	nfs_refresh_inode(data->inode, &data->fattr);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9c27a6ed1a62..d311984d2c88 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2396,6 +2396,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 		rpc_restart_call(task);
 		return -EAGAIN;
 	}
+
+	nfs_invalidate_atime(data->inode);
 	if (task->tk_status > 0)
 		renew_lease(server, data->timestamp);
 	return 0;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index cfc4130f2aec..ec3ede890bf4 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -552,6 +552,7 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
+	nfs_invalidate_atime(data->inode);
 	if (task->tk_status >= 0) {
 		nfs_refresh_inode(data->inode, data->res.fattr);
 		/* Emulate the eof flag, which isn't normally needed in NFSv2
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index d6e62d7afc72..4587a86adaac 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -341,9 +341,6 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
 		set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
 		nfs_mark_for_revalidate(data->inode);
 	}
-	spin_lock(&data->inode->i_lock);
-	NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
-	spin_unlock(&data->inode->i_lock);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 7fdc49c4e49ba926348f71844cda7f5e12709738 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Sep 2007 19:11:33 -0400
Subject: NFS: Fix the ESTALE "revalidation" in _nfs_revalidate_inode()

For one thing, the test NFS_ATTRTIMEO() == 0 makes no sense: we're
testing whether or not the cache timeout length is zero, which is totally
unrelated to the issue of whether or not we trust the file staleness.

Secondly, we do not want to retry the GETATTR once a file has been declared
stale by the server: we rather want to discard that inode as soon as
possible, since there are broken servers still in use out there that reuse
filehandles on new files.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e37faa3a7bac..185cfd09d7f7 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -607,16 +607,10 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	status = nfs_wait_on_inode(inode);
 	if (status < 0)
 		goto out;
-	if (NFS_STALE(inode)) {
-		status = -ESTALE;
-		/* Do we trust the cached ESTALE? */
-		if (NFS_ATTRTIMEO(inode) != 0) {
-			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
-				/* no */
-			} else
-				goto out;
-		}
-	}
+
+	status = -ESTALE;
+	if (NFS_STALE(inode))
+		goto out;
 
 	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
 	if (status != 0) {
-- 
cgit v1.2.3


From b64e8a5ef758888cb42b7c105dcfaaf51aab1baf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 30 Sep 2007 15:13:17 -0400
Subject: NFS: Remove bogus check of cache_change_attribute in nfs_update_inode

Remove the bogus 'data_stable' check in nfs_update_inode. The
cache_change_attribute tells you if the directory changed on the server,
and should have nothing to do with the file length.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 185cfd09d7f7..23feb9e3d8b0 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -942,7 +942,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	loff_t cur_isize, new_isize;
 	unsigned int	invalid = 0;
 	unsigned long now = jiffies;
-	int data_stable;
 
 	dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
 			__FUNCTION__, inode->i_sb->s_id, inode->i_ino,
@@ -969,8 +968,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	nfsi->read_cache_jiffies = fattr->time_start;
 	nfsi->last_updated = now;
 
-	/* Are we racing with known updates of the metadata on the server? */
-	data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
 	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
 			| NFS_INO_REVAL_PAGECACHE);
 
@@ -1002,15 +999,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  	new_isize = nfs_size_to_loff_t(fattr->size);
 	cur_isize = i_size_read(inode);
 	if (new_isize != cur_isize) {
-		/* Do we perhaps have any outstanding writes? */
-		if (nfsi->npages == 0) {
-			/* No, but did we race with nfs_end_data_update()? */
-			if (data_stable) {
-				inode->i_size = new_isize;
-				invalid |= NFS_INO_INVALID_DATA;
-			}
-			invalid |= NFS_INO_INVALID_ATTR;
-		} else if (new_isize > cur_isize) {
+		/* Do we perhaps have any outstanding writes, or has
+		 * the file grown beyond our last write? */
+		if (nfsi->npages == 0 || new_isize > cur_isize) {
 			inode->i_size = new_isize;
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 		}
-- 
cgit v1.2.3


From 70ca88521fc7bee8ef0fc22033a439d4b9a2c70d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 30 Sep 2007 15:21:24 -0400
Subject: NFS: Fake up 'wcc' attributes to prevent cache invalidation after
 write

NFSv2 and v4 don't offer weak cache consistency attributes on WRITE calls.
In NFSv3, returning wcc data is optional. In all cases, we want to prevent
the client from invalidating our cached data whenever ->write_done()
attempts to update the inode attributes.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c          | 34 ++++++++++++++++++++++++++++++++++
 fs/nfs/nfs3proc.c       |  2 +-
 fs/nfs/nfs4proc.c       |  2 +-
 fs/nfs/proc.c           |  2 +-
 include/linux/nfs_fs.h  |  1 +
 include/linux/nfs_xdr.h |  3 ++-
 6 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 23feb9e3d8b0..c5f4e0567533 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -793,6 +793,12 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
+	if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 &&
+			nfsi->change_attr == fattr->pre_change_attr) {
+		nfsi->change_attr = fattr->change_attr;
+		if (S_ISDIR(inode->i_mode))
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+	}
 	/* If we have atomic WCC data, we may update some attributes */
 	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
 		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
@@ -923,6 +929,34 @@ out:
 	return status;
 }
 
+/**
+ * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+{
+	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+			(fattr->valid & NFS_ATTR_WCC_V4) == 0) {
+		fattr->pre_change_attr = NFS_I(inode)->change_attr;
+		fattr->valid |= NFS_ATTR_WCC_V4;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
+			(fattr->valid & NFS_ATTR_WCC) == 0) {
+		memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+		memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+		fattr->pre_size = inode->i_size;
+		fattr->valid |= NFS_ATTR_WCC;
+	}
+	return nfs_post_op_update_inode(inode, fattr);
+}
+
 /*
  * Many nfs protocol calls return the new file attributes after
  * an operation.  Here we update the inode to reflect the state
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index fc6b1193a631..ce1fb99e67e1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -750,7 +750,7 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode(data->inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d311984d2c88..796bc8ea7194 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2427,7 +2427,7 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 	}
 	if (task->tk_status >= 0) {
 		renew_lease(NFS_SERVER(inode), data->timestamp);
-		nfs_post_op_update_inode(inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
 	}
 	return 0;
 }
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ec3ede890bf4..97669ed05500 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -579,7 +579,7 @@ static void nfs_proc_read_setup(struct nfs_read_data *data)
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode(data->inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
 	return 0;
 }
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 63850a884902..9449286c5867 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -293,6 +293,7 @@ extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
 				struct nfs_fattr *);
 extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
+extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int nfs_permission(struct inode *, int, struct nameidata *);
 extern int nfs_open(struct inode *, struct file *);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 03032017ffaa..daab252f2e5c 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -62,7 +62,8 @@ struct nfs_fattr {
 #define NFS_ATTR_FATTR		0x0002		/* post-op attributes */
 #define NFS_ATTR_FATTR_V3	0x0004		/* NFSv3 attributes */
 #define NFS_ATTR_FATTR_V4	0x0008		/* NFSv4 change attribute */
-#define NFS_ATTR_FATTR_V4_REFERRAL	0x0010		/* NFSv4 referral */
+#define NFS_ATTR_WCC_V4		0x0010		/* pre-op change attribute */
+#define NFS_ATTR_FATTR_V4_REFERRAL	0x0020		/* NFSv4 referral */
 
 /*
  * Info on the file system
-- 
cgit v1.2.3


From 8edb01828837302055a8f0afddb2256659480bc5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 29 Sep 2007 17:14:03 -0400
Subject: NFS: Fix the sign of the return value of nfs_save_change_attribute()

Also fix up the comments.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 9449286c5867..a765dd456922 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -260,13 +260,13 @@ static inline int NFS_USE_READDIRPLUS(struct inode *inode)
 
 /**
  * nfs_save_change_attribute - Returns the inode attribute change cookie
- * @inode - pointer to inode
+ * @dir - pointer to parent directory inode
  * The "change attribute" is updated every time we finish an operation
  * that will result in a metadata change on the server.
  */
-static inline long nfs_save_change_attribute(struct inode *inode)
+static inline unsigned long nfs_save_change_attribute(struct inode *dir)
 {
-	return NFS_I(inode)->cache_change_attribute;
+	return NFS_I(dir)->cache_change_attribute;
 }
 
 /**
-- 
cgit v1.2.3


From 4b841736bc16b320bcdb1e8ece585b3ced9a8811 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 29 Sep 2007 17:15:01 -0400
Subject: NFS: Fix nfs_verify_change_attribute()

We don't care about whether or not some other process on our client is
changing the directory while we're in nfs_lookup_revalidate(), because the
dcache will take care of ensuring local atomicity.
We can therefore remove the test for nfs_caches_unstable().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           |  2 +-
 include/linux/nfs_fs.h | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 07df192e23a0..e275a6eb0a7c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -646,7 +646,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 {
 	if (IS_ROOT(dentry))
 		return 1;
-	if (dentry->d_time == NFS_I(dir)->cache_change_attribute)
+	if (nfs_verify_change_attribute(dir, dentry->d_time))
 		return 1;
 	return 0;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a765dd456922..0197b1243fe0 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -270,16 +270,16 @@ static inline unsigned long nfs_save_change_attribute(struct inode *dir)
 }
 
 /**
- * nfs_verify_change_attribute - Detects NFS inode cache updates
- * @inode - pointer to inode
+ * nfs_verify_change_attribute - Detects NFS remote directory changes
+ * @dir - pointer to parent directory inode
  * @chattr - previously saved change attribute
- * Return "false" if metadata has been updated (or is in the process of
- * being updated) since the change attribute was saved.
+ * Return "false" if the verifiers doesn't match the change attribute.
+ * This would usually indicate that the directory contents have changed on
+ * the server, and that any dentries need revalidating.
  */
-static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long chattr)
+static inline int nfs_verify_change_attribute(struct inode *dir, unsigned long chattr)
 {
-	return !nfs_caches_unstable(inode)
-		&& chattr == NFS_I(inode)->cache_change_attribute;
+	return chattr == NFS_I(dir)->cache_change_attribute;
 }
 
 /*
-- 
cgit v1.2.3


From fab728e156b3cbfe31f05d6e7cdebe3d5eaff878 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 29 Sep 2007 17:41:33 -0400
Subject: NFS: Ensure nfs_instantiate() invalidates the parent dir on error

Also ensure that it drops the dentry in this case.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e275a6eb0a7c..82395c511710 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1174,32 +1174,39 @@ out_renew:
 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 				struct nfs_fattr *fattr)
 {
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *dir = parent->d_inode;
 	struct inode *inode;
 	int error = -EACCES;
 
+	d_drop(dentry);
+
 	/* We may have been initialized further down */
 	if (dentry->d_inode)
-		return 0;
+		goto out;
 	if (fhandle->size == 0) {
-		struct inode *dir = dentry->d_parent->d_inode;
 		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 		if (error)
-			return error;
+			goto out_error;
 	}
 	if (!(fattr->valid & NFS_ATTR_FATTR)) {
 		struct nfs_server *server = NFS_SB(dentry->d_sb);
 		error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
 		if (error < 0)
-			return error;
+			goto out_error;
 	}
 	inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
 	error = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		return error;
-	d_instantiate(dentry, inode);
-	if (d_unhashed(dentry))
-		d_rehash(dentry);
+		goto out_error;
+	d_add(dentry, inode);
+out:
+	dput(parent);
 	return 0;
+out_error:
+	nfs_mark_for_revalidate(dir);
+	dput(parent);
+	return error;
 }
 
 /*
-- 
cgit v1.2.3


From 5724ab37872042176916441930e78fd353be1e5e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 21:51:38 -0400
Subject: NFS: nfs_instantiate() should set the dentry verifier

That will also allow us to remove the calls in mknod and mkdir.
In addition it will ensure that symlinks set it correctly.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 82395c511710..3f0def65b8ab 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1189,6 +1189,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 		if (error)
 			goto out_error;
 	}
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	if (!(fattr->valid & NFS_ATTR_FATTR)) {
 		struct nfs_server *server = NFS_SB(dentry->d_sb);
 		error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
@@ -1237,7 +1238,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	nfs_end_data_update(dir);
 	if (error != 0)
 		goto out_err;
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 	return 0;
 out_err:
@@ -1270,7 +1270,6 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	nfs_end_data_update(dir);
 	if (status != 0)
 		goto out_err;
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 	return 0;
 out_err:
@@ -1299,7 +1298,6 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	nfs_end_data_update(dir);
 	if (error != 0)
 		goto out_err;
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 	return 0;
 out_err:
-- 
cgit v1.2.3


From d4d9cdcb470784df76304f75d0ce88f20f15fa6a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 2 Oct 2007 18:38:53 -0400
Subject: NFS: Don't hash the negative dentry when optimising for an O_EXCL
 open

We don't want to leave an unverified hashed negative dentry if the
exclusive create fails to complete.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c      | 5 +++--
 fs/nfs/nfs4proc.c | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3f0def65b8ab..166a833be661 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -992,9 +992,10 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 	}
 	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
 
-	/* Let vfs_create() deal with O_EXCL */
+	/* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
+	 * the dentry. */
 	if (nd->intent.open.flags & O_EXCL) {
-		d_add(dentry, NULL);
+		d_instantiate(dentry, NULL);
 		goto out;
 	}
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 796bc8ea7194..0748c700301d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1879,11 +1879,12 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	}
 	state = nfs4_do_open(dir, &path, flags, sattr, cred);
 	put_rpccred(cred);
+	d_drop(dentry);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
 		goto out;
 	}
-	d_instantiate(dentry, igrab(state->inode));
+	d_add(dentry, igrab(state->inode));
 	if (flags & O_EXCL) {
 		struct nfs_fattr fattr;
 		status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
-- 
cgit v1.2.3


From 446e534985bada0ad7451c08cf213c06695f9b67 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 3 Oct 2007 15:58:38 -0400
Subject: NFS: Fix a bug in nfs_open_revalidate()

We want to set the verifier when the call to nfs4_open_revalidate()
_succeeds_.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 166a833be661..d58bfb81354e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1078,7 +1078,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	lock_kernel();
 	verifier = nfs_save_change_attribute(dir);
 	ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
-	if (!ret)
+	if (ret == 1)
 		nfs_set_verifier(dentry, verifier);
 	unlock_kernel();
 out:
-- 
cgit v1.2.3


From 2f78e4313afd34a4ded591ec5687843113fbaa01 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 30 Sep 2007 15:31:19 -0400
Subject: NFS: Don't set cache_change_attribute in nfs_revalidate_mapping

The attribute revalidation code will already have taken care of resetting
nfsi->cache_change_attribute.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c5f4e0567533..8c53b7eff7ba 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -686,11 +686,8 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
 	}
 	spin_lock(&inode->i_lock);
 	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
-	if (S_ISDIR(inode->i_mode)) {
+	if (S_ISDIR(inode->i_mode))
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-		/* This ensures we revalidate child dentries */
-		nfsi->cache_change_attribute = jiffies;
-	}
 	spin_unlock(&inode->i_lock);
 	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
 	dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
-- 
cgit v1.2.3


From 12b373ebf05485d4937dd63a00c16f8efeaa79ba Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 09:56:59 -0400
Subject: NFS: Don't revalidate dentries on directory size or ctime changes

We only need to look at the mtime changes...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 8c53b7eff7ba..7fdaaf5869ce 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1015,10 +1015,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			nfsi->cache_change_attribute = now;
 		}
 		/* If ctime has changed we should definitely clear access+acl caches */
-		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
+		if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
 			invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-			nfsi->cache_change_attribute = now;
-		}
 	} else if (nfsi->change_attr != fattr->change_attr) {
 		dprintk("NFS: change_attr change on server for file %s/%ld\n",
 				inode->i_sb->s_id, inode->i_ino);
@@ -1036,7 +1034,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			inode->i_size = new_isize;
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 		}
-		nfsi->cache_change_attribute = now;
 		dprintk("NFS: isize change on server for file %s/%ld\n",
 				inode->i_sb->s_id, inode->i_ino);
 	}
-- 
cgit v1.2.3


From 7668fdbe9aaeab705d1169ac86d0d18a12906d06 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 09:59:15 -0400
Subject: NFS: nfs_post_op_update_inode don't update cache_change_attribute

If nfs_post_op_update_inode fails because the server didn't return any
attributes, then we let the subsequent inode revalidation update
cache_change_attribute.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7fdaaf5869ce..d722a0e84361 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -912,18 +912,14 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int status = 0;
 
-	if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
-		spin_lock(&inode->i_lock);
-		nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
-		nfsi->cache_change_attribute = jiffies;
-		spin_unlock(&inode->i_lock);
-		goto out;
-	}
-	status = nfs_refresh_inode(inode, fattr);
-out:
-	return status;
+	if (fattr->valid & NFS_ATTR_FATTR)
+		return nfs_refresh_inode(inode, fattr);
+
+	spin_lock(&inode->i_lock);
+	nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	spin_unlock(&inode->i_lock);
+	return 0;
 }
 
 /**
-- 
cgit v1.2.3


From f38211100d4823be530577dc3452f838861222ec Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 10:00:23 -0400
Subject: NFS: nfs_mark_for_revalidate don't update cache_change_attribute

Just let the subsequent inode revalidation do the update...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0197b1243fe0..35d61924e0b9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -235,10 +235,8 @@ static inline void nfs_mark_for_revalidate(struct inode *inode)
 
 	spin_lock(&inode->i_lock);
 	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS;
-	if (S_ISDIR(inode->i_mode)) {
+	if (S_ISDIR(inode->i_mode))
 		nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
-		nfsi->cache_change_attribute = jiffies;
-	}
 	spin_unlock(&inode->i_lock);
 }
 
-- 
cgit v1.2.3


From cf8ba45e0554f1c8838fcfe43a93114f177af839 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 13:46:53 -0400
Subject: NFS: don't cache the verifer across ->lookup() calls

If the ->lookup() call causes the directory verifier to change, then there
is still no need to use the old verifier, since our dentry has been
verified.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d58bfb81354e..7cd2697f2d32 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -735,7 +735,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	int error;
 	struct nfs_fh fhandle;
 	struct nfs_fattr fattr;
-	unsigned long verifier;
 
 	parent = dget_parent(dentry);
 	lock_kernel();
@@ -770,7 +769,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	if (NFS_STALE(inode))
 		goto out_bad;
 
-	verifier = nfs_save_change_attribute(dir);
 	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
 	if (error)
 		goto out_bad;
@@ -779,7 +777,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
 		goto out_bad;
 
-	nfs_set_verifier(dentry, verifier);
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
 	unlock_kernel();
 	dput(parent);
@@ -1048,7 +1046,6 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct dentry *parent = NULL;
 	struct inode *inode = dentry->d_inode;
 	struct inode *dir;
-	unsigned long verifier;
 	int openflags, ret = 0;
 
 	parent = dget_parent(dentry);
@@ -1076,10 +1073,9 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	 * change attribute *before* we do the RPC call.
 	 */
 	lock_kernel();
-	verifier = nfs_save_change_attribute(dir);
 	ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
 	if (ret == 1)
-		nfs_set_verifier(dentry, verifier);
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 out:
 	dput(parent);
-- 
cgit v1.2.3


From 3258b4fa552c4f994b5e6490a8ad88f5d7e0e648 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 13:54:51 -0400
Subject: NFS: Remove bogus nfs_mark_for_revalidate() in nfs_lookup

The parent of the newly materialised dentry has just been revalidated...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7cd2697f2d32..35b447d79dbe 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -929,14 +929,8 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 no_entry:
 	res = d_materialise_unique(dentry, inode);
 	if (res != NULL) {
-		struct dentry *parent;
 		if (IS_ERR(res))
 			goto out_unlock;
-		/* Was a directory renamed! */
-		parent = dget_parent(res);
-		if (!IS_ROOT(parent))
-			nfs_mark_for_revalidate(parent->d_inode);
-		dput(parent);
 		dentry = res;
 	}
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-- 
cgit v1.2.3


From a1643a92f6de92074116922a2d2906dd33499ff4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 29 Sep 2007 17:25:43 -0400
Subject: NFS: NFS_CACHEINV() should not test for nfs_caches_unstable()

The fact that we're in the process of modifying the inode does not mean
that we should not invalidate the attribute and data caches. The defensive
thing is to always invalidate when we're confronted with inode
mtime/ctime or change_attribute updates that we do not immediately
recognise.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 2 +-
 include/linux/nfs_fs.h | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 35b447d79dbe..a03ed2f85047 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -788,7 +788,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 out_zap_parent:
 	nfs_zap_caches(dir);
  out_bad:
-	NFS_CACHEINV(dir);
+	nfs_mark_for_revalidate(dir);
 	if (inode && S_ISDIR(inode->i_mode)) {
 		/* Purge readdir caches. */
 		nfs_zap_caches(inode);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 35d61924e0b9..c947803a7ba4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -240,12 +240,6 @@ static inline void nfs_mark_for_revalidate(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 }
 
-static inline void NFS_CACHEINV(struct inode *inode)
-{
-	if (!nfs_caches_unstable(inode))
-		nfs_mark_for_revalidate(inode);
-}
-
 static inline int nfs_server_capable(struct inode *inode, int cap)
 {
 	return NFS_SERVER(inode)->caps & cap;
-- 
cgit v1.2.3


From 80eb209def76d375677840800eb838abce1e6639 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 29 Sep 2007 17:34:46 -0400
Subject: NFS: Remove NFS_I(inode)->data_updates

We have no more users...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 21 +--------------------
 include/linux/nfs_fs.h | 14 ++++++--------
 2 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d722a0e84361..1d507a2a96d6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -85,7 +85,6 @@ void nfs_clear_inode(struct inode *inode)
 	 */
 	BUG_ON(nfs_have_writebacks(inode));
 	BUG_ON(!list_empty(&NFS_I(inode)->open_files));
-	BUG_ON(atomic_read(&NFS_I(inode)->data_updates) != 0);
 	nfs_zap_acl_cache(inode);
 	nfs_access_zap_cache(inode);
 }
@@ -756,16 +755,6 @@ out:
 	return ret;
 }
 
-/**
- * nfs_begin_data_update
- * @inode - pointer to inode
- * Declare that a set of operations will update file data on the server
- */
-void nfs_begin_data_update(struct inode *inode)
-{
-	atomic_inc(&NFS_I(inode)->data_updates);
-}
-
 /**
  * nfs_end_data_update
  * @inode - pointer to inode
@@ -775,15 +764,12 @@ void nfs_begin_data_update(struct inode *inode)
  */
 void nfs_end_data_update(struct inode *inode)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
-
 	/* Directories: invalidate page cache */
 	if (S_ISDIR(inode->i_mode)) {
 		spin_lock(&inode->i_lock);
-		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
 		spin_unlock(&inode->i_lock);
 	}
-	atomic_dec(&nfsi->data_updates);
 }
 
 static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
@@ -823,7 +809,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_size, new_isize;
-	int data_unstable;
 
 
 	/* Has the inode gone and changed behind our back? */
@@ -832,9 +817,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 		return -EIO;
 	}
 
-	/* Are we in the process of updating data on the server? */
-	data_unstable = nfs_caches_unstable(inode);
-
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
 
@@ -1162,7 +1144,6 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
-	atomic_set(&nfsi->data_updates, 0);
 	nfsi->ncommit = 0;
 	nfsi->npages = 0;
 	nfs4_init_once(nfsi);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c947803a7ba4..1b4edc6f7fd7 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -136,11 +136,6 @@ struct nfs_inode {
 	 * server.
 	 */
 	unsigned long		cache_change_attribute;
-	/*
-	 * Counter indicating the number of outstanding requests that
-	 * will cause a file data update.
-	 */
-	atomic_t		data_updates;
 
 	struct rb_root		access_cache;
 	struct list_head	access_cache_entry_lru;
@@ -224,9 +219,13 @@ static inline struct nfs_inode *NFS_I(struct inode *inode)
 
 #define NFS_FILEID(inode)		(NFS_I(inode)->fileid)
 
-static inline int nfs_caches_unstable(struct inode *inode)
+/**
+ * nfs_begin_data_update
+ * @inode - pointer to inode
+ * Declare that a set of operations will update file data on the server
+ */
+static inline void nfs_begin_data_update(struct inode *inode)
 {
-	return atomic_read(&NFS_I(inode)->data_updates) != 0;
 }
 
 static inline void nfs_mark_for_revalidate(struct inode *inode)
@@ -299,7 +298,6 @@ extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern void nfs_begin_attr_update(struct inode *);
 extern void nfs_end_attr_update(struct inode *);
-extern void nfs_begin_data_update(struct inode *);
 extern void nfs_end_data_update(struct inode *);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
-- 
cgit v1.2.3


From 60ccd4ec4170c9487e3792322626acd160197bce Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 29 Sep 2007 17:48:19 -0400
Subject: NFS: Remove nfs_begin_data_update/nfs_end_data_update

The lower level routines in fs/nfs/proc.c, fs/nfs/nfs3proc.c and
fs/nfs/nfs4proc.c should already be dealing with the revalidation issues.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 35 +----------------------------------
 fs/nfs/direct.c        |  4 ----
 fs/nfs/inode.c         | 19 -------------------
 fs/nfs/nfs3acl.c       |  2 --
 fs/nfs/unlink.c        |  3 ---
 fs/nfs/write.c         |  2 --
 include/linux/nfs_fs.h | 12 ------------
 7 files changed, 1 insertion(+), 76 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a03ed2f85047..34da48586829 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1001,12 +1001,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 		goto out;
 	}
 
-	if (nd->intent.open.flags & O_CREAT) {
-		nfs_begin_data_update(dir);
-		res = nfs4_atomic_open(dir, dentry, nd);
-		nfs_end_data_update(dir);
-	} else
-		res = nfs4_atomic_open(dir, dentry, nd);
+	res = nfs4_atomic_open(dir, dentry, nd);
 	unlock_kernel();
 	if (IS_ERR(res)) {
 		error = PTR_ERR(res);
@@ -1224,9 +1219,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		open_flags = nd->intent.open.flags;
 
 	lock_kernel();
-	nfs_begin_data_update(dir);
 	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
-	nfs_end_data_update(dir);
 	if (error != 0)
 		goto out_err;
 	unlock_kernel();
@@ -1256,9 +1249,7 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	attr.ia_valid = ATTR_MODE;
 
 	lock_kernel();
-	nfs_begin_data_update(dir);
 	status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
-	nfs_end_data_update(dir);
 	if (status != 0)
 		goto out_err;
 	unlock_kernel();
@@ -1284,9 +1275,7 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	attr.ia_mode = mode | S_IFDIR;
 
 	lock_kernel();
-	nfs_begin_data_update(dir);
 	error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
-	nfs_end_data_update(dir);
 	if (error != 0)
 		goto out_err;
 	unlock_kernel();
@@ -1305,12 +1294,10 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
 
 	lock_kernel();
-	nfs_begin_data_update(dir);
 	error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
 	/* Ensure the VFS deletes this inode */
 	if (error == 0 && dentry->d_inode != NULL)
 		clear_nlink(dentry->d_inode);
-	nfs_end_data_update(dir);
 	unlock_kernel();
 
 	return error;
@@ -1368,17 +1355,13 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 
 	qsilly.name = silly;
 	qsilly.len  = strlen(silly);
-	nfs_begin_data_update(dir);
 	if (dentry->d_inode) {
-		nfs_begin_data_update(dentry->d_inode);
 		error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
 				dir, &qsilly);
 		nfs_mark_for_revalidate(dentry->d_inode);
-		nfs_end_data_update(dentry->d_inode);
 	} else
 		error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
 				dir, &qsilly);
-	nfs_end_data_update(dir);
 	if (!error) {
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 		d_move(dentry, sdentry);
@@ -1412,19 +1395,15 @@ static int nfs_safe_remove(struct dentry *dentry)
 		goto out;
 	}
 
-	nfs_begin_data_update(dir);
 	if (inode != NULL) {
 		nfs_inode_return_delegation(inode);
-		nfs_begin_data_update(inode);
 		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
 		/* The VFS may want to delete this inode */
 		if (error == 0)
 			drop_nlink(inode);
 		nfs_mark_for_revalidate(inode);
-		nfs_end_data_update(inode);
 	} else
 		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
-	nfs_end_data_update(dir);
 out:
 	return error;
 }
@@ -1516,9 +1495,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
 		memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	nfs_begin_data_update(dir);
 	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
-	nfs_end_data_update(dir);
 	if (error != 0) {
 		dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
 			dir->i_sb->s_id, dir->i_ino,
@@ -1558,15 +1535,11 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
 	lock_kernel();
-	nfs_begin_data_update(dir);
-	nfs_begin_data_update(inode);
 	error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
 	if (error == 0) {
 		atomic_inc(&inode->i_count);
 		d_instantiate(dentry, inode);
 	}
-	nfs_end_data_update(inode);
-	nfs_end_data_update(dir);
 	unlock_kernel();
 	return error;
 }
@@ -1669,15 +1642,9 @@ go_ahead:
 		d_delete(new_dentry);
 	}
 
-	nfs_begin_data_update(old_dir);
-	nfs_begin_data_update(new_dir);
-	nfs_begin_data_update(old_inode);
 	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
 					   new_dir, &new_dentry->d_name);
 	nfs_mark_for_revalidate(old_inode);
-	nfs_end_data_update(old_inode);
-	nfs_end_data_update(new_dir);
-	nfs_end_data_update(old_dir);
 out:
 	if (rehash)
 		d_rehash(rehash);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 28c8e1b65db0..32fe97211eea 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -510,7 +510,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 			nfs_direct_write_reschedule(dreq);
 			break;
 		default:
-			nfs_end_data_update(inode);
 			if (dreq->commit_data != NULL)
 				nfs_commit_free(dreq->commit_data);
 			nfs_direct_free_writedata(dreq);
@@ -533,7 +532,6 @@ static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-	nfs_end_data_update(inode);
 	nfs_direct_free_writedata(dreq);
 	nfs_zap_mapping(inode, inode->i_mapping);
 	nfs_direct_complete(dreq);
@@ -724,8 +722,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 
 	nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);
 
-	nfs_begin_data_update(inode);
-
 	rpc_clnt_sigmask(clnt, &oldset);
 	result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
 	if (!result)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1d507a2a96d6..1c23d3a67c85 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -344,7 +344,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		return 0;
 
 	lock_kernel();
-	nfs_begin_data_update(inode);
 	/* Write all dirty data */
 	if (S_ISREG(inode->i_mode)) {
 		filemap_write_and_wait(inode->i_mapping);
@@ -358,7 +357,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 	error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
 	if (error == 0)
 		nfs_refresh_inode(inode, &fattr);
-	nfs_end_data_update(inode);
 	unlock_kernel();
 	return error;
 }
@@ -755,23 +753,6 @@ out:
 	return ret;
 }
 
-/**
- * nfs_end_data_update
- * @inode - pointer to inode
- * Declare end of the operations that will update file data
- * This will mark the inode as immediately needing revalidation
- * of its attribute cache.
- */
-void nfs_end_data_update(struct inode *inode)
-{
-	/* Directories: invalidate page cache */
-	if (S_ISDIR(inode->i_mode)) {
-		spin_lock(&inode->i_lock);
-		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
-		spin_unlock(&inode->i_lock);
-	}
-}
-
 static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 7322da4d2055..9b7362565c0c 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -317,13 +317,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	}
 
 	dprintk("NFS call setacl\n");
-	nfs_begin_data_update(inode);
 	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
 	status = rpc_call_sync(server->client_acl, &msg, 0);
 	spin_lock(&inode->i_lock);
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
 	spin_unlock(&inode->i_lock);
-	nfs_end_data_update(inode);
 	dprintk("NFS reply setacl: %d\n", status);
 
 	/* pages may have been allocated at the xdr layer. */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 045ab805c17f..1aed850d18f2 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -66,7 +66,6 @@ static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
 		.rpc_cred = data->cred,
 	};
 
-	nfs_begin_data_update(dir);
 	NFS_PROTO(dir)->unlink_setup(&msg, dir);
 	rpc_call_setup(task, &msg, 0);
 }
@@ -84,8 +83,6 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 
 	if (!NFS_PROTO(dir)->unlink_done(task, dir))
 		rpc_restart_call(task);
-	else
-		nfs_end_data_update(dir);
 }
 
 /**
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3e9e268b887e..e2bb66c34406 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -378,7 +378,6 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 		return error;
 	if (!nfsi->npages) {
 		igrab(inode);
-		nfs_begin_data_update(inode);
 		if (nfs_have_delegation(inode, FMODE_WRITE))
 			nfsi->change_attr++;
 	}
@@ -406,7 +405,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	nfsi->npages--;
 	if (!nfsi->npages) {
 		spin_unlock(&inode->i_lock);
-		nfs_end_data_update(inode);
 		iput(inode);
 	} else
 		spin_unlock(&inode->i_lock);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1b4edc6f7fd7..8f632bd854af 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -219,15 +219,6 @@ static inline struct nfs_inode *NFS_I(struct inode *inode)
 
 #define NFS_FILEID(inode)		(NFS_I(inode)->fileid)
 
-/**
- * nfs_begin_data_update
- * @inode - pointer to inode
- * Declare that a set of operations will update file data on the server
- */
-static inline void nfs_begin_data_update(struct inode *inode)
-{
-}
-
 static inline void nfs_mark_for_revalidate(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -296,9 +287,6 @@ extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *map
 extern int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
-extern void nfs_begin_attr_update(struct inode *);
-extern void nfs_end_attr_update(struct inode *);
-extern void nfs_end_data_update(struct inode *);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
-- 
cgit v1.2.3


From 6d2b2966869142660f46d1e06cf9d15c3debcf77 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 18:57:50 -0400
Subject: NFS: Reset nfsi->last_updated only if the attribute changed

Otherwise set it to nfsi->read_cache_jiffies in order to prevent jiffy
wraparound issues.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1c23d3a67c85..9d012a6ee4cc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -956,7 +956,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	 * Update the read time so we don't revalidate too often.
 	 */
 	nfsi->read_cache_jiffies = fattr->time_start;
-	nfsi->last_updated = now;
 
 	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
 			| NFS_INO_REVAL_PAGECACHE);
@@ -1027,10 +1026,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
-	} else if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
-		if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
-			nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
-		nfsi->attrtimeo_timestamp = now;
+		nfsi->last_updated = now;
+	} else {
+		if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+			if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
+				nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+			nfsi->attrtimeo_timestamp = now;
+		}
+		/*
+		 * Avoid jiffy wraparound issues with nfsi->last_updated
+		 */
+		if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now))
+			nfsi->last_updated = nfsi->read_cache_jiffies;
 	}
 	invalid &= ~NFS_INO_INVALID_ATTR;
 	/* Don't invalidate the data if we were to blame */
-- 
cgit v1.2.3


From f2c77f4e62a2290ae46b5b0449eb72d72afe691e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 2 Oct 2007 12:54:39 -0400
Subject: NFS: Optimise nfs_lookup_revalidate()

We don't need to call nfs_revalidate_inode() on the directory if we already
know that the verifiers don't match.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 34da48586829..cf80cf234e27 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -646,9 +646,14 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 {
 	if (IS_ROOT(dentry))
 		return 1;
-	if (nfs_verify_change_attribute(dir, dentry->d_time))
-		return 1;
-	return 0;
+	if (!nfs_verify_change_attribute(dir, dentry->d_time))
+		return 0;
+	/* Revalidate nfsi->cache_change_attribute before we declare a match */
+	if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+		return 0;
+	if (!nfs_verify_change_attribute(dir, dentry->d_time))
+		return 0;
+	return 1;
 }
 
 static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
@@ -742,10 +747,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
 	inode = dentry->d_inode;
 
-	/* Revalidate parent directory attribute cache */
-	if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
-		goto out_zap_parent;
-
 	if (!inode) {
 		if (nfs_neg_need_reval(dir, dentry, nd))
 			goto out_bad;
-- 
cgit v1.2.3


From 0a5ebc148879be68acdb12fbe72b65cb88c410d9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 2 Oct 2007 12:57:24 -0400
Subject: NFSv4: Don't revalidate the directory in nfs_atomic_lookup()

Why bother, since the call to nfs4_atomic_open() will do it for us.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cf80cf234e27..9ca38ab0e0a7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -994,14 +994,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 
 	/* Open the file on the server */
 	lock_kernel();
-	/* Revalidate parent directory attribute cache */
-	error = nfs_revalidate_inode(NFS_SERVER(dir), dir);
-	if (error < 0) {
-		res = ERR_PTR(error);
-		unlock_kernel();
-		goto out;
-	}
-
 	res = nfs4_atomic_open(dir, dentry, nd);
 	unlock_kernel();
 	if (IS_ERR(res)) {
-- 
cgit v1.2.3


From 216d5d06883edfaf992ada0d72a2a22fdfdbd296 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 20:10:12 -0400
Subject: NFSv4: Use NFSv2/v3 rules for negative dentries in
 nfs_open_revalidate

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9ca38ab0e0a7..41b063c98822 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1037,8 +1037,12 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	/* We can't create new files in nfs_open_revalidate(), so we
 	 * optimize away revalidation of negative dentries.
 	 */
-	if (inode == NULL)
+	if (inode == NULL) {
+		if (!nfs_neg_need_reval(dir, dentry, nd))
+			ret = 1;
 		goto out;
+	}
+
 	/* NFS only supports OPEN on regular files */
 	if (!S_ISREG(inode->i_mode))
 		goto no_open;
-- 
cgit v1.2.3


From d75340cc4de5c187fbf0bba234309ca86cf0a2fb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 21:42:01 -0400
Subject: NFSv4: Fix nfs_atomic_open() to set the verifier on negative dentries
 too

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 8 --------
 fs/nfs/nfs4proc.c      | 7 ++++++-
 include/linux/nfs_fs.h | 5 +++++
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 41b063c98822..82878a19538d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -656,11 +656,6 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 	return 1;
 }
 
-static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
-{
-	dentry->d_time = verf;
-}
-
 /*
  * Return the intent data that applies to this particular path component
  *
@@ -1016,7 +1011,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 		}
 	} else if (res != NULL)
 		dentry = res;
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out:
 	return res;
 no_open:
@@ -1060,8 +1054,6 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	 */
 	lock_kernel();
 	ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
-	if (ret == 1)
-		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	unlock_kernel();
 out:
 	dput(parent);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0748c700301d..52af5a7d679e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1399,13 +1399,16 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred);
 	put_rpccred(cred);
 	if (IS_ERR(state)) {
-		if (PTR_ERR(state) == -ENOENT)
+		if (PTR_ERR(state) == -ENOENT) {
 			d_add(dentry, NULL);
+			nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+		}
 		return (struct dentry *)state;
 	}
 	res = d_add_unique(dentry, igrab(state->inode));
 	if (res != NULL)
 		path.dentry = res;
+	nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
 	nfs4_intent_set_file(nd, &path, state);
 	return res;
 }
@@ -1439,6 +1442,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 		}
 	}
 	if (state->inode == dentry->d_inode) {
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 		nfs4_intent_set_file(nd, &path, state);
 		return 1;
 	}
@@ -1885,6 +1889,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		goto out;
 	}
 	d_add(dentry, igrab(state->inode));
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	if (flags & O_EXCL) {
 		struct nfs_fattr fattr;
 		status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 8f632bd854af..992931cd301e 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -240,6 +240,11 @@ static inline int NFS_USE_READDIRPLUS(struct inode *inode)
 	return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 }
 
+static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
+{
+	dentry->d_time = verf;
+}
+
 /**
  * nfs_save_change_attribute - Returns the inode attribute change cookie
  * @dir - pointer to parent directory inode
-- 
cgit v1.2.3


From 81c768808c78283e1b4ed4cd2cad2571294b2090 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 2 Oct 2007 10:30:00 -0400
Subject: NFSv3: Always use directory post-op attributes in nfs3_proc_lookup

LOOKUP returns the directory post-op attributes whether or not the
operation was successful.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3proc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce1fb99e67e1..6e05f6c9c910 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -166,6 +166,7 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
 	nfs_fattr_init(&dir_attr);
 	nfs_fattr_init(fattr);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_refresh_inode(dir, &dir_attr);
 	if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
 		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
 		msg.rpc_argp = fhandle;
@@ -173,8 +174,6 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
 		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
 	dprintk("NFS reply lookup: %d\n", status);
-	if (status >= 0)
-		status = nfs_refresh_inode(dir, &dir_attr);
 	return status;
 }
 
-- 
cgit v1.2.3


From efbb06b7f98a154ef51ad41674548af5cc1fd005 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 2 Oct 2007 17:11:54 -0400
Subject: NFS: Remove the redundant nfs_reval_fsid()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 82878a19538d..0cc798b40cd7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -864,16 +864,6 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
 	return (nd->intent.open.flags & O_EXCL) != 0;
 }
 
-static inline int nfs_reval_fsid(struct inode *dir, const struct nfs_fattr *fattr)
-{
-	struct nfs_server *server = NFS_SERVER(dir);
-
-	if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
-		/* Revalidate fsid using the parent directory */
-		return __nfs_revalidate_inode(server, dir);
-	return 0;
-}
-
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
 	struct dentry *res;
@@ -912,11 +902,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 		res = ERR_PTR(error);
 		goto out_unlock;
 	}
-	error = nfs_reval_fsid(dir, &fattr);
-	if (error < 0) {
-		res = ERR_PTR(error);
-		goto out_unlock;
-	}
 	inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
 	res = (struct dentry *)inode;
 	if (IS_ERR(res))
-- 
cgit v1.2.3


From b050aa791fad6b060d6ff59305f01289e18b058c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 2 Oct 2007 19:02:07 -0400
Subject: NFS: Don't zap the readdir caches upon error

If necessary, the caches will get zapped under normal revalidation.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0cc798b40cd7..393c48136919 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -211,9 +211,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	unlock_page(page);
 	return 0;
  error:
-	SetPageError(page);
 	unlock_page(page);
-	nfs_zap_caches(inode);
 	desc->error = error;
 	return -EIO;
 }
-- 
cgit v1.2.3


From a12802cab8520f86c9a80bbf87d10ee6171687d1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 2 Oct 2007 19:13:04 -0400
Subject: NFS: Be strict about dentry revalidation when doing exclusive create

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 393c48136919..c2207e3f2634 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -668,6 +668,19 @@ static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigne
 	return nd->flags & mask;
 }
 
+/*
+ * Use intent information to check whether or not we're going to do
+ * an O_EXCL create using this path component.
+ */
+static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
+{
+	if (NFS_PROTO(dir)->version == 2)
+		return 0;
+	if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
+		return 0;
+	return (nd->intent.open.flags & O_EXCL) != 0;
+}
+
 /*
  * Inode and filehandle revalidation for lookups.
  *
@@ -754,7 +767,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	}
 
 	/* Force a full look up iff the parent directory has changed */
-	if (nfs_check_verifier(dir, dentry)) {
+	if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) {
 		if (nfs_lookup_verify_inode(inode, nd))
 			goto out_zap_parent;
 		goto out_valid;
@@ -848,20 +861,6 @@ struct dentry_operations nfs_dentry_operations = {
 	.d_iput		= nfs_dentry_iput,
 };
 
-/*
- * Use intent information to check whether or not we're going to do
- * an O_EXCL create using this path component.
- */
-static inline
-int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
-{
-	if (NFS_PROTO(dir)->version == 2)
-		return 0;
-	if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
-		return 0;
-	return (nd->intent.open.flags & O_EXCL) != 0;
-}
-
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
 	struct dentry *res;
-- 
cgit v1.2.3


From 9697d2342e1a480bc14921c52f185c2fe01016e7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 2 Oct 2007 21:58:05 -0400
Subject: NFS: Ensure that nfs_link() returns a hashed dentry

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c2207e3f2634..d352509d7e2d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1506,10 +1506,11 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
 	lock_kernel();
+	d_drop(dentry);
 	error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
 	if (error == 0) {
 		atomic_inc(&inode->i_count);
-		d_instantiate(dentry, inode);
+		d_add(dentry, inode);
 	}
 	unlock_kernel();
 	return error;
-- 
cgit v1.2.3


From 4f48af45842c6e78ab958c90344d3c940db4da15 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 2 Oct 2007 23:13:32 -0400
Subject: NFS: Simplify filehandle revalidation

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d352509d7e2d..bdf4ba42abd7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -704,6 +704,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
 				(S_ISREG(inode->i_mode) ||
 				 S_ISDIR(inode->i_mode)))
 			goto out_force;
+		return 0;
 	}
 	return nfs_revalidate_inode(server, inode);
 out_force:
-- 
cgit v1.2.3


From c7c209730d635226b81e9aeae63b6dc8f445569f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Sep 2007 19:22:40 -0400
Subject: NFS: Get rid of some obsolete macros

- NFS_READTIME, NFS_CHANGE_ATTR are completely unused.
- Inline the few remaining uses of NFS_ATTRTIMEO, and remove.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 2 +-
 fs/nfs/inode.c         | 4 ++--
 include/linux/nfs_fs.h | 4 ----
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index bdf4ba42abd7..a4fdc4cc306c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1762,7 +1762,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
 	cache = nfs_access_search_rbtree(inode, cred);
 	if (cache == NULL)
 		goto out;
-	if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)))
+	if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
 		goto out_stale;
 	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9d012a6ee4cc..469387920dd9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -117,8 +117,8 @@ static void nfs_zap_caches_locked(struct inode *inode)
 
 	nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 
-	NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
-	NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
+	nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+	nfsi->attrtimeo_timestamp = jiffies;
 
 	memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 992931cd301e..12fe97bac9ff 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -203,16 +203,12 @@ static inline struct nfs_inode *NFS_I(struct inode *inode)
 #define NFS_CLIENT(inode)		(NFS_SERVER(inode)->client)
 #define NFS_PROTO(inode)		(NFS_SERVER(inode)->nfs_client->rpc_ops)
 #define NFS_COOKIEVERF(inode)		(NFS_I(inode)->cookieverf)
-#define NFS_READTIME(inode)		(NFS_I(inode)->read_cache_jiffies)
-#define NFS_CHANGE_ATTR(inode)		(NFS_I(inode)->change_attr)
-#define NFS_ATTRTIMEO(inode)		(NFS_I(inode)->attrtimeo)
 #define NFS_MINATTRTIMEO(inode) \
 	(S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \
 			       : NFS_SERVER(inode)->acregmin)
 #define NFS_MAXATTRTIMEO(inode) \
 	(S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmax \
 			       : NFS_SERVER(inode)->acregmax)
-#define NFS_ATTRTIMEO_UPDATE(inode)	(NFS_I(inode)->attrtimeo_timestamp)
 
 #define NFS_FLAGS(inode)		(NFS_I(inode)->flags)
 #define NFS_STALE(inode)		(test_bit(NFS_INO_STALE, &NFS_FLAGS(inode)))
-- 
cgit v1.2.3


From bf19aacecbeebccb2c3d150a8bd9416b7dba81fe Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 26 Sep 2007 14:38:09 -0400
Subject: nfs: add server port to rpc_pipe info file

On the client, when an alternate server port is specified on the mount
commandline, we need to make sure gssd knows about it.

Also, on the server side, when we're sending krb5 callbacks to the
client, we'll use the same mechanism to let gssd know about the callback
port.

Thanks to Olga Kornievskaia for testing and for an earlier
implementation.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Cc: Olga Kornievskaia <aglo@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpc_pipe.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index ae83ac84f63b..c8433e8865aa 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -329,6 +329,7 @@ rpc_show_info(struct seq_file *m, void *v)
 			clnt->cl_prog, clnt->cl_vers);
 	seq_printf(m, "address: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
 	seq_printf(m, "protocol: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PROTO));
+	seq_printf(m, "port: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PORT));
 	return 0;
 }
 
-- 
cgit v1.2.3


From afde94f398b62c8596a8d0cbfc25798f0b52a371 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 26 Sep 2007 14:38:08 -0400
Subject: SUNRPC: Fix default hostname created in rpc_create()

Since 43780b87fa7..., rpc_create() fills in a default hostname based on
the ip address if the servername passed in is null.  A small typo made
that default incorrect.  (But this information appears to be used only
for debugging right now, so I don't believe the typo causes any bugs in
the current kernel.)

Thanks to Olga Kornievskaia for bug report and testing.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: Olga Kornievskaia <aglo@citi.umich.edu>
Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 6cdf53c489b7..e9866fc93df1 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -259,7 +259,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	 */
 	if (args->servername == NULL) {
 		struct sockaddr_in *addr =
-					(struct sockaddr_in *) &args->address;
+					(struct sockaddr_in *) args->address;
 		snprintf(servername, sizeof(servername), NIPQUAD_FMT,
 			NIPQUAD(addr->sin_addr.s_addr));
 		args->servername = servername;
-- 
cgit v1.2.3


From 67f97d83bfcca9d9f8fbeeb14e7c644a82b24e12 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 26 Sep 2007 14:38:10 -0400
Subject: SUNRPC: Use correct type in buffer length calculations

Use correct type signage in gss_krb5_remove_padding() when doing length
calculations.  Both xdr_buf.len and iov.iov_len are size_t, which is
unsigned; so use an unsigned type for our temporary length variable to
ensure we don't overflow it..

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/auth_gss/gss_krb5_wrap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 42b3220bed39..8bd074df27d3 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -42,7 +42,7 @@ gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
 {
 	u8 *ptr;
 	u8 pad;
-	int len = buf->len;
+	size_t len = buf->len;
 
 	if (len <= buf->head[0].iov_len) {
 		pad = *(u8 *)(buf->head[0].iov_base + len - 1);
@@ -53,9 +53,9 @@ gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
 	} else
 		len -= buf->head[0].iov_len;
 	if (len <= buf->page_len) {
-		int last = (buf->page_base + len - 1)
+		unsigned int last = (buf->page_base + len - 1)
 					>>PAGE_CACHE_SHIFT;
-		int offset = (buf->page_base + len - 1)
+		unsigned int offset = (buf->page_base + len - 1)
 					& (PAGE_CACHE_SIZE - 1);
 		ptr = kmap_atomic(buf->pages[last], KM_USER0);
 		pad = *(ptr + offset);
-- 
cgit v1.2.3


From 7f4adeff6fa32ff46f148110c4f08130175c8da8 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 13 Jun 2007 01:03:13 +0200
Subject: [2.6 patch] net/sunrpc/rpcb_clnt.c: make struct rpcb_program static

This patch makes the needlessly global struct rpcb_program static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 8db3d70c23e3..a05493aedb68 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -103,7 +103,7 @@ enum {
 #define RPCB_MAXOWNERLEN	sizeof(RPCB_OWNER_STRING)
 
 static void			rpcb_getport_done(struct rpc_task *, void *);
-extern struct rpc_program	rpcb_program;
+static struct rpc_program	rpcb_program;
 
 struct rpcbind_args {
 	struct rpc_xprt *	r_xprt;
@@ -666,7 +666,7 @@ static struct rpc_version *rpcb_version[] = {
 
 static struct rpc_stat rpcb_stats;
 
-struct rpc_program rpcb_program = {
+static struct rpc_program rpcb_program = {
 	.name		= "rpcbind",
 	.number		= RPCBIND_PROGRAM,
 	.nrvers		= ARRAY_SIZE(rpcb_version),
-- 
cgit v1.2.3


From 58eaab93376cb524fd0f1531a56902d2b3eaa619 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Sat, 21 Jul 2007 17:03:28 +0200
Subject: [23/37] Clean up duplicate includes in

Hi,

This patch cleans up duplicate includes in
	include/linux/nfs_fs.h

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 12fe97bac9ff..0e0c2e0d3aa3 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -47,10 +47,8 @@
 #include <linux/nfs3.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_xdr.h>
-
 #include <linux/nfs_fs_sb.h>
 
-#include <linux/rwsem.h>
 #include <linux/mempool.h>
 
 /*
-- 
cgit v1.2.3


From 2199700f1d660494d2552278354422c51becb686 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 11:43:37 -0400
Subject: SUNRPC: Fix buggy UDP transmission

xs_sendpages() may return a negative result. We sure as hell don't want to
add that to the 'tk_bytes_sent' tally...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e97e4ca43ef3..6f32cc57f429 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -587,13 +587,13 @@ static int xs_udp_send_request(struct rpc_task *task)
 	dprintk("RPC:       xs_udp_send_request(%u) = %d\n",
 			xdr->len - req->rq_bytes_sent, status);
 
-	task->tk_bytes_sent += status;
-	if (likely(status >= (int) req->rq_slen))
-		return 0;
-
-	/* Still some bytes left; set up for a retry later. */
-	if (status > 0)
+	if (status >= 0) {
+		task->tk_bytes_sent += status;
+		if (status >= req->rq_slen)
+			return 0;
+		/* Still some bytes left; set up for a retry later. */
 		status = -EAGAIN;
+	}
 
 	switch (status) {
 	case -ENETUNREACH:
-- 
cgit v1.2.3


From b6e9c713f5c526a85893c6e0ab1d5d6c6f1ab479 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 12:06:44 -0400
Subject: SUNRPC: Don't call xprt_release() if call_allocate fails

It completely fouls up the RPC call statistics, and serves no useful
purpose.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e9866fc93df1..e6a2678ecec3 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -846,8 +846,7 @@ call_allocate(struct rpc_task *task)
 	dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
 
 	if (RPC_IS_ASYNC(task) || !signalled()) {
-		xprt_release(task);
-		task->tk_action = call_reserve;
+		task->tk_action = call_allocate;
 		rpc_delay(task, HZ>>4);
 		return;
 	}
-- 
cgit v1.2.3


From 220bcc2afd7011b3e0569fc178331fa983c92c1b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 1 Oct 2007 12:06:48 -0400
Subject: SUNRPC: Don't call xprt_release in call refresh

Call it from call_verify() instead...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e6a2678ecec3..76be83ee4b04 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1271,7 +1271,6 @@ call_refresh(struct rpc_task *task)
 {
 	dprint_status(task);
 
-	xprt_release(task);	/* Must do to obtain new XID */
 	task->tk_action = call_refreshresult;
 	task->tk_status = 0;
 	task->tk_client->cl_stats->rpcauthrefresh++;
@@ -1389,6 +1388,8 @@ call_verify(struct rpc_task *task)
 			dprintk("RPC: %5u %s: retry stale creds\n",
 					task->tk_pid, __FUNCTION__);
 			rpcauth_invalcred(task);
+			/* Ensure we obtain a new XID! */
+			xprt_release(task);
 			task->tk_action = call_refresh;
 			goto out_retry;
 		case RPC_AUTH_BADCRED:
-- 
cgit v1.2.3


From 9e08a3c5aec5b745e844328bcbc16458b6118faf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 8 Oct 2007 14:10:31 -0400
Subject: NFS: Use nfs_refresh_inode() in ops that aren't expected to change
 the inode

nfs_post_op_update_inode() is really only meant to be used if we expect the
inode and its attributes to have changed in some way.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3proc.c | 3 +--
 fs/nfs/nfs4proc.c | 5 ++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 6e05f6c9c910..4cdc2361a669 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -777,8 +777,7 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
-	if (task->tk_status >= 0)
-		nfs_post_op_update_inode(data->inode, data->res.fattr);
+	nfs_refresh_inode(data->inode, data->res.fattr);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 52af5a7d679e..30d5dd5735cd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2475,8 +2475,7 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 		rpc_restart_call(task);
 		return -EAGAIN;
 	}
-	if (task->tk_status >= 0)
-		nfs_post_op_update_inode(inode, data->res.fattr);
+	nfs_refresh_inode(inode, data->res.fattr);
 	return 0;
 }
 
@@ -3046,7 +3045,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	if (status == 0) {
 		status = data->rpc_status;
 		if (status == 0)
-			nfs_post_op_update_inode(inode, &data->fattr);
+			nfs_refresh_inode(inode, &data->fattr);
 	}
 	rpc_put_task(task);
 	return status;
-- 
cgit v1.2.3


From 40d24704091c8a29a4c99d25670f1996749aea6f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 8 Oct 2007 09:24:22 -0400
Subject: NFS: Fix a connectathon regression in NFSv3 and NFSv4

We're failing basic test6 against Linux servers because they lack a correct
change attribute. The fix is to assume that we always want to invalidate
the readdir caches when we call update_changeattr and/or
nfs_post_op_update_inode on a directory.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c    | 10 +++++++++-
 fs/nfs/nfs4proc.c | 10 ++++------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 469387920dd9..c44cd02e5181 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -876,11 +876,19 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if (fattr->valid & NFS_ATTR_FATTR)
+	if (fattr->valid & NFS_ATTR_FATTR) {
+		if (S_ISDIR(inode->i_mode)) {
+			spin_lock(&inode->i_lock);
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+			spin_unlock(&inode->i_lock);
+		}
 		return nfs_refresh_inode(inode, fattr);
+	}
 
 	spin_lock(&inode->i_lock);
 	nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	if (S_ISDIR(inode->i_mode))
+		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 	spin_unlock(&inode->i_lock);
 	return 0;
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 30d5dd5735cd..cb99fd90a9ac 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -208,12 +208,10 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 	struct nfs_inode *nfsi = NFS_I(dir);
 
 	spin_lock(&dir->i_lock);
-	if (cinfo->after != nfsi->change_attr) {
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
-		if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
-			nfsi->cache_change_attribute = jiffies;
-		nfsi->change_attr = cinfo->after;
-	}
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
+	if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
+		nfsi->cache_change_attribute = jiffies;
+	nfsi->change_attr = cinfo->after;
 	spin_unlock(&dir->i_lock);
 }
 
-- 
cgit v1.2.3


From 2a3f5fd45938bd86ce8faf4cb26be4f7e9ae2941 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 8 Oct 2007 14:26:13 -0400
Subject: NFS: nfs_refresh_inode should clear cache_validity flags on success

If the cached attributes match the ones supplied in the fattr, then assume
we've revalidated the inode.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c44cd02e5181..cad1246bf575 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -790,6 +790,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_size, new_isize;
+	unsigned long invalid = 0;
 
 
 	/* Has the inode gone and changed behind our back? */
@@ -803,29 +804,36 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
 			nfsi->change_attr != fattr->change_attr)
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	/* Verify a few of the more important attributes */
 	if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	cur_size = i_size_read(inode);
  	new_isize = nfs_size_to_loff_t(fattr->size);
 	if (cur_size != new_isize && nfsi->npages == 0)
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	/* Have any file permissions changed? */
 	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
 			|| inode->i_uid != fattr->uid
 			|| inode->i_gid != fattr->gid)
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
 
 	/* Has the link count changed? */
 	if (inode->i_nlink != fattr->nlink)
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+		invalid |= NFS_INO_INVALID_ATTR;
 
 	if (!timespec_equal(&inode->i_atime, &fattr->atime))
-		nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
+		invalid |= NFS_INO_INVALID_ATIME;
+
+	if (invalid != 0)
+		nfsi->cache_validity |= invalid;
+	else
+		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ATIME
+				| NFS_INO_REVAL_PAGECACHE);
 
 	nfsi->read_cache_jiffies = fattr->time_start;
 	return 0;
@@ -876,21 +884,12 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if (fattr->valid & NFS_ATTR_FATTR) {
-		if (S_ISDIR(inode->i_mode)) {
-			spin_lock(&inode->i_lock);
-			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
-			spin_unlock(&inode->i_lock);
-		}
-		return nfs_refresh_inode(inode, fattr);
-	}
-
 	spin_lock(&inode->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 	if (S_ISDIR(inode->i_mode))
 		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 	spin_unlock(&inode->i_lock);
-	return 0;
+	return nfs_refresh_inode(inode, fattr);
 }
 
 /**
@@ -938,7 +937,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	struct nfs_server *server;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_isize, new_isize;
-	unsigned int	invalid = 0;
+	unsigned long invalid = 0;
 	unsigned long now = jiffies;
 
 	dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
-- 
cgit v1.2.3


From f43bf0bebed7c33b698a8a25f95812f9e87c3843 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 9 Oct 2007 12:01:04 -0400
Subject: NFS: Add a boot parameter to disable 64 bit inode numbers

This boot parameter will allow legacy 32-bit applications which call stat()
to continue to function even if the NFSv3/v4 server uses 64-bit inode
numbers.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 Documentation/kernel-parameters.txt |  7 +++++++
 fs/nfs/dir.c                        |  3 ++-
 fs/nfs/inode.c                      | 27 ++++++++++++++++++++++++++-
 include/linux/nfs_fs.h              |  1 +
 4 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4d175c751246..83e21045114e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1073,6 +1073,13 @@ and is between 256 and 4096 characters. It is defined in the file
 			[NFS] set the maximum lifetime for idmapper cache
 			entries.
 
+	nfs.enable_ino64=
+			[NFS] enable 64-bit inode numbers.
+			If zero, the NFS client will fake up a 32-bit inode
+			number for the readdir() and stat() syscalls instead
+			of returning the full 64-bit number.
+			The default is to return 64-bit inode numbers.
+
 	nmi_watchdog=	[KNL,BUGS=X86-32] Debugging features for SMP kernels
 
 	no387		[BUGS=X86-32] Tells the kernel to use the 387 maths
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a4fdc4cc306c..8ec7fbd8240c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -427,7 +427,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 		}
 
 		res = filldir(dirent, entry->name, entry->len, 
-			      file->f_pos, fileid, d_type);
+			      file->f_pos, nfs_compat_user_ino64(fileid),
+			      d_type);
 		if (res < 0)
 			break;
 		file->f_pos++;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index cad1246bf575..035c769b715e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -49,6 +49,11 @@
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
+#define NFS_64_BIT_INODE_NUMBERS_ENABLED	1
+
+/* Default is to see 64-bit inode numbers */
+static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+
 static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
 
@@ -62,6 +67,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 	return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
+/**
+ * nfs_compat_user_ino64 - returns the user-visible inode number
+ * @fileid: 64-bit fileid
+ *
+ * This function returns a 32-bit inode number if the boot parameter
+ * nfs.enable_ino64 is zero.
+ */
+u64 nfs_compat_user_ino64(u64 fileid)
+{
+	int ino;
+
+	if (enable_ino64)
+		return fileid;
+	ino = fileid;
+	if (sizeof(ino) < sizeof(fileid))
+		ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
+	return ino;
+}
+
 int nfs_write_inode(struct inode *inode, int sync)
 {
 	int ret;
@@ -456,7 +480,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 		err = nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	if (!err) {
 		generic_fillattr(inode, stat);
-		stat->ino = NFS_FILEID(inode);
+		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
 	}
 	return err;
 }
@@ -1235,6 +1259,7 @@ static void __exit exit_nfs_fs(void)
 /* Not quite true; I just maintain it */
 MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
 MODULE_LICENSE("GPL");
+module_param(enable_ino64, bool, 0644);
 
 module_init(init_nfs_fs)
 module_exit(exit_nfs_fs)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0e0c2e0d3aa3..c5164c257f71 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -289,6 +289,7 @@ extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
+extern u64 nfs_compat_user_ino64(u64 fileid);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern __be32 root_nfs_parse_addr(char *name); /*__init*/
-- 
cgit v1.2.3


From 05c88babab957dfd63bd351b25042d80bd854dd0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 11 Oct 2007 15:11:51 -0400
Subject: NFSv4: Fix a typo in nfs_inode_reclaim_delegation

We were intending to put the previous instance of delegation->cred
before setting a new one.

Thanks to David Howells for spotting this.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7a1b6e869f9c..af8b235d405d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -109,6 +109,7 @@ again:
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
 {
 	struct nfs_delegation *delegation = NFS_I(inode)->delegation;
+	struct rpc_cred *oldcred;
 
 	if (delegation == NULL)
 		return;
@@ -116,11 +117,12 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
 			sizeof(delegation->stateid.data));
 	delegation->type = res->delegation_type;
 	delegation->maxsize = res->maxsize;
-	put_rpccred(cred);
+	oldcred = delegation->cred;
 	delegation->cred = get_rpccred(cred);
 	delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM;
 	NFS_I(inode)->delegation_state = delegation->type;
 	smp_wmb();
+	put_rpccred(oldcred);
 }
 
 /*
-- 
cgit v1.2.3