Merge branch 'master'

author: David Teigland <teigland@redhat.com> 2006-04-27 11:49:55 -0400
committer: Steven Whitehouse <swhiteho@redhat.com> 2006-04-27 11:49:55 -0400
commit: d26046bb0aff707aac38a9bf3dd56fa39b28a399 (patch)
tree: c59e0514ca45cb56c3cbf7920a4590dad6975bcd /fs
parent: e7f5c01caddbad150dcf003f76cd5aac413f4c50 (diff)
parent: 2be4d50295e2b6f62c07b614e1b103e280dddb84 (diff)
21 files changed, 491 insertions, 234 deletions
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 8a2de038882e..1a27ecb46c9a 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,7 +1,11 @@
 Version 1.42
 ------------
 Fix slow oplock break when mounted to different servers at the same time and
-the tids match and we try to find matching fid on wrong server.
+the tids match and we try to find matching fid on wrong server. Fix read
+looping when signing required by server (2.6.16 kernel only). Fix readdir
+vs. rename race which could cause each to hang. Return . and .. even
+if server does not.  Allow searches to skip first three entries and
+begin at any location. Fix oops in find_writeable_file.
 
 Version 1.41
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index b2b4d0803761..0355003f4f0a 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -511,6 +511,14 @@ LinuxExtensionsEnabled	If set to one then the client will attempt to
 			support and want to map the uid and gid fields 
 			to values supplied at mount (rather than the 
 			actual values, then set this to zero. (default 1)
+Experimental            When set to 1 used to enable certain experimental
+			features (currently enables multipage writes
+			when signing is enabled, the multipage write
+			performance enhancement was disabled when
+			signing turned on in case buffer was modified
+			just before it was sent, also this flag will
+			be used to use the new experimental sessionsetup
+			code).
 
 These experimental features and tracing can be enabled by changing flags in 
 /proc/fs/cifs (after the cifs module has been installed or built into the 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d4b713e5affb..c262d8874ce9 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -33,6 +33,7 @@
 #include <linux/vfs.h>
 #include <linux/mempool.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #define DECLARE_GLOBALS_HERE
@@ -75,9 +76,6 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
 module_param(cifs_max_pending, int, 0);
 MODULE_PARM_DESC(cifs_max_pending,"Simultaneous requests to server. Default: 50 Range: 2 to 256");
 
-static DECLARE_COMPLETION(cifs_oplock_exited);
-static DECLARE_COMPLETION(cifs_dnotify_exited);
-
 extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
@@ -841,10 +839,6 @@ static int cifs_oplock_thread(void * dummyarg)
 	__u16  netfid;
 	int rc;
 
-	daemonize("cifsoplockd");
-	allow_signal(SIGTERM);
-
-	oplockThread = current;
 	do {
 		if (try_to_freeze()) 
 			continue;
@@ -900,9 +894,9 @@ static int cifs_oplock_thread(void * dummyarg)
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(1);  /* yield in case q were corrupt */
 		}
-	} while(!signal_pending(current));
-	oplockThread = NULL;
-	complete_and_exit (&cifs_oplock_exited, 0);
+	} while (!kthread_should_stop());
+
+	return 0;
 }
 
 static int cifs_dnotify_thread(void * dummyarg)
@@ -910,10 +904,6 @@ static int cifs_dnotify_thread(void * dummyarg)
 	struct list_head *tmp;
 	struct cifsSesInfo *ses;
 
-	daemonize("cifsdnotifyd");
-	allow_signal(SIGTERM);
-
-	dnotifyThread = current;
 	do {
 		if(try_to_freeze())
 			continue;
@@ -931,8 +921,9 @@ static int cifs_dnotify_thread(void * dummyarg)
 				wake_up_all(&ses->server->response_q);
 		}
 		read_unlock(&GlobalSMBSeslock);
-	} while(!signal_pending(current));
-	complete_and_exit (&cifs_dnotify_exited, 0);
+	} while (!kthread_should_stop());
+
+	return 0;
 }
 
 static int __init
@@ -982,32 +973,48 @@ init_cifs(void)
 	}
 
 	rc = cifs_init_inodecache();
-	if (!rc) {
-		rc = cifs_init_mids();
-		if (!rc) {
-			rc = cifs_init_request_bufs();
-			if (!rc) {
-				rc = register_filesystem(&cifs_fs_type);
-				if (!rc) {                
-					rc = (int)kernel_thread(cifs_oplock_thread, NULL, 
-						CLONE_FS | CLONE_FILES | CLONE_VM);
-					if(rc > 0) {
-						rc = (int)kernel_thread(cifs_dnotify_thread, NULL,
-							CLONE_FS | CLONE_FILES | CLONE_VM);
-						if(rc > 0)
-							return 0;
-						else
-							cERROR(1,("error %d create dnotify thread", rc));
-					} else {
-						cERROR(1,("error %d create oplock thread",rc));
-					}
-				}
-				cifs_destroy_request_bufs();
-			}
-			cifs_destroy_mids();
-		}
-		cifs_destroy_inodecache();
+	if (rc)
+		goto out_clean_proc;
+
+	rc = cifs_init_mids();
+	if (rc)
+		goto out_destroy_inodecache;
+
+	rc = cifs_init_request_bufs();
+	if (rc)
+		goto out_destroy_mids;
+
+	rc = register_filesystem(&cifs_fs_type);
+	if (rc)
+		goto out_destroy_request_bufs;
+
+	oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd");
+	if (IS_ERR(oplockThread)) {
+		rc = PTR_ERR(oplockThread);
+		cERROR(1,("error %d create oplock thread", rc));
+		goto out_unregister_filesystem;
 	}
+
+	dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
+	if (IS_ERR(dnotifyThread)) {
+		rc = PTR_ERR(dnotifyThread);
+		cERROR(1,("error %d create dnotify thread", rc));
+		goto out_stop_oplock_thread;
+	}
+
+	return 0;
+
+ out_stop_oplock_thread:
+	kthread_stop(oplockThread);
+ out_unregister_filesystem:
+	unregister_filesystem(&cifs_fs_type);
+ out_destroy_request_bufs:
+	cifs_destroy_request_bufs();
+ out_destroy_mids:
+	cifs_destroy_mids();
+ out_destroy_inodecache:
+	cifs_destroy_inodecache();
+ out_clean_proc:
 #ifdef CONFIG_PROC_FS
 	cifs_proc_clean();
 #endif
@@ -1025,14 +1032,8 @@ exit_cifs(void)
 	cifs_destroy_inodecache();
 	cifs_destroy_mids();
 	cifs_destroy_request_bufs();
-	if(oplockThread) {
-		send_sig(SIGTERM, oplockThread, 1);
-		wait_for_completion(&cifs_oplock_exited);
-	}
-	if(dnotifyThread) {
-		send_sig(SIGTERM, dnotifyThread, 1);
-		wait_for_completion(&cifs_dnotify_exited);
-	}
+	kthread_stop(oplockThread);
+	kthread_stop(dnotifyThread);
 }
 
 MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d705500aa283..fd36892eda55 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3119,7 +3119,7 @@ findFirstRetry:
 				psrch_inf->endOfSearch = FALSE;
 
 			psrch_inf->entries_in_buffer  = le16_to_cpu(parms->SearchCount);
-			psrch_inf->index_of_last_entry = 
+			psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
 				psrch_inf->entries_in_buffer;
 			*pnetfid = parms->SearchHandle;
 		} else {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0b86d5ca9014..d2ec806a4f32 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3447,6 +3447,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			pSesInfo->server->secMode,
 			pSesInfo->server->capabilities,
 			pSesInfo->server->timeZone));
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+		if(experimEnabled > 1)
+			rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */,
+					    &ntlmv2_flag, nls_info);	
+		else
+#endif
 		if (extended_security
 				&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
 				&& (pSesInfo->server->secType == NTLMSSP)) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 1d0ca3eaaca5..82315edc77d7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -139,9 +139,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -316,9 +314,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 	if(full_path == NULL)
 		rc = -ENOMEM;
 	else if (pTcon->ses->capabilities & CAP_UNIX) {
@@ -440,6 +436,20 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct name
 	cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
+	/*
+	 * Don't allow the separator character in a path component.
+	 * The VFS will not allow "/", but "\" is allowed by posix.
+	 */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) {
+		int i;
+		for (i = 0; i < direntry->d_name.len; i++)
+			if (direntry->d_name.name[i] == '\\') {
+				cFYI(1, ("Invalid file name"));
+				FreeXid(xid);
+				return ERR_PTR(-EINVAL);
+			}
+	}
+
 	/* can not grab the rename sem here since it would
 	deadlock in the cases (beginning of sys_rename itself)
 	in which we already have the sb rename sem */
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
index ec4dfe9bf5ef..633a93811328 100644
--- a/fs/cifs/fcntl.c
+++ b/fs/cifs/fcntl.c
@@ -86,9 +86,7 @@ int cifs_dir_notify(struct file * file, unsigned long arg)
 	cifs_sb = CIFS_SB(file->f_dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(file->f_dentry);
-	mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 
 	if(full_path == NULL) {
 		rc = -ENOMEM;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5c497c529772..e152bf6afa60 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -203,9 +203,7 @@ int cifs_open(struct inode *inode, struct file *file)
 		}
 	}
 
-	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(file->f_dentry);
-	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -906,8 +904,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 				if (rc != 0)
 					break;
 			}
-			/* BB FIXME We can not sign across two buffers yet */
-			if((pTcon->ses->server->secMode & 
+			if(experimEnabled || (pTcon->ses->server->secMode & 
 			 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) == 0) {
 				struct kvec iov[2];
 				unsigned int len;
@@ -923,13 +920,13 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 						*poffset, &bytes_written,
 						iov, 1, long_op);
 			} else
-			/* BB FIXME fixup indentation of line below */
-			rc = CIFSSMBWrite(xid, pTcon,
-				 open_file->netfid,
-				 min_t(const int, cifs_sb->wsize, 
-				       write_size - total_written),
-				 *poffset, &bytes_written,
-				 write_data + total_written, NULL, long_op);
+				rc = CIFSSMBWrite(xid, pTcon,
+					 open_file->netfid,
+					 min_t(const int, cifs_sb->wsize,
+					       write_size - total_written),
+					 *poffset, &bytes_written,
+					 write_data + total_written,
+					 NULL, long_op);
 		}
 		if (rc || (bytes_written == 0)) {
 			if (total_written)
@@ -968,6 +965,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 	struct cifsFileInfo *open_file;
 	int rc;
 
+	/* Having a null inode here (because mapping->host was set to zero by
+	the VFS or MM) should not happen but we had reports of on oops (due to
+	it being zero) during stress testcases so we need to check for it */
+
+	if(cifs_inode == NULL) {
+		cERROR(1,("Null inode passed to cifs_writeable_file"));
+		dump_stack();
+		return NULL;
+	}
+
 	read_lock(&GlobalSMBSeslock);
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
 		if (open_file->closePend)
@@ -1093,12 +1100,11 @@ static int cifs_writepages(struct address_space *mapping,
 	if (cifs_sb->wsize < PAGE_CACHE_SIZE)
 		return generic_writepages(mapping, wbc);
 
-	/* BB FIXME we do not have code to sign across multiple buffers yet,
-	   so go to older writepage style write which we can sign if needed */
 	if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
 		if(cifs_sb->tcon->ses->server->secMode &
                           (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-			return generic_writepages(mapping, wbc);
+			if(!experimEnabled)
+				return generic_writepages(mapping, wbc);
 
 	/*
 	 * BB: Is this meaningful for a non-block-device file system?
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 957ddd1571c6..4093764ef461 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -722,9 +722,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -807,9 +805,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -1141,9 +1137,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 			rc = 0;
 	}
 		
-	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 9562f5bba65c..2ec99f833142 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -48,10 +48,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
 /* No need to check for cross device links since server will do that
    BB note DFS case in future though (when we may have to check) */
 
-	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	fromName = build_path_from_dentry(old_file);
 	toName = build_path_from_dentry(direntry);
-	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if((fromName == NULL) || (toName == NULL)) {
 		rc = -ENOMEM;
 		goto cifs_hl_exit;
@@ -103,9 +101,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 
 	xid = GetXid();
 
-	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 
 	if (!full_path)
 		goto out_no_free;
@@ -164,9 +160,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 
 	if(full_path == NULL) {
 		FreeXid(xid);
diff --git a/fs/cifs/ntlmssp.c b/fs/cifs/ntlmssp.c
index 78866f925747..115359cc7a32 100644
--- a/fs/cifs/ntlmssp.c
+++ b/fs/cifs/ntlmssp.c
@@ -121,6 +121,20 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int type,
 	}
 
 
+	/* copy session key */
+
+	/* if Unicode, align strings to two byte boundary */
+
+	/* copy user name */ /* BB Do we need to special case null user name? */
+
+	/* copy domain name */
+
+	/* copy Linux version */
+
+	/* copy network operating system name */
+
+	/* update bcc and smb buffer length */
+
 /*	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buf_type, 0); */
 	/* SMB request buf freed in SendReceive2 */
 
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 2f6e2825571e..b689c5035124 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -404,9 +404,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	if(pTcon == NULL)
 		return -EINVAL;
 
-	mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(file->f_dentry);
-	mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 
 	if(full_path == NULL) {
 		return -ENOMEM;
@@ -592,6 +590,13 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	first_entry_in_buffer = 
 		cifsFile->srch_inf.index_of_last_entry - 
 			cifsFile->srch_inf.entries_in_buffer;
+
+	/* if first entry in buf is zero then is first buffer
+	in search response data which means it is likely . and ..
+	will be in this buffer, although some servers do not return
+	. and .. for the root of a drive and for those we need
+	to start two entries earlier */
+
 /*	dump_cifs_file_struct(file, "In fce ");*/
 	if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && 
 	     is_dir_changed(file)) || 
@@ -634,23 +639,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		char * end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + 
 			smbCalcSize((struct smb_hdr *)
 				cifsFile->srch_inf.ntwrk_buf_start);
+
+		current_entry = cifsFile->srch_inf.srch_entries_start;
 		first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
 					- cifsFile->srch_inf.entries_in_buffer;
 		pos_in_buf = index_to_find - first_entry_in_buffer;
 		cFYI(1,("found entry - pos_in_buf %d",pos_in_buf)); 
-		current_entry = cifsFile->srch_inf.srch_entries_start;
 		for(i=0;(i<(pos_in_buf)) && (current_entry != NULL);i++) {
 			/* go entry by entry figuring out which is first */
-			/* if( . or ..)
-				skip */
-			rc = cifs_entry_is_dot(current_entry,cifsFile);
-			if(rc == 1) /* is . or .. so skip */ {
-				cFYI(1,("Entry is .")); /* BB removeme BB */
-				/* continue; */
-			} else if (rc == 2 ) {
-				cFYI(1,("Entry is ..")); /* BB removeme BB */
-				/* continue; */
-			}
 			current_entry = nxt_dir_entry(current_entry,end_of_smb);
 		}
 		if((current_entry == NULL) && (i < pos_in_buf)) {
@@ -770,6 +766,11 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	if(file->f_dentry == NULL)
 		return -ENOENT;
 
+	rc = cifs_entry_is_dot(pfindEntry,pCifsF);
+	/* skip . and .. since we added them first */
+	if(rc != 0) 
+		return 0;
+
 	cifs_sb = CIFS_SB(file->f_dentry->d_sb);
 
 	qstring.name = scratch_buf;
@@ -898,22 +899,22 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	switch ((int) file->f_pos) {
 	case 0:
-		/*if (filldir(direntry, ".", 1, file->f_pos,
+		if (filldir(direntry, ".", 1, file->f_pos,
 		     file->f_dentry->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for current dir failed "));
+			cERROR(1, ("Filldir for current dir failed"));
 			rc = -ENOMEM;
 			break;
 		}
-		file->f_pos++; */
+		file->f_pos++;
 	case 1:
-		/* if (filldir(direntry, "..", 2, file->f_pos,
+		if (filldir(direntry, "..", 2, file->f_pos,
 		     file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
 			cERROR(1, ("Filldir for parent dir failed "));
 			rc = -ENOMEM;
 			break;
 		}
-		file->f_pos++; */
-	case 2:
+		file->f_pos++;
+	default:
 		/* 1) If search is active, 
 			is in current search buffer? 
 			if it before then restart search
@@ -927,7 +928,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 				return rc;
 			}
 		}
-	default:
 		if(file->private_data == NULL) {
 			rc = -EINVAL;
 			FreeXid(xid);
@@ -947,8 +947,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		kfree(cifsFile->search_resume_name);
 		cifsFile->search_resume_name = NULL; */
 
-		/* BB account for . and .. in f_pos as special case */
-
 		rc = find_cifs_entry(xid,pTcon, file,
 				&current_entry,&num_to_fill);
 		if(rc) {
@@ -977,7 +975,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 					  num_to_fill, i));
 				break;
 			}
-
+			/* if buggy server returns . and .. late do
+			we want to check for that here? */
 			rc = cifs_filldir(current_entry, file, 
 					filldir, direntry,tmp_buf);
 			file->f_pos++;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3938444d87b2..7754d641775e 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -62,9 +62,7 @@ int cifs_removexattr(struct dentry * direntry, const char * ea_name)
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
                                                                                      
-	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -116,9 +114,7 @@ int cifs_setxattr(struct dentry * direntry, const char * ea_name,
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -223,9 +219,7 @@ ssize_t cifs_getxattr(struct dentry * direntry, const char * ea_name,
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -341,9 +335,7 @@ ssize_t cifs_listxattr(struct dentry * direntry, char * data, size_t buf_size)
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
diff --git a/fs/compat.c b/fs/compat.c
index 7f8e26ea427c..2e32bd340474 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1217,6 +1217,10 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 	if (ret < 0)
 		goto out;
 
+	ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE);
+	if (ret)
+		goto out;
+
 	fnv = NULL;
 	if (type == READ) {
 		fn = file->f_op->read;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index aaf1da17b6d4..8c22aa9a7fbb 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -48,6 +48,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~EXT3_DIRSYNC_FL;
 
+		mutex_lock(&inode->i_mutex);
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
@@ -60,8 +61,10 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE))
+			if (!capable(CAP_LINUX_IMMUTABLE)) {
+				mutex_unlock(&inode->i_mutex);
 				return -EPERM;
+			}
 		}
 
 		/*
@@ -69,14 +72,18 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
+			if (!capable(CAP_SYS_RESOURCE)) {
+				mutex_unlock(&inode->i_mutex);
 				return -EPERM;
+			}
 		}
 
 
 		handle = ext3_journal_start(inode, 1);
-		if (IS_ERR(handle))
+		if (IS_ERR(handle)) {
+			mutex_unlock(&inode->i_mutex);
 			return PTR_ERR(handle);
+		}
 		if (IS_SYNC(inode))
 			handle->h_sync = 1;
 		err = ext3_reserve_inode_write(handle, inode, &iloc);
@@ -93,11 +100,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
 		ext3_journal_stop(handle);
-		if (err)
+		if (err) {
+			mutex_unlock(&inode->i_mutex);
 			return err;
+		}
 
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
 			err = ext3_change_inode_journal_flag(inode, jflag);
+		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
 	case EXT3_IOC_GETVERSION:
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index c5ffa8523968..8aac5334680d 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -213,7 +213,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 			goto exit_bh;
 		}
 		lock_buffer(bh);
-		memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size);
+		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size);
 		set_buffer_uptodate(gdb);
 		unlock_buffer(bh);
 		ext3_journal_dirty_metadata(handle, gdb);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cc750c68fe70..104a62dadb94 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -128,14 +128,24 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 	}
 }
 
-void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req)
+/*
+ * Called with sbput_sem held for read (request_end) or write
+ * (fuse_put_super).  By the time fuse_put_super() is finished, all
+ * inodes belonging to background requests must be released, so the
+ * iputs have to be done within the locked region.
+ */
+void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req)
 {
-	list_del_init(&req->bg_entry);
+	iput(req->inode);
+	iput(req->inode2);
+	spin_lock(&fc->lock);
+	list_del(&req->bg_entry);
 	if (fc->num_background == FUSE_MAX_BACKGROUND) {
 		fc->blocked = 0;
 		wake_up_all(&fc->blocked_waitq);
 	}
 	fc->num_background--;
+	spin_unlock(&fc->lock);
 }
 
 /*
@@ -165,27 +175,22 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		wake_up(&req->waitq);
 		fuse_put_request(fc, req);
 	} else {
-		struct inode *inode = req->inode;
-		struct inode *inode2 = req->inode2;
-		struct file *file = req->file;
 		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 		req->end = NULL;
-		req->inode = NULL;
-		req->inode2 = NULL;
-		req->file = NULL;
-		if (!list_empty(&req->bg_entry))
-			fuse_remove_background(fc, req);
 		spin_unlock(&fc->lock);
+		down_read(&fc->sbput_sem);
+		if (fc->mounted)
+			fuse_release_background(fc, req);
+		up_read(&fc->sbput_sem);
+
+		/* fput must go outside sbput_sem, otherwise it can deadlock */
+		if (req->file)
+			fput(req->file);
 
 		if (end)
 			end(fc, req);
 		else
 			fuse_put_request(fc, req);
-
-		if (file)
-			fput(file);
-		iput(inode);
-		iput(inode2);
 	}
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 59661c481d9d..0474202cb5dc 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -258,9 +258,15 @@ struct fuse_conn {
 	/** waitq for blocked connection */
 	wait_queue_head_t blocked_waitq;
 
+	/** RW semaphore for exclusion with fuse_put_super() */
+	struct rw_semaphore sbput_sem;
+
 	/** The next unique request id */
 	u64 reqctr;
 
+	/** Mount is active */
+	unsigned mounted;
+
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -471,11 +477,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
- * Remove request from the the background list
+ * Release inodes and file associated with background request
  */
-void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
 
-/** Abort all requests */
+/* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
 
 /**
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 43a6fc0db8a7..7627022446b2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -204,26 +204,17 @@ static void fuse_put_super(struct super_block *sb)
 {
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
+	down_write(&fc->sbput_sem);
+	while (!list_empty(&fc->background))
+		fuse_release_background(fc,
+					list_entry(fc->background.next,
+						   struct fuse_req, bg_entry));
+
 	spin_lock(&fc->lock);
+	fc->mounted = 0;
 	fc->connected = 0;
-	while (!list_empty(&fc->background)) {
-		struct fuse_req *req = list_entry(fc->background.next,
-						  struct fuse_req, bg_entry);
-		struct inode *inode = req->inode;
-		struct inode *inode2 = req->inode2;
-
-		/* File would hold a reference to vfsmount */
-		BUG_ON(req->file);
-		req->inode = NULL;
-		req->inode2 = NULL;
-		fuse_remove_background(fc, req);
-
-		spin_unlock(&fc->lock);
-		iput(inode);
-		iput(inode2);
-		spin_lock(&fc->lock);
-	}
 	spin_unlock(&fc->lock);
+	up_write(&fc->sbput_sem);
 	/* Flush all readers on this fs */
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	wake_up_all(&fc->waitq);
@@ -395,6 +386,7 @@ static struct fuse_conn *new_conn(void)
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->io);
 		INIT_LIST_HEAD(&fc->background);
+		init_rwsem(&fc->sbput_sem);
 		kobj_set_kset_s(fc, connections_subsys);
 		kobject_init(&fc->kobj);
 		atomic_set(&fc->num_waiting, 0);
@@ -508,11 +500,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (file->f_op != &fuse_dev_operations)
 		return -EINVAL;
 
-	/* Setting file->private_data can't race with other mount()
-	   instances, since BKL is held for ->get_sb() */
-	if (file->private_data)
-		return -EINVAL;
-
 	fc = new_conn();
 	if (!fc)
 		return -ENOMEM;
@@ -548,7 +535,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto err_free_req;
 
+	/* Setting file->private_data can't race with other mount()
+	   instances, since BKL is held for ->get_sb() */
+	err = -EINVAL;
+	if (file->private_data)
+		goto err_kobject_del;
+
 	sb->s_root = root_dentry;
+	fc->mounted = 1;
 	fc->connected = 1;
 	kobject_get(&fc->kobj);
 	file->private_data = fc;
@@ -563,6 +557,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	return 0;
 
+ err_kobject_del:
+	kobject_del(&fc->kobj);
  err_free_req:
 	fuse_request_free(init_req);
  err_put_root:
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 58c418fbca2c..97ae1b92bc47 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -408,8 +408,9 @@ int reiserfs_cache_default_acl(struct inode *inode)
 		acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT);
 		reiserfs_read_unlock_xattrs(inode->i_sb);
 		reiserfs_read_unlock_xattr_i(inode);
-		ret = acl ? 1 : 0;
-		posix_acl_release(acl);
+		ret = (acl && !IS_ERR(acl));
+		if (ret)
+			posix_acl_release(acl);
 	}
 
 	return ret;
diff --git a/fs/splice.c b/fs/splice.c
index 0559e7577a04..447ebc0a37f3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,15 +27,22 @@
 #include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/uio.h>
+
+struct partial_page {
+	unsigned int offset;
+	unsigned int len;
+};
 
 /*
- * Passed to the actors
+ * Passed to splice_to_pipe
  */
-struct splice_desc {
-	unsigned int len, total_len;	/* current and remaining length */
+struct splice_pipe_desc {
+	struct page **pages;		/* page map */
+	struct partial_page *partial;	/* pages[] may not be contig */
+	int nr_pages;			/* number of pages in map */
 	unsigned int flags;		/* splice flags */
-	struct file *file;		/* file to read/write */
-	loff_t pos;			/* file position */
+	struct pipe_buf_operations *ops;/* ops associated with output pipe */
 };
 
 /*
@@ -128,6 +135,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
 	kunmap(buf->page);
 }
 
+static void *user_page_pipe_buf_map(struct file *file,
+				    struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return kmap(buf->page);
+}
+
+static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf)
+{
+	kunmap(buf->page);
+}
+
 static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
 				    struct pipe_buffer *buf)
 {
@@ -143,19 +163,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.get = page_cache_pipe_buf_get,
 };
 
+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static struct pipe_buf_operations user_page_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = user_page_pipe_buf_map,
+	.unmap = user_page_pipe_buf_unmap,
+	.release = page_cache_pipe_buf_release,
+	.steal = user_page_pipe_buf_steal,
+	.get = page_cache_pipe_buf_get,
+};
+
 /*
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
-			    int nr_pages, unsigned long len,
-			    unsigned int offset, unsigned int flags)
+static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
+			      struct splice_pipe_desc *spd)
 {
-	int ret, do_wakeup, i;
+	int ret, do_wakeup, page_nr;
 
 	ret = 0;
 	do_wakeup = 0;
-	i = 0;
+	page_nr = 0;
 
 	if (pipe->inode)
 		mutex_lock(&pipe->inode->i_mutex);
@@ -171,27 +205,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 		if (pipe->nrbufs < PIPE_BUFFERS) {
 			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
-			struct page *page = pages[i++];
-			unsigned long this_len;
 
-			this_len = PAGE_CACHE_SIZE - offset;
-			if (this_len > len)
-				this_len = len;
-
-			buf->page = page;
-			buf->offset = offset;
-			buf->len = this_len;
-			buf->ops = &page_cache_pipe_buf_ops;
+			buf->page = spd->pages[page_nr];
+			buf->offset = spd->partial[page_nr].offset;
+			buf->len = spd->partial[page_nr].len;
+			buf->ops = spd->ops;
 			pipe->nrbufs++;
+			page_nr++;
+			ret += buf->len;
+
 			if (pipe->inode)
 				do_wakeup = 1;
 
-			ret += this_len;
-			len -= this_len;
-			offset = 0;
-			if (!--nr_pages)
-				break;
-			if (!len)
+			if (!--spd->nr_pages)
 				break;
 			if (pipe->nrbufs < PIPE_BUFFERS)
 				continue;
@@ -199,7 +225,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 			break;
 		}
 
-		if (flags & SPLICE_F_NONBLOCK) {
+		if (spd->flags & SPLICE_F_NONBLOCK) {
 			if (!ret)
 				ret = -EAGAIN;
 			break;
@@ -234,8 +260,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
 
-	while (i < nr_pages)
-		page_cache_release(pages[i++]);
+	while (page_nr < spd->nr_pages)
+		page_cache_release(spd->pages[page_nr++]);
 
 	return ret;
 }
@@ -246,17 +272,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   unsigned int flags)
 {
 	struct address_space *mapping = in->f_mapping;
-	unsigned int loff, offset, nr_pages;
+	unsigned int loff, nr_pages;
 	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
-	size_t bytes;
-	int i, error;
+	size_t total_len;
+	int error;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &page_cache_pipe_buf_ops,
+	};
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
-	loff = offset = *ppos & ~PAGE_CACHE_MASK;
-	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	loff = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	if (nr_pages > PIPE_BUFFERS)
 		nr_pages = PIPE_BUFFERS;
@@ -266,15 +299,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	 * read-ahead if this is a non-zero offset (we are likely doing small
 	 * chunk splice and the page is already there) for a single page.
 	 */
-	if (!offset || nr_pages > 1)
-		do_page_cache_readahead(mapping, in, index, nr_pages);
+	if (!loff || spd.nr_pages > 1)
+		do_page_cache_readahead(mapping, in, index, spd.nr_pages);
 
 	/*
 	 * Now fill in the holes:
 	 */
 	error = 0;
-	bytes = 0;
-	for (i = 0; i < nr_pages; i++, index++) {
+	total_len = 0;
+	for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
 		unsigned int this_len;
 
 		if (!len)
@@ -283,7 +316,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		/*
 		 * this_len is the max we'll use from this page
 		 */
-		this_len = min(len, PAGE_CACHE_SIZE - loff);
+		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
 find_page:
 		/*
 		 * lookup the page for this index
@@ -367,26 +400,29 @@ readpage:
 			 */
 			if (end_index == index) {
 				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
-				if (bytes + loff > isize) {
+				if (total_len + loff > isize) {
 					page_cache_release(page);
 					break;
 				}
 				/*
 				 * force quit after adding this page
 				 */
-				nr_pages = i;
+				nr_pages = spd.nr_pages;
 				this_len = min(this_len, loff);
+				loff = 0;
 			}
 		}
 fill_it:
-		pages[i] = page;
-		bytes += this_len;
+		pages[spd.nr_pages] = page;
+		partial[spd.nr_pages].offset = loff;
+		partial[spd.nr_pages].len = this_len;
 		len -= this_len;
+		total_len += this_len;
 		loff = 0;
 	}
 
-	if (i)
-		return move_to_pipe(pipe, pages, i, bytes, offset, flags);
+	if (spd.nr_pages)
+		return splice_to_pipe(pipe, &spd);
 
 	return error;
 }
@@ -439,14 +475,13 @@ EXPORT_SYMBOL(generic_file_splice_read);
 
 /*
  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
- * using sendpage().
+ * using sendpage(). Return the number of bytes sent.
  */
 static int pipe_to_sendpage(struct pipe_inode_info *info,
 			    struct pipe_buffer *buf, struct splice_desc *sd)
 {
 	struct file *file = sd->file;
 	loff_t pos = sd->pos;
-	unsigned int offset;
 	ssize_t ret;
 	void *ptr;
 	int more;
@@ -461,16 +496,13 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
 
-	offset = pos & ~PAGE_CACHE_MASK;
 	more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 
-	ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more);
+	ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len,
+				   &pos, more);
 
 	buf->ops->unmap(info, buf);
-	if (ret == sd->len)
-		return 0;
-
-	return -EIO;
+	return ret;
 }
 
 /*
@@ -499,7 +531,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	struct file *file = sd->file;
 	struct address_space *mapping = file->f_mapping;
 	gfp_t gfp_mask = mapping_gfp_mask(mapping);
-	unsigned int offset;
+	unsigned int offset, this_len;
 	struct page *page;
 	pgoff_t index;
 	char *src;
@@ -515,6 +547,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
 
+	this_len = sd->len;
+	if (this_len + offset > PAGE_CACHE_SIZE)
+		this_len = PAGE_CACHE_SIZE - offset;
+
 	/*
 	 * Reuse buf page, if SPLICE_F_MOVE is set.
 	 */
@@ -558,7 +594,7 @@ find_page:
 		 * the full page.
 		 */
 		if (!PageUptodate(page)) {
-			if (sd->len < PAGE_CACHE_SIZE) {
+			if (this_len < PAGE_CACHE_SIZE) {
 				ret = mapping->a_ops->readpage(file, page);
 				if (unlikely(ret))
 					goto out;
@@ -582,7 +618,7 @@ find_page:
 		}
 	}
 
-	ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
+	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
 	if (ret == AOP_TRUNCATED_PAGE) {
 		page_cache_release(page);
 		goto find_page;
@@ -592,18 +628,22 @@ find_page:
 	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
 		char *dst = kmap_atomic(page, KM_USER0);
 
-		memcpy(dst + offset, src + buf->offset, sd->len);
+		memcpy(dst + offset, src + buf->offset, this_len);
 		flush_dcache_page(page);
 		kunmap_atomic(dst, KM_USER0);
 	}
 
-	ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
+	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
 	if (ret == AOP_TRUNCATED_PAGE) {
 		page_cache_release(page);
 		goto find_page;
 	} else if (ret)
 		goto out;
 
+	/*
+	 * Return the number of bytes written.
+	 */
+	ret = this_len;
 	mark_page_accessed(page);
 	balance_dirty_pages_ratelimited(mapping);
 out:
@@ -616,17 +656,14 @@ out_nomem:
 	return ret;
 }
 
-typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
-			   struct splice_desc *);
-
 /*
  * Pipe input worker. Most of this logic works like a regular pipe, the
  * key here is the 'actor' worker passed in that actually moves the data
  * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
  */
-static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
-			      loff_t *ppos, size_t len, unsigned int flags,
-			      splice_actor *actor)
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+			 loff_t *ppos, size_t len, unsigned int flags,
+			 splice_actor *actor)
 {
 	int ret, do_wakeup, err;
 	struct splice_desc sd;
@@ -652,16 +689,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 				sd.len = sd.total_len;
 
 			err = actor(pipe, buf, &sd);
-			if (err) {
+			if (err <= 0) {
 				if (!ret && err != -ENODATA)
 					ret = err;
 
 				break;
 			}
 
-			ret += sd.len;
-			buf->offset += sd.len;
-			buf->len -= sd.len;
+			ret += err;
+			buf->offset += err;
+			buf->len -= err;
+
+			sd.len -= err;
+			sd.pos += err;
+			sd.total_len -= err;
+			if (sd.len)
+				continue;
 
 			if (!buf->len) {
 				buf->ops = NULL;
@@ -672,8 +715,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 					do_wakeup = 1;
 			}
 
-			sd.pos += sd.len;
-			sd.total_len -= sd.len;
 			if (!sd.total_len)
 				break;
 		}
@@ -741,7 +782,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	struct address_space *mapping = out->f_mapping;
 	ssize_t ret;
 
-	ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
+	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
 	if (ret > 0) {
 		struct inode *inode = mapping->host;
 
@@ -783,7 +824,7 @@ EXPORT_SYMBOL(generic_file_splice_write);
 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 				loff_t *ppos, size_t len, unsigned int flags)
 {
-	return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
+	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 }
 
 EXPORT_SYMBOL(generic_splice_sendpage);
@@ -870,7 +911,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 
 		/*
 		 * We don't have an immediate reader, but we'll read the stuff
-		 * out of the pipe right after the move_to_pipe(). So set
+		 * out of the pipe right after the splice_to_pipe(). So set
 		 * PIPE_READERS appropriately.
 		 */
 		pipe->readers = 1;
@@ -1010,6 +1051,174 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 	return -EINVAL;
 }
 
+/*
+ * Map an iov into an array of pages and offset/length tupples. With the
+ * partial_page structure, we can map several non-contiguous ranges into
+ * our ones pages[] map instead of splitting that operation into pieces.
+ * Could easily be exported as a generic helper for other users, in which
+ * case one would probably want to add a 'max_nr_pages' parameter as well.
+ */
+static int get_iovec_page_array(const struct iovec __user *iov,
+				unsigned int nr_vecs, struct page **pages,
+				struct partial_page *partial)
+{
+	int buffers = 0, error = 0;
+
+	/*
+	 * It's ok to take the mmap_sem for reading, even
+	 * across a "get_user()".
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	while (nr_vecs) {
+		unsigned long off, npages;
+		void __user *base;
+		size_t len;
+		int i;
+
+		/*
+		 * Get user address base and length for this iovec.
+		 */
+		error = get_user(base, &iov->iov_base);
+		if (unlikely(error))
+			break;
+		error = get_user(len, &iov->iov_len);
+		if (unlikely(error))
+			break;
+
+		/*
+		 * Sanity check this iovec. 0 read succeeds.
+		 */
+		if (unlikely(!len))
+			break;
+		error = -EFAULT;
+		if (unlikely(!base))
+			break;
+
+		/*
+		 * Get this base offset and number of pages, then map
+		 * in the user pages.
+		 */
+		off = (unsigned long) base & ~PAGE_MASK;
+		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (npages > PIPE_BUFFERS - buffers)
+			npages = PIPE_BUFFERS - buffers;
+
+		error = get_user_pages(current, current->mm,
+				       (unsigned long) base, npages, 0, 0,
+				       &pages[buffers], NULL);
+
+		if (unlikely(error <= 0))
+			break;
+
+		/*
+		 * Fill this contiguous range into the partial page map.
+		 */
+		for (i = 0; i < error; i++) {
+			const int plen = min_t(size_t, len, PAGE_SIZE) - off;
+
+			partial[buffers].offset = off;
+			partial[buffers].len = plen;
+
+			off = 0;
+			len -= plen;
+			buffers++;
+		}
+
+		/*
+		 * We didn't complete this iov, stop here since it probably
+		 * means we have to move some of this into a pipe to
+		 * be able to continue.
+		 */
+		if (len)
+			break;
+
+		/*
+		 * Don't continue if we mapped fewer pages than we asked for,
+		 * or if we mapped the max number of pages that we have
+		 * room for.
+		 */
+		if (error < npages || buffers == PIPE_BUFFERS)
+			break;
+
+		nr_vecs--;
+		iov++;
+	}
+
+	up_read(&current->mm->mmap_sem);
+
+	if (buffers)
+		return buffers;
+
+	return error;
+}
+
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ *
+ * Note that vmsplice only supports splicing _from_ user memory to a pipe,
+ * not the other way around. Splicing from user memory is a simple operation
+ * that can be supported without any funky alignment restrictions or nasty
+ * vm tricks. We simply map in the user memory and fill them into a pipe.
+ * The reverse isn't quite as easy, though. There are two possible solutions
+ * for that:
+ *
+ *	- memcpy() the data internally, at which point we might as well just
+ *	  do a regular read() on the buffer anyway.
+ *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *	  has restriction limitations on both ends of the pipe).
+ *
+ * Alas, it isn't here.
+ *
+ */
+static long do_vmsplice(struct file *file, const struct iovec __user *iov,
+			unsigned long nr_segs, unsigned int flags)
+{
+	struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &user_page_pipe_buf_ops,
+	};
+
+	if (unlikely(!pipe))
+		return -EBADF;
+	if (unlikely(nr_segs > UIO_MAXIOV))
+		return -EINVAL;
+	else if (unlikely(!nr_segs))
+		return 0;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
+	if (spd.nr_pages <= 0)
+		return spd.nr_pages;
+
+	return splice_to_pipe(pipe, &spd);
+}
+
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+			     unsigned long nr_segs, unsigned int flags)
+{
+	struct file *file;
+	long error;
+	int fput;
+
+	error = -EBADF;
+	file = fget_light(fd, &fput);
+	if (file) {
+		if (file->f_mode & FMODE_WRITE)
+			error = do_vmsplice(file, iov, nr_segs, flags);
+
+		fput_light(file, fput);
+	}
+
+	return error;
+}
+
 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags)
author	David Teigland <teigland@redhat.com>	2006-04-27 11:49:55 -0400
committer	Steven Whitehouse <swhiteho@redhat.com>	2006-04-27 11:49:55 -0400
commit	d26046bb0aff707aac38a9bf3dd56fa39b28a399 (patch)
tree	c59e0514ca45cb56c3cbf7920a4590dad6975bcd /fs
parent	e7f5c01caddbad150dcf003f76cd5aac413f4c50 (diff)
parent	2be4d50295e2b6f62c07b614e1b103e280dddb84 (diff)