From 29732938a6289a15e907da234d6692a2ead71855 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 10 Nov 2015 16:53:30 -0500 Subject: vfs: add copy_file_range syscall and vfs helper Add a copy_file_range() system call for offloading copies between regular files. This gives an interface to underlying layers of the storage stack which can copy without reading and writing all the data. There are a few candidates that should support copy offloading in the nearer term: - btrfs shares extent references with its clone ioctl - NFS has patches to add a COPY command which copies on the server - SCSI has a family of XCOPY commands which copy in the device This system call avoids the complexity of also accelerating the creation of the destination file by operating on an existing destination file descriptor, not a path. Currently the high level vfs entry point limits copy offloading to files on the same mount and super (and not in the same file). This can be relaxed if we get implementations which can copy between file systems safely. 
Signed-off-by: Zach Brown [Anna Schumaker: Change -EINVAL to -EBADF during file verification, Change flags parameter from int to unsigned int, Add function to include/linux/syscalls.h, Check copy len after file open mode, Don't forbid ranges inside the same file, Use rw_verify_area() to verify ranges, Use file_out rather than file_in, Add COPY_FR_REFLINK flag] Signed-off-by: Anna Schumaker Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 3 +++ include/linux/syscalls.h | 3 +++ include/uapi/asm-generic/unistd.h | 4 +++- 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3aa514254161..e8a736242b1a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1629,6 +1629,7 @@ struct file_operations { #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); }; struct inode_operations { @@ -1680,6 +1681,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *, unsigned long, loff_t *); extern ssize_t vfs_writev(struct file *, const struct iovec __user *, unsigned long, loff_t *); +extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, + loff_t, size_t, unsigned int); struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c2b66a277e98..185815c96433 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -886,6 +886,9 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename, const char __user *const __user *envp, int flags); asmlinkage long sys_membarrier(int cmd, int flags); +asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + size_t len, unsigned int flags); asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); diff --git 
a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 1324b0292ec2..2622b33fb2ec 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd) __SYSCALL(__NR_membarrier, sys_membarrier) #define __NR_mlock2 284 __SYSCALL(__NR_mlock2, sys_mlock2) +#define __NR_copy_file_range 285 +__SYSCALL(__NR_copy_file_range, sys_copy_file_range) #undef __NR_syscalls -#define __NR_syscalls 285 +#define __NR_syscalls 286 /* * All syscalls below here should go away really, -- cgit v1.2.3 From acc15575e78e534c12549d8057a692f490a50f61 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2015 12:59:49 +0100 Subject: locks: new locks_mandatory_area calling convention Pass a loff_t end for the last byte instead of the 32-bit count parameter to allow full file clones even on 32-bit architectures. While we're at it also simplify the read/write selection. Signed-off-by: Christoph Hellwig Acked-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/locks.c | 22 +++++++++------------- fs/read_write.c | 5 ++--- include/linux/fs.h | 30 ++++++++++++++---------------- 3 files changed, 25 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/fs/locks.c b/fs/locks.c index 0d2b3267e2a3..c77a299c1e9e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1227,20 +1227,16 @@ int locks_mandatory_locked(struct file *file) /** * locks_mandatory_area - Check for a conflicting lock - * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ - * for shared - * @inode: the file to check + * @inode: the file to check * @filp: how the file was opened (if it was) - * @offset: start of area to check - * @count: length of area to check + * @start: first byte in the file to check + * @end: lastbyte in the file to check + * @type: %F_WRLCK for a write lock, else %F_RDLCK * * Searches the inode's list of locks to find any POSIX locks which conflict. 
- * This function is called from rw_verify_area() and - * locks_verify_truncate(). */ -int locks_mandatory_area(int read_write, struct inode *inode, - struct file *filp, loff_t offset, - size_t count) +int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, + loff_t end, unsigned char type) { struct file_lock fl; int error; @@ -1252,9 +1248,9 @@ int locks_mandatory_area(int read_write, struct inode *inode, fl.fl_flags = FL_POSIX | FL_ACCESS; if (filp && !(filp->f_flags & O_NONBLOCK)) sleep = true; - fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; - fl.fl_start = offset; - fl.fl_end = offset + count - 1; + fl.fl_type = type; + fl.fl_start = start; + fl.fl_end = end; for (;;) { if (filp) { diff --git a/fs/read_write.c b/fs/read_write.c index c81ef394a3d4..6cfad4761fd8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -396,9 +396,8 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t } if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - retval = locks_mandatory_area( - read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, - inode, file, pos, count); + retval = locks_mandatory_area(inode, file, pos, pos + count - 1, + read_write == READ ? 
F_RDLCK : F_WRLCK); if (retval < 0) return retval; } diff --git a/include/linux/fs.h b/include/linux/fs.h index e8a736242b1a..4377b2df991d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2030,12 +2030,9 @@ extern struct kobject *fs_kobj; #define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK) -#define FLOCK_VERIFY_READ 1 -#define FLOCK_VERIFY_WRITE 2 - #ifdef CONFIG_FILE_LOCKING extern int locks_mandatory_locked(struct file *); -extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); +extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char); /* * Candidates for mandatory locking have the setgid bit set @@ -2065,17 +2062,19 @@ static inline int locks_verify_locked(struct file *file) } static inline int locks_verify_truncate(struct inode *inode, - struct file *filp, + struct file *f, loff_t size) { - if (inode->i_flctx && mandatory_lock(inode)) - return locks_mandatory_area( - FLOCK_VERIFY_WRITE, inode, filp, - size < inode->i_size ? size : inode->i_size, - (size < inode->i_size ? 
inode->i_size - size - : size - inode->i_size) - ); - return 0; + if (!inode->i_flctx || !mandatory_lock(inode)) + return 0; + + if (size < inode->i_size) { + return locks_mandatory_area(inode, f, size, inode->i_size - 1, + F_WRLCK); + } else { + return locks_mandatory_area(inode, f, inode->i_size, size - 1, + F_WRLCK); + } } static inline int break_lease(struct inode *inode, unsigned int mode) @@ -2144,9 +2143,8 @@ static inline int locks_mandatory_locked(struct file *file) return 0; } -static inline int locks_mandatory_area(int rw, struct inode *inode, - struct file *filp, loff_t offset, - size_t count) +static inline int locks_mandatory_area(struct inode *inode, struct file *filp, + loff_t start, loff_t end, unsigned char type) { return 0; } -- cgit v1.2.3 From 04b38d601239b4d9be641b412cf4b7456a041c67 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2015 12:59:50 +0100 Subject: vfs: pull btrfs clone API to vfs layer The btrfs clone ioctls are now adopted by other file systems, with NFS and CIFS already having support for them, and XFS being under active development. To avoid growth of various slightly incompatible implementations, add one to the VFS. Note that clones are different from file copies in several ways: - they are atomic vs other writers - they support whole file clones - they support 64-bit length clones - they do not allow partial success (aka short writes) - clones are expected to be a fast metadata operation Because of that it would be rather cumbersome to try to piggyback them on top of the recent copy_file_range infrastructure. The converse isn't true and the copy_file_range system call could try clone file range as a first attempt to copy, something that further patches will enable. Based on earlier work from Peng Tao. 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/btrfs/ctree.h | 3 +- fs/btrfs/file.c | 1 + fs/btrfs/ioctl.c | 49 ++----------------- fs/cifs/cifsfs.c | 63 ++++++++++++++++++++++++ fs/cifs/cifsfs.h | 1 - fs/cifs/ioctl.c | 126 +++++++++++++++++++++++------------------------- fs/ioctl.c | 29 +++++++++++ fs/nfs/nfs4file.c | 87 ++++----------------------------- fs/read_write.c | 72 +++++++++++++++++++++++++++ include/linux/fs.h | 7 ++- include/uapi/linux/fs.h | 9 ++++ 11 files changed, 254 insertions(+), 193 deletions(-) (limited to 'include') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ede7277c167f..dd4733fa882c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4025,7 +4025,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list, void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, struct btrfs_ioctl_balance_args *bargs); - /* file.c */ int btrfs_auto_defrag_init(void); void btrfs_auto_defrag_exit(void); @@ -4058,6 +4057,8 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags); +int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e67fe6ab8c9e..232e300a6c93 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2925,6 +2925,7 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_ioctl, #endif .copy_file_range = btrfs_copy_file_range, + .clone_file_range = btrfs_clone_file_range, }; void btrfs_auto_defrag_exit(void) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0f92735299d3..85b1caeeec85 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3906,49 +3906,10 @@ ssize_t btrfs_copy_file_range(struct file *file_in, loff_t 
pos_in, return ret; } -static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +int btrfs_clone_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, u64 len) { - struct fd src_file; - int ret; - - /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) - return -EINVAL; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - src_file = fdget(srcfd); - if (!src_file.file) { - ret = -EBADF; - goto out_drop_write; - } - - /* the src must be open for reading */ - if (!(src_file.file->f_mode & FMODE_READ)) { - ret = -EINVAL; - goto out_fput; - } - - ret = btrfs_clone_files(file, src_file.file, off, olen, destoff); - -out_fput: - fdput(src_file); -out_drop_write: - mnt_drop_write_file(file); - return ret; -} - -static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) -{ - struct btrfs_ioctl_clone_range_args args; - - if (copy_from_user(&args, argp, sizeof(args))) - return -EFAULT; - return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, - args.src_length, args.dest_offset); + return btrfs_clone_files(dst_file, src_file, off, len, destoff); } /* @@ -5498,10 +5459,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: return btrfs_ioctl_balance(file, NULL); - case BTRFS_IOC_CLONE: - return btrfs_ioctl_clone(file, arg, 0, 0, 0); - case BTRFS_IOC_CLONE_RANGE: - return btrfs_ioctl_clone_range(file, argp); case BTRFS_IOC_TRANS_START: return btrfs_ioctl_trans_start(file); case BTRFS_IOC_TRANS_END: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cbc0f4bca0c0..e9b978f2e114 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -914,6 +914,61 @@ const struct inode_operations cifs_symlink_inode_ops = { #endif }; +static int cifs_clone_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, u64 len) 
+{ + struct inode *src_inode = file_inode(src_file); + struct inode *target_inode = file_inode(dst_file); + struct cifsFileInfo *smb_file_src = src_file->private_data; + struct cifsFileInfo *smb_file_target = dst_file->private_data; + struct cifs_tcon *src_tcon = tlink_tcon(smb_file_src->tlink); + struct cifs_tcon *target_tcon = tlink_tcon(smb_file_target->tlink); + unsigned int xid; + int rc; + + cifs_dbg(FYI, "clone range\n"); + + xid = get_xid(); + + if (!src_file->private_data || !dst_file->private_data) { + rc = -EBADF; + cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n"); + goto out; + } + + /* + * Note: cifs case is easier than btrfs since server responsible for + * checks for proper open modes and file type and if it wants + * server could even support copy of range where source = target + */ + lock_two_nondirectories(target_inode, src_inode); + + if (len == 0) + len = src_inode->i_size - off; + + cifs_dbg(FYI, "about to flush pages\n"); + /* should we flush first and last page first */ + truncate_inode_pages_range(&target_inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len)-1); + + if (target_tcon->ses->server->ops->duplicate_extents) + rc = target_tcon->ses->server->ops->duplicate_extents(xid, + smb_file_src, smb_file_target, off, len, destoff); + else + rc = -EOPNOTSUPP; + + /* force revalidate of size and timestamps of target file now + that target is updated on the server */ + CIFS_I(target_inode)->time = 0; +out_unlock: + /* although unlocking in the reverse order from locking is not + strictly necessary here it is a little cleaner to be consistent */ + unlock_two_nondirectories(src_inode, target_inode); +out: + free_xid(xid); + return rc; +} + const struct file_operations cifs_file_ops = { .read_iter = cifs_loose_read_iter, .write_iter = cifs_file_write_iter, @@ -926,6 +981,7 @@ const struct file_operations cifs_file_ops = { .splice_read = generic_file_splice_read, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, + 
.clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -942,6 +998,8 @@ const struct file_operations cifs_file_strict_ops = { .splice_read = generic_file_splice_read, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -958,6 +1016,7 @@ const struct file_operations cifs_file_direct_ops = { .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -974,6 +1033,7 @@ const struct file_operations cifs_file_nobrl_ops = { .splice_read = generic_file_splice_read, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -989,6 +1049,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .splice_read = generic_file_splice_read, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1004,6 +1065,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -1014,6 +1076,7 @@ const struct file_operations cifs_dir_ops = { .release = cifs_closedir, .read = generic_read_dir, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .llseek = generic_file_llseek, }; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c3cc1609025f..c399513c3cbd 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -131,7 +131,6 @@ extern int cifs_setxattr(struct dentry *, const 
char *, const void *, extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); - #ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 35cf990f87d3..7a3b84e300f8 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -34,73 +34,36 @@ #include "cifs_ioctl.h" #include -static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, - unsigned long srcfd, u64 off, u64 len, u64 destoff, - bool dup_extents) +static int cifs_file_clone_range(unsigned int xid, struct file *src_file, + struct file *dst_file) { - int rc; - struct cifsFileInfo *smb_file_target = dst_file->private_data; + struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); - struct cifs_tcon *target_tcon; - struct fd src_file; struct cifsFileInfo *smb_file_src; - struct inode *src_inode; + struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; + struct cifs_tcon *target_tcon; + int rc; cifs_dbg(FYI, "ioctl clone range\n"); - /* the destination must be opened for writing */ - if (!(dst_file->f_mode & FMODE_WRITE)) { - cifs_dbg(FYI, "file target not open for write\n"); - return -EINVAL; - } - /* check if target volume is readonly and take reference */ - rc = mnt_want_write_file(dst_file); - if (rc) { - cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc); - return rc; - } - - src_file = fdget(srcfd); - if (!src_file.file) { - rc = -EBADF; - goto out_drop_write; - } - - if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) { - rc = -EBADF; - cifs_dbg(VFS, "src file seems to be from a different filesystem type\n"); - goto out_fput; - } - - if ((!src_file.file->private_data) || (!dst_file->private_data)) { + if (!src_file->private_data || !dst_file->private_data) { 
rc = -EBADF; cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n"); - goto out_fput; + goto out; } rc = -EXDEV; smb_file_target = dst_file->private_data; - smb_file_src = src_file.file->private_data; + smb_file_src = src_file->private_data; src_tcon = tlink_tcon(smb_file_src->tlink); target_tcon = tlink_tcon(smb_file_target->tlink); - /* check source and target on same server (or volume if dup_extents) */ - if (dup_extents && (src_tcon != target_tcon)) { - cifs_dbg(VFS, "source and target of copy not on same share\n"); - goto out_fput; - } - - if (!dup_extents && (src_tcon->ses != target_tcon->ses)) { + if (src_tcon->ses != target_tcon->ses) { cifs_dbg(VFS, "source and target of copy not on same server\n"); - goto out_fput; + goto out; } - src_inode = file_inode(src_file.file); - rc = -EINVAL; - if (S_ISDIR(src_inode->i_mode)) - goto out_fput; - /* * Note: cifs case is easier than btrfs since server responsible for * checks for proper open modes and file type and if it wants @@ -108,34 +71,66 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, */ lock_two_nondirectories(target_inode, src_inode); - /* determine range to clone */ - rc = -EINVAL; - if (off + len > src_inode->i_size || off + len < off) - goto out_unlock; - if (len == 0) - len = src_inode->i_size - off; - cifs_dbg(FYI, "about to flush pages\n"); /* should we flush first and last page first */ - truncate_inode_pages_range(&target_inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len)-1); + truncate_inode_pages(&target_inode->i_data, 0); - if (dup_extents && target_tcon->ses->server->ops->duplicate_extents) - rc = target_tcon->ses->server->ops->duplicate_extents(xid, - smb_file_src, smb_file_target, off, len, destoff); - else if (!dup_extents && target_tcon->ses->server->ops->clone_range) + if (target_tcon->ses->server->ops->clone_range) rc = target_tcon->ses->server->ops->clone_range(xid, - smb_file_src, smb_file_target, off, len, destoff); + smb_file_src, 
smb_file_target, 0, src_inode->i_size, 0); else rc = -EOPNOTSUPP; /* force revalidate of size and timestamps of target file now that target is updated on the server */ CIFS_I(target_inode)->time = 0; -out_unlock: /* although unlocking in the reverse order from locking is not strictly necessary here it is a little cleaner to be consistent */ unlock_two_nondirectories(src_inode, target_inode); +out: + return rc; +} + +static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, + unsigned long srcfd) +{ + int rc; + struct fd src_file; + struct inode *src_inode; + + cifs_dbg(FYI, "ioctl clone range\n"); + /* the destination must be opened for writing */ + if (!(dst_file->f_mode & FMODE_WRITE)) { + cifs_dbg(FYI, "file target not open for write\n"); + return -EINVAL; + } + + /* check if target volume is readonly and take reference */ + rc = mnt_want_write_file(dst_file); + if (rc) { + cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc); + return rc; + } + + src_file = fdget(srcfd); + if (!src_file.file) { + rc = -EBADF; + goto out_drop_write; + } + + if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) { + rc = -EBADF; + cifs_dbg(VFS, "src file seems to be from a different filesystem type\n"); + goto out_fput; + } + + src_inode = file_inode(src_file.file); + rc = -EINVAL; + if (S_ISDIR(src_inode->i_mode)) + goto out_fput; + + rc = cifs_file_clone_range(xid, src_file.file, dst_file); + out_fput: fdput(src_file); out_drop_write: @@ -256,10 +251,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) } break; case CIFS_IOC_COPYCHUNK_FILE: - rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false); - break; - case BTRFS_IOC_CLONE: - rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true); + rc = cifs_ioctl_clone(xid, filep, arg); break; case CIFS_IOC_SET_INTEGRITY: if (pSMBFile == NULL) diff --git a/fs/ioctl.c b/fs/ioctl.c index 5d01d2638ca5..84c6e79829ab 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -215,6 +215,29 @@ static int 
ioctl_fiemap(struct file *filp, unsigned long arg) return error; } +static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct fd src_file = fdget(srcfd); + int ret; + + if (!src_file.file) + return -EBADF; + ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); + fdput(src_file); + return ret; +} + +static long ioctl_file_clone_range(struct file *file, void __user *argp) +{ + struct file_clone_range args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return ioctl_file_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + #ifdef CONFIG_BLOCK static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -600,6 +623,12 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, case FIGETBSZ: return put_user(inode->i_sb->s_blocksize, argp); + case FICLONE: + return ioctl_file_clone(filp, arg, 0, 0, 0); + + case FICLONERANGE: + return ioctl_file_clone_range(filp, argp); + default: if (S_ISREG(inode->i_mode)) error = file_ioctl(filp, cmd, arg); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index db9b5fea5b3e..26f9a23e2b25 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -195,65 +195,27 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t return nfs42_proc_allocate(filep, offset, len); } -static noinline long -nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd, - u64 src_off, u64 dst_off, u64 count) +static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, u64 count) { struct inode *dst_inode = file_inode(dst_file); struct nfs_server *server = NFS_SERVER(dst_inode); - struct fd src_file; - struct inode *src_inode; + struct inode *src_inode = file_inode(src_file); unsigned int bs = server->clone_blksize; bool same_inode = false; int ret; - /* dst file must be opened for writing */ - if 
(!(dst_file->f_mode & FMODE_WRITE)) - return -EINVAL; - - ret = mnt_want_write_file(dst_file); - if (ret) - return ret; - - src_file = fdget(srcfd); - if (!src_file.file) { - ret = -EBADF; - goto out_drop_write; - } - - src_inode = file_inode(src_file.file); - - if (src_inode == dst_inode) - same_inode = true; - - /* src file must be opened for reading */ - if (!(src_file.file->f_mode & FMODE_READ)) - goto out_fput; - - /* src and dst must be regular files */ - ret = -EISDIR; - if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode)) - goto out_fput; - - ret = -EXDEV; - if (src_file.file->f_path.mnt != dst_file->f_path.mnt || - src_inode->i_sb != dst_inode->i_sb) - goto out_fput; - /* check alignment w.r.t. clone_blksize */ ret = -EINVAL; if (bs) { if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs)) - goto out_fput; + goto out; if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count)) - goto out_fput; + goto out; } - /* verify if ranges are overlapped within the same file */ - if (same_inode) { - if (dst_off + count > src_off && dst_off < src_off + count) - goto out_fput; - } + if (src_inode == dst_inode) + same_inode = true; /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? 
*/ if (same_inode) { @@ -275,7 +237,7 @@ nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd, if (ret) goto out_unlock; - ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count); + ret = nfs42_proc_clone(src_file, dst_file, src_off, dst_off, count); /* truncate inode page cache of the dst range so that future reads can fetch * new data from server */ @@ -292,37 +254,9 @@ out_unlock: mutex_unlock(&dst_inode->i_mutex); mutex_unlock(&src_inode->i_mutex); } -out_fput: - fdput(src_file); -out_drop_write: - mnt_drop_write_file(dst_file); +out: return ret; } - -static long nfs42_ioctl_clone_range(struct file *dst_file, void __user *argp) -{ - struct btrfs_ioctl_clone_range_args args; - - if (copy_from_user(&args, argp, sizeof(args))) - return -EFAULT; - - return nfs42_ioctl_clone(dst_file, args.src_fd, args.src_offset, - args.dest_offset, args.src_length); -} - -long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - - switch (cmd) { - case BTRFS_IOC_CLONE: - return nfs42_ioctl_clone(file, arg, 0, 0, 0); - case BTRFS_IOC_CLONE_RANGE: - return nfs42_ioctl_clone_range(file, argp); - } - - return -ENOTTY; -} #endif /* CONFIG_NFS_V4_2 */ const struct file_operations nfs4_file_operations = { @@ -342,8 +276,7 @@ const struct file_operations nfs4_file_operations = { #ifdef CONFIG_NFS_V4_2 .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, - .unlocked_ioctl = nfs4_ioctl, - .compat_ioctl = nfs4_ioctl, + .clone_file_range = nfs42_clone_file_range, #else .llseek = nfs_file_llseek, #endif diff --git a/fs/read_write.c b/fs/read_write.c index 6cfad4761fd8..c75d02cb13ec 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1451,3 +1451,75 @@ out1: out2: return ret; } + +static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) +{ + struct inode *inode = file_inode(file); + + if (unlikely(pos < 0)) + return -EINVAL; + + if (unlikely((loff_t) (pos + len) < 0)) + return 
-EINVAL; + + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { + loff_t end = len ? pos + len - 1 : OFFSET_MAX; + int retval; + + retval = locks_mandatory_area(inode, file, pos, end, + write ? F_WRLCK : F_RDLCK); + if (retval < 0) + return retval; + } + + return security_file_permission(file, write ? MAY_WRITE : MAY_READ); +} + +int vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + int ret; + + if (inode_in->i_sb != inode_out->i_sb || + file_in->f_path.mnt != file_out->f_path.mnt) + return -EXDEV; + + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EOPNOTSUPP; + + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND) || + !file_in->f_op->clone_file_range) + return -EBADF; + + ret = clone_verify_area(file_in, pos_in, len, false); + if (ret) + return ret; + + ret = clone_verify_area(file_out, pos_out, len, true); + if (ret) + return ret; + + if (pos_in + len > i_size_read(inode_in)) + return -EINVAL; + + ret = mnt_want_write_file(file_out); + if (ret) + return ret; + + ret = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, len); + if (!ret) { + fsnotify_access(file_in); + fsnotify_modify(file_out); + } + + mnt_drop_write_file(file_out); + return ret; +} +EXPORT_SYMBOL(vfs_clone_file_range); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4377b2df991d..5d987aefcf1e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1629,7 +1629,10 @@ struct file_operations { #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif - ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, + loff_t, 
size_t, unsigned int); + int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, + u64); }; struct inode_operations { @@ -1683,6 +1686,8 @@ extern ssize_t vfs_writev(struct file *, const struct iovec __user *, unsigned long, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); +extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len); struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f15d980249b5..cd5db7fb3cb7 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -39,6 +39,13 @@ #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ #define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ +struct file_clone_range { + __s64 src_fd; + __u64 src_offset; + __u64 src_length; + __u64 dest_offset; +}; + struct fstrim_range { __u64 start; __u64 len; @@ -159,6 +166,8 @@ struct inodes_stat_t { #define FIFREEZE _IOWR('X', 119, int) /* Freeze */ #define FITHAW _IOWR('X', 120, int) /* Thaw */ #define FITRIM _IOWR('X', 121, struct fstrim_range) /* Trim */ +#define FICLONE _IOW(0x94, 9, int) +#define FICLONERANGE _IOW(0x94, 13, struct file_clone_range) #define FS_IOC_GETFLAGS _IOR('f', 1, long) #define FS_IOC_SETFLAGS _IOW('f', 2, long) -- cgit v1.2.3 From ffa0160a103917defd5d9c097ae0455a59166e03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2015 12:59:52 +0100 Subject: nfsd: implement the NFSv4.2 CLONE operation This is basically a remote version of the btrfs CLONE operation, so the implementation is fairly trivial. Made even more trivial by stealing the XDR code and general framework Anna Schumaker's COPY prototype. Signed-off-by: Christoph Hellwig Acked-by: J. 
Bruce Fields Signed-off-by: Al Viro --- fs/nfsd/nfs4proc.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfs4xdr.c | 21 +++++++++++++++++++++ fs/nfsd/vfs.c | 8 ++++++++ fs/nfsd/vfs.h | 2 ++ fs/nfsd/xdr4.h | 10 ++++++++++ include/linux/nfs4.h | 4 ++-- 6 files changed, 90 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 3ba10a3534f1..819ad812c71b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1011,6 +1011,47 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } +static __be32 +nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_clone *clone) +{ + struct file *src, *dst; + __be32 status; + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, + &clone->cl_src_stateid, RD_STATE, + &src, NULL); + if (status) { + dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); + goto out; + } + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + &clone->cl_dst_stateid, WR_STATE, + &dst, NULL); + if (status) { + dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); + goto out_put_src; + } + + /* fix up for NFS-specific error code */ + if (!S_ISREG(file_inode(src)->i_mode) || + !S_ISREG(file_inode(dst)->i_mode)) { + status = nfserr_wrong_type; + goto out_put_dst; + } + + status = nfsd4_clone_file_range(src, clone->cl_src_pos, + dst, clone->cl_dst_pos, clone->cl_count); + +out_put_dst: + fput(dst); +out_put_src: + fput(src); +out: + return status; +} + static __be32 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) @@ -2281,6 +2322,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_DEALLOCATE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, + [OP_CLONE] = { + .op_func = (nfsd4op_func)nfsd4_clone, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = 
"OP_CLONE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 51c9e9ca39a4..924416f91fdd 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1674,6 +1674,25 @@ nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, DECODE_TAIL; } +static __be32 +nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid); + if (status) + return status; + + READ_BUF(8 + 8 + 8); + p = xdr_decode_hyper(p, &clone->cl_src_pos); + p = xdr_decode_hyper(p, &clone->cl_dst_pos); + p = xdr_decode_hyper(p, &clone->cl_count); + DECODE_TAIL; +} + static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { @@ -1785,6 +1804,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, }; static inline bool @@ -4292,6 +4312,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop, [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, }; /* diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 994d66fbb446..5411bf09b810 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -36,6 +36,7 @@ #endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 +#include "../internal.h" #include "acl.h" #include "idmap.h" #endif /* CONFIG_NFSD_V4 */ @@ -498,6 +499,13 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif +__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, + u64 dst_pos, u64 count) +{ + return 
nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, + count)); +} + __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, loff_t len, int flags) diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index fcfc48cbe136..c11ba316f23f 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -56,6 +56,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, loff_t, int); +__be32 nfsd4_clone_file_range(struct file *, u64, struct file *, + u64, u64); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index ce7362c88b48..d9554813e58a 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -491,6 +491,15 @@ struct nfsd4_fallocate { u64 falloc_length; }; +struct nfsd4_clone { + /* request */ + stateid_t cl_src_stateid; + stateid_t cl_dst_stateid; + u64 cl_src_pos; + u64 cl_dst_pos; + u64 cl_count; +}; + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -555,6 +564,7 @@ struct nfsd4_op { /* NFSv4.2 */ struct nfsd4_fallocate allocate; struct nfsd4_fallocate deallocate; + struct nfsd4_clone clone; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index e7e78537aea2..43aeabd4b968 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -139,10 +139,10 @@ enum nfs_opnum4 { Needs to be updated if more operations are defined in future.*/ #define FIRST_NFS4_OP OP_ACCESS -#define LAST_NFS4_OP OP_WRITE_SAME #define LAST_NFS40_OP OP_RELEASE_LOCKOWNER #define LAST_NFS41_OP OP_RECLAIM_COMPLETE -#define LAST_NFS42_OP OP_WRITE_SAME +#define LAST_NFS42_OP OP_CLONE +#define LAST_NFS4_OP LAST_NFS42_OP enum nfsstat4 { NFS4_OK = 0, -- cgit v1.2.3 From 54dbc15172375641ef03399e8f911d7165eb90fb Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Sat, 19 Dec 2015 00:55:59 -0800 Subject: vfs: hoist the btrfs deduplication ioctl to the vfs Hoist the btrfs EXTENT_SAME ioctl up to the VFS and make the name more systematic (FIDEDUPERANGE). Signed-off-by: Darrick J. Wong Signed-off-by: Al Viro --- fs/compat_ioctl.c | 1 + fs/ioctl.c | 38 ++++++++++++++++++ fs/read_write.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 4 ++ include/uapi/linux/fs.h | 30 +++++++++++++++ 5 files changed, 173 insertions(+) (limited to 'include') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 70d4b104c08d..eab31e74b9cc 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1582,6 +1582,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, case FICLONE: case FICLONERANGE: + case FIDEDUPERANGE: goto do_ioctl; case FIBMAP: diff --git a/fs/ioctl.c b/fs/ioctl.c index 84c6e79829ab..fcdd33b7ec78 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -568,6 +568,41 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } +static long ioctl_file_dedupe_range(struct file *file, void __user *arg) +{ + struct file_dedupe_range __user *argp = arg; + struct file_dedupe_range *same = NULL; + int ret; + unsigned long size; + u16 count; + + if (get_user(count, &argp->dest_count)) { + ret = -EFAULT; + goto out; + } + + size = offsetof(struct file_dedupe_range __user, info[count]); + + same = memdup_user(argp, size); + if (IS_ERR(same)) { + ret = PTR_ERR(same); + same = NULL; + goto out; + } + + ret = vfs_dedupe_file_range(file, same); + if (ret) + goto out; + + ret = copy_to_user(argp, same, size); + if (ret) + ret = -EFAULT; + +out: + kfree(same); + return ret; +} + /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. 
@@ -629,6 +664,9 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, case FICLONERANGE: return ioctl_file_clone_range(filp, argp); + case FIDEDUPERANGE: + return ioctl_file_dedupe_range(filp, argp); + default: if (S_ISREG(inode->i_mode)) error = file_ioctl(filp, cmd, arg); diff --git a/fs/read_write.c b/fs/read_write.c index 60ee26941231..2116e74a83d3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1523,3 +1523,103 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, return ret; } EXPORT_SYMBOL(vfs_clone_file_range); + +int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) +{ + struct file_dedupe_range_info *info; + struct inode *src = file_inode(file); + u64 off; + u64 len; + int i; + int ret; + bool is_admin = capable(CAP_SYS_ADMIN); + u16 count = same->dest_count; + struct file *dst_file; + loff_t dst_off; + ssize_t deduped; + + if (!(file->f_mode & FMODE_READ)) + return -EINVAL; + + if (same->reserved1 || same->reserved2) + return -EINVAL; + + off = same->src_offset; + len = same->src_length; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode)) + goto out; + + ret = -EINVAL; + if (!S_ISREG(src->i_mode)) + goto out; + + ret = clone_verify_area(file, off, len, false); + if (ret < 0) + goto out; + ret = 0; + + /* pre-format output fields to sane values */ + for (i = 0; i < count; i++) { + same->info[i].bytes_deduped = 0ULL; + same->info[i].status = FILE_DEDUPE_RANGE_SAME; + } + + for (i = 0, info = same->info; i < count; i++, info++) { + struct inode *dst; + struct fd dst_fd = fdget(info->dest_fd); + + dst_file = dst_fd.file; + if (!dst_file) { + info->status = -EBADF; + goto next_loop; + } + dst = file_inode(dst_file); + + ret = mnt_want_write_file(dst_file); + if (ret) { + info->status = ret; + goto next_loop; + } + + dst_off = info->dest_offset; + ret = clone_verify_area(dst_file, dst_off, len, true); + if (ret < 0) { + info->status = ret; + goto next_file; + } + ret = 0; + + if (info->reserved) { + 
info->status = -EINVAL; + } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { + info->status = -EINVAL; + } else if (file->f_path.mnt != dst_file->f_path.mnt) { + info->status = -EXDEV; + } else if (S_ISDIR(dst->i_mode)) { + info->status = -EISDIR; + } else if (dst_file->f_op->dedupe_file_range == NULL) { + info->status = -EINVAL; + } else { + deduped = dst_file->f_op->dedupe_file_range(file, off, + len, dst_file, + info->dest_offset); + if (deduped == -EBADE) + info->status = FILE_DEDUPE_RANGE_DIFFERS; + else if (deduped < 0) + info->status = deduped; + else + info->bytes_deduped += deduped; + } + +next_file: + mnt_drop_write_file(dst_file); +next_loop: + fdput(dst_fd); + } + +out: + return ret; +} +EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5d987aefcf1e..d71814b81a3c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1633,6 +1633,8 @@ struct file_operations { loff_t, size_t, unsigned int); int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, + u64); }; struct inode_operations { @@ -1688,6 +1690,8 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); +extern int vfs_dedupe_file_range(struct file *file, + struct file_dedupe_range *same); struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index cd5db7fb3cb7..b38e647664a0 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -52,6 +52,35 @@ struct fstrim_range { __u64 minlen; }; +/* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ +#define FILE_DEDUPE_RANGE_SAME 0 +#define FILE_DEDUPE_RANGE_DIFFERS 1 + +/* from struct 
btrfs_ioctl_file_extent_same_info */ +struct file_dedupe_range_info { + __s64 dest_fd; /* in - destination file */ + __u64 dest_offset; /* in - start of extent in destination */ + __u64 bytes_deduped; /* out - total # of bytes we were able + * to dedupe from this file. */ + /* status of this dedupe operation: + * < 0 for error + * == FILE_DEDUPE_RANGE_SAME if dedupe succeeds + * == FILE_DEDUPE_RANGE_DIFFERS if data differs + */ + __s32 status; /* out - see above description */ + __u32 reserved; /* must be zero */ +}; + +/* from struct btrfs_ioctl_file_extent_same_args */ +struct file_dedupe_range { + __u64 src_offset; /* in - start of extent in source */ + __u64 src_length; /* in - length of extent */ + __u16 dest_count; /* in - total elements in info array */ + __u16 reserved1; /* must be zero */ + __u32 reserved2; /* must be zero */ + struct file_dedupe_range_info info[0]; +}; + /* And dynamically-tunable limits and defaults: */ struct files_stat_struct { unsigned long nr_files; /* read only */ @@ -168,6 +197,7 @@ struct inodes_stat_t { #define FITRIM _IOWR('X', 121, struct fstrim_range) /* Trim */ #define FICLONE _IOW(0x94, 9, int) #define FICLONERANGE _IOW(0x94, 13, struct file_clone_range) +#define FIDEDUPERANGE _IOWR(0x94, 54, struct file_dedupe_range) #define FS_IOC_GETFLAGS _IOR('f', 1, long) #define FS_IOC_SETFLAGS _IOW('f', 2, long) -- cgit v1.2.3