summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c40
-rw-r--r--fs/9p/v9fs.h3
-rw-r--r--fs/9p/vfs_inode.c10
-rw-r--r--fs/9p/vfs_inode_dotl.c11
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/adfs/inode.c4
-rw-r--r--fs/affs/affs.h3
-rw-r--r--fs/affs/amigaffs.c6
-rw-r--r--fs/affs/inode.c4
-rw-r--r--fs/affs/namei.c6
-rw-r--r--fs/afs/callback.c4
-rw-r--r--fs/afs/cmservice.c168
-rw-r--r--fs/afs/dir.c9
-rw-r--r--fs/afs/flock.c4
-rw-r--r--fs/afs/fsclient.c148
-rw-r--r--fs/afs/internal.h38
-rw-r--r--fs/afs/main.c1
-rw-r--r--fs/afs/rxrpc.c522
-rw-r--r--fs/afs/server.c11
-rw-r--r--fs/afs/vlclient.c7
-rw-r--r--fs/afs/vlocation.c4
-rw-r--r--fs/afs/write.c5
-rw-r--r--fs/aio.c9
-rw-r--r--fs/attr.c52
-rw-r--r--fs/autofs4/autofs_i.h9
-rw-r--r--fs/autofs4/dev-ioctl.c77
-rw-r--r--fs/autofs4/inode.c47
-rw-r--r--fs/autofs4/root.c10
-rw-r--r--fs/autofs4/waitq.c4
-rw-r--r--fs/bad_inode.c25
-rw-r--r--fs/befs/befs.h19
-rw-r--r--fs/befs/btree.c60
-rw-r--r--fs/befs/datastream.c253
-rw-r--r--fs/befs/debug.c1
-rw-r--r--fs/befs/io.c26
-rw-r--r--fs/befs/io.h2
-rw-r--r--fs/befs/linuxvfs.c132
-rw-r--r--fs/befs/super.c36
-rw-r--r--fs/bfs/dir.c20
-rw-r--r--fs/binfmt_elf.c23
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/block_dev.c95
-rw-r--r--fs/btrfs/acl.c6
-rw-r--r--fs/btrfs/compression.c4
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/file.c8
-rw-r--r--fs/btrfs/inode.c60
-rw-r--r--fs/btrfs/ioctl.c8
-rw-r--r--fs/btrfs/send.c2
-rw-r--r--fs/btrfs/transaction.c4
-rw-r--r--fs/btrfs/tree-log.c4
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/buffer.c7
-rw-r--r--fs/cachefiles/bind.c4
-rw-r--r--fs/cachefiles/interface.c8
-rw-r--r--fs/cachefiles/internal.h3
-rw-r--r--fs/cachefiles/namei.c15
-rw-r--r--fs/ceph/acl.c11
-rw-r--r--fs/ceph/addr.c24
-rw-r--r--fs/ceph/dir.c9
-rw-r--r--fs/ceph/file.c11
-rw-r--r--fs/ceph/inode.c30
-rw-r--r--fs/ceph/locks.c4
-rw-r--r--fs/ceph/mds_client.c30
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/strings.c2
-rw-r--r--fs/ceph/super.c49
-rw-r--r--fs/ceph/xattr.c6
-rw-r--r--fs/char_dev.c1
-rw-r--r--fs/cifs/cifs_debug.c1
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_ioctl.h8
-rw-r--r--fs/cifs/cifsacl.c123
-rw-r--r--fs/cifs/cifsfs.c22
-rw-r--r--fs/cifs/cifsfs.h10
-rw-r--r--fs/cifs/cifsglob.h50
-rw-r--r--fs/cifs/cifsproto.h5
-rw-r--r--fs/cifs/cifssmb.c31
-rw-r--r--fs/cifs/connect.c43
-rw-r--r--fs/cifs/dir.c6
-rw-r--r--fs/cifs/file.c123
-rw-r--r--fs/cifs/inode.c6
-rw-r--r--fs/cifs/ioctl.c16
-rw-r--r--fs/cifs/link.c2
-rw-r--r--fs/cifs/misc.c15
-rw-r--r--fs/cifs/readdir.c6
-rw-r--r--fs/cifs/smb2inode.c6
-rw-r--r--fs/cifs/smb2misc.c16
-rw-r--r--fs/cifs/smb2ops.c62
-rw-r--r--fs/cifs/smb2pdu.c587
-rw-r--r--fs/cifs/smb2pdu.h2
-rw-r--r--fs/cifs/xattr.c62
-rw-r--r--fs/coda/dir.c8
-rw-r--r--fs/coda/file.c25
-rw-r--r--fs/coda/inode.c2
-rw-r--r--fs/compat.c16
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/configfs/inode.c6
-rw-r--r--fs/crypto/crypto.c26
-rw-r--r--fs/crypto/fname.c85
-rw-r--r--fs/crypto/keyinfo.c71
-rw-r--r--fs/crypto/policy.c4
-rw-r--r--fs/dax.c254
-rw-r--r--fs/debugfs/file.c15
-rw-r--r--fs/debugfs/inode.c4
-rw-r--r--fs/debugfs/internal.h4
-rw-r--r--fs/devpts/inode.c77
-rw-r--r--fs/direct-io.c3
-rw-r--r--fs/dlm/lowcomms.c8
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/inode.c75
-rw-r--r--fs/ecryptfs/main.c1
-rw-r--r--fs/ecryptfs/mmap.c13
-rw-r--r--fs/efivarfs/inode.c7
-rw-r--r--fs/efivarfs/super.c13
-rw-r--r--fs/exec.c9
-rw-r--r--fs/exofs/dir.c8
-rw-r--r--fs/exofs/inode.c8
-rw-r--r--fs/exofs/namei.c14
-rw-r--r--fs/exportfs/expfs.c10
-rw-r--r--fs/ext2/Kconfig1
-rw-r--r--fs/ext2/acl.c12
-rw-r--r--fs/ext2/dir.c6
-rw-r--r--fs/ext2/ext2.h1
-rw-r--r--fs/ext2/file.c80
-rw-r--r--fs/ext2/ialloc.c7
-rw-r--r--fs/ext2/inode.c121
-rw-r--r--fs/ext2/ioctl.c4
-rw-r--r--fs/ext2/namei.c18
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/symlink.c6
-rw-r--r--fs/ext2/xattr.c2
-rw-r--r--fs/ext4/acl.c12
-rw-r--r--fs/ext4/block_validity.c4
-rw-r--r--fs/ext4/dir.c8
-rw-r--r--fs/ext4/ext4.h35
-rw-r--r--fs/ext4/extents.c27
-rw-r--r--fs/ext4/file.c14
-rw-r--r--fs/ext4/fsync.c9
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c73
-rw-r--r--fs/ext4/ioctl.c11
-rw-r--r--fs/ext4/mballoc.h17
-rw-r--r--fs/ext4/move_extent.c7
-rw-r--r--fs/ext4/namei.c48
-rw-r--r--fs/ext4/page-io.c6
-rw-r--r--fs/ext4/super.c147
-rw-r--r--fs/ext4/symlink.c19
-rw-r--r--fs/ext4/sysfs.c4
-rw-r--r--fs/ext4/xattr.c360
-rw-r--r--fs/f2fs/acl.c18
-rw-r--r--fs/f2fs/acl.h1
-rw-r--r--fs/f2fs/checkpoint.c205
-rw-r--r--fs/f2fs/data.c133
-rw-r--r--fs/f2fs/debug.c17
-rw-r--r--fs/f2fs/dir.c124
-rw-r--r--fs/f2fs/f2fs.h175
-rw-r--r--fs/f2fs/file.c33
-rw-r--r--fs/f2fs/gc.c94
-rw-r--r--fs/f2fs/inline.c27
-rw-r--r--fs/f2fs/inode.c17
-rw-r--r--fs/f2fs/namei.c47
-rw-r--r--fs/f2fs/node.c40
-rw-r--r--fs/f2fs/node.h77
-rw-r--r--fs/f2fs/recovery.c124
-rw-r--r--fs/f2fs/segment.c219
-rw-r--r--fs/f2fs/segment.h11
-rw-r--r--fs/f2fs/super.c93
-rw-r--r--fs/f2fs/xattr.c41
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fat/file.c8
-rw-r--r--fs/fat/inode.c2
-rw-r--r--fs/fat/namei_msdos.c18
-rw-r--r--fs/fat/namei_vfat.c35
-rw-r--r--fs/file.c34
-rw-r--r--fs/fuse/Kconfig1
-rw-r--r--fs/fuse/Makefile2
-rw-r--r--fs/fuse/acl.c99
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/dev.c64
-rw-r--r--fs/fuse/dir.c288
-rw-r--r--fs/fuse/file.c70
-rw-r--r--fs/fuse/fuse_i.h42
-rw-r--r--fs/fuse/inode.c28
-rw-r--r--fs/fuse/xattr.c211
-rw-r--r--fs/gfs2/acl.c12
-rw-r--r--fs/gfs2/aops.c19
-rw-r--r--fs/gfs2/bmap.c14
-rw-r--r--fs/gfs2/dir.c32
-rw-r--r--fs/gfs2/file.c34
-rw-r--r--fs/gfs2/glock.c10
-rw-r--r--fs/gfs2/inode.c25
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/main.c4
-rw-r--r--fs/gfs2/meta_io.c35
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/quota.c6
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/gfs2/xattr.c8
-rw-r--r--fs/hfs/attr.c83
-rw-r--r--fs/hfs/catalog.c8
-rw-r--r--fs/hfs/dir.c8
-rw-r--r--fs/hfs/hfs_fs.h6
-rw-r--r--fs/hfs/inode.c9
-rw-r--r--fs/hfs/super.c1
-rw-r--r--fs/hfsplus/catalog.c8
-rw-r--r--fs/hfsplus/dir.c15
-rw-r--r--fs/hfsplus/inode.c7
-rw-r--r--fs/hfsplus/ioctl.c2
-rw-r--r--fs/hfsplus/posix_acl.c4
-rw-r--r--fs/hostfs/hostfs_kern.c4
-rw-r--r--fs/hpfs/file.c6
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/hpfs/namei.c6
-rw-r--r--fs/hugetlbfs/inode.c24
-rw-r--r--fs/inode.c105
-rw-r--r--fs/internal.h22
-rw-r--r--fs/iomap.c94
-rw-r--r--fs/isofs/inode.c8
-rw-r--r--fs/jbd2/commit.c3
-rw-r--r--fs/jbd2/journal.c131
-rw-r--r--fs/jbd2/transaction.c9
-rw-r--r--fs/jffs2/acl.c11
-rw-r--r--fs/jffs2/dir.c12
-rw-r--r--fs/jffs2/file.c3
-rw-r--r--fs/jffs2/fs.c4
-rw-r--r--fs/jffs2/symlink.c3
-rw-r--r--fs/jffs2/xattr.h6
-rw-r--r--fs/jfs/acl.c8
-rw-r--r--fs/jfs/file.c5
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_inode.c2
-rw-r--r--fs/jfs/jfs_txnmgr.c3
-rw-r--r--fs/jfs/namei.c32
-rw-r--r--fs/jfs/resize.c10
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/jfs/symlink.c6
-rw-r--r--fs/jfs/xattr.c2
-rw-r--r--fs/kernfs/dir.c93
-rw-r--r--fs/kernfs/inode.c159
-rw-r--r--fs/kernfs/kernfs-internal.h7
-rw-r--r--fs/kernfs/mount.c1
-rw-r--r--fs/kernfs/symlink.c3
-rw-r--r--fs/libfs.c51
-rw-r--r--fs/lockd/procfs.h2
-rw-r--r--fs/locks.c150
-rw-r--r--fs/logfs/dir.c12
-rw-r--r--fs/logfs/file.c4
-rw-r--r--fs/logfs/inode.c4
-rw-r--r--fs/logfs/readwrite.c4
-rw-r--r--fs/mbcache.c6
-rw-r--r--fs/minix/bitmap.c2
-rw-r--r--fs/minix/dir.c6
-rw-r--r--fs/minix/file.c2
-rw-r--r--fs/minix/itree_common.c4
-rw-r--r--fs/minix/namei.c10
-rw-r--r--fs/mount.h3
-rw-r--r--fs/namei.c42
-rw-r--r--fs/namespace.c80
-rw-r--r--fs/ncpfs/dir.c11
-rw-r--r--fs/ncpfs/inode.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c3
-rw-r--r--fs/nfs/cache_lib.c2
-rw-r--r--fs/nfs/callback.c136
-rw-r--r--fs/nfs/callback.h12
-rw-r--r--fs/nfs/callback_proc.c16
-rw-r--r--fs/nfs/callback_xdr.c53
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/delegation.c213
-rw-r--r--fs/nfs/delegation.h8
-rw-r--r--fs/nfs/dir.c30
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/file.c36
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/internal.h18
-rw-r--r--fs/nfs/netns.h2
-rw-r--r--fs/nfs/nfs3proc.c6
-rw-r--r--fs/nfs/nfs42proc.c1
-rw-r--r--fs/nfs/nfs4_fs.h15
-rw-r--r--fs/nfs/nfs4client.c118
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c907
-rw-r--r--fs/nfs/nfs4session.h2
-rw-r--r--fs/nfs/nfs4state.c84
-rw-r--r--fs/nfs/nfs4xdr.c42
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfs/pnfs.c73
-rw-r--r--fs/nfs/pnfs.h5
-rw-r--r--fs/nfs/pnfs_nfs.c58
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/super.c10
-rw-r--r--fs/nfs/write.c4
-rw-r--r--fs/nfsd/auth.c6
-rw-r--r--fs/nfsd/blocklayout.c2
-rw-r--r--fs/nfsd/flexfilelayout.c1
-rw-r--r--fs/nfsd/netns.h1
-rw-r--r--fs/nfsd/nfs4callback.c64
-rw-r--r--fs/nfsd/nfs4layouts.c6
-rw-r--r--fs/nfsd/nfs4proc.c90
-rw-r--r--fs/nfsd/nfs4state.c239
-rw-r--r--fs/nfsd/nfs4xdr.c65
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nfsd/nfsproc.c11
-rw-r--r--fs/nfsd/nfssvc.c18
-rw-r--r--fs/nfsd/pnfs.h1
-rw-r--r--fs/nfsd/state.h22
-rw-r--r--fs/nfsd/vfs.c16
-rw-r--r--fs/nfsd/vfs.h2
-rw-r--r--fs/nfsd/xdr4.h23
-rw-r--r--fs/nfsd/xdr4cb.h9
-rw-r--r--fs/nilfs2/dir.c6
-rw-r--r--fs/nilfs2/inode.c6
-rw-r--r--fs/nilfs2/ioctl.c2
-rw-r--r--fs/nilfs2/namei.c12
-rw-r--r--fs/notify/fanotify/fanotify_user.c39
-rw-r--r--fs/notify/group.c6
-rw-r--r--fs/notify/inotify/inotify_user.c16
-rw-r--r--fs/notify/notification.c35
-rw-r--r--fs/nsfs.c107
-rw-r--r--fs/ntfs/file.c2
-rw-r--r--fs/ntfs/inode.c4
-rw-r--r--fs/ntfs/mft.c2
-rw-r--r--fs/ocfs2/acl.c12
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c12
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c3
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c8
-rw-r--r--fs/ocfs2/file.c51
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/namei.c19
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/refcounttree.c4
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/ocfs2/symlink.c3
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/omfs/dir.c10
-rw-r--r--fs/omfs/file.c2
-rw-r--r--fs/omfs/inode.c2
-rw-r--r--fs/open.c25
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/acl.c15
-rw-r--r--fs/orangefs/dcache.c5
-rw-r--r--fs/orangefs/devorangefs-req.c164
-rw-r--r--fs/orangefs/dir.c4
-rw-r--r--fs/orangefs/downcall.h11
-rw-r--r--fs/orangefs/file.c63
-rw-r--r--fs/orangefs/inode.c19
-rw-r--r--fs/orangefs/namei.c71
-rw-r--r--fs/orangefs/orangefs-cache.c6
-rw-r--r--fs/orangefs/orangefs-debugfs.c765
-rw-r--r--fs/orangefs/orangefs-debugfs.h8
-rw-r--r--fs/orangefs/orangefs-dev-proto.h6
-rw-r--r--fs/orangefs/orangefs-kernel.h58
-rw-r--r--fs/orangefs/orangefs-mod.c86
-rw-r--r--fs/orangefs/orangefs-sysfs.c1287
-rw-r--r--fs/orangefs/orangefs-utils.c403
-rw-r--r--fs/orangefs/protocol.h25
-rw-r--r--fs/orangefs/super.c31
-rw-r--r--fs/orangefs/symlink.c1
-rw-r--r--fs/orangefs/upcall.h18
-rw-r--r--fs/orangefs/waitqueue.c10
-rw-r--r--fs/orangefs/xattr.c3
-rw-r--r--fs/overlayfs/copy_up.c93
-rw-r--r--fs/overlayfs/dir.c20
-rw-r--r--fs/overlayfs/inode.c52
-rw-r--r--fs/overlayfs/overlayfs.h4
-rw-r--r--fs/overlayfs/super.c44
-rw-r--r--fs/pipe.c181
-rw-r--r--fs/pnode.c2
-rw-r--r--fs/pnode.h1
-rw-r--r--fs/posix_acl.c62
-rw-r--r--fs/proc/array.c198
-rw-r--r--fs/proc/base.c117
-rw-r--r--fs/proc/fd.c8
-rw-r--r--fs/proc/fd.h2
-rw-r--r--fs/proc/generic.c4
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/meminfo.c211
-rw-r--r--fs/proc/proc_net.c13
-rw-r--r--fs/proc/proc_sysctl.c23
-rw-r--r--fs/proc/self.c2
-rw-r--r--fs/proc/stat.c49
-rw-r--r--fs/proc/task_mmu.c39
-rw-r--r--fs/proc/task_nommu.c28
-rw-r--r--fs/proc/thread_self.c2
-rw-r--r--fs/pstore/inode.c2
-rw-r--r--fs/pstore/platform.c53
-rw-r--r--fs/pstore/pmsg.c35
-rw-r--r--fs/pstore/ram.c46
-rw-r--r--fs/pstore/ram_core.c96
-rw-r--r--fs/quota/quota.c18
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/ramfs/inode.c6
-rw-r--r--fs/read_write.c29
-rw-r--r--fs/reiserfs/file.c3
-rw-r--r--fs/reiserfs/inode.c4
-rw-r--r--fs/reiserfs/ioctl.c4
-rw-r--r--fs/reiserfs/namei.c27
-rw-r--r--fs/reiserfs/stree.c8
-rw-r--r--fs/reiserfs/super.c14
-rw-r--r--fs/reiserfs/xattr.c6
-rw-r--r--fs/reiserfs/xattr_acl.c10
-rw-r--r--fs/select.c14
-rw-r--r--fs/seq_file.c57
-rw-r--r--fs/splice.c677
-rw-r--r--fs/squashfs/inode.c1
-rw-r--r--fs/squashfs/namei.c1
-rw-r--r--fs/squashfs/symlink.c1
-rw-r--r--fs/squashfs/xattr.h1
-rw-r--r--fs/super.c43
-rw-r--r--fs/sysfs/dir.c6
-rw-r--r--fs/sysfs/group.c4
-rw-r--r--fs/sysv/dir.c6
-rw-r--r--fs/sysv/file.c2
-rw-r--r--fs/sysv/ialloc.c2
-rw-r--r--fs/sysv/itree.c4
-rw-r--r--fs/sysv/namei.c10
-rw-r--r--fs/tracefs/inode.c2
-rw-r--r--fs/ubifs/dir.c256
-rw-r--r--fs/ubifs/file.c10
-rw-r--r--fs/ubifs/gc.c2
-rw-r--r--fs/ubifs/journal.c188
-rw-r--r--fs/ubifs/lprops.c2
-rw-r--r--fs/ubifs/lpt_commit.c4
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/ubifs/ubifs.h8
-rw-r--r--fs/ubifs/xattr.c8
-rw-r--r--fs/udf/file.c20
-rw-r--r--fs/udf/ialloc.c2
-rw-r--r--fs/udf/inode.c4
-rw-r--r--fs/udf/namei.c26
-rw-r--r--fs/ufs/dir.c6
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/inode.c8
-rw-r--r--fs/ufs/namei.c12
-rw-r--r--fs/utimes.c19
-rw-r--r--fs/xattr.c250
-rw-r--r--fs/xfs/Makefile8
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c338
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.h35
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c158
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h25
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c783
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h79
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_btree.c67
-rw-r--r--fs/xfs/libxfs/xfs_btree.h44
-rw-r--r--fs/xfs/libxfs/xfs_defer.c79
-rw-r--r--fs/xfs/libxfs/xfs_defer.h2
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c3
-rw-r--r--fs/xfs/libxfs/xfs_format.h98
-rw-r--r--fs/xfs/libxfs/xfs_fs.h10
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c37
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h3
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c70
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h28
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h128
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c1698
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h70
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c451
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h74
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c914
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h7
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c82
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h7
-rw-r--r--fs/xfs/libxfs/xfs_sb.c9
-rw-r--r--fs/xfs/libxfs/xfs_shared.h2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c23
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h3
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h9
-rw-r--r--fs/xfs/libxfs/xfs_types.h3
-rw-r--r--fs/xfs/xfs_acl.c15
-rw-r--r--fs/xfs/xfs_aops.c253
-rw-r--r--fs/xfs/xfs_aops.h5
-rw-r--r--fs/xfs/xfs_bmap_item.c508
-rw-r--r--fs/xfs/xfs_bmap_item.h98
-rw-r--r--fs/xfs/xfs_bmap_util.c591
-rw-r--r--fs/xfs/xfs_buf_item.c9
-rw-r--r--fs/xfs/xfs_dir2_readdir.c3
-rw-r--r--fs/xfs/xfs_error.h10
-rw-r--r--fs/xfs/xfs_extent_busy.c2
-rw-r--r--fs/xfs/xfs_file.c319
-rw-r--r--fs/xfs/xfs_filestream.c13
-rw-r--r--fs/xfs/xfs_fsops.c109
-rw-r--r--fs/xfs/xfs_fsops.h3
-rw-r--r--fs/xfs/xfs_globals.c5
-rw-r--r--fs/xfs/xfs_icache.c257
-rw-r--r--fs/xfs/xfs_icache.h7
-rw-r--r--fs/xfs/xfs_inode.c55
-rw-r--r--fs/xfs/xfs_inode.h20
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_ioctl.c77
-rw-r--r--fs/xfs/xfs_iomap.c548
-rw-r--r--fs/xfs/xfs_iomap.h5
-rw-r--r--fs/xfs/xfs_iops.c114
-rw-r--r--fs/xfs/xfs_iops.h3
-rw-r--r--fs/xfs/xfs_itable.c8
-rw-r--r--fs/xfs/xfs_linux.h1
-rw-r--r--fs/xfs/xfs_log_priv.h3
-rw-r--r--fs/xfs/xfs_log_recover.c548
-rw-r--r--fs/xfs/xfs_mount.c47
-rw-r--r--fs/xfs/xfs_mount.h52
-rw-r--r--fs/xfs/xfs_ondisk.h3
-rw-r--r--fs/xfs/xfs_pnfs.c7
-rw-r--r--fs/xfs/xfs_refcount_item.c539
-rw-r--r--fs/xfs/xfs_refcount_item.h101
-rw-r--r--fs/xfs/xfs_reflink.c1733
-rw-r--r--fs/xfs/xfs_reflink.h55
-rw-r--r--fs/xfs/xfs_rmap_item.c48
-rw-r--r--fs/xfs/xfs_rmap_item.h8
-rw-r--r--fs/xfs/xfs_stats.c1
-rw-r--r--fs/xfs/xfs_stats.h18
-rw-r--r--fs/xfs/xfs_super.c94
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_sysctl.c9
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_sysfs.c51
-rw-r--r--fs/xfs/xfs_trace.h855
-rw-r--r--fs/xfs/xfs_trans.c3
-rw-r--r--fs/xfs/xfs_trans.h29
-rw-r--r--fs/xfs/xfs_trans_bmap.c249
-rw-r--r--fs/xfs/xfs_trans_extfree.c3
-rw-r--r--fs/xfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/xfs_trans_refcount.c264
-rw-r--r--fs/xfs/xfs_trans_rmap.c9
-rw-r--r--fs/xfs/xfs_xattr.c1
533 files changed, 22556 insertions, 8902 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 5b6a1743ea17..b3c2cc79c20d 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -276,32 +276,26 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
switch (handler->flags) {
case ACL_TYPE_ACCESS:
if (acl) {
- umode_t mode = inode->i_mode;
- retval = posix_acl_equiv_mode(acl, &mode);
- if (retval < 0)
+ struct iattr iattr;
+
+ retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
+ if (retval)
goto err_out;
- else {
- struct iattr iattr;
- if (retval == 0) {
- /*
- * ACL can be represented
- * by the mode bits. So don't
- * update ACL.
- */
- acl = NULL;
- value = NULL;
- size = 0;
- }
- /* Updte the mode bits */
- iattr.ia_mode = ((mode & S_IALLUGO) |
- (inode->i_mode & ~S_IALLUGO));
- iattr.ia_valid = ATTR_MODE;
- /* FIXME should we update ctime ?
- * What is the following setxattr update the
- * mode ?
+ if (!acl) {
+ /*
+ * ACL can be represented
+ * by the mode bits. So don't
+ * update ACL.
*/
- v9fs_vfs_setattr_dotl(dentry, &iattr);
+ value = NULL;
+ size = 0;
}
+ iattr.ia_valid = ATTR_MODE;
+ /* FIXME should we update ctime ?
+ * What is the following setxattr update the
+ * mode ?
+ */
+ v9fs_vfs_setattr_dotl(dentry, &iattr);
}
break;
case ACL_TYPE_DEFAULT:
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 6877050384a1..443d12e02043 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -148,7 +148,8 @@ extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry);
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags);
extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
struct p9_fid *fid,
struct super_block *sb, int new);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8b1999b528e9..30ca770c5e0b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -276,7 +276,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
inode_init_owner(inode, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = rdev;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_mapping->a_ops = &v9fs_addr_operations;
switch (mode & S_IFMT) {
@@ -955,7 +955,8 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
int
v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
int retval;
struct inode *old_inode;
@@ -966,6 +967,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct p9_fid *newdirfid;
struct p9_wstat wstat;
+ if (flags)
+ return -EINVAL;
+
p9_debug(P9_DEBUG_VFS, "\n");
retval = 0;
old_inode = d_inode(old_dentry);
@@ -1094,7 +1098,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
struct p9_wstat wstat;
p9_debug(P9_DEBUG_VFS, "\n");
- retval = inode_change_ok(d_inode(dentry), iattr);
+ retval = setattr_prepare(dentry, iattr);
if (retval)
return retval;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index eeabcb0bad12..afaa4b6de801 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -558,7 +558,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
p9_debug(P9_DEBUG_VFS, "\n");
- retval = inode_change_ok(inode, iattr);
+ retval = setattr_prepare(dentry, iattr);
if (retval)
return retval;
@@ -967,9 +967,6 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = {
.rename = v9fs_vfs_rename,
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
.listxattr = v9fs_listxattr,
.get_acl = v9fs_iop_get_acl,
};
@@ -977,9 +974,6 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = {
const struct inode_operations v9fs_file_inode_operations_dotl = {
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
.listxattr = v9fs_listxattr,
.get_acl = v9fs_iop_get_acl,
};
@@ -989,8 +983,5 @@ const struct inode_operations v9fs_symlink_inode_operations_dotl = {
.get_link = v9fs_vfs_get_link_dotl,
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
.listxattr = v9fs_listxattr,
};
diff --git a/fs/Kconfig b/fs/Kconfig
index 2bc7ad775842..4bd03a2b0518 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -79,6 +79,7 @@ config EXPORTFS_BLOCK_OPS
config FILE_LOCKING
bool "Enable POSIX file locking API" if EXPERT
default y
+ select PERCPU_RWSEM
help
This option enables standard file locking support, required
for filesystems like NFS and for the flock() system
@@ -199,6 +200,9 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
+config ARCH_HAS_GIGANTIC_PAGE
+ bool
+
source "fs/configfs/Kconfig"
source "fs/efivarfs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c7efddf6e038..4c09d93d9569 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -89,7 +89,7 @@ config BINFMT_SCRIPT
config BINFMT_FLAT
bool "Kernel support for flat binaries"
- depends on !MMU || M68K
+ depends on !MMU || ARM || M68K
depends on !FRV || BROKEN
help
Support uClinux FLAT format binaries.
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 335055d828e4..8dbd36f5e581 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -199,7 +199,7 @@ adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
return;
cur_time:
- *tv = CURRENT_TIME;
+ *tv = current_time(inode);
return;
too_early:
@@ -303,7 +303,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
unsigned int ia_valid = attr->ia_valid;
int error;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
/*
* we can't change the UID or GID of any file -
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index cc2b2efc9211..2f088773f1c0 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -173,7 +173,8 @@ extern int affs_link(struct dentry *olddentry, struct inode *dir,
extern int affs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname);
extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry);
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags);
/* inode.c */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index d8f217c711d3..0ec65c133b93 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -58,7 +58,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
mark_buffer_dirty_inode(dir_bh, dir);
affs_brelse(dir_bh);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
dir->i_version++;
mark_inode_dirty(dir);
@@ -112,7 +112,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
affs_brelse(bh);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
dir->i_version++;
mark_inode_dirty(dir);
@@ -313,7 +313,7 @@ affs_remove_header(struct dentry *dentry)
else
clear_nlink(inode);
affs_unlock_link(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
done:
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0fdb0f5b2239..fe4e1290dbb5 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -219,7 +219,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid);
- error = inode_change_ok(inode,attr);
+ error = setattr_prepare(dentry, attr);
if (error)
goto out;
@@ -309,7 +309,7 @@ affs_new_inode(struct inode *dir)
inode->i_gid = current_fsgid();
inode->i_ino = block;
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
atomic_set(&AFFS_I(inode)->i_opencnt, 0);
AFFS_I(inode)->i_blkcnt = 0;
AFFS_I(inode)->i_lc = NULL;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index a2d68f828d53..29186d29a3b6 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -414,12 +414,16 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
int
affs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct super_block *sb = old_dir->i_sb;
struct buffer_head *bh = NULL;
int retval;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 7ef637d7f3a5..1e9d2f84e5b5 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -461,8 +461,8 @@ static void afs_callback_updater(struct work_struct *work)
*/
int __init afs_callback_update_init(void)
{
- afs_callback_update_worker =
- create_singlethread_workqueue("kafs_callbackd");
+ afs_callback_update_worker = alloc_ordered_workqueue("kafs_callbackd",
+ WQ_MEM_RECLAIM);
return afs_callback_update_worker ? 0 : -ENOMEM;
}
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 85737e96ab8b..2037e7a77a37 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -17,19 +17,12 @@
#include "internal.h"
#include "afs_cm.h"
-#if 0
-struct workqueue_struct *afs_cm_workqueue;
-#endif /* 0 */
-
-static int afs_deliver_cb_init_call_back_state(struct afs_call *,
- struct sk_buff *, bool);
-static int afs_deliver_cb_init_call_back_state3(struct afs_call *,
- struct sk_buff *, bool);
-static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool);
-static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool);
-static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool);
-static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *,
- struct sk_buff *, bool);
+static int afs_deliver_cb_init_call_back_state(struct afs_call *);
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *);
+static int afs_deliver_cb_probe(struct afs_call *);
+static int afs_deliver_cb_callback(struct afs_call *);
+static int afs_deliver_cb_probe_uuid(struct afs_call *);
+static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *);
static void afs_cm_destructor(struct afs_call *);
/*
@@ -134,7 +127,7 @@ static void afs_cm_destructor(struct afs_call *call)
* received. The step number here must match the final number in
* afs_deliver_cb_callback().
*/
- if (call->unmarshall == 6) {
+ if (call->unmarshall == 5) {
ASSERT(call->server && call->count && call->request);
afs_break_callbacks(call->server, call->count, call->request);
}
@@ -168,27 +161,27 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
/*
* deliver request data to a CB.CallBack call
*/
-static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
- bool last)
+static int afs_deliver_cb_callback(struct afs_call *call)
{
+ struct sockaddr_rxrpc srx;
struct afs_callback *cb;
struct afs_server *server;
- struct in_addr addr;
__be32 *bp;
u32 tmp;
int ret, loop;
- _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+ _enter("{%u}", call->unmarshall);
switch (call->unmarshall) {
case 0:
+ rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
call->offset = 0;
call->unmarshall++;
/* extract the FID array and its count in two steps */
case 1:
_debug("extract FID count");
- ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+ ret = afs_extract_data(call, &call->tmp, 4, true);
if (ret < 0)
return ret;
@@ -205,8 +198,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
case 2:
_debug("extract FID array");
- ret = afs_extract_data(call, skb, last, call->buffer,
- call->count * 3 * 4);
+ ret = afs_extract_data(call, call->buffer,
+ call->count * 3 * 4, true);
if (ret < 0)
return ret;
@@ -232,7 +225,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
/* extract the callback array and its count in two steps */
case 3:
_debug("extract CB count");
- ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+ ret = afs_extract_data(call, &call->tmp, 4, true);
if (ret < 0)
return ret;
@@ -242,13 +235,11 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
return -EBADMSG;
call->offset = 0;
call->unmarshall++;
- if (tmp == 0)
- goto empty_cb_array;
case 4:
_debug("extract CB array");
- ret = afs_extract_data(call, skb, last, call->request,
- call->count * 3 * 4);
+ ret = afs_extract_data(call, call->buffer,
+ call->count * 3 * 4, false);
if (ret < 0)
return ret;
@@ -261,15 +252,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
cb->type = ntohl(*bp++);
}
- empty_cb_array:
call->offset = 0;
call->unmarshall++;
- case 5:
- ret = afs_data_complete(call, skb, last);
- if (ret < 0)
- return ret;
-
/* Record that the message was unmarshalled successfully so
* that the call destructor can know do the callback breaking
* work, even if the final ACK isn't received.
@@ -278,17 +263,15 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
* updated also.
*/
call->unmarshall++;
- case 6:
+ case 5:
break;
}
-
call->state = AFS_CALL_REPLYING;
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- memcpy(&addr, &ip_hdr(skb)->saddr, 4);
- server = afs_find_server(&addr);
+ server = afs_find_server(&srx);
if (!server)
return -ENOTCONN;
call->server = server;
@@ -315,17 +298,17 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
/*
* deliver request data to a CB.InitCallBackState call
*/
-static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
- struct sk_buff *skb,
- bool last)
+static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
{
+ struct sockaddr_rxrpc srx;
struct afs_server *server;
- struct in_addr addr;
int ret;
- _enter(",{%u},%d", skb->len, last);
+ _enter("");
+
+ rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
- ret = afs_data_complete(call, skb, last);
+ ret = afs_extract_data(call, NULL, 0, false);
if (ret < 0)
return ret;
@@ -334,8 +317,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- memcpy(&addr, &ip_hdr(skb)->saddr, 4);
- server = afs_find_server(&addr);
+ server = afs_find_server(&srx);
if (!server)
return -ENOTCONN;
call->server = server;
@@ -348,27 +330,68 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
/*
* deliver request data to a CB.InitCallBackState3 call
*/
-static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
- struct sk_buff *skb,
- bool last)
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
{
+ struct sockaddr_rxrpc srx;
struct afs_server *server;
- struct in_addr addr;
+ struct afs_uuid *r;
+ unsigned loop;
+ __be32 *b;
+ int ret;
+
+ _enter("");
+
+ rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
+
+ _enter("{%u}", call->unmarshall);
- _enter(",{%u},%d", skb->len, last);
+ switch (call->unmarshall) {
+ case 0:
+ call->offset = 0;
+ call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL);
+ if (!call->buffer)
+ return -ENOMEM;
+ call->unmarshall++;
- /* There are some arguments that we ignore */
- afs_data_consumed(call, skb);
- if (!last)
- return -EAGAIN;
+ case 1:
+ _debug("extract UUID");
+ ret = afs_extract_data(call, call->buffer,
+ 11 * sizeof(__be32), false);
+ switch (ret) {
+ case 0: break;
+ case -EAGAIN: return 0;
+ default: return ret;
+ }
+
+ _debug("unmarshall UUID");
+ call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
+ if (!call->request)
+ return -ENOMEM;
+
+ b = call->buffer;
+ r = call->request;
+ r->time_low = ntohl(b[0]);
+ r->time_mid = ntohl(b[1]);
+ r->time_hi_and_version = ntohl(b[2]);
+ r->clock_seq_hi_and_reserved = ntohl(b[3]);
+ r->clock_seq_low = ntohl(b[4]);
+
+ for (loop = 0; loop < 6; loop++)
+ r->node[loop] = ntohl(b[loop + 5]);
+
+ call->offset = 0;
+ call->unmarshall++;
+
+ case 2:
+ break;
+ }
/* no unmarshalling required */
call->state = AFS_CALL_REPLYING;
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- memcpy(&addr, &ip_hdr(skb)->saddr, 4);
- server = afs_find_server(&addr);
+ server = afs_find_server(&srx);
if (!server)
return -ENOTCONN;
call->server = server;
@@ -393,14 +416,13 @@ static void SRXAFSCB_Probe(struct work_struct *work)
/*
* deliver request data to a CB.Probe call
*/
-static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
- bool last)
+static int afs_deliver_cb_probe(struct afs_call *call)
{
int ret;
- _enter(",{%u},%d", skb->len, last);
+ _enter("");
- ret = afs_data_complete(call, skb, last);
+ ret = afs_extract_data(call, NULL, 0, false);
if (ret < 0)
return ret;
@@ -426,7 +448,6 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
_enter("");
-
if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0)
reply.match = htonl(0);
else
@@ -439,19 +460,14 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
/*
* deliver request data to a CB.ProbeUuid call
*/
-static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
- bool last)
+static int afs_deliver_cb_probe_uuid(struct afs_call *call)
{
struct afs_uuid *r;
unsigned loop;
__be32 *b;
int ret;
- _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-
- ret = afs_data_complete(call, skb, last);
- if (ret < 0)
- return ret;
+ _enter("{%u}", call->unmarshall);
switch (call->unmarshall) {
case 0:
@@ -463,8 +479,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
case 1:
_debug("extract UUID");
- ret = afs_extract_data(call, skb, last, call->buffer,
- 11 * sizeof(__be32));
+ ret = afs_extract_data(call, call->buffer,
+ 11 * sizeof(__be32), false);
switch (ret) {
case 0: break;
case -EAGAIN: return 0;
@@ -491,16 +507,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
call->unmarshall++;
case 2:
- _debug("trailer");
- if (skb->len != 0)
- return -EBADMSG;
break;
}
- ret = afs_data_complete(call, skb, last);
- if (ret < 0)
- return ret;
-
call->state = AFS_CALL_REPLYING;
INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
@@ -574,14 +583,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
/*
* deliver request data to a CB.TellMeAboutYourself call
*/
-static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
{
int ret;
- _enter(",{%u},%d", skb->len, last);
+ _enter("");
- ret = afs_data_complete(call, skb, last);
+ ret = afs_extract_data(call, NULL, 0, false);
if (ret < 0)
return ret;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index eba541004d90..51a241e09fbb 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -38,7 +38,8 @@ static int afs_link(struct dentry *from, struct inode *dir,
static int afs_symlink(struct inode *dir, struct dentry *dentry,
const char *content);
static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry);
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags);
const struct file_operations afs_dir_file_operations = {
.open = afs_dir_open,
@@ -1083,12 +1084,16 @@ error:
* rename a file in an AFS filesystem and/or move it between directories
*/
static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
struct key *key;
int ret;
+ if (flags)
+ return -EINVAL;
+
vnode = AFS_FS_I(d_inode(old_dentry));
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index d91a9c9cfbd0..3191dff2c156 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -36,8 +36,8 @@ static int afs_init_lock_manager(void)
if (!afs_lock_manager) {
mutex_lock(&afs_lock_manager_mutex);
if (!afs_lock_manager) {
- afs_lock_manager =
- create_singlethread_workqueue("kafs_lockd");
+ afs_lock_manager = alloc_workqueue("kafs_lockd",
+ WQ_MEM_RECLAIM, 0);
if (!afs_lock_manager)
ret = -ENOMEM;
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 9312b92e54be..96f4d764d1a6 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -235,16 +235,15 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
/*
* deliver reply data to an FS.FetchStatus
*/
-static int afs_deliver_fs_fetch_status(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_fetch_status(struct afs_call *call)
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
int ret;
- _enter(",,%u", last);
+ _enter("");
- ret = afs_transfer_reply(call, skb, last);
+ ret = afs_transfer_reply(call);
if (ret < 0)
return ret;
@@ -307,8 +306,7 @@ int afs_fs_fetch_file_status(struct afs_server *server,
/*
* deliver reply data to an FS.FetchData
*/
-static int afs_deliver_fs_fetch_data(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_fetch_data(struct afs_call *call)
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
@@ -316,7 +314,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
void *buffer;
int ret;
- _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+ _enter("{%u}", call->unmarshall);
switch (call->unmarshall) {
case 0:
@@ -332,7 +330,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
* client) */
case 1:
_debug("extract data length (MSW)");
- ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+ ret = afs_extract_data(call, &call->tmp, 4, true);
if (ret < 0)
return ret;
@@ -347,7 +345,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
/* extract the returned data length */
case 2:
_debug("extract data length");
- ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+ ret = afs_extract_data(call, &call->tmp, 4, true);
if (ret < 0)
return ret;
@@ -363,10 +361,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
_debug("extract data");
if (call->count > 0) {
page = call->reply3;
- buffer = kmap_atomic(page);
- ret = afs_extract_data(call, skb, last, buffer,
- call->count);
- kunmap_atomic(buffer);
+ buffer = kmap(page);
+ ret = afs_extract_data(call, buffer,
+ call->count, true);
+ kunmap(buffer);
if (ret < 0)
return ret;
}
@@ -376,8 +374,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
/* extract the metadata */
case 4:
- ret = afs_extract_data(call, skb, last, call->buffer,
- (21 + 3 + 6) * 4);
+ ret = afs_extract_data(call, call->buffer,
+ (21 + 3 + 6) * 4, false);
if (ret < 0)
return ret;
@@ -391,18 +389,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
call->unmarshall++;
case 5:
- ret = afs_data_complete(call, skb, last);
- if (ret < 0)
- return ret;
break;
}
if (call->count < PAGE_SIZE) {
_debug("clear");
page = call->reply3;
- buffer = kmap_atomic(page);
+ buffer = kmap(page);
memset(buffer + call->count, 0, PAGE_SIZE - call->count);
- kunmap_atomic(buffer);
+ kunmap(buffer);
}
_leave(" = 0 [done]");
@@ -515,13 +510,12 @@ int afs_fs_fetch_data(struct afs_server *server,
/*
* deliver reply data to an FS.GiveUpCallBacks
*/
-static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_give_up_callbacks(struct afs_call *call)
{
- _enter(",{%u},%d", skb->len, last);
+ _enter("");
/* shouldn't be any reply data */
- return afs_data_complete(call, skb, last);
+ return afs_extract_data(call, NULL, 0, false);
}
/*
@@ -599,16 +593,15 @@ int afs_fs_give_up_callbacks(struct afs_server *server,
/*
* deliver reply data to an FS.CreateFile or an FS.MakeDir
*/
-static int afs_deliver_fs_create_vnode(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_create_vnode(struct afs_call *call)
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
int ret;
- _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+ _enter("{%u}", call->unmarshall);
- ret = afs_transfer_reply(call, skb, last);
+ ret = afs_transfer_reply(call);
if (ret < 0)
return ret;
@@ -696,16 +689,15 @@ int afs_fs_create(struct afs_server *server,
/*
* deliver reply data to an FS.RemoveFile or FS.RemoveDir
*/
-static int afs_deliver_fs_remove(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_remove(struct afs_call *call)
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
int ret;
- _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+ _enter("{%u}", call->unmarshall);
- ret = afs_transfer_reply(call, skb, last);
+ ret = afs_transfer_reply(call);
if (ret < 0)
return ret;
@@ -777,16 +769,15 @@ int afs_fs_remove(struct afs_server *server,
/*
* deliver reply data to an FS.Link
*/
-static int afs_deliver_fs_link(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_link(struct afs_call *call)
{
struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
const __be32 *bp;
int ret;
- _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+ _enter("{%u}", call->unmarshall);
- ret = afs_transfer_reply(call, skb, last);
+ ret = afs_transfer_reply(call);
if (ret < 0)
return ret;
@@ -863,16 +854,15 @@ int afs_fs_link(struct afs_server *server,
/*
* deliver reply data to an FS.Symlink
*/
-static int afs_deliver_fs_symlink(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_symlink(struct afs_call *call)
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
int ret;
- _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+ _enter("{%u}", call->unmarshall);
- ret = afs_transfer_reply(call, skb, last);
+ ret = afs_transfer_reply(call);
if (ret < 0)
return ret;
@@ -968,16 +958,15 @@ int afs_fs_symlink(struct afs_server *server,
/*
* deliver reply data to an FS.Rename
*/
-static int afs_deliver_fs_rename(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_rename(struct afs_call *call)
{
struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
const __be32 *bp;
int ret;
- _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+ _enter("{%u}", call->unmarshall);
- ret = afs_transfer_reply(call, skb, last);
+ ret = afs_transfer_reply(call);
if (ret < 0)
return ret;
@@ -1072,16 +1061,15 @@ int afs_fs_rename(struct afs_server *server,
/*
* deliver reply data to an FS.StoreData
*/
-static int afs_deliver_fs_store_data(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_store_data(struct afs_call *call)
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
int ret;
- _enter(",,%u", last);
+ _enter("");
- ret = afs_transfer_reply(call, skb, last);
+ ret = afs_transfer_reply(call);
if (ret < 0)
return ret;
@@ -1251,17 +1239,16 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
/*
* deliver reply data to an FS.StoreStatus
*/
-static int afs_deliver_fs_store_status(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_store_status(struct afs_call *call)
{
afs_dataversion_t *store_version;
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
int ret;
- _enter(",,%u", last);
+ _enter("");
- ret = afs_transfer_reply(call, skb, last);
+ ret = afs_transfer_reply(call);
if (ret < 0)
return ret;
@@ -1443,14 +1430,13 @@ int afs_fs_setattr(struct afs_server *server, struct key *key,
/*
* deliver reply data to an FS.GetVolumeStatus
*/
-static int afs_deliver_fs_get_volume_status(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_get_volume_status(struct afs_call *call)
{
const __be32 *bp;
char *p;
int ret;
- _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+ _enter("{%u}", call->unmarshall);
switch (call->unmarshall) {
case 0:
@@ -1460,8 +1446,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
/* extract the returned status record */
case 1:
_debug("extract status");
- ret = afs_extract_data(call, skb, last, call->buffer,
- 12 * 4);
+ ret = afs_extract_data(call, call->buffer,
+ 12 * 4, true);
if (ret < 0)
return ret;
@@ -1472,7 +1458,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
/* extract the volume name length */
case 2:
- ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+ ret = afs_extract_data(call, &call->tmp, 4, true);
if (ret < 0)
return ret;
@@ -1487,8 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
case 3:
_debug("extract volname");
if (call->count > 0) {
- ret = afs_extract_data(call, skb, last, call->reply3,
- call->count);
+ ret = afs_extract_data(call, call->reply3,
+ call->count, true);
if (ret < 0)
return ret;
}
@@ -1508,8 +1494,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
call->count = 4 - (call->count & 3);
case 4:
- ret = afs_extract_data(call, skb, last, call->buffer,
- call->count);
+ ret = afs_extract_data(call, call->buffer,
+ call->count, true);
if (ret < 0)
return ret;
@@ -1519,7 +1505,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
/* extract the offline message length */
case 5:
- ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+ ret = afs_extract_data(call, &call->tmp, 4, true);
if (ret < 0)
return ret;
@@ -1534,8 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
case 6:
_debug("extract offline");
if (call->count > 0) {
- ret = afs_extract_data(call, skb, last, call->reply3,
- call->count);
+ ret = afs_extract_data(call, call->reply3,
+ call->count, true);
if (ret < 0)
return ret;
}
@@ -1555,8 +1541,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
call->count = 4 - (call->count & 3);
case 7:
- ret = afs_extract_data(call, skb, last, call->buffer,
- call->count);
+ ret = afs_extract_data(call, call->buffer,
+ call->count, true);
if (ret < 0)
return ret;
@@ -1566,7 +1552,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
/* extract the message of the day length */
case 8:
- ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+ ret = afs_extract_data(call, &call->tmp, 4, true);
if (ret < 0)
return ret;
@@ -1581,8 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
case 9:
_debug("extract motd");
if (call->count > 0) {
- ret = afs_extract_data(call, skb, last, call->reply3,
- call->count);
+ ret = afs_extract_data(call, call->reply3,
+ call->count, true);
if (ret < 0)
return ret;
}
@@ -1595,26 +1581,17 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
call->unmarshall++;
/* extract the message of the day padding */
- if ((call->count & 3) == 0) {
- call->unmarshall++;
- goto no_motd_padding;
- }
- call->count = 4 - (call->count & 3);
+ call->count = (4 - (call->count & 3)) & 3;
case 10:
- ret = afs_extract_data(call, skb, last, call->buffer,
- call->count);
+ ret = afs_extract_data(call, call->buffer,
+ call->count, false);
if (ret < 0)
return ret;
call->offset = 0;
call->unmarshall++;
- no_motd_padding:
-
case 11:
- ret = afs_data_complete(call, skb, last);
- if (ret < 0)
- return ret;
break;
}
@@ -1685,15 +1662,14 @@ int afs_fs_get_volume_status(struct afs_server *server,
/*
* deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock
*/
-static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_fs_xxxx_lock(struct afs_call *call)
{
const __be32 *bp;
int ret;
- _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+ _enter("{%u}", call->unmarshall);
- ret = afs_transfer_reply(call, skb, last);
+ ret = afs_transfer_reply(call);
if (ret < 0)
return ret;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index df976b2a7f40..5497c8496055 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -13,13 +13,13 @@
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/skbuff.h>
#include <linux/rxrpc.h>
#include <linux/key.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/fscache.h>
#include <linux/backing-dev.h>
+#include <net/af_rxrpc.h>
#include "afs.h"
#include "afs_vl.h"
@@ -56,7 +56,7 @@ struct afs_mount_params {
*/
struct afs_wait_mode {
/* RxRPC received message notification */
- void (*rx_wakeup)(struct afs_call *call);
+ rxrpc_notify_rx_t notify_rx;
/* synchronous call waiter and call dispatched notification */
int (*wait)(struct afs_call *call);
@@ -75,10 +75,8 @@ struct afs_call {
const struct afs_call_type *type; /* type of call */
const struct afs_wait_mode *wait_mode; /* completion wait mode */
wait_queue_head_t waitq; /* processes awaiting completion */
- void (*async_workfn)(struct afs_call *call); /* asynchronous work function */
struct work_struct async_work; /* asynchronous work processor */
struct work_struct work; /* actual work processor */
- struct sk_buff_head rx_queue; /* received packets */
struct rxrpc_call *rxcall; /* RxRPC call handle */
struct key *key; /* security for this call */
struct afs_server *server; /* server affected by incoming CM call */
@@ -92,6 +90,7 @@ struct afs_call {
void *reply4; /* reply buffer (fourth part) */
pgoff_t first; /* first page in mapping to deal with */
pgoff_t last; /* last page in mapping to deal with */
+ size_t offset; /* offset into received data store */
enum { /* call state */
AFS_CALL_REQUESTING, /* request is being sent for outgoing call */
AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */
@@ -99,21 +98,18 @@ struct afs_call {
AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */
AFS_CALL_REPLYING, /* replying to incoming call */
AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */
- AFS_CALL_COMPLETE, /* successfully completed */
- AFS_CALL_BUSY, /* server was busy */
- AFS_CALL_ABORTED, /* call was aborted */
- AFS_CALL_ERROR, /* call failed due to error */
+ AFS_CALL_COMPLETE, /* Completed or failed */
} state;
int error; /* error code */
+ u32 abort_code; /* Remote abort ID or 0 */
unsigned request_size; /* size of request data */
unsigned reply_max; /* maximum size of reply */
- unsigned reply_size; /* current size of reply */
unsigned first_offset; /* offset into mapping[first] */
unsigned last_to; /* amount of mapping[last] */
- unsigned offset; /* offset into received data store */
unsigned char unmarshall; /* unmarshalling phase */
bool incoming; /* T if incoming call */
bool send_pages; /* T if data from mapping should be sent */
+ bool need_attention; /* T if RxRPC poked us */
u16 service_id; /* RxRPC service ID to call */
__be16 port; /* target UDP port */
__be32 operation_ID; /* operation ID for an incoming call */
@@ -128,8 +124,7 @@ struct afs_call_type {
/* deliver request or reply data to an call
* - returning an error will cause the call to be aborted
*/
- int (*deliver)(struct afs_call *call, struct sk_buff *skb,
- bool last);
+ int (*deliver)(struct afs_call *call);
/* map an abort code to an error number */
int (*abort_to_error)(u32 abort_code);
@@ -607,29 +602,22 @@ extern void afs_proc_cell_remove(struct afs_cell *);
/*
* rxrpc.c
*/
+extern struct socket *afs_socket;
+
extern int afs_open_socket(void);
extern void afs_close_socket(void);
-extern void afs_data_consumed(struct afs_call *, struct sk_buff *);
extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
const struct afs_wait_mode *);
extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
size_t, size_t);
extern void afs_flat_call_destructor(struct afs_call *);
-extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool);
extern void afs_send_empty_reply(struct afs_call *);
extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
-extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
- size_t);
+extern int afs_extract_data(struct afs_call *, void *, size_t, bool);
-static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb,
- bool last)
+static inline int afs_transfer_reply(struct afs_call *call)
{
- if (skb->len > 0)
- return -EBADMSG;
- afs_data_consumed(call, skb);
- if (!last)
- return -EAGAIN;
- return 0;
+ return afs_extract_data(call, call->buffer, call->reply_max, false);
}
/*
@@ -654,7 +642,7 @@ do { \
extern struct afs_server *afs_lookup_server(struct afs_cell *,
const struct in_addr *);
-extern struct afs_server *afs_find_server(const struct in_addr *);
+extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *);
extern void afs_put_server(struct afs_server *);
extern void __exit afs_purge_servers(void);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 35de0c04729f..0b187ef3b5b7 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/sched.h>
+#include <linux/random.h>
#include "internal.h"
MODULE_DESCRIPTION("AFS Client File System");
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 14d04c848465..477928b25940 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -16,34 +16,36 @@
#include "internal.h"
#include "afs_cm.h"
-static struct socket *afs_socket; /* my RxRPC socket */
+struct socket *afs_socket; /* my RxRPC socket */
static struct workqueue_struct *afs_async_calls;
+static struct afs_call *afs_spare_incoming_call;
static atomic_t afs_outstanding_calls;
-static atomic_t afs_outstanding_skbs;
-static void afs_wake_up_call_waiter(struct afs_call *);
+static void afs_free_call(struct afs_call *);
+static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
static int afs_wait_for_call_to_complete(struct afs_call *);
-static void afs_wake_up_async_call(struct afs_call *);
+static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
static int afs_dont_wait_for_call_to_complete(struct afs_call *);
-static void afs_process_async_call(struct afs_call *);
-static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
-static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
+static void afs_process_async_call(struct work_struct *);
+static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
+static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
+static int afs_deliver_cm_op_id(struct afs_call *);
/* synchronous call management */
const struct afs_wait_mode afs_sync_call = {
- .rx_wakeup = afs_wake_up_call_waiter,
+ .notify_rx = afs_wake_up_call_waiter,
.wait = afs_wait_for_call_to_complete,
};
/* asynchronous call management */
const struct afs_wait_mode afs_async_call = {
- .rx_wakeup = afs_wake_up_async_call,
+ .notify_rx = afs_wake_up_async_call,
.wait = afs_dont_wait_for_call_to_complete,
};
/* asynchronous incoming call management */
static const struct afs_wait_mode afs_async_incoming_call = {
- .rx_wakeup = afs_wake_up_async_call,
+ .notify_rx = afs_wake_up_async_call,
};
/* asynchronous incoming call initial processing */
@@ -53,17 +55,9 @@ static const struct afs_call_type afs_RXCMxxxx = {
.abort_to_error = afs_abort_to_error,
};
-static void afs_collect_incoming_call(struct work_struct *);
+static void afs_charge_preallocation(struct work_struct *);
-static struct sk_buff_head afs_incoming_calls;
-static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
-
-static void afs_async_workfn(struct work_struct *work)
-{
- struct afs_call *call = container_of(work, struct afs_call, async_work);
-
- call->async_workfn(call);
-}
+static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation);
static int afs_wait_atomic_t(atomic_t *p)
{
@@ -83,10 +77,8 @@ int afs_open_socket(void)
_enter("");
- skb_queue_head_init(&afs_incoming_calls);
-
ret = -ENOMEM;
- afs_async_calls = create_singlethread_workqueue("kafsd");
+ afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0);
if (!afs_async_calls)
goto error_0;
@@ -110,13 +102,15 @@ int afs_open_socket(void)
if (ret < 0)
goto error_2;
+ rxrpc_kernel_new_call_notification(socket, afs_rx_new_call,
+ afs_rx_discard_new_call);
+
ret = kernel_listen(socket, INT_MAX);
if (ret < 0)
goto error_2;
- rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
-
afs_socket = socket;
+ afs_charge_preallocation(NULL);
_leave(" = 0");
return 0;
@@ -136,52 +130,28 @@ void afs_close_socket(void)
{
_enter("");
+ if (afs_spare_incoming_call) {
+ atomic_inc(&afs_outstanding_calls);
+ afs_free_call(afs_spare_incoming_call);
+ afs_spare_incoming_call = NULL;
+ }
+
+ _debug("outstanding %u", atomic_read(&afs_outstanding_calls));
wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t,
TASK_UNINTERRUPTIBLE);
_debug("no outstanding calls");
+ flush_workqueue(afs_async_calls);
+ kernel_sock_shutdown(afs_socket, SHUT_RDWR);
+ flush_workqueue(afs_async_calls);
sock_release(afs_socket);
_debug("dework");
destroy_workqueue(afs_async_calls);
-
- ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0);
_leave("");
}
/*
- * Note that the data in a socket buffer is now consumed.
- */
-void afs_data_consumed(struct afs_call *call, struct sk_buff *skb)
-{
- if (!skb) {
- _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
- dump_stack();
- } else {
- _debug("DLVR %p{%u} [%d]",
- skb, skb->mark, atomic_read(&afs_outstanding_skbs));
- rxrpc_kernel_data_consumed(call->rxcall, skb);
- }
-}
-
-/*
- * free a socket buffer
- */
-static void afs_free_skb(struct sk_buff *skb)
-{
- if (!skb) {
- _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs));
- dump_stack();
- } else {
- _debug("FREE %p{%u} [%d]",
- skb, skb->mark, atomic_read(&afs_outstanding_skbs));
- if (atomic_dec_return(&afs_outstanding_skbs) == -1)
- BUG();
- rxrpc_kernel_free_skb(skb);
- }
-}
-
-/*
* free a call
*/
static void afs_free_call(struct afs_call *call)
@@ -191,7 +161,6 @@ static void afs_free_call(struct afs_call *call)
ASSERTCMP(call->rxcall, ==, NULL);
ASSERT(!work_pending(&call->async_work));
- ASSERT(skb_queue_empty(&call->rx_queue));
ASSERT(call->type->name != NULL);
kfree(call->request);
@@ -207,7 +176,7 @@ static void afs_free_call(struct afs_call *call)
static void afs_end_call_nofree(struct afs_call *call)
{
if (call->rxcall) {
- rxrpc_kernel_end_call(call->rxcall);
+ rxrpc_kernel_end_call(afs_socket, call->rxcall);
call->rxcall = NULL;
}
if (call->type->destructor)
@@ -227,7 +196,7 @@ static void afs_end_call(struct afs_call *call)
* allocate a call with flat request and reply buffers
*/
struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
- size_t request_size, size_t reply_size)
+ size_t request_size, size_t reply_max)
{
struct afs_call *call;
@@ -241,7 +210,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
call->type = type;
call->request_size = request_size;
- call->reply_max = reply_size;
+ call->reply_max = reply_max;
if (request_size) {
call->request = kmalloc(request_size, GFP_NOFS);
@@ -249,14 +218,13 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
goto nomem_free;
}
- if (reply_size) {
- call->buffer = kmalloc(reply_size, GFP_NOFS);
+ if (reply_max) {
+ call->buffer = kmalloc(reply_max, GFP_NOFS);
if (!call->buffer)
goto nomem_free;
}
init_waitqueue_head(&call->waitq);
- skb_queue_head_init(&call->rx_queue);
return call;
nomem_free:
@@ -325,8 +293,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
* returns from sending the request */
if (first + loop >= last)
call->state = AFS_CALL_AWAIT_REPLY;
- ret = rxrpc_kernel_send_data(call->rxcall, msg,
- to - offset);
+ ret = rxrpc_kernel_send_data(afs_socket, call->rxcall,
+ msg, to - offset);
kunmap(pages[loop]);
if (ret < 0)
break;
@@ -354,7 +322,6 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
struct msghdr msg;
struct kvec iov[1];
int ret;
- struct sk_buff *skb;
_enter("%x,{%d},", addr->s_addr, ntohs(call->port));
@@ -366,8 +333,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
atomic_read(&afs_outstanding_calls));
call->wait_mode = wait_mode;
- call->async_workfn = afs_process_async_call;
- INIT_WORK(&call->async_work, afs_async_workfn);
+ INIT_WORK(&call->async_work, afs_process_async_call);
memset(&srx, 0, sizeof(srx));
srx.srx_family = AF_RXRPC;
@@ -380,7 +346,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
/* create a call */
rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
- (unsigned long) call, gfp);
+ (unsigned long) call, gfp,
+ wait_mode->notify_rx);
call->key = NULL;
if (IS_ERR(rxcall)) {
ret = PTR_ERR(rxcall);
@@ -406,7 +373,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
* request */
if (!call->send_pages)
call->state = AFS_CALL_AWAIT_REPLY;
- ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
+ ret = rxrpc_kernel_send_data(afs_socket, rxcall,
+ &msg, call->request_size);
if (ret < 0)
goto error_do_abort;
@@ -421,9 +389,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
return wait_mode->wait(call);
error_do_abort:
- rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
- while ((skb = skb_dequeue(&call->rx_queue)))
- afs_free_skb(skb);
+ rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD");
error_kill_call:
afs_end_call(call);
_leave(" = %d", ret);
@@ -431,140 +397,77 @@ error_kill_call:
}
/*
- * Handles intercepted messages that were arriving in the socket's Rx queue.
- *
- * Called from the AF_RXRPC call processor in waitqueue process context. For
- * each call, it is guaranteed this will be called in order of packet to be
- * delivered.
- */
-static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
- struct sk_buff *skb)
-{
- struct afs_call *call = (struct afs_call *) user_call_ID;
-
- _enter("%p,,%u", call, skb->mark);
-
- _debug("ICPT %p{%u} [%d]",
- skb, skb->mark, atomic_read(&afs_outstanding_skbs));
-
- ASSERTCMP(sk, ==, afs_socket->sk);
- atomic_inc(&afs_outstanding_skbs);
-
- if (!call) {
- /* its an incoming call for our callback service */
- skb_queue_tail(&afs_incoming_calls, skb);
- queue_work(afs_wq, &afs_collect_incoming_call_work);
- } else {
- /* route the messages directly to the appropriate call */
- skb_queue_tail(&call->rx_queue, skb);
- call->wait_mode->rx_wakeup(call);
- }
-
- _leave("");
-}
-
-/*
* deliver messages to a call
*/
static void afs_deliver_to_call(struct afs_call *call)
{
- struct sk_buff *skb;
- bool last;
u32 abort_code;
int ret;
- _enter("");
-
- while ((call->state == AFS_CALL_AWAIT_REPLY ||
- call->state == AFS_CALL_AWAIT_OP_ID ||
- call->state == AFS_CALL_AWAIT_REQUEST ||
- call->state == AFS_CALL_AWAIT_ACK) &&
- (skb = skb_dequeue(&call->rx_queue))) {
- switch (skb->mark) {
- case RXRPC_SKB_MARK_DATA:
- _debug("Rcv DATA");
- last = rxrpc_kernel_is_data_last(skb);
- ret = call->type->deliver(call, skb, last);
- switch (ret) {
- case -EAGAIN:
- if (last) {
- _debug("short data");
- goto unmarshal_error;
- }
- break;
- case 0:
- ASSERT(last);
- if (call->state == AFS_CALL_AWAIT_REPLY)
- call->state = AFS_CALL_COMPLETE;
- break;
- case -ENOTCONN:
- abort_code = RX_CALL_DEAD;
- goto do_abort;
- case -ENOTSUPP:
- abort_code = RX_INVALID_OPERATION;
- goto do_abort;
- default:
- unmarshal_error:
- abort_code = RXGEN_CC_UNMARSHAL;
- if (call->state != AFS_CALL_AWAIT_REPLY)
- abort_code = RXGEN_SS_UNMARSHAL;
- do_abort:
- rxrpc_kernel_abort_call(call->rxcall,
- abort_code);
- call->error = ret;
- call->state = AFS_CALL_ERROR;
- break;
+ _enter("%s", call->type->name);
+
+ while (call->state == AFS_CALL_AWAIT_REPLY ||
+ call->state == AFS_CALL_AWAIT_OP_ID ||
+ call->state == AFS_CALL_AWAIT_REQUEST ||
+ call->state == AFS_CALL_AWAIT_ACK
+ ) {
+ if (call->state == AFS_CALL_AWAIT_ACK) {
+ size_t offset = 0;
+ ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall,
+ NULL, 0, &offset, false,
+ &call->abort_code);
+ if (ret == -EINPROGRESS || ret == -EAGAIN)
+ return;
+ if (ret == 1 || ret < 0) {
+ call->state = AFS_CALL_COMPLETE;
+ goto done;
}
- break;
- case RXRPC_SKB_MARK_FINAL_ACK:
- _debug("Rcv ACK");
- call->state = AFS_CALL_COMPLETE;
- break;
- case RXRPC_SKB_MARK_BUSY:
- _debug("Rcv BUSY");
- call->error = -EBUSY;
- call->state = AFS_CALL_BUSY;
- break;
- case RXRPC_SKB_MARK_REMOTE_ABORT:
- abort_code = rxrpc_kernel_get_abort_code(skb);
- call->error = call->type->abort_to_error(abort_code);
- call->state = AFS_CALL_ABORTED;
- _debug("Rcv ABORT %u -> %d", abort_code, call->error);
- break;
- case RXRPC_SKB_MARK_LOCAL_ABORT:
- abort_code = rxrpc_kernel_get_abort_code(skb);
- call->error = call->type->abort_to_error(abort_code);
- call->state = AFS_CALL_ABORTED;
- _debug("Loc ABORT %u -> %d", abort_code, call->error);
- break;
- case RXRPC_SKB_MARK_NET_ERROR:
- call->error = -rxrpc_kernel_get_error_number(skb);
- call->state = AFS_CALL_ERROR;
- _debug("Rcv NET ERROR %d", call->error);
- break;
- case RXRPC_SKB_MARK_LOCAL_ERROR:
- call->error = -rxrpc_kernel_get_error_number(skb);
- call->state = AFS_CALL_ERROR;
- _debug("Rcv LOCAL ERROR %d", call->error);
- break;
- default:
- BUG();
- break;
+ return;
}
- afs_free_skb(skb);
- }
-
- /* make sure the queue is empty if the call is done with (we might have
- * aborted the call early because of an unmarshalling error) */
- if (call->state >= AFS_CALL_COMPLETE) {
- while ((skb = skb_dequeue(&call->rx_queue)))
- afs_free_skb(skb);
- if (call->incoming)
- afs_end_call(call);
+ ret = call->type->deliver(call);
+ switch (ret) {
+ case 0:
+ if (call->state == AFS_CALL_AWAIT_REPLY)
+ call->state = AFS_CALL_COMPLETE;
+ goto done;
+ case -EINPROGRESS:
+ case -EAGAIN:
+ goto out;
+ case -ENOTCONN:
+ abort_code = RX_CALL_DEAD;
+ rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+ abort_code, -ret, "KNC");
+ goto do_abort;
+ case -ENOTSUPP:
+ abort_code = RX_INVALID_OPERATION;
+ rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+ abort_code, -ret, "KIV");
+ goto do_abort;
+ case -ENODATA:
+ case -EBADMSG:
+ case -EMSGSIZE:
+ default:
+ abort_code = RXGEN_CC_UNMARSHAL;
+ if (call->state != AFS_CALL_AWAIT_REPLY)
+ abort_code = RXGEN_SS_UNMARSHAL;
+ rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+ abort_code, EBADMSG, "KUM");
+ goto do_abort;
+ }
}
+done:
+ if (call->state == AFS_CALL_COMPLETE && call->incoming)
+ afs_end_call(call);
+out:
_leave("");
+ return;
+
+do_abort:
+ call->error = ret;
+ call->state = AFS_CALL_COMPLETE;
+ goto done;
}
/*
@@ -572,7 +475,7 @@ static void afs_deliver_to_call(struct afs_call *call)
*/
static int afs_wait_for_call_to_complete(struct afs_call *call)
{
- struct sk_buff *skb;
+ const char *abort_why;
int ret;
DECLARE_WAITQUEUE(myself, current);
@@ -584,15 +487,18 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
set_current_state(TASK_INTERRUPTIBLE);
/* deliver any messages that are in the queue */
- if (!skb_queue_empty(&call->rx_queue)) {
+ if (call->state < AFS_CALL_COMPLETE && call->need_attention) {
+ call->need_attention = false;
__set_current_state(TASK_RUNNING);
afs_deliver_to_call(call);
continue;
}
+ abort_why = "KWC";
ret = call->error;
- if (call->state >= AFS_CALL_COMPLETE)
+ if (call->state == AFS_CALL_COMPLETE)
break;
+ abort_why = "KWI";
ret = -EINTR;
if (signal_pending(current))
break;
@@ -605,9 +511,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
/* kill the call */
if (call->state < AFS_CALL_COMPLETE) {
_debug("call incomplete");
- rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD);
- while ((skb = skb_dequeue(&call->rx_queue)))
- afs_free_skb(skb);
+ rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+ RX_CALL_DEAD, -ret, abort_why);
}
_debug("call complete");
@@ -619,17 +524,24 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
/*
* wake up a waiting call
*/
-static void afs_wake_up_call_waiter(struct afs_call *call)
+static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall,
+ unsigned long call_user_ID)
{
+ struct afs_call *call = (struct afs_call *)call_user_ID;
+
+ call->need_attention = true;
wake_up(&call->waitq);
}
/*
* wake up an asynchronous call
*/
-static void afs_wake_up_async_call(struct afs_call *call)
+static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
+ unsigned long call_user_ID)
{
- _enter("");
+ struct afs_call *call = (struct afs_call *)call_user_ID;
+
+ call->need_attention = true;
queue_work(afs_async_calls, &call->async_work);
}
@@ -647,8 +559,10 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
/*
* delete an asynchronous call
*/
-static void afs_delete_async_call(struct afs_call *call)
+static void afs_delete_async_call(struct work_struct *work)
{
+ struct afs_call *call = container_of(work, struct afs_call, async_work);
+
_enter("");
afs_free_call(call);
@@ -658,17 +572,19 @@ static void afs_delete_async_call(struct afs_call *call)
/*
* perform processing on an asynchronous call
- * - on a multiple-thread workqueue this work item may try to run on several
- * CPUs at the same time
*/
-static void afs_process_async_call(struct afs_call *call)
+static void afs_process_async_call(struct work_struct *work)
{
+ struct afs_call *call = container_of(work, struct afs_call, async_work);
+
_enter("");
- if (!skb_queue_empty(&call->rx_queue))
+ if (call->state < AFS_CALL_COMPLETE && call->need_attention) {
+ call->need_attention = false;
afs_deliver_to_call(call);
+ }
- if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) {
+ if (call->state == AFS_CALL_COMPLETE && call->wait_mode) {
if (call->wait_mode->async_complete)
call->wait_mode->async_complete(call->reply,
call->error);
@@ -679,122 +595,93 @@ static void afs_process_async_call(struct afs_call *call)
/* we can't just delete the call because the work item may be
* queued */
- call->async_workfn = afs_delete_async_call;
+ call->async_work.func = afs_delete_async_call;
queue_work(afs_async_calls, &call->async_work);
}
_leave("");
}
-/*
- * Empty a socket buffer into a flat reply buffer.
- */
-int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last)
+static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID)
{
- size_t len = skb->len;
-
- if (len > call->reply_max - call->reply_size) {
- _leave(" = -EBADMSG [%zu > %u]",
- len, call->reply_max - call->reply_size);
- return -EBADMSG;
- }
+ struct afs_call *call = (struct afs_call *)user_call_ID;
- if (len > 0) {
- if (skb_copy_bits(skb, 0, call->buffer + call->reply_size,
- len) < 0)
- BUG();
- call->reply_size += len;
- }
-
- afs_data_consumed(call, skb);
- if (!last)
- return -EAGAIN;
-
- if (call->reply_size != call->reply_max) {
- _leave(" = -EBADMSG [%u != %u]",
- call->reply_size, call->reply_max);
- return -EBADMSG;
- }
- return 0;
+ call->rxcall = rxcall;
}
/*
- * accept the backlog of incoming calls
+ * Charge the incoming call preallocation.
*/
-static void afs_collect_incoming_call(struct work_struct *work)
+static void afs_charge_preallocation(struct work_struct *work)
{
- struct rxrpc_call *rxcall;
- struct afs_call *call = NULL;
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&afs_incoming_calls))) {
- _debug("new call");
-
- /* don't need the notification */
- afs_free_skb(skb);
+ struct afs_call *call = afs_spare_incoming_call;
+ for (;;) {
if (!call) {
call = kzalloc(sizeof(struct afs_call), GFP_KERNEL);
- if (!call) {
- rxrpc_kernel_reject_call(afs_socket);
- return;
- }
+ if (!call)
+ break;
- call->async_workfn = afs_process_async_call;
- INIT_WORK(&call->async_work, afs_async_workfn);
+ INIT_WORK(&call->async_work, afs_process_async_call);
call->wait_mode = &afs_async_incoming_call;
call->type = &afs_RXCMxxxx;
init_waitqueue_head(&call->waitq);
- skb_queue_head_init(&call->rx_queue);
call->state = AFS_CALL_AWAIT_OP_ID;
-
- _debug("CALL %p{%s} [%d]",
- call, call->type->name,
- atomic_read(&afs_outstanding_calls));
- atomic_inc(&afs_outstanding_calls);
}
- rxcall = rxrpc_kernel_accept_call(afs_socket,
- (unsigned long) call);
- if (!IS_ERR(rxcall)) {
- call->rxcall = rxcall;
- call = NULL;
- }
+ if (rxrpc_kernel_charge_accept(afs_socket,
+ afs_wake_up_async_call,
+ afs_rx_attach,
+ (unsigned long)call,
+ GFP_KERNEL) < 0)
+ break;
+ call = NULL;
}
+ afs_spare_incoming_call = call;
+}
+
+/*
+ * Discard a preallocated call when a socket is shut down.
+ */
+static void afs_rx_discard_new_call(struct rxrpc_call *rxcall,
+ unsigned long user_call_ID)
+{
+ struct afs_call *call = (struct afs_call *)user_call_ID;
- if (call)
- afs_free_call(call);
+ atomic_inc(&afs_outstanding_calls);
+ call->rxcall = NULL;
+ afs_free_call(call);
+}
+
+/*
+ * Notification of an incoming call.
+ */
+static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall,
+ unsigned long user_call_ID)
+{
+ atomic_inc(&afs_outstanding_calls);
+ queue_work(afs_wq, &afs_charge_preallocation_work);
}
/*
* Grab the operation ID from an incoming cache manager call. The socket
* buffer is discarded on error or if we don't yet have sufficient data.
*/
-static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
- bool last)
+static int afs_deliver_cm_op_id(struct afs_call *call)
{
- size_t len = skb->len;
- void *oibuf = (void *) &call->operation_ID;
+ int ret;
- _enter("{%u},{%zu},%d", call->offset, len, last);
+ _enter("{%zu}", call->offset);
ASSERTCMP(call->offset, <, 4);
/* the operation ID forms the first four bytes of the request data */
- len = min_t(size_t, len, 4 - call->offset);
- if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0)
- BUG();
- if (!pskb_pull(skb, len))
- BUG();
- call->offset += len;
-
- if (call->offset < 4) {
- afs_data_consumed(call, skb);
- _leave(" = -EAGAIN");
- return -EAGAIN;
- }
+ ret = afs_extract_data(call, &call->operation_ID, 4, true);
+ if (ret < 0)
+ return ret;
call->state = AFS_CALL_AWAIT_REQUEST;
+ call->offset = 0;
/* ask the cache manager to route the call (it'll change the call type
* if successful) */
@@ -803,7 +690,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
/* pass responsibility for the remainer of this message off to the
* cache manager op */
- return call->type->deliver(call, skb, last);
+ return call->type->deliver(call);
}
/*
@@ -823,14 +710,15 @@ void afs_send_empty_reply(struct afs_call *call)
msg.msg_flags = 0;
call->state = AFS_CALL_AWAIT_ACK;
- switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) {
+ switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) {
case 0:
_leave(" [replied]");
return;
case -ENOMEM:
_debug("oom");
- rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+ rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+ RX_USER_ABORT, ENOMEM, "KOO");
default:
afs_end_call(call);
_leave(" [error]");
@@ -859,7 +747,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
msg.msg_flags = 0;
call->state = AFS_CALL_AWAIT_ACK;
- n = rxrpc_kernel_send_data(call->rxcall, &msg, len);
+ n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len);
if (n >= 0) {
/* Success */
_leave(" [replied]");
@@ -868,7 +756,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
if (n == -ENOMEM) {
_debug("oom");
- rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+ rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+ RX_USER_ABORT, ENOMEM, "KOO");
}
afs_end_call(call);
_leave(" [error]");
@@ -877,25 +766,40 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
/*
* Extract a piece of data from the received data socket buffers.
*/
-int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
- bool last, void *buf, size_t count)
+int afs_extract_data(struct afs_call *call, void *buf, size_t count,
+ bool want_more)
{
- size_t len = skb->len;
+ int ret;
- _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count);
+ _enter("{%s,%zu},,%zu,%d",
+ call->type->name, call->offset, count, want_more);
- ASSERTCMP(call->offset, <, count);
+ ASSERTCMP(call->offset, <=, count);
- len = min_t(size_t, len, count - call->offset);
- if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 ||
- !pskb_pull(skb, len))
- BUG();
- call->offset += len;
+ ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall,
+ buf, count, &call->offset,
+ want_more, &call->abort_code);
+ if (ret == 0 || ret == -EAGAIN)
+ return ret;
- if (call->offset < count) {
- afs_data_consumed(call, skb);
- _leave(" = -EAGAIN");
- return -EAGAIN;
+ if (ret == 1) {
+ switch (call->state) {
+ case AFS_CALL_AWAIT_REPLY:
+ call->state = AFS_CALL_COMPLETE;
+ break;
+ case AFS_CALL_AWAIT_REQUEST:
+ call->state = AFS_CALL_REPLYING;
+ break;
+ default:
+ break;
+ }
+ return 0;
}
- return 0;
+
+ if (ret == -ECONNABORTED)
+ call->error = call->type->abort_to_error(call->abort_code);
+ else
+ call->error = ret;
+ call->state = AFS_CALL_COMPLETE;
+ return ret;
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index f342acf3547d..d4066ab7dd55 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -178,13 +178,18 @@ server_in_two_cells:
/*
* look up a server by its IP address
*/
-struct afs_server *afs_find_server(const struct in_addr *_addr)
+struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx)
{
struct afs_server *server = NULL;
struct rb_node *p;
- struct in_addr addr = *_addr;
+ struct in_addr addr = srx->transport.sin.sin_addr;
- _enter("%pI4", &addr.s_addr);
+ _enter("{%d,%pI4}", srx->transport.family, &addr.s_addr);
+
+ if (srx->transport.family != AF_INET) {
+ WARN(true, "AFS does not yes support non-IPv4 addresses\n");
+ return NULL;
+ }
read_lock(&afs_servers_lock);
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index f94d1abdc3eb..94bcd97d22b8 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -58,17 +58,16 @@ static int afs_vl_abort_to_error(u32 abort_code)
/*
* deliver reply data to a VL.GetEntryByXXX call
*/
-static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
- struct sk_buff *skb, bool last)
+static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call)
{
struct afs_cache_vlocation *entry;
__be32 *bp;
u32 tmp;
int loop, ret;
- _enter(",,%u", last);
+ _enter("");
- ret = afs_transfer_reply(call, skb, last);
+ ret = afs_transfer_reply(call);
if (ret < 0)
return ret;
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 52976785a32c..45a86396fd2d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -594,8 +594,8 @@ static void afs_vlocation_reaper(struct work_struct *work)
*/
int __init afs_vlocation_update_init(void)
{
- afs_vlocation_update_worker =
- create_singlethread_workqueue("kafs_vlupdated");
+ afs_vlocation_update_worker = alloc_workqueue("kafs_vlupdated",
+ WQ_MEM_RECLAIM, 0);
return afs_vlocation_update_worker ? 0 : -ENOMEM;
}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 14d506efd1aa..f865c3f05bea 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -398,8 +398,7 @@ no_more:
switch (ret) {
case -EDQUOT:
case -ENOSPC:
- set_bit(AS_ENOSPC,
- &wb->vnode->vfs_inode.i_mapping->flags);
+ mapping_set_error(wb->vnode->vfs_inode.i_mapping, -ENOSPC);
break;
case -EROFS:
case -EIO:
@@ -409,7 +408,7 @@ no_more:
case -ENOMEDIUM:
case -ENXIO:
afs_kill_pages(wb->vnode, true, first, last);
- set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags);
+ mapping_set_error(wb->vnode->vfs_inode.i_mapping, -EIO);
break;
case -EACCES:
case -EPERM:
diff --git a/fs/aio.c b/fs/aio.c
index 4fe81d1c60f9..1157e13a36d6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -274,14 +274,17 @@ __initcall(aio_setup);
static void put_aio_ring_file(struct kioctx *ctx)
{
struct file *aio_ring_file = ctx->aio_ring_file;
+ struct address_space *i_mapping;
+
if (aio_ring_file) {
truncate_setsize(aio_ring_file->f_inode, 0);
/* Prevent further access to the kioctx from migratepages */
- spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
- aio_ring_file->f_inode->i_mapping->private_data = NULL;
+ i_mapping = aio_ring_file->f_inode->i_mapping;
+ spin_lock(&i_mapping->private_lock);
+ i_mapping->private_data = NULL;
ctx->aio_ring_file = NULL;
- spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
+ spin_unlock(&i_mapping->private_lock);
fput(aio_ring_file);
}
diff --git a/fs/attr.c b/fs/attr.c
index 42bb42bb3c72..c902b3d53508 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -17,19 +17,22 @@
#include <linux/ima.h>
/**
- * inode_change_ok - check if attribute changes to an inode are allowed
- * @inode: inode to check
+ * setattr_prepare - check if attribute changes to a dentry are allowed
+ * @dentry: dentry to check
* @attr: attributes to change
*
* Check if we are allowed to change the attributes contained in @attr
- * in the given inode. This includes the normal unix access permission
- * checks, as well as checks for rlimits and others.
+ * in the given dentry. This includes the normal unix access permission
+ * checks, as well as checks for rlimits and others. The function also clears
+ * SGID bit from mode if user is not allowed to set it. Also file capabilities
+ * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set.
*
* Should be called as the first thing in ->setattr implementations,
* possibly after taking additional locks.
*/
-int inode_change_ok(const struct inode *inode, struct iattr *attr)
+int setattr_prepare(struct dentry *dentry, struct iattr *attr)
{
+ struct inode *inode = d_inode(dentry);
unsigned int ia_valid = attr->ia_valid;
/*
@@ -44,7 +47,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
/* If force is set do it anyway. */
if (ia_valid & ATTR_FORCE)
- return 0;
+ goto kill_priv;
/* Make sure a caller can chown. */
if ((ia_valid & ATTR_UID) &&
@@ -77,9 +80,19 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
return -EPERM;
}
+kill_priv:
+ /* User has permission for the change */
+ if (ia_valid & ATTR_KILL_PRIV) {
+ int error;
+
+ error = security_inode_killpriv(dentry);
+ if (error)
+ return error;
+ }
+
return 0;
}
-EXPORT_SYMBOL(inode_change_ok);
+EXPORT_SYMBOL(setattr_prepare);
/**
* inode_newsize_ok - may this inode be truncated to a given size
@@ -202,6 +215,21 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
return -EPERM;
}
+ /*
+ * If utimes(2) and friends are called with times == NULL (or both
+ * times are UTIME_NOW), then we need to check for write permission
+ */
+ if (ia_valid & ATTR_TOUCH) {
+ if (IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ if (!inode_owner_or_capable(inode)) {
+ error = inode_permission(inode, MAY_WRITE);
+ if (error)
+ return error;
+ }
+ }
+
if ((ia_valid & ATTR_MODE)) {
umode_t amode = attr->ia_mode;
/* Flag setting protected by i_mutex */
@@ -209,7 +237,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
inode->i_flags &= ~S_NOSEC;
}
- now = current_fs_time(inode->i_sb);
+ now = current_time(inode);
attr->ia_ctime = now;
if (!(ia_valid & ATTR_ATIME_SET))
@@ -217,13 +245,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
if (!(ia_valid & ATTR_MTIME_SET))
attr->ia_mtime = now;
if (ia_valid & ATTR_KILL_PRIV) {
- attr->ia_valid &= ~ATTR_KILL_PRIV;
- ia_valid &= ~ATTR_KILL_PRIV;
error = security_inode_need_killpriv(dentry);
- if (error > 0)
- error = security_inode_killpriv(dentry);
- if (error)
+ if (error < 0)
return error;
+ if (error == 0)
+ ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV;
}
/*
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a439548de785..a1fba4285277 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -20,7 +20,8 @@
#define AUTOFS_IOC_COUNT 32
#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION)
-#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11)
+#define AUTOFS_DEV_IOCTL_IOC_COUNT \
+ (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD)
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -33,8 +34,6 @@
#include <asm/current.h>
#include <linux/uaccess.h>
-/* #define DEBUG */
-
#ifdef pr_fmt
#undef pr_fmt
#endif
@@ -111,8 +110,6 @@ struct autofs_sb_info {
int max_proto;
unsigned long exp_timeout;
unsigned int type;
- int reghost_enabled;
- int needs_reghost;
struct super_block *sb;
struct mutex wq_mutex;
struct mutex pipe_mutex;
@@ -271,4 +268,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
}
}
-extern void autofs4_kill_sb(struct super_block *);
+void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c7fcc7438843..fc09eb77ddf3 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -75,7 +75,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
(param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
pr_warn("ioctl control interface version mismatch: "
- "kernel(%u.%u), user(%u.%u), cmd(%d)\n",
+ "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n",
AUTOFS_DEV_IOCTL_VERSION_MAJOR,
AUTOFS_DEV_IOCTL_VERSION_MINOR,
param->ver_major, param->ver_minor, cmd);
@@ -172,6 +172,17 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
return sbi;
}
+/* Return autofs dev ioctl version */
+static int autofs_dev_ioctl_version(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ /* This should have already been set. */
+ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
+ param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
+ return 0;
+}
+
/* Return autofs module protocol version */
static int autofs_dev_ioctl_protover(struct file *fp,
struct autofs_sb_info *sbi,
@@ -586,41 +597,25 @@ out:
static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
{
- static struct {
- int cmd;
- ioctl_fn fn;
- } _ioctls[] = {
- {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL},
- {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD),
- autofs_dev_ioctl_protover},
- {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD),
- autofs_dev_ioctl_protosubver},
- {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD),
- autofs_dev_ioctl_openmount},
- {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD),
- autofs_dev_ioctl_closemount},
- {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD),
- autofs_dev_ioctl_ready},
- {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD),
- autofs_dev_ioctl_fail},
- {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD),
- autofs_dev_ioctl_setpipefd},
- {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD),
- autofs_dev_ioctl_catatonic},
- {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD),
- autofs_dev_ioctl_timeout},
- {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD),
- autofs_dev_ioctl_requester},
- {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD),
- autofs_dev_ioctl_expire},
- {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD),
- autofs_dev_ioctl_askumount},
- {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD),
- autofs_dev_ioctl_ismountpoint}
+ static ioctl_fn _ioctls[] = {
+ autofs_dev_ioctl_version,
+ autofs_dev_ioctl_protover,
+ autofs_dev_ioctl_protosubver,
+ autofs_dev_ioctl_openmount,
+ autofs_dev_ioctl_closemount,
+ autofs_dev_ioctl_ready,
+ autofs_dev_ioctl_fail,
+ autofs_dev_ioctl_setpipefd,
+ autofs_dev_ioctl_catatonic,
+ autofs_dev_ioctl_timeout,
+ autofs_dev_ioctl_requester,
+ autofs_dev_ioctl_expire,
+ autofs_dev_ioctl_askumount,
+ autofs_dev_ioctl_ismountpoint,
};
unsigned int idx = cmd_idx(cmd);
- return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn;
+ return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx];
}
/* ioctl dispatcher */
@@ -642,7 +637,7 @@ static int _autofs_dev_ioctl(unsigned int command,
cmd = _IOC_NR(command);
if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) ||
- cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) {
+ cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) {
return -ENOTTY;
}
@@ -655,14 +650,11 @@ static int _autofs_dev_ioctl(unsigned int command,
if (err)
goto out;
- /* The validate routine above always sets the version */
- if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD)
- goto done;
-
fn = lookup_dev_ioctl(cmd);
if (!fn) {
pr_warn("unknown command 0x%08x\n", command);
- return -ENOTTY;
+ err = -ENOTTY;
+ goto out;
}
fp = NULL;
@@ -671,9 +663,11 @@ static int _autofs_dev_ioctl(unsigned int command,
/*
* For obvious reasons the openmount can't have a file
* descriptor yet. We don't take a reference to the
- * file during close to allow for immediate release.
+ * file during close to allow for immediate release,
+ * and the same for retrieving ioctl version.
*/
- if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
+ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD &&
+ cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) {
fp = fget(param->ioctlfd);
if (!fp) {
@@ -706,7 +700,6 @@ cont:
if (fp)
fput(fp);
-done:
if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE))
err = -EFAULT;
out:
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 61b21051bd5a..438b5bf675b6 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -274,6 +274,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
goto fail_dput;
}
+ /* Test versions first */
+ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
+ sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
+ pr_err("kernel does not match daemon version "
+ "daemon (%d, %d) kernel (%d, %d)\n",
+ sbi->min_proto, sbi->max_proto,
+ AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
+ goto fail_dput;
+ }
+
+ /* Establish highest kernel protocol version */
+ if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION)
+ sbi->version = AUTOFS_MAX_PROTO_VERSION;
+ else
+ sbi->version = sbi->max_proto;
+ sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
+
if (pgrp_set) {
sbi->oz_pgrp = find_get_pid(pgrp);
if (!sbi->oz_pgrp) {
@@ -291,29 +308,12 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
root_inode->i_fop = &autofs4_root_operations;
root_inode->i_op = &autofs4_dir_inode_operations;
- /* Couldn't this be tested earlier? */
- if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
- sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
- pr_err("kernel does not match daemon version "
- "daemon (%d, %d) kernel (%d, %d)\n",
- sbi->min_proto, sbi->max_proto,
- AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
- goto fail_dput;
- }
-
- /* Establish highest kernel protocol version */
- if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION)
- sbi->version = AUTOFS_MAX_PROTO_VERSION;
- else
- sbi->version = sbi->max_proto;
- sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
-
pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
if (!pipe) {
pr_err("could not open pipe file descriptor\n");
- goto fail_dput;
+ goto fail_put_pid;
}
ret = autofs_prepare_pipe(pipe);
if (ret < 0)
@@ -334,14 +334,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
fail_fput:
pr_err("pipe file descriptor does not contain proper ops\n");
fput(pipe);
- /* fall through */
+fail_put_pid:
+ put_pid(sbi->oz_pgrp);
fail_dput:
dput(root);
goto fail_free;
fail_ino:
- kfree(ino);
+ autofs4_free_ino(ino);
fail_free:
- put_pid(sbi->oz_pgrp);
kfree(sbi);
s->s_fs_info = NULL;
return ret;
@@ -359,7 +359,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
inode->i_uid = d_inode(sb->s_root)->i_uid;
inode->i_gid = d_inode(sb->s_root)->i_gid;
}
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_ino = get_next_ino();
if (S_ISDIR(mode)) {
@@ -368,7 +368,8 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
inode->i_fop = &autofs4_dir_operations;
} else if (S_ISLNK(mode)) {
inode->i_op = &autofs4_symlink_inode_operations;
- }
+ } else
+ WARN_ON(1);
return inode;
}
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index fa84bb8832e0..a11f73174877 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -577,8 +577,6 @@ static int autofs4_dir_symlink(struct inode *dir,
inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
if (!inode) {
kfree(cp);
- if (!dentry->d_fsdata)
- kfree(ino);
return -ENOMEM;
}
inode->i_private = cp;
@@ -591,7 +589,7 @@ static int autofs4_dir_symlink(struct inode *dir,
if (p_ino && !IS_ROOT(dentry))
atomic_inc(&p_ino->count);
- dir->i_mtime = CURRENT_TIME;
+ dir->i_mtime = current_time(dir);
return 0;
}
@@ -631,7 +629,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
d_inode(dentry)->i_size = 0;
clear_nlink(d_inode(dentry));
- dir->i_mtime = CURRENT_TIME;
+ dir->i_mtime = current_time(dir);
spin_lock(&sbi->lookup_lock);
__autofs4_add_expiring(dentry);
@@ -762,7 +760,7 @@ static int autofs4_dir_mkdir(struct inode *dir,
if (p_ino && !IS_ROOT(dentry))
atomic_inc(&p_ino->count);
inc_nlink(dir);
- dir->i_mtime = CURRENT_TIME;
+ dir->i_mtime = current_time(dir);
return 0;
}
@@ -842,7 +840,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
if (may_umount(mnt))
status = 1;
- pr_debug("returning %d\n", status);
+ pr_debug("may umount %d\n", status);
status = put_user(status, p);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 431fd7ee3488..e44271dfceb6 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -431,8 +431,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
memcpy(&wq->name, &qstr, sizeof(struct qstr));
wq->dev = autofs4_get_dev(sbi);
wq->ino = autofs4_get_ino(sbi);
- wq->uid = current_uid();
- wq->gid = current_gid();
+ wq->uid = current_real_cred()->uid;
+ wq->gid = current_real_cred()->gid;
wq->pid = pid;
wq->tgid = tgid;
wq->status = -EINTR; /* Status return if interrupted */
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 3ba385eaa26e..8712062275b8 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -100,29 +100,12 @@ static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
return -EIO;
}
-static int bad_inode_setxattr(struct dentry *dentry, struct inode *inode,
- const char *name, const void *value, size_t size, int flags)
-{
- return -EIO;
-}
-
-static ssize_t bad_inode_getxattr(struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- return -EIO;
-}
-
static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer,
size_t buffer_size)
{
return -EIO;
}
-static int bad_inode_removexattr(struct dentry *dentry, const char *name)
-{
- return -EIO;
-}
-
static const struct inode_operations bad_inode_ops =
{
.create = bad_inode_create,
@@ -133,7 +116,7 @@ static const struct inode_operations bad_inode_ops =
.mkdir = bad_inode_mkdir,
.rmdir = bad_inode_rmdir,
.mknod = bad_inode_mknod,
- .rename2 = bad_inode_rename2,
+ .rename = bad_inode_rename2,
.readlink = bad_inode_readlink,
/* follow_link must be no-op, otherwise unmounting this inode
won't work */
@@ -142,10 +125,7 @@ static const struct inode_operations bad_inode_ops =
.permission = bad_inode_permission,
.getattr = bad_inode_getattr,
.setattr = bad_inode_setattr,
- .setxattr = bad_inode_setxattr,
- .getxattr = bad_inode_getxattr,
.listxattr = bad_inode_listxattr,
- .removexattr = bad_inode_removexattr,
};
@@ -173,8 +153,9 @@ void make_bad_inode(struct inode *inode)
inode->i_mode = S_IFREG;
inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_fs_time(inode->i_sb);
+ current_time(inode);
inode->i_op = &bad_inode_ops;
+ inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &bad_file_ops;
}
EXPORT_SYMBOL(make_bad_inode);
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index e0f59263a96d..c6bad51d8ec7 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -43,7 +43,10 @@ struct befs_sb_info {
u32 ag_shift;
u32 num_ags;
- /* jornal log entry */
+ /* State of the superblock */
+ u32 flags;
+
+ /* Journal log entry */
befs_block_run log_blocks;
befs_off_t log_start;
befs_off_t log_end;
@@ -79,7 +82,7 @@ enum befs_err {
BEFS_BT_END,
BEFS_BT_EMPTY,
BEFS_BT_MATCH,
- BEFS_BT_PARMATCH,
+ BEFS_BT_OVERFLOW,
BEFS_BT_NOT_FOUND
};
@@ -140,18 +143,6 @@ befs_iaddrs_per_block(struct super_block *sb)
return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr);
}
-static inline int
-befs_iaddr_is_empty(const befs_inode_addr *iaddr)
-{
- return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len);
-}
-
-static inline size_t
-befs_brun_size(struct super_block *sb, befs_block_run run)
-{
- return BEFS_SB(sb)->block_size * run.len;
-}
-
#include "endian.h"
#endif /* _LINUX_BEFS_H */
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 307645f9e284..7e135ea73fdd 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -85,7 +85,7 @@ struct befs_btree_node {
};
/* local constants */
-static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
+static const befs_off_t BEFS_BT_INVAL = 0xffffffffffffffffULL;
/* local functions */
static int befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds,
@@ -156,8 +156,6 @@ befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds,
sup->max_depth = fs32_to_cpu(sb, od_sup->max_depth);
sup->data_type = fs32_to_cpu(sb, od_sup->data_type);
sup->root_node_ptr = fs64_to_cpu(sb, od_sup->root_node_ptr);
- sup->free_node_ptr = fs64_to_cpu(sb, od_sup->free_node_ptr);
- sup->max_size = fs64_to_cpu(sb, od_sup->max_size);
brelse(bh);
if (sup->magic != BEFS_BTREE_MAGIC) {
@@ -183,8 +181,8 @@ befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds,
* Calls befs_read_datastream to read in the indicated btree node and
* makes sure its header fields are in cpu byteorder, byteswapping if
* necessary.
- * Note: node->bh must be NULL when this function called first
- * time. Don't forget brelse(node->bh) after last call.
+ * Note: node->bh must be NULL when this function is called the first time.
+ * Don't forget brelse(node->bh) after last call.
*
* On success, returns BEFS_OK and *@node contains the btree node that
* starts at @node_off, with the node->head fields in cpu byte order.
@@ -244,7 +242,7 @@ befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds,
* Read the superblock and rootnode of the b+tree.
* Drill down through the interior nodes using befs_find_key().
* Once at the correct leaf node, use befs_find_key() again to get the
- * actuall value stored with the key.
+ * actual value stored with the key.
*/
int
befs_btree_find(struct super_block *sb, const befs_data_stream *ds,
@@ -283,9 +281,9 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds,
while (!befs_leafnode(this_node)) {
res = befs_find_key(sb, this_node, key, &node_off);
- if (res == BEFS_BT_NOT_FOUND)
+ /* if no key set, try the overflow node */
+ if (res == BEFS_BT_OVERFLOW)
node_off = this_node->head.overflow;
- /* if no match, go to overflow node */
if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
befs_error(sb, "befs_btree_find() failed to read "
"node at %llu", node_off);
@@ -293,15 +291,15 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds,
}
}
- /* at the correct leaf node now */
-
+ /* at a leaf node now, check if it is correct */
res = befs_find_key(sb, this_node, key, value);
brelse(this_node->bh);
kfree(this_node);
if (res != BEFS_BT_MATCH) {
- befs_debug(sb, "<--- %s Key %s not found", __func__, key);
+ befs_error(sb, "<--- %s Key %s not found", __func__, key);
+ befs_debug(sb, "<--- %s ERROR", __func__);
*value = 0;
return BEFS_BT_NOT_FOUND;
}
@@ -324,16 +322,12 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds,
* @findkey: Keystring to search for
* @value: If key is found, the value stored with the key is put here
*
- * finds exact match if one exists, and returns BEFS_BT_MATCH
- * If no exact match, finds first key in node that is greater
- * (alphabetically) than the search key and returns BEFS_BT_PARMATCH
- * (for partial match, I guess). Can you think of something better to
- * call it?
- *
- * If no key was a match or greater than the search key, return
- * BEFS_BT_NOT_FOUND.
+ * Finds exact match if one exists, and returns BEFS_BT_MATCH.
+ * If there is no match and node's value array is too small for key, return
+ * BEFS_BT_OVERFLOW.
+ * If no match and node should countain this key, return BEFS_BT_NOT_FOUND.
*
- * Use binary search instead of a linear.
+ * Uses binary search instead of a linear.
*/
static int
befs_find_key(struct super_block *sb, struct befs_btree_node *node,
@@ -348,18 +342,16 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node,
befs_debug(sb, "---> %s %s", __func__, findkey);
- *value = 0;
-
findkey_len = strlen(findkey);
- /* if node can not contain key, just skeep this node */
+ /* if node can not contain key, just skip this node */
last = node->head.all_key_count - 1;
thiskey = befs_bt_get_key(sb, node, last, &keylen);
eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len);
if (eq < 0) {
- befs_debug(sb, "<--- %s %s not found", __func__, findkey);
- return BEFS_BT_NOT_FOUND;
+ befs_debug(sb, "<--- node can't contain %s", findkey);
+ return BEFS_BT_OVERFLOW;
}
valarray = befs_bt_valarray(node);
@@ -387,12 +379,15 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node,
else
first = mid + 1;
}
+
+ /* return an existing value so caller can arrive to a leaf node */
if (eq < 0)
*value = fs64_to_cpu(sb, valarray[mid + 1]);
else
*value = fs64_to_cpu(sb, valarray[mid]);
- befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid);
- return BEFS_BT_PARMATCH;
+ befs_error(sb, "<--- %s %s not found", __func__, findkey);
+ befs_debug(sb, "<--- %s ERROR", __func__);
+ return BEFS_BT_NOT_FOUND;
}
/**
@@ -405,7 +400,7 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node,
* @keysize: Length of the returned key
* @value: Value stored with the returned key
*
- * Heres how it works: Key_no is the index of the key/value pair to
+ * Here's how it works: Key_no is the index of the key/value pair to
* return in keybuf/value.
* Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is
* the number of characters in the key (just a convenience).
@@ -422,7 +417,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds,
{
struct befs_btree_node *this_node;
befs_btree_super bt_super;
- befs_off_t node_off = 0;
+ befs_off_t node_off;
int cur_key;
fs64 *valarray;
char *keystart;
@@ -467,7 +462,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds,
while (key_sum + this_node->head.all_key_count <= key_no) {
/* no more nodes to look in: key_no is too large */
- if (this_node->head.right == befs_bt_inval) {
+ if (this_node->head.right == BEFS_BT_INVAL) {
*keysize = 0;
*value = 0;
befs_debug(sb,
@@ -541,7 +536,6 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds,
* @node_off: Pointer to offset of current node within datastream. Modified
* by the function.
*
- *
* Helper function for btree traverse. Moves the current position to the
* start of the first leaf node.
*
@@ -608,7 +602,7 @@ static int
befs_leafnode(struct befs_btree_node *node)
{
/* all interior nodes (and only interior nodes) have an overflow node */
- if (node->head.overflow == befs_bt_inval)
+ if (node->head.overflow == BEFS_BT_INVAL)
return 1;
else
return 0;
@@ -715,7 +709,7 @@ befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node,
*
* Returns 0 if @key1 and @key2 are equal.
* Returns >0 if @key1 is greater.
- * Returns <0 if @key2 is greater..
+ * Returns <0 if @key2 is greater.
*/
static int
befs_compare_strings(const void *key1, int keylen1,
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index af1bc19b7c85..b4c7ba013c0d 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -22,22 +22,22 @@ const befs_inode_addr BAD_IADDR = { 0, 0, 0 };
static int befs_find_brun_direct(struct super_block *sb,
const befs_data_stream *data,
- befs_blocknr_t blockno, befs_block_run * run);
+ befs_blocknr_t blockno, befs_block_run *run);
static int befs_find_brun_indirect(struct super_block *sb,
const befs_data_stream *data,
befs_blocknr_t blockno,
- befs_block_run * run);
+ befs_block_run *run);
static int befs_find_brun_dblindirect(struct super_block *sb,
const befs_data_stream *data,
befs_blocknr_t blockno,
- befs_block_run * run);
+ befs_block_run *run);
/**
* befs_read_datastream - get buffer_head containing data, starting from pos.
* @sb: Filesystem superblock
- * @ds: datastrem to find data with
+ * @ds: datastream to find data with
* @pos: start of data
* @off: offset of data in buffer_head->b_data
*
@@ -46,7 +46,7 @@ static int befs_find_brun_dblindirect(struct super_block *sb,
*/
struct buffer_head *
befs_read_datastream(struct super_block *sb, const befs_data_stream *ds,
- befs_off_t pos, uint * off)
+ befs_off_t pos, uint *off)
{
struct buffer_head *bh;
befs_block_run run;
@@ -75,7 +75,13 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds,
return bh;
}
-/*
+/**
+ * befs_fblock2brun - give back block run for fblock
+ * @sb: the superblock
+ * @data: datastream to read from
+ * @fblock: the blocknumber with the file position to find
+ * @run: The found run is passed back through this pointer
+ *
* Takes a file position and gives back a brun who's starting block
* is block number fblock of the file.
*
@@ -88,7 +94,7 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds,
*/
int
befs_fblock2brun(struct super_block *sb, const befs_data_stream *data,
- befs_blocknr_t fblock, befs_block_run * run)
+ befs_blocknr_t fblock, befs_block_run *run)
{
int err;
befs_off_t pos = fblock << BEFS_SB(sb)->block_shift;
@@ -115,7 +121,7 @@ befs_fblock2brun(struct super_block *sb, const befs_data_stream *data,
/**
* befs_read_lsmylink - read long symlink from datastream.
* @sb: Filesystem superblock
- * @ds: Datastrem to read from
+ * @ds: Datastream to read from
* @buff: Buffer in which to place long symlink data
* @len: Length of the long symlink in bytes
*
@@ -128,6 +134,7 @@ befs_read_lsymlink(struct super_block *sb, const befs_data_stream *ds,
befs_off_t bytes_read = 0; /* bytes readed */
u16 plen;
struct buffer_head *bh;
+
befs_debug(sb, "---> %s length: %llu", __func__, len);
while (bytes_read < len) {
@@ -183,13 +190,13 @@ befs_count_blocks(struct super_block *sb, const befs_data_stream *ds)
metablocks += ds->indirect.len;
/*
- Double indir block, plus all the indirect blocks it mapps
- In the double-indirect range, all block runs of data are
- BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know
- how many data block runs are in the double-indirect region,
- and from that we know how many indirect blocks it takes to
- map them. We assume that the indirect blocks are also
- BEFS_DBLINDIR_BRUN_LEN blocks long.
+ * Double indir block, plus all the indirect blocks it maps.
+ * In the double-indirect range, all block runs of data are
+ * BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know
+ * how many data block runs are in the double-indirect region,
+ * and from that we know how many indirect blocks it takes to
+ * map them. We assume that the indirect blocks are also
+ * BEFS_DBLINDIR_BRUN_LEN blocks long.
*/
if (ds->size > ds->max_indirect_range && ds->max_indirect_range != 0) {
uint dbl_bytes;
@@ -212,58 +219,50 @@ befs_count_blocks(struct super_block *sb, const befs_data_stream *ds)
return blocks;
}
-/*
- Finds the block run that starts at file block number blockno
- in the file represented by the datastream data, if that
- blockno is in the direct region of the datastream.
-
- sb: the superblock
- data: the datastream
- blockno: the blocknumber to find
- run: The found run is passed back through this pointer
-
- Return value is BEFS_OK if the blockrun is found, BEFS_ERR
- otherwise.
-
- Algorithm:
- Linear search. Checks each element of array[] to see if it
- contains the blockno-th filesystem block. This is necessary
- because the block runs map variable amounts of data. Simply
- keeps a count of the number of blocks searched so far (sum),
- incrementing this by the length of each block run as we come
- across it. Adds sum to *count before returning (this is so
- you can search multiple arrays that are logicaly one array,
- as in the indirect region code).
-
- When/if blockno is found, if blockno is inside of a block
- run as stored on disk, we offset the start and length members
- of the block run, so that blockno is the start and len is
- still valid (the run ends in the same place).
-
- 2001-11-15 Will Dyson
-*/
+/**
+ * befs_find_brun_direct - find a direct block run in the datastream
+ * @sb: the superblock
+ * @data: the datastream
+ * @blockno: the blocknumber to find
+ * @run: The found run is passed back through this pointer
+ *
+ * Finds the block run that starts at file block number blockno
+ * in the file represented by the datastream data, if that
+ * blockno is in the direct region of the datastream.
+ *
+ * Return value is BEFS_OK if the blockrun is found, BEFS_ERR
+ * otherwise.
+ *
+ * Algorithm:
+ * Linear search. Checks each element of array[] to see if it
+ * contains the blockno-th filesystem block. This is necessary
+ * because the block runs map variable amounts of data. Simply
+ * keeps a count of the number of blocks searched so far (sum),
+ * incrementing this by the length of each block run as we come
+ * across it. Adds sum to *count before returning (this is so
+ * you can search multiple arrays that are logicaly one array,
+ * as in the indirect region code).
+ *
+ * When/if blockno is found, if blockno is inside of a block
+ * run as stored on disk, we offset the start and length members
+ * of the block run, so that blockno is the start and len is
+ * still valid (the run ends in the same place).
+ */
static int
befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data,
- befs_blocknr_t blockno, befs_block_run * run)
+ befs_blocknr_t blockno, befs_block_run *run)
{
int i;
const befs_block_run *array = data->direct;
befs_blocknr_t sum;
- befs_blocknr_t max_block =
- data->max_direct_range >> BEFS_SB(sb)->block_shift;
befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
- if (blockno > max_block) {
- befs_error(sb, "%s passed block outside of direct region",
- __func__);
- return BEFS_ERR;
- }
-
for (i = 0, sum = 0; i < BEFS_NUM_DIRECT_BLOCKS;
sum += array[i].len, i++) {
if (blockno >= sum && blockno < sum + (array[i].len)) {
int offset = blockno - sum;
+
run->allocation_group = array[i].allocation_group;
run->start = array[i].start + offset;
run->len = array[i].len - offset;
@@ -275,38 +274,39 @@ befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data,
}
}
+ befs_error(sb, "%s failed to find file block %lu", __func__,
+ (unsigned long)blockno);
befs_debug(sb, "---> %s ERROR", __func__);
return BEFS_ERR;
}
-/*
- Finds the block run that starts at file block number blockno
- in the file represented by the datastream data, if that
- blockno is in the indirect region of the datastream.
-
- sb: the superblock
- data: the datastream
- blockno: the blocknumber to find
- run: The found run is passed back through this pointer
-
- Return value is BEFS_OK if the blockrun is found, BEFS_ERR
- otherwise.
-
- Algorithm:
- For each block in the indirect run of the datastream, read
- it in and search through it for search_blk.
-
- XXX:
- Really should check to make sure blockno is inside indirect
- region.
-
- 2001-11-15 Will Dyson
-*/
+/**
+ * befs_find_brun_indirect - find a block run in the datastream
+ * @sb: the superblock
+ * @data: the datastream
+ * @blockno: the blocknumber to find
+ * @run: The found run is passed back through this pointer
+ *
+ * Finds the block run that starts at file block number blockno
+ * in the file represented by the datastream data, if that
+ * blockno is in the indirect region of the datastream.
+ *
+ * Return value is BEFS_OK if the blockrun is found, BEFS_ERR
+ * otherwise.
+ *
+ * Algorithm:
+ * For each block in the indirect run of the datastream, read
+ * it in and search through it for search_blk.
+ *
+ * XXX:
+ * Really should check to make sure blockno is inside indirect
+ * region.
+ */
static int
befs_find_brun_indirect(struct super_block *sb,
const befs_data_stream *data,
befs_blocknr_t blockno,
- befs_block_run * run)
+ befs_block_run *run)
{
int i, j;
befs_blocknr_t sum = 0;
@@ -326,11 +326,12 @@ befs_find_brun_indirect(struct super_block *sb,
/* Examine blocks of the indirect run one at a time */
for (i = 0; i < indirect.len; i++) {
- indirblock = befs_bread(sb, indirblockno + i);
+ indirblock = sb_bread(sb, indirblockno + i);
if (indirblock == NULL) {
- befs_debug(sb, "---> %s failed to read "
+ befs_error(sb, "---> %s failed to read "
"disk block %lu from the indirect brun",
__func__, (unsigned long)indirblockno + i);
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -370,52 +371,51 @@ befs_find_brun_indirect(struct super_block *sb,
return BEFS_ERR;
}
-/*
- Finds the block run that starts at file block number blockno
- in the file represented by the datastream data, if that
- blockno is in the double-indirect region of the datastream.
-
- sb: the superblock
- data: the datastream
- blockno: the blocknumber to find
- run: The found run is passed back through this pointer
-
- Return value is BEFS_OK if the blockrun is found, BEFS_ERR
- otherwise.
-
- Algorithm:
- The block runs in the double-indirect region are different.
- They are always allocated 4 fs blocks at a time, so each
- block run maps a constant amount of file data. This means
- that we can directly calculate how many block runs into the
- double-indirect region we need to go to get to the one that
- maps a particular filesystem block.
-
- We do this in two stages. First we calculate which of the
- inode addresses in the double-indirect block will point us
- to the indirect block that contains the mapping for the data,
- then we calculate which of the inode addresses in that
- indirect block maps the data block we are after.
-
- Oh, and once we've done that, we actually read in the blocks
- that contain the inode addresses we calculated above. Even
- though the double-indirect run may be several blocks long,
- we can calculate which of those blocks will contain the index
- we are after and only read that one. We then follow it to
- the indirect block and perform a similar process to find
- the actual block run that maps the data block we are interested
- in.
-
- Then we offset the run as in befs_find_brun_array() and we are
- done.
-
- 2001-11-15 Will Dyson
-*/
+/**
+ * befs_find_brun_dblindirect - find a block run in the datastream
+ * @sb: the superblock
+ * @data: the datastream
+ * @blockno: the blocknumber to find
+ * @run: The found run is passed back through this pointer
+ *
+ * Finds the block run that starts at file block number blockno
+ * in the file represented by the datastream data, if that
+ * blockno is in the double-indirect region of the datastream.
+ *
+ * Return value is BEFS_OK if the blockrun is found, BEFS_ERR
+ * otherwise.
+ *
+ * Algorithm:
+ * The block runs in the double-indirect region are different.
+ * They are always allocated 4 fs blocks at a time, so each
+ * block run maps a constant amount of file data. This means
+ * that we can directly calculate how many block runs into the
+ * double-indirect region we need to go to get to the one that
+ * maps a particular filesystem block.
+ *
+ * We do this in two stages. First we calculate which of the
+ * inode addresses in the double-indirect block will point us
+ * to the indirect block that contains the mapping for the data,
+ * then we calculate which of the inode addresses in that
+ * indirect block maps the data block we are after.
+ *
+ * Oh, and once we've done that, we actually read in the blocks
+ * that contain the inode addresses we calculated above. Even
+ * though the double-indirect run may be several blocks long,
+ * we can calculate which of those blocks will contain the index
+ * we are after and only read that one. We then follow it to
+ * the indirect block and perform a similar process to find
+ * the actual block run that maps the data block we are interested
+ * in.
+ *
+ * Then we offset the run as in befs_find_brun_array() and we are
+ * done.
+ */
static int
befs_find_brun_dblindirect(struct super_block *sb,
const befs_data_stream *data,
befs_blocknr_t blockno,
- befs_block_run * run)
+ befs_block_run *run)
{
int dblindir_indx;
int indir_indx;
@@ -430,10 +430,9 @@ befs_find_brun_dblindirect(struct super_block *sb,
struct buffer_head *indir_block;
befs_block_run indir_run;
befs_disk_inode_addr *iaddr_array;
- struct befs_sb_info *befs_sb = BEFS_SB(sb);
befs_blocknr_t indir_start_blk =
- data->max_indirect_range >> befs_sb->block_shift;
+ data->max_indirect_range >> BEFS_SB(sb)->block_shift;
off_t dbl_indir_off = blockno - indir_start_blk;
@@ -471,7 +470,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
}
dbl_indir_block =
- befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) +
+ sb_bread(sb, iaddr2blockno(sb, &data->double_indirect) +
dbl_which_block);
if (dbl_indir_block == NULL) {
befs_error(sb, "%s couldn't read the "
@@ -479,7 +478,6 @@ befs_find_brun_dblindirect(struct super_block *sb,
(unsigned long)
iaddr2blockno(sb, &data->double_indirect) +
dbl_which_block);
- brelse(dbl_indir_block);
return BEFS_ERR;
}
@@ -499,12 +497,11 @@ befs_find_brun_dblindirect(struct super_block *sb,
}
indir_block =
- befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block);
+ sb_bread(sb, iaddr2blockno(sb, &indir_run) + which_block);
if (indir_block == NULL) {
befs_error(sb, "%s couldn't read the indirect block "
"at blockno %lu", __func__, (unsigned long)
iaddr2blockno(sb, &indir_run) + which_block);
- brelse(indir_block);
return BEFS_ERR;
}
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 4de7cffcd662..85c13392e9e8 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -169,6 +169,7 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks));
befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks));
+ befs_debug(sb, " inode_size %u", fs32_to_cpu(sb, sup->inode_size));
befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2));
befs_debug(sb, " blocks_per_ag %u",
diff --git a/fs/befs/io.c b/fs/befs/io.c
index 523c8af2d770..b4a558126ee1 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -27,7 +27,7 @@ struct buffer_head *
befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
{
struct buffer_head *bh;
- befs_blocknr_t block = 0;
+ befs_blocknr_t block;
struct befs_sb_info *befs_sb = BEFS_SB(sb);
befs_debug(sb, "---> Enter %s "
@@ -59,27 +59,3 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
befs_debug(sb, "<--- %s ERROR", __func__);
return NULL;
}
-
-struct buffer_head *
-befs_bread(struct super_block *sb, befs_blocknr_t block)
-{
- struct buffer_head *bh;
-
- befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
-
- bh = sb_bread(sb, block);
-
- if (bh == NULL) {
- befs_error(sb, "Failed to read block %lu",
- (unsigned long)block);
- goto error;
- }
-
- befs_debug(sb, "<--- %s", __func__);
-
- return bh;
-
- error:
- befs_debug(sb, "<--- %s ERROR", __func__);
- return NULL;
-}
diff --git a/fs/befs/io.h b/fs/befs/io.h
index 9b78266b6aa5..78d7bc6e60de 100644
--- a/fs/befs/io.h
+++ b/fs/befs/io.h
@@ -5,5 +5,3 @@
struct buffer_head *befs_bread_iaddr(struct super_block *sb,
befs_inode_addr iaddr);
-struct buffer_head *befs_bread(struct super_block *sb, befs_blocknr_t block);
-
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 7da05b159ade..647a276eba56 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -120,7 +120,7 @@ befs_get_block(struct inode *inode, sector_t block,
struct super_block *sb = inode->i_sb;
befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
befs_block_run run = BAD_IADDR;
- int res = 0;
+ int res;
ulong disk_off;
befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld",
@@ -179,15 +179,16 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
kfree(utfname);
} else {
- ret = befs_btree_find(sb, ds, dentry->d_name.name, &offset);
+ ret = befs_btree_find(sb, ds, name, &offset);
}
if (ret == BEFS_BT_NOT_FOUND) {
befs_debug(sb, "<--- %s %pd not found", __func__, dentry);
+ d_add(dentry, NULL);
return ERR_PTR(-ENOENT);
} else if (ret != BEFS_OK || offset == 0) {
- befs_warning(sb, "<--- %s Error", __func__);
+ befs_error(sb, "<--- %s Error", __func__);
return ERR_PTR(-ENODATA);
}
@@ -211,56 +212,55 @@ befs_readdir(struct file *file, struct dir_context *ctx)
befs_off_t value;
int result;
size_t keysize;
- unsigned char d_type;
char keybuf[BEFS_NAME_LEN + 1];
befs_debug(sb, "---> %s name %pD, inode %ld, ctx->pos %lld",
__func__, file, inode->i_ino, ctx->pos);
-more:
- result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
- keybuf, &keysize, &value);
+ while (1) {
+ result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
+ keybuf, &keysize, &value);
- if (result == BEFS_ERR) {
- befs_debug(sb, "<--- %s ERROR", __func__);
- befs_error(sb, "IO error reading %pD (inode %lu)",
- file, inode->i_ino);
- return -EIO;
-
- } else if (result == BEFS_BT_END) {
- befs_debug(sb, "<--- %s END", __func__);
- return 0;
-
- } else if (result == BEFS_BT_EMPTY) {
- befs_debug(sb, "<--- %s Empty directory", __func__);
- return 0;
- }
+ if (result == BEFS_ERR) {
+ befs_debug(sb, "<--- %s ERROR", __func__);
+ befs_error(sb, "IO error reading %pD (inode %lu)",
+ file, inode->i_ino);
+ return -EIO;
- d_type = DT_UNKNOWN;
+ } else if (result == BEFS_BT_END) {
+ befs_debug(sb, "<--- %s END", __func__);
+ return 0;
- /* Convert to NLS */
- if (BEFS_SB(sb)->nls) {
- char *nlsname;
- int nlsnamelen;
- result =
- befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
- if (result < 0) {
- befs_debug(sb, "<--- %s ERROR", __func__);
- return result;
+ } else if (result == BEFS_BT_EMPTY) {
+ befs_debug(sb, "<--- %s Empty directory", __func__);
+ return 0;
}
- if (!dir_emit(ctx, nlsname, nlsnamelen,
- (ino_t) value, d_type)) {
+
+ /* Convert to NLS */
+ if (BEFS_SB(sb)->nls) {
+ char *nlsname;
+ int nlsnamelen;
+
+ result =
+ befs_utf2nls(sb, keybuf, keysize, &nlsname,
+ &nlsnamelen);
+ if (result < 0) {
+ befs_debug(sb, "<--- %s ERROR", __func__);
+ return result;
+ }
+ if (!dir_emit(ctx, nlsname, nlsnamelen,
+ (ino_t) value, DT_UNKNOWN)) {
+ kfree(nlsname);
+ return 0;
+ }
kfree(nlsname);
- return 0;
+ } else {
+ if (!dir_emit(ctx, keybuf, keysize,
+ (ino_t) value, DT_UNKNOWN))
+ return 0;
}
- kfree(nlsname);
- } else {
- if (!dir_emit(ctx, keybuf, keysize,
- (ino_t) value, d_type))
- return 0;
+ ctx->pos++;
}
- ctx->pos++;
- goto more;
}
static struct inode *
@@ -299,7 +299,6 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
struct befs_sb_info *befs_sb = BEFS_SB(sb);
struct befs_inode_info *befs_ino;
struct inode *inode;
- long ret = -EIO;
befs_debug(sb, "---> %s inode = %lu", __func__, ino);
@@ -318,7 +317,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
befs_ino->i_inode_num.allocation_group,
befs_ino->i_inode_num.start, befs_ino->i_inode_num.len);
- bh = befs_bread(sb, inode->i_ino);
+ bh = sb_bread(sb, inode->i_ino);
if (!bh) {
befs_error(sb, "unable to read inode block - "
"inode = %lu", inode->i_ino);
@@ -421,7 +420,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
unacquire_none:
iget_failed(inode);
befs_debug(sb, "<--- %s - Bad inode", __func__);
- return ERR_PTR(ret);
+ return ERR_PTR(-EIO);
}
/* Initialize the inode cache. Called at fs setup.
@@ -436,10 +435,9 @@ befs_init_inodecache(void)
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
- if (befs_inode_cachep == NULL) {
- pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
+ if (befs_inode_cachep == NULL)
return -ENOMEM;
- }
+
return 0;
}
@@ -524,8 +522,6 @@ befs_utf2nls(struct super_block *sb, const char *in,
*out = result = kmalloc(maxlen, GFP_NOFS);
if (!*out) {
- befs_error(sb, "%s cannot allocate memory", __func__);
- *out_len = 0;
return -ENOMEM;
}
@@ -604,7 +600,6 @@ befs_nls2utf(struct super_block *sb, const char *in,
*out = result = kmalloc(maxlen, GFP_NOFS);
if (!*out) {
- befs_error(sb, "%s cannot allocate memory", __func__);
*out_len = 0;
return -ENOMEM;
}
@@ -637,10 +632,6 @@ befs_nls2utf(struct super_block *sb, const char *in,
return -EILSEQ;
}
-/**
- * Use the
- *
- */
enum {
Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
};
@@ -760,19 +751,19 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
long ret = -EINVAL;
const unsigned long sb_block = 0;
const off_t x86_sb_off = 512;
+ int blocksize;
save_mount_options(sb, data);
sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
- if (sb->s_fs_info == NULL) {
- pr_err("(%s): Unable to allocate memory for private "
- "portion of superblock. Bailing.\n", sb->s_id);
+ if (sb->s_fs_info == NULL)
goto unacquire_none;
- }
+
befs_sb = BEFS_SB(sb);
if (!parse_options((char *) data, &befs_sb->mount_opts)) {
- befs_error(sb, "cannot parse mount options");
+ if (!silent)
+ befs_error(sb, "cannot parse mount options");
goto unacquire_priv_sbp;
}
@@ -789,14 +780,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
* Will be set to real fs blocksize later.
*
* Linux 2.4.10 and later refuse to read blocks smaller than
- * the hardsect size for the device. But we also need to read at
+ * the logical block size for the device. But we also need to read at
* least 1k to get the second 512 bytes of the volume.
* -WD 10-26-01
*/
- sb_min_blocksize(sb, 1024);
+ blocksize = sb_min_blocksize(sb, 1024);
+ if (!blocksize) {
+ if (!silent)
+ befs_error(sb, "unable to set blocksize");
+ goto unacquire_priv_sbp;
+ }
if (!(bh = sb_bread(sb, sb_block))) {
- befs_error(sb, "unable to read superblock");
+ if (!silent)
+ befs_error(sb, "unable to read superblock");
goto unacquire_priv_sbp;
}
@@ -820,9 +817,9 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
if( befs_sb->num_blocks > ~((sector_t)0) ) {
- befs_error(sb, "blocks count: %llu "
- "is larger than the host can use",
- befs_sb->num_blocks);
+ if (!silent)
+ befs_error(sb, "blocks count: %llu is larger than the host can use",
+ befs_sb->num_blocks);
goto unacquire_priv_sbp;
}
@@ -841,7 +838,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_root = d_make_root(root);
if (!sb->s_root) {
- befs_error(sb, "get root inode failed");
+ if (!silent)
+ befs_error(sb, "get root inode failed");
goto unacquire_priv_sbp;
}
@@ -870,9 +868,9 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
unacquire_priv_sbp:
kfree(befs_sb->mount_opts.iocharset);
kfree(sb->s_fs_info);
+ sb->s_fs_info = NULL;
unacquire_none:
- sb->s_fs_info = NULL;
return ret;
}
diff --git a/fs/befs/super.c b/fs/befs/super.c
index aeafc4d84278..7c50025c99d8 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -13,24 +13,20 @@
#include "befs.h"
#include "super.h"
-/**
- * load_befs_sb -- Read from disk and properly byteswap all the fields
+/*
+ * befs_load_sb -- Read from disk and properly byteswap all the fields
* of the befs superblock
- *
- *
- *
- *
*/
int
-befs_load_sb(struct super_block *sb, befs_super_block * disk_sb)
+befs_load_sb(struct super_block *sb, befs_super_block *disk_sb)
{
struct befs_sb_info *befs_sb = BEFS_SB(sb);
/* Check the byte order of the filesystem */
if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE)
- befs_sb->byte_order = BEFS_BYTESEX_LE;
+ befs_sb->byte_order = BEFS_BYTESEX_LE;
else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE)
- befs_sb->byte_order = BEFS_BYTESEX_BE;
+ befs_sb->byte_order = BEFS_BYTESEX_BE;
befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1);
befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2);
@@ -45,6 +41,8 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb)
befs_sb->ag_shift = fs32_to_cpu(sb, disk_sb->ag_shift);
befs_sb->num_ags = fs32_to_cpu(sb, disk_sb->num_ags);
+ befs_sb->flags = fs32_to_cpu(sb, disk_sb->flags);
+
befs_sb->log_blocks = fsrun_to_cpu(sb, disk_sb->log_blocks);
befs_sb->log_start = fs64_to_cpu(sb, disk_sb->log_start);
befs_sb->log_end = fs64_to_cpu(sb, disk_sb->log_end);
@@ -84,15 +82,15 @@ befs_check_sb(struct super_block *sb)
}
if (befs_sb->block_size > PAGE_SIZE) {
- befs_error(sb, "blocksize(%u) cannot be larger"
+ befs_error(sb, "blocksize(%u) cannot be larger "
"than system pagesize(%lu)", befs_sb->block_size,
PAGE_SIZE);
return BEFS_ERR;
}
/*
- * block_shift and block_size encode the same information
- * in different ways as a consistency check.
+ * block_shift and block_size encode the same information
+ * in different ways as a consistency check.
*/
if ((1 << befs_sb->block_shift) != befs_sb->block_size) {
@@ -101,10 +99,18 @@ befs_check_sb(struct super_block *sb)
return BEFS_ERR;
}
- if (befs_sb->log_start != befs_sb->log_end) {
+
+ /* ag_shift also encodes the same information as blocks_per_ag in a
+ * different way, non-fatal consistency check
+ */
+ if ((1 << befs_sb->ag_shift) != befs_sb->blocks_per_ag)
+ befs_error(sb, "ag_shift disagrees with blocks_per_ag.");
+
+ if (befs_sb->log_start != befs_sb->log_end ||
+ befs_sb->flags == BEFS_DIRTY) {
befs_error(sb, "Filesystem not clean! There are blocks in the "
- "journal. You must boot into BeOS and mount this volume "
- "to make it clean.");
+ "journal. You must boot into BeOS and mount this "
+ "volume to make it clean.");
return BEFS_ERR;
}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 34a5bc2f1290..3e5ac30e8b6f 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
set_bit(ino, info->si_imap);
info->si_freei--;
inode_init_owner(inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_blocks = 0;
inode->i_op = &bfs_file_inops;
inode->i_fop = &bfs_file_operations;
@@ -165,7 +165,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
return err;
}
inc_nlink(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
ihold(inode);
d_instantiate(new, inode);
@@ -194,7 +194,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
}
de->ino = 0;
mark_buffer_dirty_inode(bh, dir);
- dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
mark_inode_dirty(dir);
inode->i_ctime = dir->i_ctime;
inode_dec_link_count(inode);
@@ -207,7 +207,8 @@ out_brelse:
}
static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *old_inode, *new_inode;
struct buffer_head *old_bh, *new_bh;
@@ -215,6 +216,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct bfs_sb_info *info;
int error = -ENOENT;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
old_bh = new_bh = NULL;
old_inode = d_inode(old_dentry);
if (S_ISDIR(old_inode->i_mode))
@@ -249,10 +253,10 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
}
old_de->ino = 0;
- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+ old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
mark_inode_dirty(old_dir);
if (new_inode) {
- new_inode->i_ctime = CURRENT_TIME_SEC;
+ new_inode->i_ctime = current_time(new_inode);
inode_dec_link_count(new_inode);
}
mark_buffer_dirty_inode(old_bh, old_dir);
@@ -300,9 +304,9 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name,
pos = (block - sblock) * BFS_BSIZE + off;
if (pos >= dir->i_size) {
dir->i_size += BFS_DIRENT_SIZE;
- dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_ctime = current_time(dir);
}
- dir->i_mtime = CURRENT_TIME_SEC;
+ dir->i_mtime = current_time(dir);
mark_inode_dirty(dir);
de->ino = cpu_to_le16((u16)ino);
for (i = 0; i < BFS_NAMELEN; i++)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e5495f37c6ed..2472af2798c7 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1624,20 +1624,12 @@ static void do_thread_regset_writeback(struct task_struct *task,
regset->writeback(task, regset, 1);
}
-#ifndef PR_REG_SIZE
-#define PR_REG_SIZE(S) sizeof(S)
-#endif
-
#ifndef PRSTATUS_SIZE
-#define PRSTATUS_SIZE(S) sizeof(S)
-#endif
-
-#ifndef PR_REG_PTR
-#define PR_REG_PTR(S) (&((S)->pr_reg))
+#define PRSTATUS_SIZE(S, R) sizeof(S)
#endif
#ifndef SET_PR_FPVALID
-#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V))
+#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V))
#endif
static int fill_thread_core_info(struct elf_thread_core_info *t,
@@ -1645,6 +1637,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
long signr, size_t *total)
{
unsigned int i;
+ unsigned int regset_size = view->regsets[0].n * view->regsets[0].size;
/*
* NT_PRSTATUS is the one special case, because the regset data
@@ -1653,12 +1646,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
* We assume that regset 0 is NT_PRSTATUS.
*/
fill_prstatus(&t->prstatus, t->task, signr);
- (void) view->regsets[0].get(t->task, &view->regsets[0],
- 0, PR_REG_SIZE(t->prstatus.pr_reg),
- PR_REG_PTR(&t->prstatus), NULL);
+ (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size,
+ &t->prstatus.pr_reg, NULL);
fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
- PRSTATUS_SIZE(t->prstatus), &t->prstatus);
+ PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus);
*total += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1688,7 +1680,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
regset->core_note_type,
size, data);
else {
- SET_PR_FPVALID(&t->prstatus, 1);
+ SET_PR_FPVALID(&t->prstatus,
+ 1, regset_size);
fill_note(&t->notes[i], "CORE",
NT_PRFPREG, size, data);
}
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 6103a6362ccd..9b4688ab1d8e 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -584,7 +584,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_fs_time(inode->i_sb);
+ current_time(inode);
}
return inode;
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 08ae99343d92..05b553368bb4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -30,6 +30,7 @@
#include <linux/cleancache.h>
#include <linux/dax.h>
#include <linux/badblocks.h>
+#include <linux/falloc.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -180,9 +181,6 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct inode *inode = bdev_file_inode(file);
- if (IS_DAX(inode))
- return dax_do_io(iocb, inode, iter, blkdev_get_block,
- NULL, DIO_SKIP_DIO_COUNT);
return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
blkdev_get_block, NULL, NULL,
DIO_SKIP_DIO_COUNT);
@@ -302,14 +300,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
error = sb->s_op->thaw_super(sb);
else
error = thaw_super(sb);
- if (error) {
+ if (error)
bdev->bd_fsfreeze_count++;
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return error;
- }
out:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return 0;
+ return error;
}
EXPORT_SYMBOL(thaw_bdev);
@@ -1275,7 +1270,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- bdev->bd_inode->i_flags = 0;
if (!partno) {
ret = -ENXIO;
@@ -1303,11 +1297,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret) {
+ if (!ret)
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
- if (!bdev_dax_capable(bdev))
- bdev->bd_inode->i_flags &= ~S_DAX;
- }
/*
* If the device is invalidated, rescan partition
@@ -1342,8 +1333,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
- if (!bdev_dax_capable(bdev))
- bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
if (bdev->bd_contains == bdev) {
@@ -1787,6 +1776,81 @@ static const struct address_space_operations def_blk_aops = {
.is_dirty_writeback = buffer_check_dirty_writeback,
};
+#define BLKDEV_FALLOC_FL_SUPPORTED \
+ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
+
+static long blkdev_fallocate(struct file *file, int mode, loff_t start,
+ loff_t len)
+{
+ struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct address_space *mapping;
+ loff_t end = start + len - 1;
+ loff_t isize;
+ int error;
+
+ /* Fail if we don't recognize the flags. */
+ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
+ return -EOPNOTSUPP;
+
+ /* Don't go off the end of the device. */
+ isize = i_size_read(bdev->bd_inode);
+ if (start >= isize)
+ return -EINVAL;
+ if (end >= isize) {
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ len = isize - start;
+ end = start + len - 1;
+ } else
+ return -EINVAL;
+ }
+
+ /*
+ * Don't allow IO that isn't aligned to logical block size.
+ */
+ if ((start | len) & (bdev_logical_block_size(bdev) - 1))
+ return -EINVAL;
+
+ /* Invalidate the page cache, including dirty pages. */
+ mapping = bdev->bd_inode->i_mapping;
+ truncate_inode_pages_range(mapping, start, end);
+
+ switch (mode) {
+ case FALLOC_FL_ZERO_RANGE:
+ case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
+ error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+ GFP_KERNEL, false);
+ break;
+ case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
+ /* Only punch if the device can do zeroing discard. */
+ if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data)
+ return -EOPNOTSUPP;
+ error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
+ GFP_KERNEL, 0);
+ break;
+ case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+ error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
+ GFP_KERNEL, 0);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Invalidate again; if someone wandered in and dirtied a page,
+ * the caller will be given -EBUSY. The third argument is
+ * inclusive, so the rounding here is safe.
+ */
+ return invalidate_inode_pages2_range(mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+}
+
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
@@ -1801,6 +1865,7 @@ const struct file_operations def_blk_fops = {
#endif
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
+ .fallocate = blkdev_fallocate,
};
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 53bb7af4e5f0..247b8dfaf6e5 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -79,11 +79,9 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
- ret = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (ret < 0)
+ ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (ret)
return ret;
- if (ret == 0)
- acl = NULL;
}
ret = 0;
break;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ccc70d96958d..d4d8b7e36b2f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -698,7 +698,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
if (ret) {
- bio->bi_error = ret;
+ comp_bio->bi_error = ret;
bio_endio(comp_bio);
}
@@ -728,7 +728,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
if (ret) {
- bio->bi_error = ret;
+ comp_bio->bi_error = ret;
bio_endio(comp_bio);
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9d8edcb0813c..0b8ce2b9f7d0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3181,7 +3181,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
-int btrfs_inode_check_errors(struct inode *inode);
extern const struct dentry_operations btrfs_dentry_operations;
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 72a180d3503e..3a14c87d9c92 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1782,7 +1782,7 @@ static void update_time_for_write(struct inode *inode)
if (IS_NOCMTIME(inode))
return;
- now = current_fs_time(inode->i_sb);
+ now = current_time(inode);
if (!timespec_equal(&inode->i_mtime, &now))
inode->i_mtime = now;
@@ -2065,7 +2065,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* flags for any errors that might have happened while doing
* writeback of file data.
*/
- ret = btrfs_inode_check_errors(inode);
+ ret = filemap_check_errors(inode->i_mapping);
inode_unlock(inode);
goto out;
}
@@ -2603,7 +2603,7 @@ out_trans:
goto out_free;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
trans->block_rsv = &root->fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
@@ -2867,7 +2867,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
} else {
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
i_size_write(inode, actual_end);
btrfs_ordered_update_i_size(inode, actual_end, NULL);
ret = btrfs_update_inode(trans, root, inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 50ba4ca167e7..2b790bda7998 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4071,7 +4071,7 @@ err:
inode_inc_iversion(inode);
inode_inc_iversion(dir);
inode->i_ctime = dir->i_mtime =
- dir->i_ctime = current_fs_time(inode->i_sb);
+ dir->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
@@ -4214,7 +4214,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -4977,7 +4977,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
inode_inc_iversion(inode);
if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
inode->i_ctime = inode->i_mtime =
- current_fs_time(inode->i_sb);
+ current_time(inode);
}
if (newsize > oldsize) {
@@ -5084,7 +5084,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
if (btrfs_root_readonly(root))
return -EROFS;
- err = inode_change_ok(inode, attr);
+ err = setattr_prepare(dentry, attr);
if (err)
return err;
@@ -5684,7 +5684,7 @@ static struct inode *new_simple_dir(struct super_block *s,
inode->i_op = &btrfs_dir_ro_inode_operations;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = current_fs_time(inode->i_sb);
+ inode->i_mtime = current_time(inode);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
@@ -6270,7 +6270,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
inode_init_owner(inode, dir, mode);
inode_set_bytes(inode, 0);
- inode->i_mtime = current_fs_time(inode->i_sb);
+ inode->i_mtime = current_time(inode);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
@@ -6384,7 +6384,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
name_len * 2);
inode_inc_iversion(parent_inode);
parent_inode->i_mtime = parent_inode->i_ctime =
- current_fs_time(parent_inode->i_sb);
+ current_time(parent_inode);
ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -6602,7 +6602,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
BTRFS_I(inode)->dir_index = 0ULL;
inc_nlink(inode);
inode_inc_iversion(inode);
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
@@ -8427,7 +8427,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
if (!bio)
return -ENOMEM;
- bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf);
+ bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio));
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
@@ -8465,7 +8465,8 @@ next_block:
start_sector, GFP_NOFS);
if (!bio)
goto out_err;
- bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf);
+ bio_set_op_attrs(bio, bio_op(orig_bio),
+ bio_flags(orig_bio));
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
@@ -8633,7 +8634,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
goto out;
/* If this is a write we don't need to check anymore */
- if (iov_iter_rw(iter) == WRITE)
+ if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
return 0;
/*
* Check to make sure we don't have duplicate iov_base's in this
@@ -9508,7 +9509,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
- struct timespec ctime = CURRENT_TIME;
+ struct timespec ctime = current_time(old_inode);
struct dentry *parent;
u64 old_ino = btrfs_ino(old_inode);
u64 new_ino = btrfs_ino(new_inode);
@@ -9876,7 +9877,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
inode_inc_iversion(old_inode);
old_dir->i_ctime = old_dir->i_mtime =
new_dir->i_ctime = new_dir->i_mtime =
- old_inode->i_ctime = current_fs_time(old_dir->i_sb);
+ old_inode->i_ctime = current_time(old_dir);
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
@@ -9901,7 +9902,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode) {
inode_inc_iversion(new_inode);
- new_inode->i_ctime = current_fs_time(new_inode->i_sb);
+ new_inode->i_ctime = current_time(new_inode);
if (unlikely(btrfs_ino(new_inode) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
root_objectid = BTRFS_I(new_inode)->location.objectid;
@@ -10419,7 +10420,7 @@ next:
*alloc_hint = ins.objectid + ins.offset;
inode_inc_iversion(inode);
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(actual_len > inode->i_size) &&
@@ -10559,21 +10560,6 @@ out_inode:
}
-/* Inspired by filemap_check_errors() */
-int btrfs_inode_check_errors(struct inode *inode)
-{
- int ret = 0;
-
- if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
- test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
- ret = -ENOSPC;
- if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
- test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
- ret = -EIO;
-
- return ret;
-}
-
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -10582,14 +10568,11 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.link = btrfs_link,
.mkdir = btrfs_mkdir,
.rmdir = btrfs_rmdir,
- .rename2 = btrfs_rename2,
+ .rename = btrfs_rename2,
.symlink = btrfs_symlink,
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = generic_removexattr,
.permission = btrfs_permission,
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
@@ -10663,10 +10646,7 @@ static const struct address_space_operations btrfs_symlink_aops = {
static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = generic_removexattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
.get_acl = btrfs_get_acl,
@@ -10677,10 +10657,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = generic_removexattr,
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
@@ -10691,10 +10668,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
- .removexattr = generic_removexattr,
.update_time = btrfs_update_time,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index af69129d7e0e..18e1aa0f85f5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -349,7 +349,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
btrfs_update_iflags(inode);
inode_inc_iversion(inode);
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, inode);
btrfs_end_transaction(trans, root);
@@ -445,7 +445,7 @@ static noinline int create_subvol(struct inode *dir,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
- struct timespec cur_time = current_fs_time(dir->i_sb);
+ struct timespec cur_time = current_time(dir);
struct inode *inode;
int ret;
int err;
@@ -3292,7 +3292,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
inode_inc_iversion(inode);
if (!no_time_update)
- inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
/*
* We round up to the block size at eof when determining which
* extents to clone above, but shouldn't round up the file size.
@@ -5107,7 +5107,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
struct btrfs_trans_handle *trans;
- struct timespec ct = current_fs_time(inode->i_sb);
+ struct timespec ct = current_time(inode);
int ret = 0;
int received_uuid_changed;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 0b4628999b77..71261b459863 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4335,7 +4335,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
int ret;
struct send_ctx *sctx = ctx;
struct fs_path *p;
- posix_acl_xattr_header dummy_acl;
+ struct posix_acl_xattr_header dummy_acl;
p = fs_path_alloc();
if (!p)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e66a18ed4588..9517de0e668c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1470,7 +1470,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
parent_root = BTRFS_I(parent_inode)->root;
record_root_in_trans(trans, parent_root, 0);
- cur_time = current_fs_time(parent_inode->i_sb);
+ cur_time = current_time(parent_inode);
/*
* insert the directory item
@@ -1626,7 +1626,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->i_size +
dentry->d_name.len * 2);
parent_inode->i_mtime = parent_inode->i_ctime =
- current_fs_time(parent_inode->i_sb);
+ current_time(parent_inode);
ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4b1c0a6eee04..3d33c4e41e5f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3953,7 +3953,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
* i_mapping flags, so that the next fsync won't get
* an outdated io error too.
*/
- btrfs_inode_check_errors(inode);
+ filemap_check_errors(inode->i_mapping);
*ordered_io_error = true;
break;
}
@@ -4190,7 +4190,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
* without writing to the log tree and the fsync must report the
* file data write error and not commit the current transaction.
*/
- ret = btrfs_inode_check_errors(inode);
+ ret = filemap_check_errors(inode->i_mapping);
if (ret)
ctx->io_err = ret;
process:
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index d1a177a3dbe8..fccbf5567e78 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -252,7 +252,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
goto out;
inode_inc_iversion(inode);
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
diff --git a/fs/buffer.c b/fs/buffer.c
index 9c8eb9b6db6a..b205a629001d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -351,7 +351,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost async page write");
- set_bit(AS_EIO, &page->mapping->flags);
+ mapping_set_error(page->mapping, -EIO);
set_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
SetPageError(page);
@@ -1078,7 +1078,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
return grow_dev_page(bdev, block, index, size, sizebits, gfp);
}
-struct buffer_head *
+static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
@@ -1109,7 +1109,6 @@ __getblk_slow(struct block_device *bdev, sector_t block,
free_more_memory();
}
}
-EXPORT_SYMBOL(__getblk_slow);
/*
* The relationship between dirty buffers and dirty pages:
@@ -3250,7 +3249,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
bh = head;
do {
if (buffer_write_io_error(bh) && page->mapping)
- set_bit(AS_EIO, &page->mapping->flags);
+ mapping_set_error(page->mapping, -EIO);
if (buffer_busy(bh))
goto failed;
bh = bh->b_this_page;
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 6af790fc3df8..3ff867f87d73 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -20,6 +20,7 @@
#include <linux/mount.h>
#include <linux/statfs.h>
#include <linux/ctype.h>
+#include <linux/xattr.h>
#include "internal.h"
static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches);
@@ -126,8 +127,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
if (d_is_negative(root) ||
!d_backing_inode(root)->i_op->lookup ||
!d_backing_inode(root)->i_op->mkdir ||
- !d_backing_inode(root)->i_op->setxattr ||
- !d_backing_inode(root)->i_op->getxattr ||
+ !(d_backing_inode(root)->i_opflags & IOP_XATTR) ||
!root->d_sb->s_op->statfs ||
!root->d_sb->s_op->sync_fs)
goto error_unsupported;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index ce5f345d70f5..e7f16a77a22a 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -253,6 +253,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
struct cachefiles_object *object;
struct cachefiles_cache *cache;
const struct cred *saved_cred;
+ struct inode *inode;
+ blkcnt_t i_blocks = 0;
ASSERT(_object);
@@ -279,6 +281,10 @@ static void cachefiles_drop_object(struct fscache_object *_object)
_object != cache->cache.fsdef
) {
_debug("- retire object OBJ%x", object->fscache.debug_id);
+ inode = d_backing_inode(object->dentry);
+ if (inode)
+ i_blocks = inode->i_blocks;
+
cachefiles_begin_secure(cache, &saved_cred);
cachefiles_delete_object(cache, object);
cachefiles_end_secure(cache, saved_cred);
@@ -292,7 +298,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
/* note that the object is now inactive */
if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
- cachefiles_mark_object_inactive(cache, object);
+ cachefiles_mark_object_inactive(cache, object, i_blocks);
dput(object->dentry);
object->dentry = NULL;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 2fcde1a34b7c..cd1effee8a49 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -160,7 +160,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
* namei.c
*/
extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
- struct cachefiles_object *object);
+ struct cachefiles_object *object,
+ blkcnt_t i_blocks);
extern int cachefiles_delete_object(struct cachefiles_cache *cache,
struct cachefiles_object *object);
extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 3f7c2cd41f8f..41df8a27d7eb 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -20,6 +20,7 @@
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/slab.h>
+#include <linux/xattr.h>
#include "internal.h"
#define CACHEFILES_KEYBUF_SIZE 512
@@ -261,10 +262,9 @@ requeue:
* Mark an object as being inactive.
*/
void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
- struct cachefiles_object *object)
+ struct cachefiles_object *object,
+ blkcnt_t i_blocks)
{
- blkcnt_t i_blocks = d_backing_inode(object->dentry)->i_blocks;
-
write_lock(&cache->active_lock);
rb_erase(&object->active_node, &cache->active_nodes);
clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
@@ -707,7 +707,8 @@ mark_active_timed_out:
check_error:
_debug("check error %d", ret);
- cachefiles_mark_object_inactive(cache, object);
+ cachefiles_mark_object_inactive(
+ cache, object, d_backing_inode(object->dentry)->i_blocks);
release_dentry:
dput(object->dentry);
object->dentry = NULL;
@@ -799,13 +800,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
}
ret = -EPERM;
- if (!d_backing_inode(subdir)->i_op->setxattr ||
- !d_backing_inode(subdir)->i_op->getxattr ||
+ if (!(d_backing_inode(subdir)->i_opflags & IOP_XATTR) ||
!d_backing_inode(subdir)->i_op->lookup ||
!d_backing_inode(subdir)->i_op->mkdir ||
!d_backing_inode(subdir)->i_op->create ||
- (!d_backing_inode(subdir)->i_op->rename &&
- !d_backing_inode(subdir)->i_op->rename2) ||
+ !d_backing_inode(subdir)->i_op->rename ||
!d_backing_inode(subdir)->i_op->rmdir ||
!d_backing_inode(subdir)->i_op->unlink)
goto check_error;
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 4f67227f69a5..987044bca1c2 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -95,11 +95,9 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
- ret = posix_acl_equiv_mode(acl, &new_mode);
- if (ret < 0)
+ ret = posix_acl_update_mode(inode, &new_mode, &acl);
+ if (ret)
goto out;
- if (ret == 0)
- acl = NULL;
}
break;
case ACL_TYPE_DEFAULT:
@@ -127,6 +125,11 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
goto out_free;
}
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto out_free;
+ }
+
if (new_mode != old_mode) {
newattrs.ia_mode = new_mode;
newattrs.ia_valid = ATTR_MODE;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d5b6f959a3c3..ef3ebd780aff 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -175,9 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
static int ceph_releasepage(struct page *page, gfp_t g)
{
- dout("%p releasepage %p idx %lu\n", page->mapping->host,
- page, page->index);
- WARN_ON(PageDirty(page));
+ dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host,
+ page, page->index, PageDirty(page) ? "" : "not ");
/* Can we release the page from the cache? */
if (!ceph_release_fscache_page(page, g))
@@ -298,14 +297,6 @@ unlock:
kfree(osd_data->pages);
}
-static void ceph_unlock_page_vector(struct page **pages, int num_pages)
-{
- int i;
-
- for (i = 0; i < num_pages; i++)
- unlock_page(pages[i]);
-}
-
/*
* start an async read(ahead) operation. return nr_pages we submitted
* a read for on success, or negative error code.
@@ -370,6 +361,10 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
dout("start_read %p add_to_page_cache failed %p\n",
inode, page);
nr_pages = i;
+ if (nr_pages > 0) {
+ len = nr_pages << PAGE_SHIFT;
+ break;
+ }
goto out_pages;
}
pages[i] = page;
@@ -386,8 +381,11 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
return nr_pages;
out_pages:
- ceph_unlock_page_vector(pages, nr_pages);
- ceph_release_page_vector(pages, nr_pages);
+ for (i = 0; i < nr_pages; ++i) {
+ ceph_fscache_readpage_cancel(inode, pages[i]);
+ unlock_page(pages[i]);
+ }
+ ceph_put_page_vector(pages, nr_pages, false);
out:
ceph_osdc_put_request(req);
return ret;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index df4b3e6fa563..78180d151730 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1061,7 +1061,8 @@ out:
}
static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1069,6 +1070,9 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
int op = CEPH_MDS_OP_RENAME;
int err;
+ if (flags)
+ return -EINVAL;
+
if (ceph_snap(old_dir) != ceph_snap(new_dir))
return -EXDEV;
if (ceph_snap(old_dir) != CEPH_NOSNAP) {
@@ -1486,10 +1490,7 @@ const struct inode_operations ceph_dir_iops = {
.permission = ceph_permission,
.getattr = ceph_getattr,
.setattr = ceph_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ceph_listxattr,
- .removexattr = generic_removexattr,
.get_acl = ceph_get_acl,
.set_acl = ceph_set_acl,
.mknod = ceph_mknod,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0f5375d8e030..18630e800208 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -886,7 +886,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
int num_pages = 0;
int flags;
int ret;
- struct timespec mtime = current_fs_time(inode->i_sb);
+ struct timespec mtime = current_time(inode);
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
bool write = iov_iter_rw(iter) == WRITE;
@@ -902,10 +902,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
return ret;
if (write) {
- ret = invalidate_inode_pages2_range(inode->i_mapping,
+ int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(pos + count) >> PAGE_SHIFT);
- if (ret < 0)
+ if (ret2 < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
flags = CEPH_OSD_FLAG_ORDERSNAP |
@@ -1091,7 +1091,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
int flags;
int check_caps = 0;
int ret;
- struct timespec mtime = current_fs_time(inode->i_sb);
+ struct timespec mtime = current_time(inode);
size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -1272,7 +1272,8 @@ again:
statret = __ceph_do_getattr(inode, page,
CEPH_STAT_CAP_INLINE_DATA, !!page);
if (statret < 0) {
- __free_page(page);
+ if (page)
+ __free_page(page);
if (statret == -ENODATA) {
BUG_ON(retry_op != READ_INLINE);
goto again;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index dd3a6dbf71eb..ef4d04647325 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -94,10 +94,7 @@ const struct inode_operations ceph_file_iops = {
.permission = ceph_permission,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ceph_listxattr,
- .removexattr = generic_removexattr,
.get_acl = ceph_get_acl,
.set_acl = ceph_set_acl,
};
@@ -1514,7 +1511,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
}
- if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
+ if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 &&
+ !(rinfo->hash_order && req->r_path2)) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
@@ -1885,10 +1883,7 @@ static const struct inode_operations ceph_symlink_iops = {
.get_link = simple_get_link,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ceph_listxattr,
- .removexattr = generic_removexattr,
};
int __ceph_setattr(struct inode *inode, struct iattr *attr)
@@ -1905,13 +1900,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
int inode_dirty_flags = 0;
bool lock_snap_rwsem = false;
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
- err = inode_change_ok(inode, attr);
- if (err != 0)
- return err;
-
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
@@ -2080,7 +2068,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
}
release &= issued;
@@ -2124,7 +2112,17 @@ out_put:
*/
int ceph_setattr(struct dentry *dentry, struct iattr *attr)
{
- return __ceph_setattr(d_inode(dentry), attr);
+ struct inode *inode = d_inode(dentry);
+ int err;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ err = setattr_prepare(dentry, attr);
+ if (err != 0)
+ return err;
+
+ return __ceph_setattr(inode, attr);
}
/*
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index a2cb0c254060..6806dbeaee19 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -210,8 +210,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
/* No mandatory locks */
- if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
+ if (fl->fl_type & LOCK_MAND)
+ return -EOPNOTSUPP;
dout("ceph_flock, fl_file: %p", fl->fl_file);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f72d4ae303b2..815acd1a56d4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -370,6 +370,7 @@ const char *ceph_session_state_name(int s)
case CEPH_MDS_SESSION_CLOSING: return "closing";
case CEPH_MDS_SESSION_RESTARTING: return "restarting";
case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
+ case CEPH_MDS_SESSION_REJECTED: return "rejected";
default: return "???";
}
}
@@ -1150,8 +1151,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
while (!list_empty(&ci->i_cap_flush_list)) {
cf = list_first_entry(&ci->i_cap_flush_list,
struct ceph_cap_flush, i_list);
- list_del(&cf->i_list);
- list_add(&cf->i_list, &to_remove);
+ list_move(&cf->i_list, &to_remove);
}
spin_lock(&mdsc->cap_dirty_lock);
@@ -1378,7 +1378,7 @@ static int request_close_session(struct ceph_mds_client *mdsc,
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
- return 0;
+ return 1;
}
/*
@@ -2131,6 +2131,10 @@ static int __do_request(struct ceph_mds_client *mdsc,
ceph_session_state_name(session->s_state));
if (session->s_state != CEPH_MDS_SESSION_OPEN &&
session->s_state != CEPH_MDS_SESSION_HUNG) {
+ if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
+ err = -EACCES;
+ goto out_session;
+ }
if (session->s_state == CEPH_MDS_SESSION_NEW ||
session->s_state == CEPH_MDS_SESSION_CLOSING)
__open_session(mdsc, session);
@@ -2652,6 +2656,15 @@ static void handle_session(struct ceph_mds_session *session,
wake_up_session_caps(session, 0);
break;
+ case CEPH_SESSION_REJECT:
+ WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
+ pr_info("mds%d rejected session\n", session->s_mds);
+ session->s_state = CEPH_MDS_SESSION_REJECTED;
+ cleanup_session_requests(mdsc, session);
+ remove_session_caps(session);
+ wake = 2; /* for good measure */
+ break;
+
default:
pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
WARN_ON(1);
@@ -3557,11 +3570,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
/*
* true if all sessions are closed, or we force unmount
*/
-static bool done_closing_sessions(struct ceph_mds_client *mdsc)
+static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
{
if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return true;
- return atomic_read(&mdsc->num_sessions) == 0;
+ return atomic_read(&mdsc->num_sessions) <= skipped;
}
/*
@@ -3572,6 +3585,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
struct ceph_options *opts = mdsc->fsc->client->options;
struct ceph_mds_session *session;
int i;
+ int skipped = 0;
dout("close_sessions\n");
@@ -3583,7 +3597,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
continue;
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
- __close_session(mdsc, session);
+ if (__close_session(mdsc, session) <= 0)
+ skipped++;
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
mutex_lock(&mdsc->mutex);
@@ -3591,7 +3606,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex);
dout("waiting for sessions to close\n");
- wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
+ wait_event_timeout(mdsc->session_close_wq,
+ done_closing_sessions(mdsc, skipped),
ceph_timeout_jiffies(opts->mount_timeout));
/* tear down remaining sessions */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 6b3679737d4a..3c6f77b7bb02 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -121,6 +121,7 @@ enum {
CEPH_MDS_SESSION_CLOSING = 5,
CEPH_MDS_SESSION_RESTARTING = 6,
CEPH_MDS_SESSION_RECONNECTING = 7,
+ CEPH_MDS_SESSION_REJECTED = 8,
};
struct ceph_mds_session {
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 89e6bc321df3..913dea163d5c 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -43,6 +43,8 @@ const char *ceph_session_op_name(int op)
case CEPH_SESSION_RECALL_STATE: return "recall_state";
case CEPH_SESSION_FLUSHMSG: return "flushmsg";
case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
+ case CEPH_SESSION_FORCE_RO: return "force_ro";
+ case CEPH_SESSION_REJECT: return "reject";
}
return "???";
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e247f6f0feb7..b382e5910eea 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -396,10 +396,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
*/
dev_name_end = strchr(dev_name, '/');
if (dev_name_end) {
- fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
- if (!fsopt->server_path) {
- err = -ENOMEM;
- goto out;
+ if (strlen(dev_name_end) > 1) {
+ fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
+ if (!fsopt->server_path) {
+ err = -ENOMEM;
+ goto out;
+ }
}
} else {
dev_name_end = dev_name + strlen(dev_name);
@@ -788,15 +790,10 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
struct inode *inode = req->r_target_inode;
req->r_target_inode = NULL;
dout("open_root_inode success\n");
- if (ceph_ino(inode) == CEPH_INO_ROOT &&
- fsc->sb->s_root == NULL) {
- root = d_make_root(inode);
- if (!root) {
- root = ERR_PTR(-ENOMEM);
- goto out;
- }
- } else {
- root = d_obtain_root(inode);
+ root = d_make_root(inode);
+ if (!root) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
}
ceph_init_dentry(root);
dout("open_root_inode success, root dentry is %p\n", root);
@@ -825,35 +822,31 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
mutex_lock(&fsc->client->mount_mutex);
if (!fsc->sb->s_root) {
+ const char *path;
err = __ceph_open_session(fsc->client, started);
if (err < 0)
goto out;
- dout("mount opening root\n");
- root = open_root_dentry(fsc, "", started);
+ if (!fsc->mount_options->server_path) {
+ path = "";
+ dout("mount opening path \\t\n");
+ } else {
+ path = fsc->mount_options->server_path + 1;
+ dout("mount opening path %s\n", path);
+ }
+ root = open_root_dentry(fsc, path, started);
if (IS_ERR(root)) {
err = PTR_ERR(root);
goto out;
}
- fsc->sb->s_root = root;
+ fsc->sb->s_root = dget(root);
first = 1;
err = ceph_fs_debugfs_init(fsc);
if (err < 0)
goto fail;
- }
-
- if (!fsc->mount_options->server_path) {
- root = fsc->sb->s_root;
- dget(root);
} else {
- const char *path = fsc->mount_options->server_path + 1;
- dout("mount opening path %s\n", path);
- root = open_root_dentry(fsc, path, started);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto fail;
- }
+ root = dget(fsc->sb->s_root);
}
fsc->mount_state = CEPH_MOUNT_MOUNTED;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index adc231892b0d..febc28f9e2c2 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -16,7 +16,7 @@
static int __remove_xattr(struct ceph_inode_info *ci,
struct ceph_inode_xattr *xattr);
-const struct xattr_handler ceph_other_xattr_handler;
+static const struct xattr_handler ceph_other_xattr_handler;
/*
* List of handlers for synthetic system.* attributes. Other
@@ -1034,7 +1034,7 @@ retry:
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true;
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
}
spin_unlock(&ci->i_ceph_lock);
@@ -1086,7 +1086,7 @@ static int ceph_set_xattr_handler(const struct xattr_handler *handler,
return __ceph_setxattr(inode, name, value, size, flags);
}
-const struct xattr_handler ceph_other_xattr_handler = {
+static const struct xattr_handler ceph_other_xattr_handler = {
.prefix = "", /* match any name => handlers called with full name */
.get = ceph_get_xattr_handler,
.set = ceph_set_xattr_handler,
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 6edd825231c5..44a240c4bb65 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -406,6 +406,7 @@ void cd_forget(struct inode *inode)
spin_lock(&cdev_lock);
list_del_init(&inode->i_devices);
inode->i_cdev = NULL;
+ inode->i_mapping = &inode->i_data;
spin_unlock(&cdev_lock);
}
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 6c58e13fed2f..3d03e48a9213 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -152,6 +152,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
list_for_each(tmp1, &cifs_tcp_ses_list) {
server = list_entry(tmp1, struct TCP_Server_Info,
tcp_ses_list);
+ seq_printf(m, "\nNumber of credits: %d", server->credits);
i++;
list_for_each(tmp2, &server->smb_ses_list) {
ses = list_entry(tmp2, struct cifs_ses,
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 1418daa03d95..07ed81cf1552 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -49,6 +49,7 @@
#define CIFS_MOUNT_USE_PREFIX_PATH 0x1000000 /* make subpath with unaccessible
* root mountable
*/
+#define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */
struct cifs_sb_info {
struct rb_root tlink_tree;
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index 0065256881d8..57ff0756e30c 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -36,7 +36,15 @@ struct smb_mnt_fs_info {
__u64 cifs_posix_caps;
} __packed;
+struct smb_snapshot_array {
+ __u32 number_of_snapshots;
+ __u32 number_of_snapshots_returned;
+ __u32 snapshot_array_size;
+ /* snapshots[]; */
+} __packed;
+
#define CIFS_IOCTL_MAGIC 0xCF
#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)
+#define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array)
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 71e8a56e9479..15bac390dff9 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -42,6 +42,35 @@ static const struct cifs_sid sid_authusers = {
/* group users */
static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
+/* S-1-22-1 Unmapped Unix users */
+static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22},
+ {cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* S-1-22-2 Unmapped Unix groups */
+static const struct cifs_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22},
+ {cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/*
+ * See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx
+ */
+
+/* S-1-5-88 MS NFS and Apple style UID/GID/mode */
+
+/* S-1-5-88-1 Unix uid */
+static const struct cifs_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5},
+ {cpu_to_le32(88),
+ cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* S-1-5-88-2 Unix gid */
+static const struct cifs_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5},
+ {cpu_to_le32(88),
+ cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* S-1-5-88-3 Unix mode */
+static const struct cifs_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5},
+ {cpu_to_le32(88),
+ cpu_to_le32(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
static const struct cred *root_cred;
static int
@@ -183,6 +212,62 @@ compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
return 0; /* sids compare/match */
}
+static bool
+is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group)
+{
+ int i;
+ int num_subauth;
+ const struct cifs_sid *pwell_known_sid;
+
+ if (!psid || (puid == NULL))
+ return false;
+
+ num_subauth = psid->num_subauth;
+
+ /* check if Mac (or Windows NFS) vs. Samba format for Unix owner SID */
+ if (num_subauth == 2) {
+ if (is_group)
+ pwell_known_sid = &sid_unix_groups;
+ else
+ pwell_known_sid = &sid_unix_users;
+ } else if (num_subauth == 3) {
+ if (is_group)
+ pwell_known_sid = &sid_unix_NFS_groups;
+ else
+ pwell_known_sid = &sid_unix_NFS_users;
+ } else
+ return false;
+
+ /* compare the revision */
+ if (psid->revision != pwell_known_sid->revision)
+ return false;
+
+ /* compare all of the six auth values */
+ for (i = 0; i < NUM_AUTHS; ++i) {
+ if (psid->authority[i] != pwell_known_sid->authority[i]) {
+ cifs_dbg(FYI, "auth %d did not match\n", i);
+ return false;
+ }
+ }
+
+ if (num_subauth == 2) {
+ if (psid->sub_auth[0] != pwell_known_sid->sub_auth[0])
+ return false;
+
+ *puid = le32_to_cpu(psid->sub_auth[1]);
+ } else /* 3 subauths, ie Windows/Mac style */ {
+ *puid = le32_to_cpu(psid->sub_auth[0]);
+ if ((psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) ||
+ (psid->sub_auth[1] != pwell_known_sid->sub_auth[1]))
+ return false;
+
+ *puid = le32_to_cpu(psid->sub_auth[2]);
+ }
+
+ cifs_dbg(FYI, "Unix UID %d returned from SID\n", *puid);
+ return true; /* well known sid found, uid returned */
+}
+
static void
cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
{
@@ -276,6 +361,43 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
return -EIO;
}
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) {
+ uint32_t unix_id;
+ bool is_group;
+
+ if (sidtype != SIDOWNER)
+ is_group = true;
+ else
+ is_group = false;
+
+ if (is_well_known_sid(psid, &unix_id, is_group) == false)
+ goto try_upcall_to_get_id;
+
+ if (is_group) {
+ kgid_t gid;
+ gid_t id;
+
+ id = (gid_t)unix_id;
+ gid = make_kgid(&init_user_ns, id);
+ if (gid_valid(gid)) {
+ fgid = gid;
+ goto got_valid_id;
+ }
+ } else {
+ kuid_t uid;
+ uid_t id;
+
+ id = (uid_t)unix_id;
+ uid = make_kuid(&init_user_ns, id);
+ if (uid_valid(uid)) {
+ fuid = uid;
+ goto got_valid_id;
+ }
+ }
+ /* If unable to find uid/gid easily from SID try via upcall */
+ }
+
+try_upcall_to_get_id:
sidstr = sid_to_key_str(psid, sidtype);
if (!sidstr)
return -ENOMEM;
@@ -329,6 +451,7 @@ out_revert_creds:
* Note that we return 0 here unconditionally. If the mapping
* fails then we just fall back to using the mnt_uid/mnt_gid.
*/
+got_valid_id:
if (sidtype == SIDOWNER)
fattr->cf_uid = fuid;
else
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 14ae4b8e1a3c..15261ba464c5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -64,15 +64,15 @@ unsigned int global_secflags = CIFSSEC_DEF;
unsigned int sign_CIFS_PDUs = 1;
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
-module_param(CIFSMaxBufSize, uint, 0);
+module_param(CIFSMaxBufSize, uint, 0444);
MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
"Default: 16384 Range: 8192 to 130048");
unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
-module_param(cifs_min_rcv, uint, 0);
+module_param(cifs_min_rcv, uint, 0444);
MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
"1 to 64");
unsigned int cifs_min_small = 30;
-module_param(cifs_min_small, uint, 0);
+module_param(cifs_min_small, uint, 0444);
MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
"Range: 2 to 256");
unsigned int cifs_max_pending = CIFS_MAX_REQ;
@@ -271,7 +271,7 @@ cifs_alloc_inode(struct super_block *sb)
cifs_inode->createtime = 0;
cifs_inode->epoch = 0;
#ifdef CONFIG_CIFS_SMB2
- get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE);
+ generate_random_uuid(cifs_inode->lease_key);
#endif
/*
* Can not set i_flags here - they get immediately overwritten to zero
@@ -469,6 +469,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",posixpaths");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
seq_puts(s, ",setuids");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL)
+ seq_puts(s, ",idsfromsid");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
seq_puts(s, ",serverino");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
@@ -896,35 +898,26 @@ const struct inode_operations cifs_dir_inode_ops = {
.link = cifs_hardlink,
.mkdir = cifs_mkdir,
.rmdir = cifs_rmdir,
- .rename2 = cifs_rename2,
+ .rename = cifs_rename2,
.permission = cifs_permission,
.setattr = cifs_setattr,
.symlink = cifs_symlink,
.mknod = cifs_mknod,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = cifs_listxattr,
- .removexattr = generic_removexattr,
};
const struct inode_operations cifs_file_inode_ops = {
.setattr = cifs_setattr,
.getattr = cifs_getattr,
.permission = cifs_permission,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = cifs_listxattr,
- .removexattr = generic_removexattr,
};
const struct inode_operations cifs_symlink_inode_ops = {
.readlink = generic_readlink,
.get_link = cifs_get_link,
.permission = cifs_permission,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = cifs_listxattr,
- .removexattr = generic_removexattr,
};
static int cifs_clone_file_range(struct file *src_file, loff_t off,
@@ -1271,7 +1264,6 @@ init_cifs(void)
GlobalTotalActiveXid = 0;
GlobalMaxActiveXid = 0;
spin_lock_init(&cifs_tcp_ses_lock);
- spin_lock_init(&cifs_file_list_lock);
spin_lock_init(&GlobalMid_Lock);
get_random_bytes(&cifs_lock_secret, sizeof(cifs_lock_secret));
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 9dcf974acc47..c9c00a862036 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,6 +41,16 @@ cifs_uniqueid_to_ino_t(u64 fileid)
}
+static inline void cifs_set_time(struct dentry *dentry, unsigned long time)
+{
+ dentry->d_fsdata = (void *) time;
+}
+
+static inline unsigned long cifs_get_time(struct dentry *dentry)
+{
+ return (unsigned long) dentry->d_fsdata;
+}
+
extern struct file_system_type cifs_fs_type;
extern const struct address_space_operations cifs_addr_ops;
extern const struct address_space_operations cifs_addr_ops_smallbuf;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 8f1d8c1e72be..1f17f6bd7a60 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -75,6 +75,18 @@
#define SMB_ECHO_INTERVAL_MAX 600
#define SMB_ECHO_INTERVAL_DEFAULT 60
+/*
+ * Default number of credits to keep available for SMB3.
+ * This value is chosen somewhat arbitrarily. The Windows client
+ * defaults to 128 credits, the Windows server allows clients up to
+ * 512 credits (or 8K for later versions), and the NetApp server
+ * does not limit clients at all. Choose a high enough default value
+ * such that the client shouldn't limit performance, but allow mount
+ * to override (until you approach 64K, where we limit credits to 65000
+ * to reduce possibility of seeing more server credit overflow bugs.
+ */
+#define SMB2_MAX_CREDITS_AVAILABLE 32000
+
#include "cifspdu.h"
#ifndef XATTR_DOS_ATTRIB
@@ -376,6 +388,8 @@ struct smb_version_operations {
int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
struct cifsFileInfo *src_file);
+ int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *src_file, void __user *);
int (*query_mf_symlink)(unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const unsigned char *,
char *, unsigned int *);
@@ -464,6 +478,7 @@ struct smb_vol {
bool retry:1;
bool intr:1;
bool setuids:1;
+ bool setuidfromacl:1;
bool override_uid:1;
bool override_gid:1;
bool dynperm:1;
@@ -510,6 +525,7 @@ struct smb_vol {
struct sockaddr_storage srcaddr; /* allow binding to a local IP */
struct nls_table *local_nls;
unsigned int echo_interval; /* echo interval in secs */
+ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
};
#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
@@ -567,7 +583,8 @@ struct TCP_Server_Info {
bool noblocksnd; /* use blocking sendmsg */
bool noautotune; /* do not autotune send buf sizes */
bool tcp_nodelay;
- int credits; /* send no more requests at once */
+ unsigned int credits; /* send no more requests at once */
+ unsigned int max_credits; /* can override large 32000 default at mnt */
unsigned int in_flight; /* number of requests on the wire to server */
spinlock_t req_lock; /* protect the two values above */
struct mutex srv_mutex;
@@ -833,6 +850,7 @@ struct cifs_tcon {
struct list_head tcon_list;
int tc_count;
struct list_head openFileList;
+ spinlock_t open_file_lock; /* protects list above */
struct cifs_ses *ses; /* pointer to session associated with */
char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
char *nativeFileSystem;
@@ -889,7 +907,7 @@ struct cifs_tcon {
#endif /* CONFIG_CIFS_STATS2 */
__u64 bytes_read;
__u64 bytes_written;
- spinlock_t stat_lock;
+ spinlock_t stat_lock; /* protects the two fields above */
#endif /* CONFIG_CIFS_STATS */
FILE_SYSTEM_DEVICE_INFO fsDevInfo;
FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */
@@ -1040,20 +1058,24 @@ struct cifs_fid_locks {
};
struct cifsFileInfo {
+ /* following two lists are protected by tcon->open_file_lock */
struct list_head tlist; /* pointer to next fid owned by tcon */
struct list_head flist; /* next fid (file instance) for this inode */
+ /* lock list below protected by cifsi->lock_sem */
struct cifs_fid_locks *llist; /* brlocks held by this fid */
kuid_t uid; /* allows finding which FileInfo structure */
__u32 pid; /* process id who opened file */
struct cifs_fid fid; /* file id from remote */
+ struct list_head rlist; /* reconnect list */
/* BB add lock scope info here if needed */ ;
/* lock scope id (0 if none) */
struct dentry *dentry;
- unsigned int f_flags;
struct tcon_link *tlink;
+ unsigned int f_flags;
bool invalidHandle:1; /* file closed via session abend */
bool oplock_break_cancelled:1;
- int count; /* refcount protected by cifs_file_list_lock */
+ int count;
+ spinlock_t file_info_lock; /* protects four flag/count fields above */
struct mutex fh_mutex; /* prevents reopen race after dead ses*/
struct cifs_search_info srch_inf;
struct work_struct oplock_break; /* work for oplock breaks */
@@ -1120,7 +1142,7 @@ struct cifs_writedata {
/*
* Take a reference on the file private data. Must be called with
- * cifs_file_list_lock held.
+ * cfile->file_info_lock held.
*/
static inline void
cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
@@ -1514,8 +1536,10 @@ require use of the stronger protocol */
* GlobalMid_Lock protects:
* list operations on pending_mid_q and oplockQ
* updates to XID counters, multiplex id and SMB sequence numbers
- * cifs_file_list_lock protects:
- * list operations on tcp and SMB session lists and tCon lists
+ * tcp_ses_lock protects:
+ * list operations on tcp and SMB session lists
+ * tcon->open_file_lock protects the list of open files hanging off the tcon
+ * cfile->file_info_lock protects counters and fields in cifs file struct
* f_owner.lock protects certain per file struct operations
* mapping->page_lock protects certain per page operations
*
@@ -1547,18 +1571,12 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
* tcp session, and the list of tcon's per smb session. It also protects
* the reference counters for the server, smb session, and tcon. Finally,
* changes to the tcon->tidStatus should be done while holding this lock.
+ * generally the locks should be taken in order tcp_ses_lock before
+ * tcon->open_file_lock and that before file->file_info_lock since the
+ * structure order is cifs_socket-->cifs_ses-->cifs_tcon-->cifs_file
*/
GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
-/*
- * This lock protects the cifs_file->llist and cifs_file->flist
- * list operations, and updates to some flags (cifs_file->invalidHandle)
- * It will be moved to either use the tcon->stat_lock or equivalent later.
- * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
- * the cifs_tcp_ses_lock must be grabbed first and released last.
- */
-GLOBAL_EXTERN spinlock_t cifs_file_list_lock;
-
#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
/* Outstanding dir notify requests */
GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 95dab43646f0..ced0e42ce460 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -193,6 +193,8 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data,
extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
extern void cifs_umount(struct cifs_sb_info *);
extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
+extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon);
+
extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
__u64 length, __u8 type,
struct cifsLockInfo **conf_lock,
@@ -392,8 +394,7 @@ extern int CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
unsigned int *nbytes, char **buf,
int *return_buf_type);
extern int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
- unsigned int *nbytes, const char *buf,
- const char __user *ubuf, const int long_op);
+ unsigned int *nbytes, const char *buf);
extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
unsigned int *nbytes, struct kvec *iov, const int nvec);
extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d47197ea4ab6..3f3185febc58 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -98,13 +98,13 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
struct list_head *tmp1;
/* list all files open on tree connection and mark them invalid */
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tcon->open_file_lock);
list_for_each_safe(tmp, tmp1, &tcon->openFileList) {
open_file = list_entry(tmp, struct cifsFileInfo, tlist);
open_file->invalidHandle = true;
open_file->oplock_break_cancelled = true;
}
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
/*
* BB Add call to invalidate_inodes(sb) for all superblocks mounted
* to this tcon.
@@ -1228,7 +1228,6 @@ OldOpenRetry:
inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
- /* long_op set to 1 to allow for oplock break timeouts */
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *)pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
@@ -1768,8 +1767,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
int
CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
- unsigned int *nbytes, const char *buf,
- const char __user *ubuf, const int long_op)
+ unsigned int *nbytes, const char *buf)
{
int rc = -EACCES;
WRITE_REQ *pSMB = NULL;
@@ -1838,12 +1836,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
if (buf)
memcpy(pSMB->Data, buf, bytes_sent);
- else if (ubuf) {
- if (copy_from_user(pSMB->Data, ubuf, bytes_sent)) {
- cifs_buf_release(pSMB);
- return -EFAULT;
- }
- } else if (count != 0) {
+ else if (count != 0) {
/* No buffer */
cifs_buf_release(pSMB);
return -EINVAL;
@@ -1867,7 +1860,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
}
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
+ (struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
if (rc) {
cifs_dbg(FYI, "Send error in write = %d\n", rc);
@@ -3334,7 +3327,7 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
#ifdef CONFIG_CIFS_POSIX
/*Convert an Access Control Entry from wire format to local POSIX xattr format*/
-static void cifs_convert_ace(posix_acl_xattr_entry *ace,
+static void cifs_convert_ace(struct posix_acl_xattr_entry *ace,
struct cifs_posix_ace *cifs_ace)
{
/* u8 cifs fields do not need le conversion */
@@ -3358,7 +3351,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
__u16 count;
struct cifs_posix_ace *pACE;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src;
- posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)trgt;
+ struct posix_acl_xattr_header *local_acl = (void *)trgt;
if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION)
return -EOPNOTSUPP;
@@ -3396,9 +3389,11 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
} else if (size > buflen) {
return -ERANGE;
} else /* buffer big enough */ {
+ struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1);
+
local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
for (i = 0; i < count ; i++) {
- cifs_convert_ace(&local_acl->a_entries[i], pACE);
+ cifs_convert_ace(&ace[i], pACE);
pACE++;
}
}
@@ -3406,7 +3401,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
}
static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
- const posix_acl_xattr_entry *local_ace)
+ const struct posix_acl_xattr_entry *local_ace)
{
__u16 rc = 0; /* 0 = ACL converted ok */
@@ -3431,7 +3426,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
{
__u16 rc = 0;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data;
- posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)pACL;
+ struct posix_acl_xattr_header *local_acl = (void *)pACL;
int count;
int i;
@@ -3459,7 +3454,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
}
for (i = 0; i < count; i++) {
rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i],
- &local_acl->a_entries[i]);
+ (struct posix_acl_xattr_entry *)(local_acl + 1));
if (rc != 0) {
/* ACE not converted */
break;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2e4f4bad8b1e..aab5227979e2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -63,7 +63,6 @@ extern mempool_t *cifs_req_poolp;
#define TLINK_IDLE_EXPIRE (600 * HZ)
enum {
-
/* Mount options that take no arguments */
Opt_user_xattr, Opt_nouser_xattr,
Opt_forceuid, Opt_noforceuid,
@@ -76,7 +75,7 @@ enum {
Opt_noposixpaths, Opt_nounix,
Opt_nocase,
Opt_brl, Opt_nobrl,
- Opt_forcemandatorylock, Opt_setuids,
+ Opt_forcemandatorylock, Opt_setuidfromacl, Opt_setuids,
Opt_nosetuids, Opt_dynperm, Opt_nodynperm,
Opt_nohard, Opt_nosoft,
Opt_nointr, Opt_intr,
@@ -95,7 +94,7 @@ enum {
Opt_cruid, Opt_gid, Opt_file_mode,
Opt_dirmode, Opt_port,
Opt_rsize, Opt_wsize, Opt_actimeo,
- Opt_echo_interval,
+ Opt_echo_interval, Opt_max_credits,
/* Mount options which take string value */
Opt_user, Opt_pass, Opt_ip,
@@ -148,6 +147,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_forcemandatorylock, "forcemand" },
{ Opt_setuids, "setuids" },
{ Opt_nosetuids, "nosetuids" },
+ { Opt_setuidfromacl, "idsfromsid" },
{ Opt_dynperm, "dynperm" },
{ Opt_nodynperm, "nodynperm" },
{ Opt_nohard, "nohard" },
@@ -190,6 +190,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_wsize, "wsize=%s" },
{ Opt_actimeo, "actimeo=%s" },
{ Opt_echo_interval, "echo_interval=%s" },
+ { Opt_max_credits, "max_credits=%s" },
{ Opt_blank_user, "user=" },
{ Opt_blank_user, "username=" },
@@ -1376,6 +1377,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_nosetuids:
vol->setuids = 0;
break;
+ case Opt_setuidfromacl:
+ vol->setuidfromacl = 1;
+ break;
case Opt_dynperm:
vol->dynperm = true;
break;
@@ -1586,6 +1590,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
vol->echo_interval = option;
break;
+ case Opt_max_credits:
+ if (get_option_ul(args, &option) || (option < 20) ||
+ (option > 60000)) {
+ cifs_dbg(VFS, "%s: Invalid max_credits value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->max_credits = option;
+ break;
/* String Arguments */
@@ -2163,7 +2176,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
sizeof(tcp_ses->dstaddr));
#ifdef CONFIG_CIFS_SMB2
- get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE);
+ generate_random_uuid(tcp_ses->client_guid);
#endif
/*
* at this point we are the only ones with the pointer
@@ -3270,6 +3283,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
if (pvolume_info->setuids)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
+ if (pvolume_info->setuidfromacl)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UID_FROM_ACL;
if (pvolume_info->server_ino)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
if (pvolume_info->remap)
@@ -3598,7 +3613,11 @@ try_mount_again:
bdi_destroy(&cifs_sb->bdi);
goto out;
}
-
+ if ((volume_info->max_credits < 20) ||
+ (volume_info->max_credits > 60000))
+ server->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
+ else
+ server->max_credits = volume_info->max_credits;
/* get a reference to a SMB session */
ses = cifs_get_smb_ses(server, volume_info);
if (IS_ERR(ses)) {
@@ -3688,14 +3707,16 @@ remote_path_check:
goto mount_fail_check;
}
- rc = cifs_are_all_path_components_accessible(server,
+ if (rc != -EREMOTE) {
+ rc = cifs_are_all_path_components_accessible(server,
xid, tcon, cifs_sb,
full_path);
- if (rc != 0) {
- cifs_dbg(VFS, "cannot query dirs between root and final path, "
- "enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
- cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
- rc = 0;
+ if (rc != 0) {
+ cifs_dbg(VFS, "cannot query dirs between root and final path, "
+ "enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+ rc = 0;
+ }
}
kfree(full_path);
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4716c54dbfc6..789ff1df2d8d 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -40,7 +40,7 @@ renew_parental_timestamps(struct dentry *direntry)
/* BB check if there is a way to get the kernel to do this or if we
really need this */
do {
- direntry->d_time = jiffies;
+ cifs_set_time(direntry, jiffies);
direntry = direntry->d_parent;
} while (!IS_ROOT(direntry));
}
@@ -802,7 +802,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
} else if (rc == -ENOENT) {
rc = 0;
- direntry->d_time = jiffies;
+ cifs_set_time(direntry, jiffies);
d_add(direntry, NULL);
/* if it was once a directory (but how can we tell?) we could do
shrink_dcache_parent(direntry); */
@@ -862,7 +862,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
return 0;
- if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled)
+ if (time_after(jiffies, cifs_get_time(direntry) + HZ) || !lookupCacheEnabled)
return 0;
return 1;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 579e41b350a2..7f5f6176c6f1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -305,6 +305,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
cfile->tlink = cifs_get_tlink(tlink);
INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
mutex_init(&cfile->fh_mutex);
+ spin_lock_init(&cfile->file_info_lock);
cifs_sb_active(inode->i_sb);
@@ -317,7 +318,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
oplock = 0;
}
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tcon->open_file_lock);
if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock)
oplock = fid->pending_open->oplock;
list_del(&fid->pending_open->olist);
@@ -326,12 +327,13 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
server->ops->set_fid(cfile, fid, oplock);
list_add(&cfile->tlist, &tcon->openFileList);
+
/* if readable file instance put first in list*/
if (file->f_mode & FMODE_READ)
list_add(&cfile->flist, &cinode->openFileList);
else
list_add_tail(&cfile->flist, &cinode->openFileList);
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
if (fid->purge_cache)
cifs_zap_mapping(inode);
@@ -343,16 +345,16 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct cifsFileInfo *
cifsFileInfo_get(struct cifsFileInfo *cifs_file)
{
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&cifs_file->file_info_lock);
cifsFileInfo_get_locked(cifs_file);
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&cifs_file->file_info_lock);
return cifs_file;
}
/*
* Release a reference on the file private data. This may involve closing
* the filehandle out on the server. Must be called without holding
- * cifs_file_list_lock.
+ * tcon->open_file_lock and cifs_file->file_info_lock.
*/
void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
{
@@ -367,11 +369,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
struct cifs_pending_open open;
bool oplock_break_cancelled;
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tcon->open_file_lock);
+
+ spin_lock(&cifs_file->file_info_lock);
if (--cifs_file->count > 0) {
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&cifs_file->file_info_lock);
+ spin_unlock(&tcon->open_file_lock);
return;
}
+ spin_unlock(&cifs_file->file_info_lock);
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &fid);
@@ -395,7 +401,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags);
cifs_set_oplock_level(cifsi, 0);
}
- spin_unlock(&cifs_file_list_lock);
+
+ spin_unlock(&tcon->open_file_lock);
oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
@@ -732,6 +739,15 @@ reopen_success:
* to the server to get the new inode info.
*/
+ /*
+ * If the server returned a read oplock and we have mandatory brlocks,
+ * set oplock level to None.
+ */
+ if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) {
+ cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n");
+ oplock = 0;
+ }
+
server->ops->set_fid(cfile, &cfile->fid, oplock);
if (oparms.reconnect)
cifs_relock_file(cfile);
@@ -753,6 +769,36 @@ int cifs_close(struct inode *inode, struct file *file)
return 0;
}
+void
+cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
+{
+ struct cifsFileInfo *open_file;
+ struct list_head *tmp;
+ struct list_head *tmp1;
+ struct list_head tmp_list;
+
+ cifs_dbg(FYI, "Reopen persistent handles");
+ INIT_LIST_HEAD(&tmp_list);
+
+ /* list all files open on tree connection, reopen resilient handles */
+ spin_lock(&tcon->open_file_lock);
+ list_for_each(tmp, &tcon->openFileList) {
+ open_file = list_entry(tmp, struct cifsFileInfo, tlist);
+ if (!open_file->invalidHandle)
+ continue;
+ cifsFileInfo_get(open_file);
+ list_add_tail(&open_file->rlist, &tmp_list);
+ }
+ spin_unlock(&tcon->open_file_lock);
+
+ list_for_each_safe(tmp, tmp1, &tmp_list) {
+ open_file = list_entry(tmp, struct cifsFileInfo, rlist);
+ cifs_reopen_file(open_file, false /* do not flush */);
+ list_del_init(&open_file->rlist);
+ cifsFileInfo_put(open_file);
+ }
+}
+
int cifs_closedir(struct inode *inode, struct file *file)
{
int rc = 0;
@@ -772,10 +818,10 @@ int cifs_closedir(struct inode *inode, struct file *file)
server = tcon->ses->server;
cifs_dbg(FYI, "Freeing private data in close dir\n");
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&cfile->file_info_lock);
if (server->ops->dir_needs_close(cfile)) {
cfile->invalidHandle = true;
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&cfile->file_info_lock);
if (server->ops->close_dir)
rc = server->ops->close_dir(xid, tcon, &cfile->fid);
else
@@ -784,7 +830,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
/* not much we can do if it fails anyway, ignore rc */
rc = 0;
} else
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&cfile->file_info_lock);
buf = cfile->srch_inf.ntwrk_buf_start;
if (buf) {
@@ -1728,12 +1774,13 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
{
struct cifsFileInfo *open_file = NULL;
struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
fsuid_only = false;
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tcon->open_file_lock);
/* we could simply get the first_list_entry since write-only entries
are always at the end of the list but since the first entry might
have a close pending, we go through the whole list */
@@ -1744,8 +1791,8 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
if (!open_file->invalidHandle) {
/* found a good file */
/* lock it so it will not be closed on us */
- cifsFileInfo_get_locked(open_file);
- spin_unlock(&cifs_file_list_lock);
+ cifsFileInfo_get(open_file);
+ spin_unlock(&tcon->open_file_lock);
return open_file;
} /* else might as well continue, and look for
another, or simply have the caller reopen it
@@ -1753,7 +1800,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
} else /* write only file */
break; /* write only files are last so must be done */
}
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
return NULL;
}
@@ -1762,6 +1809,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
{
struct cifsFileInfo *open_file, *inv_file = NULL;
struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
bool any_available = false;
int rc;
unsigned int refind = 0;
@@ -1777,15 +1825,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
}
cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+ tcon = cifs_sb_master_tcon(cifs_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
fsuid_only = false;
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tcon->open_file_lock);
refind_writable:
if (refind > MAX_REOPEN_ATT) {
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
return NULL;
}
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
@@ -1796,8 +1845,8 @@ refind_writable:
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
/* found a good writable file */
- cifsFileInfo_get_locked(open_file);
- spin_unlock(&cifs_file_list_lock);
+ cifsFileInfo_get(open_file);
+ spin_unlock(&tcon->open_file_lock);
return open_file;
} else {
if (!inv_file)
@@ -1813,24 +1862,24 @@ refind_writable:
if (inv_file) {
any_available = false;
- cifsFileInfo_get_locked(inv_file);
+ cifsFileInfo_get(inv_file);
}
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
if (inv_file) {
rc = cifs_reopen_file(inv_file, false);
if (!rc)
return inv_file;
else {
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tcon->open_file_lock);
list_move_tail(&inv_file->flist,
&cifs_inode->openFileList);
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
cifsFileInfo_put(inv_file);
- spin_lock(&cifs_file_list_lock);
++refind;
inv_file = NULL;
+ spin_lock(&tcon->open_file_lock);
goto refind_writable;
}
}
@@ -1878,7 +1927,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
write_data, to - from, &offset);
cifsFileInfo_put(open_file);
/* Does mm or vfs already set times? */
- inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
+ inode->i_atime = inode->i_mtime = current_time(inode);
if ((bytes_written > 0) && (offset))
rc = 0;
else if (bytes_written < 0)
@@ -2478,7 +2527,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
size_t cur_len;
unsigned long nr_pages, num_pages, i;
struct cifs_writedata *wdata;
- struct iov_iter saved_from;
+ struct iov_iter saved_from = *from;
loff_t saved_offset = offset;
pid_t pid;
struct TCP_Server_Info *server;
@@ -2489,7 +2538,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
pid = current->tgid;
server = tlink_tcon(open_file->tlink)->ses->server;
- memcpy(&saved_from, from, sizeof(struct iov_iter));
do {
unsigned int wsize, credits;
@@ -2551,8 +2599,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
if (rc == -EAGAIN) {
- memcpy(from, &saved_from,
- sizeof(struct iov_iter));
+ *from = saved_from;
iov_iter_advance(from, offset - saved_offset);
continue;
}
@@ -2576,7 +2623,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
struct cifs_sb_info *cifs_sb;
struct cifs_writedata *wdata, *tmp;
struct list_head wdata_list;
- struct iov_iter saved_from;
+ struct iov_iter saved_from = *from;
int rc;
/*
@@ -2597,8 +2644,6 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
if (!tcon->ses->server->ops->async_writev)
return -ENOSYS;
- memcpy(&saved_from, from, sizeof(struct iov_iter));
-
rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from,
open_file, cifs_sb, &wdata_list);
@@ -2631,13 +2676,11 @@ restart_loop:
/* resend call if it's a retryable error */
if (rc == -EAGAIN) {
struct list_head tmp_list;
- struct iov_iter tmp_from;
+ struct iov_iter tmp_from = saved_from;
INIT_LIST_HEAD(&tmp_list);
list_del_init(&wdata->list);
- memcpy(&tmp_from, &saved_from,
- sizeof(struct iov_iter));
iov_iter_advance(&tmp_from,
wdata->offset - iocb->ki_pos);
@@ -3571,7 +3614,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
cifs_dbg(FYI, "Bytes read %d\n", rc);
file_inode(file)->i_atime =
- current_fs_time(file_inode(file)->i_sb);
+ current_time(file_inode(file));
if (PAGE_SIZE > rc)
memset(read_data + rc, 0, PAGE_SIZE - rc);
@@ -3618,15 +3661,17 @@ static int cifs_readpage(struct file *file, struct page *page)
static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *open_file;
+ struct cifs_tcon *tcon =
+ cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb));
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tcon->open_file_lock);
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
return 1;
}
}
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
return 0;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b87efd0c92d6..7ab5be7944aa 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1951,7 +1951,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
full_path, inode, inode->i_count.counter,
- dentry, dentry->d_time, jiffies);
+ dentry, cifs_get_time(dentry), jiffies);
if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
@@ -2154,7 +2154,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
attrs->ia_valid |= ATTR_FORCE;
- rc = inode_change_ok(inode, attrs);
+ rc = setattr_prepare(direntry, attrs);
if (rc < 0)
goto out;
@@ -2294,7 +2294,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
attrs->ia_valid |= ATTR_FORCE;
- rc = inode_change_ok(inode, attrs);
+ rc = setattr_prepare(direntry, attrs);
if (rc < 0) {
free_xid(xid);
return rc;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 7a3b84e300f8..9f51b81119f2 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -189,7 +189,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
xid = get_xid();
cifs_sb = CIFS_SB(inode->i_sb);
-
+ cifs_dbg(VFS, "cifs ioctl 0x%x\n", command);
switch (command) {
case FS_IOC_GETFLAGS:
if (pSMBFile == NULL)
@@ -267,11 +267,23 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
tcon = tlink_tcon(pSMBFile->tlink);
rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
break;
+ case CIFS_ENUMERATE_SNAPSHOTS:
+ if (arg == 0) {
+ rc = -EINVAL;
+ goto cifs_ioc_exit;
+ }
+ tcon = tlink_tcon(pSMBFile->tlink);
+ if (tcon->ses->server->ops->enum_snapshots)
+ rc = tcon->ses->server->ops->enum_snapshots(xid, tcon,
+ pSMBFile, (void __user *)arg);
+ else
+ rc = -EOPNOTSUPP;
+ break;
default:
cifs_dbg(FYI, "unsupported ioctl\n");
break;
}
-
+cifs_ioc_exit:
free_xid(xid);
return rc;
}
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 062c2375549a..d031af8d3d4d 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -399,7 +399,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
io_parms.offset = 0;
io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
- rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0);
+ rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf);
CIFSSMBClose(xid, tcon, fid.netfid);
return rc;
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 813fe13c2ae1..c6729156f9a0 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -120,6 +120,7 @@ tconInfoAlloc(void)
++ret_buf->tc_count;
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
+ spin_lock_init(&ret_buf->open_file_lock);
#ifdef CONFIG_CIFS_STATS
spin_lock_init(&ret_buf->stat_lock);
#endif
@@ -465,7 +466,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
continue;
cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tcon->open_file_lock);
list_for_each(tmp2, &tcon->openFileList) {
netfile = list_entry(tmp2, struct cifsFileInfo,
tlist);
@@ -495,11 +496,11 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
&netfile->oplock_break);
netfile->oplock_break_cancelled = false;
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
return true;
}
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "No matching file for oplock break\n");
return true;
@@ -613,9 +614,9 @@ backup_cred(struct cifs_sb_info *cifs_sb)
void
cifs_del_pending_open(struct cifs_pending_open *open)
{
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tlink_tcon(open->tlink)->open_file_lock);
list_del(&open->olist);
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tlink_tcon(open->tlink)->open_file_lock);
}
void
@@ -635,7 +636,7 @@ void
cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink,
struct cifs_pending_open *open)
{
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tlink_tcon(tlink)->open_file_lock);
cifs_add_pending_open_locked(fid, tlink, open);
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tlink_tcon(open->tlink)->open_file_lock);
}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 65cf85dcda09..8f6a2a5863b9 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -597,14 +597,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) {
/* close and restart search */
cifs_dbg(FYI, "search backing up - close and restart search\n");
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&cfile->file_info_lock);
if (server->ops->dir_needs_close(cfile)) {
cfile->invalidHandle = true;
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&cfile->file_info_lock);
if (server->ops->close_dir)
server->ops->close_dir(xid, tcon, &cfile->fid);
} else
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&cfile->file_info_lock);
if (cfile->srch_inf.ntwrk_buf_start) {
cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n");
if (cfile->srch_inf.smallBuf)
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 4f0231e685a9..1238cd3552f9 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -266,9 +266,15 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
struct tcon_link *tlink;
int rc;
+ if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) &&
+ (buf->LastWriteTime == 0) && (buf->ChangeTime) &&
+ (buf->Attributes == 0))
+ return 0; /* would be a no op, no sense sending this */
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
+
rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
SMB2_OP_SET_INFO);
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 389fb9f8c84e..3d383489b9cf 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -549,19 +549,19 @@ smb2_is_valid_lease_break(char *buffer)
list_for_each(tmp1, &server->smb_ses_list) {
ses = list_entry(tmp1, struct cifs_ses, smb_ses_list);
- spin_lock(&cifs_file_list_lock);
list_for_each(tmp2, &ses->tcon_list) {
tcon = list_entry(tmp2, struct cifs_tcon,
tcon_list);
+ spin_lock(&tcon->open_file_lock);
cifs_stats_inc(
&tcon->stats.cifs_stats.num_oplock_brks);
if (smb2_tcon_has_lease(tcon, rsp, lw)) {
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
return true;
}
+ spin_unlock(&tcon->open_file_lock);
}
- spin_unlock(&cifs_file_list_lock);
}
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -603,7 +603,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
- spin_lock(&cifs_file_list_lock);
+ spin_lock(&tcon->open_file_lock);
list_for_each(tmp2, &tcon->openFileList) {
cfile = list_entry(tmp2, struct cifsFileInfo,
tlist);
@@ -615,7 +615,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
cifs_dbg(FYI, "file id match, oplock break\n");
cinode = CIFS_I(d_inode(cfile->dentry));
-
+ spin_lock(&cfile->file_info_lock);
if (!CIFS_CACHE_WRITE(cinode) &&
rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE)
cfile->oplock_break_cancelled = true;
@@ -637,14 +637,14 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
clear_bit(
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&cinode->flags);
-
+ spin_unlock(&cfile->file_info_lock);
queue_work(cifsiod_wq, &cfile->oplock_break);
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
return true;
}
- spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "No matching file for oplock break\n");
return true;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index d203c0329626..5d456ebb3813 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -28,6 +28,7 @@
#include "cifs_unicode.h"
#include "smb2status.h"
#include "smb2glob.h"
+#include "cifs_ioctl.h"
static int
change_conf(struct TCP_Server_Info *server)
@@ -70,6 +71,10 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
spin_lock(&server->req_lock);
val = server->ops->get_credits_field(server, optype);
*val += add;
+ if (*val > 65000) {
+ *val = 65000; /* Don't get near 64K credits, avoid srv bugs */
+ printk_once(KERN_WARNING "server overflowed SMB3 credits\n");
+ }
server->in_flight--;
if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP)
rc = change_conf(server);
@@ -287,7 +292,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
cifs_dbg(FYI, "Link Speed %lld\n",
le64_to_cpu(out_buf->LinkSpeed));
}
-
+ kfree(out_buf);
return rc;
}
#endif /* STATS2 */
@@ -541,6 +546,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
server->ops->set_oplock_level(cinode, oplock, fid->epoch,
&fid->purge_cache);
cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
+ memcpy(cfile->fid.create_guid, fid->create_guid, 16);
}
static void
@@ -699,6 +705,7 @@ smb2_clone_range(const unsigned int xid,
cchunk_out:
kfree(pcchunk);
+ kfree(retbuf);
return rc;
}
@@ -823,7 +830,6 @@ smb2_duplicate_extents(const unsigned int xid,
{
int rc;
unsigned int ret_data_len;
- char *retbuf = NULL;
struct duplicate_extents_to_file dup_ext_buf;
struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink);
@@ -849,7 +855,7 @@ smb2_duplicate_extents(const unsigned int xid,
FSCTL_DUPLICATE_EXTENTS_TO_FILE,
true /* is_fsctl */, (char *)&dup_ext_buf,
sizeof(struct duplicate_extents_to_file),
- (char **)&retbuf,
+ NULL,
&ret_data_len);
if (ret_data_len > 0)
@@ -872,7 +878,6 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile)
{
struct fsctl_set_integrity_information_req integr_info;
- char *retbuf = NULL;
unsigned int ret_data_len;
integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED);
@@ -884,9 +889,53 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
FSCTL_SET_INTEGRITY_INFORMATION,
true /* is_fsctl */, (char *)&integr_info,
sizeof(struct fsctl_set_integrity_information_req),
+ NULL,
+ &ret_data_len);
+
+}
+
+static int
+smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, void __user *ioc_buf)
+{
+ char *retbuf = NULL;
+ unsigned int ret_data_len = 0;
+ int rc;
+ struct smb_snapshot_array snapshot_in;
+
+ rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ FSCTL_SRV_ENUMERATE_SNAPSHOTS,
+ true /* is_fsctl */, NULL, 0 /* no input data */,
(char **)&retbuf,
&ret_data_len);
+ cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n",
+ rc, ret_data_len);
+ if (rc)
+ return rc;
+ if (ret_data_len && (ioc_buf != NULL) && (retbuf != NULL)) {
+ /* Fixup buffer */
+ if (copy_from_user(&snapshot_in, ioc_buf,
+ sizeof(struct smb_snapshot_array))) {
+ rc = -EFAULT;
+ kfree(retbuf);
+ return rc;
+ }
+ if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) {
+ rc = -ERANGE;
+ return rc;
+ }
+
+ if (ret_data_len > snapshot_in.snapshot_array_size)
+ ret_data_len = snapshot_in.snapshot_array_size;
+
+ if (copy_to_user(ioc_buf, retbuf, ret_data_len))
+ rc = -EFAULT;
+ }
+
+ kfree(retbuf);
+ return rc;
}
static int
@@ -1041,7 +1090,7 @@ smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid)
static void
smb2_new_lease_key(struct cifs_fid *fid)
{
- get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE);
+ generate_random_uuid(fid->lease_key);
}
#define SMB2_SYMLINK_STRUCT_SIZE \
@@ -1654,6 +1703,7 @@ struct smb_version_operations smb21_operations = {
.clone_range = smb2_clone_range,
.wp_retry_size = smb2_wp_retry_size,
.dir_needs_close = smb2_dir_needs_close,
+ .enum_snapshots = smb3_enum_snapshots,
};
struct smb_version_operations smb30_operations = {
@@ -1740,6 +1790,7 @@ struct smb_version_operations smb30_operations = {
.wp_retry_size = smb2_wp_retry_size,
.dir_needs_close = smb2_dir_needs_close,
.fallocate = smb3_fallocate,
+ .enum_snapshots = smb3_enum_snapshots,
};
#ifdef CONFIG_CIFS_SMB311
@@ -1827,6 +1878,7 @@ struct smb_version_operations smb311_operations = {
.wp_retry_size = smb2_wp_retry_size,
.dir_needs_close = smb2_dir_needs_close,
.fallocate = smb3_fallocate,
+ .enum_snapshots = smb3_enum_snapshots,
};
#endif /* CIFS_SMB311 */
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 29e06db5f187..5ca5ea4668a1 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -100,7 +100,21 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
hdr->ProtocolId = SMB2_PROTO_NUMBER;
hdr->StructureSize = cpu_to_le16(64);
hdr->Command = smb2_cmd;
- hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */
+ if (tcon && tcon->ses && tcon->ses->server) {
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ spin_lock(&server->req_lock);
+ /* Request up to 2 credits but don't go over the limit. */
+ if (server->credits >= server->max_credits)
+ hdr->CreditRequest = cpu_to_le16(0);
+ else
+ hdr->CreditRequest = cpu_to_le16(
+ min_t(int, server->max_credits -
+ server->credits, 2));
+ spin_unlock(&server->req_lock);
+ } else {
+ hdr->CreditRequest = cpu_to_le16(2);
+ }
hdr->ProcessId = cpu_to_le32((__u16)current->tgid);
if (!tcon)
@@ -236,8 +250,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
}
cifs_mark_open_files_invalid(tcon);
+
rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage);
mutex_unlock(&tcon->ses->session_mutex);
+
+ if (tcon->use_persistent)
+ cifs_reopen_persistent_handles(tcon);
+
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc)
goto out;
@@ -574,59 +593,42 @@ vneg_out:
return -EIO;
}
-int
-SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_cp)
+struct SMB2_sess_data {
+ unsigned int xid;
+ struct cifs_ses *ses;
+ struct nls_table *nls_cp;
+ void (*func)(struct SMB2_sess_data *);
+ int result;
+ u64 previous_session;
+
+ /* we will send the SMB in three pieces:
+ * a fixed length beginning part, an optional
+ * SPNEGO blob (which can be zero length), and a
+ * last part which will include the strings
+ * and rest of bcc area. This allows us to avoid
+ * a large buffer 17K allocation
+ */
+ int buf0_type;
+ struct kvec iov[2];
+};
+
+static int
+SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
{
+ int rc;
+ struct cifs_ses *ses = sess_data->ses;
struct smb2_sess_setup_req *req;
- struct smb2_sess_setup_rsp *rsp = NULL;
- struct kvec iov[2];
- int rc = 0;
- int resp_buftype = CIFS_NO_BUFFER;
- __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
struct TCP_Server_Info *server = ses->server;
- u16 blob_length = 0;
- struct key *spnego_key = NULL;
- char *security_blob = NULL;
- unsigned char *ntlmssp_blob = NULL;
- bool use_spnego = false; /* else use raw ntlmssp */
-
- cifs_dbg(FYI, "Session Setup\n");
-
- if (!server) {
- WARN(1, "%s: server is NULL!\n", __func__);
- return -EIO;
- }
-
- /*
- * If we are here due to reconnect, free per-smb session key
- * in case signing was required.
- */
- kfree(ses->auth_key.response);
- ses->auth_key.response = NULL;
-
- /*
- * If memory allocation is successful, caller of this function
- * frees it.
- */
- ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
- if (!ses->ntlmssp)
- return -ENOMEM;
- ses->ntlmssp->sesskey_per_smbsess = true;
-
- /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
- if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP)
- ses->sectype = RawNTLMSSP;
-
-ssetup_ntlmssp_authenticate:
- if (phase == NtLmChallenge)
- phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req);
if (rc)
return rc;
req->hdr.SessionId = 0; /* First session, not a reauthenticate */
+
+ /* if reconnect, we need to send previous sess id, otherwise it is 0 */
+ req->PreviousSessionId = sess_data->previous_session;
+
req->Flags = 0; /* MBZ */
/* to enable echos and oplocks */
req->hdr.CreditRequest = cpu_to_le16(3);
@@ -642,199 +644,368 @@ ssetup_ntlmssp_authenticate:
req->Capabilities = 0;
req->Channel = 0; /* MBZ */
- iov[0].iov_base = (char *)req;
+ sess_data->iov[0].iov_base = (char *)req;
/* 4 for rfc1002 length field and 1 for pad */
- iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+ sess_data->iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+ /*
+ * This variable will be used to clear the buffer
+ * allocated above in case of any error in the calling function.
+ */
+ sess_data->buf0_type = CIFS_SMALL_BUFFER;
- if (ses->sectype == Kerberos) {
-#ifdef CONFIG_CIFS_UPCALL
- struct cifs_spnego_msg *msg;
+ return 0;
+}
- spnego_key = cifs_get_spnego_key(ses);
- if (IS_ERR(spnego_key)) {
- rc = PTR_ERR(spnego_key);
- spnego_key = NULL;
- goto ssetup_exit;
- }
+static void
+SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data)
+{
+ free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
+ sess_data->buf0_type = CIFS_NO_BUFFER;
+}
- msg = spnego_key->payload.data[0];
- /*
- * check version field to make sure that cifs.upcall is
- * sending us a response in an expected form
- */
- if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
- cifs_dbg(VFS,
- "bad cifs.upcall version. Expected %d got %d",
- CIFS_SPNEGO_UPCALL_VERSION, msg->version);
- rc = -EKEYREJECTED;
- goto ssetup_exit;
- }
- ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
- GFP_KERNEL);
- if (!ses->auth_key.response) {
- cifs_dbg(VFS,
- "Kerberos can't allocate (%u bytes) memory",
- msg->sesskey_len);
- rc = -ENOMEM;
- goto ssetup_exit;
- }
- ses->auth_key.len = msg->sesskey_len;
- blob_length = msg->secblob_len;
- iov[1].iov_base = msg->data + msg->sesskey_len;
- iov[1].iov_len = blob_length;
-#else
- rc = -EOPNOTSUPP;
- goto ssetup_exit;
-#endif /* CONFIG_CIFS_UPCALL */
- } else if (phase == NtLmNegotiate) { /* if not krb5 must be ntlmssp */
- ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE),
- GFP_KERNEL);
- if (ntlmssp_blob == NULL) {
- rc = -ENOMEM;
- goto ssetup_exit;
- }
- build_ntlmssp_negotiate_blob(ntlmssp_blob, ses);
- if (use_spnego) {
- /* blob_length = build_spnego_ntlmssp_blob(
- &security_blob,
- sizeof(struct _NEGOTIATE_MESSAGE),
- ntlmssp_blob); */
- /* BB eventually need to add this */
- cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
- rc = -EOPNOTSUPP;
- kfree(ntlmssp_blob);
- goto ssetup_exit;
- } else {
- blob_length = sizeof(struct _NEGOTIATE_MESSAGE);
- /* with raw NTLMSSP we don't encapsulate in SPNEGO */
- security_blob = ntlmssp_blob;
- }
- iov[1].iov_base = security_blob;
- iov[1].iov_len = blob_length;
- } else if (phase == NtLmAuthenticate) {
- req->hdr.SessionId = ses->Suid;
- rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses,
- nls_cp);
- if (rc) {
- cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n",
- rc);
- goto ssetup_exit; /* BB double check error handling */
- }
- if (use_spnego) {
- /* blob_length = build_spnego_ntlmssp_blob(
- &security_blob,
- blob_length,
- ntlmssp_blob); */
- cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
- rc = -EOPNOTSUPP;
- kfree(ntlmssp_blob);
- goto ssetup_exit;
- } else {
- security_blob = ntlmssp_blob;
- }
- iov[1].iov_base = security_blob;
- iov[1].iov_len = blob_length;
- } else {
- cifs_dbg(VFS, "illegal ntlmssp phase\n");
- rc = -EIO;
- goto ssetup_exit;
- }
+static int
+SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
+{
+ int rc;
+ struct smb2_sess_setup_req *req = sess_data->iov[0].iov_base;
/* Testing shows that buffer offset must be at location of Buffer[0] */
req->SecurityBufferOffset =
- cpu_to_le16(sizeof(struct smb2_sess_setup_req) -
- 1 /* pad */ - 4 /* rfc1001 len */);
- req->SecurityBufferLength = cpu_to_le16(blob_length);
+ cpu_to_le16(sizeof(struct smb2_sess_setup_req) -
+ 1 /* pad */ - 4 /* rfc1001 len */);
+ req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len);
- inc_rfc1001_len(req, blob_length - 1 /* pad */);
+ inc_rfc1001_len(req, sess_data->iov[1].iov_len - 1 /* pad */);
/* BB add code to build os and lm fields */
- rc = SendReceive2(xid, ses, iov, 2, &resp_buftype,
- CIFS_LOG_ERROR | CIFS_NEG_OP);
+ rc = SendReceive2(sess_data->xid, sess_data->ses,
+ sess_data->iov, 2,
+ &sess_data->buf0_type,
+ CIFS_LOG_ERROR | CIFS_NEG_OP);
- kfree(security_blob);
- rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
- ses->Suid = rsp->hdr.SessionId;
- if (resp_buftype != CIFS_NO_BUFFER &&
- rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) {
- if (phase != NtLmNegotiate) {
- cifs_dbg(VFS, "Unexpected more processing error\n");
- goto ssetup_exit;
- }
- if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 !=
- le16_to_cpu(rsp->SecurityBufferOffset)) {
- cifs_dbg(VFS, "Invalid security buffer offset %d\n",
- le16_to_cpu(rsp->SecurityBufferOffset));
- rc = -EIO;
- goto ssetup_exit;
+ return rc;
+}
+
+static int
+SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
+{
+ int rc = 0;
+ struct cifs_ses *ses = sess_data->ses;
+
+ mutex_lock(&ses->server->srv_mutex);
+ if (ses->server->sign && ses->server->ops->generate_signingkey) {
+ rc = ses->server->ops->generate_signingkey(ses);
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ if (rc) {
+ cifs_dbg(FYI,
+ "SMB3 session key generation failed\n");
+ mutex_unlock(&ses->server->srv_mutex);
+ goto keygen_exit;
}
+ }
+ if (!ses->server->session_estab) {
+ ses->server->sequence_number = 0x2;
+ ses->server->session_estab = true;
+ }
+ mutex_unlock(&ses->server->srv_mutex);
+
+ cifs_dbg(FYI, "SMB2/3 session established successfully\n");
+ spin_lock(&GlobalMid_Lock);
+ ses->status = CifsGood;
+ ses->need_reconnect = false;
+ spin_unlock(&GlobalMid_Lock);
- /* NTLMSSP Negotiate sent now processing challenge (response) */
- phase = NtLmChallenge; /* process ntlmssp challenge */
- rc = 0; /* MORE_PROCESSING is not an error here but expected */
- rc = decode_ntlmssp_challenge(rsp->Buffer,
- le16_to_cpu(rsp->SecurityBufferLength), ses);
+keygen_exit:
+ if (!ses->server->sign) {
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ }
+ return rc;
+}
+
+#ifdef CONFIG_CIFS_UPCALL
+static void
+SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
+{
+ int rc;
+ struct cifs_ses *ses = sess_data->ses;
+ struct cifs_spnego_msg *msg;
+ struct key *spnego_key = NULL;
+ struct smb2_sess_setup_rsp *rsp = NULL;
+
+ rc = SMB2_sess_alloc_buffer(sess_data);
+ if (rc)
+ goto out;
+
+ spnego_key = cifs_get_spnego_key(ses);
+ if (IS_ERR(spnego_key)) {
+ rc = PTR_ERR(spnego_key);
+ spnego_key = NULL;
+ goto out;
}
+ msg = spnego_key->payload.data[0];
/*
- * BB eventually add code for SPNEGO decoding of NtlmChallenge blob,
- * but at least the raw NTLMSSP case works.
+ * check version field to make sure that cifs.upcall is
+ * sending us a response in an expected form
*/
+ if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
+ cifs_dbg(VFS,
+ "bad cifs.upcall version. Expected %d got %d",
+ CIFS_SPNEGO_UPCALL_VERSION, msg->version);
+ rc = -EKEYREJECTED;
+ goto out_put_spnego_key;
+ }
+
+ ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+ GFP_KERNEL);
+ if (!ses->auth_key.response) {
+ cifs_dbg(VFS,
+ "Kerberos can't allocate (%u bytes) memory",
+ msg->sesskey_len);
+ rc = -ENOMEM;
+ goto out_put_spnego_key;
+ }
+ ses->auth_key.len = msg->sesskey_len;
+
+ sess_data->iov[1].iov_base = msg->data + msg->sesskey_len;
+ sess_data->iov[1].iov_len = msg->secblob_len;
+
+ rc = SMB2_sess_sendreceive(sess_data);
+ if (rc)
+ goto out_put_spnego_key;
+
+ rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
+ ses->Suid = rsp->hdr.SessionId;
+
+ ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+ cifs_dbg(VFS, "SMB3 encryption not supported yet\n");
+
+ rc = SMB2_sess_establish_session(sess_data);
+out_put_spnego_key:
+ key_invalidate(spnego_key);
+ key_put(spnego_key);
+out:
+ sess_data->result = rc;
+ sess_data->func = NULL;
+ SMB2_sess_free_buffer(sess_data);
+}
+#else
+static void
+SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
+{
+ cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
+ sess_data->result = -EOPNOTSUPP;
+ sess_data->func = NULL;
+}
+#endif
+
+static void
+SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data);
+
+static void
+SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
+{
+ int rc;
+ struct cifs_ses *ses = sess_data->ses;
+ struct smb2_sess_setup_rsp *rsp = NULL;
+ char *ntlmssp_blob = NULL;
+ bool use_spnego = false; /* else use raw ntlmssp */
+ u16 blob_length = 0;
+
/*
- * No tcon so can't do
- * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
+ * If memory allocation is successful, caller of this function
+ * frees it.
*/
- if (rc != 0)
- goto ssetup_exit;
+ ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
+ if (!ses->ntlmssp) {
+ rc = -ENOMEM;
+ goto out_err;
+ }
+ ses->ntlmssp->sesskey_per_smbsess = true;
+
+ rc = SMB2_sess_alloc_buffer(sess_data);
+ if (rc)
+ goto out_err;
+
+ ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE),
+ GFP_KERNEL);
+ if (ntlmssp_blob == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ build_ntlmssp_negotiate_blob(ntlmssp_blob, ses);
+ if (use_spnego) {
+ /* BB eventually need to add this */
+ cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
+ rc = -EOPNOTSUPP;
+ goto out;
+ } else {
+ blob_length = sizeof(struct _NEGOTIATE_MESSAGE);
+ /* with raw NTLMSSP we don't encapsulate in SPNEGO */
+ }
+ sess_data->iov[1].iov_base = ntlmssp_blob;
+ sess_data->iov[1].iov_len = blob_length;
+
+ rc = SMB2_sess_sendreceive(sess_data);
+ rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
+
+ /* If true, rc here is expected and not an error */
+ if (sess_data->buf0_type != CIFS_NO_BUFFER &&
+ rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
+ rc = 0;
+
+ if (rc)
+ goto out;
+
+ if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 !=
+ le16_to_cpu(rsp->SecurityBufferOffset)) {
+ cifs_dbg(VFS, "Invalid security buffer offset %d\n",
+ le16_to_cpu(rsp->SecurityBufferOffset));
+ rc = -EIO;
+ goto out;
+ }
+ rc = decode_ntlmssp_challenge(rsp->Buffer,
+ le16_to_cpu(rsp->SecurityBufferLength), ses);
+ if (rc)
+ goto out;
+
+ cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
+
+ ses->Suid = rsp->hdr.SessionId;
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
cifs_dbg(VFS, "SMB3 encryption not supported yet\n");
-ssetup_exit:
- free_rsp_buf(resp_buftype, rsp);
-
- /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
- if ((phase == NtLmChallenge) && (rc == 0))
- goto ssetup_ntlmssp_authenticate;
+out:
+ kfree(ntlmssp_blob);
+ SMB2_sess_free_buffer(sess_data);
if (!rc) {
- mutex_lock(&server->srv_mutex);
- if (server->sign && server->ops->generate_signingkey) {
- rc = server->ops->generate_signingkey(ses);
- kfree(ses->auth_key.response);
- ses->auth_key.response = NULL;
- if (rc) {
- cifs_dbg(FYI,
- "SMB3 session key generation failed\n");
- mutex_unlock(&server->srv_mutex);
- goto keygen_exit;
- }
- }
- if (!server->session_estab) {
- server->sequence_number = 0x2;
- server->session_estab = true;
- }
- mutex_unlock(&server->srv_mutex);
-
- cifs_dbg(FYI, "SMB2/3 session established successfully\n");
- spin_lock(&GlobalMid_Lock);
- ses->status = CifsGood;
- ses->need_reconnect = false;
- spin_unlock(&GlobalMid_Lock);
+ sess_data->result = 0;
+ sess_data->func = SMB2_sess_auth_rawntlmssp_authenticate;
+ return;
}
+out_err:
+ kfree(ses->ntlmssp);
+ ses->ntlmssp = NULL;
+ sess_data->result = rc;
+ sess_data->func = NULL;
+}
-keygen_exit:
- if (!server->sign) {
- kfree(ses->auth_key.response);
- ses->auth_key.response = NULL;
+static void
+SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
+{
+ int rc;
+ struct cifs_ses *ses = sess_data->ses;
+ struct smb2_sess_setup_req *req;
+ struct smb2_sess_setup_rsp *rsp = NULL;
+ unsigned char *ntlmssp_blob = NULL;
+ bool use_spnego = false; /* else use raw ntlmssp */
+ u16 blob_length = 0;
+
+ rc = SMB2_sess_alloc_buffer(sess_data);
+ if (rc)
+ goto out;
+
+ req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base;
+ req->hdr.SessionId = ses->Suid;
+
+ rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses,
+ sess_data->nls_cp);
+ if (rc) {
+ cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", rc);
+ goto out;
}
- if (spnego_key) {
- key_invalidate(spnego_key);
- key_put(spnego_key);
+
+ if (use_spnego) {
+ /* BB eventually need to add this */
+ cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
+ rc = -EOPNOTSUPP;
+ goto out;
}
+ sess_data->iov[1].iov_base = ntlmssp_blob;
+ sess_data->iov[1].iov_len = blob_length;
+
+ rc = SMB2_sess_sendreceive(sess_data);
+ if (rc)
+ goto out;
+
+ rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
+
+ ses->Suid = rsp->hdr.SessionId;
+ ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+ cifs_dbg(VFS, "SMB3 encryption not supported yet\n");
+
+ rc = SMB2_sess_establish_session(sess_data);
+out:
+ kfree(ntlmssp_blob);
+ SMB2_sess_free_buffer(sess_data);
kfree(ses->ntlmssp);
+ ses->ntlmssp = NULL;
+ sess_data->result = rc;
+ sess_data->func = NULL;
+}
+static int
+SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data)
+{
+ if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP)
+ ses->sectype = RawNTLMSSP;
+
+ switch (ses->sectype) {
+ case Kerberos:
+ sess_data->func = SMB2_auth_kerberos;
+ break;
+ case RawNTLMSSP:
+ sess_data->func = SMB2_sess_auth_rawntlmssp_negotiate;
+ break;
+ default:
+ cifs_dbg(VFS, "secType %d not supported!\n", ses->sectype);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+int
+SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_cp)
+{
+ int rc = 0;
+ struct TCP_Server_Info *server = ses->server;
+ struct SMB2_sess_data *sess_data;
+
+ cifs_dbg(FYI, "Session Setup\n");
+
+ if (!server) {
+ WARN(1, "%s: server is NULL!\n", __func__);
+ return -EIO;
+ }
+
+ sess_data = kzalloc(sizeof(struct SMB2_sess_data), GFP_KERNEL);
+ if (!sess_data)
+ return -ENOMEM;
+
+ rc = SMB2_select_sec(ses, sess_data);
+ if (rc)
+ goto out;
+ sess_data->xid = xid;
+ sess_data->ses = ses;
+ sess_data->buf0_type = CIFS_NO_BUFFER;
+ sess_data->nls_cp = (struct nls_table *) nls_cp;
+
+ while (sess_data->func)
+ sess_data->func(sess_data);
+
+ rc = sess_data->result;
+out:
+ kfree(sess_data);
return rc;
}
@@ -1164,7 +1335,7 @@ create_durable_v2_buf(struct cifs_fid *pfid)
buf->dcontext.Timeout = 0; /* Should this be configurable by workload */
buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
- get_random_bytes(buf->dcontext.CreateGuid, 16);
+ generate_random_uuid(buf->dcontext.CreateGuid);
memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
/* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */
@@ -2057,6 +2228,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
if (rdata->credits) {
buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
SMB2_MAX_BUFFER_SIZE));
+ buf->CreditRequest = buf->CreditCharge;
spin_lock(&server->req_lock);
server->credits += rdata->credits -
le16_to_cpu(buf->CreditCharge);
@@ -2243,6 +2415,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (wdata->credits) {
req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
SMB2_MAX_BUFFER_SIZE));
+ req->hdr.CreditRequest = req->hdr.CreditCharge;
spin_lock(&server->req_lock);
server->credits += wdata->credits -
le16_to_cpu(req->hdr.CreditCharge);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index ff88d9feb01e..fd3709e8de33 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -276,7 +276,7 @@ struct smb2_sess_setup_req {
__le32 Channel;
__le16 SecurityBufferOffset;
__le16 SecurityBufferLength;
- __le64 PreviousSessionId;
+ __u64 PreviousSessionId;
__u8 Buffer[1]; /* variable length GSS security buffer */
} __packed;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 5e23f64c0804..20af5187ba63 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -33,7 +33,8 @@
#define MAX_EA_VALUE_SIZE 65535
#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
-
+#define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */
+#define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */
/* BB need to add server (Samba e.g) support for security and trusted prefix */
enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT };
@@ -144,6 +145,54 @@ out:
return rc;
}
+static int cifs_attrib_get(struct dentry *dentry,
+ struct inode *inode, void *value,
+ size_t size)
+{
+ ssize_t rc;
+ __u32 *pattribute;
+
+ rc = cifs_revalidate_dentry_attr(dentry);
+
+ if (rc)
+ return rc;
+
+ if ((value == NULL) || (size == 0))
+ return sizeof(__u32);
+ else if (size < sizeof(__u32))
+ return -ERANGE;
+
+ /* return dos attributes as pseudo xattr */
+ pattribute = (__u32 *)value;
+ *pattribute = CIFS_I(inode)->cifsAttrs;
+
+ return sizeof(__u32);
+}
+
+static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode,
+ void *value, size_t size)
+{
+ ssize_t rc;
+ __u64 * pcreatetime;
+
+ rc = cifs_revalidate_dentry_attr(dentry);
+ if (rc)
+ return rc;
+
+ if ((value == NULL) || (size == 0))
+ return sizeof(__u64);
+ else if (size < sizeof(__u64))
+ return -ERANGE;
+
+ /* return dos attributes as pseudo xattr */
+ pcreatetime = (__u64 *)value;
+ *pcreatetime = CIFS_I(inode)->createtime;
+ return sizeof(__u64);
+
+ return rc;
+}
+
+
static int cifs_xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *value, size_t size)
@@ -168,10 +217,19 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
rc = -ENOMEM;
goto out;
}
- /* return dos attributes as pseudo xattr */
+
/* return alt name if available as pseudo attr */
switch (handler->flags) {
case XATTR_USER:
+ cifs_dbg(FYI, "%s:querying user xattr %s\n", __func__, name);
+ if (strcmp(name, CIFS_XATTR_ATTRIB) == 0) {
+ rc = cifs_attrib_get(dentry, inode, value, size);
+ break;
+ } else if (strcmp(name, CIFS_XATTR_CREATETIME) == 0) {
+ rc = cifs_creation_time_get(dentry, inode, value, size);
+ break;
+ }
+
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto out;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 6fb8672c0892..c0474ac6cbf2 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -109,7 +109,7 @@ static inline void coda_dir_update_mtime(struct inode *dir)
/* optimistically we can also act as if our nose bleeds. The
* granularity of the mtime is coarse anyways so we might actually be
* right most of the time. Note: we only do this for directories. */
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
#endif
}
@@ -291,7 +291,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
/* rename */
static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
const char *old_name = old_dentry->d_name.name;
const char *new_name = new_dentry->d_name.name;
@@ -299,6 +300,9 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
int new_length = new_dentry->d_name.len;
int error;
+ if (flags)
+ return -EINVAL;
+
error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
coda_i2f(new_dir), old_length, new_length,
(const char *) old_name, (const char *)new_name);
diff --git a/fs/coda/file.c b/fs/coda/file.c
index f47c7483863b..6e0154eb6fcc 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -38,27 +38,6 @@ coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
static ssize_t
-coda_file_splice_read(struct file *coda_file, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t count,
- unsigned int flags)
-{
- ssize_t (*splice_read)(struct file *, loff_t *,
- struct pipe_inode_info *, size_t, unsigned int);
- struct coda_file_info *cfi;
- struct file *host_file;
-
- cfi = CODA_FTOC(coda_file);
- BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
- host_file = cfi->cfi_container;
-
- splice_read = host_file->f_op->splice_read;
- if (!splice_read)
- splice_read = default_file_splice_read;
-
- return splice_read(host_file, ppos, pipe, count, flags);
-}
-
-static ssize_t
coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *coda_file = iocb->ki_filp;
@@ -75,7 +54,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
- coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
+ coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode);
inode_unlock(coda_inode);
file_end_write(host_file);
return ret;
@@ -225,6 +204,6 @@ const struct file_operations coda_file_operations = {
.open = coda_open,
.release = coda_release,
.fsync = coda_fsync,
- .splice_read = coda_file_splice_read,
+ .splice_read = generic_file_splice_read,
};
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 57e81cbba0fa..71dbe7e287ce 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -271,7 +271,7 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
memset(&vattr, 0, sizeof(vattr));
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
coda_iattr_to_vattr(iattr, &vattr);
vattr.va_type = C_VNON; /* cannot set type */
diff --git a/fs/compat.c b/fs/compat.c
index be6e48b0a46c..bd064a2c3550 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -54,20 +54,6 @@
#include <asm/ioctls.h>
#include "internal.h"
-int compat_log = 1;
-
-int compat_printk(const char *fmt, ...)
-{
- va_list ap;
- int ret;
- if (!compat_log)
- return 0;
- va_start(ap, fmt);
- ret = vprintk(fmt, ap);
- va_end(ap);
- return ret;
-}
-
/*
* Not all architectures have sys_utime, so implement this in terms
* of sys_utimes.
@@ -562,7 +548,7 @@ ssize_t compat_rw_copy_check_uvector(int type,
goto out;
ret = -EINVAL;
- if (nr_segs > UIO_MAXIOV || nr_segs < 0)
+ if (nr_segs > UIO_MAXIOV)
goto out;
if (nr_segs > fast_segs) {
ret = -ENOMEM;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c1e9f29c924c..f2d7402abe02 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1209,6 +1209,8 @@ COMPATIBLE_IOCTL(WDIOC_SETOPTIONS)
COMPATIBLE_IOCTL(WDIOC_KEEPALIVE)
COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT)
COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT)
+COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT)
+COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT)
/* Big R */
COMPATIBLE_IOCTL(RNDGETENTCNT)
COMPATIBLE_IOCTL(RNDADDTOENTCNT)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 0387968e6f47..ad718e5e37bb 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -76,7 +76,7 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
sd_iattr->ia_uid = GLOBAL_ROOT_UID;
sd_iattr->ia_gid = GLOBAL_ROOT_GID;
sd_iattr->ia_atime = sd_iattr->ia_mtime =
- sd_iattr->ia_ctime = current_fs_time(inode->i_sb);
+ sd_iattr->ia_ctime = current_time(inode);
sd->s_iattr = sd_iattr;
}
/* attributes were changed atleast once in past */
@@ -113,7 +113,7 @@ static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
inode->i_atime = inode->i_mtime =
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
}
static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -197,7 +197,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in
return -ENOMEM;
p_inode = d_inode(dentry->d_parent);
- p_inode->i_mtime = p_inode->i_ctime = current_fs_time(p_inode->i_sb);
+ p_inode->i_mtime = p_inode->i_ctime = current_time(p_inode);
configfs_set_inode_lock_class(sd, inode);
init(inode);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index c502c116924c..98f87fe8f186 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -28,7 +28,6 @@
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/fscrypto.h>
-#include <linux/ecryptfs.h>
static unsigned int num_prealloc_crypto_pages = 32;
static unsigned int num_prealloc_crypto_ctxs = 128;
@@ -128,11 +127,11 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags)
EXPORT_SYMBOL(fscrypt_get_ctx);
/**
- * fscrypt_complete() - The completion callback for page encryption
- * @req: The asynchronous encryption request context
- * @res: The result of the encryption operation
+ * page_crypt_complete() - completion callback for page crypto
+ * @req: The asynchronous cipher request context
+ * @res: The result of the cipher operation
*/
-static void fscrypt_complete(struct crypto_async_request *req, int res)
+static void page_crypt_complete(struct crypto_async_request *req, int res)
{
struct fscrypt_completion_result *ecr = req->data;
@@ -152,7 +151,10 @@ static int do_page_crypto(struct inode *inode,
struct page *src_page, struct page *dest_page,
gfp_t gfp_flags)
{
- u8 xts_tweak[FS_XTS_TWEAK_SIZE];
+ struct {
+ __le64 index;
+ u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)];
+ } xts_tweak;
struct skcipher_request *req = NULL;
DECLARE_FS_COMPLETION_RESULT(ecr);
struct scatterlist dst, src;
@@ -170,19 +172,17 @@ static int do_page_crypto(struct inode *inode,
skcipher_request_set_callback(
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- fscrypt_complete, &ecr);
+ page_crypt_complete, &ecr);
- BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index));
- memcpy(xts_tweak, &index, sizeof(index));
- memset(&xts_tweak[sizeof(index)], 0,
- FS_XTS_TWEAK_SIZE - sizeof(index));
+ BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE);
+ xts_tweak.index = cpu_to_le64(index);
+ memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding));
sg_init_table(&dst, 1);
sg_set_page(&dst, dest_page, PAGE_SIZE, 0);
sg_init_table(&src, 1);
sg_set_page(&src, src_page, PAGE_SIZE, 0);
- skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE,
- xts_tweak);
+ skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak);
if (rw == FS_DECRYPT)
res = crypto_skcipher_decrypt(req);
else
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 5d6d49113efa..9a28133ac3b8 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -10,21 +10,16 @@
* This has not yet undergone a rigorous security audit.
*/
-#include <keys/encrypted-type.h>
-#include <keys/user-type.h>
#include <linux/scatterlist.h>
#include <linux/ratelimit.h>
#include <linux/fscrypto.h>
-static u32 size_round_up(size_t size, size_t blksize)
-{
- return ((size + blksize - 1) / blksize) * blksize;
-}
-
/**
- * dir_crypt_complete() -
+ * fname_crypt_complete() - completion callback for filename crypto
+ * @req: The asynchronous cipher request context
+ * @res: The result of the cipher operation
*/
-static void dir_crypt_complete(struct crypto_async_request *req, int res)
+static void fname_crypt_complete(struct crypto_async_request *req, int res)
{
struct fscrypt_completion_result *ecr = req->data;
@@ -35,11 +30,11 @@ static void dir_crypt_complete(struct crypto_async_request *req, int res)
}
/**
- * fname_encrypt() -
+ * fname_encrypt() - encrypt a filename
*
- * This function encrypts the input filename, and returns the length of the
- * ciphertext. Errors are returned as negative numbers. We trust the caller to
- * allocate sufficient memory to oname string.
+ * The caller must have allocated sufficient memory for the @oname string.
+ *
+ * Return: 0 on success, -errno on failure
*/
static int fname_encrypt(struct inode *inode,
const struct qstr *iname, struct fscrypt_str *oname)
@@ -60,10 +55,9 @@ static int fname_encrypt(struct inode *inode,
if (iname->len <= 0 || iname->len > lim)
return -EIO;
- ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ?
- FS_CRYPTO_BLOCK_SIZE : iname->len;
- ciphertext_len = size_round_up(ciphertext_len, padding);
- ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len;
+ ciphertext_len = max(iname->len, (u32)FS_CRYPTO_BLOCK_SIZE);
+ ciphertext_len = round_up(ciphertext_len, padding);
+ ciphertext_len = min(ciphertext_len, lim);
if (ciphertext_len <= sizeof(buf)) {
workbuf = buf;
@@ -84,7 +78,7 @@ static int fname_encrypt(struct inode *inode,
}
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- dir_crypt_complete, &ecr);
+ fname_crypt_complete, &ecr);
/* Copy the input */
memcpy(workbuf, iname->name, iname->len);
@@ -105,20 +99,22 @@ static int fname_encrypt(struct inode *inode,
}
kfree(alloc_buf);
skcipher_request_free(req);
- if (res < 0)
+ if (res < 0) {
printk_ratelimited(KERN_ERR
"%s: Error (error code %d)\n", __func__, res);
+ return res;
+ }
oname->len = ciphertext_len;
- return res;
+ return 0;
}
-/*
- * fname_decrypt()
- * This function decrypts the input filename, and returns
- * the length of the plaintext.
- * Errors are returned as negative numbers.
- * We trust the caller to allocate sufficient memory to oname string.
+/**
+ * fname_decrypt() - decrypt a filename
+ *
+ * The caller must have allocated sufficient memory for the @oname string.
+ *
+ * Return: 0 on success, -errno on failure
*/
static int fname_decrypt(struct inode *inode,
const struct fscrypt_str *iname,
@@ -146,7 +142,7 @@ static int fname_decrypt(struct inode *inode,
}
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- dir_crypt_complete, &ecr);
+ fname_crypt_complete, &ecr);
/* Initialize IV */
memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
@@ -168,7 +164,7 @@ static int fname_decrypt(struct inode *inode,
}
oname->len = strnlen(oname->name, iname->len);
- return oname->len;
+ return 0;
}
static const char *lookup_table =
@@ -231,9 +227,8 @@ u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
if (ci)
padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
- if (ilen < FS_CRYPTO_BLOCK_SIZE)
- ilen = FS_CRYPTO_BLOCK_SIZE;
- return size_round_up(ilen, padding);
+ ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE);
+ return round_up(ilen, padding);
}
EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
@@ -279,6 +274,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer);
/**
* fscrypt_fname_disk_to_usr() - converts a filename from disk space to user
* space
+ *
+ * The caller must have allocated sufficient memory for the @oname string.
+ *
+ * Return: 0 on success, -errno on failure
*/
int fscrypt_fname_disk_to_usr(struct inode *inode,
u32 hash, u32 minor_hash,
@@ -287,13 +286,12 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
{
const struct qstr qname = FSTR_TO_QSTR(iname);
char buf[24];
- int ret;
if (fscrypt_is_dot_dotdot(&qname)) {
oname->name[0] = '.';
oname->name[iname->len - 1] = '.';
oname->len = iname->len;
- return oname->len;
+ return 0;
}
if (iname->len < FS_CRYPTO_BLOCK_SIZE)
@@ -303,9 +301,9 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
return fname_decrypt(inode, iname, oname);
if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) {
- ret = digest_encode(iname->name, iname->len, oname->name);
- oname->len = ret;
- return ret;
+ oname->len = digest_encode(iname->name, iname->len,
+ oname->name);
+ return 0;
}
if (hash) {
memcpy(buf, &hash, 4);
@@ -315,15 +313,18 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
}
memcpy(buf + 8, iname->name + iname->len - 16, 16);
oname->name[0] = '_';
- ret = digest_encode(buf, 24, oname->name + 1);
- oname->len = ret + 1;
- return ret + 1;
+ oname->len = 1 + digest_encode(buf, 24, oname->name + 1);
+ return 0;
}
EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
/**
* fscrypt_fname_usr_to_disk() - converts a filename from user space to disk
* space
+ *
+ * The caller must have allocated sufficient memory for the @oname string.
+ *
+ * Return: 0 on success, -errno on failure
*/
int fscrypt_fname_usr_to_disk(struct inode *inode,
const struct qstr *iname,
@@ -333,7 +334,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode,
oname->name[0] = '.';
oname->name[iname->len - 1] = '.';
oname->len = iname->len;
- return oname->len;
+ return 0;
}
if (inode->i_crypt_info)
return fname_encrypt(inode, iname, oname);
@@ -367,10 +368,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
if (dir->i_crypt_info) {
ret = fscrypt_fname_alloc_buffer(dir, iname->len,
&fname->crypto_buf);
- if (ret < 0)
+ if (ret)
return ret;
ret = fname_encrypt(dir, iname, &fname->crypto_buf);
- if (ret < 0)
+ if (ret)
goto errout;
fname->disk_name.name = fname->crypto_buf.name;
fname->disk_name.len = fname->crypto_buf.len;
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 1ac263eddc4e..82f0285f5d08 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -8,11 +8,8 @@
* Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
*/
-#include <keys/encrypted-type.h>
#include <keys/user-type.h>
-#include <linux/random.h>
#include <linux/scatterlist.h>
-#include <uapi/linux/keyctl.h>
#include <linux/fscrypto.h>
static void derive_crypt_complete(struct crypto_async_request *req, int rc)
@@ -139,6 +136,38 @@ out:
return res;
}
+static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode,
+ const char **cipher_str_ret, int *keysize_ret)
+{
+ if (S_ISREG(inode->i_mode)) {
+ if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) {
+ *cipher_str_ret = "xts(aes)";
+ *keysize_ret = FS_AES_256_XTS_KEY_SIZE;
+ return 0;
+ }
+ pr_warn_once("fscrypto: unsupported contents encryption mode "
+ "%d for inode %lu\n",
+ ci->ci_data_mode, inode->i_ino);
+ return -ENOKEY;
+ }
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+ if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) {
+ *cipher_str_ret = "cts(cbc(aes))";
+ *keysize_ret = FS_AES_256_CTS_KEY_SIZE;
+ return 0;
+ }
+ pr_warn_once("fscrypto: unsupported filenames encryption mode "
+ "%d for inode %lu\n",
+ ci->ci_filename_mode, inode->i_ino);
+ return -ENOKEY;
+ }
+
+ pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n",
+ (inode->i_mode & S_IFMT), inode->i_ino);
+ return -ENOKEY;
+}
+
static void put_crypt_info(struct fscrypt_info *ci)
{
if (!ci)
@@ -155,8 +184,8 @@ int get_crypt_info(struct inode *inode)
struct fscrypt_context ctx;
struct crypto_skcipher *ctfm;
const char *cipher_str;
+ int keysize;
u8 raw_key[FS_MAX_KEY_SIZE];
- u8 mode;
int res;
res = fscrypt_initialize();
@@ -179,13 +208,19 @@ retry:
if (res < 0) {
if (!fscrypt_dummy_context_enabled(inode))
return res;
+ ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
ctx.flags = 0;
} else if (res != sizeof(ctx)) {
return -EINVAL;
}
- res = 0;
+
+ if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
+ return -EINVAL;
+
+ if (ctx.flags & ~FS_POLICY_FLAGS_VALID)
+ return -EINVAL;
crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
if (!crypt_info)
@@ -198,27 +233,11 @@ retry:
crypt_info->ci_keyring_key = NULL;
memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
sizeof(crypt_info->ci_master_key));
- if (S_ISREG(inode->i_mode))
- mode = crypt_info->ci_data_mode;
- else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- mode = crypt_info->ci_filename_mode;
- else
- BUG();
-
- switch (mode) {
- case FS_ENCRYPTION_MODE_AES_256_XTS:
- cipher_str = "xts(aes)";
- break;
- case FS_ENCRYPTION_MODE_AES_256_CTS:
- cipher_str = "cts(cbc(aes))";
- break;
- default:
- printk_once(KERN_WARNING
- "%s: unsupported key mode %d (ino %u)\n",
- __func__, mode, (unsigned) inode->i_ino);
- res = -ENOKEY;
+
+ res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize);
+ if (res)
goto out;
- }
+
if (fscrypt_dummy_context_enabled(inode)) {
memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
goto got_key;
@@ -253,7 +272,7 @@ got_key:
crypt_info->ci_ctfm = ctfm;
crypto_skcipher_clear_flags(ctfm, ~0);
crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
- res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode));
+ res = crypto_skcipher_setkey(ctfm, raw_key, keysize);
if (res)
goto out;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index ed115acb5dee..6865663aac69 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -109,6 +109,8 @@ int fscrypt_process_policy(struct file *filp,
if (ret)
return ret;
+ inode_lock(inode);
+
if (!inode_has_encryption_context(inode)) {
if (!S_ISDIR(inode->i_mode))
ret = -EINVAL;
@@ -127,6 +129,8 @@ int fscrypt_process_policy(struct file *filp,
ret = -EINVAL;
}
+ inode_unlock(inode);
+
mnt_drop_write_file(filp);
return ret;
}
diff --git a/fs/dax.c b/fs/dax.c
index 993dc6fe0416..014defd2e744 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,8 @@
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
/*
* We use lowest available bit in exceptional entry for locking, other two
@@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
return VM_FAULT_LOCKED;
}
-static int copy_user_bh(struct page *to, struct inode *inode,
- struct buffer_head *bh, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
+ struct page *to, unsigned long vaddr)
{
struct blk_dax_ctl dax = {
- .sector = to_sector(bh, inode),
- .size = bh->b_size,
+ .sector = sector,
+ .size = size,
};
- struct block_device *bdev = bh->b_bdev;
void *vto;
if (dax_map_atomic(bdev, &dax) < 0)
@@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_insert_mapping(struct address_space *mapping,
- struct buffer_head *bh, void **entryp,
- struct vm_area_struct *vma, struct vm_fault *vmf)
+ struct block_device *bdev, sector_t sector, size_t size,
+ void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = (unsigned long)vmf->virtual_address;
- struct block_device *bdev = bh->b_bdev;
struct blk_dax_ctl dax = {
- .sector = to_sector(bh, mapping->host),
- .size = bh->b_size,
+ .sector = sector,
+ .size = size,
};
void *ret;
void *entry = *entryp;
@@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (vmf->cow_page) {
struct page *new_page = vmf->cow_page;
if (buffer_written(&bh))
- error = copy_user_bh(new_page, inode, &bh, vaddr);
+ error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+ bh.b_size, new_page, vaddr);
else
clear_user_highpage(new_page, vaddr);
if (error)
@@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
/* Filesystem should not return unwritten buffers to us! */
WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
- error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+ error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+ bh.b_size, &entry, vma, vmf);
unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
@@ -1034,7 +1036,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if (!write && !buffer_mapped(&bh)) {
spinlock_t *ptl;
pmd_t entry;
- struct page *zero_page = get_huge_zero_page();
+ struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
if (unlikely(!zero_page)) {
dax_pmd_dbg(&bh, address, "no zero page");
@@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ struct iov_iter *iter = data;
+ loff_t end = pos + length, done = 0;
+ ssize_t ret = 0;
+
+ if (iov_iter_rw(iter) == READ) {
+ end = min(end, i_size_read(inode));
+ if (pos >= end)
+ return 0;
+
+ if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+ return iov_iter_zero(min(length, end - pos), iter);
+ }
+
+ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+ return -EIO;
+
+ while (pos < end) {
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ struct blk_dax_ctl dax = { 0 };
+ ssize_t map_len;
+
+ dax.sector = iomap->blkno +
+ (((pos & PAGE_MASK) - iomap->offset) >> 9);
+ dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+ map_len = dax_map_atomic(iomap->bdev, &dax);
+ if (map_len < 0) {
+ ret = map_len;
+ break;
+ }
+
+ dax.addr += offset;
+ map_len -= offset;
+ if (map_len > end - pos)
+ map_len = end - pos;
+
+ if (iov_iter_rw(iter) == WRITE)
+ map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+ else
+ map_len = copy_to_iter(dax.addr, map_len, iter);
+ dax_unmap_atomic(iomap->bdev, &dax);
+ if (map_len <= 0) {
+ ret = map_len ? map_len : -EFAULT;
+ break;
+ }
+
+ pos += map_len;
+ length -= map_len;
+ done += map_len;
+ }
+
+ return done ? done : ret;
+}
+
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb: The control block for this I/O
+ * @iter: The addresses to do I/O from or to
+ * @ops: iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory. The callers needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+ struct iomap_ops *ops)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+ unsigned flags = 0;
+
+ if (iov_iter_rw(iter) == WRITE)
+ flags |= IOMAP_WRITE;
+
+ /*
+ * Yes, even DAX files can have page cache attached to them: A zeroed
+ * page is inserted into the pagecache when we have to serve a write
+ * fault on a hole. It should never be dirtied and can simply be
+ * dropped from the pagecache once we get real data for the page.
+ *
+ * XXX: This is racy against mmap, and there's nothing we can do about
+ * it. We'll eventually need to shift this down even further so that
+ * we can check if we allocated blocks over a hole first.
+ */
+ if (mapping->nrpages) {
+ ret = invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT,
+ (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ }
+
+ while (iov_iter_count(iter)) {
+ ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
+ iter, iomap_dax_actor);
+ if (ret <= 0)
+ break;
+ pos += ret;
+ done += ret;
+ }
+
+ iocb->ki_pos += done;
+ return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
+
+/**
+ * iomap_dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in their fault
+ * or mkwrite handler for DAX files. Assumes the caller has done all the
+ * necessary locking for the page fault to proceed successfully.
+ */
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ struct iomap_ops *ops)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+ sector_t sector;
+ struct iomap iomap = { 0 };
+ unsigned flags = 0;
+ int error, major = 0;
+ void *entry;
+
+ /*
+ * Check whether offset isn't beyond end of file now. Caller is supposed
+ * to hold locks serializing us with truncate / punch hole so this is
+ * a reliable test.
+ */
+ if (pos >= i_size_read(inode))
+ return VM_FAULT_SIGBUS;
+
+ entry = grab_mapping_entry(mapping, vmf->pgoff);
+ if (IS_ERR(entry)) {
+ error = PTR_ERR(entry);
+ goto out;
+ }
+
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+ flags |= IOMAP_WRITE;
+
+ /*
+ * Note that we don't bother to use iomap_apply here: DAX required
+ * the file system block size to be equal the page size, which means
+ * that we never have to deal with more than a single extent here.
+ */
+ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+ if (error)
+ goto unlock_entry;
+ if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+ error = -EIO; /* fs corruption? */
+ goto unlock_entry;
+ }
+
+ sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+
+ if (vmf->cow_page) {
+ switch (iomap.type) {
+ case IOMAP_HOLE:
+ case IOMAP_UNWRITTEN:
+ clear_user_highpage(vmf->cow_page, vaddr);
+ break;
+ case IOMAP_MAPPED:
+ error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
+ vmf->cow_page, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ }
+
+ if (error)
+ goto unlock_entry;
+ if (!radix_tree_exceptional_entry(entry)) {
+ vmf->page = entry;
+ return VM_FAULT_LOCKED;
+ }
+ vmf->entry = entry;
+ return VM_FAULT_DAX_LOCKED;
+ }
+
+ switch (iomap.type) {
+ case IOMAP_MAPPED:
+ if (iomap.flags & IOMAP_F_NEW) {
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ major = VM_FAULT_MAJOR;
+ }
+ error = dax_insert_mapping(mapping, iomap.bdev, sector,
+ PAGE_SIZE, &entry, vma, vmf);
+ break;
+ case IOMAP_UNWRITTEN:
+ case IOMAP_HOLE:
+ if (!(vmf->flags & FAULT_FLAG_WRITE))
+ return dax_load_hole(mapping, entry, vmf);
+ /*FALLTHRU*/
+ default:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ }
+
+ unlock_entry:
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM | major;
+ /* -EBUSY is fine, somebody else faulted on the same PTE */
+ if (error < 0 && error != -EBUSY)
+ return VM_FAULT_SIGBUS | major;
+ return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_fault);
+#endif /* CONFIG_FS_IOMAP */
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 592059f88e04..354e2ab62031 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -97,9 +97,6 @@ EXPORT_SYMBOL_GPL(debugfs_use_file_finish);
#define F_DENTRY(filp) ((filp)->f_path.dentry)
-#define REAL_FOPS_DEREF(dentry) \
- ((const struct file_operations *)(dentry)->d_fsdata)
-
static int open_proxy_open(struct inode *inode, struct file *filp)
{
const struct dentry *dentry = F_DENTRY(filp);
@@ -112,7 +109,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
goto out;
}
- real_fops = REAL_FOPS_DEREF(dentry);
+ real_fops = debugfs_real_fops(filp);
real_fops = fops_get(real_fops);
if (!real_fops) {
/* Huh? Module did not clean up after itself at exit? */
@@ -143,7 +140,7 @@ static ret_type full_proxy_ ## name(proto) \
{ \
const struct dentry *dentry = F_DENTRY(filp); \
const struct file_operations *real_fops = \
- REAL_FOPS_DEREF(dentry); \
+ debugfs_real_fops(filp); \
int srcu_idx; \
ret_type r; \
\
@@ -176,7 +173,7 @@ static unsigned int full_proxy_poll(struct file *filp,
struct poll_table_struct *wait)
{
const struct dentry *dentry = F_DENTRY(filp);
- const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry);
+ const struct file_operations *real_fops = debugfs_real_fops(filp);
int srcu_idx;
unsigned int r = 0;
@@ -193,7 +190,7 @@ static unsigned int full_proxy_poll(struct file *filp,
static int full_proxy_release(struct inode *inode, struct file *filp)
{
const struct dentry *dentry = F_DENTRY(filp);
- const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry);
+ const struct file_operations *real_fops = debugfs_real_fops(filp);
const struct file_operations *proxy_fops = filp->f_op;
int r = 0;
@@ -209,7 +206,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
replace_fops(filp, d_inode(dentry)->i_fop);
kfree((void *)proxy_fops);
fops_put(real_fops);
- return 0;
+ return r;
}
static void __full_proxy_fops_init(struct file_operations *proxy_fops,
@@ -241,7 +238,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
goto out;
}
- real_fops = REAL_FOPS_DEREF(dentry);
+ real_fops = debugfs_real_fops(filp);
real_fops = fops_get(real_fops);
if (!real_fops) {
/* Huh? Module did not cleanup after itself at exit? */
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 72361baf9da7..f17fcf89e18e 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -45,7 +45,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
if (inode) {
inode->i_ino = get_next_ino();
inode->i_atime = inode->i_mtime =
- inode->i_ctime = current_fs_time(sb);
+ inode->i_ctime = current_time(inode);
}
return inode;
}
@@ -748,7 +748,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
old_name = fsnotify_oldname_init(old_dentry->d_name.name);
error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir),
- dentry);
+ dentry, 0);
if (error) {
fsnotify_oldname_free(old_name);
goto exit;
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index bba52634b995..b3e8443a1f47 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -19,8 +19,4 @@ extern const struct file_operations debugfs_noop_file_operations;
extern const struct file_operations debugfs_open_proxy_file_operations;
extern const struct file_operations debugfs_full_proxy_file_operations;
-struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fops);
-
#endif /* _DEBUGFS_INTERNAL_H_ */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 79a5941c2474..108df2e3602c 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -272,13 +272,8 @@ static int mknod_ptmx(struct super_block *sb)
struct dentry *root = sb->s_root;
struct pts_fs_info *fsi = DEVPTS_SB(sb);
struct pts_mount_opts *opts = &fsi->mount_opts;
- kuid_t root_uid;
- kgid_t root_gid;
-
- root_uid = make_kuid(current_user_ns(), 0);
- root_gid = make_kgid(current_user_ns(), 0);
- if (!uid_valid(root_uid) || !gid_valid(root_gid))
- return -EINVAL;
+ kuid_t ptmx_uid = current_fsuid();
+ kgid_t ptmx_gid = current_fsgid();
inode_lock(d_inode(root));
@@ -305,12 +300,12 @@ static int mknod_ptmx(struct super_block *sb)
}
inode->i_ino = 2;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
mode = S_IFCHR|opts->ptmxmode;
init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
- inode->i_uid = root_uid;
- inode->i_gid = root_gid;
+ inode->i_uid = ptmx_uid;
+ inode->i_gid = ptmx_gid;
d_add(dentry, inode);
@@ -336,7 +331,6 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
struct pts_fs_info *fsi = DEVPTS_SB(sb);
struct pts_mount_opts *opts = &fsi->mount_opts;
- sync_filesystem(sb);
err = parse_mount_options(data, PARSE_REMOUNT, opts);
/*
@@ -395,6 +389,7 @@ static int
devpts_fill_super(struct super_block *s, void *data, int silent)
{
struct inode *inode;
+ int error;
s->s_iflags &= ~SB_I_NODEV;
s->s_blocksize = 1024;
@@ -403,28 +398,42 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_op = &devpts_sops;
s->s_time_gran = 1;
+ error = -ENOMEM;
s->s_fs_info = new_pts_fs_info(s);
if (!s->s_fs_info)
goto fail;
+ error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts);
+ if (error)
+ goto fail;
+
+ error = -ENOMEM;
inode = new_inode(s);
if (!inode)
goto fail;
inode->i_ino = 1;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
s->s_root = d_make_root(inode);
- if (s->s_root)
- return 0;
+ if (!s->s_root) {
+ pr_err("get root dentry failed\n");
+ goto fail;
+ }
- pr_err("get root dentry failed\n");
+ error = mknod_ptmx(s);
+ if (error)
+ goto fail_dput;
+ return 0;
+fail_dput:
+ dput(s->s_root);
+ s->s_root = NULL;
fail:
- return -ENOMEM;
+ return error;
}
/*
@@ -436,43 +445,15 @@ fail:
static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- int error;
- struct pts_mount_opts opts;
- struct super_block *s;
-
- error = parse_mount_options(data, PARSE_MOUNT, &opts);
- if (error)
- return ERR_PTR(error);
-
- s = sget(fs_type, NULL, set_anon_super, flags, NULL);
- if (IS_ERR(s))
- return ERR_CAST(s);
-
- if (!s->s_root) {
- error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
- if (error)
- goto out_undo_sget;
- s->s_flags |= MS_ACTIVE;
- }
-
- memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
-
- error = mknod_ptmx(s);
- if (error)
- goto out_undo_sget;
-
- return dget(s->s_root);
-
-out_undo_sget:
- deactivate_locked_super(s);
- return ERR_PTR(error);
+ return mount_nodev(fs_type, flags, data, devpts_fill_super);
}
static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
- ida_destroy(&fsi->allocated_ptys);
+ if (fsi)
+ ida_destroy(&fsi->allocated_ptys);
kfree(fsi);
kill_litter_super(sb);
}
@@ -559,7 +540,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
inode->i_ino = index + 3;
inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index));
sprintf(s, "%d", index);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7c3ce73cb617..fb9aa16a7727 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -246,6 +246,9 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
if ((dio->op == REQ_OP_READ) &&
((offset + transferred) > dio->i_size))
transferred = dio->i_size - offset;
+ /* ignore EFAULT if some IO has been done */
+ if (unlikely(ret == -EFAULT) && transferred)
+ ret = 0;
}
if (ret == 0)
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 963016c8f3d1..609998de533e 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1656,16 +1656,12 @@ void dlm_lowcomms_stop(void)
mutex_lock(&connections_lock);
dlm_allow_conn = 0;
foreach_conn(stop_conn);
+ clean_writequeues();
+ foreach_conn(free_conn);
mutex_unlock(&connections_lock);
work_stop();
- mutex_lock(&connections_lock);
- clean_writequeues();
-
- foreach_conn(free_conn);
-
- mutex_unlock(&connections_lock);
kmem_cache_destroy(con_cache);
}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 4ba1547bb9ad..599a29237cfe 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -715,4 +715,6 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
loff_t offset);
+extern const struct xattr_handler *ecryptfs_xattr_handlers[];
+
#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 9d153b6a1d72..cf390dceddd2 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -577,7 +577,8 @@ out:
static int
ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
int rc;
struct dentry *lower_old_dentry;
@@ -587,6 +588,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *trap = NULL;
struct inode *target_inode;
+ if (flags)
+ return -EINVAL;
+
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
dget(lower_old_dentry);
@@ -927,7 +931,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
}
mutex_unlock(&crypt_stat->cs_mutex);
- rc = inode_change_ok(inode, ia);
+ rc = setattr_prepare(dentry, ia);
if (rc)
goto out;
if (ia->ia_valid & ATTR_SIZE) {
@@ -1005,15 +1009,14 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
{
- int rc = 0;
+ int rc;
struct dentry *lower_dentry;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- if (!d_inode(lower_dentry)->i_op->setxattr) {
+ if (!(d_inode(lower_dentry)->i_opflags & IOP_XATTR)) {
rc = -EOPNOTSUPP;
goto out;
}
-
rc = vfs_setxattr(lower_dentry, name, value, size, flags);
if (!rc && inode)
fsstack_copy_attr_all(inode, d_inode(lower_dentry));
@@ -1025,15 +1028,14 @@ ssize_t
ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode,
const char *name, void *value, size_t size)
{
- int rc = 0;
+ int rc;
- if (!lower_inode->i_op->getxattr) {
+ if (!(lower_inode->i_opflags & IOP_XATTR)) {
rc = -EOPNOTSUPP;
goto out;
}
inode_lock(lower_inode);
- rc = lower_inode->i_op->getxattr(lower_dentry, lower_inode,
- name, value, size);
+ rc = __vfs_getxattr(lower_dentry, lower_inode, name, value, size);
inode_unlock(lower_inode);
out:
return rc;
@@ -1066,19 +1068,22 @@ out:
return rc;
}
-static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
+static int ecryptfs_removexattr(struct dentry *dentry, struct inode *inode,
+ const char *name)
{
- int rc = 0;
+ int rc;
struct dentry *lower_dentry;
+ struct inode *lower_inode;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- if (!d_inode(lower_dentry)->i_op->removexattr) {
+ lower_inode = ecryptfs_inode_to_lower(inode);
+ if (!(lower_inode->i_opflags & IOP_XATTR)) {
rc = -EOPNOTSUPP;
goto out;
}
- inode_lock(d_inode(lower_dentry));
- rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name);
- inode_unlock(d_inode(lower_dentry));
+ inode_lock(lower_inode);
+ rc = __vfs_removexattr(lower_dentry, name);
+ inode_unlock(lower_inode);
out:
return rc;
}
@@ -1089,10 +1094,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
.getattr = ecryptfs_getattr_link,
- .setxattr = ecryptfs_setxattr,
- .getxattr = ecryptfs_getxattr,
.listxattr = ecryptfs_listxattr,
- .removexattr = ecryptfs_removexattr
};
const struct inode_operations ecryptfs_dir_iops = {
@@ -1107,18 +1109,43 @@ const struct inode_operations ecryptfs_dir_iops = {
.rename = ecryptfs_rename,
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
- .setxattr = ecryptfs_setxattr,
- .getxattr = ecryptfs_getxattr,
.listxattr = ecryptfs_listxattr,
- .removexattr = ecryptfs_removexattr
};
const struct inode_operations ecryptfs_main_iops = {
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
.getattr = ecryptfs_getattr,
- .setxattr = ecryptfs_setxattr,
- .getxattr = ecryptfs_getxattr,
.listxattr = ecryptfs_listxattr,
- .removexattr = ecryptfs_removexattr
+};
+
+static int ecryptfs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ return ecryptfs_getxattr(dentry, inode, name, buffer, size);
+}
+
+static int ecryptfs_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value, size_t size,
+ int flags)
+{
+ if (value)
+ return ecryptfs_setxattr(dentry, inode, name, value, size, flags);
+ else {
+ BUG_ON(flags != XATTR_REPLACE);
+ return ecryptfs_removexattr(dentry, inode, name);
+ }
+}
+
+const struct xattr_handler ecryptfs_xattr_handler = {
+ .prefix = "", /* match anything */
+ .get = ecryptfs_xattr_get,
+ .set = ecryptfs_xattr_set,
+};
+
+const struct xattr_handler *ecryptfs_xattr_handlers[] = {
+ &ecryptfs_xattr_handler,
+ NULL
};
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 612004495141..151872dcc1f4 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -529,6 +529,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
/* ->kill_sb() will take care of sbi after that point */
sbi = NULL;
s->s_op = &ecryptfs_sops;
+ s->s_xattr = ecryptfs_xattr_handlers;
s->s_d_op = &ecryptfs_dops;
err = "Reading sb failed";
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 9c3437c8a5b1..1f0c471b4ba3 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
#include <linux/file.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
+#include <linux/xattr.h>
#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
@@ -422,7 +423,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
struct inode *lower_inode = d_inode(lower_dentry);
int rc;
- if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) {
+ if (!(lower_inode->i_opflags & IOP_XATTR)) {
printk(KERN_WARNING
"No support for setting xattr in lower filesystem\n");
rc = -ENOSYS;
@@ -436,15 +437,13 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
goto out;
}
inode_lock(lower_inode);
- size = lower_inode->i_op->getxattr(lower_dentry, lower_inode,
- ECRYPTFS_XATTR_NAME,
- xattr_virt, PAGE_SIZE);
+ size = __vfs_getxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME,
+ xattr_virt, PAGE_SIZE);
if (size < 0)
size = 8;
put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
- rc = lower_inode->i_op->setxattr(lower_dentry, lower_inode,
- ECRYPTFS_XATTR_NAME,
- xattr_virt, size, 0);
+ rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME,
+ xattr_virt, size, 0);
inode_unlock(lower_inode);
if (rc)
printk(KERN_ERR "Error whilst attempting to write inode size "
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 1d73fc6dba13..71fccccf317e 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -24,7 +24,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
@@ -105,7 +105,10 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
inode->i_private = var;
- efivar_entry_add(var, &efivarfs_list);
+ err = efivar_entry_add(var, &efivarfs_list);
+ if (err)
+ goto out;
+
d_instantiate(dentry, inode);
dget(dentry);
out:
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 688ccc16b702..d7a7c53803c1 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -157,12 +157,14 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
goto fail_inode;
}
+ efivar_entry_size(entry, &size);
+ err = efivar_entry_add(entry, &efivarfs_list);
+ if (err)
+ goto fail_inode;
+
/* copied by the above to local storage in the dentry. */
kfree(name);
- efivar_entry_size(entry, &size);
- efivar_entry_add(entry, &efivarfs_list);
-
inode_lock(inode);
inode->i_private = entry;
i_size_write(inode, size + sizeof(entry->var.Attributes));
@@ -182,7 +184,10 @@ fail:
static int efivarfs_destroy(struct efivar_entry *entry, void *data)
{
- efivar_entry_remove(entry);
+ int err = efivar_entry_remove(entry);
+
+ if (err)
+ return err;
kfree(entry);
return 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index 6fcfb3f7b137..4e497b9ee71e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
{
struct page *page;
int ret;
+ unsigned int gup_flags = FOLL_FORCE;
#ifdef CONFIG_STACK_GROWSUP
if (write) {
@@ -199,12 +200,16 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
return NULL;
}
#endif
+
+ if (write)
+ gup_flags |= FOLL_WRITE;
+
/*
* We are doing an exec(). 'current' is the process
* doing the exec and bprm->mm is the new process's mm.
*/
- ret = get_user_pages_remote(current, bprm->mm, pos, 1, write,
- 1, &page, NULL);
+ ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
+ &page, NULL);
if (ret <= 0)
return NULL;
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index f69a1b5826a5..42f9a0a0c4ca 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -137,7 +137,7 @@ Espan:
bad_entry:
EXOFS_ERR(
"ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
- "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
+ "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n",
dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
_LLU(le64_to_cpu(p->inode_no)),
rec_len, p->name_len);
@@ -416,7 +416,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
if (likely(!err))
err = exofs_commit_chunk(page, pos, len);
exofs_put_page(page);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
return err;
}
@@ -503,7 +503,7 @@ got_it:
de->inode_no = cpu_to_le64(inode->i_ino);
exofs_set_de_type(de, inode);
err = exofs_commit_chunk(page, pos, rec_len);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
sbi->s_numfiles++;
@@ -554,7 +554,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
dir->inode_no = 0;
if (likely(!err))
err = exofs_commit_chunk(page, pos, to - from);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
mark_inode_dirty(inode);
sbi->s_numfiles--;
out:
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 9dc4c6dbf3c9..d8072bc074a4 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -778,7 +778,7 @@ try_again:
fail:
EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
inode->i_ino, page->index, ret);
- set_bit(AS_EIO, &page->mapping->flags);
+ mapping_set_error(page->mapping, -EIO);
unlock_page(page);
return ret;
}
@@ -1007,7 +1007,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
int ret;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
if (likely(!ret))
@@ -1034,7 +1034,7 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
if (unlikely(error))
return error;
- error = inode_change_ok(inode, iattr);
+ error = setattr_prepare(dentry, iattr);
if (unlikely(error))
return error;
@@ -1313,7 +1313,7 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
inode_init_owner(inode, dir, mode);
inode->i_ino = sbi->s_nextid++;
inode->i_blkbits = EXOFS_BLKSHIFT;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
oi->i_commit_size = inode->i_size = 0;
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 622a686bb08b..7295cd722770 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -142,7 +142,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -227,7 +227,8 @@ static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
}
static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
@@ -237,6 +238,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct exofs_dir_entry *old_de;
int err = -ENOENT;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
if (!old_de)
goto out;
@@ -261,7 +265,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!new_de)
goto out_dir;
err = exofs_set_link(new_dir, new_de, new_page, old_inode);
- new_inode->i_ctime = CURRENT_TIME;
+ new_inode->i_ctime = current_time(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
@@ -275,7 +279,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
inode_inc_link_count(new_dir);
}
- old_inode->i_ctime = CURRENT_TIME;
+ old_inode->i_ctime = current_time(old_inode);
exofs_delete_entry(old_de, old_page);
mark_inode_dirty(old_inode);
@@ -310,7 +314,7 @@ const struct inode_operations exofs_dir_inode_operations = {
.mkdir = exofs_mkdir,
.rmdir = exofs_rmdir,
.mknod = exofs_mknod,
- .rename = exofs_rename,
+ .rename = exofs_rename,
.setattr = exofs_setattr,
};
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 207ba8d627ca..a4b531be9168 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -428,10 +428,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
if (!nop || !nop->fh_to_dentry)
return ERR_PTR(-ESTALE);
result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
- if (!result)
- result = ERR_PTR(-ESTALE);
- if (IS_ERR(result))
- return result;
+ if (PTR_ERR(result) == -ENOMEM)
+ return ERR_CAST(result);
+ if (IS_ERR_OR_NULL(result))
+ return ERR_PTR(-ESTALE);
if (d_is_dir(result)) {
/*
@@ -541,6 +541,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
err_result:
dput(result);
+ if (err != -ENOMEM)
+ err = -ESTALE;
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(exportfs_decode_fh);
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index c634874e12d9..36bea5adcaba 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,5 +1,6 @@
config EXT2_FS
tristate "Second extended fs support"
+ select FS_IOMAP if FS_DAX
help
Ext2 is a standard Linux file system for hard disks.
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 42f1d1814083..79dafa71effd 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -190,15 +190,11 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
case ACL_TYPE_ACCESS:
name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- error = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (error < 0)
+ error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (error)
return error;
- else {
- inode->i_ctime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
- if (error == 0)
- acl = NULL;
- }
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
}
break;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 61ad490ed67b..d9650c9508e4 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -471,7 +471,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
err = ext2_commit_chunk(page, pos, len);
ext2_put_page(page);
if (update_times)
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
}
@@ -561,7 +561,7 @@ got_it:
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
err = ext2_commit_chunk(page, pos, rec_len);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
/* OFFSET_CACHE */
@@ -610,7 +610,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
pde->rec_len = ext2_rec_len_to_disk(to - from);
dir->inode = 0;
err = ext2_commit_chunk(page, pos, to - from);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(inode);
out:
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 06af2f92226c..37e2be784ac7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations;
/* inode.c */
extern const struct address_space_operations ext2_aops;
extern const struct address_space_operations ext2_nobh_aops;
+extern struct iomap_ops ext2_iomap_ops;
/* namei.c */
extern const struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5efeefe17abb..a0e1478dfd04 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -22,11 +22,59 @@
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
+#include <linux/iomap.h>
+#include <linux/uio.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
#ifdef CONFIG_FS_DAX
+static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ ssize_t ret;
+
+ if (!iov_iter_count(to))
+ return 0; /* skip atime */
+
+ inode_lock_shared(inode);
+ ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+
+static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out_unlock;
+ ret = file_remove_privs(file);
+ if (ret)
+ goto out_unlock;
+ ret = file_update_time(file);
+ if (ret)
+ goto out_unlock;
+
+ ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+ if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+ i_size_write(inode, iocb->ki_pos);
+ mark_inode_dirty(inode);
+ }
+
+out_unlock:
+ inode_unlock(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+
/*
* The lock ordering for ext2 DAX fault paths is:
*
@@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
- ret = dax_fault(vma, vmf, ext2_get_block);
+ ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
@@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
}
-/*
- * We have mostly NULL's here: the current defaults are ok for
- * the ext2 filesystem.
- */
+static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(iocb->ki_filp->f_mapping->host))
+ return ext2_dax_read_iter(iocb, to);
+#endif
+ return generic_file_read_iter(iocb, to);
+}
+
+static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(iocb->ki_filp->f_mapping->host))
+ return ext2_dax_write_iter(iocb, from);
+#endif
+ return generic_file_write_iter(iocb, from);
+}
+
const struct file_operations ext2_file_operations = {
.llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
+ .read_iter = ext2_file_read_iter,
+ .write_iter = ext2_file_write_iter,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
@@ -172,16 +234,14 @@ const struct file_operations ext2_file_operations = {
.open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
+ .get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
};
const struct inode_operations ext2_file_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
- .removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index efe5fb21c533..395fc074c0db 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -465,6 +465,11 @@ struct inode *ext2_new_inode(struct inode *dir, umode_t mode,
for (i = 0; i < sbi->s_groups_count; i++) {
gdp = ext2_get_group_desc(sb, group, &bh2);
+ if (!gdp) {
+ if (++group == sbi->s_groups_count)
+ group = 0;
+ continue;
+ }
brelse(bitmap_bh);
bitmap_bh = read_inode_bitmap(sb, group);
if (!bitmap_bh) {
@@ -551,7 +556,7 @@ got:
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_flags =
ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index d5c7d09919f3..41b8b44a391c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/fiemap.h>
+#include <linux/iomap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include "ext2.h"
@@ -594,7 +595,7 @@ static void ext2_splice_branch(struct inode *inode,
if (where->bh)
mark_buffer_dirty_inode(where->bh, inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
}
@@ -618,10 +619,10 @@ static void ext2_splice_branch(struct inode *inode,
*/
static int ext2_get_blocks(struct inode *inode,
sector_t iblock, unsigned long maxblocks,
- struct buffer_head *bh_result,
+ u32 *bno, bool *new, bool *boundary,
int create)
{
- int err = -EIO;
+ int err;
int offsets[4];
Indirect chain[4];
Indirect *partial;
@@ -638,13 +639,12 @@ static int ext2_get_blocks(struct inode *inode,
depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
if (depth == 0)
- return (err);
+ return -EIO;
partial = ext2_get_branch(inode, depth, offsets, chain, &err);
/* Simplest case - block found, no allocation needed */
if (!partial) {
first_block = le32_to_cpu(chain[depth - 1].key);
- clear_buffer_new(bh_result); /* What's this do? */
count++;
/*map more blocks*/
while (count < maxblocks && count <= blocks_to_boundary) {
@@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode,
mutex_unlock(&ei->truncate_mutex);
if (err)
goto cleanup;
- clear_buffer_new(bh_result);
goto got_it;
}
}
@@ -733,6 +732,16 @@ static int ext2_get_blocks(struct inode *inode,
}
if (IS_DAX(inode)) {
+ int i;
+
+ /*
+ * We must unmap blocks before zeroing so that writeback cannot
+ * overwrite zeros with stale data from block device page cache.
+ */
+ for (i = 0; i < count; i++) {
+ unmap_underlying_metadata(inode->i_sb->s_bdev,
+ le32_to_cpu(chain[depth-1].key) + i);
+ }
/*
* block must be initialised before we put it in the tree
* so that it's not found by another thread before it's
@@ -745,15 +754,15 @@ static int ext2_get_blocks(struct inode *inode,
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
}
- } else
- set_buffer_new(bh_result);
+ } else {
+ *new = true;
+ }
ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
mutex_unlock(&ei->truncate_mutex);
got_it:
- map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
- set_buffer_boundary(bh_result);
+ *boundary = true;
err = count;
/* Clean up and exit */
partial = chain + depth - 1; /* the whole chain */
@@ -762,22 +771,87 @@ cleanup:
brelse(partial->bh);
partial--;
}
+ if (err > 0)
+ *bno = le32_to_cpu(chain[depth-1].key);
return err;
}
-int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+int ext2_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
- int ret = ext2_get_blocks(inode, iblock, max_blocks,
- bh_result, create);
- if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
- ret = 0;
+ bool new = false, boundary = false;
+ u32 bno;
+ int ret;
+
+ ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary,
+ create);
+ if (ret <= 0)
+ return ret;
+
+ map_bh(bh_result, inode->i_sb, bno);
+ bh_result->b_size = (ret << inode->i_blkbits);
+ if (new)
+ set_buffer_new(bh_result);
+ if (boundary)
+ set_buffer_boundary(bh_result);
+ return 0;
+
+}
+
+#ifdef CONFIG_FS_DAX
+static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned flags, struct iomap *iomap)
+{
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned long first_block = offset >> blkbits;
+ unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
+ bool new = false, boundary = false;
+ u32 bno;
+ int ret;
+
+ ret = ext2_get_blocks(inode, first_block, max_blocks,
+ &bno, &new, &boundary, flags & IOMAP_WRITE);
+ if (ret < 0)
+ return ret;
+
+ iomap->flags = 0;
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = (u64)first_block << blkbits;
+
+ if (ret == 0) {
+ iomap->type = IOMAP_HOLE;
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->length = 1 << blkbits;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->blkno = (sector_t)bno << (blkbits - 9);
+ iomap->length = (u64)ret << blkbits;
+ iomap->flags |= IOMAP_F_MERGED;
}
- return ret;
+ if (new)
+ iomap->flags |= IOMAP_F_NEW;
+ return 0;
}
+static int
+ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+ ssize_t written, unsigned flags, struct iomap *iomap)
+{
+ if (iomap->type == IOMAP_MAPPED &&
+ written < length &&
+ (flags & IOMAP_WRITE))
+ ext2_write_failed(inode->i_mapping, offset + length);
+ return 0;
+}
+
+struct iomap_ops ext2_iomap_ops = {
+ .iomap_begin = ext2_iomap_begin,
+ .iomap_end = ext2_iomap_end,
+};
+#endif /* CONFIG_FS_DAX */
+
int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -863,11 +937,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t offset = iocb->ki_pos;
ssize_t ret;
- if (IS_DAX(inode))
- ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
- DIO_LOCKING);
- else
- ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
+ if (WARN_ON_ONCE(IS_DAX(inode)))
+ return -EIO;
+
+ ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
if (ret < 0 && iov_iter_rw(iter) == WRITE)
ext2_write_failed(mapping, offset + count);
return ret;
@@ -1236,7 +1309,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
__ext2_truncate_blocks(inode, newsize);
dax_sem_up_write(EXT2_I(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
if (inode_needs_sync(inode)) {
sync_mapping_buffers(inode->i_mapping);
sync_inode_metadata(inode, 1);
@@ -1580,7 +1653,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
struct inode *inode = d_inode(dentry);
int error;
- error = inode_change_ok(inode, iattr);
+ error = setattr_prepare(dentry, iattr);
if (error)
return error;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index b386af2e45f4..9d617423e936 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -79,7 +79,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ei->i_flags = flags;
ext2_set_inode_flags(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
inode_unlock(inode);
mark_inode_dirty(inode);
@@ -103,7 +103,7 @@ setflags_out:
}
inode_lock(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
inode->i_generation = generation;
inode_unlock(inode);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index d446203127fc..814e405a2da6 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -221,7 +221,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
if (err)
return err;
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -328,7 +328,8 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry)
}
static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
- struct inode * new_dir, struct dentry * new_dentry )
+ struct inode * new_dir, struct dentry * new_dentry,
+ unsigned int flags)
{
struct inode * old_inode = d_inode(old_dentry);
struct inode * new_inode = d_inode(new_dentry);
@@ -338,6 +339,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
struct ext2_dir_entry_2 * old_de;
int err;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
err = dquot_initialize(old_dir);
if (err)
goto out;
@@ -372,7 +376,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
if (!new_de)
goto out_dir;
ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
- new_inode->i_ctime = CURRENT_TIME_SEC;
+ new_inode->i_ctime = current_time(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
@@ -388,7 +392,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = CURRENT_TIME_SEC;
+ old_inode->i_ctime = current_time(old_inode);
mark_inode_dirty(old_inode);
ext2_delete_entry (old_de, old_page);
@@ -428,10 +432,7 @@ const struct inode_operations ext2_dir_inode_operations = {
.mknod = ext2_mknod,
.rename = ext2_rename,
#ifdef CONFIG_EXT2_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
- .removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
@@ -441,10 +442,7 @@ const struct inode_operations ext2_dir_inode_operations = {
const struct inode_operations ext2_special_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
- .removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1d9379568aa8..6cb042b53b5b 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1543,7 +1543,7 @@ out:
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
inode->i_version++;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 3495d8ae4b33..8437b191bf5d 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -25,10 +25,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
.get_link = page_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
- .removexattr = generic_removexattr,
#endif
};
@@ -37,9 +34,6 @@ const struct inode_operations ext2_fast_symlink_inode_operations = {
.get_link = simple_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
- .removexattr = generic_removexattr,
#endif
};
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index b7f896f3f7a7..fbdb8f171893 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -691,7 +691,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
/* Update the inode. */
EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
if (IS_SYNC(inode)) {
error = sync_inode_metadata(inode, 1);
/* In case sync failed due to ENOSPC the inode was actually
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index c6601a476c02..dfa519979038 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -193,15 +193,11 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- error = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (error < 0)
+ error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (error)
return error;
- else {
- inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- if (error == 0)
- acl = NULL;
- }
+ inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
}
break;
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 02ddec6d8a7d..fdb19543af1e 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -128,12 +128,12 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
node = rb_first(&sbi->system_blks);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
- printk("%s%llu-%llu", first ? "" : ", ",
+ printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ",
entry->start_blk, entry->start_blk + entry->count - 1);
first = 0;
node = rb_next(node);
}
- printk("\n");
+ printk(KERN_CONT "\n");
}
int ext4_setup_system_zone(struct super_block *sb)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 67415e0e6af0..e8b365000d73 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -260,11 +260,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
/* Directory is encrypted */
err = fscrypt_fname_disk_to_usr(inode,
0, 0, &de_name, &fstr);
+ de_name = fstr;
fstr.len = save_len;
- if (err < 0)
+ if (err)
goto errout;
if (!dir_emit(ctx,
- fstr.name, err,
+ de_name.name, de_name.len,
le32_to_cpu(de->inode),
get_dtype(sb, de->file_type)))
goto done;
@@ -627,7 +628,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
int buf_size)
{
struct ext4_dir_entry_2 *de;
- int nlen, rlen;
+ int rlen;
unsigned int offset = 0;
char *top;
@@ -637,7 +638,6 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
if (ext4_check_dir_entry(dir, NULL, de, bh,
buf, buf_size, offset))
return -EFSCORRUPTED;
- nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
offset += rlen;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ea31931386ec..282a51b07c57 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -262,6 +262,9 @@ struct ext4_io_submit {
(s)->s_first_ino)
#endif
#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits)))
+#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
+ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
+ blkbits))
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
@@ -1117,9 +1120,15 @@ struct ext4_inode_info {
#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
-#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
-#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
-#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */
+#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota,
+ * enable enforcement for hidden
+ * quota files */
+#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable
+ * enforcement for hidden quota
+ * files */
+#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota
+ * enforcement */
#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
@@ -1636,26 +1645,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
* Feature set definitions
*/
-/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */
-#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
- ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
-#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
- ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
-#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
- ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
-#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
- EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
-#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
- EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
-#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \
- EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
-#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \
- EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
-#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
- EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
-#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \
- EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
-
#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001
#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002
#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d7ccb7f51dfc..c930a0110fb4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4679,6 +4679,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
unsigned int credits;
loff_t epos;
+ BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
map.m_len = len;
/*
@@ -4693,13 +4694,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, len);
- /*
- * We can only call ext_depth() on extent based inodes
- */
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- depth = ext_depth(inode);
- else
- depth = -1;
+ depth = ext_depth(inode);
retry:
while (ret >= 0 && len) {
@@ -4966,13 +4961,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
trace_ext4_fallocate_enter(inode, offset, len, mode);
lblk = offset >> blkbits;
- /*
- * We can't just convert len to max_blocks because
- * If blocksize = 4096 offset = 3072 and len = 2048
- */
- max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- - lblk;
+ max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
@@ -5035,12 +5025,8 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
unsigned int credits, blkbits = inode->i_blkbits;
map.m_lblk = offset >> blkbits;
- /*
- * We can't just convert len to max_blocks because
- * If blocksize = 4096 offset = 3072 and len = 2048
- */
- max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
- map.m_lblk);
+ max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
+
/*
* This is somewhat ugly but the idea is clear: When transaction is
* reserved, everything goes into it. Otherwise we rather start several
@@ -5734,6 +5720,9 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
+ } else {
+ ext4_ext_drop_refs(path);
+ kfree(path);
}
ret = ext4_es_remove_extent(inode, offset_lblk,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 261ac3734c58..2a822d30e73f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -91,9 +91,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(iocb->ki_filp);
- struct blk_plug plug;
int o_direct = iocb->ki_flags & IOCB_DIRECT;
int unaligned_aio = 0;
int overwrite = 0;
@@ -134,18 +132,16 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (o_direct) {
size_t length = iov_iter_count(from);
loff_t pos = iocb->ki_pos;
- blk_start_plug(&plug);
/* check whether we do a DIO overwrite or not */
if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
- !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+ pos + length <= i_size_read(inode)) {
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
int err, len;
map.m_lblk = pos >> blkbits;
- map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
- - map.m_lblk;
+ map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits);
len = map.m_len;
err = ext4_map_blocks(NULL, inode, &map, 0);
@@ -171,8 +167,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret > 0)
ret = generic_write_sync(iocb, ret);
- if (o_direct)
- blk_finish_plug(&plug);
return ret;
@@ -703,6 +697,7 @@ const struct file_operations ext4_file_operations = {
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
+ .get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
@@ -711,10 +706,7 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
- .removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5c4372512ef7..88effb1053c7 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -61,6 +61,13 @@ static int ext4_sync_parent(struct inode *inode)
break;
iput(inode);
inode = next;
+ /*
+ * The directory inode may have gone through rmdir by now. But
+ * the inode itself and its blocks are still allocated (we hold
+ * a reference to the inode so it didn't go through
+ * ext4_evict_inode()) and so we are safe to flush metadata
+ * blocks and the inode.
+ */
ret = sync_mapping_buffers(inode->i_mapping);
if (ret)
break;
@@ -107,7 +114,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!journal) {
ret = __generic_file_fsync(file, start, end, datasync);
- if (!ret && !hlist_empty(&inode->i_dentry))
+ if (!ret)
ret = ext4_sync_parent(inode);
if (test_opt(inode->i_sb, BARRIER))
goto issue_flush;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9e66cd1d7b78..170421edfdfe 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -802,7 +802,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
} else
inode_init_owner(inode, dir, mode);
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ if (ext4_has_feature_project(sb) &&
ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
ei->i_projid = EXT4_I(dir)->i_projid;
else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c6ea25a190f8..9c064727ed62 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -647,11 +647,19 @@ found:
/*
* We have to zeroout blocks before inserting them into extent
* status tree. Otherwise someone could look them up there and
- * use them before they are really zeroed.
+ * use them before they are really zeroed. We also have to
+ * unmap metadata before zeroing as otherwise writeback can
+ * overwrite zeros with stale data from block device.
*/
if (flags & EXT4_GET_BLOCKS_ZERO &&
map->m_flags & EXT4_MAP_MAPPED &&
map->m_flags & EXT4_MAP_NEW) {
+ ext4_lblk_t i;
+
+ for (i = 0; i < map->m_len; i++) {
+ unmap_underlying_metadata(inode->i_sb->s_bdev,
+ map->m_pblk + i);
+ }
ret = ext4_issue_zeroout(inode, map->m_lblk,
map->m_pblk, map->m_len);
if (ret) {
@@ -1649,6 +1657,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
if (invalidate) {
+ if (page_mapped(page))
+ clear_page_dirty_for_io(page);
block_invalidatepage(page, 0, PAGE_SIZE);
ClearPageUptodate(page);
}
@@ -3526,35 +3536,31 @@ out:
static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
{
- int unlocked = 0;
- struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
ssize_t ret;
- if (ext4_should_dioread_nolock(inode)) {
- /*
- * Nolock dioread optimization may be dynamically disabled
- * via ext4_inode_block_unlocked_dio(). Check inode's state
- * while holding extra i_dio_count ref.
- */
- inode_dio_begin(inode);
- smp_mb();
- if (unlikely(ext4_test_inode_state(inode,
- EXT4_STATE_DIOREAD_LOCK)))
- inode_dio_end(inode);
- else
- unlocked = 1;
- }
+ /*
+ * Shared inode_lock is enough for us - it protects against concurrent
+ * writes & truncates and since we take care of writing back page cache,
+ * we are protected against page writeback as well.
+ */
+ inode_lock_shared(inode);
if (IS_DAX(inode)) {
- ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block,
- NULL, unlocked ? 0 : DIO_LOCKING);
+ ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0);
} else {
+ size_t count = iov_iter_count(iter);
+
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
+ iocb->ki_pos + count);
+ if (ret)
+ goto out_unlock;
ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
iter, ext4_dio_get_block,
- NULL, NULL,
- unlocked ? 0 : DIO_LOCKING);
+ NULL, NULL, 0);
}
- if (unlocked)
- inode_dio_end(inode);
+out_unlock:
+ inode_unlock_shared(inode);
return ret;
}
@@ -3890,7 +3896,7 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
}
/*
- * ext4_punch_hole: punches a hole in a file by releaseing the blocks
+ * ext4_punch_hole: punches a hole in a file by releasing the blocks
* associated with the given offset and length
*
* @inode: File inode
@@ -3919,7 +3925,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
* Write out all dirty pages to avoid race conditions
* Then release them.
*/
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
ret = filemap_write_and_wait_range(mapping, offset,
offset + length - 1);
if (ret)
@@ -4414,7 +4420,7 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
int ext4_get_projid(struct inode *inode, kprojid_t *projid)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
+ if (!ext4_has_feature_project(inode->i_sb))
return -EOPNOTSUPP;
*projid = EXT4_I(inode)->i_projid;
return 0;
@@ -4481,7 +4487,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ if (ext4_has_feature_project(sb) &&
EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
@@ -4814,14 +4820,14 @@ static int ext4_do_update_inode(handle_t *handle,
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
- if (!ei->i_dtime) {
+ if (ei->i_dtime && list_empty(&ei->i_orphan)) {
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ } else {
raw_inode->i_uid_high =
cpu_to_le16(high_16_bits(i_uid));
raw_inode->i_gid_high =
cpu_to_le16(high_16_bits(i_gid));
- } else {
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
}
} else {
raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
@@ -4885,8 +4891,7 @@ static int ext4_do_update_inode(handle_t *handle,
}
}
- BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ BUG_ON(!ext4_has_feature_project(inode->i_sb) &&
i_projid != EXT4_DEF_PROJID);
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
@@ -5073,7 +5078,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
int orphan = 0;
const unsigned int ia_valid = attr->ia_valid;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1bb7df5e4536..bf5ae8ebbc97 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -19,8 +19,6 @@
#include "ext4_jbd2.h"
#include "ext4.h"
-#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
-
/**
* Swap memory between @a and @b for @len bytes.
*
@@ -310,8 +308,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
struct ext4_inode *raw_inode;
struct dquot *transfer_to[MAXQUOTAS] = { };
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_PROJECT)) {
+ if (!ext4_has_feature_project(sb)) {
if (projid != EXT4_DEF_PROJID)
return -EOPNOTSUPP;
else
@@ -772,6 +769,9 @@ resizefs_out:
#ifdef CONFIG_EXT4_FS_ENCRYPTION
struct fscrypt_policy policy;
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+
if (copy_from_user(&policy,
(struct fscrypt_policy __user *)arg,
sizeof(policy)))
@@ -842,8 +842,7 @@ resizefs_out:
ext4_get_inode_flags(ei);
fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_PROJECT)) {
+ if (ext4_has_feature_project(inode->i_sb)) {
fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
EXT4_I(inode)->i_projid);
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 3ef1df6ae9ec..1aba469f8220 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -27,16 +27,15 @@
#ifdef CONFIG_EXT4_DEBUG
extern ushort ext4_mballoc_debug;
-#define mb_debug(n, fmt, a...) \
- do { \
- if ((n) <= ext4_mballoc_debug) { \
- printk(KERN_DEBUG "(%s, %d): %s: ", \
- __FILE__, __LINE__, __func__); \
- printk(fmt, ## a); \
- } \
- } while (0)
+#define mb_debug(n, fmt, ...) \
+do { \
+ if ((n) <= ext4_mballoc_debug) { \
+ printk(KERN_DEBUG "(%s, %d): %s: " fmt, \
+ __FILE__, __LINE__, __func__, ##__VA_ARGS__); \
+ } \
+} while (0)
#else
-#define mb_debug(n, fmt, a...) no_printk(fmt, ## a)
+#define mb_debug(n, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index a920c5d29fac..6fc14def0c70 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -598,6 +598,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
return -EOPNOTSUPP;
}
+ if (ext4_encrypted_inode(orig_inode) ||
+ ext4_encrypted_inode(donor_inode)) {
+ ext4_msg(orig_inode->i_sb, KERN_ERR,
+ "Online defrag not supported for encrypted files");
+ return -EOPNOTSUPP;
+ }
+
/* Protect orig and donor inodes against a truncate */
lock_two_nondirectories(orig_inode, donor_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 34c0142caf6a..104f8bfba718 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -577,12 +577,13 @@ static inline unsigned dx_node_limit(struct inode *dir)
static void dx_show_index(char * label, struct dx_entry *entries)
{
int i, n = dx_get_count (entries);
- printk(KERN_DEBUG "%s index ", label);
+ printk(KERN_DEBUG "%s index", label);
for (i = 0; i < n; i++) {
- printk("%x->%lu ", i ? dx_get_hash(entries + i) :
- 0, (unsigned long)dx_get_block(entries + i));
+ printk(KERN_CONT " %x->%lu",
+ i ? dx_get_hash(entries + i) : 0,
+ (unsigned long)dx_get_block(entries + i));
}
- printk("\n");
+ printk(KERN_CONT "\n");
}
struct stats
@@ -639,7 +640,7 @@ static struct stats dx_show_leaf(struct inode *dir,
res = fscrypt_fname_alloc_buffer(
dir, len,
&fname_crypto_str);
- if (res < 0)
+ if (res)
printk(KERN_WARNING "Error "
"allocating crypto "
"buffer--skipping "
@@ -647,7 +648,7 @@ static struct stats dx_show_leaf(struct inode *dir,
res = fscrypt_fname_disk_to_usr(dir,
0, 0, &de_name,
&fname_crypto_str);
- if (res < 0) {
+ if (res) {
printk(KERN_WARNING "Error "
"converting filename "
"from disk to usr"
@@ -679,7 +680,7 @@ static struct stats dx_show_leaf(struct inode *dir,
}
de = ext4_next_entry(de, size);
}
- printk("(%i)\n", names);
+ printk(KERN_CONT "(%i)\n", names);
return (struct stats) { names, space, 1 };
}
@@ -798,7 +799,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
q = entries + count - 1;
while (p <= q) {
m = p + (q - p) / 2;
- dxtrace(printk("."));
+ dxtrace(printk(KERN_CONT "."));
if (dx_get_hash(m) > hash)
q = m - 1;
else
@@ -810,7 +811,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
at = entries;
while (n--)
{
- dxtrace(printk(","));
+ dxtrace(printk(KERN_CONT ","));
if (dx_get_hash(++at) > hash)
{
at--;
@@ -821,7 +822,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
}
at = p - 1;
- dxtrace(printk(" %x->%u\n", at == entries ? 0 : dx_get_hash(at),
+ dxtrace(printk(KERN_CONT " %x->%u\n",
+ at == entries ? 0 : dx_get_hash(at),
dx_get_block(at)));
frame->entries = entries;
frame->at = at;
@@ -1011,7 +1013,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
err = fscrypt_fname_disk_to_usr(dir, hinfo->hash,
hinfo->minor_hash, &de_name,
&fname_crypto_str);
- if (err < 0) {
+ if (err) {
count = err;
goto errout;
}
@@ -2044,33 +2046,31 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
frame->entries = entries;
frame->at = entries;
frame->bh = bh;
- bh = bh2;
retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (retval)
goto out_frames;
- retval = ext4_handle_dirty_dirent_node(handle, dir, bh);
+ retval = ext4_handle_dirty_dirent_node(handle, dir, bh2);
if (retval)
goto out_frames;
- de = do_split(handle,dir, &bh, frame, &fname->hinfo);
+ de = do_split(handle,dir, &bh2, frame, &fname->hinfo);
if (IS_ERR(de)) {
retval = PTR_ERR(de);
goto out_frames;
}
- dx_release(frames);
- retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
- brelse(bh);
- return retval;
+ retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2);
out_frames:
/*
* Even if the block split failed, we have to properly write
* out all the changes we did so far. Otherwise we can end up
* with corrupted filesystem.
*/
- ext4_mark_inode_dirty(handle, dir);
+ if (retval)
+ ext4_mark_inode_dirty(handle, dir);
dx_release(frames);
+ brelse(bh2);
return retval;
}
@@ -3144,7 +3144,7 @@ static int ext4_symlink(struct inode *dir,
istr.name = (const unsigned char *) symname;
istr.len = len;
err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
- if (err < 0)
+ if (err)
goto err_drop_inode;
sd->len = cpu_to_le16(ostr.len);
disk_link.name = (char *) sd;
@@ -3880,12 +3880,9 @@ const struct inode_operations ext4_dir_inode_operations = {
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
.tmpfile = ext4_tmpfile,
- .rename2 = ext4_rename2,
+ .rename = ext4_rename2,
.setattr = ext4_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
- .removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
@@ -3893,10 +3890,7 @@ const struct inode_operations ext4_dir_inode_operations = {
const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
- .removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
};
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index a6132a730967..0094923e5ebf 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -88,7 +88,7 @@ static void ext4_finish_bio(struct bio *bio)
if (bio->bi_error) {
SetPageError(page);
- set_bit(AS_EIO, &page->mapping->flags);
+ mapping_set_error(page->mapping, -EIO);
}
bh = head = page_buffers(page);
/*
@@ -405,14 +405,12 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
{
struct page *data_page = NULL;
struct inode *inode = page->mapping->host;
- unsigned block_start, blocksize;
+ unsigned block_start;
struct buffer_head *bh, *head;
int ret = 0;
int nr_submitted = 0;
int nr_to_submit = 0;
- blocksize = 1 << inode->i_blkbits;
-
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3ec8708989ca..20da99da0a34 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -78,6 +78,8 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
+static struct inode *ext4_get_journal_inode(struct super_block *sb,
+ unsigned int journal_inum);
/*
* Lock ordering
@@ -595,14 +597,15 @@ void __ext4_std_error(struct super_block *sb, const char *function,
void __ext4_abort(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
save_error_info(sb, function, line);
va_start(args, fmt);
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
- function, line);
- vprintk(fmt, args);
- printk("\n");
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n",
+ sb->s_id, function, line, &vaf);
va_end(args);
if ((sb->s_flags & MS_RDONLY) == 0) {
@@ -1267,7 +1270,7 @@ enum {
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
Opt_lazytime, Opt_nolazytime,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
@@ -1327,6 +1330,7 @@ static const match_table_t tokens = {
{Opt_noquota, "noquota"},
{Opt_quota, "quota"},
{Opt_usrquota, "usrquota"},
+ {Opt_prjquota, "prjquota"},
{Opt_barrier, "barrier=%u"},
{Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
@@ -1546,8 +1550,11 @@ static const struct mount_opts {
MOPT_SET | MOPT_Q},
{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
MOPT_SET | MOPT_Q},
+ {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
+ MOPT_SET | MOPT_Q},
{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
- EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
+ EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
+ MOPT_CLEAR | MOPT_Q},
{Opt_usrjquota, 0, MOPT_Q},
{Opt_grpjquota, 0, MOPT_Q},
{Opt_offusrjquota, 0, MOPT_Q},
@@ -1836,13 +1843,17 @@ static int parse_options(char *options, struct super_block *sb,
return 0;
}
#ifdef CONFIG_QUOTA
- if (ext4_has_feature_quota(sb) &&
- (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
- ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota "
- "mount options ignored.");
- clear_opt(sb, USRQUOTA);
- clear_opt(sb, GRPQUOTA);
- } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+ /*
+ * We do the test below only for project quotas. 'usrquota' and
+ * 'grpquota' mount options are allowed even without quota feature
+ * to support legacy quotas in quota files.
+ */
+ if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) {
+ ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. "
+ "Cannot enable project quota enforcement.");
+ return 0;
+ }
+ if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sb, USRQUOTA);
@@ -2705,12 +2716,12 @@ static void print_daily_error_info(unsigned long arg)
es->s_first_error_func,
le32_to_cpu(es->s_first_error_line));
if (es->s_first_error_ino)
- printk(": inode %u",
+ printk(KERN_CONT ": inode %u",
le32_to_cpu(es->s_first_error_ino));
if (es->s_first_error_block)
- printk(": block %llu", (unsigned long long)
+ printk(KERN_CONT ": block %llu", (unsigned long long)
le64_to_cpu(es->s_first_error_block));
- printk("\n");
+ printk(KERN_CONT "\n");
}
if (es->s_last_error_time) {
printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
@@ -2719,12 +2730,12 @@ static void print_daily_error_info(unsigned long arg)
es->s_last_error_func,
le32_to_cpu(es->s_last_error_line));
if (es->s_last_error_ino)
- printk(": inode %u",
+ printk(KERN_CONT ": inode %u",
le32_to_cpu(es->s_last_error_ino));
if (es->s_last_error_block)
- printk(": block %llu", (unsigned long long)
+ printk(KERN_CONT ": block %llu", (unsigned long long)
le64_to_cpu(es->s_last_error_block));
- printk("\n");
+ printk(KERN_CONT "\n");
}
mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
}
@@ -2741,7 +2752,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
sb = elr->lr_super;
ngroups = EXT4_SB(sb)->s_groups_count;
- sb_start_write(sb);
for (group = elr->lr_next_group; group < ngroups; group++) {
gdp = ext4_get_group_desc(sb, group, NULL);
if (!gdp) {
@@ -2768,8 +2778,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
}
- sb_end_write(sb);
-
return ret;
}
@@ -2834,19 +2842,43 @@ cont_thread:
mutex_unlock(&eli->li_list_mtx);
goto exit_thread;
}
-
list_for_each_safe(pos, n, &eli->li_request_list) {
+ int err = 0;
+ int progress = 0;
elr = list_entry(pos, struct ext4_li_request,
lr_request);
- if (time_after_eq(jiffies, elr->lr_next_sched)) {
- if (ext4_run_li_request(elr) != 0) {
- /* error, remove the lazy_init job */
- ext4_remove_li_request(elr);
- continue;
+ if (time_before(jiffies, elr->lr_next_sched)) {
+ if (time_before(elr->lr_next_sched, next_wakeup))
+ next_wakeup = elr->lr_next_sched;
+ continue;
+ }
+ if (down_read_trylock(&elr->lr_super->s_umount)) {
+ if (sb_start_write_trylock(elr->lr_super)) {
+ progress = 1;
+ /*
+ * We hold sb->s_umount, sb can not
+ * be removed from the list, it is
+ * now safe to drop li_list_mtx
+ */
+ mutex_unlock(&eli->li_list_mtx);
+ err = ext4_run_li_request(elr);
+ sb_end_write(elr->lr_super);
+ mutex_lock(&eli->li_list_mtx);
+ n = pos->next;
}
+ up_read((&elr->lr_super->s_umount));
+ }
+ /* error, remove the lazy_init job */
+ if (err) {
+ ext4_remove_li_request(elr);
+ continue;
+ }
+ if (!progress) {
+ elr->lr_next_sched = jiffies +
+ (prandom_u32()
+ % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
}
-
if (time_before(elr->lr_next_sched, next_wakeup))
next_wakeup = elr->lr_next_sched;
}
@@ -3179,6 +3211,8 @@ int ext4_calculate_overhead(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
+ struct inode *j_inode;
+ unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
ext4_fsblk_t overhead = 0;
char *buf = (char *) get_zeroed_page(GFP_NOFS);
@@ -3209,10 +3243,23 @@ int ext4_calculate_overhead(struct super_block *sb)
memset(buf, 0, PAGE_SIZE);
cond_resched();
}
- /* Add the internal journal blocks as well */
+
+ /*
+ * Add the internal journal blocks whether the journal has been
+ * loaded or not
+ */
if (sbi->s_journal && !sbi->journal_bdev)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
-
+ else if (ext4_has_feature_journal(sb) && !sbi->s_journal) {
+ j_inode = ext4_get_journal_inode(sb, j_inum);
+ if (j_inode) {
+ j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
+ overhead += EXT4_NUM_B2C(sbi, j_blocks);
+ iput(j_inode);
+ } else {
+ ext4_msg(sb, KERN_ERR, "can't get journal size");
+ }
+ }
sbi->s_overhead = overhead;
smp_wmb();
free_page((unsigned long) buf);
@@ -4208,18 +4255,16 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
write_unlock(&journal->j_state_lock);
}
-static journal_t *ext4_get_journal(struct super_block *sb,
- unsigned int journal_inum)
+static struct inode *ext4_get_journal_inode(struct super_block *sb,
+ unsigned int journal_inum)
{
struct inode *journal_inode;
- journal_t *journal;
-
- BUG_ON(!ext4_has_feature_journal(sb));
-
- /* First, test for the existence of a valid inode on disk. Bad
- * things happen if we iget() an unused inode, as the subsequent
- * iput() will try to delete it. */
+ /*
+ * Test for the existence of a valid inode on disk. Bad things
+ * happen if we iget() an unused inode, as the subsequent iput()
+ * will try to delete it.
+ */
journal_inode = ext4_iget(sb, journal_inum);
if (IS_ERR(journal_inode)) {
ext4_msg(sb, KERN_ERR, "no journal found");
@@ -4239,6 +4284,20 @@ static journal_t *ext4_get_journal(struct super_block *sb,
iput(journal_inode);
return NULL;
}
+ return journal_inode;
+}
+
+static journal_t *ext4_get_journal(struct super_block *sb,
+ unsigned int journal_inum)
+{
+ struct inode *journal_inode;
+ journal_t *journal;
+
+ BUG_ON(!ext4_has_feature_journal(sb));
+
+ journal_inode = ext4_get_journal_inode(sb, journal_inum);
+ if (!journal_inode)
+ return NULL;
journal = jbd2_journal_init_inode(journal_inode);
if (!journal) {
@@ -5250,12 +5309,18 @@ static int ext4_enable_quotas(struct super_block *sb)
le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
};
+ bool quota_mopt[EXT4_MAXQUOTAS] = {
+ test_opt(sb, USRQUOTA),
+ test_opt(sb, GRPQUOTA),
+ test_opt(sb, PRJQUOTA),
+ };
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
for (type = 0; type < EXT4_MAXQUOTAS; type++) {
if (qf_inums[type]) {
err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
- DQUOT_USAGE_ENABLED);
+ DQUOT_USAGE_ENABLED |
+ (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
if (err) {
ext4_warning(sb,
"Failed to enable quota tracking "
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 4d83d9e05f2e..557b3b0d668c 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -30,7 +30,6 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
char *caddr, *paddr = NULL;
struct fscrypt_str cstr, pstr;
struct fscrypt_symlink_data *sd;
- loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
int res;
u32 max_size = inode->i_sb->s_blocksize;
@@ -49,7 +48,6 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
if (IS_ERR(cpage))
return ERR_CAST(cpage);
caddr = page_address(cpage);
- caddr[size] = 0;
}
/* Symlink is encrypted */
@@ -65,16 +63,14 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
if (res)
goto errout;
+ paddr = pstr.name;
res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
- if (res < 0)
+ if (res)
goto errout;
- paddr = pstr.name;
-
/* Null-terminate the name */
- if (res <= pstr.len)
- paddr[res] = '\0';
+ paddr[pstr.len] = '\0';
if (cpage)
put_page(cpage);
set_delayed_call(done, kfree_link, paddr);
@@ -90,28 +86,19 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
- .removexattr = generic_removexattr,
};
const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = page_get_link,
.setattr = ext4_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
- .removexattr = generic_removexattr,
};
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = ext4_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
- .removexattr = generic_removexattr,
};
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 73bcfd41f5f2..42145be5c6b4 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -223,14 +223,18 @@ static struct attribute *ext4_attrs[] = {
EXT4_ATTR_FEATURE(lazy_itable_init);
EXT4_ATTR_FEATURE(batched_discard);
EXT4_ATTR_FEATURE(meta_bg_resize);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
EXT4_ATTR_FEATURE(encryption);
+#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
ATTR_LIST(meta_bg_resize),
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
ATTR_LIST(encryption),
+#endif
ATTR_LIST(metadata_csum_seed),
NULL,
};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2eb935ca5d9e..d77be9e9f535 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -61,18 +61,12 @@
#include "acl.h"
#ifdef EXT4_XATTR_DEBUG
-# define ea_idebug(inode, f...) do { \
- printk(KERN_DEBUG "inode %s:%lu: ", \
- inode->i_sb->s_id, inode->i_ino); \
- printk(f); \
- printk("\n"); \
- } while (0)
-# define ea_bdebug(bh, f...) do { \
- printk(KERN_DEBUG "block %pg:%lu: ", \
- bh->b_bdev, (unsigned long) bh->b_blocknr); \
- printk(f); \
- printk("\n"); \
- } while (0)
+# define ea_idebug(inode, fmt, ...) \
+ printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \
+ inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__)
+# define ea_bdebug(bh, fmt, ...) \
+ printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \
+ bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__)
#else
# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
@@ -199,6 +193,8 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
}
while (!IS_LAST_ENTRY(entry)) {
+ if (entry->e_value_block != 0)
+ return -EFSCORRUPTED;
if (entry->e_value_size != 0 &&
(value_start + le16_to_cpu(entry->e_value_offs) <
(void *)e + sizeof(__u32) ||
@@ -239,7 +235,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
int error = -EFSCORRUPTED;
if (((void *) header >= end) ||
- (header->h_magic != le32_to_cpu(EXT4_XATTR_MAGIC)))
+ (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
goto errout;
error = ext4_xattr_check_names(entry, end, entry);
errout:
@@ -641,7 +637,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
size_t *min_offs, void *base, int *total)
{
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- if (!last->e_value_block && last->e_value_size) {
+ if (last->e_value_size) {
size_t offs = le16_to_cpu(last->e_value_offs);
if (offs < *min_offs)
*min_offs = offs;
@@ -661,7 +657,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
/* Compute min_offs and last. */
last = s->first;
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- if (!last->e_value_block && last->e_value_size) {
+ if (last->e_value_size) {
size_t offs = le16_to_cpu(last->e_value_offs);
if (offs < min_offs)
min_offs = offs;
@@ -669,7 +665,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
}
free = min_offs - ((void *)last - s->base) - sizeof(__u32);
if (!s->not_found) {
- if (!s->here->e_value_block && s->here->e_value_size) {
+ if (s->here->e_value_size) {
size_t size = le32_to_cpu(s->here->e_value_size);
free += EXT4_XATTR_SIZE(size);
}
@@ -691,7 +687,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
s->here->e_name_len = name_len;
memcpy(s->here->e_name, i->name, name_len);
} else {
- if (!s->here->e_value_block && s->here->e_value_size) {
+ if (s->here->e_value_size) {
void *first_val = s->base + min_offs;
size_t offs = le16_to_cpu(s->here->e_value_offs);
void *val = s->base + offs;
@@ -725,8 +721,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
last = s->first;
while (!IS_LAST_ENTRY(last)) {
size_t o = le16_to_cpu(last->e_value_offs);
- if (!last->e_value_block &&
- last->e_value_size && o < offs)
+ if (last->e_value_size && o < offs)
last->e_value_offs =
cpu_to_le16(o + size);
last = EXT4_XATTR_NEXT(last);
@@ -1318,18 +1313,19 @@ retry:
*/
static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
int value_offs_shift, void *to,
- void *from, size_t n, int blocksize)
+ void *from, size_t n)
{
struct ext4_xattr_entry *last = entry;
int new_offs;
+ /* We always shift xattr headers further thus offsets get lower */
+ BUG_ON(value_offs_shift > 0);
+
/* Adjust the value offsets of the entries */
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- if (!last->e_value_block && last->e_value_size) {
+ if (last->e_value_size) {
new_offs = le16_to_cpu(last->e_value_offs) +
value_offs_shift;
- BUG_ON(new_offs + le32_to_cpu(last->e_value_size)
- > blocksize);
last->e_value_offs = cpu_to_le16(new_offs);
}
}
@@ -1338,6 +1334,141 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
}
/*
+ * Move xattr pointed to by 'entry' from inode into external xattr block
+ */
+static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
+ struct ext4_inode *raw_inode,
+ struct ext4_xattr_entry *entry)
+{
+ struct ext4_xattr_ibody_find *is = NULL;
+ struct ext4_xattr_block_find *bs = NULL;
+ char *buffer = NULL, *b_entry_name = NULL;
+ size_t value_offs, value_size;
+ struct ext4_xattr_info i = {
+ .value = NULL,
+ .value_len = 0,
+ .name_index = entry->e_name_index,
+ };
+ struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
+ int error;
+
+ value_offs = le16_to_cpu(entry->e_value_offs);
+ value_size = le32_to_cpu(entry->e_value_size);
+
+ is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
+ bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
+ buffer = kmalloc(value_size, GFP_NOFS);
+ b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
+ if (!is || !bs || !buffer || !b_entry_name) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ is->s.not_found = -ENODATA;
+ bs->s.not_found = -ENODATA;
+ is->iloc.bh = NULL;
+ bs->bh = NULL;
+
+ /* Save the entry name and the entry value */
+ memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size);
+ memcpy(b_entry_name, entry->e_name, entry->e_name_len);
+ b_entry_name[entry->e_name_len] = '\0';
+ i.name = b_entry_name;
+
+ error = ext4_get_inode_loc(inode, &is->iloc);
+ if (error)
+ goto out;
+
+ error = ext4_xattr_ibody_find(inode, &i, is);
+ if (error)
+ goto out;
+
+ /* Remove the chosen entry from the inode */
+ error = ext4_xattr_ibody_set(handle, inode, &i, is);
+ if (error)
+ goto out;
+
+ i.name = b_entry_name;
+ i.value = buffer;
+ i.value_len = value_size;
+ error = ext4_xattr_block_find(inode, &i, bs);
+ if (error)
+ goto out;
+
+ /* Add entry which was removed from the inode into the block */
+ error = ext4_xattr_block_set(handle, inode, &i, bs);
+ if (error)
+ goto out;
+ error = 0;
+out:
+ kfree(b_entry_name);
+ kfree(buffer);
+ if (is)
+ brelse(is->iloc.bh);
+ kfree(is);
+ kfree(bs);
+
+ return error;
+}
+
+static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode,
+ struct ext4_inode *raw_inode,
+ int isize_diff, size_t ifree,
+ size_t bfree, int *total_ino)
+{
+ struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
+ struct ext4_xattr_entry *small_entry;
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_entry *last;
+ unsigned int entry_size; /* EA entry size */
+ unsigned int total_size; /* EA entry size + value size */
+ unsigned int min_total_size;
+ int error;
+
+ while (isize_diff > ifree) {
+ entry = NULL;
+ small_entry = NULL;
+ min_total_size = ~0U;
+ last = IFIRST(header);
+ /* Find the entry best suited to be pushed into EA block */
+ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+ total_size =
+ EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
+ EXT4_XATTR_LEN(last->e_name_len);
+ if (total_size <= bfree &&
+ total_size < min_total_size) {
+ if (total_size + ifree < isize_diff) {
+ small_entry = last;
+ } else {
+ entry = last;
+ min_total_size = total_size;
+ }
+ }
+ }
+
+ if (entry == NULL) {
+ if (small_entry == NULL)
+ return -ENOSPC;
+ entry = small_entry;
+ }
+
+ entry_size = EXT4_XATTR_LEN(entry->e_name_len);
+ total_size = entry_size +
+ EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
+ error = ext4_xattr_move_to_block(handle, inode, raw_inode,
+ entry);
+ if (error)
+ return error;
+
+ *total_ino -= entry_size;
+ ifree += total_size;
+ bfree -= total_size;
+ }
+
+ return 0;
+}
+
+/*
* Expand an inode by new_extra_isize bytes when EAs are present.
* Returns 0 on success or negative error number on failure.
*/
@@ -1345,14 +1476,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle)
{
struct ext4_xattr_ibody_header *header;
- struct ext4_xattr_entry *entry, *last, *first;
struct buffer_head *bh = NULL;
- struct ext4_xattr_ibody_find *is = NULL;
- struct ext4_xattr_block_find *bs = NULL;
- char *buffer = NULL, *b_entry_name = NULL;
- size_t min_offs, free;
+ size_t min_offs;
+ size_t ifree, bfree;
int total_ino;
- void *base, *start, *end;
+ void *base, *end;
int error = 0, tried_min_extra_isize = 0;
int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
int isize_diff; /* How much do we need to grow i_extra_isize */
@@ -1368,34 +1496,24 @@ retry:
goto out;
header = IHDR(inode, raw_inode);
- entry = IFIRST(header);
/*
* Check if enough free space is available in the inode to shift the
* entries ahead by new_extra_isize.
*/
- base = start = entry;
+ base = IFIRST(header);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
min_offs = end - base;
- last = entry;
total_ino = sizeof(struct ext4_xattr_ibody_header);
error = xattr_check_inode(inode, header, end);
if (error)
goto cleanup;
- free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
- if (free >= isize_diff) {
- entry = IFIRST(header);
- ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
- - new_extra_isize, (void *)raw_inode +
- EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
- (void *)header, total_ino,
- inode->i_sb->s_blocksize);
- EXT4_I(inode)->i_extra_isize = new_extra_isize;
- goto out;
- }
+ ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino);
+ if (ifree >= isize_diff)
+ goto shift;
/*
* Enough free space isn't available in the inode, check if
@@ -1413,146 +1531,44 @@ retry:
goto cleanup;
}
base = BHDR(bh);
- first = BFIRST(bh);
end = bh->b_data + bh->b_size;
min_offs = end - base;
- free = ext4_xattr_free_space(first, &min_offs, base, NULL);
- if (free < isize_diff) {
+ bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base,
+ NULL);
+ if (bfree + ifree < isize_diff) {
if (!tried_min_extra_isize && s_min_extra_isize) {
tried_min_extra_isize++;
new_extra_isize = s_min_extra_isize;
brelse(bh);
goto retry;
}
- error = -1;
+ error = -ENOSPC;
goto cleanup;
}
} else {
- free = inode->i_sb->s_blocksize;
+ bfree = inode->i_sb->s_blocksize;
}
- while (isize_diff > 0) {
- size_t offs, size, entry_size;
- struct ext4_xattr_entry *small_entry = NULL;
- struct ext4_xattr_info i = {
- .value = NULL,
- .value_len = 0,
- };
- unsigned int total_size; /* EA entry size + value size */
- unsigned int shift_bytes; /* No. of bytes to shift EAs by? */
- unsigned int min_total_size = ~0U;
-
- is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
- bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
- if (!is || !bs) {
- error = -ENOMEM;
- goto cleanup;
- }
-
- is->s.not_found = -ENODATA;
- bs->s.not_found = -ENODATA;
- is->iloc.bh = NULL;
- bs->bh = NULL;
-
- last = IFIRST(header);
- /* Find the entry best suited to be pushed into EA block */
- entry = NULL;
- for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- total_size =
- EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
- EXT4_XATTR_LEN(last->e_name_len);
- if (total_size <= free && total_size < min_total_size) {
- if (total_size < isize_diff) {
- small_entry = last;
- } else {
- entry = last;
- min_total_size = total_size;
- }
- }
- }
-
- if (entry == NULL) {
- if (small_entry) {
- entry = small_entry;
- } else {
- if (!tried_min_extra_isize &&
- s_min_extra_isize) {
- tried_min_extra_isize++;
- new_extra_isize = s_min_extra_isize;
- kfree(is); is = NULL;
- kfree(bs); bs = NULL;
- brelse(bh);
- goto retry;
- }
- error = -1;
- goto cleanup;
- }
- }
- offs = le16_to_cpu(entry->e_value_offs);
- size = le32_to_cpu(entry->e_value_size);
- entry_size = EXT4_XATTR_LEN(entry->e_name_len);
- i.name_index = entry->e_name_index,
- buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS);
- b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
- if (!buffer || !b_entry_name) {
- error = -ENOMEM;
- goto cleanup;
+ error = ext4_xattr_make_inode_space(handle, inode, raw_inode,
+ isize_diff, ifree, bfree,
+ &total_ino);
+ if (error) {
+ if (error == -ENOSPC && !tried_min_extra_isize &&
+ s_min_extra_isize) {
+ tried_min_extra_isize++;
+ new_extra_isize = s_min_extra_isize;
+ brelse(bh);
+ goto retry;
}
- /* Save the entry name and the entry value */
- memcpy(buffer, (void *)IFIRST(header) + offs,
- EXT4_XATTR_SIZE(size));
- memcpy(b_entry_name, entry->e_name, entry->e_name_len);
- b_entry_name[entry->e_name_len] = '\0';
- i.name = b_entry_name;
-
- error = ext4_get_inode_loc(inode, &is->iloc);
- if (error)
- goto cleanup;
-
- error = ext4_xattr_ibody_find(inode, &i, is);
- if (error)
- goto cleanup;
-
- /* Remove the chosen entry from the inode */
- error = ext4_xattr_ibody_set(handle, inode, &i, is);
- if (error)
- goto cleanup;
- total_ino -= entry_size;
-
- entry = IFIRST(header);
- if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff)
- shift_bytes = isize_diff;
- else
- shift_bytes = entry_size + EXT4_XATTR_SIZE(size);
- /* Adjust the offsets and shift the remaining entries ahead */
- ext4_xattr_shift_entries(entry, -shift_bytes,
- (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
- EXT4_I(inode)->i_extra_isize + shift_bytes,
- (void *)header, total_ino, inode->i_sb->s_blocksize);
-
- isize_diff -= shift_bytes;
- EXT4_I(inode)->i_extra_isize += shift_bytes;
- header = IHDR(inode, raw_inode);
-
- i.name = b_entry_name;
- i.value = buffer;
- i.value_len = size;
- error = ext4_xattr_block_find(inode, &i, bs);
- if (error)
- goto cleanup;
-
- /* Add entry which was removed from the inode into the block */
- error = ext4_xattr_block_set(handle, inode, &i, bs);
- if (error)
- goto cleanup;
- kfree(b_entry_name);
- kfree(buffer);
- b_entry_name = NULL;
- buffer = NULL;
- brelse(is->iloc.bh);
- kfree(is);
- kfree(bs);
+ goto cleanup;
}
+shift:
+ /* Adjust the offsets and shift the remaining entries ahead */
+ ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize
+ - new_extra_isize, (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
+ (void *)header, total_ino);
+ EXT4_I(inode)->i_extra_isize = new_extra_isize;
brelse(bh);
out:
ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
@@ -1560,12 +1576,6 @@ out:
return 0;
cleanup:
- kfree(b_entry_name);
- kfree(buffer);
- if (is)
- brelse(is->iloc.bh);
- kfree(is);
- kfree(bs);
brelse(bh);
/*
* We deliberately leave EXT4_STATE_NO_EXPAND set here since inode
@@ -1734,7 +1744,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
*name++;
}
- if (entry->e_value_block == 0 && entry->e_value_size != 0) {
+ if (entry->e_value_size != 0) {
__le32 *value = (__le32 *)((char *)header +
le16_to_cpu(entry->e_value_offs));
for (n = (le32_to_cpu(entry->e_value_size) +
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 4dcc9e28dc5c..6fe23af509e1 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -109,14 +109,16 @@ fail:
return ERR_PTR(-EINVAL);
}
-static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
+static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi,
+ const struct posix_acl *acl, size_t *size)
{
struct f2fs_acl_header *f2fs_acl;
struct f2fs_acl_entry *entry;
int i;
- f2fs_acl = f2fs_kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
- sizeof(struct f2fs_acl_entry), GFP_NOFS);
+ f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) +
+ acl->a_count * sizeof(struct f2fs_acl_entry),
+ GFP_NOFS);
if (!f2fs_acl)
return ERR_PTR(-ENOMEM);
@@ -175,7 +177,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage);
if (retval > 0) {
- value = f2fs_kmalloc(retval, GFP_F2FS_ZERO);
+ value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO);
if (!value)
return ERR_PTR(-ENOMEM);
retval = f2fs_getxattr(inode, name_index, "", value,
@@ -210,12 +212,10 @@ static int __f2fs_set_acl(struct inode *inode, int type,
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- error = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (error < 0)
+ error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (error)
return error;
set_acl_inode(inode, inode->i_mode);
- if (error == 0)
- acl = NULL;
}
break;
@@ -230,7 +230,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
}
if (acl) {
- value = f2fs_acl_to_disk(acl, &size);
+ value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size);
if (IS_ERR(value)) {
clear_inode_flag(inode, FI_ACL_MODE);
return (int)PTR_ERR(value);
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index b2334d11dae8..2c685185c24d 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -41,7 +41,6 @@ extern int f2fs_set_acl(struct inode *, struct posix_acl *, int);
extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
struct page *);
#else
-#define f2fs_check_acl NULL
#define f2fs_get_acl NULL
#define f2fs_set_acl NULL
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f94d01e7d001..7e9b504bd8b2 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -28,7 +28,7 @@ struct kmem_cache *inode_entry_slab;
void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
{
- set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
sbi->sb->s_flags |= MS_RDONLY;
if (!end_io)
f2fs_flush_merged_bios(sbi);
@@ -267,7 +267,6 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
- struct blk_plug plug;
long diff, written;
/* collect a number of dirty meta pages and write together */
@@ -280,9 +279,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
/* if mounting is failed, skip writing node pages */
mutex_lock(&sbi->cp_mutex);
diff = nr_pages_to_write(sbi, META, wbc);
- blk_start_plug(&plug);
written = sync_meta_pages(sbi, META, wbc->nr_to_write);
- blk_finish_plug(&plug);
mutex_unlock(&sbi->cp_mutex);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -388,6 +385,9 @@ const struct address_space_operations f2fs_meta_aops = {
.set_page_dirty = f2fs_set_meta_page_dirty,
.invalidatepage = f2fs_invalidate_page,
.releasepage = f2fs_release_page,
+#ifdef CONFIG_MIGRATION
+ .migratepage = f2fs_migrate_page,
+#endif
};
static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -491,7 +491,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
spin_lock(&im->ino_lock);
#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(FAULT_ORPHAN)) {
+ if (time_to_inject(sbi, FAULT_ORPHAN)) {
spin_unlock(&im->ino_lock);
return -ENOSPC;
}
@@ -531,8 +531,20 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct inode *inode;
+ struct node_info ni;
+ int err = acquire_orphan_inode(sbi);
+
+ if (err) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: orphan failed (ino=%x), run fsck to fix.",
+ __func__, ino);
+ return err;
+ }
- inode = f2fs_iget(sbi->sb, ino);
+ __add_ino_entry(sbi, ino, ORPHAN_INO);
+
+ inode = f2fs_iget_retry(sbi->sb, ino);
if (IS_ERR(inode)) {
/*
* there should be a bug that we can't find the entry
@@ -546,6 +558,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
/* truncate all the data during iput */
iput(inode);
+
+ get_node_info(sbi, ino, &ni);
+
+ /* ENOMEM was fully retried in f2fs_evict_inode. */
+ if (ni.blk_addr != NULL_ADDR) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: orphan failed (ino=%x), run fsck to fix.",
+ __func__, ino);
+ return -EIO;
+ }
+ __remove_ino_entry(sbi, ino, ORPHAN_INO);
return 0;
}
@@ -554,7 +578,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
block_t start_blk, orphan_blocks, i, j;
int err;
- if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
+ if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
return 0;
start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
@@ -578,7 +602,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
f2fs_put_page(page, 1);
}
/* clear Orphan Flag */
- clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
+ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
return 0;
}
@@ -639,45 +663,55 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
}
}
-static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
- block_t cp_addr, unsigned long long *version)
+static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
+ struct f2fs_checkpoint **cp_block, struct page **cp_page,
+ unsigned long long *version)
{
- struct page *cp_page_1, *cp_page_2 = NULL;
unsigned long blk_size = sbi->blocksize;
- struct f2fs_checkpoint *cp_block;
- unsigned long long cur_version = 0, pre_version = 0;
- size_t crc_offset;
+ size_t crc_offset = 0;
__u32 crc = 0;
- /* Read the 1st cp block in this CP pack */
- cp_page_1 = get_meta_page(sbi, cp_addr);
+ *cp_page = get_meta_page(sbi, cp_addr);
+ *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
- /* get the version number */
- cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
- crc_offset = le32_to_cpu(cp_block->checksum_offset);
- if (crc_offset >= blk_size)
- goto invalid_cp1;
+ crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
+ if (crc_offset >= blk_size) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "invalid crc_offset: %zu", crc_offset);
+ return -EINVAL;
+ }
- crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
- if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
- goto invalid_cp1;
+ crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block
+ + crc_offset)));
+ if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) {
+ f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value");
+ return -EINVAL;
+ }
- pre_version = cur_cp_version(cp_block);
+ *version = cur_cp_version(*cp_block);
+ return 0;
+}
- /* Read the 2nd cp block in this CP pack */
- cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
- cp_page_2 = get_meta_page(sbi, cp_addr);
+static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
+ block_t cp_addr, unsigned long long *version)
+{
+ struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
+ struct f2fs_checkpoint *cp_block = NULL;
+ unsigned long long cur_version = 0, pre_version = 0;
+ int err;
- cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
- crc_offset = le32_to_cpu(cp_block->checksum_offset);
- if (crc_offset >= blk_size)
- goto invalid_cp2;
+ err = get_checkpoint_version(sbi, cp_addr, &cp_block,
+ &cp_page_1, version);
+ if (err)
+ goto invalid_cp1;
+ pre_version = *version;
- crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
- if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
+ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+ err = get_checkpoint_version(sbi, cp_addr, &cp_block,
+ &cp_page_2, version);
+ if (err)
goto invalid_cp2;
-
- cur_version = cur_cp_version(cp_block);
+ cur_version = *version;
if (cur_version == pre_version) {
*version = cur_version;
@@ -972,10 +1006,40 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
finish_wait(&sbi->cp_wait, &wait);
}
+static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
+ unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+
+ spin_lock(&sbi->cp_lock);
+
+ if (cpc->reason == CP_UMOUNT)
+ __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+
+ if (cpc->reason == CP_FASTBOOT)
+ __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+
+ if (orphan_num)
+ __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
+ __set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+
+ /* set this flag to activate crc|cp_ver for recovery */
+ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG);
+
+ spin_unlock(&sbi->cp_lock);
+}
+
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
- struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
nid_t last_nid = nm_i->next_scan_nid;
@@ -984,19 +1048,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
__u32 crc32 = 0;
int i;
int cp_payload_blks = __cp_payload(sbi);
- block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg);
- bool invalidate = false;
struct super_block *sb = sbi->sb;
struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
u64 kbytes_written;
- /*
- * This avoids to conduct wrong roll-forward operations and uses
- * metapages, so should be called prior to sync_meta_pages below.
- */
- if (!test_opt(sbi, LFS) && discard_next_dnode(sbi, discard_blk))
- invalidate = true;
-
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META)) {
sync_meta_pages(sbi, META, LONG_MAX);
@@ -1036,10 +1091,12 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* 2 cp + n data seg summary + orphan inode blocks */
data_sum_blocks = npages_for_summary_flush(sbi, false);
+ spin_lock(&sbi->cp_lock);
if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
- set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+ __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
else
- clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+ __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+ spin_unlock(&sbi->cp_lock);
orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
@@ -1054,23 +1111,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
cp_payload_blks + data_sum_blocks +
orphan_blocks);
- if (cpc->reason == CP_UMOUNT)
- set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
- else
- clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
-
- if (cpc->reason == CP_FASTBOOT)
- set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
- else
- clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
-
- if (orphan_num)
- set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
- else
- clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
-
- if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
- set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+ /* update ckpt flag for checkpoint */
+ update_ckpt_flags(sbi, cpc);
/* update SIT/NAT bitmap */
get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
@@ -1137,14 +1179,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* wait for previous submitted meta pages writeback */
wait_on_all_pages_writeback(sbi);
- /*
- * invalidate meta page which is used temporarily for zeroing out
- * block at the end of warm node chain.
- */
- if (invalidate)
- invalidate_mapping_pages(META_MAPPING(sbi), discard_blk,
- discard_blk);
-
release_ino_entry(sbi, false);
if (unlikely(f2fs_cp_error(sbi)))
@@ -1152,6 +1186,17 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
clear_prefree_segments(sbi, cpc);
clear_sbi_flag(sbi, SBI_IS_DIRTY);
+ clear_sbi_flag(sbi, SBI_NEED_CP);
+
+ /*
+ * redirty superblock if metadata like node page or inode cache is
+ * updated during writing checkpoint.
+ */
+ if (get_pages(sbi, F2FS_DIRTY_NODES) ||
+ get_pages(sbi, F2FS_DIRTY_IMETA))
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
+
+ f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS));
return 0;
}
@@ -1190,6 +1235,18 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_flush_merged_bios(sbi);
+ /* this is the case of multiple fstrims without any changes */
+ if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) {
+ f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt);
+ f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries);
+ f2fs_bug_on(sbi, prefree_segments(sbi));
+ flush_sit_entries(sbi, cpc);
+ clear_prefree_segments(sbi, cpc);
+ f2fs_wait_all_discard_bio(sbi);
+ unblock_operations(sbi);
+ goto out;
+ }
+
/*
* update checkpoint pack index
* Increase the version number so that
@@ -1205,6 +1262,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* unlock all the fs_lock[] in do_checkpoint() */
err = do_checkpoint(sbi, cpc);
+ f2fs_wait_all_discard_bio(sbi);
+
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ccb401eebc11..9ae194fd2fdb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -34,6 +34,11 @@ static void f2fs_read_end_io(struct bio *bio)
struct bio_vec *bvec;
int i;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO))
+ bio->bi_error = -EIO;
+#endif
+
if (f2fs_bio_encrypted(bio)) {
if (bio->bi_error) {
fscrypt_release_ctx(bio->bi_private);
@@ -70,7 +75,7 @@ static void f2fs_write_end_io(struct bio *bio)
fscrypt_pullback_bio_page(&page, true);
if (unlikely(bio->bi_error)) {
- set_bit(AS_EIO, &page->mapping->flags);
+ mapping_set_error(page->mapping, -EIO);
f2fs_stop_checkpoint(sbi, true);
}
end_page_writeback(page);
@@ -626,11 +631,13 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret = 0;
map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
- map.m_len = F2FS_BYTES_TO_BLK(iov_iter_count(from));
- map.m_next_pgofs = NULL;
+ map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
+ if (map.m_len > map.m_lblk)
+ map.m_len -= map.m_lblk;
+ else
+ map.m_len = 0;
- if (f2fs_encrypted_inode(inode))
- return 0;
+ map.m_next_pgofs = NULL;
if (iocb->ki_flags & IOCB_DIRECT) {
ret = f2fs_convert_inline_inode(inode);
@@ -672,6 +679,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
bool allocated = false;
block_t blkaddr;
+ if (!maxblocks)
+ return 0;
+
map->m_len = 0;
map->m_flags = 0;
@@ -783,6 +793,7 @@ skip:
err = reserve_new_blocks(&dn, prealloc);
if (err)
goto sync_out;
+ allocated = dn.node_changed;
map->m_len += dn.ofs_in_node - ofs_in_node;
if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) {
@@ -966,8 +977,8 @@ out:
return ret;
}
-struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
- unsigned nr_pages)
+static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
+ unsigned nr_pages)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct fscrypt_ctx *ctx = NULL;
@@ -1284,7 +1295,7 @@ write:
if (!wbc->for_reclaim)
need_balance_fs = true;
- else if (has_not_enough_free_secs(sbi, 0))
+ else if (has_not_enough_free_secs(sbi, 0, 0))
goto redirty_out;
err = -EAGAIN;
@@ -1344,6 +1355,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
int cycled;
int range_whole = 0;
int tag;
+ int nwritten = 0;
pagevec_init(&pvec, 0);
@@ -1418,6 +1430,8 @@ continue_unlock:
done_index = page->index + 1;
done = 1;
break;
+ } else {
+ nwritten++;
}
if (--wbc->nr_to_write <= 0 &&
@@ -1439,6 +1453,10 @@ continue_unlock:
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
+ if (nwritten)
+ f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host,
+ NULL, 0, DATA, WRITE);
+
return ret;
}
@@ -1480,7 +1498,6 @@ static int f2fs_write_data_pages(struct address_space *mapping,
* if some pages were truncated, we cannot guarantee its mapping->host
* to detect pending bios.
*/
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
remove_dirty_inode(inode);
return ret;
@@ -1518,8 +1535,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
* we already allocated all the blocks, so we don't need to get
* the block addresses when there is no need to fill the page.
*/
- if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
- len == PAGE_SIZE)
+ if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE)
return 0;
if (f2fs_has_inline_data(inode) ||
@@ -1616,7 +1632,7 @@ repeat:
if (err)
goto fail;
- if (need_balance && has_not_enough_free_secs(sbi, 0)) {
+ if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) {
unlock_page(page);
f2fs_balance_fs(sbi, true);
lock_page(page);
@@ -1633,22 +1649,12 @@ repeat:
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
- if (len == PAGE_SIZE)
- goto out_update;
- if (PageUptodate(page))
- goto out_clear;
-
- if ((pos & PAGE_MASK) >= i_size_read(inode)) {
- unsigned start = pos & (PAGE_SIZE - 1);
- unsigned end = start + len;
-
- /* Reading beyond i_size is simple: memset to zero */
- zero_user_segments(page, 0, start, end, PAGE_SIZE);
- goto out_update;
- }
+ if (len == PAGE_SIZE || PageUptodate(page))
+ return 0;
if (blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
} else {
struct bio *bio;
@@ -1676,11 +1682,6 @@ repeat:
goto fail;
}
}
-out_update:
- if (!PageUptodate(page))
- SetPageUptodate(page);
-out_clear:
- clear_cold_data(page);
return 0;
fail:
@@ -1698,11 +1699,26 @@ static int f2fs_write_end(struct file *file,
trace_f2fs_write_end(inode, pos, len, copied);
+ /*
+ * This should be come from len == PAGE_SIZE, and we expect copied
+ * should be PAGE_SIZE. Otherwise, we treat it with zero copied and
+ * let generic_perform_write() try to copy data again through copied=0.
+ */
+ if (!PageUptodate(page)) {
+ if (unlikely(copied != PAGE_SIZE))
+ copied = 0;
+ else
+ SetPageUptodate(page);
+ }
+ if (!copied)
+ goto unlock_out;
+
set_page_dirty(page);
+ clear_cold_data(page);
if (pos + copied > i_size_read(inode))
f2fs_i_size_write(inode, pos + copied);
-
+unlock_out:
f2fs_put_page(page, 1);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return copied;
@@ -1873,6 +1889,58 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, get_data_block_bmap);
}
+#ifdef CONFIG_MIGRATION
+#include <linux/migrate.h>
+
+int f2fs_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ int rc, extra_count;
+ struct f2fs_inode_info *fi = F2FS_I(mapping->host);
+ bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page);
+
+ BUG_ON(PageWriteback(page));
+
+ /* migrating an atomic written page is safe with the inmem_lock hold */
+ if (atomic_written && !mutex_trylock(&fi->inmem_lock))
+ return -EAGAIN;
+
+ /*
+ * A reference is expected if PagePrivate set when move mapping,
+ * however F2FS breaks this for maintaining dirty page counts when
+ * truncating pages. So here adjusting the 'extra_count' make it work.
+ */
+ extra_count = (atomic_written ? 1 : 0) - page_has_private(page);
+ rc = migrate_page_move_mapping(mapping, newpage,
+ page, NULL, mode, extra_count);
+ if (rc != MIGRATEPAGE_SUCCESS) {
+ if (atomic_written)
+ mutex_unlock(&fi->inmem_lock);
+ return rc;
+ }
+
+ if (atomic_written) {
+ struct inmem_pages *cur;
+ list_for_each_entry(cur, &fi->inmem_pages, list)
+ if (cur->page == page) {
+ cur->page = newpage;
+ break;
+ }
+ mutex_unlock(&fi->inmem_lock);
+ put_page(page);
+ get_page(newpage);
+ }
+
+ if (PagePrivate(page))
+ SetPagePrivate(newpage);
+ set_page_private(newpage, page_private(page));
+
+ migrate_page_copy(newpage, page);
+
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif
+
const struct address_space_operations f2fs_dblock_aops = {
.readpage = f2fs_read_data_page,
.readpages = f2fs_read_data_pages,
@@ -1885,4 +1953,7 @@ const struct address_space_operations f2fs_dblock_aops = {
.releasepage = f2fs_release_page,
.direct_IO = f2fs_direct_IO,
.bmap = f2fs_bmap,
+#ifdef CONFIG_MIGRATION
+ .migratepage = f2fs_migrate_page,
+#endif
};
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index badd407bb622..fb245bd302e4 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -45,6 +45,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+ si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
@@ -54,6 +55,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
si->valid_count = valid_user_blocks(sbi);
+ si->discard_blks = discard_blocks(sbi);
si->valid_node_count = valid_node_count(sbi);
si->valid_inode_count = valid_inode_count(sbi);
si->inline_xattr = atomic_read(&sbi->inline_xattr);
@@ -154,7 +156,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem += sizeof(struct sit_info);
si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
- si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+ si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+ if (f2fs_discard_en(sbi))
+ si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
si->base_mem += SIT_VBLOCK_MAP_SIZE;
if (sbi->segs_per_sec > 1)
si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
@@ -228,8 +232,13 @@ static int stat_show(struct seq_file *s, void *v)
si->ssa_area_segs, si->main_area_segs);
seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
si->overp_segs, si->rsvd_segs);
- seq_printf(s, "Utilization: %d%% (%d valid blocks)\n",
- si->utilization, si->valid_count);
+ if (test_opt(si->sbi, DISCARD))
+ seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n",
+ si->utilization, si->valid_count, si->discard_blks);
+ else
+ seq_printf(s, "Utilization: %u%% (%u valid blocks)\n",
+ si->utilization, si->valid_count);
+
seq_printf(s, " - Node: %u (Inode: %u, ",
si->valid_node_count, si->valid_inode_count);
seq_printf(s, "Other: %u)\n - Data: %u\n",
@@ -311,6 +320,8 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_data, si->ndirty_files);
seq_printf(s, " - meta: %4lld in %4d\n",
si->ndirty_meta, si->meta_pages);
+ seq_printf(s, " - imeta: %4lld\n",
+ si->ndirty_imeta);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
seq_printf(s, " - free_nids: %9d\n",
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 9054aeac8015..369f4513be37 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level)
return 4;
}
-unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
+static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
[F2FS_FT_UNKNOWN] = DT_UNKNOWN,
[F2FS_FT_REG_FILE] = DT_REG,
[F2FS_FT_DIR] = DT_DIR,
@@ -172,7 +172,10 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
int max_slots;
f2fs_hash_t namehash;
- namehash = f2fs_dentry_hash(&name);
+ if(fname->hash)
+ namehash = cpu_to_le32(fname->hash);
+ else
+ namehash = f2fs_dentry_hash(&name);
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
@@ -212,31 +215,17 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
return de;
}
-/*
- * Find an entry in the specified directory with the wanted name.
- * It returns the page where the entry was found (as a parameter - res_page),
- * and the entry itself. Page is returned mapped and unlocked.
- * Entry is guaranteed to be valid.
- */
-struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
- const struct qstr *child, struct page **res_page)
+struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
+ struct fscrypt_name *fname, struct page **res_page)
{
unsigned long npages = dir_blocks(dir);
struct f2fs_dir_entry *de = NULL;
unsigned int max_depth;
unsigned int level;
- struct fscrypt_name fname;
- int err;
-
- err = fscrypt_setup_filename(dir, child, 1, &fname);
- if (err) {
- *res_page = ERR_PTR(err);
- return NULL;
- }
if (f2fs_has_inline_dentry(dir)) {
*res_page = NULL;
- de = find_in_inline_dir(dir, &fname, res_page);
+ de = find_in_inline_dir(dir, fname, res_page);
goto out;
}
@@ -256,11 +245,35 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
for (level = 0; level < max_depth; level++) {
*res_page = NULL;
- de = find_in_level(dir, level, &fname, res_page);
+ de = find_in_level(dir, level, fname, res_page);
if (de || IS_ERR(*res_page))
break;
}
out:
+ return de;
+}
+
+/*
+ * Find an entry in the specified directory with the wanted name.
+ * It returns the page where the entry was found (as a parameter - res_page),
+ * and the entry itself. Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
+ const struct qstr *child, struct page **res_page)
+{
+ struct f2fs_dir_entry *de = NULL;
+ struct fscrypt_name fname;
+ int err;
+
+ err = fscrypt_setup_filename(dir, child, 1, &fname);
+ if (err) {
+ *res_page = ERR_PTR(err);
+ return NULL;
+ }
+
+ de = __f2fs_find_entry(dir, &fname, res_page);
+
fscrypt_free_filename(&fname);
return de;
}
@@ -299,7 +312,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
f2fs_dentry_kunmap(dir, page);
set_page_dirty(page);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
f2fs_mark_inode_dirty_sync(dir);
f2fs_put_page(page, 1);
}
@@ -375,7 +388,8 @@ static int make_empty_dir(struct inode *inode,
}
struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
- const struct qstr *name, struct page *dpage)
+ const struct qstr *new_name, const struct qstr *orig_name,
+ struct page *dpage)
{
struct page *page;
int err;
@@ -400,7 +414,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
if (err)
goto put_error;
- err = f2fs_init_security(inode, dir, name, page);
+ err = f2fs_init_security(inode, dir, orig_name, page);
if (err)
goto put_error;
@@ -417,8 +431,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
set_cold_node(inode, page);
}
- if (name)
- init_dent_inode(name, page);
+ if (new_name)
+ init_dent_inode(new_name, page);
/*
* This file should be checkpointed during fsync.
@@ -451,7 +465,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode,
f2fs_i_links_write(dir, true);
clear_inode_flag(inode, FI_NEW_INODE);
}
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
f2fs_mark_inode_dirty_sync(dir);
if (F2FS_I(dir)->i_current_depth != current_depth)
@@ -496,7 +510,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
de->ino = cpu_to_le32(ino);
set_de_type(de, mode);
for (i = 0; i < slots; i++) {
- test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
+ __set_bit_le(bit_pos + i, (void *)d->bitmap);
/* avoid wrong garbage data for readdir */
if (i)
(de + i)->name_len = 0;
@@ -504,6 +518,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
}
int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
+ const struct qstr *orig_name,
struct inode *inode, nid_t ino, umode_t mode)
{
unsigned int bit_pos;
@@ -530,7 +545,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
start:
#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(FAULT_DIR_DEPTH))
+ if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH))
return -ENOSPC;
#endif
if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
@@ -569,7 +584,8 @@ add_dentry:
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, new_name, NULL);
+ page = init_inode_metadata(inode, dir, new_name,
+ orig_name, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -599,6 +615,26 @@ fail:
return err;
}
+int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname,
+ struct inode *inode, nid_t ino, umode_t mode)
+{
+ struct qstr new_name;
+ int err = -EAGAIN;
+
+ new_name.name = fname_name(fname);
+ new_name.len = fname_len(fname);
+
+ if (f2fs_has_inline_dentry(dir))
+ err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname,
+ inode, ino, mode);
+ if (err == -EAGAIN)
+ err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname,
+ inode, ino, mode);
+
+ f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
+ return err;
+}
+
/*
* Caller should grab and release a rwsem by calling f2fs_lock_op() and
* f2fs_unlock_op().
@@ -607,24 +643,15 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
struct inode *inode, nid_t ino, umode_t mode)
{
struct fscrypt_name fname;
- struct qstr new_name;
int err;
err = fscrypt_setup_filename(dir, name, 0, &fname);
if (err)
return err;
- new_name.name = fname_name(&fname);
- new_name.len = fname_len(&fname);
-
- err = -EAGAIN;
- if (f2fs_has_inline_dentry(dir))
- err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode);
- if (err == -EAGAIN)
- err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode);
+ err = __f2fs_do_add_link(dir, &fname, inode, ino, mode);
fscrypt_free_filename(&fname);
- f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
return err;
}
@@ -634,7 +661,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
int err = 0;
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, NULL, NULL);
+ page = init_inode_metadata(inode, dir, NULL, NULL, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -656,7 +683,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
if (S_ISDIR(inode->i_mode))
f2fs_i_links_write(dir, false);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
f2fs_i_links_write(inode, false);
if (S_ISDIR(inode->i_mode)) {
@@ -703,7 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
kunmap(page); /* kunmap - pair of f2fs_find_entry */
set_page_dirty(page);
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
f2fs_mark_inode_dirty_sync(dir);
if (inode)
@@ -786,19 +813,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
if (f2fs_encrypted_inode(d->inode)) {
int save_len = fstr->len;
- int ret;
+ int err;
- de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS);
- if (!de_name.name)
- return false;
-
- memcpy(de_name.name, d->filename[bit_pos], de_name.len);
-
- ret = fscrypt_fname_disk_to_usr(d->inode,
+ err = fscrypt_fname_disk_to_usr(d->inode,
(u32)de->hash_code, 0,
&de_name, fstr);
- kfree(de_name.name);
- if (ret < 0)
+ if (err)
return true;
de_name = *fstr;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 14f5fe2b841e..9e8de18a168a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -46,6 +46,8 @@ enum {
FAULT_BLOCK,
FAULT_DIR_DEPTH,
FAULT_EVICT_INODE,
+ FAULT_IO,
+ FAULT_CHECKPOINT,
FAULT_MAX,
};
@@ -55,40 +57,8 @@ struct f2fs_fault_info {
unsigned int inject_type;
};
-extern struct f2fs_fault_info f2fs_fault;
extern char *fault_name[FAULT_MAX];
-#define IS_FAULT_SET(type) (f2fs_fault.inject_type & (1 << (type)))
-
-static inline bool time_to_inject(int type)
-{
- if (!f2fs_fault.inject_rate)
- return false;
- if (type == FAULT_KMALLOC && !IS_FAULT_SET(type))
- return false;
- else if (type == FAULT_PAGE_ALLOC && !IS_FAULT_SET(type))
- return false;
- else if (type == FAULT_ALLOC_NID && !IS_FAULT_SET(type))
- return false;
- else if (type == FAULT_ORPHAN && !IS_FAULT_SET(type))
- return false;
- else if (type == FAULT_BLOCK && !IS_FAULT_SET(type))
- return false;
- else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type))
- return false;
- else if (type == FAULT_EVICT_INODE && !IS_FAULT_SET(type))
- return false;
-
- atomic_inc(&f2fs_fault.inject_ops);
- if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) {
- atomic_set(&f2fs_fault.inject_ops, 0);
- printk("%sF2FS-fs : inject %s in %pF\n",
- KERN_INFO,
- fault_name[type],
- __builtin_return_address(0));
- return true;
- }
- return false;
-}
+#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type)))
#endif
/*
@@ -158,7 +128,7 @@ enum {
CP_DISCARD,
};
-#define DEF_BATCHED_TRIM_SECTIONS 32
+#define DEF_BATCHED_TRIM_SECTIONS 2
#define BATCHED_TRIM_SEGMENTS(sbi) \
(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
#define BATCHED_TRIM_BLOCKS(sbi) \
@@ -211,6 +181,13 @@ struct discard_entry {
int len; /* # of consecutive blocks of the discard */
};
+struct bio_entry {
+ struct list_head list;
+ struct bio *bio;
+ struct completion event;
+ int error;
+};
+
/* for the list of fsync inodes, used only during recovery */
struct fsync_inode_entry {
struct list_head list; /* list head */
@@ -645,6 +622,7 @@ struct f2fs_sm_info {
/* for small discard management */
struct list_head discard_list; /* 4KB discard list */
+ struct list_head wait_list; /* linked with issued discard bio */
int nr_discards; /* # of discards in the list */
int max_discards; /* max. discards to be issued */
@@ -748,6 +726,7 @@ enum {
SBI_NEED_FSCK, /* need fsck.f2fs to fix */
SBI_POR_DOING, /* recovery is doing or not */
SBI_NEED_SB_WRITE, /* need to recover superblock */
+ SBI_NEED_CP, /* need to checkpoint */
};
enum {
@@ -765,7 +744,7 @@ struct f2fs_sb_info {
struct proc_dir_entry *s_proc; /* proc entry */
struct f2fs_super_block *raw_super; /* raw super block pointer */
int valid_super_block; /* valid super block no */
- int s_flag; /* flags for sbi */
+ unsigned long s_flag; /* flags for sbi */
#ifdef CONFIG_F2FS_FS_ENCRYPTION
u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
@@ -785,6 +764,7 @@ struct f2fs_sb_info {
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
+ spinlock_t cp_lock; /* for flag in ckpt */
struct inode *meta_inode; /* cache meta blocks */
struct mutex cp_mutex; /* checkpoint procedure lock */
struct rw_semaphore cp_rwsem; /* blocking FS operations */
@@ -892,8 +872,37 @@ struct f2fs_sb_info {
/* Reference to checksum algorithm driver via cryptoapi */
struct crypto_shash *s_chksum_driver;
+
+ /* For fault injection */
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ struct f2fs_fault_info fault_info;
+#endif
};
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
+{
+ struct f2fs_fault_info *ffi = &sbi->fault_info;
+
+ if (!ffi->inject_rate)
+ return false;
+
+ if (!IS_FAULT_SET(ffi, type))
+ return false;
+
+ atomic_inc(&ffi->inject_ops);
+ if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
+ atomic_set(&ffi->inject_ops, 0);
+ printk("%sF2FS-fs : inject %s in %pF\n",
+ KERN_INFO,
+ fault_name[type],
+ __builtin_return_address(0));
+ return true;
+ }
+ return false;
+}
+#endif
+
/* For write statistics. Suppose sector size is 512 bytes,
* and the return value is in kbytes. s is of struct f2fs_sb_info.
*/
@@ -1034,17 +1043,17 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type)
{
- return sbi->s_flag & (0x01 << type);
+ return test_bit(type, &sbi->s_flag);
}
static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
{
- sbi->s_flag |= (0x01 << type);
+ set_bit(type, &sbi->s_flag);
}
static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
{
- sbi->s_flag &= ~(0x01 << type);
+ clear_bit(type, &sbi->s_flag);
}
static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
@@ -1052,26 +1061,57 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
return le64_to_cpu(cp->checkpoint_ver);
}
-static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
{
unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+
return ckpt_flags & f;
}
-static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
{
- unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+ return __is_set_ckpt_flags(F2FS_CKPT(sbi), f);
+}
+
+static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+ unsigned int ckpt_flags;
+
+ ckpt_flags = le32_to_cpu(cp->ckpt_flags);
ckpt_flags |= f;
cp->ckpt_flags = cpu_to_le32(ckpt_flags);
}
-static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
{
- unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+ spin_lock(&sbi->cp_lock);
+ __set_ckpt_flags(F2FS_CKPT(sbi), f);
+ spin_unlock(&sbi->cp_lock);
+}
+
+static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+ unsigned int ckpt_flags;
+
+ ckpt_flags = le32_to_cpu(cp->ckpt_flags);
ckpt_flags &= (~f);
cp->ckpt_flags = cpu_to_le32(ckpt_flags);
}
+static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
+{
+ spin_lock(&sbi->cp_lock);
+ __clear_ckpt_flags(F2FS_CKPT(sbi), f);
+ spin_unlock(&sbi->cp_lock);
+}
+
+static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
+{
+ struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+
+ return blk_queue_discard(q);
+}
+
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
down_read(&sbi->cp_rwsem);
@@ -1110,8 +1150,8 @@ static inline bool __remain_node_summaries(int reason)
static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
{
- return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) ||
- is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG));
+ return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) ||
+ is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG));
}
/*
@@ -1151,7 +1191,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
blkcnt_t diff;
#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(FAULT_BLOCK))
+ if (time_to_inject(sbi, FAULT_BLOCK))
return false;
#endif
/*
@@ -1193,6 +1233,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
percpu_counter_inc(&sbi->nr_pages[count_type]);
+
+ if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES)
+ return;
+
set_sbi_flag(sbi, SBI_IS_DIRTY);
}
@@ -1243,6 +1287,11 @@ static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
return sbi->total_valid_block_count;
}
+static inline block_t discard_blocks(struct f2fs_sb_info *sbi)
+{
+ return sbi->discard_blks;
+}
+
static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1376,7 +1425,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
if (page)
return page;
- if (time_to_inject(FAULT_PAGE_ALLOC))
+ if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC))
return NULL;
#endif
if (!for_write)
@@ -1804,7 +1853,7 @@ static inline int f2fs_readonly(struct super_block *sb)
static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
{
- return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ return is_set_ckpt_flags(sbi, CP_ERROR_FLAG);
}
static inline bool is_dot_dotdot(const struct qstr *str)
@@ -1827,10 +1876,11 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
return S_ISREG(inode->i_mode);
}
-static inline void *f2fs_kmalloc(size_t size, gfp_t flags)
+static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
+ size_t size, gfp_t flags)
{
#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(FAULT_KMALLOC))
+ if (time_to_inject(sbi, FAULT_KMALLOC))
return NULL;
#endif
return kmalloc(size, flags);
@@ -1885,6 +1935,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
*/
void f2fs_set_inode_flags(struct inode *);
struct inode *f2fs_iget(struct super_block *, unsigned long);
+struct inode *f2fs_iget_retry(struct super_block *, unsigned long);
int try_to_free_nats(struct f2fs_sb_info *, int);
int update_inode(struct inode *, struct page *);
int update_inode_page(struct inode *);
@@ -1900,7 +1951,6 @@ struct dentry *f2fs_get_parent(struct dentry *child);
/*
* dir.c
*/
-extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
void set_de_type(struct f2fs_dir_entry *, umode_t);
unsigned char get_de_type(struct f2fs_dir_entry *);
struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
@@ -1910,10 +1960,12 @@ bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
void do_make_empty_dir(struct inode *, struct inode *,
struct f2fs_dentry_ptr *);
struct page *init_inode_metadata(struct inode *, struct inode *,
- const struct qstr *, struct page *);
+ const struct qstr *, const struct qstr *, struct page *);
void update_parent_metadata(struct inode *, struct inode *, unsigned int);
int room_for_filename(const void *, int, int);
void f2fs_drop_nlink(struct inode *, struct inode *);
+struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *,
+ struct page **);
struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *,
struct page **);
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
@@ -1924,7 +1976,9 @@ int update_dent_inode(struct inode *, struct inode *, const struct qstr *);
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
const struct qstr *, f2fs_hash_t , unsigned int);
int f2fs_add_regular_entry(struct inode *, const struct qstr *,
- struct inode *, nid_t, umode_t);
+ const struct qstr *, struct inode *, nid_t, umode_t);
+int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *,
+ nid_t, umode_t);
int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
umode_t);
void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
@@ -2010,9 +2064,9 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
bool is_checkpointed_data(struct f2fs_sb_info *, block_t);
void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
+void f2fs_wait_all_discard_bio(struct f2fs_sb_info *);
void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *);
void release_discard_addrs(struct f2fs_sb_info *);
-bool discard_next_dnode(struct f2fs_sb_info *, block_t);
int npages_for_summary_flush(struct f2fs_sb_info *, bool);
void allocate_new_segments(struct f2fs_sb_info *);
int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
@@ -2095,6 +2149,10 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
void f2fs_set_page_dirty_nobuffers(struct page *);
void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
int f2fs_release_page(struct page *, gfp_t);
+#ifdef CONFIG_MIGRATION
+int f2fs_migrate_page(struct address_space *, struct page *, struct page *,
+ enum migrate_mode);
+#endif
/*
* gc.c
@@ -2123,13 +2181,14 @@ struct f2fs_stat_info {
unsigned long long hit_largest, hit_cached, hit_rbtree;
unsigned long long hit_total, total_ext;
int ext_tree, zombie_tree, ext_node;
- s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages;
+ s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
+ s64 inmem_pages;
unsigned int ndirty_dirs, ndirty_files, ndirty_all;
int nats, dirty_nats, sits, dirty_sits, fnids;
int total_count, utilization;
int bg_gc, wb_bios;
int inline_xattr, inline_inode, inline_dir, orphans;
- unsigned int valid_count, valid_node_count, valid_inode_count;
+ unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
int rsvd_segs, overp_segs;
@@ -2294,8 +2353,8 @@ bool recover_inline_data(struct inode *, struct page *);
struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
struct fscrypt_name *, struct page **);
int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
-int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
- nid_t, umode_t);
+int f2fs_add_inline_entry(struct inode *, const struct qstr *,
+ const struct qstr *, struct inode *, nid_t, umode_t);
void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
struct inode *, struct inode *);
bool f2fs_empty_inline_dir(struct inode *);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 28f4f4cbb8d8..c7865073cd26 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -135,7 +135,7 @@ static inline bool need_do_checkpoint(struct inode *inode)
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
need_cp = true;
- else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino))
+ else if (is_sbi_flag_set(sbi, SBI_NEED_CP))
need_cp = true;
else if (file_wrong_pino(inode))
need_cp = true;
@@ -523,7 +523,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
return 0;
if (cache_only) {
- page = f2fs_grab_cache_page(mapping, index, false);
+ page = find_lock_page(mapping, index);
if (page && PageUptodate(page))
goto truncate_out;
f2fs_put_page(page, 1);
@@ -631,7 +631,7 @@ int f2fs_truncate(struct inode *inode)
if (err)
return err;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
f2fs_mark_inode_dirty_sync(inode);
return 0;
}
@@ -680,7 +680,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *inode = d_inode(dentry);
int err;
- err = inode_change_ok(inode, attr);
+ err = setattr_prepare(dentry, attr);
if (err)
return err;
@@ -708,7 +708,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
}
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
}
}
@@ -732,10 +732,7 @@ const struct inode_operations f2fs_file_inode_operations = {
.get_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
#ifdef CONFIG_F2FS_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = f2fs_listxattr,
- .removexattr = generic_removexattr,
#endif
.fiemap = f2fs_fiemap,
};
@@ -1395,7 +1392,7 @@ static long f2fs_fallocate(struct file *file, int mode,
}
if (!ret) {
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
f2fs_mark_inode_dirty_sync(inode);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
@@ -1454,7 +1451,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
- unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ unsigned int flags;
unsigned int oldflags;
int ret;
@@ -1487,7 +1484,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
fi->i_flags = flags;
inode_unlock(inode);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
f2fs_set_inode_flags(inode);
out:
mnt_drop_write_file(filp);
@@ -1954,7 +1951,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
* avoid defragment running in SSR mode when free section are allocated
* intensively
*/
- if (has_not_enough_free_secs(sbi, sec_num)) {
+ if (has_not_enough_free_secs(sbi, 0, sec_num)) {
err = -EAGAIN;
goto out;
}
@@ -2085,6 +2082,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
return -EOPNOTSUPP;
+ if (src == dst) {
+ if (pos_in == pos_out)
+ return 0;
+ if (pos_out > pos_in && pos_out < pos_in + len)
+ return -EINVAL;
+ }
+
inode_lock(src);
if (src != dst) {
if (!inode_trylock(dst)) {
@@ -2136,8 +2140,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- ret = __exchange_data_block(src, dst, pos_in,
- pos_out, len >> F2FS_BLKSIZE_BITS, false);
+ ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS,
+ pos_out >> F2FS_BLKSIZE_BITS,
+ len >> F2FS_BLKSIZE_BITS, false);
if (!ret) {
if (dst_max_i_size)
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 8f7fa326ce95..6f14ee923acd 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -47,6 +47,11 @@ static int gc_thread_func(void *data)
continue;
}
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_CHECKPOINT))
+ f2fs_stop_checkpoint(sbi, false);
+#endif
+
/*
* [GC triggering condition]
* 0. GC is not conducted currently.
@@ -96,7 +101,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
dev_t dev = sbi->sb->s_bdev->bd_dev;
int err = 0;
- gc_th = f2fs_kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
+ gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
if (!gc_th) {
err = -ENOMEM;
goto out;
@@ -270,7 +275,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
- unsigned int secno, max_cost, last_victim;
+ unsigned int secno, last_victim;
unsigned int last_segment = MAIN_SEGS(sbi);
unsigned int nsearched = 0;
@@ -280,7 +285,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
- p.min_cost = max_cost = get_max_cost(sbi, &p);
+ p.min_cost = get_max_cost(sbi, &p);
if (p.max_search == 0)
goto out;
@@ -423,10 +428,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
static void gc_node_segment(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
- bool initial = true;
struct f2fs_summary *entry;
block_t start_addr;
int off;
+ int phase = 0;
start_addr = START_BLOCK(sbi, segno);
@@ -439,16 +444,24 @@ next_step:
struct node_info ni;
/* stop BG_GC if there is not enough free sections. */
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
return;
if (check_valid_map(sbi, segno, off) == 0)
continue;
- if (initial) {
+ if (phase == 0) {
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
+ META_NAT, true);
+ continue;
+ }
+
+ if (phase == 1) {
ra_node_page(sbi, nid);
continue;
}
+
+ /* phase == 2 */
node_page = get_node_page(sbi, nid);
if (IS_ERR(node_page))
continue;
@@ -469,10 +482,8 @@ next_step:
stat_inc_node_blk_count(sbi, 1, gc_type);
}
- if (initial) {
- initial = false;
+ if (++phase < 3)
goto next_step;
- }
}
/*
@@ -706,16 +717,23 @@ next_step:
struct node_info dni; /* dnode info for the data */
unsigned int ofs_in_node, nofs;
block_t start_bidx;
+ nid_t nid = le32_to_cpu(entry->nid);
/* stop BG_GC if there is not enough free sections. */
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
return;
if (check_valid_map(sbi, segno, off) == 0)
continue;
if (phase == 0) {
- ra_node_page(sbi, le32_to_cpu(entry->nid));
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
+ META_NAT, true);
+ continue;
+ }
+
+ if (phase == 1) {
+ ra_node_page(sbi, nid);
continue;
}
@@ -723,14 +741,14 @@ next_step:
if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
continue;
- if (phase == 1) {
+ if (phase == 2) {
ra_node_page(sbi, dni.ino);
continue;
}
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
- if (phase == 2) {
+ if (phase == 3) {
inode = f2fs_iget(sb, dni.ino);
if (IS_ERR(inode) || is_bad_inode(inode))
continue;
@@ -756,7 +774,7 @@ next_step:
continue;
}
- /* phase 3 */
+ /* phase 4 */
inode = find_gc_inode(gc_list, dni.ino);
if (inode) {
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -789,7 +807,7 @@ next_step:
}
}
- if (++phase < 4)
+ if (++phase < 5)
goto next_step;
}
@@ -815,7 +833,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
struct blk_plug plug;
unsigned int segno = start_segno;
unsigned int end_segno = start_segno + sbi->segs_per_sec;
- int seg_freed = 0;
+ int sec_freed = 0;
unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
SUM_TYPE_DATA : SUM_TYPE_NODE;
@@ -834,15 +852,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
for (segno = start_segno; segno < end_segno; segno++) {
- if (get_valid_blocks(sbi, segno, 1) == 0)
- continue;
-
/* find segment summary of victim */
sum_page = find_get_page(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
- f2fs_bug_on(sbi, !PageUptodate(sum_page));
f2fs_put_page(sum_page, 0);
+ if (get_valid_blocks(sbi, segno, 1) == 0 ||
+ !PageUptodate(sum_page) ||
+ unlikely(f2fs_cp_error(sbi)))
+ goto next;
+
sum = page_address(sum_page);
f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer)));
@@ -861,7 +880,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
gc_type);
stat_inc_seg_count(sbi, type, gc_type);
-
+next:
f2fs_put_page(sum_page, 0);
}
@@ -871,22 +890,20 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
blk_finish_plug(&plug);
- if (gc_type == FG_GC) {
- while (start_segno < end_segno)
- if (get_valid_blocks(sbi, start_segno++, 1) == 0)
- seg_freed++;
- }
+ if (gc_type == FG_GC &&
+ get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0)
+ sec_freed = 1;
stat_inc_call_count(sbi->stat_info);
- return seg_freed;
+ return sec_freed;
}
int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
{
unsigned int segno;
int gc_type = sync ? FG_GC : BG_GC;
- int sec_freed = 0, seg_freed;
+ int sec_freed = 0;
int ret = -EINVAL;
struct cp_control cpc;
struct gc_inode_list gc_list = {
@@ -905,7 +922,7 @@ gc_more:
goto stop;
}
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) {
gc_type = FG_GC;
/*
* If there is no victim and no prefree segment but still not
@@ -914,10 +931,14 @@ gc_more:
*/
if (__get_victim(sbi, &segno, gc_type) ||
prefree_segments(sbi)) {
- write_checkpoint(sbi, &cpc);
+ ret = write_checkpoint(sbi, &cpc);
+ if (ret)
+ goto stop;
segno = NULL_SEGNO;
- } else if (has_not_enough_free_secs(sbi, 0)) {
- write_checkpoint(sbi, &cpc);
+ } else if (has_not_enough_free_secs(sbi, 0, 0)) {
+ ret = write_checkpoint(sbi, &cpc);
+ if (ret)
+ goto stop;
}
}
@@ -925,20 +946,19 @@ gc_more:
goto stop;
ret = 0;
- seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
-
- if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
+ if (do_garbage_collect(sbi, segno, &gc_list, gc_type) &&
+ gc_type == FG_GC)
sec_freed++;
if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
if (!sync) {
- if (has_not_enough_free_secs(sbi, sec_freed))
+ if (has_not_enough_free_secs(sbi, sec_freed, 0))
goto gc_more;
if (gc_type == FG_GC)
- write_checkpoint(sbi, &cpc);
+ ret = write_checkpoint(sbi, &cpc);
}
stop:
mutex_unlock(&sbi->gc_mutex);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index ccea8735de59..5f1a67f756af 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -424,7 +424,7 @@ static int f2fs_add_inline_entries(struct inode *dir,
ino = le32_to_cpu(de->ino);
fake_mode = get_de_type(de) << S_SHIFT;
- err = f2fs_add_regular_entry(dir, &new_name, NULL,
+ err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL,
ino, fake_mode);
if (err)
goto punch_dentry_pages;
@@ -445,8 +445,8 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
struct f2fs_inline_dentry *backup_dentry;
int err;
- backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry),
- GFP_F2FS_ZERO);
+ backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir),
+ sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO);
if (!backup_dentry) {
f2fs_put_page(ipage, 1);
return -ENOMEM;
@@ -488,17 +488,17 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry);
}
-int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
- struct inode *inode, nid_t ino, umode_t mode)
+int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
+ const struct qstr *orig_name,
+ struct inode *inode, nid_t ino, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct page *ipage;
unsigned int bit_pos;
f2fs_hash_t name_hash;
- size_t namelen = name->len;
struct f2fs_inline_dentry *dentry_blk = NULL;
struct f2fs_dentry_ptr d;
- int slots = GET_DENTRY_SLOTS(namelen);
+ int slots = GET_DENTRY_SLOTS(new_name->len);
struct page *page = NULL;
int err = 0;
@@ -519,18 +519,21 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, name, ipage);
+ page = init_inode_metadata(inode, dir, new_name,
+ orig_name, ipage);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
}
+ if (f2fs_encrypted_inode(dir))
+ file_set_enc_name(inode);
}
f2fs_wait_on_page_writeback(ipage, NODE, true);
- name_hash = f2fs_dentry_hash(name);
+ name_hash = f2fs_dentry_hash(new_name);
make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
- f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos);
+ f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
set_page_dirty(ipage);
@@ -563,13 +566,13 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
inline_dentry = inline_data_addr(page);
bit_pos = dentry - inline_dentry->dentry;
for (i = 0; i < slots; i++)
- test_and_clear_bit_le(bit_pos + i,
+ __clear_bit_le(bit_pos + i,
&inline_dentry->dentry_bitmap);
set_page_dirty(page);
f2fs_put_page(page, 1);
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
f2fs_mark_inode_dirty_sync(dir);
if (inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 9ac5efc15347..d7369895a78a 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include "f2fs.h"
@@ -234,6 +235,20 @@ bad_inode:
return ERR_PTR(ret);
}
+struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino)
+{
+ struct inode *inode;
+retry:
+ inode = f2fs_iget(sb, ino);
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ }
+ return inode;
+}
+
int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
@@ -354,7 +369,7 @@ void f2fs_evict_inode(struct inode *inode)
goto no_delete;
#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(FAULT_EVICT_INODE))
+ if (time_to_inject(sbi, FAULT_EVICT_INODE))
goto no_delete;
#endif
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 73fa356f8fbb..489fa0d5f914 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_generation = sbi->s_next_generation++;
err = insert_inode_locked(inode);
@@ -91,18 +91,23 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
{
size_t slen = strlen(s);
size_t sublen = strlen(sub);
+ int i;
/*
* filename format of multimedia file should be defined as:
- * "filename + '.' + extension".
+ * "filename + '.' + extension + (optional: '.' + temp extension)".
*/
if (slen < sublen + 2)
return 0;
- if (s[slen - sublen - 1] != '.')
- return 0;
+ for (i = 1; i < slen - sublen; i++) {
+ if (s[i] != '.')
+ continue;
+ if (!strncasecmp(s + i + 1, sub, sublen))
+ return 1;
+ }
- return !strncasecmp(s + slen - sublen, sub, sublen);
+ return 0;
}
/*
@@ -177,7 +182,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
f2fs_balance_fs(sbi, true);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
ihold(inode);
set_inode_flag(inode, FI_INC_LINK);
@@ -449,7 +454,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
ostr.name = sd->encrypted_path;
ostr.len = disk_link.len;
err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
- if (err < 0)
+ if (err)
goto err_out;
sd->len = cpu_to_le16(ostr.len);
@@ -718,7 +723,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
- new_inode->i_ctime = CURRENT_TIME;
+ new_inode->i_ctime = current_time(new_inode);
down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
f2fs_i_links_write(new_inode, false);
@@ -772,7 +777,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
file_set_enc_name(old_inode);
up_write(&F2FS_I(old_inode)->i_sem);
- old_inode->i_ctime = CURRENT_TIME;
+ old_inode->i_ctime = current_time(old_inode);
f2fs_mark_inode_dirty_sync(old_inode);
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
@@ -927,7 +932,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
file_lost_pino(old_inode);
up_write(&F2FS_I(old_inode)->i_sem);
- old_dir->i_ctime = CURRENT_TIME;
+ old_dir->i_ctime = current_time(old_dir);
if (old_nlink) {
down_write(&F2FS_I(old_dir)->i_sem);
f2fs_i_links_write(old_dir, old_nlink > 0);
@@ -942,7 +947,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
file_lost_pino(new_inode);
up_write(&F2FS_I(new_inode)->i_sem);
- new_dir->i_ctime = CURRENT_TIME;
+ new_dir->i_ctime = current_time(new_dir);
if (new_nlink) {
down_write(&F2FS_I(new_dir)->i_sem);
f2fs_i_links_write(new_dir, new_nlink > 0);
@@ -1010,7 +1015,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
struct fscrypt_str cstr = FSTR_INIT(NULL, 0);
struct fscrypt_str pstr = FSTR_INIT(NULL, 0);
struct fscrypt_symlink_data *sd;
- loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
u32 max_size = inode->i_sb->s_blocksize;
int res;
@@ -1025,7 +1029,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
if (IS_ERR(cpage))
return ERR_CAST(cpage);
caddr = page_address(cpage);
- caddr[size] = 0;
/* Symlink is encrypted */
sd = (struct fscrypt_symlink_data *)caddr;
@@ -1048,7 +1051,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
goto errout;
res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
- if (res < 0)
+ if (res)
goto errout;
/* this is broken symlink case */
@@ -1060,7 +1063,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
paddr = pstr.name;
/* Null-terminate the name */
- paddr[res] = '\0';
+ paddr[pstr.len] = '\0';
put_page(cpage);
set_delayed_call(done, kfree_link, paddr);
@@ -1077,10 +1080,7 @@ const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = f2fs_listxattr,
- .removexattr = generic_removexattr,
#endif
};
@@ -1093,17 +1093,14 @@ const struct inode_operations f2fs_dir_inode_operations = {
.mkdir = f2fs_mkdir,
.rmdir = f2fs_rmdir,
.mknod = f2fs_mknod,
- .rename2 = f2fs_rename2,
+ .rename = f2fs_rename2,
.tmpfile = f2fs_tmpfile,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
#ifdef CONFIG_F2FS_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = f2fs_listxattr,
- .removexattr = generic_removexattr,
#endif
};
@@ -1113,10 +1110,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = f2fs_listxattr,
- .removexattr = generic_removexattr,
#endif
};
@@ -1126,9 +1120,6 @@ const struct inode_operations f2fs_special_inode_operations = {
.get_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
#ifdef CONFIG_F2FS_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = f2fs_listxattr,
- .removexattr = generic_removexattr,
#endif
};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f75d197d5beb..01177ecdeab8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -54,8 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
if (excess_cached_nats(sbi))
res = false;
- if (nm_i->nat_cnt > DEF_NAT_CACHE_THRESHOLD)
- res = false;
} else if (type == DIRTY_DENTS) {
if (sbi->sb->s_bdi->wb.dirty_exceeded)
return false;
@@ -1314,6 +1312,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
struct page *last_page = NULL;
bool marked = false;
nid_t ino = inode->i_ino;
+ int nwritten = 0;
if (atomic) {
last_page = last_fsync_dnode(sbi, ino);
@@ -1387,7 +1386,10 @@ continue_unlock:
unlock_page(page);
f2fs_put_page(last_page, 0);
break;
+ } else {
+ nwritten++;
}
+
if (page == last_page) {
f2fs_put_page(page, 0);
marked = true;
@@ -1409,6 +1411,9 @@ continue_unlock:
unlock_page(last_page);
goto retry;
}
+
+ if (nwritten)
+ f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE);
return ret ? -EIO: 0;
}
@@ -1418,6 +1423,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc)
struct pagevec pvec;
int step = 0;
int nwritten = 0;
+ int ret = 0;
pagevec_init(&pvec, 0);
@@ -1438,7 +1444,8 @@ next_step:
if (unlikely(f2fs_cp_error(sbi))) {
pagevec_release(&pvec);
- return -EIO;
+ ret = -EIO;
+ goto out;
}
/*
@@ -1489,6 +1496,8 @@ continue_unlock:
if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
unlock_page(page);
+ else
+ nwritten++;
if (--wbc->nr_to_write == 0)
break;
@@ -1506,14 +1515,17 @@ continue_unlock:
step++;
goto next_step;
}
- return nwritten;
+out:
+ if (nwritten)
+ f2fs_submit_merged_bio(sbi, NODE, WRITE);
+ return ret;
}
int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
{
pgoff_t index = 0, end = ULONG_MAX;
struct pagevec pvec;
- int ret2 = 0, ret = 0;
+ int ret2, ret = 0;
pagevec_init(&pvec, 0);
@@ -1542,10 +1554,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
cond_resched();
}
- if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags)))
- ret2 = -ENOSPC;
- if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags)))
- ret2 = -EIO;
+ ret2 = filemap_check_errors(NODE_MAPPING(sbi));
if (!ret)
ret = ret2;
return ret;
@@ -1672,6 +1681,9 @@ const struct address_space_operations f2fs_node_aops = {
.set_page_dirty = f2fs_set_node_page_dirty,
.invalidatepage = f2fs_invalidate_page,
.releasepage = f2fs_release_page,
+#ifdef CONFIG_MIGRATION
+ .migratepage = f2fs_migrate_page,
+#endif
};
static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
@@ -1838,7 +1850,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct free_nid *i = NULL;
retry:
#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(FAULT_ALLOC_NID))
+ if (time_to_inject(sbi, FAULT_ALLOC_NID))
return false;
#endif
if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
@@ -2015,10 +2027,12 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
if (unlikely(old_ni.blk_addr != NULL_ADDR))
return -EINVAL;
-
+retry:
ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
- if (!ipage)
- return -ENOMEM;
+ if (!ipage) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
/* Should not use this inode from free nid list */
remove_free_nid(NM_I(sbi), ino);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index fc7684554b1a..868bec65e51c 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -229,6 +229,37 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
f2fs_change_bit(block_off, nm_i->nat_bitmap);
}
+static inline nid_t ino_of_node(struct page *node_page)
+{
+ struct f2fs_node *rn = F2FS_NODE(node_page);
+ return le32_to_cpu(rn->footer.ino);
+}
+
+static inline nid_t nid_of_node(struct page *node_page)
+{
+ struct f2fs_node *rn = F2FS_NODE(node_page);
+ return le32_to_cpu(rn->footer.nid);
+}
+
+static inline unsigned int ofs_of_node(struct page *node_page)
+{
+ struct f2fs_node *rn = F2FS_NODE(node_page);
+ unsigned flag = le32_to_cpu(rn->footer.flag);
+ return flag >> OFFSET_BIT_SHIFT;
+}
+
+static inline __u64 cpver_of_node(struct page *node_page)
+{
+ struct f2fs_node *rn = F2FS_NODE(node_page);
+ return le64_to_cpu(rn->footer.cp_ver);
+}
+
+static inline block_t next_blkaddr_of_node(struct page *node_page)
+{
+ struct f2fs_node *rn = F2FS_NODE(node_page);
+ return le32_to_cpu(rn->footer.next_blkaddr);
+}
+
static inline void fill_node_footer(struct page *page, nid_t nid,
nid_t ino, unsigned int ofs, bool reset)
{
@@ -259,40 +290,30 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
struct f2fs_node *rn = F2FS_NODE(page);
+ size_t crc_offset = le32_to_cpu(ckpt->checksum_offset);
+ __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver);
- rn->footer.cp_ver = ckpt->checkpoint_ver;
+ if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) {
+ __u64 crc = le32_to_cpu(*((__le32 *)
+ ((unsigned char *)ckpt + crc_offset)));
+ cp_ver |= (crc << 32);
+ }
+ rn->footer.cp_ver = cpu_to_le64(cp_ver);
rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
}
-static inline nid_t ino_of_node(struct page *node_page)
-{
- struct f2fs_node *rn = F2FS_NODE(node_page);
- return le32_to_cpu(rn->footer.ino);
-}
-
-static inline nid_t nid_of_node(struct page *node_page)
+static inline bool is_recoverable_dnode(struct page *page)
{
- struct f2fs_node *rn = F2FS_NODE(node_page);
- return le32_to_cpu(rn->footer.nid);
-}
-
-static inline unsigned int ofs_of_node(struct page *node_page)
-{
- struct f2fs_node *rn = F2FS_NODE(node_page);
- unsigned flag = le32_to_cpu(rn->footer.flag);
- return flag >> OFFSET_BIT_SHIFT;
-}
-
-static inline unsigned long long cpver_of_node(struct page *node_page)
-{
- struct f2fs_node *rn = F2FS_NODE(node_page);
- return le64_to_cpu(rn->footer.cp_ver);
-}
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
+ size_t crc_offset = le32_to_cpu(ckpt->checksum_offset);
+ __u64 cp_ver = cur_cp_version(ckpt);
-static inline block_t next_blkaddr_of_node(struct page *node_page)
-{
- struct f2fs_node *rn = F2FS_NODE(node_page);
- return le32_to_cpu(rn->footer.next_blkaddr);
+ if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) {
+ __u64 crc = le32_to_cpu(*((__le32 *)
+ ((unsigned char *)ckpt + crc_offset)));
+ cp_ver |= (crc << 32);
+ }
+ return cpu_to_le64(cp_ver) == cpver_of_node(page);
}
/*
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 9e652d5a659b..2fc84a991325 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -68,15 +68,17 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
return NULL;
}
-static struct fsync_inode_entry *add_fsync_inode(struct list_head *head,
- struct inode *inode)
+static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
+ struct list_head *head, nid_t ino)
{
+ struct inode *inode;
struct fsync_inode_entry *entry;
- entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
- if (!entry)
- return NULL;
+ inode = f2fs_iget_retry(sbi->sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
entry->inode = inode;
list_add_tail(&entry->list, head);
@@ -96,48 +98,41 @@ static int recover_dentry(struct inode *inode, struct page *ipage,
struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
- struct qstr name;
+ struct fscrypt_name fname;
struct page *page;
struct inode *dir, *einode;
struct fsync_inode_entry *entry;
int err = 0;
+ char *name;
entry = get_fsync_inode(dir_list, pino);
if (!entry) {
- dir = f2fs_iget(inode->i_sb, pino);
- if (IS_ERR(dir)) {
- err = PTR_ERR(dir);
- goto out;
- }
-
- entry = add_fsync_inode(dir_list, dir);
- if (!entry) {
- err = -ENOMEM;
- iput(dir);
+ entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino);
+ if (IS_ERR(entry)) {
+ dir = ERR_CAST(entry);
+ err = PTR_ERR(entry);
goto out;
}
}
dir = entry->inode;
- if (file_enc_name(inode))
- return 0;
+ memset(&fname, 0, sizeof(struct fscrypt_name));
+ fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen);
+ fname.disk_name.name = raw_inode->i_name;
- name.len = le32_to_cpu(raw_inode->i_namelen);
- name.name = raw_inode->i_name;
-
- if (unlikely(name.len > F2FS_NAME_LEN)) {
+ if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) {
WARN_ON(1);
err = -ENAMETOOLONG;
goto out;
}
retry:
- de = f2fs_find_entry(dir, &name, &page);
+ de = __f2fs_find_entry(dir, &fname, &page);
if (de && inode->i_ino == le32_to_cpu(de->ino))
goto out_unmap_put;
if (de) {
- einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
+ einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino));
if (IS_ERR(einode)) {
WARN_ON(1);
err = PTR_ERR(einode);
@@ -156,18 +151,24 @@ retry:
} else if (IS_ERR(page)) {
err = PTR_ERR(page);
} else {
- err = __f2fs_add_link(dir, &name, inode,
+ err = __f2fs_do_add_link(dir, &fname, inode,
inode->i_ino, inode->i_mode);
}
+ if (err == -ENOMEM)
+ goto retry;
goto out;
out_unmap_put:
f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
out:
+ if (file_enc_name(inode))
+ name = "<encrypted>";
+ else
+ name = raw_inode->i_name;
f2fs_msg(inode->i_sb, KERN_NOTICE,
"%s: ino = %x, name = %s, dir = %lx, err = %d",
- __func__, ino_of_node(ipage), raw_inode->i_name,
+ __func__, ino_of_node(ipage), name,
IS_ERR(dir) ? 0 : dir->i_ino, err);
return err;
}
@@ -223,9 +224,7 @@ static bool is_same_inode(struct inode *inode, struct page *ipage)
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
- unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
- struct inode *inode;
struct page *page = NULL;
block_t blkaddr;
int err = 0;
@@ -242,7 +241,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
page = get_tmp_page(sbi, blkaddr);
- if (cp_ver != cpver_of_node(page))
+ if (!is_recoverable_dnode(page))
break;
if (!is_fsync_dnode(page))
@@ -263,23 +262,15 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
* CP | dnode(F) | inode(DF)
* For this case, we should not give up now.
*/
- inode = f2fs_iget(sbi->sb, ino_of_node(page));
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
+ entry = add_fsync_inode(sbi, head, ino_of_node(page));
+ if (IS_ERR(entry)) {
+ err = PTR_ERR(entry);
if (err == -ENOENT) {
err = 0;
goto next;
}
break;
}
-
- /* add this fsync inode to the list */
- entry = add_fsync_inode(head, inode);
- if (!entry) {
- err = -ENOMEM;
- iput(inode);
- break;
- }
}
entry->blkaddr = blkaddr;
@@ -363,7 +354,7 @@ got_it:
if (ino != dn->inode->i_ino) {
/* Deallocate previous index in the node page */
- inode = f2fs_iget(sbi->sb, ino);
+ inode = f2fs_iget_retry(sbi->sb, ino);
if (IS_ERR(inode))
return PTR_ERR(inode);
} else {
@@ -431,10 +422,15 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
end = start + ADDRS_PER_PAGE(page, inode);
set_new_dnode(&dn, inode, NULL, NULL, 0);
-
+retry_dn:
err = get_dnode_of_data(&dn, start, ALLOC_NODE);
- if (err)
+ if (err) {
+ if (err == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry_dn;
+ }
goto out;
+ }
f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
@@ -485,11 +481,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (err)
goto err;
}
-
+retry_prev:
/* Check the previous node page having this index */
err = check_index_in_prev_nodes(sbi, dest, &dn);
- if (err)
+ if (err) {
+ if (err == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry_prev;
+ }
goto err;
+ }
/* write dummy data page */
f2fs_replace_block(sbi, &dn, src, dest,
@@ -514,7 +515,6 @@ out:
static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
struct list_head *dir_list)
{
- unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
struct page *page = NULL;
int err = 0;
@@ -534,7 +534,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
page = get_tmp_page(sbi, blkaddr);
- if (cp_ver != cpver_of_node(page)) {
+ if (!is_recoverable_dnode(page)) {
f2fs_put_page(page, 1);
break;
}
@@ -626,38 +626,20 @@ out:
}
clear_sbi_flag(sbi, SBI_POR_DOING);
- if (err) {
- bool invalidate = false;
-
- if (test_opt(sbi, LFS)) {
- update_meta_page(sbi, NULL, blkaddr);
- invalidate = true;
- } else if (discard_next_dnode(sbi, blkaddr)) {
- invalidate = true;
- }
-
- /* Flush all the NAT/SIT pages */
- while (get_pages(sbi, F2FS_DIRTY_META))
- sync_meta_pages(sbi, META, LONG_MAX);
+ if (err)
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ mutex_unlock(&sbi->cp_mutex);
- /* invalidate temporary meta page */
- if (invalidate)
- invalidate_mapping_pages(META_MAPPING(sbi),
- blkaddr, blkaddr);
+ /* let's drop all the directory inodes for clean checkpoint */
+ destroy_fsync_dnodes(&dir_list);
- set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
- mutex_unlock(&sbi->cp_mutex);
- } else if (need_writecp) {
+ if (!err && need_writecp) {
struct cp_control cpc = {
.reason = CP_RECOVERY,
};
- mutex_unlock(&sbi->cp_mutex);
err = write_checkpoint(sbi, &cpc);
- } else {
- mutex_unlock(&sbi->cp_mutex);
}
- destroy_fsync_dnodes(&dir_list);
kmem_cache_destroy(fsync_entry_slab);
return ret ? ret: err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index a46296f57b02..fc886f008449 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -26,6 +26,7 @@
#define __reverse_ffz(x) __reverse_ffs(~(x))
static struct kmem_cache *discard_entry_slab;
+static struct kmem_cache *bio_entry_slab;
static struct kmem_cache *sit_entry_set_slab;
static struct kmem_cache *inmem_entry_slab;
@@ -344,6 +345,11 @@ int commit_inmem_pages(struct inode *inode)
*/
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_CHECKPOINT))
+ f2fs_stop_checkpoint(sbi, false);
+#endif
+
if (!need)
return;
@@ -355,7 +361,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
* We should do GC or end up with checkpoint, if there are so many dirty
* dir/node pages without enough free segments.
*/
- if (has_not_enough_free_secs(sbi, 0)) {
+ if (has_not_enough_free_secs(sbi, 0, 0)) {
mutex_lock(&sbi->gc_mutex);
f2fs_gc(sbi, false);
}
@@ -580,6 +586,74 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
mutex_unlock(&dirty_i->seglist_lock);
}
+static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi,
+ struct bio *bio)
+{
+ struct list_head *wait_list = &(SM_I(sbi)->wait_list);
+ struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS);
+
+ INIT_LIST_HEAD(&be->list);
+ be->bio = bio;
+ init_completion(&be->event);
+ list_add_tail(&be->list, wait_list);
+
+ return be;
+}
+
+void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi)
+{
+ struct list_head *wait_list = &(SM_I(sbi)->wait_list);
+ struct bio_entry *be, *tmp;
+
+ list_for_each_entry_safe(be, tmp, wait_list, list) {
+ struct bio *bio = be->bio;
+ int err;
+
+ wait_for_completion_io(&be->event);
+ err = be->error;
+ if (err == -EOPNOTSUPP)
+ err = 0;
+
+ if (err)
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Issue discard failed, ret: %d", err);
+
+ bio_put(bio);
+ list_del(&be->list);
+ kmem_cache_free(bio_entry_slab, be);
+ }
+}
+
+static void f2fs_submit_bio_wait_endio(struct bio *bio)
+{
+ struct bio_entry *be = (struct bio_entry *)bio->bi_private;
+
+ be->error = bio->bi_error;
+ complete(&be->event);
+}
+
+/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
+int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ struct bio *bio = NULL;
+ int err;
+
+ err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
+ &bio);
+ if (!err && bio) {
+ struct bio_entry *be = __add_bio_entry(sbi, bio);
+
+ bio->bi_private = be;
+ bio->bi_end_io = f2fs_submit_bio_wait_endio;
+ bio->bi_opf |= REQ_SYNC;
+ submit_bio(bio);
+ }
+
+ return err;
+}
+
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
block_t blkstart, block_t blklen)
{
@@ -597,29 +671,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
sbi->discard_blks--;
}
trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
- return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
-}
-
-bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
-{
- int err = -EOPNOTSUPP;
-
- if (test_opt(sbi, DISCARD)) {
- struct seg_entry *se = get_seg_entry(sbi,
- GET_SEGNO(sbi, blkaddr));
- unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
-
- if (f2fs_test_bit(offset, se->discard_map))
- return false;
-
- err = f2fs_issue_discard(sbi, blkaddr, 1);
- }
-
- if (err) {
- update_meta_page(sbi, NULL, blkaddr);
- return true;
- }
- return false;
+ return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0);
}
static void __add_discard_entry(struct f2fs_sb_info *sbi,
@@ -660,7 +712,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
bool force = (cpc->reason == CP_DISCARD);
int i;
- if (se->valid_blocks == max_blocks)
+ if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi))
return;
if (!force) {
@@ -719,11 +771,14 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct list_head *head = &(SM_I(sbi)->discard_list);
struct discard_entry *entry, *this;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct blk_plug plug;
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
unsigned int start = 0, end = -1;
unsigned int secno, start_segno;
bool force = (cpc->reason == CP_DISCARD);
+ blk_start_plug(&plug);
+
mutex_lock(&dirty_i->seglist_lock);
while (1) {
@@ -772,6 +827,8 @@ skip:
SM_I(sbi)->nr_discards -= entry->len;
kmem_cache_free(discard_entry_slab, entry);
}
+
+ blk_finish_plug(&plug);
}
static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
@@ -818,12 +875,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
if (del > 0) {
if (f2fs_test_and_set_bit(offset, se->cur_valid_map))
f2fs_bug_on(sbi, 1);
- if (!f2fs_test_and_set_bit(offset, se->discard_map))
+ if (f2fs_discard_en(sbi) &&
+ !f2fs_test_and_set_bit(offset, se->discard_map))
sbi->discard_blks--;
} else {
if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map))
f2fs_bug_on(sbi, 1);
- if (f2fs_test_and_clear_bit(offset, se->discard_map))
+ if (f2fs_discard_en(sbi) &&
+ f2fs_test_and_clear_bit(offset, se->discard_map))
sbi->discard_blks++;
}
if (!f2fs_test_bit(offset, se->ckpt_valid_map))
@@ -1202,7 +1261,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
struct curseg_info *curseg = CURSEG_I(sbi, type);
const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
- if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0))
+ if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0))
return v_ops->get_victim(sbi,
&(curseg)->next_segno, BG_GC, type, SSR);
@@ -1277,6 +1336,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
if (end <= MAIN_BLKADDR(sbi))
goto out;
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Found FS corruption, run fsck to fix.");
+ goto out;
+ }
+
/* start/end segment number in main_area */
start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
@@ -1301,6 +1366,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
mutex_lock(&sbi->gc_mutex);
err = write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
+ if (err)
+ break;
+
+ schedule();
}
out:
range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
@@ -1391,7 +1460,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
/* direct_io'ed data is aligned to the segment for better performance */
if (direct_io && curseg->next_blkoff &&
- !has_not_enough_free_secs(sbi, 0))
+ !has_not_enough_free_secs(sbi, 0, 0))
__allocate_new_segments(sbi, type);
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
@@ -1589,11 +1658,9 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
{
struct page *cpage;
- if (blkaddr == NEW_ADDR)
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
return;
- f2fs_bug_on(sbi, blkaddr == NULL_ADDR);
-
cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
if (cpage) {
f2fs_wait_on_page_writeback(cpage, DATA, true);
@@ -1739,7 +1806,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
int type = CURSEG_HOT_DATA;
int err;
- if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+ if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) {
int npages = npages_for_summary_flush(sbi, true);
if (npages >= 2)
@@ -1836,7 +1903,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi,
void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
{
- if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG))
+ if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG))
write_compacted_summaries(sbi, start_blk);
else
write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
@@ -2127,12 +2194,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
sit_i->sentries[start].ckpt_valid_map
= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
- sit_i->sentries[start].discard_map
- = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
if (!sit_i->sentries[start].cur_valid_map ||
- !sit_i->sentries[start].ckpt_valid_map ||
- !sit_i->sentries[start].discard_map)
+ !sit_i->sentries[start].ckpt_valid_map)
return -ENOMEM;
+
+ if (f2fs_discard_en(sbi)) {
+ sit_i->sentries[start].discard_map
+ = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ if (!sit_i->sentries[start].discard_map)
+ return -ENOMEM;
+ }
}
sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
@@ -2239,6 +2310,8 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
struct f2fs_journal *journal = curseg->journal;
+ struct seg_entry *se;
+ struct f2fs_sit_entry sit;
int sit_blk_cnt = SIT_BLK_CNT(sbi);
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
@@ -2251,41 +2324,58 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
end = (start_blk + readed) * sit_i->sents_per_block;
for (; start < end && start < MAIN_SEGS(sbi); start++) {
- struct seg_entry *se = &sit_i->sentries[start];
struct f2fs_sit_block *sit_blk;
- struct f2fs_sit_entry sit;
struct page *page;
- down_read(&curseg->journal_rwsem);
- for (i = 0; i < sits_in_cursum(journal); i++) {
- if (le32_to_cpu(segno_in_journal(journal, i))
- == start) {
- sit = sit_in_journal(journal, i);
- up_read(&curseg->journal_rwsem);
- goto got_it;
- }
- }
- up_read(&curseg->journal_rwsem);
-
+ se = &sit_i->sentries[start];
page = get_current_sit_page(sbi, start);
sit_blk = (struct f2fs_sit_block *)page_address(page);
sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
f2fs_put_page(page, 1);
-got_it:
+
check_block_count(sbi, start, &sit);
seg_info_from_raw_sit(se, &sit);
/* build discard map only one time */
- memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks;
-
- if (sbi->segs_per_sec > 1) {
- struct sec_entry *e = get_sec_entry(sbi, start);
- e->valid_blocks += se->valid_blocks;
+ if (f2fs_discard_en(sbi)) {
+ memcpy(se->discard_map, se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks += sbi->blocks_per_seg -
+ se->valid_blocks;
}
+
+ if (sbi->segs_per_sec > 1)
+ get_sec_entry(sbi, start)->valid_blocks +=
+ se->valid_blocks;
}
start_blk += readed;
} while (start_blk < sit_blk_cnt);
+
+ down_read(&curseg->journal_rwsem);
+ for (i = 0; i < sits_in_cursum(journal); i++) {
+ unsigned int old_valid_blocks;
+
+ start = le32_to_cpu(segno_in_journal(journal, i));
+ se = &sit_i->sentries[start];
+ sit = sit_in_journal(journal, i);
+
+ old_valid_blocks = se->valid_blocks;
+
+ check_block_count(sbi, start, &sit);
+ seg_info_from_raw_sit(se, &sit);
+
+ if (f2fs_discard_en(sbi)) {
+ memcpy(se->discard_map, se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks += old_valid_blocks -
+ se->valid_blocks;
+ }
+
+ if (sbi->segs_per_sec > 1)
+ get_sec_entry(sbi, start)->valid_blocks +=
+ se->valid_blocks - old_valid_blocks;
+ }
+ up_read(&curseg->journal_rwsem);
}
static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -2427,6 +2517,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
INIT_LIST_HEAD(&sm_info->discard_list);
+ INIT_LIST_HEAD(&sm_info->wait_list);
sm_info->nr_discards = 0;
sm_info->max_discards = 0;
@@ -2570,10 +2661,15 @@ int __init create_segment_manager_caches(void)
if (!discard_entry_slab)
goto fail;
+ bio_entry_slab = f2fs_kmem_cache_create("bio_entry",
+ sizeof(struct bio_entry));
+ if (!bio_entry_slab)
+ goto destroy_discard_entry;
+
sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
sizeof(struct sit_entry_set));
if (!sit_entry_set_slab)
- goto destory_discard_entry;
+ goto destroy_bio_entry;
inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
sizeof(struct inmem_pages));
@@ -2583,7 +2679,9 @@ int __init create_segment_manager_caches(void)
destroy_sit_entry_set:
kmem_cache_destroy(sit_entry_set_slab);
-destory_discard_entry:
+destroy_bio_entry:
+ kmem_cache_destroy(bio_entry_slab);
+destroy_discard_entry:
kmem_cache_destroy(discard_entry_slab);
fail:
return -ENOMEM;
@@ -2592,6 +2690,7 @@ fail:
void destroy_segment_manager_caches(void)
{
kmem_cache_destroy(sit_entry_set_slab);
+ kmem_cache_destroy(bio_entry_slab);
kmem_cache_destroy(discard_entry_slab);
kmem_cache_destroy(inmem_entry_slab);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index b33f73ec60a4..fecb856ad874 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -479,7 +479,8 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
reserved_sections(sbi) + 1);
}
-static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
+static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
+ int freed, int needed)
{
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
@@ -489,8 +490,8 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
- return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
- reserved_sections(sbi));
+ return (free_sections(sbi) + freed) <=
+ (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed);
}
static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
@@ -587,8 +588,8 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
{
- f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi)
- || blk_addr >= MAX_BLKADDR(sbi));
+ BUG_ON(blk_addr < SEG0_BLKADDR(sbi)
+ || blk_addr >= MAX_BLKADDR(sbi));
}
/*
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 7f863a645ab1..6132b4ce4e4c 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -40,7 +40,6 @@ static struct kmem_cache *f2fs_inode_cachep;
static struct kset *f2fs_kset;
#ifdef CONFIG_F2FS_FAULT_INJECTION
-struct f2fs_fault_info f2fs_fault;
char *fault_name[FAULT_MAX] = {
[FAULT_KMALLOC] = "kmalloc",
@@ -50,16 +49,21 @@ char *fault_name[FAULT_MAX] = {
[FAULT_BLOCK] = "no more block",
[FAULT_DIR_DEPTH] = "too big dir depth",
[FAULT_EVICT_INODE] = "evict_inode fail",
+ [FAULT_IO] = "IO error",
+ [FAULT_CHECKPOINT] = "checkpoint error",
};
-static void f2fs_build_fault_attr(unsigned int rate)
+static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi,
+ unsigned int rate)
{
+ struct f2fs_fault_info *ffi = &sbi->fault_info;
+
if (rate) {
- atomic_set(&f2fs_fault.inject_ops, 0);
- f2fs_fault.inject_rate = rate;
- f2fs_fault.inject_type = (1 << FAULT_MAX) - 1;
+ atomic_set(&ffi->inject_ops, 0);
+ ffi->inject_rate = rate;
+ ffi->inject_type = (1 << FAULT_MAX) - 1;
} else {
- memset(&f2fs_fault, 0, sizeof(struct f2fs_fault_info));
+ memset(ffi, 0, sizeof(struct f2fs_fault_info));
}
}
#endif
@@ -87,6 +91,7 @@ enum {
Opt_inline_xattr,
Opt_inline_data,
Opt_inline_dentry,
+ Opt_noinline_dentry,
Opt_flush_merge,
Opt_noflush_merge,
Opt_nobarrier,
@@ -118,6 +123,7 @@ static match_table_t f2fs_tokens = {
{Opt_inline_xattr, "inline_xattr"},
{Opt_inline_data, "inline_data"},
{Opt_inline_dentry, "inline_dentry"},
+ {Opt_noinline_dentry, "noinline_dentry"},
{Opt_flush_merge, "flush_merge"},
{Opt_noflush_merge, "noflush_merge"},
{Opt_nobarrier, "nobarrier"},
@@ -167,7 +173,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
#ifdef CONFIG_F2FS_FAULT_INJECTION
else if (struct_type == FAULT_INFO_RATE ||
struct_type == FAULT_INFO_TYPE)
- return (unsigned char *)&f2fs_fault;
+ return (unsigned char *)&sbi->fault_info;
#endif
return NULL;
}
@@ -312,6 +318,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(dirty_nats_ratio),
ATTR_LIST(cp_interval),
ATTR_LIST(idle_interval),
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ ATTR_LIST(inject_rate),
+ ATTR_LIST(inject_type),
+#endif
ATTR_LIST(lifetime_write_kbytes),
NULL,
};
@@ -327,22 +337,6 @@ static struct kobj_type f2fs_ktype = {
.release = f2fs_sb_release,
};
-#ifdef CONFIG_F2FS_FAULT_INJECTION
-/* sysfs for f2fs fault injection */
-static struct kobject f2fs_fault_inject;
-
-static struct attribute *f2fs_fault_attrs[] = {
- ATTR_LIST(inject_rate),
- ATTR_LIST(inject_type),
- NULL
-};
-
-static struct kobj_type f2fs_fault_ktype = {
- .default_attrs = f2fs_fault_attrs,
- .sysfs_ops = &f2fs_attr_ops,
-};
-#endif
-
void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
{
struct va_format vaf;
@@ -370,10 +364,6 @@ static int parse_options(struct super_block *sb, char *options)
char *p, *name;
int arg = 0;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- f2fs_build_fault_attr(0);
-#endif
-
if (!options)
return 0;
@@ -488,6 +478,9 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_inline_dentry:
set_opt(sbi, INLINE_DENTRY);
break;
+ case Opt_noinline_dentry:
+ clear_opt(sbi, INLINE_DENTRY);
+ break;
case Opt_flush_merge:
set_opt(sbi, FLUSH_MERGE);
break;
@@ -533,7 +526,7 @@ static int parse_options(struct super_block *sb, char *options)
if (args->from && match_int(args, &arg))
return -EINVAL;
#ifdef CONFIG_F2FS_FAULT_INJECTION
- f2fs_build_fault_attr(arg);
+ f2fs_build_fault_attr(sbi, arg);
#else
f2fs_msg(sb, KERN_INFO,
"FAULT_INJECTION was not selected");
@@ -730,7 +723,7 @@ static void f2fs_put_super(struct super_block *sb)
* clean checkpoint again.
*/
if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
- !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) {
+ !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
struct cp_control cpc = {
.reason = CP_UMOUNT,
};
@@ -878,6 +871,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",noinline_data");
if (test_opt(sbi, INLINE_DENTRY))
seq_puts(seq, ",inline_dentry");
+ else
+ seq_puts(seq, ",noinline_dentry");
if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
seq_puts(seq, ",flush_merge");
if (test_opt(sbi, NOBARRIER))
@@ -946,7 +941,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset)
seq_printf(seq, "%d|%-3u|", se->type,
get_valid_blocks(sbi, i, 1));
for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
- seq_printf(seq, "%x ", se->cur_valid_map[j]);
+ seq_printf(seq, " %.2x", se->cur_valid_map[j]);
seq_putc(seq, '\n');
}
return 0;
@@ -975,6 +970,7 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, BG_GC);
set_opt(sbi, INLINE_DATA);
+ set_opt(sbi, INLINE_DENTRY);
set_opt(sbi, EXTENT_CACHE);
sbi->sb->s_flags |= MS_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
@@ -991,6 +987,10 @@ static void default_options(struct f2fs_sb_info *sbi)
#ifdef CONFIG_F2FS_FS_POSIX_ACL
set_opt(sbi, POSIX_ACL);
#endif
+
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ f2fs_build_fault_attr(sbi, 0);
+#endif
}
static int f2fs_remount(struct super_block *sb, int *flags, char *data)
@@ -1001,6 +1001,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool need_restart_gc = false;
bool need_stop_gc = false;
bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ struct f2fs_fault_info ffi = sbi->fault_info;
+#endif
/*
* Save the old mount options in case we
@@ -1096,6 +1099,9 @@ restore_gc:
restore_opts:
sbi->mount_opt = org_mount_opt;
sbi->active_logs = active_logs;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ sbi->fault_info = ffi;
+#endif
return err;
}
@@ -1469,6 +1475,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
mutex_init(&sbi->umount_mutex);
mutex_init(&sbi->wio_mutex[NODE]);
mutex_init(&sbi->wio_mutex[DATA]);
+ spin_lock_init(&sbi->cp_lock);
#ifdef CONFIG_F2FS_FS_ENCRYPTION
memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX,
@@ -1810,7 +1817,7 @@ try_onemore:
* previous checkpoint was not done by clean system shutdown.
*/
if (bdev_read_only(sb->s_bdev) &&
- !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) {
+ !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
err = -EROFS;
goto free_kobj;
}
@@ -1818,6 +1825,9 @@ try_onemore:
if (need_fsck)
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ if (!retry)
+ goto skip_recovery;
+
err = recover_fsync_data(sbi, false);
if (err < 0) {
need_fsck = true;
@@ -1835,7 +1845,7 @@ try_onemore:
goto free_kobj;
}
}
-
+skip_recovery:
/* recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
@@ -1879,7 +1889,9 @@ free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
free_node_inode:
+ truncate_inode_pages_final(NODE_MAPPING(sbi));
mutex_lock(&sbi->umount_mutex);
+ release_ino_entry(sbi, true);
f2fs_leave_shrinker(sbi);
iput(sbi->node_inode);
mutex_unlock(&sbi->umount_mutex);
@@ -1978,16 +1990,6 @@ static int __init init_f2fs_fs(void)
err = -ENOMEM;
goto free_extent_cache;
}
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- f2fs_fault_inject.kset = f2fs_kset;
- f2fs_build_fault_attr(0);
- err = kobject_init_and_add(&f2fs_fault_inject, &f2fs_fault_ktype,
- NULL, "fault_injection");
- if (err) {
- f2fs_fault_inject.kset = NULL;
- goto free_kset;
- }
-#endif
err = register_shrinker(&f2fs_shrinker_info);
if (err)
goto free_kset;
@@ -2006,10 +2008,6 @@ free_filesystem:
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
free_kset:
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (f2fs_fault_inject.kset)
- kobject_put(&f2fs_fault_inject);
-#endif
kset_unregister(f2fs_kset);
free_extent_cache:
destroy_extent_cache();
@@ -2031,9 +2029,6 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
unregister_shrinker(&f2fs_shrinker_info);
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- kobject_put(&f2fs_fault_inject);
-#endif
kset_unregister(f2fs_kset);
destroy_extent_cache();
destroy_checkpoint_caches();
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index c8898b5148eb..3e1c0280f866 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -217,18 +217,20 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
return entry;
}
-static void *read_all_xattrs(struct inode *inode, struct page *ipage)
+static int read_all_xattrs(struct inode *inode, struct page *ipage,
+ void **base_addr)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_xattr_header *header;
size_t size = PAGE_SIZE, inline_size = 0;
void *txattr_addr;
+ int err;
inline_size = inline_xattr_size(inode);
txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
if (!txattr_addr)
- return NULL;
+ return -ENOMEM;
/* read from inline xattr */
if (inline_size) {
@@ -239,8 +241,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
inline_addr = inline_xattr_addr(ipage);
} else {
page = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
goto fail;
+ }
inline_addr = inline_xattr_addr(page);
}
memcpy(txattr_addr, inline_addr, inline_size);
@@ -254,8 +258,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
/* The inode already has an extended attribute block. */
xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
- if (IS_ERR(xpage))
+ if (IS_ERR(xpage)) {
+ err = PTR_ERR(xpage);
goto fail;
+ }
xattr_addr = page_address(xpage);
memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE);
@@ -269,10 +275,11 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
header->h_refcount = cpu_to_le32(1);
}
- return txattr_addr;
+ *base_addr = txattr_addr;
+ return 0;
fail:
kzfree(txattr_addr);
- return NULL;
+ return err;
}
static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
@@ -366,9 +373,9 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
- base_addr = read_all_xattrs(inode, ipage);
- if (!base_addr)
- return -ENOMEM;
+ error = read_all_xattrs(inode, ipage, &base_addr);
+ if (error)
+ return error;
entry = __find_xattr(base_addr, index, len, name);
if (IS_XATTR_LAST_ENTRY(entry)) {
@@ -402,9 +409,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
int error = 0;
size_t rest = buffer_size;
- base_addr = read_all_xattrs(inode, NULL);
- if (!base_addr)
- return -ENOMEM;
+ error = read_all_xattrs(inode, NULL, &base_addr);
+ if (error)
+ return error;
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
@@ -463,9 +470,9 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (size > MAX_VALUE_LEN(inode))
return -E2BIG;
- base_addr = read_all_xattrs(inode, ipage);
- if (!base_addr)
- return -ENOMEM;
+ error = read_all_xattrs(inode, ipage, &base_addr);
+ if (error)
+ return error;
/* find entry with wanted name. */
here = __find_xattr(base_addr, index, len, name);
@@ -541,13 +548,15 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
inode->i_mode = F2FS_I(inode)->i_acl_mode;
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
clear_inode_flag(inode, FI_ACL_MODE);
}
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
f2fs_mark_inode_dirty_sync(inode);
+ if (!error && S_ISDIR(inode->i_mode))
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
exit:
kzfree(base_addr);
return error;
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 663e428596c6..81cecbe6d7cf 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1071,7 +1071,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
}
}
- dir->i_mtime = dir->i_atime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_atime = current_time(dir);
if (IS_DIRSYNC(dir))
(void)fat_sync_inode(dir);
else
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f70185668832..3d04b124bce0 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -63,7 +63,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
/* Equivalent to a chmod() */
ia.ia_valid = ATTR_MODE | ATTR_CTIME;
- ia.ia_ctime = current_fs_time(inode->i_sb);
+ ia.ia_ctime = current_time(inode);
if (is_dir)
ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
else {
@@ -194,7 +194,7 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
if (err)
goto out;
- inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
mark_inode_dirty(inode);
if (IS_SYNC(inode)) {
int err2;
@@ -297,7 +297,7 @@ static int fat_free(struct inode *inode, int skip)
MSDOS_I(inode)->i_logstart = 0;
}
MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
- inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
if (wait) {
err = fat_sync_inode(inode);
if (err) {
@@ -450,7 +450,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~TIMES_SET_FLAGS;
}
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
attr->ia_valid = ia_valid;
if (error) {
if (sbi->options.quiet)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index da04c0298fab..338d2f73eb29 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -237,7 +237,7 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
if (err < len)
fat_write_failed(mapping, pos + len);
if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
mark_inode_dirty(inode);
}
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 664655b2c55f..7d6a105d601b 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -283,7 +283,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
goto out;
}
- ts = CURRENT_TIME_SEC;
+ ts = current_time(dir);
err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo);
if (err)
goto out;
@@ -330,7 +330,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
fat_detach(inode);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -364,7 +364,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out;
}
- ts = CURRENT_TIME_SEC;
+ ts = current_time(dir);
cluster = fat_alloc_new_dir(dir, &ts);
if (cluster < 0) {
err = cluster;
@@ -416,7 +416,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
clear_nlink(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
fat_detach(inode);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -481,7 +481,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
mark_inode_dirty(old_inode);
old_dir->i_version++;
- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+ old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
if (IS_DIRSYNC(old_dir))
(void)fat_sync_inode(old_dir);
else
@@ -490,7 +490,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
}
}
- ts = CURRENT_TIME_SEC;
+ ts = current_time(old_inode);
if (new_inode) {
if (err)
goto out;
@@ -596,12 +596,16 @@ error_inode:
/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */
static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct super_block *sb = old_dir->i_sb;
unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
int err, is_hid;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
mutex_lock(&MSDOS_SB(sb)->s_lock);
err = msdos_format_name(old_dentry->d_name.name,
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 92b7363dafa9..6a7152d0c250 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -21,6 +21,17 @@
#include <linux/namei.h>
#include "fat.h"
+static inline unsigned long vfat_d_version(struct dentry *dentry)
+{
+ return (unsigned long) dentry->d_fsdata;
+}
+
+static inline void vfat_d_version_set(struct dentry *dentry,
+ unsigned long version)
+{
+ dentry->d_fsdata = (void *) version;
+}
+
/*
* If new entry was created in the parent, it could create the 8.3
* alias (the shortname of logname). So, the parent may have the
@@ -33,7 +44,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
{
int ret = 1;
spin_lock(&dentry->d_lock);
- if (dentry->d_time != d_inode(dentry->d_parent)->i_version)
+ if (vfat_d_version(dentry) != d_inode(dentry->d_parent)->i_version)
ret = 0;
spin_unlock(&dentry->d_lock);
return ret;
@@ -759,7 +770,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
if (!inode)
- dentry->d_time = dir->i_version;
+ vfat_d_version_set(dentry, dir->i_version);
return d_splice_alias(inode, dentry);
error:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -777,7 +788,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
mutex_lock(&MSDOS_SB(sb)->s_lock);
- ts = CURRENT_TIME_SEC;
+ ts = current_time(dir);
err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
if (err)
goto out;
@@ -821,9 +832,9 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = current_time(inode);
fat_detach(inode);
- dentry->d_time = dir->i_version;
+ vfat_d_version_set(dentry, dir->i_version);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -847,9 +858,9 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = current_time(inode);
fat_detach(inode);
- dentry->d_time = dir->i_version;
+ vfat_d_version_set(dentry, dir->i_version);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -866,7 +877,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
mutex_lock(&MSDOS_SB(sb)->s_lock);
- ts = CURRENT_TIME_SEC;
+ ts = current_time(dir);
cluster = fat_alloc_new_dir(dir, &ts);
if (cluster < 0) {
err = cluster;
@@ -903,7 +914,8 @@ out:
}
static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct buffer_head *dotdot_bh;
struct msdos_dir_entry *dotdot_de;
@@ -914,6 +926,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
int err, is_dir, update_dotdot, corrupt = 0;
struct super_block *sb = old_dir->i_sb;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = d_inode(old_dentry);
new_inode = d_inode(new_dentry);
@@ -931,7 +946,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
- ts = CURRENT_TIME_SEC;
+ ts = current_time(old_dir);
if (new_inode) {
if (is_dir) {
err = fat_dir_empty(new_inode);
diff --git a/fs/file.c b/fs/file.c
index 6b1acdfe59da..69d6990e3021 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -23,12 +23,12 @@
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
-int sysctl_nr_open __read_mostly = 1024*1024;
-int sysctl_nr_open_min = BITS_PER_LONG;
+unsigned int sysctl_nr_open __read_mostly = 1024*1024;
+unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
-int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) &
- -BITS_PER_LONG;
+unsigned int sysctl_nr_open_max =
+ __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
static void *alloc_fdmem(size_t size)
{
@@ -163,7 +163,7 @@ out:
* Return <0 error code on error; 1 on successful completion.
* The files->file_lock should be held on entry, and will be held on exit.
*/
-static int expand_fdtable(struct files_struct *files, int nr)
+static int expand_fdtable(struct files_struct *files, unsigned int nr)
__releases(files->file_lock)
__acquires(files->file_lock)
{
@@ -208,7 +208,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
* expanded and execution may have blocked.
* The files->file_lock should be held on entry, and will be held on exit.
*/
-static int expand_files(struct files_struct *files, int nr)
+static int expand_files(struct files_struct *files, unsigned int nr)
__releases(files->file_lock)
__acquires(files->file_lock)
{
@@ -243,12 +243,12 @@ repeat:
return expanded;
}
-static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
+static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
__set_bit(fd, fdt->close_on_exec);
}
-static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
+static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
if (test_bit(fd, fdt->close_on_exec))
__clear_bit(fd, fdt->close_on_exec);
@@ -268,10 +268,10 @@ static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}
-static int count_open_files(struct fdtable *fdt)
+static unsigned int count_open_files(struct fdtable *fdt)
{
- int size = fdt->max_fds;
- int i;
+ unsigned int size = fdt->max_fds;
+ unsigned int i;
/* Find the last open fd */
for (i = size / BITS_PER_LONG; i > 0; ) {
@@ -291,7 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
- int open_files, i;
+ unsigned int open_files, i;
struct fdtable *old_fdt, *new_fdt;
*errorp = -ENOMEM;
@@ -391,7 +391,7 @@ static struct fdtable *close_files(struct files_struct * files)
* files structure.
*/
struct fdtable *fdt = rcu_dereference_raw(files->fdt);
- int i, j = 0;
+ unsigned int i, j = 0;
for (;;) {
unsigned long set;
@@ -477,11 +477,11 @@ struct files_struct init_files = {
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
};
-static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start)
+static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
- unsigned long maxfd = fdt->max_fds;
- unsigned long maxbit = maxfd / BITS_PER_LONG;
- unsigned long bitbit = start / BITS_PER_LONG;
+ unsigned int maxfd = fdt->max_fds;
+ unsigned int maxbit = maxfd / BITS_PER_LONG;
+ unsigned int bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
if (bitbit > maxfd)
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 1b2f6c2c3aaf..76f09ce7e5b2 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -1,5 +1,6 @@
config FUSE_FS
tristate "FUSE (Filesystem in Userspace) support"
+ select FS_POSIX_ACL
help
With FUSE it is possible to implement a fully functional filesystem
in a userspace program.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index e95eeb445e58..60da84a86dab 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -5,4 +5,4 @@
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
-fuse-objs := dev.o dir.o file.o inode.o control.o
+fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
new file mode 100644
index 000000000000..ec85765502f1
--- /dev/null
+++ b/fs/fuse/acl.c
@@ -0,0 +1,99 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2016 Canonical Ltd. <seth.forshee@canonical.com>
+ *
+ * This program can be distributed under the terms of the GNU GPL.
+ * See the file COPYING.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+
+struct posix_acl *fuse_get_acl(struct inode *inode, int type)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int size;
+ const char *name;
+ void *value = NULL;
+ struct posix_acl *acl;
+
+ if (!fc->posix_acl || fc->no_getxattr)
+ return NULL;
+
+ if (type == ACL_TYPE_ACCESS)
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ else if (type == ACL_TYPE_DEFAULT)
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ else
+ return ERR_PTR(-EOPNOTSUPP);
+
+ value = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = fuse_getxattr(inode, name, value, PAGE_SIZE);
+ if (size > 0)
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ else if ((size == 0) || (size == -ENODATA) ||
+ (size == -EOPNOTSUPP && fc->no_getxattr))
+ acl = NULL;
+ else if (size == -ERANGE)
+ acl = ERR_PTR(-E2BIG);
+ else
+ acl = ERR_PTR(size);
+
+ kfree(value);
+ return acl;
+}
+
+int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ const char *name;
+ int ret;
+
+ if (!fc->posix_acl || fc->no_setxattr)
+ return -EOPNOTSUPP;
+
+ if (type == ACL_TYPE_ACCESS)
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ else if (type == ACL_TYPE_DEFAULT)
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ else
+ return -EINVAL;
+
+ if (acl) {
+ /*
+ * Fuse userspace is responsible for updating access
+ * permissions in the inode, if needed. fuse_setxattr
+ * invalidates the inode attributes, which will force
+ * them to be refreshed the next time they are used,
+ * and it also updates i_ctime.
+ */
+ size_t size = posix_acl_xattr_size(acl->a_count);
+ void *value;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ value = kmalloc(size, GFP_KERNEL);
+ if (!value)
+ return -ENOMEM;
+
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (ret < 0) {
+ kfree(value);
+ return ret;
+ }
+
+ ret = fuse_setxattr(inode, name, value, size, 0);
+ kfree(value);
+ } else {
+ ret = fuse_removexattr(inode, name);
+ }
+ forget_all_cached_acls(inode);
+ fuse_invalidate_attr(inode);
+
+ return ret;
+}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index f863ac6647ac..6e22748b0704 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -220,7 +220,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
inode->i_mode = mode;
inode->i_uid = fc->user_id;
inode->i_gid = fc->group_id;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
/* setting ->i_op to NULL is not allowed */
if (iop)
inode->i_op = iop;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a94d2ed81ab4..70ea57c7b6bb 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -728,7 +728,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
struct pipe_buffer *buf = cs->pipebufs;
if (!cs->write) {
- err = buf->ops->confirm(cs->pipe, buf);
+ err = pipe_buf_confirm(cs->pipe, buf);
if (err)
return err;
@@ -767,7 +767,6 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
cs->len = err;
cs->offset = off;
cs->pg = page;
- cs->offset = off;
iov_iter_advance(cs->iter, err);
}
@@ -828,7 +827,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
fuse_copy_finish(cs);
- err = buf->ops->confirm(cs->pipe, buf);
+ err = pipe_buf_confirm(cs->pipe, buf);
if (err)
return err;
@@ -841,7 +840,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (cs->len != PAGE_SIZE)
goto out_fallback;
- if (buf->ops->steal(cs->pipe, buf) != 0)
+ if (pipe_buf_steal(cs->pipe, buf) != 0)
goto out_fallback;
newpage = buf->page;
@@ -1342,9 +1341,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags)
{
- int ret;
+ int total, ret;
int page_nr = 0;
- int do_wakeup = 0;
struct pipe_buffer *bufs;
struct fuse_copy_state cs;
struct fuse_dev *fud = fuse_get_dev(in);
@@ -1363,52 +1361,23 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
if (ret < 0)
goto out;
- ret = 0;
- pipe_lock(pipe);
-
- if (!pipe->readers) {
- send_sig(SIGPIPE, current, 0);
- if (!ret)
- ret = -EPIPE;
- goto out_unlock;
- }
-
if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
ret = -EIO;
- goto out_unlock;
+ goto out;
}
- while (page_nr < cs.nr_segs) {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
- struct pipe_buffer *buf = pipe->bufs + newbuf;
-
- buf->page = bufs[page_nr].page;
- buf->offset = bufs[page_nr].offset;
- buf->len = bufs[page_nr].len;
+ for (ret = total = 0; page_nr < cs.nr_segs; total += ret) {
/*
* Need to be careful about this. Having buf->ops in module
* code can Oops if the buffer persists after module unload.
*/
- buf->ops = &nosteal_pipe_buf_ops;
-
- pipe->nrbufs++;
- page_nr++;
- ret += buf->len;
-
- if (pipe->files)
- do_wakeup = 1;
- }
-
-out_unlock:
- pipe_unlock(pipe);
-
- if (do_wakeup) {
- smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible(&pipe->wait);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ bufs[page_nr].ops = &nosteal_pipe_buf_ops;
+ ret = add_to_pipe(pipe, &bufs[page_nr++]);
+ if (unlikely(ret < 0))
+ break;
}
-
+ if (total)
+ ret = total;
out:
for (; page_nr < cs.nr_segs; page_nr++)
put_page(bufs[page_nr].page);
@@ -1993,7 +1962,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
pipe->nrbufs--;
} else {
- ibuf->ops->get(pipe, ibuf);
+ pipe_buf_get(pipe, ibuf);
*obuf = *ibuf;
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
obuf->len = rem;
@@ -2015,10 +1984,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
ret = fuse_dev_do_write(fud, &cs, len);
- for (idx = 0; idx < nbuf; idx++) {
- struct pipe_buffer *buf = &bufs[idx];
- buf->ops->release(pipe, buf);
- }
+ for (idx = 0; idx < nbuf; idx++)
+ pipe_buf_release(pipe, &bufs[idx]);
+
out:
kfree(bufs);
return ret;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c47b7780ce37..6a4d0e5418a1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -13,6 +13,8 @@
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
{
@@ -37,47 +39,39 @@ static void fuse_advise_use_readdirplus(struct inode *dir)
set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
}
-#if BITS_PER_LONG >= 64
+union fuse_dentry {
+ u64 time;
+ struct rcu_head rcu;
+};
+
static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
{
- entry->d_time = time;
+ ((union fuse_dentry *) entry->d_fsdata)->time = time;
}
static inline u64 fuse_dentry_time(struct dentry *entry)
{
- return entry->d_time;
-}
-#else
-/*
- * On 32 bit archs store the high 32 bits of time in d_fsdata
- */
-static void fuse_dentry_settime(struct dentry *entry, u64 time)
-{
- entry->d_time = time;
- entry->d_fsdata = (void *) (unsigned long) (time >> 32);
-}
-
-static u64 fuse_dentry_time(struct dentry *entry)
-{
- return (u64) entry->d_time +
- ((u64) (unsigned long) entry->d_fsdata << 32);
+ return ((union fuse_dentry *) entry->d_fsdata)->time;
}
-#endif
/*
* FUSE caches dentries and attributes with separate timeout. The
* time in jiffies until the dentry/attributes are valid is stored in
- * dentry->d_time and fuse_inode->i_time respectively.
+ * dentry->d_fsdata and fuse_inode->i_time respectively.
*/
/*
* Calculate the time in jiffies until a dentry/attributes are valid
*/
-static u64 time_to_jiffies(unsigned long sec, unsigned long nsec)
+static u64 time_to_jiffies(u64 sec, u32 nsec)
{
if (sec || nsec) {
- struct timespec ts = {sec, nsec};
- return get_jiffies_64() + timespec_to_jiffies(&ts);
+ struct timespec64 ts = {
+ sec,
+ max_t(u32, nsec, NSEC_PER_SEC - 1)
+ };
+
+ return get_jiffies_64() + timespec64_to_jiffies(&ts);
} else
return 0;
}
@@ -243,6 +237,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
goto invalid;
+ forget_all_cached_acls(inode);
fuse_change_attributes(inode, &outarg.attr,
entry_attr_timeout(&outarg),
attr_version);
@@ -272,8 +267,23 @@ static int invalid_nodeid(u64 nodeid)
return !nodeid || nodeid == FUSE_ROOT_ID;
}
+static int fuse_dentry_init(struct dentry *dentry)
+{
+ dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
+
+ return dentry->d_fsdata ? 0 : -ENOMEM;
+}
+static void fuse_dentry_release(struct dentry *dentry)
+{
+ union fuse_dentry *fd = dentry->d_fsdata;
+
+ kfree_rcu(fd, rcu);
+}
+
const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
+ .d_init = fuse_dentry_init,
+ .d_release = fuse_dentry_release,
};
int fuse_valid_type(int m)
@@ -634,10 +644,10 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
return create_new_entry(fc, &args, dir, entry, S_IFLNK);
}
-static inline void fuse_update_ctime(struct inode *inode)
+void fuse_update_ctime(struct inode *inode)
{
if (!IS_NOCMTIME(inode)) {
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
mark_inode_dirty_sync(inode);
}
}
@@ -917,6 +927,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
if (time_before64(fi->i_time, get_jiffies_64())) {
r = true;
+ forget_all_cached_acls(inode);
err = fuse_do_getattr(inode, stat, file);
} else {
r = false;
@@ -1017,7 +1028,7 @@ int fuse_allow_current_process(struct fuse_conn *fc)
{
const struct cred *cred;
- if (fc->flags & FUSE_ALLOW_OTHER)
+ if (fc->allow_other)
return 1;
cred = current_cred();
@@ -1064,6 +1075,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
+ forget_all_cached_acls(inode);
return fuse_do_getattr(inode, NULL, NULL);
}
@@ -1092,7 +1104,7 @@ static int fuse_permission(struct inode *inode, int mask)
/*
* If attributes are needed, refresh them before proceeding
*/
- if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
+ if (fc->default_permissions ||
((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1105,7 +1117,7 @@ static int fuse_permission(struct inode *inode, int mask)
}
}
- if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
+ if (fc->default_permissions) {
err = generic_permission(inode, mask);
/* If permission is denied, try to refresh file
@@ -1233,6 +1245,7 @@ retry:
fi->nlookup++;
spin_unlock(&fc->lock);
+ forget_all_cached_acls(inode);
fuse_change_attributes(inode, &o->attr,
entry_attr_timeout(o),
attr_version);
@@ -1591,9 +1604,10 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
* vmtruncate() doesn't allow for this case, so do the rlimit checking
* and the actual truncation by hand.
*/
-int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
struct file *file)
{
+ struct inode *inode = d_inode(dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
FUSE_ARGS(args);
@@ -1605,10 +1619,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
int err;
bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
- if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+ if (!fc->default_permissions)
attr->ia_valid |= ATTR_FORCE;
- err = inode_change_ok(inode, attr);
+ err = setattr_prepare(dentry, attr);
if (err)
return err;
@@ -1702,172 +1716,75 @@ error:
static int fuse_setattr(struct dentry *entry, struct iattr *attr)
{
struct inode *inode = d_inode(entry);
-
- if (!fuse_allow_current_process(get_fuse_conn(inode)))
- return -EACCES;
-
- if (attr->ia_valid & ATTR_FILE)
- return fuse_do_setattr(inode, attr, attr->ia_file);
- else
- return fuse_do_setattr(inode, attr, NULL);
-}
-
-static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
- struct kstat *stat)
-{
- struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
+ struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL;
+ int ret;
- if (!fuse_allow_current_process(fc))
+ if (!fuse_allow_current_process(get_fuse_conn(inode)))
return -EACCES;
- return fuse_update_attributes(inode, stat, NULL, NULL);
-}
-
-static int fuse_setxattr(struct dentry *unused, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct fuse_conn *fc = get_fuse_conn(inode);
- FUSE_ARGS(args);
- struct fuse_setxattr_in inarg;
- int err;
-
- if (fc->no_setxattr)
- return -EOPNOTSUPP;
+ if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) {
+ attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID |
+ ATTR_MODE);
- memset(&inarg, 0, sizeof(inarg));
- inarg.size = size;
- inarg.flags = flags;
- args.in.h.opcode = FUSE_SETXATTR;
- args.in.h.nodeid = get_node_id(inode);
- args.in.numargs = 3;
- args.in.args[0].size = sizeof(inarg);
- args.in.args[0].value = &inarg;
- args.in.args[1].size = strlen(name) + 1;
- args.in.args[1].value = name;
- args.in.args[2].size = size;
- args.in.args[2].value = value;
- err = fuse_simple_request(fc, &args);
- if (err == -ENOSYS) {
- fc->no_setxattr = 1;
- err = -EOPNOTSUPP;
- }
- if (!err) {
- fuse_invalidate_attr(inode);
- fuse_update_ctime(inode);
+ /*
+ * The only sane way to reliably kill suid/sgid is to do it in
+ * the userspace filesystem
+ *
+ * This should be done on write(), truncate() and chown().
+ */
+ if (!fc->handle_killpriv) {
+ int kill;
+
+ /*
+ * ia_mode calculation may have used stale i_mode.
+ * Refresh and recalculate.
+ */
+ ret = fuse_do_getattr(inode, NULL, file);
+ if (ret)
+ return ret;
+
+ attr->ia_mode = inode->i_mode;
+ kill = should_remove_suid(entry);
+ if (kill & ATTR_KILL_SUID) {
+ attr->ia_valid |= ATTR_MODE;
+ attr->ia_mode &= ~S_ISUID;
+ }
+ if (kill & ATTR_KILL_SGID) {
+ attr->ia_valid |= ATTR_MODE;
+ attr->ia_mode &= ~S_ISGID;
+ }
+ }
}
- return err;
-}
-
-static ssize_t fuse_getxattr(struct dentry *entry, struct inode *inode,
- const char *name, void *value, size_t size)
-{
- struct fuse_conn *fc = get_fuse_conn(inode);
- FUSE_ARGS(args);
- struct fuse_getxattr_in inarg;
- struct fuse_getxattr_out outarg;
- ssize_t ret;
+ if (!attr->ia_valid)
+ return 0;
- if (fc->no_getxattr)
- return -EOPNOTSUPP;
+ ret = fuse_do_setattr(entry, attr, file);
+ if (!ret) {
+ /*
+ * If filesystem supports acls it may have updated acl xattrs in
+ * the filesystem, so forget cached acls for the inode.
+ */
+ if (fc->posix_acl)
+ forget_all_cached_acls(inode);
- memset(&inarg, 0, sizeof(inarg));
- inarg.size = size;
- args.in.h.opcode = FUSE_GETXATTR;
- args.in.h.nodeid = get_node_id(inode);
- args.in.numargs = 2;
- args.in.args[0].size = sizeof(inarg);
- args.in.args[0].value = &inarg;
- args.in.args[1].size = strlen(name) + 1;
- args.in.args[1].value = name;
- /* This is really two different operations rolled into one */
- args.out.numargs = 1;
- if (size) {
- args.out.argvar = 1;
- args.out.args[0].size = size;
- args.out.args[0].value = value;
- } else {
- args.out.args[0].size = sizeof(outarg);
- args.out.args[0].value = &outarg;
- }
- ret = fuse_simple_request(fc, &args);
- if (!ret && !size)
- ret = outarg.size;
- if (ret == -ENOSYS) {
- fc->no_getxattr = 1;
- ret = -EOPNOTSUPP;
+ /* Directory mode changed, may need to revalidate access */
+ if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE))
+ fuse_invalidate_entry_cache(entry);
}
return ret;
}
-static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
+static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
+ struct kstat *stat)
{
struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
- FUSE_ARGS(args);
- struct fuse_getxattr_in inarg;
- struct fuse_getxattr_out outarg;
- ssize_t ret;
if (!fuse_allow_current_process(fc))
return -EACCES;
- if (fc->no_listxattr)
- return -EOPNOTSUPP;
-
- memset(&inarg, 0, sizeof(inarg));
- inarg.size = size;
- args.in.h.opcode = FUSE_LISTXATTR;
- args.in.h.nodeid = get_node_id(inode);
- args.in.numargs = 1;
- args.in.args[0].size = sizeof(inarg);
- args.in.args[0].value = &inarg;
- /* This is really two different operations rolled into one */
- args.out.numargs = 1;
- if (size) {
- args.out.argvar = 1;
- args.out.args[0].size = size;
- args.out.args[0].value = list;
- } else {
- args.out.args[0].size = sizeof(outarg);
- args.out.args[0].value = &outarg;
- }
- ret = fuse_simple_request(fc, &args);
- if (!ret && !size)
- ret = outarg.size;
- if (ret == -ENOSYS) {
- fc->no_listxattr = 1;
- ret = -EOPNOTSUPP;
- }
- return ret;
-}
-
-static int fuse_removexattr(struct dentry *entry, const char *name)
-{
- struct inode *inode = d_inode(entry);
- struct fuse_conn *fc = get_fuse_conn(inode);
- FUSE_ARGS(args);
- int err;
-
- if (fc->no_removexattr)
- return -EOPNOTSUPP;
-
- args.in.h.opcode = FUSE_REMOVEXATTR;
- args.in.h.nodeid = get_node_id(inode);
- args.in.numargs = 1;
- args.in.args[0].size = strlen(name) + 1;
- args.in.args[0].value = name;
- err = fuse_simple_request(fc, &args);
- if (err == -ENOSYS) {
- fc->no_removexattr = 1;
- err = -EOPNOTSUPP;
- }
- if (!err) {
- fuse_invalidate_attr(inode);
- fuse_update_ctime(inode);
- }
- return err;
+ return fuse_update_attributes(inode, stat, NULL, NULL);
}
static const struct inode_operations fuse_dir_inode_operations = {
@@ -1876,7 +1793,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
.symlink = fuse_symlink,
.unlink = fuse_unlink,
.rmdir = fuse_rmdir,
- .rename2 = fuse_rename2,
+ .rename = fuse_rename2,
.link = fuse_link,
.setattr = fuse_setattr,
.create = fuse_create,
@@ -1884,10 +1801,9 @@ static const struct inode_operations fuse_dir_inode_operations = {
.mknod = fuse_mknod,
.permission = fuse_permission,
.getattr = fuse_getattr,
- .setxattr = fuse_setxattr,
- .getxattr = fuse_getxattr,
.listxattr = fuse_listxattr,
- .removexattr = fuse_removexattr,
+ .get_acl = fuse_get_acl,
+ .set_acl = fuse_set_acl,
};
static const struct file_operations fuse_dir_operations = {
@@ -1905,10 +1821,9 @@ static const struct inode_operations fuse_common_inode_operations = {
.setattr = fuse_setattr,
.permission = fuse_permission,
.getattr = fuse_getattr,
- .setxattr = fuse_setxattr,
- .getxattr = fuse_getxattr,
.listxattr = fuse_listxattr,
- .removexattr = fuse_removexattr,
+ .get_acl = fuse_get_acl,
+ .set_acl = fuse_set_acl,
};
static const struct inode_operations fuse_symlink_inode_operations = {
@@ -1916,10 +1831,7 @@ static const struct inode_operations fuse_symlink_inode_operations = {
.get_link = fuse_get_link,
.readlink = generic_readlink,
.getattr = fuse_getattr,
- .setxattr = fuse_setxattr,
- .getxattr = fuse_getxattr,
.listxattr = fuse_listxattr,
- .removexattr = fuse_removexattr,
};
void fuse_init_common(struct inode *inode)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 3988b43c2f5a..abc66a6237fd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2326,49 +2326,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
return retval;
}
-static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
- unsigned int nr_segs, size_t bytes, bool to_user)
-{
- struct iov_iter ii;
- int page_idx = 0;
-
- if (!bytes)
- return 0;
-
- iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);
-
- while (iov_iter_count(&ii)) {
- struct page *page = pages[page_idx++];
- size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
- void *kaddr;
-
- kaddr = kmap(page);
-
- while (todo) {
- char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
- size_t iov_len = ii.iov->iov_len - ii.iov_offset;
- size_t copy = min(todo, iov_len);
- size_t left;
-
- if (!to_user)
- left = copy_from_user(kaddr, uaddr, copy);
- else
- left = copy_to_user(uaddr, kaddr, copy);
-
- if (unlikely(left))
- return -EFAULT;
-
- iov_iter_advance(&ii, copy);
- todo -= copy;
- kaddr += copy;
- }
-
- kunmap(page);
- }
-
- return 0;
-}
-
/*
* CUSE servers compiled on 32bit broke on 64bit kernels because the
* ABI was defined to be 'struct iovec' which is different on 32bit
@@ -2520,8 +2477,9 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
struct iovec *iov_page = NULL;
struct iovec *in_iov = NULL, *out_iov = NULL;
unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
- size_t in_size, out_size, transferred;
- int err;
+ size_t in_size, out_size, transferred, c;
+ int err, i;
+ struct iov_iter ii;
#if BITS_PER_LONG == 32
inarg.flags |= FUSE_IOCTL_32BIT;
@@ -2603,10 +2561,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
req->in.args[1].size = in_size;
req->in.argpages = 1;
- err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
- false);
- if (err)
- goto out;
+ err = -EFAULT;
+ iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
+ for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) {
+ c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii);
+ if (c != PAGE_SIZE && iov_iter_count(&ii))
+ goto out;
+ }
}
req->out.numargs = 2;
@@ -2672,7 +2633,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
if (transferred > inarg.out_size)
goto out;
- err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
+ err = -EFAULT;
+ iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
+ for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) {
+ c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii);
+ if (c != PAGE_SIZE && iov_iter_count(&ii))
+ goto out;
+ }
+ err = 0;
out:
if (req)
fuse_put_request(fc, req);
@@ -2842,7 +2810,7 @@ static void fuse_do_truncate(struct file *file)
attr.ia_file = file;
attr.ia_valid |= ATTR_FILE;
- fuse_do_setattr(inode, &attr, file);
+ fuse_do_setattr(file_dentry(file), &attr, file);
}
static inline loff_t fuse_round_up(loff_t off)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index d98d8cc84def..0dfbb136e59a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -23,6 +23,7 @@
#include <linux/poll.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
+#include <linux/xattr.h>
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
@@ -36,15 +37,6 @@
/** Number of dentries for each connection in the control filesystem */
#define FUSE_CTL_NUM_DENTRIES 5
-/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
- module will check permissions based on the file mode. Otherwise no
- permission checking is done in the kernel */
-#define FUSE_DEFAULT_PERMISSIONS (1 << 0)
-
-/** If the FUSE_ALLOW_OTHER flag is given, then not only the user
- doing the mount will be allowed to access the filesystem */
-#define FUSE_ALLOW_OTHER (1 << 1)
-
/** Number of page pointers embedded in fuse_req */
#define FUSE_REQ_INLINE_PAGES 1
@@ -469,9 +461,6 @@ struct fuse_conn {
/** The group id for this mount */
kgid_t group_id;
- /** The fuse mount flags for this mount */
- unsigned flags;
-
/** Maximum read size */
unsigned max_read;
@@ -547,6 +536,9 @@ struct fuse_conn {
/** allow parallel lookups and readdir (default is serialized) */
unsigned parallel_dirops:1;
+ /** handle fs handles killing suid/sgid/cap on write/chown/trunc */
+ unsigned handle_killpriv:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
@@ -624,6 +616,15 @@ struct fuse_conn {
/** Is lseek not implemented by fs? */
unsigned no_lseek:1;
+ /** Does the filesystem support posix acls? */
+ unsigned posix_acl:1;
+
+ /** Check permissions based on the file mode or not? */
+ unsigned default_permissions:1;
+
+ /** Allow other than the mounter user to access the filesystem ? */
+ unsigned allow_other:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -902,6 +903,8 @@ int fuse_allow_current_process(struct fuse_conn *fc);
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
+void fuse_update_ctime(struct inode *inode);
+
int fuse_update_attributes(struct inode *inode, struct kstat *stat,
struct file *file, bool *refreshed);
@@ -958,7 +961,7 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos);
int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
-int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
struct file *file);
void fuse_set_initialized(struct fuse_conn *fc);
@@ -966,4 +969,17 @@ void fuse_set_initialized(struct fuse_conn *fc);
void fuse_unlock_inode(struct inode *inode);
void fuse_lock_inode(struct inode *inode);
+int fuse_setxattr(struct inode *inode, const char *name, const void *value,
+ size_t size, int flags);
+ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
+ size_t size);
+ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
+int fuse_removexattr(struct inode *inode, const char *name);
+extern const struct xattr_handler *fuse_xattr_handlers[];
+extern const struct xattr_handler *fuse_acl_xattr_handlers[];
+
+struct posix_acl;
+struct posix_acl *fuse_get_acl(struct inode *inode, int type);
+int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4e05b51120f4..17141099f2e7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -20,6 +20,7 @@
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/exportfs.h>
+#include <linux/posix_acl.h>
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -66,7 +67,8 @@ struct fuse_mount_data {
unsigned rootmode_present:1;
unsigned user_id_present:1;
unsigned group_id_present:1;
- unsigned flags;
+ unsigned default_permissions:1;
+ unsigned allow_other:1;
unsigned max_read;
unsigned blksize;
};
@@ -192,7 +194,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
* check in may_delete().
*/
fi->orig_i_mode = inode->i_mode;
- if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+ if (!fc->default_permissions)
inode->i_mode &= ~S_ISVTX;
fi->orig_ino = attr->ino;
@@ -340,6 +342,7 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
return -ENOENT;
fuse_invalidate_attr(inode);
+ forget_all_cached_acls(inode);
if (offset >= 0) {
pg_start = offset >> PAGE_SHIFT;
if (len <= 0)
@@ -532,11 +535,11 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
break;
case OPT_DEFAULT_PERMISSIONS:
- d->flags |= FUSE_DEFAULT_PERMISSIONS;
+ d->default_permissions = 1;
break;
case OPT_ALLOW_OTHER:
- d->flags |= FUSE_ALLOW_OTHER;
+ d->allow_other = 1;
break;
case OPT_MAX_READ:
@@ -570,9 +573,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
- if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
+ if (fc->default_permissions)
seq_puts(m, ",default_permissions");
- if (fc->flags & FUSE_ALLOW_OTHER)
+ if (fc->allow_other)
seq_puts(m, ",allow_other");
if (fc->max_read != ~0)
seq_printf(m, ",max_read=%u", fc->max_read);
@@ -910,8 +913,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->writeback_cache = 1;
if (arg->flags & FUSE_PARALLEL_DIROPS)
fc->parallel_dirops = 1;
+ if (arg->flags & FUSE_HANDLE_KILLPRIV)
+ fc->handle_killpriv = 1;
if (arg->time_gran && arg->time_gran <= 1000000000)
fc->sb->s_time_gran = arg->time_gran;
+ if ((arg->flags & FUSE_POSIX_ACL)) {
+ fc->default_permissions = 1;
+ fc->posix_acl = 1;
+ fc->sb->s_xattr = fuse_acl_xattr_handlers;
+ }
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -941,7 +951,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
- FUSE_PARALLEL_DIROPS;
+ FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -1071,6 +1081,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_magic = FUSE_SUPER_MAGIC;
sb->s_op = &fuse_super_operations;
+ sb->s_xattr = fuse_xattr_handlers;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_time_gran = 1;
sb->s_export_op = &fuse_export_operations;
@@ -1109,7 +1120,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
fc->dont_mask = 1;
sb->s_flags |= MS_POSIXACL;
- fc->flags = d.flags;
+ fc->default_permissions = d.default_permissions;
+ fc->allow_other = d.allow_other;
fc->user_id = d.user_id;
fc->group_id = d.group_id;
fc->max_read = max_t(unsigned, 4096, d.max_read);
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
new file mode 100644
index 000000000000..3caac46b08b0
--- /dev/null
+++ b/fs/fuse/xattr.c
@@ -0,0 +1,211 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2016 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU GPL.
+ * See the file COPYING.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+
+int fuse_setxattr(struct inode *inode, const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_setxattr_in inarg;
+ int err;
+
+ if (fc->no_setxattr)
+ return -EOPNOTSUPP;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.size = size;
+ inarg.flags = flags;
+ args.in.h.opcode = FUSE_SETXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 3;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = strlen(name) + 1;
+ args.in.args[1].value = name;
+ args.in.args[2].size = size;
+ args.in.args[2].value = value;
+ err = fuse_simple_request(fc, &args);
+ if (err == -ENOSYS) {
+ fc->no_setxattr = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (!err) {
+ fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
+ }
+ return err;
+}
+
+ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
+ size_t size)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_getxattr_in inarg;
+ struct fuse_getxattr_out outarg;
+ ssize_t ret;
+
+ if (fc->no_getxattr)
+ return -EOPNOTSUPP;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.size = size;
+ args.in.h.opcode = FUSE_GETXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 2;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = strlen(name) + 1;
+ args.in.args[1].value = name;
+ /* This is really two different operations rolled into one */
+ args.out.numargs = 1;
+ if (size) {
+ args.out.argvar = 1;
+ args.out.args[0].size = size;
+ args.out.args[0].value = value;
+ } else {
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ }
+ ret = fuse_simple_request(fc, &args);
+ if (!ret && !size)
+ ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX);
+ if (ret == -ENOSYS) {
+ fc->no_getxattr = 1;
+ ret = -EOPNOTSUPP;
+ }
+ return ret;
+}
+
+static int fuse_verify_xattr_list(char *list, size_t size)
+{
+ size_t origsize = size;
+
+ while (size) {
+ size_t thislen = strnlen(list, size);
+
+ if (!thislen || thislen == size)
+ return -EIO;
+
+ size -= thislen + 1;
+ list += thislen + 1;
+ }
+
+ return origsize;
+}
+
+ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_getxattr_in inarg;
+ struct fuse_getxattr_out outarg;
+ ssize_t ret;
+
+ if (!fuse_allow_current_process(fc))
+ return -EACCES;
+
+ if (fc->no_listxattr)
+ return -EOPNOTSUPP;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.size = size;
+ args.in.h.opcode = FUSE_LISTXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ /* This is really two different operations rolled into one */
+ args.out.numargs = 1;
+ if (size) {
+ args.out.argvar = 1;
+ args.out.args[0].size = size;
+ args.out.args[0].value = list;
+ } else {
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ }
+ ret = fuse_simple_request(fc, &args);
+ if (!ret && !size)
+ ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX);
+ if (ret > 0 && size)
+ ret = fuse_verify_xattr_list(list, ret);
+ if (ret == -ENOSYS) {
+ fc->no_listxattr = 1;
+ ret = -EOPNOTSUPP;
+ }
+ return ret;
+}
+
+int fuse_removexattr(struct inode *inode, const char *name)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ int err;
+
+ if (fc->no_removexattr)
+ return -EOPNOTSUPP;
+
+ args.in.h.opcode = FUSE_REMOVEXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = strlen(name) + 1;
+ args.in.args[0].value = name;
+ err = fuse_simple_request(fc, &args);
+ if (err == -ENOSYS) {
+ fc->no_removexattr = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (!err) {
+ fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
+ }
+ return err;
+}
+
+static int fuse_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
+{
+ return fuse_getxattr(inode, name, value, size);
+}
+
+static int fuse_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value, size_t size,
+ int flags)
+{
+ if (!value)
+ return fuse_removexattr(inode, name);
+
+ return fuse_setxattr(inode, name, value, size, flags);
+}
+
+static const struct xattr_handler fuse_xattr_handler = {
+ .prefix = "",
+ .get = fuse_xattr_get,
+ .set = fuse_xattr_set,
+};
+
+const struct xattr_handler *fuse_xattr_handlers[] = {
+ &fuse_xattr_handler,
+ NULL
+};
+
+const struct xattr_handler *fuse_acl_xattr_handlers[] = {
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+ &fuse_xattr_handler,
+ NULL
+};
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 363ba9e9d8d0..2524807ee070 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -92,17 +92,11 @@ int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (type == ACL_TYPE_ACCESS) {
umode_t mode = inode->i_mode;
- error = posix_acl_equiv_mode(acl, &mode);
- if (error < 0)
+ error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (error)
return error;
-
- if (error == 0)
- acl = NULL;
-
- if (mode != inode->i_mode) {
- inode->i_mode = mode;
+ if (mode != inode->i_mode)
mark_inode_dirty(inode);
- }
}
if (acl) {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 82df36886938..5a6f52ea2722 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -187,7 +187,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
ClearPageChecked(page);
if (!page_has_buffers(page)) {
create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ BIT(BH_Dirty)|BIT(BH_Uptodate));
}
gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
}
@@ -1147,6 +1147,16 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
if (!page_has_buffers(page))
return 0;
+ /*
+ * From xfs_vm_releasepage: mm accommodates an old ext3 case where
+ * clean pages might not have had the dirty bit cleared. Thus, it can
+ * send actual dirty pages to ->releasepage() via shrink_active_list().
+ *
+ * As a workaround, we skip pages that contain dirty buffers below.
+ * Once ->releasepage isn't called on dirty pages anymore, we can warn
+ * on dirty buffers like we used to here again.
+ */
+
gfs2_log_lock(sdp);
spin_lock(&sdp->sd_ail_lock);
head = bh = page_buffers(page);
@@ -1156,8 +1166,8 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
bd = bh->b_private;
if (bd && bd->bd_tr)
goto cannot_release;
- if (buffer_pinned(bh) || buffer_dirty(bh))
- goto not_possible;
+ if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh)))
+ goto cannot_release;
bh = bh->b_this_page;
} while(bh != head);
spin_unlock(&sdp->sd_ail_lock);
@@ -1180,9 +1190,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
return try_to_free_buffers(page);
-not_possible: /* Should never happen */
- WARN_ON(buffer_dirty(bh));
- WARN_ON(buffer_pinned(bh));
cannot_release:
spin_unlock(&sdp->sd_ail_lock);
gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6e2bec1cd289..fc5da4cbe88c 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -82,8 +82,8 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
}
if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << inode->i_blkbits,
- (1 << BH_Uptodate));
+ create_empty_buffers(page, BIT(inode->i_blkbits),
+ BIT(BH_Uptodate));
bh = page_buffers(page);
@@ -690,7 +690,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
BUG_ON(!dblock);
BUG_ON(!new);
- bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5));
+ bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
ret = gfs2_block_map(inode, lblock, &bh, create);
*extlen = bh.b_size >> inode->i_blkbits;
*dblock = bh.b_blocknr;
@@ -836,7 +836,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
ip->i_inode.i_gid);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_dinode_out(ip, dibh->b_data);
@@ -1063,7 +1063,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
}
i_size_write(inode, newsize);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_dinode_out(ip, dibh->b_data);
if (journaled)
@@ -1142,7 +1142,7 @@ static int trunc_end(struct gfs2_inode *ip)
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
gfs2_ordered_del_inode(ip);
}
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
gfs2_trans_add_meta(ip->i_gl, dibh);
@@ -1252,7 +1252,7 @@ static int do_grow(struct inode *inode, u64 size)
goto do_end_trans;
i_size_write(inode, size);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index fcb59b23f1e3..3cdde5f5d399 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -135,7 +135,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
if (ip->i_inode.i_size < offset + size)
i_size_write(&ip->i_inode, offset + size);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -233,7 +233,7 @@ out:
if (ip->i_inode.i_size < offset + copied)
i_size_write(&ip->i_inode, offset + copied);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
@@ -351,7 +351,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
if (hc)
return hc;
- hsize = 1 << ip->i_depth;
+ hsize = BIT(ip->i_depth);
hsize *= sizeof(__be64);
if (hsize != i_size_read(&ip->i_inode)) {
gfs2_consist_inode(ip);
@@ -819,8 +819,8 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
if (ip->i_diskflags & GFS2_DIF_EXHASH) {
struct gfs2_leaf *leaf;
- unsigned hsize = 1 << ip->i_depth;
- unsigned index;
+ unsigned int hsize = BIT(ip->i_depth);
+ unsigned int index;
u64 ln;
if (hsize * sizeof(u64) != i_size_read(inode)) {
gfs2_consist_inode(ip);
@@ -872,7 +872,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
struct gfs2_leaf *leaf;
struct gfs2_dirent *dent;
struct qstr name = { .name = "" };
- struct timespec tv = CURRENT_TIME;
+ struct timespec tv = current_time(inode);
error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
if (error)
@@ -932,7 +932,7 @@ static int dir_make_exhash(struct inode *inode)
return -ENOSPC;
bn = bh->b_blocknr;
- gfs2_assert(sdp, dip->i_entries < (1 << 16));
+ gfs2_assert(sdp, dip->i_entries < BIT(16));
leaf->lf_entries = cpu_to_be16(dip->i_entries);
/* Copy dirents */
@@ -1041,7 +1041,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
bn = nbh->b_blocknr;
/* Compute the start and len of leaf pointers in the hash table. */
- len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
+ len = BIT(dip->i_depth - be16_to_cpu(oleaf->lf_depth));
half_len = len >> 1;
if (!half_len) {
pr_warn("i_depth %u lf_depth %u index %u\n",
@@ -1163,7 +1163,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
int x;
int error = 0;
- hsize = 1 << dip->i_depth;
+ hsize = BIT(dip->i_depth);
hsize_bytes = hsize * sizeof(__be64);
hc = gfs2_dir_get_hash_table(dip);
@@ -1539,7 +1539,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx,
int error = 0;
unsigned depth = 0;
- hsize = 1 << dip->i_depth;
+ hsize = BIT(dip->i_depth);
hash = gfs2_dir_offset2hash(ctx->pos);
index = hash >> (32 - dip->i_depth);
@@ -1558,7 +1558,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx,
if (error)
break;
- len = 1 << (dip->i_depth - depth);
+ len = BIT(dip->i_depth - depth);
index = (index & ~(len - 1)) + len;
}
@@ -1816,7 +1816,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
gfs2_inum_out(nip, dent);
dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip));
- tv = CURRENT_TIME;
+ tv = current_time(&ip->i_inode);
if (ip->i_diskflags & GFS2_DIF_EXHASH) {
leaf = (struct gfs2_leaf *)bh->b_data;
be16_add_cpu(&leaf->lf_entries, 1);
@@ -1878,7 +1878,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
const struct qstr *name = &dentry->d_name;
struct gfs2_dirent *dent, *prev = NULL;
struct buffer_head *bh;
- struct timespec tv = CURRENT_TIME;
+ struct timespec tv = current_time(&dip->i_inode);
/* Returns _either_ the entry (if its first in block) or the
previous entry otherwise */
@@ -1960,7 +1960,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
gfs2_trans_add_meta(dip->i_gl, bh);
}
- dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
+ dip->i_inode.i_mtime = dip->i_inode.i_ctime = current_time(&dip->i_inode);
gfs2_dinode_out(dip, bh->b_data);
brelse(bh);
return 0;
@@ -2113,7 +2113,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
u64 leaf_no;
int error = 0, last;
- hsize = 1 << dip->i_depth;
+ hsize = BIT(dip->i_depth);
lp = gfs2_dir_get_hash_table(dip);
if (IS_ERR(lp))
@@ -2126,7 +2126,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
if (error)
goto out;
leaf = (struct gfs2_leaf *)bh->b_data;
- len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
+ len = BIT(dip->i_depth - be16_to_cpu(leaf->lf_depth));
next_index = (index & ~(len - 1)) + len;
last = ((next_index >= hsize) ? 1 : 0);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 320e65e61938..e23ff70b3435 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -395,9 +395,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
- /* Update file times before taking page lock */
- file_update_time(vma->vm_file);
-
ret = gfs2_rsqa_alloc(ip);
if (ret)
goto out;
@@ -409,6 +406,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret)
goto out_uninit;
+ /* Update file times before taking page lock */
+ file_update_time(vma->vm_file);
+
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
set_bit(GIF_SW_PAGED, &ip->i_flags);
@@ -954,30 +954,6 @@ out_uninit:
return ret;
}
-static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
-{
- struct inode *inode = in->f_mapping->host;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- int ret;
-
- inode_lock(inode);
-
- ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
- if (ret) {
- inode_unlock(inode);
- return ret;
- }
-
- gfs2_glock_dq_uninit(&gh);
- inode_unlock(inode);
-
- return generic_file_splice_read(in, ppos, pipe, len, flags);
-}
-
-
static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos,
size_t len, unsigned int flags)
@@ -1140,7 +1116,7 @@ const struct file_operations gfs2_file_fops = {
.fsync = gfs2_fsync,
.lock = gfs2_lock,
.flock = gfs2_flock,
- .splice_read = gfs2_file_splice_read,
+ .splice_read = generic_file_splice_read,
.splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
@@ -1168,7 +1144,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
- .splice_read = gfs2_file_splice_read,
+ .splice_read = generic_file_splice_read,
.splice_write = gfs2_file_splice_write,
.setlease = generic_setlease,
.fallocate = gfs2_fallocate,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3a90b2b5b9bb..14cbf60167a7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -69,7 +69,7 @@ static atomic_t lru_count = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(lru_lock);
#define GFS2_GL_HASH_SHIFT 15
-#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
+#define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT)
static struct rhashtable_params ht_parms = {
.nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
@@ -1781,7 +1781,13 @@ int __init gfs2_glock_init(void)
return -ENOMEM;
}
- register_shrinker(&glock_shrinker);
+ ret = register_shrinker(&glock_shrinker);
+ if (ret) {
+ destroy_workqueue(gfs2_delete_workqueue);
+ destroy_workqueue(glock_workqueue);
+ rhashtable_destroy(&gl_hash_table);
+ return ret;
+ }
return 0;
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e4da0ecd3285..fe3f84995c48 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -187,6 +187,10 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
}
gfs2_set_iop(inode);
+
+ inode->i_atime.tv_sec = 0;
+ inode->i_atime.tv_nsec = 0;
+
unlock_new_inode(inode);
}
@@ -652,7 +656,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
inode->i_rdev = dev;
inode->i_size = size;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
gfs2_set_inode_blocks(inode, 1);
munge_mode_uid_gid(dip, inode);
check_and_update_goal(dip);
@@ -979,7 +983,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
gfs2_trans_add_meta(ip->i_gl, dibh);
inc_nlink(&ip->i_inode);
- ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_ctime = current_time(&ip->i_inode);
ihold(inode);
d_instantiate(dentry, inode);
mark_inode_dirty(inode);
@@ -1063,7 +1067,7 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip,
return error;
ip->i_entries = 0;
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
if (S_ISDIR(inode->i_mode))
clear_nlink(inode);
else
@@ -1326,7 +1330,7 @@ static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
return error;
- ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1932,7 +1936,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
goto out;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
goto out;
@@ -2036,10 +2040,7 @@ const struct inode_operations gfs2_file_iops = {
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = gfs2_listxattr,
- .removexattr = generic_removexattr,
.fiemap = gfs2_fiemap,
.get_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
@@ -2054,14 +2055,11 @@ const struct inode_operations gfs2_dir_iops = {
.mkdir = gfs2_mkdir,
.rmdir = gfs2_unlink,
.mknod = gfs2_mknod,
- .rename2 = gfs2_rename2,
+ .rename = gfs2_rename2,
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = gfs2_listxattr,
- .removexattr = generic_removexattr,
.fiemap = gfs2_fiemap,
.get_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
@@ -2074,10 +2072,7 @@ const struct inode_operations gfs2_symlink_iops = {
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = gfs2_listxattr,
- .removexattr = generic_removexattr,
.fiemap = gfs2_fiemap,
};
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 7710dfd3af35..aace8ce34a18 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -85,7 +85,7 @@ static inline int gfs2_check_internal_file_size(struct inode *inode,
u64 size = i_size_read(inode);
if (size < minsize || size > maxsize)
goto err;
- if (size & ((1 << inode->i_blkbits) - 1))
+ if (size & (BIT(inode->i_blkbits) - 1))
goto err;
return 0;
err:
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 74fd0139e6c2..67d1fc4668f7 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -145,7 +145,9 @@ static int __init init_gfs2_fs(void)
if (!gfs2_qadata_cachep)
goto fail;
- register_shrinker(&gfs2_qd_shrinker);
+ error = register_shrinker(&gfs2_qd_shrinker);
+ if (error)
+ goto fail;
error = register_filesystem(&gfs2_fs_type);
if (error)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 950b8be68e41..373639a59782 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -216,23 +216,26 @@ static void gfs2_meta_read_endio(struct bio *bio)
static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
int num)
{
- struct buffer_head *bh = bhs[0];
- struct bio *bio;
- int i;
-
- if (!num)
- return;
-
- bio = bio_alloc(GFP_NOIO, num);
- bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio->bi_bdev = bh->b_bdev;
- for (i = 0; i < num; i++) {
- bh = bhs[i];
- bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+ while (num > 0) {
+ struct buffer_head *bh = *bhs;
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_NOIO, num);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_bdev = bh->b_bdev;
+ while (num > 0) {
+ bh = *bhs;
+ if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
+ BUG_ON(bio->bi_iter.bi_size == 0);
+ break;
+ }
+ bhs++;
+ num--;
+ }
+ bio->bi_end_io = gfs2_meta_read_endio;
+ bio_set_op_attrs(bio, op, op_flags);
+ submit_bio(bio);
}
- bio->bi_end_io = gfs2_meta_read_endio;
- bio_set_op_attrs(bio, op, op_flags);
- submit_bio(bio);
}
/**
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ef1e1822977f..ff72ac6439c8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -58,7 +58,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
gt->gt_quota_scale_num = 1;
gt->gt_quota_scale_den = 1;
gt->gt_new_files_jdata = 0;
- gt->gt_max_readahead = 1 << 18;
+ gt->gt_max_readahead = BIT(18);
gt->gt_complain_secs = 10;
}
@@ -284,7 +284,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
GFS2_BASIC_BLOCK_SHIFT;
- sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+ sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
sizeof(struct gfs2_dinode)) / sizeof(u64);
sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
@@ -302,7 +302,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
/* Compute maximum reservation required to add a entry to a directory */
- hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
+ hash_blocks = DIV_ROUND_UP(sizeof(u64) * BIT(GFS2_DIR_MAX_DEPTH),
sdp->sd_jbsize);
ind_blocks = 0;
@@ -1089,7 +1089,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
GFS2_BASIC_BLOCK_SHIFT;
- sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+ sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 77930ca25303..c2ca9566b764 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -75,7 +75,7 @@
#include "util.h"
#define GFS2_QD_HASH_SHIFT 12
-#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT)
+#define GFS2_QD_HASH_SIZE BIT(GFS2_QD_HASH_SHIFT)
#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1)
/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
@@ -384,7 +384,7 @@ static int bh_get(struct gfs2_quota_data *qd)
block = qd->qd_slot / sdp->sd_qc_per_block;
offset = qd->qd_slot % sdp->sd_qc_per_block;
- bh_map.b_size = 1 << ip->i_inode.i_blkbits;
+ bh_map.b_size = BIT(ip->i_inode.i_blkbits);
error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
if (error)
goto fail;
@@ -854,7 +854,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
size = loc + sizeof(struct gfs2_quota);
if (size > inode->i_size)
i_size_write(inode, size);
- inode->i_mtime = inode->i_atime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = current_time(inode);
mark_inode_dirty(inode);
set_bit(QDF_REFRESH, &qd->qd_flags);
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 3a7e60bb39f8..e3ee387a6dfe 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -359,7 +359,7 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
u64 size = i_size_read(jd->jd_inode);
- if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
+ if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, BIT(30)))
return -EIO;
jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 3a2853504084..a4a577088d19 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -309,7 +309,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
- ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -775,7 +775,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
- ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -910,7 +910,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
goto out;
- ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1133,7 +1133,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
- ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_ctime = current_time(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index d9a86919fdf6..0933600e11c8 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -13,9 +13,13 @@
#include "hfs_fs.h"
#include "btree.h"
-int hfs_setxattr(struct dentry *unused, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
+enum hfs_xattr_type {
+ HFS_TYPE,
+ HFS_CREATOR,
+};
+
+static int __hfs_setxattr(struct inode *inode, enum hfs_xattr_type type,
+ const void *value, size_t size, int flags)
{
struct hfs_find_data fd;
hfs_cat_rec rec;
@@ -36,18 +40,22 @@ int hfs_setxattr(struct dentry *unused, struct inode *inode,
sizeof(struct hfs_cat_file));
file = &rec.file;
- if (!strcmp(name, "hfs.type")) {
+ switch (type) {
+ case HFS_TYPE:
if (size == 4)
memcpy(&file->UsrWds.fdType, value, 4);
else
res = -ERANGE;
- } else if (!strcmp(name, "hfs.creator")) {
+ break;
+
+ case HFS_CREATOR:
if (size == 4)
memcpy(&file->UsrWds.fdCreator, value, 4);
else
res = -ERANGE;
- } else
- res = -EOPNOTSUPP;
+ break;
+ }
+
if (!res)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
@@ -56,8 +64,8 @@ out:
return res;
}
-ssize_t hfs_getxattr(struct dentry *unused, struct inode *inode,
- const char *name, void *value, size_t size)
+static ssize_t __hfs_getxattr(struct inode *inode, enum hfs_xattr_type type,
+ void *value, size_t size)
{
struct hfs_find_data fd;
hfs_cat_rec rec;
@@ -80,41 +88,64 @@ ssize_t hfs_getxattr(struct dentry *unused, struct inode *inode,
}
file = &rec.file;
- if (!strcmp(name, "hfs.type")) {
+ switch (type) {
+ case HFS_TYPE:
if (size >= 4) {
memcpy(value, &file->UsrWds.fdType, 4);
res = 4;
} else
res = size ? -ERANGE : 4;
- } else if (!strcmp(name, "hfs.creator")) {
+ break;
+
+ case HFS_CREATOR:
if (size >= 4) {
memcpy(value, &file->UsrWds.fdCreator, 4);
res = 4;
} else
res = size ? -ERANGE : 4;
- } else
- res = -ENODATA;
+ break;
+ }
+
out:
if (size)
hfs_find_exit(&fd);
return res;
}
-#define HFS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type"))
-
-ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+static int hfs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *value, size_t size)
{
- struct inode *inode = d_inode(dentry);
+ return __hfs_getxattr(inode, handler->flags, value, size);
+}
- if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode))
+static int hfs_xattr_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value, size_t size,
+ int flags)
+{
+ if (!value)
return -EOPNOTSUPP;
- if (!buffer || !size)
- return HFS_ATTRLIST_SIZE;
- if (size < HFS_ATTRLIST_SIZE)
- return -ERANGE;
- strcpy(buffer, "hfs.type");
- strcpy(buffer + sizeof("hfs.type"), "hfs.creator");
-
- return HFS_ATTRLIST_SIZE;
+ return __hfs_setxattr(inode, handler->flags, value, size, flags);
}
+
+static const struct xattr_handler hfs_creator_handler = {
+ .name = "hfs.creator",
+ .flags = HFS_CREATOR,
+ .get = hfs_xattr_get,
+ .set = hfs_xattr_set,
+};
+
+static const struct xattr_handler hfs_type_handler = {
+ .name = "hfs.type",
+ .flags = HFS_TYPE,
+ .get = hfs_xattr_get,
+ .set = hfs_xattr_set,
+};
+
+const struct xattr_handler *hfs_xattr_handlers[] = {
+ &hfs_creator_handler,
+ &hfs_type_handler,
+ NULL
+};
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 8f4afd3f5108..8a66405b0f8b 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -125,7 +125,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
goto err1;
dir->i_size++;
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
hfs_find_exit(&fd);
return 0;
@@ -261,7 +261,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
}
dir->i_size--;
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
res = 0;
out:
@@ -321,7 +321,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
dst_dir->i_size++;
- dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
+ dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir);
mark_inode_dirty(dst_dir);
/* finally remove the old entry */
@@ -333,7 +333,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
src_dir->i_size--;
- src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
+ src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir);
mark_inode_dirty(src_dir);
type = entry.type;
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 163190ecc0d2..5de5c48b418d 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -268,7 +268,7 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
if (res)
return res;
clear_nlink(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
hfs_delete_inode(inode);
mark_inode_dirty(inode);
return 0;
@@ -286,10 +286,14 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
* XXX: how do you handle must_be dir?
*/
static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
int res;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
/* Unlink destination if it already exists */
if (d_really_is_positive(new_dentry)) {
res = hfs_remove(new_dir, new_dentry);
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 16f5172ee40d..4cdec5a19347 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -212,11 +212,7 @@ extern void hfs_evict_inode(struct inode *);
extern void hfs_delete_inode(struct inode *);
/* attr.c */
-extern int hfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
- const void *value, size_t size, int flags);
-extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode,
- const char *name, void *value, size_t size);
-extern ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern const struct xattr_handler *hfs_xattr_handlers[];
/* mdb.c */
extern int hfs_mdb_get(struct super_block *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index c6a32415735b..f776acf2378a 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -15,6 +15,7 @@
#include <linux/mpage.h>
#include <linux/sched.h>
#include <linux/uio.h>
+#include <linux/xattr.h>
#include "hfs_fs.h"
#include "btree.h"
@@ -193,7 +194,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
HFS_I(inode)->fs_blocks = 0;
@@ -605,7 +606,7 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
int error;
- error = inode_change_ok(inode, attr); /* basic permission checks */
+ error = setattr_prepare(dentry, attr); /* basic permission checks */
if (error)
return error;
@@ -687,7 +688,5 @@ static const struct file_operations hfs_file_operations = {
static const struct inode_operations hfs_file_inode_operations = {
.lookup = hfs_file_lookup,
.setattr = hfs_inode_setattr,
- .setxattr = hfs_setxattr,
- .getxattr = hfs_getxattr,
- .listxattr = hfs_listxattr,
+ .listxattr = generic_listxattr,
};
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 1ca95c232bb5..bf6304a350a6 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -406,6 +406,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_op = &hfs_super_operations;
+ sb->s_xattr = hfs_xattr_handlers;
sb->s_flags |= MS_NODIRATIME;
mutex_init(&sbi->bitmap_lock);
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 142534d3c2d5..a5e00f7a4c14 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -303,7 +303,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
dir->i_size++;
if (S_ISDIR(inode->i_mode))
hfsplus_subfolders_inc(dir);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
hfs_find_exit(&fd);
@@ -400,7 +400,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(dir);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) {
@@ -469,7 +469,7 @@ int hfsplus_rename_cat(u32 cnid,
dst_dir->i_size++;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_inc(dst_dir);
- dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
+ dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir);
/* finally remove the old entry */
err = hfsplus_cat_build_key(sb, src_fd.search_key,
@@ -486,7 +486,7 @@ int hfsplus_rename_cat(u32 cnid,
src_dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(src_dir);
- src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
+ src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir);
/* remove old thread entry */
hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 42e128661dc1..31d5e3f1fe17 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -347,7 +347,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
inc_nlink(inode);
hfsplus_instantiate(dst_dentry, inode, cnid);
ihold(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
sbi->file_count++;
hfsplus_mark_mdb_dirty(dst_dir->i_sb);
@@ -406,7 +406,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
hfsplus_delete_inode(inode);
} else
sbi->file_count--;
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
out:
mutex_unlock(&sbi->vh_mutex);
@@ -427,7 +427,7 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
if (res)
goto out;
clear_nlink(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
hfsplus_delete_inode(inode);
mark_inode_dirty(inode);
out:
@@ -530,10 +530,14 @@ static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
int res;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
/* Unlink destination if it already exists */
if (d_really_is_positive(new_dentry)) {
if (d_is_dir(new_dentry))
@@ -562,10 +566,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
.symlink = hfsplus_symlink,
.mknod = hfsplus_mknod,
.rename = hfsplus_rename,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = hfsplus_listxattr,
- .removexattr = generic_removexattr,
#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
.get_acl = hfsplus_get_posix_acl,
.set_acl = hfsplus_set_posix_acl,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 19462d773fe2..2e796f8302ff 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -245,7 +245,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *inode = d_inode(dentry);
int error;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
@@ -333,10 +333,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
static const struct inode_operations hfsplus_file_inode_operations = {
.setattr = hfsplus_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = hfsplus_listxattr,
- .removexattr = generic_removexattr,
#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
.get_acl = hfsplus_get_posix_acl,
.set_acl = hfsplus_set_posix_acl,
@@ -369,7 +366,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
hip = HFSPLUS_I(inode);
INIT_LIST_HEAD(&hip->open_dir_list);
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 32a49e292b6a..99627f8a0a18 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -122,7 +122,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
else
hip->userflags &= ~HFSPLUS_FLG_NODUMP;
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
out_unlock_inode:
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index ab7ea2506b4d..9b92058a1240 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -65,8 +65,8 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
case ACL_TYPE_ACCESS:
xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
- err = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (err < 0)
+ err = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (err)
return err;
}
err = 0;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 90e46cd752fe..23e15ea53e45 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -812,7 +812,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
int fd = HOSTFS_I(inode)->fd;
- err = inode_change_ok(inode, attr);
+ err = setattr_prepare(dentry, attr);
if (err)
return err;
@@ -885,7 +885,7 @@ static const struct inode_operations hostfs_dir_iops = {
.mkdir = hostfs_mkdir,
.rmdir = hostfs_rmdir,
.mknod = hostfs_mknod,
- .rename2 = hostfs_rename2,
+ .rename = hostfs_rename2,
.permission = hostfs_permission,
.setattr = hostfs_setattr,
};
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index d3bcdd975700..b3be1b5a62e2 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -189,6 +189,11 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, hpfs_get_block);
}
+static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len)
+{
+ return generic_block_fiemap(inode, fieinfo, start, len, hpfs_get_block);
+}
+
const struct address_space_operations hpfs_aops = {
.readpage = hpfs_readpage,
.writepage = hpfs_writepage,
@@ -214,4 +219,5 @@ const struct file_operations hpfs_file_ops =
const struct inode_operations hpfs_file_iops =
{
.setattr = hpfs_setattr,
+ .fiemap = hpfs_fiemap,
};
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1f3c6d76200b..b9c724ed1e7e 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -273,7 +273,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
goto out_unlock;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
goto out_unlock;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index bb8d67e2740a..f30c14414518 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -507,7 +507,8 @@ const struct address_space_operations hpfs_symlink_aops = {
};
static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
const unsigned char *old_name = old_dentry->d_name.name;
unsigned old_len = old_dentry->d_name.len;
@@ -524,6 +525,9 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct fnode *fnode;
int err;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
if ((err = hpfs_chk_name(new_name, &new_len))) return err;
err = 0;
hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4ea71eba40a5..4fb7b10f3a05 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -416,7 +416,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
- bool rsv_on_error;
u32 hash;
/*
@@ -458,18 +457,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
* cache (remove_huge_page) BEFORE removing the
* region/reserve map (hugetlb_unreserve_pages). In
* rare out of memory conditions, removal of the
- * region/reserve map could fail. Before free'ing
- * the page, note PagePrivate which is used in case
- * of error.
+ * region/reserve map could fail. Correspondingly,
+ * the subpool and global reserve usage count can need
+ * to be adjusted.
*/
- rsv_on_error = !PagePrivate(page);
+ VM_BUG_ON(PagePrivate(page));
remove_huge_page(page);
freed++;
if (!truncate_op) {
if (unlikely(hugetlb_unreserve_pages(inode,
next, next + 1, 1)))
- hugetlb_fix_reserve_counts(inode,
- rsv_on_error);
+ hugetlb_fix_reserve_counts(inode);
}
unlock_page(page);
@@ -657,7 +655,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
out:
inode_unlock(inode);
return error;
@@ -672,7 +670,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
BUG_ON(!inode);
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
@@ -702,7 +700,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
inode->i_mode = S_IFDIR | config->mode;
inode->i_uid = config->uid;
inode->i_gid = config->gid;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
info = HUGETLBFS_I(inode);
mpol_shared_policy_init(&info->policy, NULL);
inode->i_op = &hugetlbfs_dir_inode_operations;
@@ -741,7 +739,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_mapping->private_data = resv_map;
info = HUGETLBFS_I(inode);
/*
@@ -790,7 +788,7 @@ static int hugetlbfs_mknod(struct inode *dir,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (inode) {
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
@@ -827,7 +825,7 @@ static int hugetlbfs_symlink(struct inode *dir,
} else
iput(inode);
}
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
return error;
}
diff --git a/fs/inode.c b/fs/inode.c
index 7e3ef3af3db9..88110fd0b282 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -140,6 +140,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_fop = &no_open_fops;
inode->__i_nlink = 1;
inode->i_opflags = 0;
+ if (sb->s_xattr)
+ inode->i_opflags |= IOP_XATTR;
i_uid_write(inode, 0);
i_gid_write(inode, 0);
atomic_set(&inode->i_writecount, 0);
@@ -1021,13 +1023,17 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
{
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode;
-
+again:
spin_lock(&inode_hash_lock);
inode = find_inode(sb, head, test, data);
spin_unlock(&inode_hash_lock);
if (inode) {
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
return inode;
}
@@ -1064,6 +1070,10 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
destroy_inode(inode);
inode = old;
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
}
return inode;
@@ -1091,12 +1101,16 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
-
+again:
spin_lock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino);
spin_unlock(&inode_hash_lock);
if (inode) {
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
return inode;
}
@@ -1131,6 +1145,10 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
destroy_inode(inode);
inode = old;
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
}
return inode;
}
@@ -1266,10 +1284,16 @@ EXPORT_SYMBOL(ilookup5_nowait);
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
- struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
-
- if (inode)
+ struct inode *inode;
+again:
+ inode = ilookup5_nowait(sb, hashval, test, data);
+ if (inode) {
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
+ }
return inode;
}
EXPORT_SYMBOL(ilookup5);
@@ -1286,13 +1310,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
-
+again:
spin_lock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino);
spin_unlock(&inode_hash_lock);
- if (inode)
+ if (inode) {
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
+ }
return inode;
}
EXPORT_SYMBOL(ilookup);
@@ -1536,16 +1565,36 @@ sector_t bmap(struct inode *inode, sector_t block)
EXPORT_SYMBOL(bmap);
/*
+ * Update times in overlayed inode from underlying real inode
+ */
+static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode,
+ bool rcu)
+{
+ if (!rcu) {
+ struct inode *realinode = d_real_inode(dentry);
+
+ if (unlikely(inode != realinode) &&
+ (!timespec_equal(&inode->i_mtime, &realinode->i_mtime) ||
+ !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) {
+ inode->i_mtime = realinode->i_mtime;
+ inode->i_ctime = realinode->i_ctime;
+ }
+ }
+}
+
+/*
* With relative atime, only update atime if the previous atime is
* earlier than either the ctime or mtime or if at least a day has
* passed since the last atime update.
*/
-static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
- struct timespec now)
+static int relatime_need_update(const struct path *path, struct inode *inode,
+ struct timespec now, bool rcu)
{
- if (!(mnt->mnt_flags & MNT_RELATIME))
+ if (!(path->mnt->mnt_flags & MNT_RELATIME))
return 1;
+
+ update_ovl_inode_times(path->dentry, inode, rcu);
/*
* Is mtime younger than atime? If yes, update atime:
*/
@@ -1612,7 +1661,8 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
* This function automatically handles read only file systems and media,
* as well as the "noatime" flag and inode specific "noatime" markers.
*/
-bool atime_needs_update(const struct path *path, struct inode *inode)
+bool __atime_needs_update(const struct path *path, struct inode *inode,
+ bool rcu)
{
struct vfsmount *mnt = path->mnt;
struct timespec now;
@@ -1636,9 +1686,9 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
return false;
- now = current_fs_time(inode->i_sb);
+ now = current_time(inode);
- if (!relatime_need_update(mnt, inode, now))
+ if (!relatime_need_update(path, inode, now, rcu))
return false;
if (timespec_equal(&inode->i_atime, &now))
@@ -1653,7 +1703,7 @@ void touch_atime(const struct path *path)
struct inode *inode = d_inode(path->dentry);
struct timespec now;
- if (!atime_needs_update(path, inode))
+ if (!__atime_needs_update(path, inode, false))
return;
if (!sb_start_write_trylock(inode->i_sb))
@@ -1670,7 +1720,7 @@ void touch_atime(const struct path *path)
* We may also fail on filesystems that have the ability to make parts
* of the fs read only, e.g. subvolumes in Btrfs.
*/
- now = current_fs_time(inode->i_sb);
+ now = current_time(inode);
update_time(inode, &now, S_ATIME);
__mnt_drop_write(mnt);
skip_update:
@@ -1793,7 +1843,7 @@ int file_update_time(struct file *file)
if (IS_NOCMTIME(inode))
return 0;
- now = current_fs_time(inode->i_sb);
+ now = current_time(inode);
if (!timespec_equal(&inode->i_mtime, &now))
sync_it = S_MTIME;
@@ -2049,3 +2099,26 @@ void inode_nohighmem(struct inode *inode)
mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}
EXPORT_SYMBOL(inode_nohighmem);
+
+/**
+ * current_time - Return FS time
+ * @inode: inode.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs.
+ *
+ * Note that inode and inode->sb cannot be NULL.
+ * Otherwise, the function warns and returns time without truncation.
+ */
+struct timespec current_time(struct inode *inode)
+{
+ struct timespec now = current_kernel_time();
+
+ if (unlikely(!inode->i_sb)) {
+ WARN(1, "current_time() called with uninitialized super_block in the inode");
+ return now;
+ }
+
+ return timespec_trunc(now, inode->i_sb->s_time_gran);
+}
+EXPORT_SYMBOL(current_time);
diff --git a/fs/internal.h b/fs/internal.h
index ba0737649d4a..f4da3341b4a3 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
struct super_block;
struct file_system_type;
struct iomap;
+struct iomap_ops;
struct linux_binprm;
struct path;
struct mount;
@@ -120,6 +121,15 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
extern void inode_add_lru(struct inode *inode);
extern int dentry_needs_remove_privs(struct dentry *dentry);
+extern bool __atime_needs_update(const struct path *, struct inode *, bool);
+static inline bool atime_needs_update_rcu(const struct path *path,
+ struct inode *inode)
+{
+ return __atime_needs_update(path, inode, true);
+}
+
+extern bool atime_needs_update_rcu(const struct path *, struct inode *);
+
/*
* fs-writeback.c
*/
@@ -156,7 +166,7 @@ extern void mnt_pin_kill(struct mount *m);
/*
* fs/nsfs.c
*/
-extern struct dentry_operations ns_dentry_operations;
+extern const struct dentry_operations ns_dentry_operations;
/*
* fs/ioctl.c
@@ -164,3 +174,13 @@ extern struct dentry_operations ns_dentry_operations;
extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
unsigned long arg);
extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/*
+ * iomap support:
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+ void *data, struct iomap *iomap);
+
+loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap_ops *ops, void *data,
+ iomap_actor_t actor);
diff --git a/fs/iomap.c b/fs/iomap.c
index 706270f21b35..a8ee8c33ca78 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -27,9 +27,6 @@
#include <linux/dax.h>
#include "internal.h"
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
- void *data, struct iomap *iomap);
-
/*
* Execute a iomap write on a segment of the mapping that spans a
* contiguous range of pages that have identical block mapping state.
@@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
* resources they require in the iomap_begin call, and release them in the
* iomap_end call.
*/
-static loff_t
+loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
@@ -252,6 +249,88 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+static struct page *
+__iomap_read_page(struct inode *inode, loff_t offset)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+
+ page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
+ if (IS_ERR(page))
+ return page;
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ return page;
+}
+
+static loff_t
+iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ long status = 0;
+ ssize_t written = 0;
+
+ do {
+ struct page *page, *rpage;
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+
+ offset = (pos & (PAGE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
+
+ rpage = __iomap_read_page(inode, pos);
+ if (IS_ERR(rpage))
+ return PTR_ERR(rpage);
+
+ status = iomap_write_begin(inode, pos, bytes,
+ AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
+ &page, iomap);
+ put_page(rpage);
+ if (unlikely(status))
+ return status;
+
+ WARN_ON_ONCE(!PageUptodate(page));
+
+ status = iomap_write_end(inode, pos, bytes, bytes, page);
+ if (unlikely(status <= 0)) {
+ if (WARN_ON_ONCE(status == 0))
+ return -EIO;
+ return status;
+ }
+
+ cond_resched();
+
+ pos += status;
+ written += status;
+ length -= status;
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ } while (length);
+
+ return written;
+}
+
+int
+iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+ struct iomap_ops *ops)
+{
+ loff_t ret;
+
+ while (len) {
+ ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
+ iomap_dirty_actor);
+ if (ret <= 0)
+ return ret;
+ pos += ret;
+ len -= ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_file_dirty);
+
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
unsigned bytes, struct iomap *iomap)
{
@@ -354,8 +433,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
struct page *page = data;
int ret;
- ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
- NULL, iomap);
+ ret = __block_write_begin_int(page, pos, length, NULL, iomap);
if (ret)
return ret;
@@ -430,6 +508,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
if (iomap->flags & IOMAP_F_MERGED)
flags |= FIEMAP_EXTENT_MERGED;
+ if (iomap->flags & IOMAP_F_SHARED)
+ flags |= FIEMAP_EXTENT_SHARED;
return fiemap_fill_next_extent(fi, iomap->offset,
iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
@@ -480,7 +560,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
}
while (len > 0) {
- ret = iomap_apply(inode, start, len, 0, ops, &ctx,
+ ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
iomap_fiemap_actor);
/* inode with no (attribute) mapping will give ENOENT */
if (ret == -ENOENT)
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index ad0c745ebad7..871c8b392099 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -687,6 +687,11 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
pri_bh = NULL;
root_found:
+ /* We don't support read-write mounts */
+ if (!(s->s_flags & MS_RDONLY)) {
+ error = -EACCES;
+ goto out_freebh;
+ }
if (joliet_level && (pri == NULL || !opt.rock)) {
/* This is the case of Joliet with the norock mount flag.
@@ -1501,9 +1506,6 @@ struct inode *__isofs_iget(struct super_block *sb,
static struct dentry *isofs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- /* We don't support read-write mounts */
- if (!(flags & MS_RDONLY))
- return ERR_PTR(-EACCES);
return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 5bb565f9989c..31f8ca046639 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -269,8 +269,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
* filemap_fdatawait_range(), set it again so
* that user process can get -EIO from fsync().
*/
- set_bit(AS_EIO,
- &jinode->i_vfs_inode->i_mapping->flags);
+ mapping_set_error(jinode->i_vfs_inode->i_mapping, -EIO);
if (!ret)
ret = err;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 46261a6f902d..927da4956a89 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1090,11 +1090,15 @@ static void jbd2_stats_proc_exit(journal_t *journal)
* very few fields yet: that has to wait until we have created the
* journal structures from from scratch, or loaded them from disk. */
-static journal_t * journal_init_common (void)
+static journal_t *journal_init_common(struct block_device *bdev,
+ struct block_device *fs_dev,
+ unsigned long long start, int len, int blocksize)
{
static struct lock_class_key jbd2_trans_commit_key;
journal_t *journal;
int err;
+ struct buffer_head *bh;
+ int n;
journal = kzalloc(sizeof(*journal), GFP_KERNEL);
if (!journal)
@@ -1131,6 +1135,32 @@ static journal_t * journal_init_common (void)
lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
&jbd2_trans_commit_key, 0);
+ /* journal descriptor can store up to n blocks -bzzz */
+ journal->j_blocksize = blocksize;
+ journal->j_dev = bdev;
+ journal->j_fs_dev = fs_dev;
+ journal->j_blk_offset = start;
+ journal->j_maxlen = len;
+ n = journal->j_blocksize / sizeof(journal_block_tag_t);
+ journal->j_wbufsize = n;
+ journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
+ GFP_KERNEL);
+ if (!journal->j_wbuf) {
+ kfree(journal);
+ return NULL;
+ }
+
+ bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
+ if (!bh) {
+ pr_err("%s: Cannot get buffer for journal superblock\n",
+ __func__);
+ kfree(journal->j_wbuf);
+ kfree(journal);
+ return NULL;
+ }
+ journal->j_sb_buffer = bh;
+ journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
return journal;
}
@@ -1157,51 +1187,21 @@ static journal_t * journal_init_common (void)
* range of blocks on an arbitrary block device.
*
*/
-journal_t * jbd2_journal_init_dev(struct block_device *bdev,
+journal_t *jbd2_journal_init_dev(struct block_device *bdev,
struct block_device *fs_dev,
unsigned long long start, int len, int blocksize)
{
- journal_t *journal = journal_init_common();
- struct buffer_head *bh;
- int n;
+ journal_t *journal;
+ journal = journal_init_common(bdev, fs_dev, start, len, blocksize);
if (!journal)
return NULL;
- /* journal descriptor can store up to n blocks -bzzz */
- journal->j_blocksize = blocksize;
- journal->j_dev = bdev;
- journal->j_fs_dev = fs_dev;
- journal->j_blk_offset = start;
- journal->j_maxlen = len;
bdevname(journal->j_dev, journal->j_devname);
strreplace(journal->j_devname, '/', '!');
jbd2_stats_proc_init(journal);
- n = journal->j_blocksize / sizeof(journal_block_tag_t);
- journal->j_wbufsize = n;
- journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
- if (!journal->j_wbuf) {
- printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
- __func__);
- goto out_err;
- }
-
- bh = __getblk(journal->j_dev, start, journal->j_blocksize);
- if (!bh) {
- printk(KERN_ERR
- "%s: Cannot get buffer for journal superblock\n",
- __func__);
- goto out_err;
- }
- journal->j_sb_buffer = bh;
- journal->j_superblock = (journal_superblock_t *)bh->b_data;
return journal;
-out_err:
- kfree(journal->j_wbuf);
- jbd2_stats_proc_exit(journal);
- kfree(journal);
- return NULL;
}
/**
@@ -1212,67 +1212,36 @@ out_err:
* the journal. The inode must exist already, must support bmap() and
* must have all data blocks preallocated.
*/
-journal_t * jbd2_journal_init_inode (struct inode *inode)
+journal_t *jbd2_journal_init_inode(struct inode *inode)
{
- struct buffer_head *bh;
- journal_t *journal = journal_init_common();
+ journal_t *journal;
char *p;
- int err;
- int n;
unsigned long long blocknr;
+ blocknr = bmap(inode, 0);
+ if (!blocknr) {
+ pr_err("%s: Cannot locate journal superblock\n",
+ __func__);
+ return NULL;
+ }
+
+ jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
+ inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
+ inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
+
+ journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
+ blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
+ inode->i_sb->s_blocksize);
if (!journal)
return NULL;
- journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
journal->j_inode = inode;
bdevname(journal->j_dev, journal->j_devname);
p = strreplace(journal->j_devname, '/', '!');
sprintf(p, "-%lu", journal->j_inode->i_ino);
- jbd_debug(1,
- "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
- journal, inode->i_sb->s_id, inode->i_ino,
- (long long) inode->i_size,
- inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
-
- journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
- journal->j_blocksize = inode->i_sb->s_blocksize;
jbd2_stats_proc_init(journal);
- /* journal descriptor can store up to n blocks -bzzz */
- n = journal->j_blocksize / sizeof(journal_block_tag_t);
- journal->j_wbufsize = n;
- journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
- if (!journal->j_wbuf) {
- printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
- __func__);
- goto out_err;
- }
-
- err = jbd2_journal_bmap(journal, 0, &blocknr);
- /* If that failed, give up */
- if (err) {
- printk(KERN_ERR "%s: Cannot locate journal superblock\n",
- __func__);
- goto out_err;
- }
-
- bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
- if (!bh) {
- printk(KERN_ERR
- "%s: Cannot get buffer for journal superblock\n",
- __func__);
- goto out_err;
- }
- journal->j_sb_buffer = bh;
- journal->j_superblock = (journal_superblock_t *)bh->b_data;
-
return journal;
-out_err:
- kfree(journal->j_wbuf);
- jbd2_stats_proc_exit(journal);
- kfree(journal);
- return NULL;
}
/*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index b5bc3e249163..e1652665bd93 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -159,6 +159,7 @@ static void wait_transaction_locked(journal_t *journal)
read_unlock(&journal->j_state_lock);
if (need_to_start)
jbd2_log_start_commit(journal, tid);
+ jbd2_might_wait_for_commit(journal);
schedule();
finish_wait(&journal->j_wait_transaction_locked, &wait);
}
@@ -182,8 +183,6 @@ static int add_transaction_credits(journal_t *journal, int blocks,
int needed;
int total = blocks + rsv_blocks;
- jbd2_might_wait_for_commit(journal);
-
/*
* If the current transaction is locked down for commit, wait
* for the lock to be released.
@@ -214,6 +213,7 @@ static int add_transaction_credits(journal_t *journal, int blocks,
if (atomic_read(&journal->j_reserved_credits) + total >
journal->j_max_transaction_buffers) {
read_unlock(&journal->j_state_lock);
+ jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + total <=
journal->j_max_transaction_buffers);
@@ -238,6 +238,7 @@ static int add_transaction_credits(journal_t *journal, int blocks,
if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
+ jbd2_might_wait_for_commit(journal);
write_lock(&journal->j_state_lock);
if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
__jbd2_log_wait_for_space(journal);
@@ -255,6 +256,7 @@ static int add_transaction_credits(journal_t *journal, int blocks,
sub_reserved_credits(journal, rsv_blocks);
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
+ jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + rsv_blocks
<= journal->j_max_transaction_buffers / 2);
@@ -1147,6 +1149,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "file as BJ_Reserved");
spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+ spin_unlock(&journal->j_list_lock);
} else if (jh->b_transaction == journal->j_committing_transaction) {
/* first access by this transaction */
jh->b_modified = 0;
@@ -1154,8 +1157,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "set next transaction");
spin_lock(&journal->j_list_lock);
jh->b_next_transaction = transaction;
+ spin_unlock(&journal->j_list_lock);
}
- spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
/*
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index bc2693d56298..7ebacf14837f 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -233,22 +233,21 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
case ACL_TYPE_ACCESS:
xprefix = JFFS2_XPREFIX_ACL_ACCESS;
if (acl) {
- umode_t mode = inode->i_mode;
- rc = posix_acl_equiv_mode(acl, &mode);
- if (rc < 0)
+ umode_t mode;
+
+ rc = posix_acl_update_mode(inode, &mode, &acl);
+ if (rc)
return rc;
if (inode->i_mode != mode) {
struct iattr attr;
attr.ia_valid = ATTR_MODE | ATTR_CTIME;
attr.ia_mode = mode;
- attr.ia_ctime = CURRENT_TIME_SEC;
+ attr.ia_ctime = current_time(inode);
rc = jffs2_do_setattr(inode, &attr);
if (rc < 0)
return rc;
}
- if (rc == 0)
- acl = NULL;
}
break;
case ACL_TYPE_DEFAULT:
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 30eb33ff8189..0a754f38462e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -35,7 +35,8 @@ static int jffs2_mkdir (struct inode *,struct dentry *,umode_t);
static int jffs2_rmdir (struct inode *,struct dentry *);
static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t);
static int jffs2_rename (struct inode *, struct dentry *,
- struct inode *, struct dentry *);
+ struct inode *, struct dentry *,
+ unsigned int);
const struct file_operations jffs2_dir_operations =
{
@@ -61,10 +62,7 @@ const struct inode_operations jffs2_dir_inode_operations =
.get_acl = jffs2_get_acl,
.set_acl = jffs2_set_acl,
.setattr = jffs2_setattr,
- .setxattr = jffs2_setxattr,
- .getxattr = jffs2_getxattr,
.listxattr = jffs2_listxattr,
- .removexattr = jffs2_removexattr
};
/***********************************************************************/
@@ -759,7 +757,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode
}
static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
- struct inode *new_dir_i, struct dentry *new_dentry)
+ struct inode *new_dir_i, struct dentry *new_dentry,
+ unsigned int flags)
{
int ret;
struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dir_i->i_sb);
@@ -767,6 +766,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
uint8_t type;
uint32_t now;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
/* The VFS will check for us and prevent trying to rename a
* file over a directory and vice versa, but if it's a directory,
* the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 0e62dec3effc..c12476e309c6 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -66,10 +66,7 @@ const struct inode_operations jffs2_file_inode_operations =
.get_acl = jffs2_get_acl,
.set_acl = jffs2_set_acl,
.setattr = jffs2_setattr,
- .setxattr = jffs2_setxattr,
- .getxattr = jffs2_getxattr,
.listxattr = jffs2_listxattr,
- .removexattr = jffs2_removexattr
};
const struct address_space_operations jffs2_file_address_operations =
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index ae2ebb26b446..567653f7c0ce 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -193,7 +193,7 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
struct inode *inode = d_inode(dentry);
int rc;
- rc = inode_change_ok(inode, iattr);
+ rc = setattr_prepare(dentry, iattr);
if (rc)
return rc;
@@ -472,7 +472,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
inode->i_mode = jemode_to_cpu(ri->mode);
i_gid_write(inode, je16_to_cpu(ri->gid));
i_uid_write(inode, je16_to_cpu(ri->uid));
- inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ inode->i_atime = inode->i_ctime = inode->i_mtime = current_time(inode);
ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
inode->i_blocks = 0;
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 2cabd649d4fb..8f3f0855fcd2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -16,8 +16,5 @@ const struct inode_operations jffs2_symlink_inode_operations =
.readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = jffs2_setattr,
- .setxattr = jffs2_setxattr,
- .getxattr = jffs2_getxattr,
.listxattr = jffs2_listxattr,
- .removexattr = jffs2_removexattr
};
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 467ff376ee26..720007b2fd65 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -99,9 +99,6 @@ extern const struct xattr_handler jffs2_user_xattr_handler;
extern const struct xattr_handler jffs2_trusted_xattr_handler;
extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
-#define jffs2_getxattr generic_getxattr
-#define jffs2_setxattr generic_setxattr
-#define jffs2_removexattr generic_removexattr
#else
@@ -116,9 +113,6 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
#define jffs2_xattr_handlers NULL
#define jffs2_listxattr NULL
-#define jffs2_getxattr NULL
-#define jffs2_setxattr NULL
-#define jffs2_removexattr NULL
#endif /* CONFIG_JFFS2_FS_XATTR */
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 21fa92ba2c19..7bc186f4ed4d 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -78,13 +78,11 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
case ACL_TYPE_ACCESS:
ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
- rc = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (rc < 0)
+ rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (rc)
return rc;
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
- if (rc == 0)
- acl = NULL;
}
break;
case ACL_TYPE_DEFAULT:
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 7f1a585a0a94..739492c7a3fd 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -103,7 +103,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
struct inode *inode = d_inode(dentry);
int rc;
- rc = inode_change_ok(inode, iattr);
+ rc = setattr_prepare(dentry, iattr);
if (rc)
return rc;
@@ -140,10 +140,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
}
const struct inode_operations jfs_file_inode_operations = {
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = jfs_listxattr,
- .removexattr = generic_removexattr,
.setattr = jfs_setattr,
#ifdef CONFIG_JFS_POSIX_ACL
.get_acl = jfs_get_acl,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index ad3e7b1effc4..054cc761b426 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -403,7 +403,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
break;
}
- ip->i_mtime = ip->i_ctime = CURRENT_TIME;
+ ip->i_mtime = ip->i_ctime = current_time(ip);
mark_inode_dirty(ip);
txCommit(tid, 1, &ip, 0);
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 5e33cb9a190d..375dd257a34f 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -131,7 +131,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
jfs_inode->mode2 |= inode->i_mode;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
jfs_inode->otime = inode->i_ctime.tv_sec;
inode->i_generation = JFS_SBI(sb)->gengen++;
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 2e58978d6f45..4d973524c887 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2893,8 +2893,7 @@ restart:
* on anon_list2. Let's check.
*/
if (!list_empty(&TxAnchor.anon_list2)) {
- list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list);
- INIT_LIST_HEAD(&TxAnchor.anon_list2);
+ list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
goto restart;
}
TXN_UNLOCK();
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 814b0c58016c..b41596d71858 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -162,7 +162,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
mark_inode_dirty(ip);
- dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+ dip->i_ctime = dip->i_mtime = current_time(dip);
mark_inode_dirty(dip);
@@ -298,7 +298,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
/* update parent directory inode */
inc_nlink(dip); /* for '..' from child directory */
- dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+ dip->i_ctime = dip->i_mtime = current_time(dip);
mark_inode_dirty(dip);
rc = txCommit(tid, 2, &iplist[0], 0);
@@ -406,7 +406,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
/* update parent directory's link count corresponding
* to ".." entry of the target directory deleted
*/
- dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+ dip->i_ctime = dip->i_mtime = current_time(dip);
inode_dec_link_count(dip);
/*
@@ -528,7 +528,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
ASSERT(ip->i_nlink);
- ip->i_ctime = dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+ ip->i_ctime = dip->i_ctime = dip->i_mtime = current_time(ip);
mark_inode_dirty(dip);
/* update target's inode */
@@ -838,8 +838,8 @@ static int jfs_link(struct dentry *old_dentry,
/* update object inode */
inc_nlink(ip); /* for new link */
- ip->i_ctime = CURRENT_TIME;
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ ip->i_ctime = current_time(ip);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
mark_inode_dirty(dir);
ihold(ip);
@@ -1039,7 +1039,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
mark_inode_dirty(ip);
- dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+ dip->i_ctime = dip->i_mtime = current_time(dip);
mark_inode_dirty(dip);
/*
* commit update of parent directory and link object
@@ -1078,7 +1078,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
* FUNCTION: rename a file or directory
*/
static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btstack btstack;
ino_t ino;
@@ -1097,6 +1098,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
s64 new_size = 0;
int commit_flag;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry);
@@ -1215,7 +1218,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
tblk->xflag |= COMMIT_DELETE;
tblk->u.ip = new_ip;
} else {
- new_ip->i_ctime = CURRENT_TIME;
+ new_ip->i_ctime = current_time(new_ip);
mark_inode_dirty(new_ip);
}
} else {
@@ -1278,10 +1281,10 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/*
* Update ctime on changed/moved inodes & mark dirty
*/
- old_ip->i_ctime = CURRENT_TIME;
+ old_ip->i_ctime = current_time(old_ip);
mark_inode_dirty(old_ip);
- new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb);
+ new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir);
mark_inode_dirty(new_dir);
/* Build list of inodes modified by this transaction */
@@ -1293,7 +1296,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dir != new_dir) {
iplist[ipcount++] = new_dir;
- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+ old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
mark_inode_dirty(old_dir);
}
@@ -1426,7 +1429,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
mark_inode_dirty(ip);
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
mark_inode_dirty(dir);
@@ -1537,10 +1540,7 @@ const struct inode_operations jfs_dir_inode_operations = {
.rmdir = jfs_rmdir,
.mknod = jfs_mknod,
.rename = jfs_rename,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = jfs_listxattr,
- .removexattr = generic_removexattr,
.setattr = jfs_setattr,
#ifdef CONFIG_JFS_POSIX_ACL
.get_acl = jfs_get_acl,
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 90b3bc21e9b0..bd9b641ada2c 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -379,8 +379,14 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
* cached in meta-data cache, and not written out
* by txCommit();
*/
- filemap_fdatawait(ipbmap->i_mapping);
- filemap_write_and_wait(ipbmap->i_mapping);
+ rc = filemap_fdatawait(ipbmap->i_mapping);
+ if (rc)
+ goto error_out;
+
+ rc = filemap_write_and_wait(ipbmap->i_mapping);
+ if (rc)
+ goto error_out;
+
diWriteSpecial(ipbmap, 0);
newPage = nPages; /* first new page number */
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index cec8814a3b8b..85671f7f8518 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -830,7 +830,7 @@ out:
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
inode->i_version++;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
inode_unlock(inode);
return len - towrite;
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index c94c7e4a1323..c82404fee6cd 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -25,19 +25,13 @@ const struct inode_operations jfs_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = jfs_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = jfs_listxattr,
- .removexattr = generic_removexattr,
};
const struct inode_operations jfs_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = page_get_link,
.setattr = jfs_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = jfs_listxattr,
- .removexattr = generic_removexattr,
};
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 0bf3c33aedff..c60f3d32ee91 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -658,7 +658,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
if (old_blocks)
dquot_free_block(inode, old_blocks);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
return 0;
}
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index e57174d43683..cf4c636ff4da 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -110,8 +110,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
* kn_to: /n1/n2/n3 [depth=3]
* result: /../..
*
- * return value: length of the string. If greater than buflen,
- * then contents of buf are undefined. On error, -1 is returned.
+ * Returns the length of the full path. If the full length is equal to or
+ * greater than @buflen, @buf contains the truncated path with the trailing
+ * '\0'. On error, -errno is returned.
*/
static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
struct kernfs_node *kn_from,
@@ -119,9 +120,8 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
{
struct kernfs_node *kn, *common;
const char parent_str[] = "/..";
- size_t depth_from, depth_to, len = 0, nlen = 0;
- char *p;
- int i;
+ size_t depth_from, depth_to, len = 0;
+ int i, j;
if (!kn_from)
kn_from = kernfs_root(kn_to)->kn;
@@ -131,7 +131,7 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
common = kernfs_common_ancestor(kn_from, kn_to);
if (WARN_ON(!common))
- return -1;
+ return -EINVAL;
depth_to = kernfs_depth(common, kn_to);
depth_from = kernfs_depth(common, kn_from);
@@ -144,22 +144,16 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
len < buflen ? buflen - len : 0);
/* Calculate how many bytes we need for the rest */
- for (kn = kn_to; kn != common; kn = kn->parent)
- nlen += strlen(kn->name) + 1;
-
- if (len + nlen >= buflen)
- return len + nlen;
-
- p = buf + len + nlen;
- *p = '\0';
- for (kn = kn_to; kn != common; kn = kn->parent) {
- size_t tmp = strlen(kn->name);
- p -= tmp;
- memcpy(p, kn->name, tmp);
- *(--p) = '/';
+ for (i = depth_to - 1; i >= 0; i--) {
+ for (kn = kn_to, j = 0; j < i; j++)
+ kn = kn->parent;
+ len += strlcpy(buf + len, "/",
+ len < buflen ? buflen - len : 0);
+ len += strlcpy(buf + len, kn->name,
+ len < buflen ? buflen - len : 0);
}
- return len + nlen;
+ return len;
}
/**
@@ -186,29 +180,6 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
}
/**
- * kernfs_path_len - determine the length of the full path of a given node
- * @kn: kernfs_node of interest
- *
- * The returned length doesn't include the space for the terminating '\0'.
- */
-size_t kernfs_path_len(struct kernfs_node *kn)
-{
- size_t len = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&kernfs_rename_lock, flags);
-
- do {
- len += strlen(kn->name) + 1;
- kn = kn->parent;
- } while (kn && kn->parent);
-
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
-
- return len;
-}
-
-/**
* kernfs_path_from_node - build path of node @to relative to @from.
* @from: parent kernfs_node relative to which we need to build the path
* @to: kernfs_node of interest
@@ -220,8 +191,9 @@ size_t kernfs_path_len(struct kernfs_node *kn)
* path (which includes '..'s) as needed to reach from @from to @to is
* returned.
*
- * If @buf isn't long enough, the return value will be greater than @buflen
- * and @buf contents are undefined.
+ * Returns the length of the full path. If the full length is equal to or
+ * greater than @buflen, @buf contains the truncated path with the trailing
+ * '\0'. On error, -errno is returned.
*/
int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
char *buf, size_t buflen)
@@ -237,28 +209,6 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
EXPORT_SYMBOL_GPL(kernfs_path_from_node);
/**
- * kernfs_path - build full path of a given node
- * @kn: kernfs_node of interest
- * @buf: buffer to copy @kn's name into
- * @buflen: size of @buf
- *
- * Builds and returns the full path of @kn in @buf of @buflen bytes. The
- * path is built from the end of @buf so the returned pointer usually
- * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated
- * and %NULL is returned.
- */
-char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
-{
- int ret;
-
- ret = kernfs_path_from_node(kn, NULL, buf, buflen);
- if (ret < 0 || ret >= buflen)
- return NULL;
- return buf;
-}
-EXPORT_SYMBOL_GPL(kernfs_path);
-
-/**
* pr_cont_kernfs_name - pr_cont name of a kernfs_node
* @kn: kernfs_node of interest
*
@@ -1096,13 +1046,17 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
}
static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct kernfs_node *kn = old_dentry->d_fsdata;
struct kernfs_node *new_parent = new_dir->i_private;
struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
int ret;
+ if (flags)
+ return -EINVAL;
+
if (!scops || !scops->rename)
return -EPERM;
@@ -1126,9 +1080,6 @@ const struct inode_operations kernfs_dir_iops = {
.permission = kernfs_iop_permission,
.setattr = kernfs_iop_setattr,
.getattr = kernfs_iop_getattr,
- .setxattr = kernfs_iop_setxattr,
- .removexattr = kernfs_iop_removexattr,
- .getxattr = kernfs_iop_getxattr,
.listxattr = kernfs_iop_listxattr,
.mkdir = kernfs_iop_mkdir,
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 63b925d5ba1e..a1982118f92f 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -28,9 +28,6 @@ static const struct inode_operations kernfs_iops = {
.permission = kernfs_iop_permission,
.setattr = kernfs_iop_setattr,
.getattr = kernfs_iop_getattr,
- .setxattr = kernfs_iop_setxattr,
- .removexattr = kernfs_iop_removexattr,
- .getxattr = kernfs_iop_getxattr,
.listxattr = kernfs_iop_listxattr,
};
@@ -122,7 +119,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
return -EINVAL;
mutex_lock(&kernfs_mutex);
- error = inode_change_ok(inode, iattr);
+ error = setattr_prepare(dentry, iattr);
if (error)
goto out;
@@ -138,17 +135,12 @@ out:
return error;
}
-static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
+static int kernfs_node_setsecdata(struct kernfs_iattrs *attrs, void **secdata,
u32 *secdata_len)
{
- struct kernfs_iattrs *attrs;
void *old_secdata;
size_t old_secdata_len;
- attrs = kernfs_iattrs(kn);
- if (!attrs)
- return -ENOMEM;
-
old_secdata = attrs->ia_secdata;
old_secdata_len = attrs->ia_secdata_len;
@@ -160,71 +152,6 @@ static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
return 0;
}
-int kernfs_iop_setxattr(struct dentry *unused, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct kernfs_node *kn = inode->i_private;
- struct kernfs_iattrs *attrs;
- void *secdata;
- int error;
- u32 secdata_len = 0;
-
- attrs = kernfs_iattrs(kn);
- if (!attrs)
- return -ENOMEM;
-
- if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
- const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
- error = security_inode_setsecurity(inode, suffix,
- value, size, flags);
- if (error)
- return error;
- error = security_inode_getsecctx(inode,
- &secdata, &secdata_len);
- if (error)
- return error;
-
- mutex_lock(&kernfs_mutex);
- error = kernfs_node_setsecdata(kn, &secdata, &secdata_len);
- mutex_unlock(&kernfs_mutex);
-
- if (secdata)
- security_release_secctx(secdata, secdata_len);
- return error;
- } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
- return simple_xattr_set(&attrs->xattrs, name, value, size,
- flags);
- }
-
- return -EINVAL;
-}
-
-int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
-{
- struct kernfs_node *kn = dentry->d_fsdata;
- struct kernfs_iattrs *attrs;
-
- attrs = kernfs_iattrs(kn);
- if (!attrs)
- return -ENOMEM;
-
- return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE);
-}
-
-ssize_t kernfs_iop_getxattr(struct dentry *unused, struct inode *inode,
- const char *name, void *buf, size_t size)
-{
- struct kernfs_node *kn = inode->i_private;
- struct kernfs_iattrs *attrs;
-
- attrs = kernfs_iattrs(kn);
- if (!attrs)
- return -ENOMEM;
-
- return simple_xattr_get(&attrs->xattrs, name, buf, size);
-}
-
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
{
struct kernfs_node *kn = dentry->d_fsdata;
@@ -241,7 +168,7 @@ static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
inode->i_mode = mode;
inode->i_atime = inode->i_mtime =
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
}
static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
@@ -376,3 +303,83 @@ int kernfs_iop_permission(struct inode *inode, int mask)
return generic_permission(inode, mask);
}
+
+static int kernfs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *suffix, void *value, size_t size)
+{
+ const char *name = xattr_full_name(handler, suffix);
+ struct kernfs_node *kn = inode->i_private;
+ struct kernfs_iattrs *attrs;
+
+ attrs = kernfs_iattrs(kn);
+ if (!attrs)
+ return -ENOMEM;
+
+ return simple_xattr_get(&attrs->xattrs, name, value, size);
+}
+
+static int kernfs_xattr_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *suffix, const void *value,
+ size_t size, int flags)
+{
+ const char *name = xattr_full_name(handler, suffix);
+ struct kernfs_node *kn = inode->i_private;
+ struct kernfs_iattrs *attrs;
+
+ attrs = kernfs_iattrs(kn);
+ if (!attrs)
+ return -ENOMEM;
+
+ return simple_xattr_set(&attrs->xattrs, name, value, size, flags);
+}
+
+const struct xattr_handler kernfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = kernfs_xattr_get,
+ .set = kernfs_xattr_set,
+};
+
+static int kernfs_security_xattr_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *suffix, const void *value,
+ size_t size, int flags)
+{
+ struct kernfs_node *kn = inode->i_private;
+ struct kernfs_iattrs *attrs;
+ void *secdata;
+ u32 secdata_len = 0;
+ int error;
+
+ attrs = kernfs_iattrs(kn);
+ if (!attrs)
+ return -ENOMEM;
+
+ error = security_inode_setsecurity(inode, suffix, value, size, flags);
+ if (error)
+ return error;
+ error = security_inode_getsecctx(inode, &secdata, &secdata_len);
+ if (error)
+ return error;
+
+ mutex_lock(&kernfs_mutex);
+ error = kernfs_node_setsecdata(attrs, &secdata, &secdata_len);
+ mutex_unlock(&kernfs_mutex);
+
+ if (secdata)
+ security_release_secctx(secdata, secdata_len);
+ return error;
+}
+
+const struct xattr_handler kernfs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = kernfs_xattr_get,
+ .set = kernfs_security_xattr_set,
+};
+
+const struct xattr_handler *kernfs_xattr_handlers[] = {
+ &kernfs_trusted_xattr_handler,
+ &kernfs_security_xattr_handler,
+ NULL
+};
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 37159235ac10..bfd551bbf231 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -76,17 +76,12 @@ extern struct kmem_cache *kernfs_node_cache;
/*
* inode.c
*/
+extern const struct xattr_handler *kernfs_xattr_handlers[];
void kernfs_evict_inode(struct inode *inode);
int kernfs_iop_permission(struct inode *inode, int mask);
int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
-int kernfs_iop_setxattr(struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags);
-int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
-ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode,
- const char *name, void *buf, size_t size);
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
/*
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index b3d73ad52b22..d5b149a45be1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -158,6 +158,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = magic;
sb->s_op = &kernfs_sops;
+ sb->s_xattr = kernfs_xattr_handlers;
sb->s_time_gran = 1;
/* get root inode, initialize and unlock it */
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 117b8b3416f9..9b43ca02b7ab 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -134,9 +134,6 @@ static const char *kernfs_iop_get_link(struct dentry *dentry,
}
const struct inode_operations kernfs_symlink_iops = {
- .setxattr = kernfs_iop_setxattr,
- .removexattr = kernfs_iop_removexattr,
- .getxattr = kernfs_iop_getxattr,
.listxattr = kernfs_iop_listxattr,
.readlink = generic_readlink,
.get_link = kernfs_iop_get_link,
diff --git a/fs/libfs.c b/fs/libfs.c
index 74dc8b9e7f53..48826d4da189 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -236,8 +236,8 @@ static const struct super_operations simple_super_operations = {
* Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
* will never be mountable)
*/
-struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
- const struct super_operations *ops,
+struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
+ const struct super_operations *ops, const struct xattr_handler **xattr,
const struct dentry_operations *dops, unsigned long magic)
{
struct super_block *s;
@@ -254,6 +254,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = magic;
s->s_op = ops ? ops : &simple_super_operations;
+ s->s_xattr = xattr;
s->s_time_gran = 1;
root = new_inode(s);
if (!root)
@@ -265,7 +266,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
*/
root->i_ino = 1;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
- root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
+ root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
dentry = __d_alloc(s, &d_name);
if (!dentry) {
iput(root);
@@ -281,7 +282,7 @@ Enomem:
deactivate_locked_super(s);
return ERR_PTR(-ENOMEM);
}
-EXPORT_SYMBOL(mount_pseudo);
+EXPORT_SYMBOL(mount_pseudo_xattr);
int simple_open(struct inode *inode, struct file *file)
{
@@ -295,7 +296,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
inc_nlink(inode);
ihold(inode);
dget(dentry);
@@ -329,7 +330,7 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
drop_nlink(inode);
dput(dentry);
return 0;
@@ -349,11 +350,15 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
EXPORT_SYMBOL(simple_rmdir);
int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = d_is_dir(old_dentry);
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
@@ -369,7 +374,7 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
}
old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
- new_dir->i_mtime = inode->i_ctime = CURRENT_TIME;
+ new_dir->i_mtime = inode->i_ctime = current_time(old_dir);
return 0;
}
@@ -394,7 +399,7 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)
struct inode *inode = d_inode(dentry);
int error;
- error = inode_change_ok(inode, iattr);
+ error = setattr_prepare(dentry, iattr);
if (error)
return error;
@@ -520,7 +525,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
*/
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
@@ -546,7 +551,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
goto out;
}
inode->i_mode = S_IFREG | files->mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_fop = files->ops;
inode->i_ino = i;
d_add(dentry, inode);
@@ -1092,7 +1097,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);
@@ -1149,24 +1154,6 @@ static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
return -EPERM;
}
-static int empty_dir_setxattr(struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static ssize_t empty_dir_getxattr(struct dentry *dentry, struct inode *inode,
- const char *name, void *value, size_t size)
-{
- return -EOPNOTSUPP;
-}
-
-static int empty_dir_removexattr(struct dentry *dentry, const char *name)
-{
- return -EOPNOTSUPP;
-}
-
static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
{
return -EOPNOTSUPP;
@@ -1177,9 +1164,6 @@ static const struct inode_operations empty_dir_inode_operations = {
.permission = generic_permission,
.setattr = empty_dir_setattr,
.getattr = empty_dir_getattr,
- .setxattr = empty_dir_setxattr,
- .getxattr = empty_dir_getxattr,
- .removexattr = empty_dir_removexattr,
.listxattr = empty_dir_listxattr,
};
@@ -1215,6 +1199,7 @@ void make_empty_dir_inode(struct inode *inode)
inode->i_blocks = 0;
inode->i_op = &empty_dir_inode_operations;
+ inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &empty_dir_operations;
}
diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h
index 2257a1311027..184a15edd18d 100644
--- a/fs/lockd/procfs.h
+++ b/fs/lockd/procfs.h
@@ -6,8 +6,6 @@
#ifndef _LOCKD_PROCFS_H
#define _LOCKD_PROCFS_H
-#include <linux/kconfig.h>
-
#if IS_ENABLED(CONFIG_PROC_FS)
int lockd_create_procfs(void);
void lockd_remove_procfs(void);
diff --git a/fs/locks.c b/fs/locks.c
index ee1b15f6fc13..22c5b4aa4961 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -127,7 +127,6 @@
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
#include <linux/percpu.h>
-#include <linux/lglock.h>
#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>
@@ -139,6 +138,11 @@
#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
+static inline bool is_remote_lock(struct file *filp)
+{
+ return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK));
+}
+
static bool lease_breaking(struct file_lock *fl)
{
return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
@@ -158,12 +162,18 @@ int lease_break_time = 45;
/*
* The global file_lock_list is only used for displaying /proc/locks, so we
- * keep a list on each CPU, with each list protected by its own spinlock via
- * the file_lock_lglock. Note that alterations to the list also require that
- * the relevant flc_lock is held.
+ * keep a list on each CPU, with each list protected by its own spinlock.
+ * Global serialization is done using file_rwsem.
+ *
+ * Note that alterations to the list also require that the relevant flc_lock is
+ * held.
*/
-DEFINE_STATIC_LGLOCK(file_lock_lglock);
-static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
+struct file_lock_list_struct {
+ spinlock_t lock;
+ struct hlist_head hlist;
+};
+static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
+DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);
/*
* The blocked_hash is used to find POSIX lock loops for deadlock detection.
@@ -587,15 +597,23 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
/* Must be called with the flc_lock held! */
static void locks_insert_global_locks(struct file_lock *fl)
{
- lg_local_lock(&file_lock_lglock);
+ struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);
+
+ percpu_rwsem_assert_held(&file_rwsem);
+
+ spin_lock(&fll->lock);
fl->fl_link_cpu = smp_processor_id();
- hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
- lg_local_unlock(&file_lock_lglock);
+ hlist_add_head(&fl->fl_link, &fll->hlist);
+ spin_unlock(&fll->lock);
}
/* Must be called with the flc_lock held! */
static void locks_delete_global_locks(struct file_lock *fl)
{
+ struct file_lock_list_struct *fll;
+
+ percpu_rwsem_assert_held(&file_rwsem);
+
/*
* Avoid taking lock if already unhashed. This is safe since this check
* is done while holding the flc_lock, and new insertions into the list
@@ -603,9 +621,11 @@ static void locks_delete_global_locks(struct file_lock *fl)
*/
if (hlist_unhashed(&fl->fl_link))
return;
- lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+
+ fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu);
+ spin_lock(&fll->lock);
hlist_del_init(&fl->fl_link);
- lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+ spin_unlock(&fll->lock);
}
static unsigned long
@@ -791,7 +811,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
{
struct file_lock *cfl;
struct file_lock_context *ctx;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = locks_inode(filp);
ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
@@ -915,6 +935,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
return -ENOMEM;
}
+ percpu_down_read_preempt_disable(&file_rwsem);
spin_lock(&ctx->flc_lock);
if (request->fl_flags & FL_ACCESS)
goto find_conflict;
@@ -955,6 +976,7 @@ find_conflict:
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read_preempt_enable(&file_rwsem);
if (new_fl)
locks_free_lock(new_fl);
locks_dispose_list(&dispose);
@@ -991,6 +1013,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
new_fl2 = locks_alloc_lock();
}
+ percpu_down_read_preempt_disable(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
* New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1162,6 +1185,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
}
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read_preempt_enable(&file_rwsem);
/*
* Free any unused locks.
*/
@@ -1192,7 +1216,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
int posix_lock_file(struct file *filp, struct file_lock *fl,
struct file_lock *conflock)
{
- return posix_lock_inode(file_inode(filp), fl, conflock);
+ return posix_lock_inode(locks_inode(filp), fl, conflock);
}
EXPORT_SYMBOL(posix_lock_file);
@@ -1232,7 +1256,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
int locks_mandatory_locked(struct file *file)
{
int ret;
- struct inode *inode = file_inode(file);
+ struct inode *inode = locks_inode(file);
struct file_lock_context *ctx;
struct file_lock *fl;
@@ -1436,6 +1460,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
return error;
}
+ percpu_down_read_preempt_disable(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
@@ -1487,9 +1512,13 @@ restart:
locks_insert_block(fl, new_fl);
trace_break_lease_block(inode, new_fl);
spin_unlock(&ctx->flc_lock);
+ percpu_up_read_preempt_enable(&file_rwsem);
+
locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
!new_fl->fl_next, break_time);
+
+ percpu_down_read_preempt_disable(&file_rwsem);
spin_lock(&ctx->flc_lock);
trace_break_lease_unblock(inode, new_fl);
locks_delete_block(new_fl);
@@ -1506,6 +1535,7 @@ restart:
}
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read_preempt_enable(&file_rwsem);
locks_dispose_list(&dispose);
locks_free_lock(new_fl);
return error;
@@ -1539,7 +1569,7 @@ void lease_get_mtime(struct inode *inode, struct timespec *time)
}
if (has_lease)
- *time = current_fs_time(inode->i_sb);
+ *time = current_time(inode);
else
*time = inode->i_mtime;
}
@@ -1572,15 +1602,16 @@ EXPORT_SYMBOL(lease_get_mtime);
int fcntl_getlease(struct file *filp)
{
struct file_lock *fl;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = locks_inode(filp);
struct file_lock_context *ctx;
int type = F_UNLCK;
LIST_HEAD(dispose);
ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
+ percpu_down_read_preempt_disable(&file_rwsem);
spin_lock(&ctx->flc_lock);
- time_out_leases(file_inode(filp), &dispose);
+ time_out_leases(inode, &dispose);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file != filp)
continue;
@@ -1588,6 +1619,8 @@ int fcntl_getlease(struct file *filp)
break;
}
spin_unlock(&ctx->flc_lock);
+ percpu_up_read_preempt_enable(&file_rwsem);
+
locks_dispose_list(&dispose);
}
return type;
@@ -1613,7 +1646,8 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
if (flags & FL_LAYOUT)
return 0;
- if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
+ if ((arg == F_RDLCK) &&
+ (atomic_read(&d_real_inode(dentry)->i_writecount) > 0))
return -EAGAIN;
if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
@@ -1628,7 +1662,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
{
struct file_lock *fl, *my_fl = NULL, *lease;
struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = dentry->d_inode;
struct file_lock_context *ctx;
bool is_deleg = (*flp)->fl_flags & FL_DELEG;
int error;
@@ -1660,6 +1694,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
return -EINVAL;
}
+ percpu_down_read_preempt_disable(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
error = check_conflicting_open(dentry, arg, lease->fl_flags);
@@ -1730,6 +1765,7 @@ out_setup:
lease->fl_lmops->lm_setup(lease, priv);
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read_preempt_enable(&file_rwsem);
locks_dispose_list(&dispose);
if (is_deleg)
inode_unlock(inode);
@@ -1742,7 +1778,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
{
int error = -EAGAIN;
struct file_lock *fl, *victim = NULL;
- struct inode *inode = file_inode(filp);
+ struct inode *inode = locks_inode(filp);
struct file_lock_context *ctx;
LIST_HEAD(dispose);
@@ -1752,6 +1788,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
return error;
}
+ percpu_down_read_preempt_disable(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file == filp &&
@@ -1764,6 +1801,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
if (victim)
error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
+ percpu_up_read_preempt_enable(&file_rwsem);
locks_dispose_list(&dispose);
return error;
}
@@ -1782,7 +1820,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
void **priv)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = locks_inode(filp);
int error;
if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
@@ -1830,7 +1868,7 @@ EXPORT_SYMBOL(generic_setlease);
int
vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
{
- if (filp->f_op->setlease)
+ if (filp->f_op->setlease && is_remote_lock(filp))
return filp->f_op->setlease(filp, arg, lease, priv);
else
return generic_setlease(filp, arg, lease, priv);
@@ -1979,7 +2017,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
if (error)
goto out_free;
- if (f.file->f_op->flock)
+ if (f.file->f_op->flock && is_remote_lock(f.file))
error = f.file->f_op->flock(f.file,
(can_sleep) ? F_SETLKW : F_SETLK,
lock);
@@ -2005,7 +2043,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
*/
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
- if (filp->f_op->lock)
+ if (filp->f_op->lock && is_remote_lock(filp))
return filp->f_op->lock(filp, F_GETLK, fl);
posix_test_lock(filp, fl);
return 0;
@@ -2129,7 +2167,7 @@ out:
*/
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
- if (filp->f_op->lock)
+ if (filp->f_op->lock && is_remote_lock(filp))
return filp->f_op->lock(filp, cmd, fl);
else
return posix_lock_file(filp, fl, conf);
@@ -2191,7 +2229,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
- inode = file_inode(filp);
+ inode = locks_inode(filp);
/*
* This might block, so we do it before checking the inode.
@@ -2343,7 +2381,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
if (copy_from_user(&flock, l, sizeof(flock)))
goto out;
- inode = file_inode(filp);
+ inode = locks_inode(filp);
/* Don't allow mandatory locks on files that may be memory mapped
* and shared.
@@ -2426,6 +2464,7 @@ out:
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
int error;
+ struct inode *inode = locks_inode(filp);
struct file_lock lock;
struct file_lock_context *ctx;
@@ -2434,7 +2473,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
* posix_lock_file(). Another process could be setting a lock on this
* file at the same time, but we wouldn't remove that lock anyway.
*/
- ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty(&ctx->flc_posix))
return;
@@ -2452,7 +2491,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
if (lock.fl_ops && lock.fl_ops->fl_release_private)
lock.fl_ops->fl_release_private(&lock);
- trace_locks_remove_posix(file_inode(filp), &lock, error);
+ trace_locks_remove_posix(inode, &lock, error);
}
EXPORT_SYMBOL(locks_remove_posix);
@@ -2469,12 +2508,12 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
.fl_type = F_UNLCK,
.fl_end = OFFSET_MAX,
};
- struct inode *inode = file_inode(filp);
+ struct inode *inode = locks_inode(filp);
if (list_empty(&flctx->flc_flock))
return;
- if (filp->f_op->flock)
+ if (filp->f_op->flock && is_remote_lock(filp))
filp->f_op->flock(filp, F_SETLKW, &fl);
else
flock_lock_inode(inode, &fl);
@@ -2493,11 +2532,14 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
if (list_empty(&ctx->flc_lease))
return;
+ percpu_down_read_preempt_disable(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
if (filp == fl->fl_file)
lease_modify(fl, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
+ percpu_up_read_preempt_enable(&file_rwsem);
+
locks_dispose_list(&dispose);
}
@@ -2508,7 +2550,7 @@ void locks_remove_file(struct file *filp)
{
struct file_lock_context *ctx;
- ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
+ ctx = smp_load_acquire(&locks_inode(filp)->i_flctx);
if (!ctx)
return;
@@ -2552,7 +2594,7 @@ EXPORT_SYMBOL(posix_unblock_lock);
*/
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
- if (filp->f_op->lock)
+ if (filp->f_op->lock && is_remote_lock(filp))
return filp->f_op->lock(filp, F_CANCELLK, fl);
return 0;
}
@@ -2574,13 +2616,24 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
struct inode *inode = NULL;
unsigned int fl_pid;
- if (fl->fl_nspid)
- fl_pid = pid_vnr(fl->fl_nspid);
- else
+ if (fl->fl_nspid) {
+ struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info;
+
+ /* Don't let fl_pid change based on who is reading the file */
+ fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns);
+
+ /*
+ * If there isn't a fl_pid don't display who is waiting on
+ * the lock if we are called from locks_show, or if we are
+ * called from __show_fd_info - skip lock entirely
+ */
+ if (fl_pid == 0)
+ return;
+ } else
fl_pid = fl->fl_pid;
if (fl->fl_file != NULL)
- inode = file_inode(fl->fl_file);
+ inode = locks_inode(fl->fl_file);
seq_printf(f, "%lld:%s ", id, pfx);
if (IS_POSIX(fl)) {
@@ -2648,9 +2701,13 @@ static int locks_show(struct seq_file *f, void *v)
{
struct locks_iterator *iter = f->private;
struct file_lock *fl, *bfl;
+ struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info;
fl = hlist_entry(v, struct file_lock, fl_link);
+ if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns))
+ return 0;
+
lock_get_status(f, fl, iter->li_pos, "");
list_for_each_entry(bfl, &fl->fl_block, fl_block)
@@ -2682,7 +2739,7 @@ static void __show_fd_locks(struct seq_file *f,
void show_fd_locks(struct seq_file *f,
struct file *filp, struct files_struct *files)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = locks_inode(filp);
struct file_lock_context *ctx;
int id = 0;
@@ -2703,9 +2760,9 @@ static void *locks_start(struct seq_file *f, loff_t *pos)
struct locks_iterator *iter = f->private;
iter->li_pos = *pos + 1;
- lg_global_lock(&file_lock_lglock);
+ percpu_down_write(&file_rwsem);
spin_lock(&blocked_lock_lock);
- return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
+ return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos);
}
static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
@@ -2713,14 +2770,14 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
struct locks_iterator *iter = f->private;
++iter->li_pos;
- return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
+ return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos);
}
static void locks_stop(struct seq_file *f, void *v)
__releases(&blocked_lock_lock)
{
spin_unlock(&blocked_lock_lock);
- lg_global_unlock(&file_lock_lglock);
+ percpu_up_write(&file_rwsem);
}
static const struct seq_operations locks_seq_operations = {
@@ -2761,10 +2818,13 @@ static int __init filelock_init(void)
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
- lg_lock_init(&file_lock_lglock, "file_lock_lglock");
- for_each_possible_cpu(i)
- INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+ for_each_possible_cpu(i) {
+ struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
+
+ spin_lock_init(&fll->lock);
+ INIT_HLIST_HEAD(&fll->hlist);
+ }
return 0;
}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9568064ecadf..c87ea52de3d9 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -226,7 +226,7 @@ static int logfs_unlink(struct inode *dir, struct dentry *dentry)
ta->state = UNLINK_1;
ta->ino = inode->i_ino;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
page = logfs_get_dd_page(dir, dentry);
if (!page) {
@@ -540,7 +540,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
ihold(inode);
inc_nlink(inode);
mark_inode_dirty_sync(inode);
@@ -573,7 +573,7 @@ static int logfs_delete_dd(struct inode *dir, loff_t pos)
* (crc-protected) journal.
*/
BUG_ON(beyond_eof(dir, pos));
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
return logfs_delete(dir, pos, NULL);
}
@@ -718,8 +718,12 @@ out:
}
static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
if (d_really_is_positive(new_dentry))
return logfs_rename_target(old_dir, old_dentry,
new_dir, new_dentry);
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index f01ddfb1a03b..1db04930ad57 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -211,7 +211,7 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
li->li_flags = flags;
inode_unlock(inode);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty_sync(inode);
return 0;
@@ -244,7 +244,7 @@ static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *inode = d_inode(dentry);
int err = 0;
- err = inode_change_ok(inode, attr);
+ err = setattr_prepare(dentry, attr);
if (err)
return err;
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index db9cfc598883..f440a1525da8 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -213,8 +213,8 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
i_gid_write(inode, 0);
inode->i_size = 0;
inode->i_blocks = 0;
- inode->i_ctime = CURRENT_TIME;
- inode->i_mtime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
+ inode->i_mtime = current_time(inode);
li->li_refcount = 1;
INIT_LIST_HEAD(&li->li_freeing_list);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 3fb8c6d67303..bf19bf4a243f 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1546,7 +1546,7 @@ static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
int err;
flags |= WF_WRITE | WF_DELETE;
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
logfs_unpack_index(index, &bix, &level);
if (logfs_block(page) && logfs_block(page)->reserved_bytes)
@@ -1578,7 +1578,7 @@ static int __logfs_delete(struct inode *inode, struct page *page)
long flags = WF_DELETE;
int err;
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
if (page->index < I0_BLOCKS)
return logfs_write_direct(inode, page, flags);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index eccda3a02de6..c5bd19ffa326 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -366,7 +366,11 @@ struct mb_cache *mb_cache_create(int bucket_bits)
cache->c_shrink.count_objects = mb_cache_count;
cache->c_shrink.scan_objects = mb_cache_scan;
cache->c_shrink.seeks = DEFAULT_SEEKS;
- register_shrinker(&cache->c_shrink);
+ if (register_shrinker(&cache->c_shrink)) {
+ kfree(cache->c_hash);
+ kfree(cache);
+ goto err_out;
+ }
INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 742942a983be..c2c3fd3277b5 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -253,7 +253,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error)
}
inode_init_owner(inode, dir, mode);
inode->i_ino = j;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_blocks = 0;
memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
insert_inode_hash(inode);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 31dcd515b9d5..7edc9b395700 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -274,7 +274,7 @@ got_it:
de->inode = inode->i_ino;
}
err = dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
out_put:
dir_put_page(page);
@@ -306,7 +306,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
unlock_page(page);
}
dir_put_page(page);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
mark_inode_dirty(inode);
return err;
}
@@ -430,7 +430,7 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page,
unlock_page(page);
}
dir_put_page(page);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
}
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 94f0eb9a6e2c..a6a4797aa0d4 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -26,7 +26,7 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *inode = d_inode(dentry);
int error;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index a731cabf1540..4c57c9af6946 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -124,7 +124,7 @@ static inline int splice_branch(struct inode *inode,
/* We are done with atomic stuff, now do the rest of housekeeping */
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
/* had we spliced it onto indirect block? */
if (where->bh)
@@ -343,7 +343,7 @@ do_indirects:
}
first_whole++;
}
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
}
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 2887d1d95ce2..1e0f11f5dac9 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -106,7 +106,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
inode_inc_link_count(inode);
ihold(inode);
return add_nondir(dentry, inode);
@@ -185,7 +185,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
}
static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
- struct inode * new_dir, struct dentry *new_dentry)
+ struct inode * new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode * old_inode = d_inode(old_dentry);
struct inode * new_inode = d_inode(new_dentry);
@@ -195,6 +196,9 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
struct minix_dir_entry * old_de;
int err = -ENOENT;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
old_de = minix_find_entry(old_dentry, &old_page);
if (!old_de)
goto out;
@@ -219,7 +223,7 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
if (!new_de)
goto out_dir;
minix_set_link(new_de, new_page, old_inode);
- new_inode->i_ctime = CURRENT_TIME_SEC;
+ new_inode->i_ctime = current_time(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
diff --git a/fs/mount.h b/fs/mount.h
index 14db05d424f7..d2e25d7b64b3 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,9 +10,12 @@ struct mnt_namespace {
struct mount * root;
struct list_head list;
struct user_namespace *user_ns;
+ struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
u64 event;
+ unsigned int mounts; /* # of mounts in the namespace */
+ unsigned int pending_mounts;
};
struct mnt_pcp {
diff --git a/fs/namei.c b/fs/namei.c
index adb04146df09..5b4eed221530 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1015,7 +1015,7 @@ const char *get_link(struct nameidata *nd)
if (!(nd->flags & LOOKUP_RCU)) {
touch_atime(&last->link);
cond_resched();
- } else if (atime_needs_update(&last->link, inode)) {
+ } else if (atime_needs_update_rcu(&last->link, inode)) {
if (unlikely(unlazy_walk(nd, NULL, 0)))
return ERR_PTR(-ECHILD);
touch_atime(&last->link);
@@ -4369,12 +4369,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error)
return error;
- if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
+ if (!old_dir->i_op->rename)
return -EPERM;
- if (flags && !old_dir->i_op->rename2)
- return -EINVAL;
-
/*
* If we are going to change the parent - check write permissions,
* we'll need to flip '..'.
@@ -4428,14 +4425,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error)
goto out;
}
- if (!old_dir->i_op->rename2) {
- error = old_dir->i_op->rename(old_dir, old_dentry,
- new_dir, new_dentry);
- } else {
- WARN_ON(old_dir->i_op->rename != NULL);
- error = old_dir->i_op->rename2(old_dir, old_dentry,
- new_dir, new_dentry, flags);
- }
+ error = old_dir->i_op->rename(old_dir, old_dentry,
+ new_dir, new_dentry, flags);
if (error)
goto out;
@@ -4677,6 +4668,31 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
}
EXPORT_SYMBOL(generic_readlink);
+/**
+ * vfs_get_link - get symlink body
+ * @dentry: dentry on which to get symbolic link
+ * @done: caller needs to free returned data with this
+ *
+ * Calls security hook and i_op->get_link() on the supplied inode.
+ *
+ * It does not touch atime. That's up to the caller if necessary.
+ *
+ * Does not work on "special" symlinks like /proc/$$/fd/N
+ */
+const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
+{
+ const char *res = ERR_PTR(-EINVAL);
+ struct inode *inode = d_inode(dentry);
+
+ if (d_is_symlink(dentry)) {
+ res = ERR_PTR(security_inode_readlink(dentry));
+ if (!res)
+ res = inode->i_op->get_link(dentry, inode, done);
+ }
+ return res;
+}
+EXPORT_SYMBOL(vfs_get_link);
+
/* get the link contents into pagecache */
const char *page_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback)
diff --git a/fs/namespace.c b/fs/namespace.c
index 7bb2cda3bfef..e6c234b1a645 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,9 @@
#include "pnode.h"
#include "internal.h"
+/* Maximum number of mounts in a mount namespace */
+unsigned int sysctl_mount_max __read_mostly = 100000;
+
static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
@@ -899,6 +902,9 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
list_splice(&head, n->list.prev);
+ n->mounts += n->pending_mounts;
+ n->pending_mounts = 0;
+
attach_shadowed(mnt, parent, shadows);
touch_mnt_namespace(n);
}
@@ -1419,11 +1425,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
propagate_umount(&tmp_list);
while (!list_empty(&tmp_list)) {
+ struct mnt_namespace *ns;
bool disconnect;
p = list_first_entry(&tmp_list, struct mount, mnt_list);
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
- __touch_mnt_namespace(p->mnt_ns);
+ ns = p->mnt_ns;
+ if (ns) {
+ ns->mounts--;
+ __touch_mnt_namespace(ns);
+ }
p->mnt_ns = NULL;
if (how & UMOUNT_SYNC)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
@@ -1840,6 +1851,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
return 0;
}
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
+{
+ unsigned int max = READ_ONCE(sysctl_mount_max);
+ unsigned int mounts = 0, old, pending, sum;
+ struct mount *p;
+
+ for (p = mnt; p; p = next_mnt(p, mnt))
+ mounts++;
+
+ old = ns->mounts;
+ pending = ns->pending_mounts;
+ sum = old + pending;
+ if ((old > sum) ||
+ (pending > sum) ||
+ (max < sum) ||
+ (mounts > (max - sum)))
+ return -ENOSPC;
+
+ ns->pending_mounts = pending + mounts;
+ return 0;
+}
+
/*
* @source_mnt : mount tree to be attached
* @nd : place the mount tree @source_mnt is attached
@@ -1909,10 +1942,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
struct path *parent_path)
{
HLIST_HEAD(tree_list);
+ struct mnt_namespace *ns = dest_mnt->mnt_ns;
struct mount *child, *p;
struct hlist_node *n;
int err;
+ /* Is there space to add these mounts to the mount namespace? */
+ if (!parent_path) {
+ err = count_mounts(ns, source_mnt);
+ if (err)
+ goto out;
+ }
+
if (IS_MNT_SHARED(dest_mnt)) {
err = invent_group_ids(source_mnt, true);
if (err)
@@ -1949,11 +1990,13 @@ static int attach_recursive_mnt(struct mount *source_mnt,
out_cleanup_ids:
while (!hlist_empty(&tree_list)) {
child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+ child->mnt_parent->mnt_ns->pending_mounts = 0;
umount_tree(child, UMOUNT_SYNC);
}
unlock_mount_hash();
cleanup_group_ids(source_mnt, NULL);
out:
+ ns->pending_mounts = 0;
return err;
}
@@ -2700,7 +2743,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
- MS_STRICTATIME);
+ MS_STRICTATIME | MS_NOREMOTELOCK);
if (flags & MS_REMOUNT)
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
@@ -2719,9 +2762,20 @@ dput_out:
return retval;
}
+static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
+}
+
+static void dec_mnt_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
+}
+
static void free_mnt_ns(struct mnt_namespace *ns)
{
ns_free_inum(&ns->ns);
+ dec_mnt_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
kfree(ns);
}
@@ -2738,14 +2792,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
{
struct mnt_namespace *new_ns;
+ struct ucounts *ucounts;
int ret;
+ ucounts = inc_mnt_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
- if (!new_ns)
+ if (!new_ns) {
+ dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
+ }
ret = ns_alloc_inum(&new_ns->ns);
if (ret) {
kfree(new_ns);
+ dec_mnt_namespaces(ucounts);
return ERR_PTR(ret);
}
new_ns->ns.ops = &mntns_operations;
@@ -2756,9 +2818,13 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
init_waitqueue_head(&new_ns->poll);
new_ns->event = 0;
new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->ucounts = ucounts;
+ new_ns->mounts = 0;
+ new_ns->pending_mounts = 0;
return new_ns;
}
+__latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
@@ -2805,6 +2871,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
q = new;
while (p) {
q->mnt_ns = new_ns;
+ new_ns->mounts++;
if (new_fs) {
if (&p->mnt == new_fs->root.mnt) {
new_fs->root.mnt = mntget(&q->mnt);
@@ -2843,6 +2910,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
struct mount *mnt = real_mount(m);
mnt->mnt_ns = new_ns;
new_ns->root = mnt;
+ new_ns->mounts++;
list_add(&mnt->mnt_list, &new_ns->list);
} else {
mntput(m);
@@ -3348,10 +3416,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0;
}
+static struct user_namespace *mntns_owner(struct ns_common *ns)
+{
+ return to_mnt_ns(ns)->user_ns;
+}
+
const struct proc_ns_operations mntns_operations = {
.name = "mnt",
.type = CLONE_NEWNS,
.get = mntns_get,
.put = mntns_put,
.install = mntns_install,
+ .owner = mntns_owner,
};
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 17de5c13dfae..6df2a3827574 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -36,7 +36,7 @@ static int ncp_unlink(struct inode *, struct dentry *);
static int ncp_mkdir(struct inode *, struct dentry *, umode_t);
static int ncp_rmdir(struct inode *, struct dentry *);
static int ncp_rename(struct inode *, struct dentry *,
- struct inode *, struct dentry *);
+ struct inode *, struct dentry *, unsigned int);
static int ncp_mknod(struct inode * dir, struct dentry *dentry,
umode_t mode, dev_t rdev);
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
@@ -133,12 +133,11 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
return 0;
if (!ncp_case_sensitive(inode)) {
- struct super_block *sb = dentry->d_sb;
struct nls_table *t;
unsigned long hash;
int i;
- t = NCP_IO_TABLE(sb);
+ t = NCP_IO_TABLE(dentry->d_sb);
hash = init_name_hash(dentry);
for (i=0; i<this->len ; i++)
hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -1106,13 +1105,17 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
}
static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct ncp_server *server = NCP_SERVER(old_dir);
int error;
int old_len, new_len;
__u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1];
+ if (flags)
+ return -EINVAL;
+
ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry);
ncp_age_dentry(server, old_dentry);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 1af15fcbe57b..f6cf4c7e92b1 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -884,7 +884,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
/* ageing the dentry to force validation */
ncp_age_dentry(server, dentry);
- result = inode_change_ok(inode, attr);
+ result = setattr_prepare(dentry, attr);
if (result < 0)
goto out;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 217847679f0e..2905479f214a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -344,9 +344,10 @@ static void bl_write_cleanup(struct work_struct *work)
u64 start = hdr->args.offset & (loff_t)PAGE_MASK;
u64 end = (hdr->args.offset + hdr->args.count +
PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
+ u64 lwb = hdr->args.offset + hdr->args.count;
ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
- (end - start) >> SECTOR_SHIFT, end);
+ (end - start) >> SECTOR_SHIFT, lwb);
}
pnfs_ld_write_done(hdr);
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index 5f7b053720ee..6de15709d024 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -76,7 +76,7 @@ static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
- complete_all(&dreq->completion);
+ complete(&dreq->completion);
nfs_cache_defer_req_put(dreq);
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 52a28311e2a4..532d8e242d4d 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -31,8 +31,6 @@
struct nfs_callback_data {
unsigned int users;
struct svc_serv *serv;
- struct svc_rqst *rqst;
- struct task_struct *task;
};
static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
@@ -89,15 +87,6 @@ nfs4_callback_svc(void *vrqstp)
return 0;
}
-/*
- * Prepare to bring up the NFSv4 callback service
- */
-static struct svc_rqst *
-nfs4_callback_up(struct svc_serv *serv)
-{
- return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-}
-
#if defined(CONFIG_NFS_V4_1)
/*
* The callback service for NFSv4.1 callbacks
@@ -139,29 +128,6 @@ nfs41_callback_svc(void *vrqstp)
return 0;
}
-/*
- * Bring up the NFSv4.1 callback service
- */
-static struct svc_rqst *
-nfs41_callback_up(struct svc_serv *serv)
-{
- struct svc_rqst *rqstp;
-
- INIT_LIST_HEAD(&serv->sv_cb_list);
- spin_lock_init(&serv->sv_cb_lock);
- init_waitqueue_head(&serv->sv_cb_waitq);
- rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
- dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
- return rqstp;
-}
-
-static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
- struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
-{
- *rqstpp = nfs41_callback_up(serv);
- *callback_svc = nfs41_callback_svc;
-}
-
static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
struct svc_serv *serv)
{
@@ -173,13 +139,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
xprt->bc_serv = serv;
}
#else
-static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
- struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
-{
- *rqstpp = ERR_PTR(-ENOTSUPP);
- *callback_svc = ERR_PTR(-ENOTSUPP);
-}
-
static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
struct svc_serv *serv)
{
@@ -189,45 +148,22 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
struct svc_serv *serv)
{
- struct svc_rqst *rqstp;
- int (*callback_svc)(void *vrqstp);
- struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+ int nrservs = nfs_callback_nr_threads;
int ret;
nfs_callback_bc_serv(minorversion, xprt, serv);
- if (cb_info->task)
- return 0;
+ if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS)
+ nrservs = NFS4_MIN_NR_CALLBACK_THREADS;
- switch (minorversion) {
- case 0:
- /* v4.0 callback setup */
- rqstp = nfs4_callback_up(serv);
- callback_svc = nfs4_callback_svc;
- break;
- default:
- nfs_minorversion_callback_svc_setup(serv,
- &rqstp, &callback_svc);
- }
-
- if (IS_ERR(rqstp))
- return PTR_ERR(rqstp);
-
- svc_sock_update_bufs(serv);
+ if (serv->sv_nrthreads-1 == nrservs)
+ return 0;
- cb_info->serv = serv;
- cb_info->rqst = rqstp;
- cb_info->task = kthread_create(callback_svc, cb_info->rqst,
- "nfsv4.%u-svc", minorversion);
- if (IS_ERR(cb_info->task)) {
- ret = PTR_ERR(cb_info->task);
- svc_exit_thread(cb_info->rqst);
- cb_info->rqst = NULL;
- cb_info->task = NULL;
+ ret = serv->sv_ops->svo_setup(serv, NULL, nrservs);
+ if (ret) {
+ serv->sv_ops->svo_setup(serv, NULL, 0);
return ret;
}
- rqstp->rq_task = cb_info->task;
- wake_up_process(cb_info->task);
dprintk("nfs_callback_up: service started\n");
return 0;
}
@@ -281,19 +217,41 @@ err_bind:
return ret;
}
-static struct svc_serv_ops nfs_cb_sv_ops = {
+static struct svc_serv_ops nfs40_cb_sv_ops = {
+ .svo_function = nfs4_callback_svc,
.svo_enqueue_xprt = svc_xprt_do_enqueue,
+ .svo_setup = svc_set_num_threads,
+ .svo_module = THIS_MODULE,
+};
+#if defined(CONFIG_NFS_V4_1)
+static struct svc_serv_ops nfs41_cb_sv_ops = {
+ .svo_function = nfs41_callback_svc,
+ .svo_enqueue_xprt = svc_xprt_do_enqueue,
+ .svo_setup = svc_set_num_threads,
+ .svo_module = THIS_MODULE,
+};
+
+struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+ [0] = &nfs40_cb_sv_ops,
+ [1] = &nfs41_cb_sv_ops,
+};
+#else
+struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+ [0] = &nfs40_cb_sv_ops,
+ [1] = NULL,
};
+#endif
static struct svc_serv *nfs_callback_create_svc(int minorversion)
{
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
struct svc_serv *serv;
+ struct svc_serv_ops *sv_ops;
/*
* Check whether we're already up and running.
*/
- if (cb_info->task) {
+ if (cb_info->serv) {
/*
* Note: increase service usage, because later in case of error
* svc_destroy() will be called.
@@ -302,6 +260,17 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
return cb_info->serv;
}
+ switch (minorversion) {
+ case 0:
+ sv_ops = nfs4_cb_sv_ops[0];
+ break;
+ default:
+ sv_ops = nfs4_cb_sv_ops[1];
+ }
+
+ if (sv_ops == NULL)
+ return ERR_PTR(-ENOTSUPP);
+
/*
* Sanity check: if there's no task,
* we should be the first user ...
@@ -310,11 +279,12 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
cb_info->users);
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops);
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
if (!serv) {
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
return ERR_PTR(-ENOMEM);
}
+ cb_info->serv = serv;
/* As there is only one thread we need to over-ride the
* default maximum of 80 connections
*/
@@ -357,6 +327,8 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
* thread exits.
*/
err_net:
+ if (!cb_info->users)
+ cb_info->serv = NULL;
svc_destroy(serv);
err_create:
mutex_unlock(&nfs_callback_mutex);
@@ -374,18 +346,18 @@ err_start:
void nfs_callback_down(int minorversion, struct net *net)
{
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+ struct svc_serv *serv;
mutex_lock(&nfs_callback_mutex);
- nfs_callback_down_net(minorversion, cb_info->serv, net);
+ serv = cb_info->serv;
+ nfs_callback_down_net(minorversion, serv, net);
cb_info->users--;
- if (cb_info->users == 0 && cb_info->task != NULL) {
- kthread_stop(cb_info->task);
- dprintk("nfs_callback_down: service stopped\n");
- svc_exit_thread(cb_info->rqst);
+ if (cb_info->users == 0) {
+ svc_get(serv);
+ serv->sv_ops->svo_setup(serv, NULL, 0);
+ svc_destroy(serv);
dprintk("nfs_callback_down: service destroyed\n");
cb_info->serv = NULL;
- cb_info->rqst = NULL;
- cb_info->task = NULL;
}
mutex_unlock(&nfs_callback_mutex);
}
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 5fe1cecbf9f0..c701c308fac5 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -179,6 +179,15 @@ extern __be32 nfs4_callback_devicenotify(
struct cb_devicenotifyargs *args,
void *dummy, struct cb_process_state *cps);
+struct cb_notify_lock_args {
+ struct nfs_fh cbnl_fh;
+ struct nfs_lowner cbnl_owner;
+ bool cbnl_valid;
+};
+
+extern __be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args,
+ void *dummy,
+ struct cb_process_state *cps);
#endif /* CONFIG_NFS_V4_1 */
extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
@@ -198,6 +207,9 @@ extern void nfs_callback_down(int minorversion, struct net *net);
#define NFS41_BC_MIN_CALLBACKS 1
#define NFS41_BC_MAX_CALLBACKS 1
+#define NFS4_MIN_NR_CALLBACK_THREADS 1
+
extern unsigned int nfs_callback_set_tcpport;
+extern unsigned short nfs_callback_nr_threads;
#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f953ef6b2f2e..e9aa235e9d10 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -628,4 +628,20 @@ out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
}
+
+__be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, void *dummy,
+ struct cb_process_state *cps)
+{
+ if (!cps->clp) /* set in cb_sequence */
+ return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+
+ dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+ /* Don't wake anybody if the string looked bogus */
+ if (args->cbnl_valid)
+ __wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args);
+
+ return htonl(NFS4_OK);
+}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 656f68f7fe53..eb094c6011d8 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -35,6 +35,7 @@
(1 + 3) * 4) // seqid, 3 slotids
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_NOTIFY_LOCK_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#endif /* CONFIG_NFS_V4_1 */
#define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -72,7 +73,7 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
return xdr_ressize_check(rqstp, p);
}
-static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
+static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes)
{
__be32 *p;
@@ -534,6 +535,49 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
return 0;
}
+static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_args *args)
+{
+ __be32 *p;
+ unsigned int len;
+
+ p = read_buf(xdr, 12);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
+
+ p = xdr_decode_hyper(p, &args->cbnl_owner.clientid);
+ len = be32_to_cpu(*p);
+
+ p = read_buf(xdr, len);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
+
+ /* Only try to decode if the length is right */
+ if (len == 20) {
+ p += 2; /* skip "lock id:" */
+ args->cbnl_owner.s_dev = be32_to_cpu(*p++);
+ xdr_decode_hyper(p, &args->cbnl_owner.id);
+ args->cbnl_valid = true;
+ } else {
+ args->cbnl_owner.s_dev = 0;
+ args->cbnl_owner.id = 0;
+ args->cbnl_valid = false;
+ }
+ return 0;
+}
+
+static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_notify_lock_args *args)
+{
+ __be32 status;
+
+ status = decode_fh(xdr, &args->cbnl_fh);
+ if (unlikely(status != 0))
+ goto out;
+ status = decode_lockowner(xdr, args);
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+ return status;
+}
+
#endif /* CONFIG_NFS_V4_1 */
static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -746,6 +790,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_RECALL_SLOT:
case OP_CB_LAYOUTRECALL:
case OP_CB_NOTIFY_DEVICEID:
+ case OP_CB_NOTIFY_LOCK:
*op = &callback_ops[op_nr];
break;
@@ -753,7 +798,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_PUSH_DELEG:
case OP_CB_RECALLABLE_OBJ_AVAIL:
case OP_CB_WANTS_CANCELLED:
- case OP_CB_NOTIFY_LOCK:
return htonl(NFS4ERR_NOTSUPP);
default:
@@ -1006,6 +1050,11 @@ static struct callback_op callback_ops[] = {
.decode_args = (callback_decode_arg_t)decode_recallslot_args,
.res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
},
+ [OP_CB_NOTIFY_LOCK] = {
+ .process_op = (callback_process_op_t)nfs4_callback_notify_lock,
+ .decode_args = (callback_decode_arg_t)decode_notify_lock_args,
+ .res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ,
+ },
#endif /* CONFIG_NFS_V4_1 */
};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1e106780a237..7555ba889d1f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -313,7 +313,10 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
continue;
/* Match the full socket address */
if (!rpc_cmp_addr_port(sap, clap))
- continue;
+ /* Match all xprt_switch full socket addresses */
+ if (!rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient,
+ sap))
+ continue;
atomic_inc(&clp->cl_count);
return clp;
@@ -785,7 +788,8 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
}
fsinfo.fattr = fattr;
- fsinfo.layouttype = 0;
+ fsinfo.nlayouttypes = 0;
+ memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype));
error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
if (error < 0)
goto out_error;
@@ -1078,7 +1082,7 @@ void nfs_clients_init(struct net *net)
idr_init(&nn->cb_ident_idr);
#endif
spin_lock_init(&nn->nfs_client_lock);
- nn->boot_time = CURRENT_TIME;
+ nn->boot_time = ktime_get_real();
}
#ifdef CONFIG_PROC_FS
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 322c2585bc34..dff600ae0d74 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -41,6 +41,17 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
}
+static bool
+nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
+ fmode_t flags)
+{
+ if (delegation != NULL && (delegation->type & flags) == flags &&
+ !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
+ !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+ return true;
+ return false;
+}
+
static int
nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
{
@@ -50,8 +61,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
flags &= FMODE_READ|FMODE_WRITE;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
- if (delegation != NULL && (delegation->type & flags) == flags &&
- !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ if (nfs4_is_valid_delegation(delegation, flags)) {
if (mark)
nfs_mark_delegation_referenced(delegation);
ret = 1;
@@ -185,15 +195,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
rcu_read_unlock();
put_rpccred(oldcred);
trace_nfs4_reclaim_delegation(inode, res->delegation_type);
- } else {
- /* We appear to have raced with a delegation return. */
- spin_unlock(&delegation->lock);
- rcu_read_unlock();
- nfs_inode_set_delegation(inode, cred, res);
+ return;
}
- } else {
- rcu_read_unlock();
+ /* We appear to have raced with a delegation return. */
+ spin_unlock(&delegation->lock);
}
+ rcu_read_unlock();
+ nfs_inode_set_delegation(inode, cred, res);
}
static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -642,28 +650,49 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl
rcu_read_unlock();
}
-static void nfs_revoke_delegation(struct inode *inode)
+static void nfs_mark_delegation_revoked(struct nfs_server *server,
+ struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+ delegation->stateid.type = NFS4_INVALID_STATEID_TYPE;
+ nfs_mark_return_delegation(server, delegation);
+}
+
+static bool nfs_revoke_delegation(struct inode *inode,
+ const nfs4_stateid *stateid)
{
struct nfs_delegation *delegation;
+ nfs4_stateid tmp;
+ bool ret = false;
+
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
- if (delegation != NULL) {
- set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
- nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
- }
+ if (delegation == NULL)
+ goto out;
+ if (stateid == NULL) {
+ nfs4_stateid_copy(&tmp, &delegation->stateid);
+ stateid = &tmp;
+ } else if (!nfs4_stateid_match(stateid, &delegation->stateid))
+ goto out;
+ nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation);
+ ret = true;
+out:
rcu_read_unlock();
+ if (ret)
+ nfs_inode_find_state_and_recover(inode, stateid);
+ return ret;
}
-void nfs_remove_bad_delegation(struct inode *inode)
+void nfs_remove_bad_delegation(struct inode *inode,
+ const nfs4_stateid *stateid)
{
struct nfs_delegation *delegation;
- nfs_revoke_delegation(inode);
+ if (!nfs_revoke_delegation(inode, stateid))
+ return;
delegation = nfs_inode_detach_delegation(inode);
- if (delegation) {
- nfs_inode_find_state_and_recover(inode, &delegation->stateid);
+ if (delegation)
nfs_free_delegation(delegation);
- }
}
EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
@@ -786,8 +815,15 @@ static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
{
struct nfs_delegation *delegation;
- list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ /*
+ * If the delegation may have been admin revoked, then we
+ * cannot reclaim it.
+ */
+ if (test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags))
+ continue;
set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ }
}
/**
@@ -851,6 +887,141 @@ restart:
rcu_read_unlock();
}
+static inline bool nfs4_server_rebooted(const struct nfs_client *clp)
+{
+ return (clp->cl_state & (BIT(NFS4CLNT_CHECK_LEASE) |
+ BIT(NFS4CLNT_LEASE_EXPIRED) |
+ BIT(NFS4CLNT_SESSION_RESET))) != 0;
+}
+
+static void nfs_mark_test_expired_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
+{
+ if (delegation->stateid.type == NFS4_INVALID_STATEID_TYPE)
+ return;
+ clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state);
+}
+
+static void nfs_inode_mark_test_expired_delegation(struct nfs_server *server,
+ struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation)
+ nfs_mark_test_expired_delegation(server, delegation);
+ rcu_read_unlock();
+
+}
+
+static void nfs_delegation_mark_test_expired_server(struct nfs_server *server)
+{
+ struct nfs_delegation *delegation;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+ nfs_mark_test_expired_delegation(server, delegation);
+}
+
+/**
+ * nfs_mark_test_expired_all_delegations - mark all delegations for testing
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * marks them as needing to be checked for validity.
+ */
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_delegation_mark_test_expired_server(server);
+ rcu_read_unlock();
+}
+
+/**
+ * nfs_reap_expired_delegations - reap expired delegations
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * checks if they have may have been revoked. This function is usually
+ * expected to be called in cases where the server may have lost its
+ * lease.
+ */
+void nfs_reap_expired_delegations(struct nfs_client *clp)
+{
+ const struct nfs4_minor_version_ops *ops = clp->cl_mvops;
+ struct nfs_delegation *delegation;
+ struct nfs_server *server;
+ struct inode *inode;
+ struct rpc_cred *cred;
+ nfs4_stateid stateid;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ list_for_each_entry_rcu(delegation, &server->delegations,
+ super_list) {
+ if (test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags))
+ continue;
+ if (test_bit(NFS_DELEGATION_TEST_EXPIRED,
+ &delegation->flags) == 0)
+ continue;
+ if (!nfs_sb_active(server->super))
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL) {
+ rcu_read_unlock();
+ nfs_sb_deactive(server->super);
+ goto restart;
+ }
+ cred = get_rpccred_rcu(delegation->cred);
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+ rcu_read_unlock();
+ if (cred != NULL &&
+ ops->test_and_free_expired(server, &stateid, cred) < 0) {
+ nfs_revoke_delegation(inode, &stateid);
+ nfs_inode_find_state_and_recover(inode, &stateid);
+ }
+ put_rpccred(cred);
+ if (nfs4_server_rebooted(clp)) {
+ nfs_inode_mark_test_expired_delegation(server,inode);
+ iput(inode);
+ nfs_sb_deactive(server->super);
+ return;
+ }
+ iput(inode);
+ nfs_sb_deactive(server->super);
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ struct nfs_delegation *delegation;
+ bool found = false;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation &&
+ nfs4_stateid_match_other(&delegation->stateid, stateid)) {
+ nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation);
+ found = true;
+ }
+ rcu_read_unlock();
+ if (found)
+ nfs4_schedule_state_manager(clp);
+}
+
/**
* nfs_delegations_present - check for existence of delegations
* @clp: client state handle
@@ -893,7 +1064,7 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
flags &= FMODE_READ|FMODE_WRITE;
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
- ret = (delegation != NULL && (delegation->type & flags) == flags);
+ ret = nfs4_is_valid_delegation(delegation, flags);
if (ret) {
nfs4_stateid_copy(dst, &delegation->stateid);
nfs_mark_delegation_referenced(delegation);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 64724d252a79..e9d555796873 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -32,6 +32,7 @@ enum {
NFS_DELEGATION_REFERENCED,
NFS_DELEGATION_RETURNING,
NFS_DELEGATION_REVOKED,
+ NFS_DELEGATION_TEST_EXPIRED,
};
int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
@@ -47,11 +48,14 @@ void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags);
void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
int nfs_client_return_marked_delegations(struct nfs_client *clp);
int nfs_delegations_present(struct nfs_client *clp);
-void nfs_remove_bad_delegation(struct inode *inode);
+void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid);
void nfs_delegation_mark_reclaim(struct nfs_client *clp);
void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp);
+void nfs_reap_expired_delegations(struct nfs_client *clp);
+
/* NFSv4 delegation-related procedures */
int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
@@ -62,6 +66,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
int nfs4_check_delegation(struct inode *inode, fmode_t flags);
bool nfs4_delegation_flush_on_close(const struct inode *inode);
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid);
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 177fefb26c18..5f1af4cd1a33 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -435,11 +435,11 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
return 0;
nfsi = NFS_I(inode);
- if (entry->fattr->fileid == nfsi->fileid)
- return 1;
- if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
- return 1;
- return 0;
+ if (entry->fattr->fileid != nfsi->fileid)
+ return 0;
+ if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0)
+ return 0;
+ return 1;
}
static
@@ -496,6 +496,14 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
return;
if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
return;
+ if (filename.len == 0)
+ return;
+ /* Validate that the name doesn't contain any illegal '\0' */
+ if (strnlen(filename.name, filename.len) != filename.len)
+ return;
+ /* ...or '/' */
+ if (strnchr(filename.name, filename.len, '/'))
+ return;
if (filename.name[0] == '.') {
if (filename.len == 1)
return;
@@ -517,6 +525,8 @@ again:
&entry->fattr->fsid))
goto out;
if (nfs_same_file(dentry, entry)) {
+ if (!entry->fh->size)
+ goto out;
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
if (!status)
@@ -529,6 +539,10 @@ again:
goto again;
}
}
+ if (!entry->fh->size) {
+ d_lookup_done(dentry);
+ goto out;
+ }
inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
alias = d_splice_alias(inode, dentry);
@@ -2013,7 +2027,8 @@ EXPORT_SYMBOL_GPL(nfs_link);
* the rename.
*/
int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
@@ -2021,6 +2036,9 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct rpc_task *task;
int error = -EBUSY;
+ if (flags)
+ return -EINVAL;
+
dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
old_dentry, new_dentry,
d_count(new_dentry));
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 72b7d13ee3c6..bd81bcf3ffcf 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -387,7 +387,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
dreq->iocb->ki_complete(dreq->iocb, res, 0);
}
- complete_all(&dreq->completion);
+ complete(&dreq->completion);
nfs_direct_req_release(dreq);
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ca699ddc11c1..9ea85ae23c32 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -182,29 +182,6 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
}
EXPORT_SYMBOL_GPL(nfs_file_read);
-ssize_t
-nfs_file_splice_read(struct file *filp, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t count,
- unsigned int flags)
-{
- struct inode *inode = file_inode(filp);
- ssize_t res;
-
- dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
- filp, (unsigned long) count, (unsigned long long) *ppos);
-
- nfs_start_io_read(inode);
- res = nfs_revalidate_mapping(inode, filp->f_mapping);
- if (!res) {
- res = generic_file_splice_read(filp, ppos, pipe, count, flags);
- if (res > 0)
- nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
- }
- nfs_end_io_read(inode);
- return res;
-}
-EXPORT_SYMBOL_GPL(nfs_file_splice_read);
-
int
nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
{
@@ -543,7 +520,9 @@ const struct address_space_operations nfs_file_aops = {
.invalidatepage = nfs_invalidate_page,
.releasepage = nfs_release_page,
.direct_IO = nfs_direct_IO,
+#ifdef CONFIG_MIGRATION
.migratepage = nfs_migrate_page,
+#endif
.launder_page = nfs_launder_page,
.is_dirty_writeback = nfs_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
@@ -708,11 +687,6 @@ out_noconflict:
goto out;
}
-static int do_vfs_lock(struct file *file, struct file_lock *fl)
-{
- return locks_lock_file_wait(file, fl);
-}
-
static int
do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{
@@ -745,7 +719,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
if (!is_local)
status = NFS_PROTO(inode)->lock(filp, cmd, fl);
else
- status = do_vfs_lock(filp, fl);
+ status = locks_lock_file_wait(filp, fl);
return status;
}
@@ -770,7 +744,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
if (!is_local)
status = NFS_PROTO(inode)->lock(filp, cmd, fl);
else
- status = do_vfs_lock(filp, fl);
+ status = locks_lock_file_wait(filp, fl);
if (status < 0)
goto out;
@@ -871,7 +845,7 @@ const struct file_operations nfs_file_operations = {
.fsync = nfs_file_fsync,
.lock = nfs_lock,
.flock = nfs_flock,
- .splice_read = nfs_file_splice_read,
+ .splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 51b51369704c..98ace127bf86 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1080,7 +1080,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
case -NFS4ERR_BAD_STATEID:
if (state == NULL)
break;
- nfs_remove_bad_delegation(state->inode);
+ nfs_remove_bad_delegation(state->inode, NULL);
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 74935a19e4bf..80bcc0befb07 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -359,14 +359,13 @@ int nfs_unlink(struct inode *, struct dentry *);
int nfs_symlink(struct inode *, struct dentry *, const char *);
int nfs_link(struct dentry *, struct inode *, struct dentry *);
int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
-int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
+int nfs_rename(struct inode *, struct dentry *,
+ struct inode *, struct dentry *, unsigned int);
/* file.c */
int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
loff_t nfs_file_llseek(struct file *, loff_t, int);
ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
-ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
- size_t, unsigned int);
int nfs_file_mmap(struct file *, struct vm_area_struct *);
ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
int nfs_file_release(struct inode *, struct file *);
@@ -535,12 +534,9 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
}
#endif
-
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
struct page *, struct page *, enum migrate_mode);
-#else
-#define nfs_migrate_page NULL
#endif
static inline int
@@ -563,7 +559,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
/* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct nfs_client_initdata *);
extern int nfs40_walk_client_list(struct nfs_client *clp,
@@ -572,6 +567,9 @@ extern int nfs40_walk_client_list(struct nfs_client *clp,
extern int nfs41_walk_client_list(struct nfs_client *clp,
struct nfs_client **result,
struct rpc_cred *cred);
+extern int nfs4_test_session_trunk(struct rpc_clnt *,
+ struct rpc_xprt *,
+ void *);
static inline struct inode *nfs_igrab_and_active(struct inode *inode)
{
@@ -681,11 +679,11 @@ unsigned int nfs_page_length(struct page *page)
loff_t i_size = i_size_read(page_file_mapping(page)->host);
if (i_size > 0) {
- pgoff_t page_index = page_file_index(page);
+ pgoff_t index = page_index(page);
pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
- if (page_index < end_index)
+ if (index < end_index)
return PAGE_SIZE;
- if (page_index == end_index)
+ if (index == end_index)
return ((i_size - 1) & ~PAGE_MASK) + 1;
}
return 0;
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index f0e06e4acbef..fbce0d885d4c 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -29,7 +29,7 @@ struct nfs_net {
int cb_users[NFS4_MAX_MINOR_VERSION + 1];
#endif
spinlock_t nfs_client_lock;
- struct timespec boot_time;
+ ktime_t boot_time;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *proc_nfsfs;
#endif
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 698be9361280..dc925b531f32 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -899,9 +899,6 @@ static const struct inode_operations nfs3_dir_inode_operations = {
.setattr = nfs_setattr,
#ifdef CONFIG_NFS_V3_ACL
.listxattr = nfs3_listxattr,
- .getxattr = generic_getxattr,
- .setxattr = generic_setxattr,
- .removexattr = generic_removexattr,
.get_acl = nfs3_get_acl,
.set_acl = nfs3_set_acl,
#endif
@@ -913,9 +910,6 @@ static const struct inode_operations nfs3_file_inode_operations = {
.setattr = nfs_setattr,
#ifdef CONFIG_NFS_V3_ACL
.listxattr = nfs3_listxattr,
- .getxattr = generic_getxattr,
- .setxattr = generic_setxattr,
- .removexattr = generic_removexattr,
.get_acl = nfs3_get_acl,
.set_acl = nfs3_set_acl,
#endif
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 64b43b4ad9dd..608501971fe0 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -443,6 +443,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
task = rpc_run_task(&task_setup);
if (IS_ERR(task))
return PTR_ERR(task);
+ rpc_put_task(task);
return 0;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9bf64eacba5b..9b3a82abab07 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -39,6 +39,7 @@ enum nfs4_client_state {
NFS4CLNT_BIND_CONN_TO_SESSION,
NFS4CLNT_MOVED,
NFS4CLNT_LEASE_MOVED,
+ NFS4CLNT_DELEGATION_EXPIRED,
};
#define NFS4_RENEW_TIMEOUT 0x01
@@ -57,8 +58,11 @@ struct nfs4_minor_version_ops {
struct nfs_fsinfo *);
void (*free_lock_state)(struct nfs_server *,
struct nfs4_lock_state *);
+ int (*test_and_free_expired)(struct nfs_server *,
+ nfs4_stateid *, struct rpc_cred *);
struct nfs_seqid *
(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+ int (*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *);
const struct rpc_call_ops *call_sync_ops;
const struct nfs4_state_recovery_ops *reboot_recovery_ops;
const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -156,6 +160,7 @@ enum {
NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */
NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */
+ NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */
};
struct nfs4_state {
@@ -203,6 +208,11 @@ struct nfs4_state_recovery_ops {
struct rpc_cred *);
};
+struct nfs4_add_xprt_data {
+ struct nfs_client *clp;
+ struct rpc_cred *cred;
+};
+
struct nfs4_state_maintenance_ops {
int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
@@ -278,6 +288,8 @@ extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
bool sync);
+extern int nfs4_detect_session_trunking(struct nfs_client *clp,
+ struct nfs41_exchange_id_res *res, struct rpc_xprt *xprt);
static inline bool
is_ds_only_client(struct nfs_client *clp)
@@ -439,7 +451,7 @@ extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
-extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
+extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, bool);
extern void nfs41_handle_server_scope(struct nfs_client *,
struct nfs41_server_scope **);
extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
@@ -471,6 +483,7 @@ extern struct nfs_subversion nfs_v4;
struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *);
extern bool nfs4_disable_idmapping;
extern unsigned short max_session_slots;
+extern unsigned short max_session_cb_slots;
extern unsigned short send_implementation_id;
extern bool recover_lost_locks;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index cd3b7cfdde16..074ac7131459 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -199,6 +199,9 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_minorversion = cl_init->minorversion;
clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
clp->cl_mig_gen = 1;
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+ init_waitqueue_head(&clp->cl_lock_waitq);
+#endif
return clp;
error:
@@ -562,15 +565,15 @@ out:
/*
* Returns true if the client IDs match
*/
-static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
+static bool nfs4_match_clientids(u64 a, u64 b)
{
- if (a->cl_clientid != b->cl_clientid) {
+ if (a != b) {
dprintk("NFS: --> %s client ID %llx does not match %llx\n",
- __func__, a->cl_clientid, b->cl_clientid);
+ __func__, a, b);
return false;
}
dprintk("NFS: --> %s client ID %llx matches %llx\n",
- __func__, a->cl_clientid, b->cl_clientid);
+ __func__, a, b);
return true;
}
@@ -578,17 +581,15 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
* Returns true if the server major ids match
*/
static bool
-nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b)
+nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1,
+ struct nfs41_server_owner *o2)
{
- struct nfs41_server_owner *o1 = a->cl_serverowner;
- struct nfs41_server_owner *o2 = b->cl_serverowner;
-
if (o1->major_id_sz != o2->major_id_sz)
goto out_major_mismatch;
if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
goto out_major_mismatch;
- dprintk("NFS: --> %s server owners match\n", __func__);
+ dprintk("NFS: --> %s server owner major IDs match\n", __func__);
return true;
out_major_mismatch:
@@ -597,6 +598,100 @@ out_major_mismatch:
return false;
}
+/*
+ * Returns true if server minor ids match
+ */
+static bool
+nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1,
+ struct nfs41_server_owner *o2)
+{
+ /* Check eir_server_owner so_minor_id */
+ if (o1->minor_id != o2->minor_id)
+ goto out_minor_mismatch;
+
+ dprintk("NFS: --> %s server owner minor IDs match\n", __func__);
+ return true;
+
+out_minor_mismatch:
+ dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__);
+ return false;
+}
+
+/*
+ * Returns true if the server scopes match
+ */
+static bool
+nfs4_check_server_scope(struct nfs41_server_scope *s1,
+ struct nfs41_server_scope *s2)
+{
+ if (s1->server_scope_sz != s2->server_scope_sz)
+ goto out_scope_mismatch;
+ if (memcmp(s1->server_scope, s2->server_scope,
+ s1->server_scope_sz) != 0)
+ goto out_scope_mismatch;
+
+ dprintk("NFS: --> %s server scopes match\n", __func__);
+ return true;
+
+out_scope_mismatch:
+ dprintk("NFS: --> %s server scopes do not match\n",
+ __func__);
+ return false;
+}
+
+/**
+ * nfs4_detect_session_trunking - Checks for session trunking.
+ *
+ * Called after a successful EXCHANGE_ID on a multi-addr connection.
+ * Upon success, add the transport.
+ *
+ * @clp: original mount nfs_client
+ * @res: result structure from an exchange_id using the original mount
+ * nfs_client with a new multi_addr transport
+ *
+ * Returns zero on success, otherwise -EINVAL
+ *
+ * Note: since the exchange_id for the new multi_addr transport uses the
+ * same nfs_client from the original mount, the cl_owner_id is reused,
+ * so eir_clientowner is the same.
+ */
+int nfs4_detect_session_trunking(struct nfs_client *clp,
+ struct nfs41_exchange_id_res *res,
+ struct rpc_xprt *xprt)
+{
+ /* Check eir_clientid */
+ if (!nfs4_match_clientids(clp->cl_clientid, res->clientid))
+ goto out_err;
+
+ /* Check eir_server_owner so_major_id */
+ if (!nfs4_check_serverowner_major_id(clp->cl_serverowner,
+ res->server_owner))
+ goto out_err;
+
+ /* Check eir_server_owner so_minor_id */
+ if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner,
+ res->server_owner))
+ goto out_err;
+
+ /* Check eir_server_scope */
+ if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope))
+ goto out_err;
+
+ /* Session trunking passed, add the xprt */
+ rpc_clnt_xprt_switch_add_xprt(clp->cl_rpcclient, xprt);
+
+ pr_info("NFS: %s: Session trunking succeeded for %s\n",
+ clp->cl_hostname,
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+ return 0;
+out_err:
+ pr_info("NFS: %s: Session trunking failed for %s\n", clp->cl_hostname,
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+ return -EINVAL;
+}
+
/**
* nfs41_walk_client_list - Find nfs_client that matches a client/server owner
*
@@ -650,7 +745,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (pos->cl_cons_state != NFS_CS_READY)
continue;
- if (!nfs4_match_clientids(pos, new))
+ if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid))
continue;
/*
@@ -658,7 +753,8 @@ int nfs41_walk_client_list(struct nfs_client *new,
* client id trunking. In either case, we want to fall back
* to using the existing nfs_client.
*/
- if (!nfs4_check_clientid_trunking(pos, new))
+ if (!nfs4_check_serverowner_major_id(pos->cl_serverowner,
+ new->cl_serverowner))
continue;
/* Unlike NFSv4.0, we know that NFSv4.1 always uses the
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index d085ad794884..89a77950e0b0 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -248,7 +248,7 @@ const struct file_operations nfs4_file_operations = {
.fsync = nfs_file_fsync,
.lock = nfs_lock,
.flock = nfs_flock,
- .splice_read = nfs_file_splice_read,
+ .splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a9dec32ba9ba..7897826d7c51 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -99,8 +99,8 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
#ifdef CONFIG_NFS_V4_1
static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
struct rpc_cred *);
-static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
- struct rpc_cred *);
+static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
+ struct rpc_cred *, bool);
#endif
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
@@ -328,6 +328,33 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start);
}
+static void nfs4_test_and_free_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
+{
+ const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
+
+ ops->test_and_free_expired(server, stateid, cred);
+}
+
+static void __nfs4_free_revoked_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
+{
+ stateid->type = NFS4_REVOKED_STATEID_TYPE;
+ nfs4_test_and_free_stateid(server, stateid, cred);
+}
+
+static void nfs4_free_revoked_stateid(struct nfs_server *server,
+ const nfs4_stateid *stateid,
+ struct rpc_cred *cred)
+{
+ nfs4_stateid tmp;
+
+ nfs4_stateid_copy(&tmp, stateid);
+ __nfs4_free_revoked_stateid(server, &tmp, cred);
+}
+
static long nfs4_update_delay(long *timeout)
{
long ret;
@@ -370,13 +397,23 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
exception->delay = 0;
exception->recovering = 0;
exception->retry = 0;
+
+ if (stateid == NULL && state != NULL)
+ stateid = &state->stateid;
+
switch(errorcode) {
case 0:
return 0;
- case -NFS4ERR_OPENMODE:
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
+ if (inode != NULL && stateid != NULL) {
+ nfs_inode_find_state_and_recover(inode,
+ stateid);
+ goto wait_on_recovery;
+ }
+ case -NFS4ERR_OPENMODE:
if (inode) {
int err;
@@ -395,12 +432,6 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
if (ret < 0)
break;
goto wait_on_recovery;
- case -NFS4ERR_EXPIRED:
- if (state != NULL) {
- ret = nfs4_schedule_stateid_recovery(server, state);
- if (ret < 0)
- break;
- }
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_STALE_CLIENTID:
nfs4_schedule_lease_recovery(clp);
@@ -616,6 +647,7 @@ int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
}
spin_unlock(&tbl->slot_tbl_lock);
+ slot->privileged = args->sa_privileged ? 1 : 0;
args->sa_slot = slot;
res->sr_slot = slot;
@@ -723,12 +755,20 @@ static int nfs41_sequence_process(struct rpc_task *task,
/* Check the SEQUENCE operation status */
switch (res->sr_status) {
case 0:
+ /* If previous op on slot was interrupted and we reused
+ * the seq# and got a reply from the cache, then retry
+ */
+ if (task->tk_status == -EREMOTEIO && interrupted) {
+ ++slot->seq_nr;
+ goto retry_nowait;
+ }
/* Update the slot's sequence and clientid lease timer */
slot->seq_done = 1;
clp = session->clp;
do_renew_lease(clp, res->sr_timestamp);
/* Check sequence flags */
- nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+ nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags,
+ !!slot->privileged);
nfs41_update_target_slotid(slot->table, slot, res);
break;
case 1:
@@ -875,6 +915,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
}
spin_unlock(&tbl->slot_tbl_lock);
+ slot->privileged = args->sa_privileged ? 1 : 0;
args->sa_slot = slot;
dprintk("<-- %s slotid=%u seqid=%u\n", __func__,
@@ -1353,6 +1394,19 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
nfs4_state_set_mode_locked(state, state->state | fmode);
}
+#ifdef CONFIG_NFS_V4_1
+static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state)
+{
+ if (state->n_rdonly && !test_bit(NFS_O_RDONLY_STATE, &state->flags))
+ return true;
+ if (state->n_wronly && !test_bit(NFS_O_WRONLY_STATE, &state->flags))
+ return true;
+ if (state->n_rdwr && !test_bit(NFS_O_RDWR_STATE, &state->flags))
+ return true;
+ return false;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
{
struct nfs_client *clp = state->owner->so_server->nfs_client;
@@ -1369,11 +1423,12 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
}
static bool nfs_need_update_open_stateid(struct nfs4_state *state,
- nfs4_stateid *stateid)
+ const nfs4_stateid *stateid, nfs4_stateid *freeme)
{
if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
return true;
if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ nfs4_stateid_copy(freeme, &state->open_stateid);
nfs_test_and_clear_all_open_stateid(state);
return true;
}
@@ -1437,7 +1492,9 @@ static void nfs_clear_open_stateid(struct nfs4_state *state,
nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
}
-static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_set_open_stateid_locked(struct nfs4_state *state,
+ const nfs4_stateid *stateid, fmode_t fmode,
+ nfs4_stateid *freeme)
{
switch (fmode) {
case FMODE_READ:
@@ -1449,14 +1506,18 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
case FMODE_READ|FMODE_WRITE:
set_bit(NFS_O_RDWR_STATE, &state->flags);
}
- if (!nfs_need_update_open_stateid(state, stateid))
+ if (!nfs_need_update_open_stateid(state, stateid, freeme))
return;
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
nfs4_stateid_copy(&state->stateid, stateid);
nfs4_stateid_copy(&state->open_stateid, stateid);
}
-static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
+static void __update_open_stateid(struct nfs4_state *state,
+ const nfs4_stateid *open_stateid,
+ const nfs4_stateid *deleg_stateid,
+ fmode_t fmode,
+ nfs4_stateid *freeme)
{
/*
* Protect the call to nfs4_state_set_mode_locked and
@@ -1469,16 +1530,22 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
set_bit(NFS_DELEGATED_STATE, &state->flags);
}
if (open_stateid != NULL)
- nfs_set_open_stateid_locked(state, open_stateid, fmode);
+ nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme);
write_sequnlock(&state->seqlock);
update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
}
-static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
+static int update_open_stateid(struct nfs4_state *state,
+ const nfs4_stateid *open_stateid,
+ const nfs4_stateid *delegation,
+ fmode_t fmode)
{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs_client *clp = server->nfs_client;
struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_delegation *deleg_cur;
+ nfs4_stateid freeme = { };
int ret = 0;
fmode &= (FMODE_READ|FMODE_WRITE);
@@ -1500,7 +1567,8 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
goto no_delegation_unlock;
nfs_mark_delegation_referenced(deleg_cur);
- __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
+ __update_open_stateid(state, open_stateid, &deleg_cur->stateid,
+ fmode, &freeme);
ret = 1;
no_delegation_unlock:
spin_unlock(&deleg_cur->lock);
@@ -1508,11 +1576,14 @@ no_delegation:
rcu_read_unlock();
if (!ret && open_stateid != NULL) {
- __update_open_stateid(state, open_stateid, NULL, fmode);
+ __update_open_stateid(state, open_stateid, NULL, fmode, &freeme);
ret = 1;
}
if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
- nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+ nfs4_schedule_state_manager(clp);
+ if (freeme.type != 0)
+ nfs4_test_and_free_stateid(server, &freeme,
+ state->owner->so_cred);
return ret;
}
@@ -1889,7 +1960,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_STALE_STATEID:
set_bit(NFS_DELEGATED_STATE, &state->flags);
- case -NFS4ERR_EXPIRED:
/* Don't recall a delegation if it was lost */
nfs4_schedule_lease_recovery(server->nfs_client);
return -EAGAIN;
@@ -1901,6 +1971,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
return -EAGAIN;
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_OPENMODE:
nfs_inode_find_state_and_recover(state->inode,
@@ -2382,9 +2453,10 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
return ret;
}
-static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state,
+ const nfs4_stateid *stateid)
{
- nfs_remove_bad_delegation(state->inode);
+ nfs_remove_bad_delegation(state->inode, stateid);
write_seqlock(&state->seqlock);
nfs4_stateid_copy(&state->stateid, &state->open_stateid);
write_sequnlock(&state->seqlock);
@@ -2394,7 +2466,7 @@ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
{
if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
- nfs_finish_clear_delegation_stateid(state);
+ nfs_finish_clear_delegation_stateid(state, NULL);
}
static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
@@ -2404,7 +2476,45 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
return nfs4_open_expired(sp, state);
}
+static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
+{
+ return -NFS4ERR_BAD_STATEID;
+}
+
#if defined(CONFIG_NFS_V4_1)
+static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
+{
+ int status;
+
+ switch (stateid->type) {
+ default:
+ break;
+ case NFS4_INVALID_STATEID_TYPE:
+ case NFS4_SPECIAL_STATEID_TYPE:
+ return -NFS4ERR_BAD_STATEID;
+ case NFS4_REVOKED_STATEID_TYPE:
+ goto out_free;
+ }
+
+ status = nfs41_test_stateid(server, stateid, cred);
+ switch (status) {
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ break;
+ default:
+ return status;
+ }
+out_free:
+ /* Ack the revoked state to the server */
+ nfs41_free_stateid(server, stateid, cred, true);
+ return -NFS4ERR_EXPIRED;
+}
+
static void nfs41_check_delegation_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
@@ -2422,23 +2532,68 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state)
}
nfs4_stateid_copy(&stateid, &delegation->stateid);
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ rcu_read_unlock();
+ nfs_finish_clear_delegation_stateid(state, &stateid);
+ return;
+ }
+
+ if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) {
+ rcu_read_unlock();
+ return;
+ }
+
cred = get_rpccred(delegation->cred);
rcu_read_unlock();
- status = nfs41_test_stateid(server, &stateid, cred);
+ status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
trace_nfs4_test_delegation_stateid(state, NULL, status);
-
- if (status != NFS_OK) {
- /* Free the stateid unless the server explicitly
- * informs us the stateid is unrecognized. */
- if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, &stateid, cred);
- nfs_finish_clear_delegation_stateid(state);
- }
+ if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
+ nfs_finish_clear_delegation_stateid(state, &stateid);
put_rpccred(cred);
}
/**
+ * nfs41_check_expired_locks - possibly free a lock stateid
+ *
+ * @state: NFSv4 state for an inode
+ *
+ * Returns NFS_OK if recovery for this stateid is now finished.
+ * Otherwise a negative NFS4ERR value is returned.
+ */
+static int nfs41_check_expired_locks(struct nfs4_state *state)
+{
+ int status, ret = NFS_OK;
+ struct nfs4_lock_state *lsp;
+ struct nfs_server *server = NFS_SERVER(state->inode);
+
+ if (!test_bit(LK_STATE_IN_USE, &state->flags))
+ goto out;
+ list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+ struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+ status = nfs41_test_and_free_expired_stateid(server,
+ &lsp->ls_stateid,
+ cred);
+ trace_nfs4_test_lock_stateid(state, lsp, status);
+ if (status == -NFS4ERR_EXPIRED ||
+ status == -NFS4ERR_BAD_STATEID) {
+ clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+ lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE;
+ if (!recover_lost_locks)
+ set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+ } else if (status != NFS_OK) {
+ ret = status;
+ break;
+ }
+ }
+ };
+out:
+ return ret;
+}
+
+/**
* nfs41_check_open_stateid - possibly free an open stateid
*
* @state: NFSv4 state for an inode
@@ -2453,26 +2608,28 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
struct rpc_cred *cred = state->owner->so_cred;
int status;
- /* If a state reset has been done, test_stateid is unneeded */
- if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) &&
- (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) &&
- (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
+ if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) {
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) {
+ if (nfs4_have_delegation(state->inode, state->state))
+ return NFS_OK;
+ return -NFS4ERR_OPENMODE;
+ }
return -NFS4ERR_BAD_STATEID;
-
- status = nfs41_test_stateid(server, stateid, cred);
+ }
+ status = nfs41_test_and_free_expired_stateid(server, stateid, cred);
trace_nfs4_test_open_stateid(state, NULL, status);
- if (status != NFS_OK) {
- /* Free the stateid unless the server explicitly
- * informs us the stateid is unrecognized. */
- if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid, cred);
-
+ if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) {
clear_bit(NFS_O_RDONLY_STATE, &state->flags);
clear_bit(NFS_O_WRONLY_STATE, &state->flags);
clear_bit(NFS_O_RDWR_STATE, &state->flags);
clear_bit(NFS_OPEN_STATE, &state->flags);
+ stateid->type = NFS4_INVALID_STATEID_TYPE;
}
- return status;
+ if (status != NFS_OK)
+ return status;
+ if (nfs_open_stateid_recover_openmode(state))
+ return -NFS4ERR_OPENMODE;
+ return NFS_OK;
}
static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
@@ -2480,6 +2637,9 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
int status;
nfs41_check_delegation_stateid(state);
+ status = nfs41_check_expired_locks(state);
+ if (status != NFS_OK)
+ return status;
status = nfs41_check_open_stateid(state);
if (status != NFS_OK)
status = nfs4_open_expired(sp, state);
@@ -2537,6 +2697,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
goto out;
if (server->caps & NFS_CAP_POSIX_LOCK)
set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+ if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK)
+ set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags);
dentry = opendata->dentry;
if (d_really_is_negative(dentry)) {
@@ -2899,9 +3061,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
break;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ nfs4_free_revoked_stateid(server,
+ &calldata->arg.stateid,
+ task->tk_msg.rpc_cred);
case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_EXPIRED:
if (!nfs4_stateid_match(&calldata->arg.stateid,
&state->open_stateid)) {
rpc_restart_call_prepare(task);
@@ -4312,7 +4477,7 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s
if (error == 0) {
/* block layout checks this! */
server->pnfs_blksize = fsinfo->blksize;
- set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype);
+ set_pnfs_layoutdriver(server, fhandle, fsinfo);
}
return error;
@@ -4399,24 +4564,25 @@ static bool nfs4_error_stateid_expired(int err)
return false;
}
-void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
-{
- nfs_invalidate_atime(hdr->inode);
-}
-
static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
struct nfs_server *server = NFS_SERVER(hdr->inode);
trace_nfs4_read(hdr, task->tk_status);
- if (nfs4_async_handle_error(task, server,
- hdr->args.context->state,
- NULL) == -EAGAIN) {
- rpc_restart_call_prepare(task);
- return -EAGAIN;
+ if (task->tk_status < 0) {
+ struct nfs4_exception exception = {
+ .inode = hdr->inode,
+ .state = hdr->args.context->state,
+ .stateid = &hdr->args.stateid,
+ };
+ task->tk_status = nfs4_async_handle_exception(task,
+ server, task->tk_status, &exception);
+ if (exception.retry) {
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
}
- __nfs4_read_done_cb(hdr);
if (task->tk_status > 0)
renew_lease(server, hdr->timestamp);
return 0;
@@ -4445,6 +4611,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
return -EAGAIN;
if (nfs4_read_stateid_changed(task, &hdr->args))
return -EAGAIN;
+ if (task->tk_status > 0)
+ nfs_invalidate_atime(hdr->inode);
return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
nfs4_read_done_cb(task, hdr);
}
@@ -4482,11 +4650,19 @@ static int nfs4_write_done_cb(struct rpc_task *task,
struct inode *inode = hdr->inode;
trace_nfs4_write(hdr, task->tk_status);
- if (nfs4_async_handle_error(task, NFS_SERVER(inode),
- hdr->args.context->state,
- NULL) == -EAGAIN) {
- rpc_restart_call_prepare(task);
- return -EAGAIN;
+ if (task->tk_status < 0) {
+ struct nfs4_exception exception = {
+ .inode = hdr->inode,
+ .state = hdr->args.context->state,
+ .stateid = &hdr->args.stateid,
+ };
+ task->tk_status = nfs4_async_handle_exception(task,
+ NFS_SERVER(inode), task->tk_status,
+ &exception);
+ if (exception.retry) {
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
}
if (task->tk_status >= 0) {
renew_lease(NFS_SERVER(inode), hdr->timestamp);
@@ -5123,12 +5299,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
/* An impossible timestamp guarantees this value
* will never match a generated boot time. */
- verf[0] = 0;
- verf[1] = cpu_to_be32(NSEC_PER_SEC + 1);
+ verf[0] = cpu_to_be32(U32_MAX);
+ verf[1] = cpu_to_be32(U32_MAX);
} else {
struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
- verf[0] = cpu_to_be32(nn->boot_time.tv_sec);
- verf[1] = cpu_to_be32(nn->boot_time.tv_nsec);
+ u64 ns = ktime_to_ns(nn->boot_time);
+
+ verf[0] = cpu_to_be32(ns >> 32);
+ verf[1] = cpu_to_be32(ns);
}
memcpy(bootverf->data, verf, sizeof(bootverf->data));
}
@@ -5393,10 +5571,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
renew_lease(data->res.server, data->timestamp);
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ nfs4_free_revoked_stateid(data->res.server,
+ data->args.stateid,
+ task->tk_msg.rpc_cred);
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_EXPIRED:
task->tk_status = 0;
if (data->roc)
pnfs_roc_set_barrier(data->inode, data->roc_barrier);
@@ -5528,22 +5709,6 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
return err;
}
-#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
-#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
-
-/*
- * sleep, with exponential backoff, and retry the LOCK operation.
- */
-static unsigned long
-nfs4_set_lock_task_retry(unsigned long timeout)
-{
- freezable_schedule_timeout_killable_unsafe(timeout);
- timeout <<= 1;
- if (timeout > NFS4_LOCK_MAXTIMEOUT)
- return NFS4_LOCK_MAXTIMEOUT;
- return timeout;
-}
-
static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
{
struct inode *inode = state->inode;
@@ -5600,11 +5765,6 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
return err;
}
-static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
-{
- return locks_lock_inode_wait(inode, fl);
-}
-
struct nfs4_unlockdata {
struct nfs_locku_args arg;
struct nfs_locku_res res;
@@ -5657,14 +5817,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
switch (task->tk_status) {
case 0:
renew_lease(calldata->server, calldata->timestamp);
- do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl);
+ locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl);
if (nfs4_update_lock_stateid(calldata->lsp,
&calldata->res.stateid))
break;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ nfs4_free_revoked_stateid(calldata->server,
+ &calldata->arg.stateid,
+ task->tk_msg.rpc_cred);
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_EXPIRED:
if (!nfs4_stateid_match(&calldata->arg.stateid,
&calldata->lsp->ls_stateid))
rpc_restart_call_prepare(task);
@@ -5765,7 +5929,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
mutex_lock(&sp->so_delegreturn_mutex);
/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
down_read(&nfsi->rwsem);
- if (do_vfs_lock(inode, request) == -ENOENT) {
+ if (locks_lock_inode_wait(inode, request) == -ENOENT) {
up_read(&nfsi->rwsem);
mutex_unlock(&sp->so_delegreturn_mutex);
goto out;
@@ -5906,7 +6070,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
data->timestamp);
if (data->arg.new_lock) {
data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
- if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) {
+ if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) {
rpc_restart_call_prepare(task);
break;
}
@@ -5965,6 +6129,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_
{
switch (error) {
case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
if (new_lock_owner != 0 ||
@@ -5973,7 +6138,6 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_
break;
case -NFS4ERR_STALE_STATEID:
lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
- case -NFS4ERR_EXPIRED:
nfs4_schedule_lease_recovery(server->nfs_client);
};
}
@@ -6083,52 +6247,19 @@ out:
}
#if defined(CONFIG_NFS_V4_1)
-/**
- * nfs41_check_expired_locks - possibly free a lock stateid
- *
- * @state: NFSv4 state for an inode
- *
- * Returns NFS_OK if recovery for this stateid is now finished.
- * Otherwise a negative NFS4ERR value is returned.
- */
-static int nfs41_check_expired_locks(struct nfs4_state *state)
-{
- int status, ret = -NFS4ERR_BAD_STATEID;
- struct nfs4_lock_state *lsp;
- struct nfs_server *server = NFS_SERVER(state->inode);
-
- list_for_each_entry(lsp, &state->lock_states, ls_locks) {
- if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
- struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
-
- status = nfs41_test_stateid(server,
- &lsp->ls_stateid,
- cred);
- trace_nfs4_test_lock_stateid(state, lsp, status);
- if (status != NFS_OK) {
- /* Free the stateid unless the server
- * informs us the stateid is unrecognized. */
- if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server,
- &lsp->ls_stateid,
- cred);
- clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
- ret = status;
- }
- }
- };
-
- return ret;
-}
-
static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
{
- int status = NFS_OK;
+ struct nfs4_lock_state *lsp;
+ int status;
- if (test_bit(LK_STATE_IN_USE, &state->flags))
- status = nfs41_check_expired_locks(state);
- if (status != NFS_OK)
- status = nfs4_lock_expired(state, request);
+ status = nfs4_set_lock_state(state, request);
+ if (status != 0)
+ return status;
+ lsp = request->fl_u.nfs4_fl.owner;
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) ||
+ test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+ return 0;
+ status = nfs4_lock_expired(state, request);
return status;
}
#endif
@@ -6138,17 +6269,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs4_state_owner *sp = state->owner;
unsigned char fl_flags = request->fl_flags;
- int status = -ENOLCK;
+ int status;
- if ((fl_flags & FL_POSIX) &&
- !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
- goto out;
- /* Is this a delegated open? */
- status = nfs4_set_lock_state(state, request);
- if (status != 0)
- goto out;
request->fl_flags |= FL_ACCESS;
- status = do_vfs_lock(state->inode, request);
+ status = locks_lock_inode_wait(state->inode, request);
if (status < 0)
goto out;
mutex_lock(&sp->so_delegreturn_mutex);
@@ -6157,7 +6281,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
/* Yes: cache locks! */
/* ...but avoid races with delegation recall... */
request->fl_flags = fl_flags & ~FL_SLEEP;
- status = do_vfs_lock(state->inode, request);
+ status = locks_lock_inode_wait(state->inode, request);
up_read(&nfsi->rwsem);
mutex_unlock(&sp->so_delegreturn_mutex);
goto out;
@@ -6188,12 +6312,124 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
return err;
}
+#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
+#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
+
+static int
+nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
+ struct file_lock *request)
+{
+ int status = -ERESTARTSYS;
+ unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
+
+ while(!signalled()) {
+ status = nfs4_proc_setlk(state, cmd, request);
+ if ((status != -EAGAIN) || IS_SETLK(cmd))
+ break;
+ freezable_schedule_timeout_interruptible(timeout);
+ timeout *= 2;
+ timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
+ status = -ERESTARTSYS;
+ }
+ return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+struct nfs4_lock_waiter {
+ struct task_struct *task;
+ struct inode *inode;
+ struct nfs_lowner *owner;
+ bool notified;
+};
+
+static int
+nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key)
+{
+ int ret;
+ struct cb_notify_lock_args *cbnl = key;
+ struct nfs4_lock_waiter *waiter = wait->private;
+ struct nfs_lowner *lowner = &cbnl->cbnl_owner,
+ *wowner = waiter->owner;
+
+ /* Only wake if the callback was for the same owner */
+ if (lowner->clientid != wowner->clientid ||
+ lowner->id != wowner->id ||
+ lowner->s_dev != wowner->s_dev)
+ return 0;
+
+ /* Make sure it's for the right inode */
+ if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
+ return 0;
+
+ waiter->notified = true;
+
+ /* override "private" so we can use default_wake_function */
+ wait->private = waiter->task;
+ ret = autoremove_wake_function(wait, mode, flags, key);
+ wait->private = waiter;
+ return ret;
+}
+
+static int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ int status = -ERESTARTSYS;
+ unsigned long flags;
+ struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs_client *clp = server->nfs_client;
+ wait_queue_head_t *q = &clp->cl_lock_waitq;
+ struct nfs_lowner owner = { .clientid = clp->cl_clientid,
+ .id = lsp->ls_seqid.owner_id,
+ .s_dev = server->s_dev };
+ struct nfs4_lock_waiter waiter = { .task = current,
+ .inode = state->inode,
+ .owner = &owner,
+ .notified = false };
+ wait_queue_t wait;
+
+ /* Don't bother with waitqueue if we don't expect a callback */
+ if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
+ return nfs4_retry_setlk_simple(state, cmd, request);
+
+ init_wait(&wait);
+ wait.private = &waiter;
+ wait.func = nfs4_wake_lock_waiter;
+ add_wait_queue(q, &wait);
+
+ while(!signalled()) {
+ status = nfs4_proc_setlk(state, cmd, request);
+ if ((status != -EAGAIN) || IS_SETLK(cmd))
+ break;
+
+ status = -ERESTARTSYS;
+ spin_lock_irqsave(&q->lock, flags);
+ if (waiter.notified) {
+ spin_unlock_irqrestore(&q->lock, flags);
+ continue;
+ }
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT);
+ }
+
+ finish_wait(q, &wait);
+ return status;
+}
+#else /* !CONFIG_NFS_V4_1 */
+static inline int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ return nfs4_retry_setlk_simple(state, cmd, request);
+}
+#endif
+
static int
nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
{
struct nfs_open_context *ctx;
struct nfs4_state *state;
- unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
int status;
/* verify open state */
@@ -6220,6 +6456,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
if (state == NULL)
return -ENOLCK;
+
+ if ((request->fl_flags & FL_POSIX) &&
+ !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+ return -ENOLCK;
+
/*
* Don't rely on the VFS having checked the file open mode,
* since it won't do this for flock() locks.
@@ -6234,16 +6475,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
return -EBADF;
}
- do {
- status = nfs4_proc_setlk(state, cmd, request);
- if ((status != -EAGAIN) || IS_SETLK(cmd))
- break;
- timeout = nfs4_set_lock_task_retry(timeout);
- status = -ERESTARTSYS;
- if (signalled())
- break;
- } while(status < 0);
- return status;
+ status = nfs4_set_lock_state(state, request);
+ if (status != 0)
+ return status;
+
+ return nfs4_retry_setlk(state, cmd, request);
}
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
@@ -7104,75 +7340,161 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
return 0;
}
+struct nfs41_exchange_id_data {
+ struct nfs41_exchange_id_res res;
+ struct nfs41_exchange_id_args args;
+ struct rpc_xprt *xprt;
+ int rpc_status;
+};
+
+static void nfs4_exchange_id_done(struct rpc_task *task, void *data)
+{
+ struct nfs41_exchange_id_data *cdata =
+ (struct nfs41_exchange_id_data *)data;
+ struct nfs_client *clp = cdata->args.client;
+ int status = task->tk_status;
+
+ trace_nfs4_exchange_id(clp, status);
+
+ if (status == 0)
+ status = nfs4_check_cl_exchange_flags(cdata->res.flags);
+
+ if (cdata->xprt && status == 0) {
+ status = nfs4_detect_session_trunking(clp, &cdata->res,
+ cdata->xprt);
+ goto out;
+ }
+
+ if (status == 0)
+ status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect);
+
+ if (status == 0) {
+ clp->cl_clientid = cdata->res.clientid;
+ clp->cl_exchange_flags = cdata->res.flags;
+ /* Client ID is not confirmed */
+ if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
+ clear_bit(NFS4_SESSION_ESTABLISHED,
+ &clp->cl_session->session_state);
+ clp->cl_seqid = cdata->res.seqid;
+ }
+
+ kfree(clp->cl_serverowner);
+ clp->cl_serverowner = cdata->res.server_owner;
+ cdata->res.server_owner = NULL;
+
+ /* use the most recent implementation id */
+ kfree(clp->cl_implid);
+ clp->cl_implid = cdata->res.impl_id;
+ cdata->res.impl_id = NULL;
+
+ if (clp->cl_serverscope != NULL &&
+ !nfs41_same_server_scope(clp->cl_serverscope,
+ cdata->res.server_scope)) {
+ dprintk("%s: server_scope mismatch detected\n",
+ __func__);
+ set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
+ kfree(clp->cl_serverscope);
+ clp->cl_serverscope = NULL;
+ }
+
+ if (clp->cl_serverscope == NULL) {
+ clp->cl_serverscope = cdata->res.server_scope;
+ cdata->res.server_scope = NULL;
+ }
+ /* Save the EXCHANGE_ID verifier session trunk tests */
+ memcpy(clp->cl_confirm.data, cdata->args.verifier->data,
+ sizeof(clp->cl_confirm.data));
+ }
+out:
+ cdata->rpc_status = status;
+ return;
+}
+
+static void nfs4_exchange_id_release(void *data)
+{
+ struct nfs41_exchange_id_data *cdata =
+ (struct nfs41_exchange_id_data *)data;
+
+ nfs_put_client(cdata->args.client);
+ if (cdata->xprt) {
+ xprt_put(cdata->xprt);
+ rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient);
+ }
+ kfree(cdata->res.impl_id);
+ kfree(cdata->res.server_scope);
+ kfree(cdata->res.server_owner);
+ kfree(cdata);
+}
+
+static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
+ .rpc_call_done = nfs4_exchange_id_done,
+ .rpc_release = nfs4_exchange_id_release,
+};
+
/*
* _nfs4_proc_exchange_id()
*
* Wrapper for EXCHANGE_ID operation.
*/
static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
- u32 sp4_how)
+ u32 sp4_how, struct rpc_xprt *xprt)
{
nfs4_verifier verifier;
- struct nfs41_exchange_id_args args = {
- .verifier = &verifier,
- .client = clp,
-#ifdef CONFIG_NFS_V4_1_MIGRATION
- .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
- EXCHGID4_FLAG_BIND_PRINC_STATEID |
- EXCHGID4_FLAG_SUPP_MOVED_MIGR,
-#else
- .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
- EXCHGID4_FLAG_BIND_PRINC_STATEID,
-#endif
- };
- struct nfs41_exchange_id_res res = {
- 0
- };
- int status;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
- .rpc_argp = &args,
- .rpc_resp = &res,
.rpc_cred = cred,
};
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clp->cl_rpcclient,
+ .callback_ops = &nfs4_exchange_id_call_ops,
+ .rpc_message = &msg,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+ };
+ struct nfs41_exchange_id_data *calldata;
+ struct rpc_task *task;
+ int status = -EIO;
+
+ if (!atomic_inc_not_zero(&clp->cl_count))
+ goto out;
+
+ status = -ENOMEM;
+ calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+ if (!calldata)
+ goto out;
- nfs4_init_boot_verifier(clp, &verifier);
+ if (!xprt)
+ nfs4_init_boot_verifier(clp, &verifier);
status = nfs4_init_uniform_client_string(clp);
if (status)
- goto out;
+ goto out_calldata;
dprintk("NFS call exchange_id auth=%s, '%s'\n",
clp->cl_rpcclient->cl_auth->au_ops->au_name,
clp->cl_owner_id);
- res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
- GFP_NOFS);
- if (unlikely(res.server_owner == NULL)) {
- status = -ENOMEM;
- goto out;
- }
+ calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
+ GFP_NOFS);
+ status = -ENOMEM;
+ if (unlikely(calldata->res.server_owner == NULL))
+ goto out_calldata;
- res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
+ calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
GFP_NOFS);
- if (unlikely(res.server_scope == NULL)) {
- status = -ENOMEM;
+ if (unlikely(calldata->res.server_scope == NULL))
goto out_server_owner;
- }
- res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
- if (unlikely(res.impl_id == NULL)) {
- status = -ENOMEM;
+ calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
+ if (unlikely(calldata->res.impl_id == NULL))
goto out_server_scope;
- }
switch (sp4_how) {
case SP4_NONE:
- args.state_protect.how = SP4_NONE;
+ calldata->args.state_protect.how = SP4_NONE;
break;
case SP4_MACH_CRED:
- args.state_protect = nfs4_sp4_mach_cred_request;
+ calldata->args.state_protect = nfs4_sp4_mach_cred_request;
break;
default:
@@ -7181,56 +7503,42 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
status = -EINVAL;
goto out_impl_id;
}
+ if (xprt) {
+ calldata->xprt = xprt;
+ task_setup_data.rpc_xprt = xprt;
+ task_setup_data.flags =
+ RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC;
+ calldata->args.verifier = &clp->cl_confirm;
+ } else {
+ calldata->args.verifier = &verifier;
+ }
+ calldata->args.client = clp;
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+ calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+ EXCHGID4_FLAG_BIND_PRINC_STATEID |
+ EXCHGID4_FLAG_SUPP_MOVED_MIGR,
+#else
+ calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+ EXCHGID4_FLAG_BIND_PRINC_STATEID,
+#endif
+ msg.rpc_argp = &calldata->args;
+ msg.rpc_resp = &calldata->res;
+ task_setup_data.callback_data = calldata;
- status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
- trace_nfs4_exchange_id(clp, status);
- if (status == 0)
- status = nfs4_check_cl_exchange_flags(res.flags);
-
- if (status == 0)
- status = nfs4_sp4_select_mode(clp, &res.state_protect);
-
- if (status == 0) {
- clp->cl_clientid = res.clientid;
- clp->cl_exchange_flags = res.flags;
- /* Client ID is not confirmed */
- if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
- clear_bit(NFS4_SESSION_ESTABLISHED,
- &clp->cl_session->session_state);
- clp->cl_seqid = res.seqid;
- }
-
- kfree(clp->cl_serverowner);
- clp->cl_serverowner = res.server_owner;
- res.server_owner = NULL;
-
- /* use the most recent implementation id */
- kfree(clp->cl_implid);
- clp->cl_implid = res.impl_id;
- res.impl_id = NULL;
-
- if (clp->cl_serverscope != NULL &&
- !nfs41_same_server_scope(clp->cl_serverscope,
- res.server_scope)) {
- dprintk("%s: server_scope mismatch detected\n",
- __func__);
- set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
- kfree(clp->cl_serverscope);
- clp->cl_serverscope = NULL;
- }
-
- if (clp->cl_serverscope == NULL) {
- clp->cl_serverscope = res.server_scope;
- res.server_scope = NULL;
- }
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task)) {
+ status = PTR_ERR(task);
+ goto out_impl_id;
}
-out_impl_id:
- kfree(res.impl_id);
-out_server_scope:
- kfree(res.server_scope);
-out_server_owner:
- kfree(res.server_owner);
+ if (!xprt) {
+ status = rpc_wait_for_completion_task(task);
+ if (!status)
+ status = calldata->rpc_status;
+ } else /* session trunking test */
+ status = calldata->rpc_status;
+
+ rpc_put_task(task);
out:
if (clp->cl_implid != NULL)
dprintk("NFS reply exchange_id: Server Implementation ID: "
@@ -7240,6 +7548,16 @@ out:
clp->cl_implid->date.nseconds);
dprintk("NFS reply exchange_id: %d\n", status);
return status;
+
+out_impl_id:
+ kfree(calldata->res.impl_id);
+out_server_scope:
+ kfree(calldata->res.server_scope);
+out_server_owner:
+ kfree(calldata->res.server_owner);
+out_calldata:
+ kfree(calldata);
+ goto out;
}
/*
@@ -7262,14 +7580,45 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
/* try SP4_MACH_CRED if krb5i/p */
if (authflavor == RPC_AUTH_GSS_KRB5I ||
authflavor == RPC_AUTH_GSS_KRB5P) {
- status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
+ status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL);
if (!status)
return 0;
}
/* try SP4_NONE */
- return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
+ return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL);
+}
+
+/**
+ * nfs4_test_session_trunk
+ *
+ * This is an add_xprt_test() test function called from
+ * rpc_clnt_setup_test_and_add_xprt.
+ *
+ * The rpc_xprt_switch is referrenced by rpc_clnt_setup_test_and_add_xprt
+ * and is dereferrenced in nfs4_exchange_id_release
+ *
+ * Upon success, add the new transport to the rpc_clnt
+ *
+ * @clnt: struct rpc_clnt to get new transport
+ * @xprt: the rpc_xprt to test
+ * @data: call data for _nfs4_proc_exchange_id.
+ */
+int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
+ void *data)
+{
+ struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
+ u32 sp4_how;
+
+ dprintk("--> %s try %s\n", __func__,
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+ sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
+
+ /* Test connection for session trunking. Async exchange_id call */
+ return _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
}
+EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
struct rpc_cred *cred)
@@ -7463,7 +7812,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
args->bc_attrs.max_resp_sz = max_bc_payload;
args->bc_attrs.max_resp_sz_cached = 0;
args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
- args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
+ args->bc_attrs.max_reqs = min_t(unsigned short, max_session_cb_slots, 1);
dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
@@ -7510,10 +7859,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
return -EINVAL;
if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
return -EINVAL;
- /* These would render the backchannel useless: */
- if (rcvd->max_ops != sent->max_ops)
+ if (rcvd->max_ops > sent->max_ops)
return -EINVAL;
- if (rcvd->max_reqs != sent->max_reqs)
+ if (rcvd->max_reqs > sent->max_reqs)
return -EINVAL;
out:
return 0;
@@ -7982,6 +8330,8 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
case -NFS4ERR_RECALLCONFLICT:
status = -ERECALLCONFLICT;
break;
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
exception->timeout = 0;
@@ -7993,6 +8343,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
&lgp->args.ctx->state->stateid)) {
spin_unlock(&inode->i_lock);
exception->state = lgp->args.ctx->state;
+ exception->stateid = &lgp->args.stateid;
break;
}
@@ -8591,6 +8942,24 @@ static int _nfs41_test_stateid(struct nfs_server *server,
return -res.status;
}
+static void nfs4_handle_delay_or_session_error(struct nfs_server *server,
+ int err, struct nfs4_exception *exception)
+{
+ exception->retry = 0;
+ switch(err) {
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ nfs4_handle_exception(server, err, exception);
+ break;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_DEADSESSION:
+ nfs4_do_handle_exception(server, err, exception);
+ }
+}
+
/**
* nfs41_test_stateid - perform a TEST_STATEID operation
*
@@ -8610,9 +8979,7 @@ static int nfs41_test_stateid(struct nfs_server *server,
int err;
do {
err = _nfs41_test_stateid(server, stateid, cred);
- if (err != -NFS4ERR_DELAY)
- break;
- nfs4_handle_exception(server, err, &exception);
+ nfs4_handle_delay_or_session_error(server, err, &exception);
} while (exception.retry);
return err;
}
@@ -8657,7 +9024,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = {
};
static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
- nfs4_stateid *stateid,
+ const nfs4_stateid *stateid,
struct rpc_cred *cred,
bool privileged)
{
@@ -8687,7 +9054,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
msg.rpc_argp = &data->args;
msg.rpc_resp = &data->res;
- nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
if (privileged)
nfs4_set_sequence_privileged(&data->args.seq_args);
@@ -8700,38 +9067,31 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
* @server: server / transport on which to perform the operation
* @stateid: state ID to release
* @cred: credential
+ * @is_recovery: set to true if this call needs to be privileged
*
- * Returns NFS_OK if the server freed "stateid". Otherwise a
- * negative NFS4ERR value is returned.
+ * Note: this function is always asynchronous.
*/
static int nfs41_free_stateid(struct nfs_server *server,
- nfs4_stateid *stateid,
- struct rpc_cred *cred)
+ const nfs4_stateid *stateid,
+ struct rpc_cred *cred,
+ bool is_recovery)
{
struct rpc_task *task;
- int ret;
- task = _nfs41_free_stateid(server, stateid, cred, true);
+ task = _nfs41_free_stateid(server, stateid, cred, is_recovery);
if (IS_ERR(task))
return PTR_ERR(task);
- ret = rpc_wait_for_completion_task(task);
- if (!ret)
- ret = task->tk_status;
rpc_put_task(task);
- return ret;
+ return 0;
}
static void
nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
- struct rpc_task *task;
struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
- task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
+ nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
nfs4_free_lock_state(server, lsp);
- if (IS_ERR(task))
- return;
- rpc_put_task(task);
}
static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@ -8835,6 +9195,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
.match_stateid = nfs4_match_stateid,
.find_root_sec = nfs4_find_root_sec,
.free_lock_state = nfs4_release_lockowner,
+ .test_and_free_expired = nfs40_test_and_free_expired_stateid,
.alloc_seqid = nfs_alloc_seqid,
.call_sync_ops = &nfs40_call_sync_ops,
.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
@@ -8862,7 +9223,9 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
.match_stateid = nfs41_match_stateid,
.find_root_sec = nfs41_find_root_sec,
.free_lock_state = nfs41_free_lock_state,
+ .test_and_free_expired = nfs41_test_and_free_expired_stateid,
.alloc_seqid = nfs_alloc_no_seqid,
+ .session_trunk = nfs4_test_session_trunk,
.call_sync_ops = &nfs41_call_sync_ops,
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -8891,7 +9254,9 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
.find_root_sec = nfs41_find_root_sec,
.free_lock_state = nfs41_free_lock_state,
.call_sync_ops = &nfs41_call_sync_ops,
+ .test_and_free_expired = nfs41_test_and_free_expired_stateid,
.alloc_seqid = nfs_alloc_no_seqid,
+ .session_trunk = nfs4_test_session_trunk,
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
.state_renewal_ops = &nfs41_state_renewal_ops,
@@ -8941,20 +9306,14 @@ static const struct inode_operations nfs4_dir_inode_operations = {
.permission = nfs_permission,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
- .getxattr = generic_getxattr,
- .setxattr = generic_setxattr,
.listxattr = nfs4_listxattr,
- .removexattr = generic_removexattr,
};
static const struct inode_operations nfs4_file_inode_operations = {
.permission = nfs_permission,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
- .getxattr = generic_getxattr,
- .setxattr = generic_setxattr,
.listxattr = nfs4_listxattr,
- .removexattr = generic_removexattr,
};
const struct nfs_rpc_ops nfs_v4_clientops = {
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index f703b755351b..dae385500005 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -9,6 +9,7 @@
/* maximum number of slots to use */
#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
+#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U)
#define NFS4_MAX_SLOT_TABLE (1024U)
#define NFS4_NO_SLOT ((u32)-1)
@@ -22,6 +23,7 @@ struct nfs4_slot {
u32 slot_nr;
u32 seq_nr;
unsigned int interrupted : 1,
+ privileged : 1,
seq_done : 1;
};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cada00aa5096..5f4281ec5f72 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -991,6 +991,8 @@ int nfs4_select_rw_stateid(struct nfs4_state *state,
{
int ret;
+ if (!nfs4_valid_open_stateid(state))
+ return -EIO;
if (cred != NULL)
*cred = NULL;
ret = nfs4_copy_lock_stateid(dst, state, lockowner);
@@ -1303,6 +1305,8 @@ void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
{
+ if (!nfs4_valid_open_stateid(state))
+ return 0;
set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
/* Don't recover state that expired before the reboot */
if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
@@ -1316,6 +1320,8 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st
int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
{
+ if (!nfs4_valid_open_stateid(state))
+ return 0;
set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
@@ -1327,9 +1333,8 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_
{
struct nfs_client *clp = server->nfs_client;
- if (!nfs4_valid_open_stateid(state))
+ if (!nfs4_state_mark_reclaim_nograce(clp, state))
return -EBADF;
- nfs4_state_mark_reclaim_nograce(clp, state);
dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
clp->cl_hostname);
nfs4_schedule_state_manager(clp);
@@ -1337,6 +1342,35 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_
}
EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
+static struct nfs4_lock_state *
+nfs_state_find_lock_state_by_stateid(struct nfs4_state *state,
+ const nfs4_stateid *stateid)
+{
+ struct nfs4_lock_state *pos;
+
+ list_for_each_entry(pos, &state->lock_states, ls_locks) {
+ if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags))
+ continue;
+ if (nfs4_stateid_match_other(&pos->ls_stateid, stateid))
+ return pos;
+ }
+ return NULL;
+}
+
+static bool nfs_state_lock_state_matches_stateid(struct nfs4_state *state,
+ const nfs4_stateid *stateid)
+{
+ bool found = false;
+
+ if (test_bit(LK_STATE_IN_USE, &state->flags)) {
+ spin_lock(&state->state_lock);
+ if (nfs_state_find_lock_state_by_stateid(state, stateid))
+ found = true;
+ spin_unlock(&state->state_lock);
+ }
+ return found;
+}
+
void nfs_inode_find_state_and_recover(struct inode *inode,
const nfs4_stateid *stateid)
{
@@ -1351,14 +1385,18 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
state = ctx->state;
if (state == NULL)
continue;
- if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+ if (nfs4_stateid_match_other(&state->stateid, stateid) &&
+ nfs4_state_mark_reclaim_nograce(clp, state)) {
+ found = true;
continue;
- if (!nfs4_stateid_match(&state->stateid, stateid))
- continue;
- nfs4_state_mark_reclaim_nograce(clp, state);
- found = true;
+ }
+ if (nfs_state_lock_state_matches_stateid(state, stateid) &&
+ nfs4_state_mark_reclaim_nograce(clp, state))
+ found = true;
}
spin_unlock(&inode->i_lock);
+
+ nfs_inode_find_delegation_state_and_recover(inode, stateid);
if (found)
nfs4_schedule_state_manager(clp);
}
@@ -1498,6 +1536,9 @@ restart:
__func__, status);
case -ENOENT:
case -ENOMEM:
+ case -EACCES:
+ case -EROFS:
+ case -EIO:
case -ESTALE:
/* Open state on this file cannot be recovered */
nfs4_state_mark_recovery_failed(state, status);
@@ -1656,15 +1697,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
put_rpccred(cred);
}
-static void nfs_delegation_clear_all(struct nfs_client *clp)
-{
- nfs_delegation_mark_reclaim(clp);
- nfs_delegation_reap_unclaimed(clp);
-}
-
static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
{
- nfs_delegation_clear_all(clp);
+ nfs_mark_test_expired_all_delegations(clp);
nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
}
@@ -2195,7 +2230,7 @@ static void nfs41_handle_all_state_revoked(struct nfs_client *clp)
static void nfs41_handle_some_state_revoked(struct nfs_client *clp)
{
- nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+ nfs4_state_start_reclaim_nograce(clp);
nfs4_schedule_state_manager(clp);
dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
@@ -2227,13 +2262,22 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
nfs4_schedule_state_manager(clp);
}
-void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
+void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags,
+ bool recovery)
{
if (!flags)
return;
dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n",
__func__, clp->cl_hostname, clp->cl_clientid, flags);
+ /*
+ * If we're called from the state manager thread, then assume we're
+ * already handling the RECLAIM_NEEDED and/or STATE_REVOKED.
+ * Those flags are expected to remain set until we're done
+ * recovering (see RFC5661, section 18.46.3).
+ */
+ if (recovery)
+ goto out_recovery;
if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
nfs41_handle_server_reboot(clp);
@@ -2246,6 +2290,7 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
nfs4_schedule_lease_moved_recovery(clp);
if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
nfs41_handle_recallable_state_revoked(clp);
+out_recovery:
if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
nfs41_handle_backchannel_fault(clp);
else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
@@ -2410,6 +2455,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
nfs4_state_end_reclaim_reboot(clp);
}
+ /* Detect expired delegations... */
+ if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) {
+ section = "detect expired delegations";
+ nfs_reap_expired_delegations(clp);
+ continue;
+ }
+
/* Now recover expired state... */
if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
section = "reclaim nograce";
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7bd3a5c09d31..fc89e5ed07ee 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1850,7 +1850,7 @@ static void encode_create_session(struct xdr_stream *xdr,
*p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */
/* authsys_parms rfc1831 */
- *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */
+ *p++ = cpu_to_be32(ktime_to_ns(nn->boot_time)); /* stamp */
p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
*p++ = cpu_to_be32(0); /* UID */
*p++ = cpu_to_be32(0); /* GID */
@@ -4725,34 +4725,37 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
}
/*
- * Decode potentially multiple layout types. Currently we only support
- * one layout driver per file system.
+ * Decode potentially multiple layout types.
*/
-static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
- uint32_t *layouttype)
+static int decode_pnfs_layout_types(struct xdr_stream *xdr,
+ struct nfs_fsinfo *fsinfo)
{
__be32 *p;
- int num;
+ uint32_t i;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out_overflow;
- num = be32_to_cpup(p);
+ fsinfo->nlayouttypes = be32_to_cpup(p);
/* pNFS is not supported by the underlying file system */
- if (num == 0) {
- *layouttype = 0;
+ if (fsinfo->nlayouttypes == 0)
return 0;
- }
- if (num > 1)
- printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout "
- "drivers per filesystem not supported\n", __func__);
/* Decode and set first layout type, move xdr->p past unused types */
- p = xdr_inline_decode(xdr, num * 4);
+ p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4);
if (unlikely(!p))
goto out_overflow;
- *layouttype = be32_to_cpup(p);
+
+ /* If we get too many, then just cap it at the max */
+ if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) {
+ printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n",
+ __func__, fsinfo->nlayouttypes);
+ fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES;
+ }
+
+ for(i = 0; i < fsinfo->nlayouttypes; ++i)
+ fsinfo->layouttype[i] = be32_to_cpup(p++);
return 0;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -4764,7 +4767,7 @@ out_overflow:
* Note we must ensure that layouttype is set in any non-error case.
*/
static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
- uint32_t *layouttype)
+ struct nfs_fsinfo *fsinfo)
{
int status = 0;
@@ -4772,10 +4775,9 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
return -EIO;
if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
- status = decode_first_pnfs_layout_type(xdr, layouttype);
+ status = decode_pnfs_layout_types(xdr, fsinfo);
bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
- } else
- *layouttype = 0;
+ }
return status;
}
@@ -4856,7 +4858,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
if (status != 0)
goto xdr_error;
- status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
+ status = decode_attr_pnfstype(xdr, bitmap, fsinfo);
if (status != 0)
goto xdr_error;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 174dd4cf5747..965db474f4b0 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -342,7 +342,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
* update_nfs_request below if the region is not locked. */
req->wb_page = page;
if (page) {
- req->wb_index = page_file_index(page);
+ req->wb_index = page_index(page);
get_page(page);
}
req->wb_offset = offset;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2c93a85eda51..56b2d96f9103 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
+#include <linux/sort.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"
@@ -99,35 +100,79 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)
}
/*
+ * When the server sends a list of layout types, we choose one in the order
+ * given in the list below.
+ *
+ * FIXME: should this list be configurable in some fashion? module param?
+ * mount option? something else?
+ */
+static const u32 ld_prefs[] = {
+ LAYOUT_SCSI,
+ LAYOUT_BLOCK_VOLUME,
+ LAYOUT_OSD2_OBJECTS,
+ LAYOUT_FLEX_FILES,
+ LAYOUT_NFSV4_1_FILES,
+ 0
+};
+
+static int
+ld_cmp(const void *e1, const void *e2)
+{
+ u32 ld1 = *((u32 *)e1);
+ u32 ld2 = *((u32 *)e2);
+ int i;
+
+ for (i = 0; ld_prefs[i] != 0; i++) {
+ if (ld1 == ld_prefs[i])
+ return -1;
+
+ if (ld2 == ld_prefs[i])
+ return 1;
+ }
+ return 0;
+}
+
+/*
* Try to set the server's pnfs module to the pnfs layout type specified by id.
* Currently only one pNFS layout driver per filesystem is supported.
*
- * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+ * @ids array of layout types supported by MDS.
*/
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
- u32 id)
+ struct nfs_fsinfo *fsinfo)
{
struct pnfs_layoutdriver_type *ld_type = NULL;
+ u32 id;
+ int i;
- if (id == 0)
- goto out_no_driver;
if (!(server->nfs_client->cl_exchange_flags &
(EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
- printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
- __func__, id, server->nfs_client->cl_exchange_flags);
+ printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
+ __func__, server->nfs_client->cl_exchange_flags);
goto out_no_driver;
}
- ld_type = find_pnfs_driver(id);
- if (!ld_type) {
- request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
+
+ sort(fsinfo->layouttype, fsinfo->nlayouttypes,
+ sizeof(*fsinfo->layouttype), ld_cmp, NULL);
+
+ for (i = 0; i < fsinfo->nlayouttypes; i++) {
+ id = fsinfo->layouttype[i];
ld_type = find_pnfs_driver(id);
if (!ld_type) {
- dprintk("%s: No pNFS module found for %u.\n",
- __func__, id);
- goto out_no_driver;
+ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
+ id);
+ ld_type = find_pnfs_driver(id);
}
+ if (ld_type)
+ break;
+ }
+
+ if (!ld_type) {
+ dprintk("%s: No pNFS module found!\n", __func__);
+ goto out_no_driver;
}
+
server->pnfs_curr_ld = ld_type;
if (ld_type->set_layoutdriver
&& ld_type->set_layoutdriver(server, mntfh)) {
@@ -2185,10 +2230,8 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
{
- if (likely(!hdr->pnfs_error)) {
- __nfs4_read_done_cb(hdr);
+ if (likely(!hdr->pnfs_error))
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- }
trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_read_error(hdr);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 31d99b2927b0..5c295512c967 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -236,7 +236,7 @@ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
-void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
void unset_pnfs_layoutdriver(struct nfs_server *);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
@@ -657,7 +657,8 @@ pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
}
static inline void set_pnfs_layoutdriver(struct nfs_server *s,
- const struct nfs_fh *mntfh, u32 id)
+ const struct nfs_fh *mntfh,
+ struct nfs_fsinfo *fsinfo)
{
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index f3468b57a32a..53b4705abcc7 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -690,13 +690,50 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
dprintk("%s: DS %s: trying address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr);
- clp = nfs4_set_ds_client(mds_srv,
- (struct sockaddr *)&da->da_addr,
- da->da_addrlen, IPPROTO_TCP,
- timeo, retrans, minor_version,
- au_flavor);
- if (!IS_ERR(clp))
- break;
+ if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) {
+ struct xprt_create xprt_args = {
+ .ident = XPRT_TRANSPORT_TCP,
+ .net = clp->cl_net,
+ .dstaddr = (struct sockaddr *)&da->da_addr,
+ .addrlen = da->da_addrlen,
+ .servername = clp->cl_hostname,
+ };
+ struct nfs4_add_xprt_data xprtdata = {
+ .clp = clp,
+ .cred = nfs4_get_clid_cred(clp),
+ };
+ struct rpc_add_xprt_test rpcdata = {
+ .add_xprt_test = clp->cl_mvops->session_trunk,
+ .data = &xprtdata,
+ };
+
+ /**
+ * Test this address for session trunking and
+ * add as an alias
+ */
+ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+ rpc_clnt_setup_test_and_add_xprt,
+ &rpcdata);
+ if (xprtdata.cred)
+ put_rpccred(xprtdata.cred);
+ } else {
+ clp = nfs4_set_ds_client(mds_srv,
+ (struct sockaddr *)&da->da_addr,
+ da->da_addrlen, IPPROTO_TCP,
+ timeo, retrans, minor_version,
+ au_flavor);
+ if (IS_ERR(clp))
+ continue;
+
+ status = nfs4_init_ds_session(clp,
+ mds_srv->nfs_client->cl_lease_time);
+ if (status) {
+ nfs_put_client(clp);
+ clp = ERR_PTR(-EIO);
+ continue;
+ }
+
+ }
}
if (IS_ERR(clp)) {
@@ -704,18 +741,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
goto out;
}
- status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
- if (status)
- goto out_put;
-
smp_wmb();
ds->ds_clp = clp;
dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
out:
return status;
-out_put:
- nfs_put_client(clp);
- goto out;
}
/*
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 572e5b3b06f1..defc9233e985 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -295,7 +295,7 @@ int nfs_readpage(struct file *file, struct page *page)
int error;
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
- page, PAGE_SIZE, page_file_index(page));
+ page, PAGE_SIZE, page_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
nfs_add_stats(inode, NFSIOS_READPAGES, 1);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d39601381adf..001796bcd6c8 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2848,19 +2848,23 @@ out_invalid_transport_udp:
* NFS client for backwards compatibility
*/
unsigned int nfs_callback_set_tcpport;
+unsigned short nfs_callback_nr_threads;
/* Default cache timeout is 10 minutes */
unsigned int nfs_idmap_cache_timeout = 600;
/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
bool nfs4_disable_idmapping = true;
unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE;
unsigned short send_implementation_id = 1;
char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
bool recover_lost_locks = false;
+EXPORT_SYMBOL_GPL(nfs_callback_nr_threads);
EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
EXPORT_SYMBOL_GPL(max_session_slots);
+EXPORT_SYMBOL_GPL(max_session_cb_slots);
EXPORT_SYMBOL_GPL(send_implementation_id);
EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
EXPORT_SYMBOL_GPL(recover_lost_locks);
@@ -2887,6 +2891,9 @@ static const struct kernel_param_ops param_ops_portnr = {
#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
+module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644);
+MODULE_PARM_DESC(callback_nr_threads, "Number of threads that will be "
+ "assigned to the NFSv4 callback channels.");
module_param(nfs_idmap_cache_timeout, int, 0644);
module_param(nfs4_disable_idmapping, bool, 0644);
module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier,
@@ -2896,6 +2903,9 @@ MODULE_PARM_DESC(nfs4_disable_idmapping,
module_param(max_session_slots, ushort, 0644);
MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
"requests the client will negotiate");
+module_param(max_session_cb_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_slots, "Maximum number of parallel NFSv4.1 "
+ "callbacks the client will process for a given server");
module_param(send_implementation_id, ushort, 0644);
MODULE_PARM_DESC(send_implementation_id,
"Send implementation ID with NFSv4.1 exchange_id");
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3a6724c6eb5f..53211838f72a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -151,7 +151,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
spin_lock(&inode->i_lock);
i_size = i_size_read(inode);
end_index = (i_size - 1) >> PAGE_SHIFT;
- if (i_size > 0 && page_file_index(page) < end_index)
+ if (i_size > 0 && page_index(page) < end_index)
goto out;
end = page_file_offset(page) + ((loff_t)offset+count);
if (i_size >= end)
@@ -603,7 +603,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
{
int ret;
- nfs_pageio_cond_complete(pgio, page_file_index(page));
+ nfs_pageio_cond_complete(pgio, page_index(page));
ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
launder);
if (ret == -EAGAIN) {
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 9d46a0bdd9f9..62469c60be23 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -55,10 +55,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
goto oom;
for (i = 0; i < rqgi->ngroups; i++) {
- if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i)))
- GROUP_AT(gi, i) = exp->ex_anon_gid;
+ if (gid_eq(GLOBAL_ROOT_GID, rqgi->gid[i]))
+ gi->gid[i] = exp->ex_anon_gid;
else
- GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
+ gi->gid[i] = rqgi->gid[i];
}
} else {
gi = get_group_info(rqgi);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 5a1708441510..0780ff864539 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -123,7 +123,7 @@ nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
- lcp->lc_mtime = current_fs_time(inode->i_sb);
+ lcp->lc_mtime = current_time(inode);
iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index df880e9fa71f..b67287383010 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -126,6 +126,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
const struct nfsd4_layout_ops ff_layout_ops = {
.notify_types =
NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+ .disable_recalls = true,
.proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo,
.encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo,
.proc_layoutget = nfsd4_ff_proc_layoutget,
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 5fbf3bbd00d0..b10d557f9c9e 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -84,6 +84,7 @@ struct nfsd_net {
struct list_head client_lru;
struct list_head close_lru;
struct list_head del_recall_lru;
+ struct list_head blocked_locks_lru;
struct delayed_work laundromat_work;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 04c68d900324..211dc2aed8e1 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -448,7 +448,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
{
int status;
- if (cb->cb_minorversion == 0)
+ if (cb->cb_clp->cl_minorversion == 0)
return 0;
status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status);
@@ -485,7 +485,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
const struct nfs4_delegation *dp = cb_to_delegation(cb);
struct nfs4_cb_compound_hdr hdr = {
.ident = cb->cb_clp->cl_cb_ident,
- .minorversion = cb->cb_minorversion,
+ .minorversion = cb->cb_clp->cl_minorversion,
};
encode_cb_compound4args(xdr, &hdr);
@@ -594,7 +594,7 @@ static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
container_of(cb, struct nfs4_layout_stateid, ls_recall);
struct nfs4_cb_compound_hdr hdr = {
.ident = 0,
- .minorversion = cb->cb_minorversion,
+ .minorversion = cb->cb_clp->cl_minorversion,
};
encode_cb_compound4args(xdr, &hdr);
@@ -623,6 +623,62 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
}
#endif /* CONFIG_NFSD_PNFS */
+static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8 + 4 + so->so_owner.len);
+ p = xdr_encode_opaque_fixed(p, &so->so_client->cl_clientid, 8);
+ xdr_encode_opaque(p, so->so_owner.data, so->so_owner.len);
+}
+
+static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfsd4_callback *cb)
+{
+ const struct nfsd4_blocked_lock *nbl =
+ container_of(cb, struct nfsd4_blocked_lock, nbl_cb);
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner;
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = 0,
+ .minorversion = cb->cb_clp->cl_minorversion,
+ };
+
+ __be32 *p;
+
+ BUG_ON(hdr.minorversion == 0);
+
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+
+ p = xdr_reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_CB_NOTIFY_LOCK);
+ encode_nfs_fh4(xdr, &nbl->nbl_fh);
+ encode_stateowner(xdr, &lo->lo_owner);
+ hdr.nops++;
+
+ encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfsd4_callback *cb)
+{
+ struct nfs4_cb_compound_hdr hdr;
+ int status;
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ return status;
+
+ if (cb) {
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
+ }
+ return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status);
+}
+
/*
* RPC procedure tables
*/
@@ -643,6 +699,7 @@ static struct rpc_procinfo nfs4_cb_procedures[] = {
#ifdef CONFIG_NFSD_PNFS
PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout),
#endif
+ PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock),
};
static struct rpc_version nfs_cb_version4 = {
@@ -862,7 +919,6 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
struct nfs4_client *clp = cb->cb_clp;
u32 minorversion = clp->cl_minorversion;
- cb->cb_minorversion = minorversion;
/*
* cb_seq_status is only set in decode_cb_sequence4res,
* and so will remain 1 if an rpc level failure occurs.
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 2be9602b0221..42aace4fc4c8 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -174,7 +174,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
list_del_init(&ls->ls_perfile);
spin_unlock(&fp->fi_lock);
- vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
+ if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+ vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
fput(ls->ls_file);
if (ls->ls_recalled)
@@ -189,6 +190,9 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
struct file_lock *fl;
int status;
+ if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+ return 0;
+
fl = locks_alloc_lock();
if (!fl)
return -ENOMEM;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 1fb222752b2b..abb09b580389 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1010,47 +1010,97 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
static __be32
-nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- struct nfsd4_clone *clone)
+nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ stateid_t *src_stateid, struct file **src,
+ stateid_t *dst_stateid, struct file **dst)
{
- struct file *src, *dst;
__be32 status;
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
- &clone->cl_src_stateid, RD_STATE,
- &src, NULL);
+ src_stateid, RD_STATE, src, NULL);
if (status) {
dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
goto out;
}
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
- &clone->cl_dst_stateid, WR_STATE,
- &dst, NULL);
+ dst_stateid, WR_STATE, dst, NULL);
if (status) {
dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
goto out_put_src;
}
/* fix up for NFS-specific error code */
- if (!S_ISREG(file_inode(src)->i_mode) ||
- !S_ISREG(file_inode(dst)->i_mode)) {
+ if (!S_ISREG(file_inode(*src)->i_mode) ||
+ !S_ISREG(file_inode(*dst)->i_mode)) {
status = nfserr_wrong_type;
goto out_put_dst;
}
+out:
+ return status;
+out_put_dst:
+ fput(*dst);
+out_put_src:
+ fput(*src);
+ goto out;
+}
+
+static __be32
+nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_clone *clone)
+{
+ struct file *src, *dst;
+ __be32 status;
+
+ status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src,
+ &clone->cl_dst_stateid, &dst);
+ if (status)
+ goto out;
+
status = nfsd4_clone_file_range(src, clone->cl_src_pos,
dst, clone->cl_dst_pos, clone->cl_count);
-out_put_dst:
fput(dst);
-out_put_src:
fput(src);
out:
return status;
}
static __be32
+nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_copy *copy)
+{
+ struct file *src, *dst;
+ __be32 status;
+ ssize_t bytes;
+
+ status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid, &src,
+ &copy->cp_dst_stateid, &dst);
+ if (status)
+ goto out;
+
+ bytes = nfsd_copy_file_range(src, copy->cp_src_pos,
+ dst, copy->cp_dst_pos, copy->cp_count);
+
+ if (bytes < 0)
+ status = nfserrno(bytes);
+ else {
+ copy->cp_res.wr_bytes_written = bytes;
+ copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+ copy->cp_consecutive = 1;
+ copy->cp_synchronous = 1;
+ gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
+ status = nfs_ok;
+ }
+
+ fput(src);
+ fput(dst);
+out:
+ return status;
+}
+
+static __be32
nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_fallocate *fallocate, int flags)
{
@@ -1966,6 +2016,18 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
op_encode_channel_attrs_maxsz) * sizeof(__be32);
}
+static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 1 /* wr_callback */ +
+ op_encode_stateid_maxsz /* wr_callback */ +
+ 2 /* wr_count */ +
+ 1 /* wr_committed */ +
+ op_encode_verifier_maxsz +
+ 1 /* cr_consecutive */ +
+ 1 /* cr_synchronous */) * sizeof(__be32);
+}
+
#ifdef CONFIG_NFSD_PNFS
/*
* At this stage we don't really know what layout driver will handle the request,
@@ -2328,6 +2390,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_CLONE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
+ [OP_COPY] = {
+ .op_func = (nfsd4op_func)nfsd4_copy,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_COPY",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_copy_rsize,
+ },
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a204d7e109d4..9752beb78659 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -99,6 +99,7 @@ static struct kmem_cache *odstate_slab;
static void free_session(struct nfsd4_session *);
static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
static bool is_session_dead(struct nfsd4_session *ses)
{
@@ -210,6 +211,85 @@ static void nfsd4_put_session(struct nfsd4_session *ses)
spin_unlock(&nn->client_lock);
}
+static struct nfsd4_blocked_lock *
+find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+ struct nfsd_net *nn)
+{
+ struct nfsd4_blocked_lock *cur, *found = NULL;
+
+ spin_lock(&nn->client_lock);
+ list_for_each_entry(cur, &lo->lo_blocked, nbl_list) {
+ if (fh_match(fh, &cur->nbl_fh)) {
+ list_del_init(&cur->nbl_list);
+ list_del_init(&cur->nbl_lru);
+ found = cur;
+ break;
+ }
+ }
+ spin_unlock(&nn->client_lock);
+ if (found)
+ posix_unblock_lock(&found->nbl_lock);
+ return found;
+}
+
+static struct nfsd4_blocked_lock *
+find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+ struct nfsd_net *nn)
+{
+ struct nfsd4_blocked_lock *nbl;
+
+ nbl = find_blocked_lock(lo, fh, nn);
+ if (!nbl) {
+ nbl= kmalloc(sizeof(*nbl), GFP_KERNEL);
+ if (nbl) {
+ fh_copy_shallow(&nbl->nbl_fh, fh);
+ locks_init_lock(&nbl->nbl_lock);
+ nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client,
+ &nfsd4_cb_notify_lock_ops,
+ NFSPROC4_CLNT_CB_NOTIFY_LOCK);
+ }
+ }
+ return nbl;
+}
+
+static void
+free_blocked_lock(struct nfsd4_blocked_lock *nbl)
+{
+ locks_release_private(&nbl->nbl_lock);
+ kfree(nbl);
+}
+
+static int
+nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+ /*
+ * Since this is just an optimization, we don't try very hard if it
+ * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and
+ * just quit trying on anything else.
+ */
+ switch (task->tk_status) {
+ case -NFS4ERR_DELAY:
+ rpc_delay(task, 1 * HZ);
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+static void
+nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb)
+{
+ struct nfsd4_blocked_lock *nbl = container_of(cb,
+ struct nfsd4_blocked_lock, nbl_cb);
+
+ free_blocked_lock(nbl);
+}
+
+static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
+ .done = nfsd4_cb_notify_lock_done,
+ .release = nfsd4_cb_notify_lock_release,
+};
+
static inline struct nfs4_stateowner *
nfs4_get_stateowner(struct nfs4_stateowner *sop)
{
@@ -1903,7 +1983,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2)
if (g1->ngroups != g2->ngroups)
return false;
for (i=0; i<g1->ngroups; i++)
- if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i)))
+ if (!gid_eq(g1->gid[i], g2->gid[i]))
return false;
return true;
}
@@ -3224,9 +3304,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
goto out;
/* cases below refer to rfc 3530 section 14.2.34: */
if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
- if (conf && !unconf) /* case 2: probable retransmit */
+ if (conf && same_verf(&confirm, &conf->cl_confirm)) {
+ /* case 2: probable retransmit */
status = nfs_ok;
- else /* case 4: client hasn't noticed we rebooted yet? */
+ } else /* case 4: client hasn't noticed we rebooted yet? */
status = nfserr_stale_clientid;
goto out;
}
@@ -4410,9 +4491,11 @@ out:
* To finish the open response, we just need to set the rflags.
*/
open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
- if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
- !nfsd4_has_session(&resp->cstate))
+ if (nfsd4_has_session(&resp->cstate))
+ open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK;
+ else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED))
open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
+
if (dp)
nfs4_put_stid(&dp->dl_stid);
if (stp)
@@ -4501,6 +4584,7 @@ nfs4_laundromat(struct nfsd_net *nn)
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct nfs4_ol_stateid *stp;
+ struct nfsd4_blocked_lock *nbl;
struct list_head *pos, *next, reaplist;
time_t cutoff = get_seconds() - nn->nfsd4_lease;
time_t t, new_timeo = nn->nfsd4_lease;
@@ -4569,6 +4653,41 @@ nfs4_laundromat(struct nfsd_net *nn)
}
spin_unlock(&nn->client_lock);
+ /*
+ * It's possible for a client to try and acquire an already held lock
+ * that is being held for a long time, and then lose interest in it.
+ * So, we clean out any un-revisited request after a lease period
+ * under the assumption that the client is no longer interested.
+ *
+ * RFC5661, sec. 9.6 states that the client must not rely on getting
+ * notifications and must continue to poll for locks, even when the
+ * server supports them. Thus this shouldn't lead to clients blocking
+ * indefinitely once the lock does become free.
+ */
+ BUG_ON(!list_empty(&reaplist));
+ spin_lock(&nn->client_lock);
+ while (!list_empty(&nn->blocked_locks_lru)) {
+ nbl = list_first_entry(&nn->blocked_locks_lru,
+ struct nfsd4_blocked_lock, nbl_lru);
+ if (time_after((unsigned long)nbl->nbl_time,
+ (unsigned long)cutoff)) {
+ t = nbl->nbl_time - cutoff;
+ new_timeo = min(new_timeo, t);
+ break;
+ }
+ list_move(&nbl->nbl_lru, &reaplist);
+ list_del_init(&nbl->nbl_list);
+ }
+ spin_unlock(&nn->client_lock);
+
+ while (!list_empty(&reaplist)) {
+ nbl = list_first_entry(&nn->blocked_locks_lru,
+ struct nfsd4_blocked_lock, nbl_lru);
+ list_del_init(&nbl->nbl_lru);
+ posix_unblock_lock(&nbl->nbl_lock);
+ free_blocked_lock(nbl);
+ }
+
new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
return new_timeo;
}
@@ -5309,7 +5428,31 @@ nfsd4_fl_put_owner(fl_owner_t owner)
nfs4_put_stateowner(&lo->lo_owner);
}
+static void
+nfsd4_lm_notify(struct file_lock *fl)
+{
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner;
+ struct net *net = lo->lo_owner.so_client->net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd4_blocked_lock *nbl = container_of(fl,
+ struct nfsd4_blocked_lock, nbl_lock);
+ bool queue = false;
+
+ /* An empty list means that something else is going to be using it */
+ spin_lock(&nn->client_lock);
+ if (!list_empty(&nbl->nbl_list)) {
+ list_del_init(&nbl->nbl_list);
+ list_del_init(&nbl->nbl_lru);
+ queue = true;
+ }
+ spin_unlock(&nn->client_lock);
+
+ if (queue)
+ nfsd4_run_cb(&nbl->nbl_cb);
+}
+
static const struct lock_manager_operations nfsd_posix_mng_ops = {
+ .lm_notify = nfsd4_lm_notify,
.lm_get_owner = nfsd4_fl_get_owner,
.lm_put_owner = nfsd4_fl_put_owner,
};
@@ -5407,6 +5550,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
if (!lo)
return NULL;
+ INIT_LIST_HEAD(&lo->lo_blocked);
INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
lo->lo_owner.so_is_open_owner = 0;
lo->lo_owner.so_seqid = lock->lk_new_lock_seqid;
@@ -5588,12 +5732,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfs4_ol_stateid *open_stp = NULL;
struct nfs4_file *fp;
struct file *filp = NULL;
+ struct nfsd4_blocked_lock *nbl = NULL;
struct file_lock *file_lock = NULL;
struct file_lock *conflock = NULL;
__be32 status = 0;
int lkflg;
int err;
bool new = false;
+ unsigned char fl_type;
+ unsigned int fl_flags = FL_POSIX;
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -5658,46 +5805,55 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!locks_in_grace(net) && lock->lk_reclaim)
goto out;
- file_lock = locks_alloc_lock();
- if (!file_lock) {
- dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
- status = nfserr_jukebox;
- goto out;
- }
-
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
- case NFS4_READ_LT:
case NFS4_READW_LT:
+ if (nfsd4_has_session(cstate))
+ fl_flags |= FL_SLEEP;
+ /* Fallthrough */
+ case NFS4_READ_LT:
spin_lock(&fp->fi_lock);
filp = find_readable_file_locked(fp);
if (filp)
get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
spin_unlock(&fp->fi_lock);
- file_lock->fl_type = F_RDLCK;
+ fl_type = F_RDLCK;
break;
- case NFS4_WRITE_LT:
case NFS4_WRITEW_LT:
+ if (nfsd4_has_session(cstate))
+ fl_flags |= FL_SLEEP;
+ /* Fallthrough */
+ case NFS4_WRITE_LT:
spin_lock(&fp->fi_lock);
filp = find_writeable_file_locked(fp);
if (filp)
get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
spin_unlock(&fp->fi_lock);
- file_lock->fl_type = F_WRLCK;
+ fl_type = F_WRLCK;
break;
default:
status = nfserr_inval;
goto out;
}
+
if (!filp) {
status = nfserr_openmode;
goto out;
}
+ nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
+ if (!nbl) {
+ dprintk("NFSD: %s: unable to allocate block!\n", __func__);
+ status = nfserr_jukebox;
+ goto out;
+ }
+
+ file_lock = &nbl->nbl_lock;
+ file_lock->fl_type = fl_type;
file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
file_lock->fl_pid = current->tgid;
file_lock->fl_file = filp;
- file_lock->fl_flags = FL_POSIX;
+ file_lock->fl_flags = fl_flags;
file_lock->fl_lmops = &nfsd_posix_mng_ops;
file_lock->fl_start = lock->lk_offset;
file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
@@ -5710,18 +5866,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
+ if (fl_flags & FL_SLEEP) {
+ nbl->nbl_time = jiffies;
+ spin_lock(&nn->client_lock);
+ list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
+ list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru);
+ spin_unlock(&nn->client_lock);
+ }
+
err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
- switch (-err) {
+ switch (err) {
case 0: /* success! */
nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
status = 0;
break;
- case (EAGAIN): /* conflock holds conflicting lock */
+ case FILE_LOCK_DEFERRED:
+ nbl = NULL;
+ /* Fallthrough */
+ case -EAGAIN: /* conflock holds conflicting lock */
status = nfserr_denied;
dprintk("NFSD: nfsd4_lock: conflicting lock found!\n");
nfs4_set_lock_denied(conflock, &lock->lk_denied);
break;
- case (EDEADLK):
+ case -EDEADLK:
status = nfserr_deadlock;
break;
default:
@@ -5730,6 +5897,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
}
out:
+ if (nbl) {
+ /* dequeue it if we queued it before */
+ if (fl_flags & FL_SLEEP) {
+ spin_lock(&nn->client_lock);
+ list_del_init(&nbl->nbl_list);
+ list_del_init(&nbl->nbl_lru);
+ spin_unlock(&nn->client_lock);
+ }
+ free_blocked_lock(nbl);
+ }
if (filp)
fput(filp);
if (lock_stp) {
@@ -5753,8 +5930,6 @@ out:
if (open_stp)
nfs4_put_stid(&open_stp->st_stid);
nfsd4_bump_seqid(cstate, status);
- if (file_lock)
- locks_free_lock(file_lock);
if (conflock)
locks_free_lock(conflock);
return status;
@@ -6768,6 +6943,7 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->client_lru);
INIT_LIST_HEAD(&nn->close_lru);
INIT_LIST_HEAD(&nn->del_recall_lru);
+ INIT_LIST_HEAD(&nn->blocked_locks_lru);
spin_lock_init(&nn->client_lock);
INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
@@ -6865,6 +7041,7 @@ nfs4_state_shutdown_net(struct net *net)
struct nfs4_delegation *dp = NULL;
struct list_head *pos, *next, reaplist;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd4_blocked_lock *nbl;
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
@@ -6885,6 +7062,24 @@ nfs4_state_shutdown_net(struct net *net)
nfs4_put_stid(&dp->dl_stid);
}
+ BUG_ON(!list_empty(&reaplist));
+ spin_lock(&nn->client_lock);
+ while (!list_empty(&nn->blocked_locks_lru)) {
+ nbl = list_first_entry(&nn->blocked_locks_lru,
+ struct nfsd4_blocked_lock, nbl_lru);
+ list_move(&nbl->nbl_lru, &reaplist);
+ list_del_init(&nbl->nbl_list);
+ }
+ spin_unlock(&nn->client_lock);
+
+ while (!list_empty(&reaplist)) {
+ nbl = list_first_entry(&nn->blocked_locks_lru,
+ struct nfsd4_blocked_lock, nbl_lru);
+ list_del_init(&nbl->nbl_lru);
+ posix_unblock_lock(&nbl->nbl_lock);
+ free_blocked_lock(nbl);
+ }
+
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0aa0236a1429..c2d2895a1ec1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1694,6 +1694,30 @@ nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
}
static __be32
+nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
+{
+ DECODE_HEAD;
+ unsigned int tmp;
+
+ status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
+ if (status)
+ return status;
+ status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(8 + 8 + 8 + 4 + 4 + 4);
+ p = xdr_decode_hyper(p, &copy->cp_src_pos);
+ p = xdr_decode_hyper(p, &copy->cp_dst_pos);
+ p = xdr_decode_hyper(p, &copy->cp_count);
+ copy->cp_consecutive = be32_to_cpup(p++);
+ copy->cp_synchronous = be32_to_cpup(p++);
+ tmp = be32_to_cpup(p); /* Source server list not supported */
+
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
{
DECODE_HEAD;
@@ -1793,7 +1817,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
/* new operations for NFSv4.2 */
[OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
- [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy,
[OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
[OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -4062,7 +4086,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
u32 starting_len = xdr->buf->len, needed_len;
__be32 *p;
- dprintk("%s: err %d\n", __func__, nfserr);
+ dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr));
if (nfserr)
goto out;
@@ -4202,6 +4226,41 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
#endif /* CONFIG_NFSD_PNFS */
static __be32
+nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = cpu_to_be32(0);
+ p = xdr_encode_hyper(p, write->wr_bytes_written);
+ *p++ = cpu_to_be32(write->wr_stable_how);
+ p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
+ NFS4_VERIFIER_SIZE);
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_copy *copy)
+{
+ __be32 *p;
+
+ if (!nfserr) {
+ nfserr = nfsd42_encode_write_res(resp, &copy->cp_res);
+ if (nfserr)
+ return nfserr;
+
+ p = xdr_reserve_space(&resp->xdr, 4 + 4);
+ *p++ = cpu_to_be32(copy->cp_consecutive);
+ *p++ = cpu_to_be32(copy->cp_synchronous);
+ }
+ return nfserr;
+}
+
+static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_seek *seek)
{
@@ -4300,7 +4359,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
/* NFSv4.2 operations */
[OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy,
[OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop,
[OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
[OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 65ad0165a94f..36b2af931e06 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1216,6 +1216,8 @@ static __net_init int nfsd_init_net(struct net *net)
goto out_idmap_error;
nn->nfsd4_lease = 90; /* default lease time */
nn->nfsd4_grace = 90;
+ nn->clverifier_counter = prandom_u32();
+ nn->clientid_counter = prandom_u32();
return 0;
out_idmap_error:
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index e9214768cde9..010aff5c5a79 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -74,10 +74,10 @@ nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
* which only requires access, and "set-[ac]time-to-X" which
* requires ownership.
* So if it looks like it might be "set both to the same time which
- * is close to now", and if inode_change_ok fails, then we
+ * is close to now", and if setattr_prepare fails, then we
* convert to "set to now" instead of "set to explicit time"
*
- * We only call inode_change_ok as the last test as technically
+ * We only call setattr_prepare as the last test as technically
* it is not an interface that we should be using.
*/
#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
@@ -92,17 +92,15 @@ nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
* request is. We require it be within 30 minutes of now.
*/
time_t delta = iap->ia_atime.tv_sec - get_seconds();
- struct inode *inode;
nfserr = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
if (nfserr)
goto done;
- inode = d_inode(fhp->fh_dentry);
if (delta < 0)
delta = -delta;
if (delta < MAX_TOUCH_TIME_ERROR &&
- inode_change_ok(inode, iap) != 0) {
+ setattr_prepare(fhp->fh_dentry, iap) != 0) {
/*
* Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME.
* This will cause notify_change to set these times
@@ -791,6 +789,7 @@ nfserrno (int errno)
{ nfserr_toosmall, -ETOOSMALL },
{ nfserr_serverfault, -ESERVERFAULT },
{ nfserr_serverfault, -ENFILE },
+ { nfserr_io, -EUCLEAN },
};
int i;
@@ -798,7 +797,7 @@ nfserrno (int errno)
if (nfs_errtbl[i].syserr == errno)
return nfs_errtbl[i].nfserr;
}
- WARN(1, "nfsd: non-standard errno: %d\n", errno);
+ WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno);
return nfserr_io;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 45007acaf364..a2b65fc56dd6 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -366,14 +366,21 @@ static struct notifier_block nfsd_inet6addr_notifier = {
};
#endif
+/* Only used under nfsd_mutex, so this atomic may be overkill: */
+static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0);
+
static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
+ /* check if the notifier still has clients */
+ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
+ unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
#if IS_ENABLED(CONFIG_IPV6)
- unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
+ unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
#endif
+ }
+
/*
* write_ports can create the server without actually starting
* any threads--if we get shut down before any threads are
@@ -488,10 +495,13 @@ int nfsd_create_serv(struct net *net)
}
set_max_drc();
- register_inetaddr_notifier(&nfsd_inetaddr_notifier);
+ /* check if the notifier is already set */
+ if (atomic_inc_return(&nfsd_notifier_refcount) == 1) {
+ register_inetaddr_notifier(&nfsd_inetaddr_notifier);
#if IS_ENABLED(CONFIG_IPV6)
- register_inet6addr_notifier(&nfsd_inet6addr_notifier);
+ register_inet6addr_notifier(&nfsd_inet6addr_notifier);
#endif
+ }
do_gettimeofday(&nn->nfssvc_boot); /* record boot time */
return 0;
}
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index 0c2a716e8741..d27a5aa60022 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -19,6 +19,7 @@ struct nfsd4_deviceid_map {
struct nfsd4_layout_ops {
u32 notify_types;
+ bool disable_recalls;
__be32 (*proc_getdeviceinfo)(struct super_block *sb,
struct svc_rqst *rqstp,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index b95adf9a1595..c9399366f9df 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -63,7 +63,6 @@ typedef struct {
struct nfsd4_callback {
struct nfs4_client *cb_clp;
- u32 cb_minorversion;
struct rpc_message cb_msg;
const struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
@@ -441,11 +440,11 @@ struct nfs4_openowner {
/*
* Represents a generic "lockowner". Similar to an openowner. References to it
* are held by the lock stateids that are created on its behalf. This object is
- * a superset of the nfs4_stateowner struct (or would be if it needed any extra
- * fields).
+ * a superset of the nfs4_stateowner struct.
*/
struct nfs4_lockowner {
- struct nfs4_stateowner lo_owner; /* must be first element */
+ struct nfs4_stateowner lo_owner; /* must be first element */
+ struct list_head lo_blocked; /* blocked file_locks */
};
static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
@@ -572,6 +571,7 @@ enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_RECALL,
NFSPROC4_CLNT_CB_LAYOUT,
NFSPROC4_CLNT_CB_SEQUENCE,
+ NFSPROC4_CLNT_CB_NOTIFY_LOCK,
};
/* Returns true iff a is later than b: */
@@ -580,6 +580,20 @@ static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
return (s32)(a->si_generation - b->si_generation) > 0;
}
+/*
+ * When a client tries to get a lock on a file, we set one of these objects
+ * on the blocking lock. When the lock becomes free, we can then issue a
+ * CB_NOTIFY_LOCK to the server.
+ */
+struct nfsd4_blocked_lock {
+ struct list_head nbl_list;
+ struct list_head nbl_lru;
+ unsigned long nbl_time;
+ struct file_lock nbl_lock;
+ struct knfsd_fh nbl_fh;
+ struct nfsd4_callback nbl_cb;
+};
+
struct nfsd4_compound_state;
struct nfsd_net;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ff476e654b8f..8ca642fe9b21 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -513,6 +513,22 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
count));
}
+ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
+ u64 dst_pos, u64 count)
+{
+
+ /*
+ * Limit copy to 4MB to prevent indefinitely blocking an nfsd
+ * thread and client rpc slot. The choice of 4MB is somewhat
+ * arbitrary. We might instead base this on r/wsize, or make it
+ * tunable, or use a time instead of a byte limit, or implement
+ * asynchronous copy. In theory a client could also recognize a
+ * limit like this and pipeline multiple COPY requests.
+ */
+ count = min_t(u64, count, 1 << 22);
+ return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+}
+
__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset, loff_t len,
int flags)
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 3cbb1b33777b..0bf9e7bf5800 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -96,6 +96,8 @@ __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
struct svc_fh *res);
__be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
char *, int, struct svc_fh *);
+ssize_t nfsd_copy_file_range(struct file *, u64,
+ struct file *, u64, u64);
__be32 nfsd_rename(struct svc_rqst *,
struct svc_fh *, char *, int,
struct svc_fh *, char *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index beea0c5edc51..8fda4abdf3b1 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -503,6 +503,28 @@ struct nfsd4_clone {
u64 cl_count;
};
+struct nfsd42_write_res {
+ u64 wr_bytes_written;
+ u32 wr_stable_how;
+ nfs4_verifier wr_verifier;
+};
+
+struct nfsd4_copy {
+ /* request */
+ stateid_t cp_src_stateid;
+ stateid_t cp_dst_stateid;
+ u64 cp_src_pos;
+ u64 cp_dst_pos;
+ u64 cp_count;
+
+ /* both */
+ bool cp_consecutive;
+ bool cp_synchronous;
+
+ /* response */
+ struct nfsd42_write_res cp_res;
+};
+
struct nfsd4_seek {
/* request */
stateid_t seek_stateid;
@@ -568,6 +590,7 @@ struct nfsd4_op {
struct nfsd4_fallocate allocate;
struct nfsd4_fallocate deallocate;
struct nfsd4_clone clone;
+ struct nfsd4_copy copy;
struct nfsd4_seek seek;
} u;
struct nfs4_replay * replay;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c47f6fdb111a..49b719dfef95 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -28,3 +28,12 @@
#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
+
+#define NFS4_enc_cb_notify_lock_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 2 + 1 + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ enc_nfs4_fh_sz)
+#define NFS4_dec_cb_notify_lock_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 908ebbf0ac7e..582831ab3eb9 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -438,7 +438,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, mapping, from, to);
nilfs_put_page(page);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
}
/*
@@ -528,7 +528,7 @@ got_it:
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, page->mapping, from, to);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
nilfs_mark_inode_dirty(dir);
/* OFFSET_CACHE */
out_put:
@@ -576,7 +576,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
pde->rec_len = nilfs_rec_len_to_disk(to - from);
dir->inode = 0;
nilfs_commit_chunk(page, mapping, from, to);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
out:
nilfs_put_page(page);
return err;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index af04f553d7c9..c7f4fef9ebf5 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -367,7 +367,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
atomic64_inc(&root->inodes_count);
inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
err = nilfs_bmap_read(ii->i_bmap, NULL);
@@ -749,7 +749,7 @@ void nilfs_truncate(struct inode *inode)
nilfs_truncate_bmap(ii, blkoff);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
@@ -829,7 +829,7 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
struct super_block *sb = inode->i_sb;
int err;
- err = inode_change_ok(inode, iattr);
+ err = setattr_prepare(dentry, iattr);
if (err)
return err;
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f1d7989459fd..1d2c3d7711fe 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -174,7 +174,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
(flags & FS_FL_USER_MODIFIABLE);
nilfs_set_inode_flags(inode);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index dbcf1dc93a51..2b71c60fe982 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -194,7 +194,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
return err;
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -350,7 +350,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
}
static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
@@ -361,6 +362,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct nilfs_transaction_info ti;
int err;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
if (unlikely(err))
return err;
@@ -391,7 +395,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_dir;
nilfs_set_link(new_dir, new_de, new_page, old_inode);
nilfs_mark_inode_dirty(new_dir);
- new_inode->i_ctime = CURRENT_TIME;
+ new_inode->i_ctime = current_time(new_inode);
if (dir_de)
drop_nlink(new_inode);
drop_nlink(new_inode);
@@ -410,7 +414,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = CURRENT_TIME;
+ old_inode->i_ctime = current_time(old_inode);
nilfs_delete_entry(old_de, old_page);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index a64313868d3a..7ebfca6a1427 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -49,12 +49,12 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
* enough to fit in "count". Return an error pointer if the count
* is not large enough.
*
- * Called with the group->notification_mutex held.
+ * Called with the group->notification_lock held.
*/
static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
{
- BUG_ON(!mutex_is_locked(&group->notification_mutex));
+ assert_spin_locked(&group->notification_lock);
pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
@@ -64,7 +64,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
if (FAN_EVENT_METADATA_LEN > count)
return ERR_PTR(-EINVAL);
- /* held the notification_mutex the whole time, so this is the
+ /* held the notification_lock the whole time, so this is the
* same event we peeked above */
return fsnotify_remove_first_event(group);
}
@@ -147,7 +147,7 @@ static struct fanotify_perm_event_info *dequeue_event(
{
struct fanotify_perm_event_info *event, *return_e = NULL;
- spin_lock(&group->fanotify_data.access_lock);
+ spin_lock(&group->notification_lock);
list_for_each_entry(event, &group->fanotify_data.access_list,
fae.fse.list) {
if (event->fd != fd)
@@ -157,7 +157,7 @@ static struct fanotify_perm_event_info *dequeue_event(
return_e = event;
break;
}
- spin_unlock(&group->fanotify_data.access_lock);
+ spin_unlock(&group->notification_lock);
pr_debug("%s: found return_re=%p\n", __func__, return_e);
@@ -244,10 +244,10 @@ static unsigned int fanotify_poll(struct file *file, poll_table *wait)
int ret = 0;
poll_wait(file, &group->notification_waitq, wait);
- mutex_lock(&group->notification_mutex);
+ spin_lock(&group->notification_lock);
if (!fsnotify_notify_queue_is_empty(group))
ret = POLLIN | POLLRDNORM;
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
return ret;
}
@@ -268,9 +268,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
add_wait_queue(&group->notification_waitq, &wait);
while (1) {
- mutex_lock(&group->notification_mutex);
+ spin_lock(&group->notification_lock);
kevent = get_one_event(group, count);
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
if (IS_ERR(kevent)) {
ret = PTR_ERR(kevent);
@@ -309,10 +309,10 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
wake_up(&group->fanotify_data.access_waitq);
break;
}
- spin_lock(&group->fanotify_data.access_lock);
+ spin_lock(&group->notification_lock);
list_add_tail(&kevent->list,
&group->fanotify_data.access_list);
- spin_unlock(&group->fanotify_data.access_lock);
+ spin_unlock(&group->notification_lock);
#endif
}
buf += ret;
@@ -371,7 +371,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
* Process all permission events on access_list and notification queue
* and simulate reply from userspace.
*/
- spin_lock(&group->fanotify_data.access_lock);
+ spin_lock(&group->notification_lock);
list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
fae.fse.list) {
pr_debug("%s: found group=%p event=%p\n", __func__, group,
@@ -380,22 +380,22 @@ static int fanotify_release(struct inode *ignored, struct file *file)
list_del_init(&event->fae.fse.list);
event->response = FAN_ALLOW;
}
- spin_unlock(&group->fanotify_data.access_lock);
/*
* Destroy all non-permission events. For permission events just
* dequeue them and set the response. They will be freed once the
* response is consumed and fanotify_get_response() returns.
*/
- mutex_lock(&group->notification_mutex);
while (!fsnotify_notify_queue_is_empty(group)) {
fsn_event = fsnotify_remove_first_event(group);
- if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
+ if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) {
+ spin_unlock(&group->notification_lock);
fsnotify_destroy_event(group, fsn_event);
- else
+ spin_lock(&group->notification_lock);
+ } else
FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
}
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
/* Response for all permission events it set, wakeup waiters */
wake_up(&group->fanotify_data.access_waitq);
@@ -421,10 +421,10 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
switch (cmd) {
case FIONREAD:
- mutex_lock(&group->notification_mutex);
+ spin_lock(&group->notification_lock);
list_for_each_entry(fsn_event, &group->notification_list, list)
send_len += FAN_EVENT_METADATA_LEN;
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
ret = put_user(send_len, (int __user *) p);
break;
}
@@ -765,7 +765,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
event_f_flags |= O_LARGEFILE;
group->fanotify_data.f_flags = event_f_flags;
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- spin_lock_init(&group->fanotify_data.access_lock);
init_waitqueue_head(&group->fanotify_data.access_waitq);
INIT_LIST_HEAD(&group->fanotify_data.access_list);
#endif
diff --git a/fs/notify/group.c b/fs/notify/group.c
index b47f7cfdcaa4..fbe3cbebec16 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -45,9 +45,9 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
*/
void fsnotify_group_stop_queueing(struct fsnotify_group *group)
{
- mutex_lock(&group->notification_mutex);
+ spin_lock(&group->notification_lock);
group->shutdown = true;
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
}
/*
@@ -125,7 +125,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
atomic_set(&group->refcnt, 1);
atomic_set(&group->num_marks, 0);
- mutex_init(&group->notification_mutex);
+ spin_lock_init(&group->notification_lock);
INIT_LIST_HEAD(&group->notification_list);
init_waitqueue_head(&group->notification_waitq);
group->max_events = UINT_MAX;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index b8d08d0d0a4d..69d1ea3d292a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -115,10 +115,10 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
int ret = 0;
poll_wait(file, &group->notification_waitq, wait);
- mutex_lock(&group->notification_mutex);
+ spin_lock(&group->notification_lock);
if (!fsnotify_notify_queue_is_empty(group))
ret = POLLIN | POLLRDNORM;
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
return ret;
}
@@ -138,7 +138,7 @@ static int round_event_name_len(struct fsnotify_event *fsn_event)
* enough to fit in "count". Return an error pointer if
* not large enough.
*
- * Called with the group->notification_mutex held.
+ * Called with the group->notification_lock held.
*/
static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
@@ -157,7 +157,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
if (event_size > count)
return ERR_PTR(-EINVAL);
- /* held the notification_mutex the whole time, so this is the
+ /* held the notification_lock the whole time, so this is the
* same event we peeked above */
fsnotify_remove_first_event(group);
@@ -234,9 +234,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
add_wait_queue(&group->notification_waitq, &wait);
while (1) {
- mutex_lock(&group->notification_mutex);
+ spin_lock(&group->notification_lock);
kevent = get_one_event(group, count);
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
@@ -300,13 +300,13 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case FIONREAD:
- mutex_lock(&group->notification_mutex);
+ spin_lock(&group->notification_lock);
list_for_each_entry(fsn_event, &group->notification_list,
list) {
send_len += sizeof(struct inotify_event);
send_len += round_event_name_len(fsn_event);
}
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
ret = put_user(send_len, (int __user *) p);
break;
}
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index e455e83ceeeb..66f85c651c52 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
/* return true if the notify queue is empty, false otherwise */
bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
{
- BUG_ON(!mutex_is_locked(&group->notification_mutex));
+ assert_spin_locked(&group->notification_lock);
return list_empty(&group->notification_list) ? true : false;
}
@@ -73,8 +73,17 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
/* Overflow events are per-group and we don't want to free them */
if (!event || event->mask == FS_Q_OVERFLOW)
return;
- /* If the event is still queued, we have a problem... */
- WARN_ON(!list_empty(&event->list));
+ /*
+ * If the event is still queued, we have a problem... Do an unreliable
+ * lockless check first to avoid locking in the common case. The
+ * locking may be necessary for permission events which got removed
+ * from the list by a different CPU than the one freeing the event.
+ */
+ if (!list_empty(&event->list)) {
+ spin_lock(&group->notification_lock);
+ WARN_ON(!list_empty(&event->list));
+ spin_unlock(&group->notification_lock);
+ }
group->ops->free_event(event);
}
@@ -95,10 +104,10 @@ int fsnotify_add_event(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- mutex_lock(&group->notification_mutex);
+ spin_lock(&group->notification_lock);
if (group->shutdown) {
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
return 2;
}
@@ -106,7 +115,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
ret = 2;
/* Queue overflow event only if it isn't already queued */
if (!list_empty(&group->overflow_event->list)) {
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
return ret;
}
event = group->overflow_event;
@@ -116,7 +125,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
if (!list_empty(list) && merge) {
ret = merge(list, event);
if (ret) {
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
return ret;
}
}
@@ -124,7 +133,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
queue:
group->q_len++;
list_add_tail(&event->list, list);
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
wake_up(&group->notification_waitq);
kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
@@ -139,7 +148,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
{
struct fsnotify_event *event;
- BUG_ON(!mutex_is_locked(&group->notification_mutex));
+ assert_spin_locked(&group->notification_lock);
pr_debug("%s: group=%p\n", __func__, group);
@@ -161,7 +170,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
*/
struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
{
- BUG_ON(!mutex_is_locked(&group->notification_mutex));
+ assert_spin_locked(&group->notification_lock);
return list_first_entry(&group->notification_list,
struct fsnotify_event, list);
@@ -175,12 +184,14 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
{
struct fsnotify_event *event;
- mutex_lock(&group->notification_mutex);
+ spin_lock(&group->notification_lock);
while (!fsnotify_notify_queue_is_empty(group)) {
event = fsnotify_remove_first_event(group);
+ spin_unlock(&group->notification_lock);
fsnotify_destroy_event(group, event);
+ spin_lock(&group->notification_lock);
}
- mutex_unlock(&group->notification_mutex);
+ spin_unlock(&group->notification_lock);
}
/*
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8f20d6016e20..8718af895eab 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -5,11 +5,16 @@
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>
+#include <linux/user_namespace.h>
+#include <linux/nsfs.h>
static struct vfsmount *nsfs_mnt;
+static long ns_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg);
static const struct file_operations ns_file_operations = {
.llseek = no_llseek,
+ .unlocked_ioctl = ns_ioctl,
};
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -44,22 +49,14 @@ static void nsfs_evict(struct inode *inode)
ns->ops->put(ns);
}
-void *ns_get_path(struct path *path, struct task_struct *task,
- const struct proc_ns_operations *ns_ops)
+static void *__ns_get_path(struct path *path, struct ns_common *ns)
{
- struct vfsmount *mnt = mntget(nsfs_mnt);
+ struct vfsmount *mnt = nsfs_mnt;
struct qstr qname = { .name = "", };
struct dentry *dentry;
struct inode *inode;
- struct ns_common *ns;
unsigned long d;
-again:
- ns = ns_ops->get(task);
- if (!ns) {
- mntput(mnt);
- return ERR_PTR(-ENOENT);
- }
rcu_read_lock();
d = atomic_long_read(&ns->stashed);
if (!d)
@@ -68,21 +65,20 @@ again:
if (!lockref_get_not_dead(&dentry->d_lockref))
goto slow;
rcu_read_unlock();
- ns_ops->put(ns);
+ ns->ops->put(ns);
got_it:
- path->mnt = mnt;
+ path->mnt = mntget(mnt);
path->dentry = dentry;
return NULL;
slow:
rcu_read_unlock();
inode = new_inode_pseudo(mnt->mnt_sb);
if (!inode) {
- ns_ops->put(ns);
- mntput(mnt);
+ ns->ops->put(ns);
return ERR_PTR(-ENOMEM);
}
inode->i_ino = ns->inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_flags |= S_IMMUTABLE;
inode->i_mode = S_IFREG | S_IRUGO;
inode->i_fop = &ns_file_operations;
@@ -91,21 +87,96 @@ slow:
dentry = d_alloc_pseudo(mnt->mnt_sb, &qname);
if (!dentry) {
iput(inode);
- mntput(mnt);
return ERR_PTR(-ENOMEM);
}
d_instantiate(dentry, inode);
- dentry->d_fsdata = (void *)ns_ops;
+ dentry->d_fsdata = (void *)ns->ops;
d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
if (d) {
d_delete(dentry); /* make sure ->d_prune() does nothing */
dput(dentry);
cpu_relax();
- goto again;
+ return ERR_PTR(-EAGAIN);
}
goto got_it;
}
+void *ns_get_path(struct path *path, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct ns_common *ns;
+ void *ret;
+
+again:
+ ns = ns_ops->get(task);
+ if (!ns)
+ return ERR_PTR(-ENOENT);
+
+ ret = __ns_get_path(path, ns);
+ if (IS_ERR(ret) && PTR_ERR(ret) == -EAGAIN)
+ goto again;
+ return ret;
+}
+
+static int open_related_ns(struct ns_common *ns,
+ struct ns_common *(*get_ns)(struct ns_common *ns))
+{
+ struct path path = {};
+ struct file *f;
+ void *err;
+ int fd;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ while (1) {
+ struct ns_common *relative;
+
+ relative = get_ns(ns);
+ if (IS_ERR(relative)) {
+ put_unused_fd(fd);
+ return PTR_ERR(relative);
+ }
+
+ err = __ns_get_path(&path, relative);
+ if (IS_ERR(err) && PTR_ERR(err) == -EAGAIN)
+ continue;
+ break;
+ }
+ if (IS_ERR(err)) {
+ put_unused_fd(fd);
+ return PTR_ERR(err);
+ }
+
+ f = dentry_open(&path, O_RDONLY, current_cred());
+ path_put(&path);
+ if (IS_ERR(f)) {
+ put_unused_fd(fd);
+ fd = PTR_ERR(f);
+ } else
+ fd_install(fd, f);
+
+ return fd;
+}
+
+static long ns_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct ns_common *ns = get_proc_ns(file_inode(filp));
+
+ switch (ioctl) {
+ case NS_GET_USERNS:
+ return open_related_ns(ns, ns_get_owner);
+ case NS_GET_PARENT:
+ if (!ns->ops->get_parent)
+ return -EINVAL;
+ return open_related_ns(ns, ns->ops->get_parent);
+ default:
+ return -ENOTTY;
+ }
+}
+
int ns_get_name(char *buf, size_t size, struct task_struct *task,
const struct proc_ns_operations *ns_ops)
{
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f548629dfaac..bf72a2c58b75 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1850,7 +1850,7 @@ again:
* pages being swapped out between us bringing them into memory
* and doing the actual copying.
*/
- if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) {
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
break;
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index e01287c964a8..7c410f879412 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2813,7 +2813,7 @@ done:
* for real.
*/
if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
- struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb);
+ struct timespec now = current_time(VFS_I(base_ni));
int sync_it = 0;
if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) ||
@@ -2893,7 +2893,7 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
int err;
unsigned int ia_valid = attr->ia_valid;
- err = inode_change_ok(vi, attr);
+ err = setattr_prepare(dentry, attr);
if (err)
goto out;
/* We do not support NTFS ACLs yet. */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index d15d492ce47b..d3c009626032 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2692,7 +2692,7 @@ mft_rec_already_initialized:
/* Set the inode times to the current time. */
vi->i_atime = vi->i_mtime = vi->i_ctime =
- current_fs_time(vi->i_sb);
+ current_time(vi);
/*
* Set the file size to 0, the ntfs inode sizes are set to 0 by
* the call to ntfs_init_big_inode() below.
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 2162434728c0..bed1fcb63088 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -201,7 +201,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_mode = new_mode;
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
di->i_mode = cpu_to_le16(inode->i_mode);
di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
@@ -241,13 +241,11 @@ int ocfs2_set_acl(handle_t *handle,
case ACL_TYPE_ACCESS:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- umode_t mode = inode->i_mode;
- ret = posix_acl_equiv_mode(acl, &mode);
- if (ret < 0)
- return ret;
+ umode_t mode;
- if (ret == 0)
- acl = NULL;
+ ret = posix_acl_update_mode(inode, &mode, &acl);
+ if (ret)
+ return ret;
ret = ocfs2_acl_set_mode(inode, di_bh,
handle, mode);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f165f867f332..f72712f6c28d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7293,7 +7293,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 98d36548153d..c5c5b9748ea3 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1842,6 +1842,16 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out:
+ /*
+ * The mmapped page won't be unlocked in ocfs2_free_write_ctxt(),
+ * even in case of error here like ENOSPC and ENOMEM. So, we need
+ * to unlock the target page manually to prevent deadlocks when
+ * retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED
+ * to VM code.
+ */
+ if (wc->w_target_locked)
+ unlock_page(mmap_page);
+
ocfs2_free_write_ctxt(inode, wc);
if (data_ac) {
@@ -2020,7 +2030,7 @@ out_write_size:
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 1);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1d67fcbf7160..8abab16b4602 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -2104,7 +2104,7 @@ int o2net_start_listening(struct o2nm_node *node)
BUG_ON(o2net_listen_sock != NULL);
mlog(ML_KTHREAD, "starting o2net thread...\n");
- o2net_wq = create_singlethread_workqueue("o2net");
+ o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM);
if (o2net_wq == NULL) {
mlog(ML_ERROR, "unable to launch o2net thread\n");
return -ENOMEM; /* ? */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index e1adf285fc31..e7054e2ac922 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1677,7 +1677,7 @@ int __ocfs2_add_entry(handle_t *handle,
offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
if (ocfs2_dirent_would_fit(de, rec_len)) {
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (retval < 0) {
mlog_errno(retval);
@@ -2990,7 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_dinode_new_extent_list(dir, di);
i_size_write(dir, sb->s_blocksize);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
di->i_size = cpu_to_le64(sb->s_blocksize);
di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 533bd524e41e..733e4e79c8e2 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1904,7 +1904,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
}
snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
- dlm->dlm_worker = create_singlethread_workqueue(wq_name);
+ dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0);
if (!dlm->dlm_worker) {
status = -ENOMEM;
mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 6ea06f8a7d29..3f828a187049 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -3188,6 +3188,9 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
migrate->new_master,
migrate->master);
+ if (ret < 0)
+ kmem_cache_free(dlm_mle_cache, mle);
+
spin_unlock(&dlm->master_lock);
unlock:
spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index ef474cdd6404..1079fae5aa12 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -211,7 +211,7 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *inode = d_inode(dentry);
attr->ia_valid &= ~ATTR_SIZE;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
@@ -398,7 +398,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(inode, NULL, mode);
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
@@ -421,7 +421,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inode->i_ino = get_next_ino();
inode_init_owner(inode, parent, mode);
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
ip = DLMFS_I(inode);
ip->ip_conn = DLMFS_I(parent)->ip_conn;
@@ -646,7 +646,7 @@ static int __init init_dlmfs_fs(void)
}
cleanup_inode = 1;
- user_dlm_worker = create_singlethread_workqueue("user_dlm");
+ user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0);
if (!user_dlm_worker) {
status = -ENOMEM;
goto bail;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0b055bfb8e86..000c234d7bbd 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -253,7 +253,7 @@ int ocfs2_should_update_atime(struct inode *inode,
return 0;
}
- now = CURRENT_TIME;
+ now = current_time(inode);
if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
return 0;
else
@@ -287,7 +287,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
* have i_mutex to guard against concurrent changes to other
* inode fields.
*/
- inode->i_atime = CURRENT_TIME;
+ inode->i_atime = current_time(inode);
di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 0);
@@ -308,7 +308,7 @@ int ocfs2_set_inode_size(handle_t *handle,
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
@@ -429,7 +429,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
}
i_size_write(inode, new_i_size);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
di = (struct ocfs2_dinode *) fe_bh->b_data;
di->i_size = cpu_to_le64(new_i_size);
@@ -840,7 +840,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
i_size_write(inode, abs_to);
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
di->i_mtime_nsec = di->i_ctime_nsec;
@@ -1155,7 +1155,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
return 0;
- status = inode_change_ok(inode, attr);
+ status = setattr_prepare(dentry, attr);
if (status)
return status;
@@ -1950,7 +1950,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (change_size && i_size_read(inode) < size)
i_size_write(inode, size);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
if (ret < 0)
mlog_errno(ret);
@@ -2321,36 +2321,6 @@ out_mutex:
return ret;
}
-static ssize_t ocfs2_file_splice_read(struct file *in,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags)
-{
- int ret = 0, lock_level = 0;
- struct inode *inode = file_inode(in);
-
- trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- in->f_path.dentry->d_name.len,
- in->f_path.dentry->d_name.name, len);
-
- /*
- * See the comment in ocfs2_file_read_iter()
- */
- ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail;
- }
- ocfs2_inode_unlock(inode, lock_level);
-
- ret = generic_file_splice_read(in, ppos, pipe, len, flags);
-
-bail:
- return ret;
-}
-
static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
struct iov_iter *to)
{
@@ -2474,10 +2444,7 @@ const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
- .removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
.get_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
@@ -2509,7 +2476,7 @@ const struct file_operations ocfs2_fops = {
#endif
.lock = ocfs2_lock,
.flock = ocfs2_flock,
- .splice_read = ocfs2_file_splice_read,
+ .splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
};
@@ -2554,7 +2521,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.flock = ocfs2_flock,
- .splice_read = ocfs2_file_splice_read,
+ .splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
};
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 50cc55047443..5af68fcdf9d3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -123,8 +123,6 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
-extern struct kmem_cache *ocfs2_inode_cache;
-
extern const struct address_space_operations ocfs2_aops;
extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index e3d05d9901a3..4e8f32eb0bdb 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -953,7 +953,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
}
di = (struct ocfs2_dinode *)di_bh->b_data;
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 0);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a8f1225e6d9b..8d887c75765c 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -798,7 +798,7 @@ static int ocfs2_link(struct dentry *old_dentry,
}
inc_nlink(inode);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
@@ -1000,7 +1000,7 @@ static int ocfs2_unlink(struct inode *dir,
ocfs2_set_links_count(fe, inode->i_nlink);
ocfs2_journal_dirty(handle, fe_bh);
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
if (S_ISDIR(inode->i_mode))
drop_nlink(dir);
@@ -1203,7 +1203,8 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
static int ocfs2_rename(struct inode *old_dir,
struct dentry *old_dentry,
struct inode *new_dir,
- struct dentry *new_dentry)
+ struct dentry *new_dentry,
+ unsigned int flags)
{
int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
@@ -1228,6 +1229,9 @@ static int ocfs2_rename(struct inode *old_dir,
struct ocfs2_dir_lookup_result target_insert = { NULL, };
bool should_add_orphan = false;
+ if (flags)
+ return -EINVAL;
+
/* At some point it might be nice to break this function up a
* bit. */
@@ -1537,7 +1541,7 @@ static int ocfs2_rename(struct inode *old_dir,
new_dir_bh, &target_insert);
}
- old_inode->i_ctime = CURRENT_TIME;
+ old_inode->i_ctime = current_time(old_inode);
mark_inode_dirty(old_inode);
status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
@@ -1586,9 +1590,9 @@ static int ocfs2_rename(struct inode *old_dir,
if (new_inode) {
drop_nlink(new_inode);
- new_inode->i_ctime = CURRENT_TIME;
+ new_inode->i_ctime = current_time(new_inode);
}
- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+ old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
@@ -2913,10 +2917,7 @@ const struct inode_operations ocfs2_dir_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
- .removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
.get_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index f8f5fc5e6c05..0b58abcf1c6d 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1314,8 +1314,6 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
-DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read);
-
DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 92bbe93bfe10..19238512a324 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3778,7 +3778,7 @@ static int ocfs2_change_ctime(struct inode *inode,
goto out_commit;
}
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
@@ -4094,7 +4094,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
* we want mtime to appear identical to the source and
* update ctime.
*/
- t_inode->i_ctime = CURRENT_TIME;
+ t_inode->i_ctime = current_time(t_inode);
di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 603b28d6f008..f56fe39fab04 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2329,7 +2329,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
cleancache_init_shared_fs(sb);
- osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+ osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM);
if (!osb->ocfs2_wq) {
status = -ENOMEM;
mlog_errno(status);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 6c2a3e3c521c..6ad8eecefe21 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -91,9 +91,6 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
.get_link = page_get_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
- .removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
};
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 5bb44f7a78ee..cb157a34a656 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3431,7 +3431,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c8cbf3b60645..b7146526afff 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -143,7 +143,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode)
mark_buffer_dirty(bh);
brelse(bh);
- dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_ctime = current_time(dir);
/* mark affected inodes dirty to rebuild checksums */
mark_inode_dirty(dir);
@@ -371,12 +371,16 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
}
static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
int err;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
if (new_inode) {
/* overwriting existing file/dir */
err = omfs_remove(new_dir, new_dentry);
@@ -395,7 +399,7 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto out;
- old_inode->i_ctime = CURRENT_TIME_SEC;
+ old_inode->i_ctime = current_time(old_inode);
mark_inode_dirty(old_inode);
out:
return err;
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d9e26cfbb793..bf83e6644333 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -349,7 +349,7 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *inode = d_inode(dentry);
int error;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 3d935c81789a..df7ea8543a2e 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -49,7 +49,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
inode_init_owner(inode, NULL, mode);
inode->i_mapping->a_ops = &omfs_aops;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_op = &omfs_dir_inops;
diff --git a/fs/open.c b/fs/open.c
index 4fd6e256f4f4..d3ed8171e8e0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -68,6 +68,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
long vfs_truncate(const struct path *path, loff_t length)
{
struct inode *inode;
+ struct dentry *upperdentry;
long error;
inode = path->dentry->d_inode;
@@ -90,7 +91,17 @@ long vfs_truncate(const struct path *path, loff_t length)
if (IS_APPEND(inode))
goto mnt_drop_write_and_out;
- error = get_write_access(inode);
+ /*
+ * If this is an overlayfs then do as if opening the file so we get
+ * write access on the upper inode, not on the overlay inode. For
+ * non-overlay filesystems d_real() is an identity function.
+ */
+ upperdentry = d_real(path->dentry, NULL, O_WRONLY);
+ error = PTR_ERR(upperdentry);
+ if (IS_ERR(upperdentry))
+ goto mnt_drop_write_and_out;
+
+ error = get_write_access(upperdentry->d_inode);
if (error)
goto mnt_drop_write_and_out;
@@ -109,7 +120,7 @@ long vfs_truncate(const struct path *path, loff_t length)
error = do_truncate(path->dentry, length, 0, NULL);
put_write_and_out:
- put_write_access(inode);
+ put_write_access(upperdentry->d_inode);
mnt_drop_write_and_out:
mnt_drop_write(path->mnt);
out:
@@ -256,6 +267,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
(mode & ~FALLOC_FL_INSERT_RANGE))
return -EINVAL;
+ /* Unshare range should only be used with allocate mode. */
+ if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
+ (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
+ return -EINVAL;
+
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
@@ -289,7 +305,8 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
* Let individual file system decide if it supports preallocation
* for directories or not.
*/
- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
+ !S_ISBLK(inode->i_mode))
return -ENODEV;
/* Check for wrap through zero too */
@@ -726,7 +743,7 @@ static int do_dentry_open(struct file *f,
if (error)
goto cleanup_all;
- error = break_lease(inode, f->f_flags);
+ error = break_lease(locks_inode(f), f->f_flags);
if (error)
goto cleanup_all;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index c7a86993d97e..c003a667ed1a 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -355,7 +355,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
if (!inode)
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
if (inode->i_ino == OPENPROM_ROOT_INO) {
inode->i_op = &openprom_inode_operations;
inode->i_fop = &openprom_operations;
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 28f2195cd798..7a3754488312 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -73,14 +73,11 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
- umode_t mode = inode->i_mode;
- /*
- * can we represent this with the traditional file
- * mode permission bits?
- */
- error = posix_acl_equiv_mode(acl, &mode);
- if (error < 0) {
- gossip_err("%s: posix_acl_equiv_mode err: %d\n",
+ umode_t mode;
+
+ error = posix_acl_update_mode(inode, &mode, &acl);
+ if (error) {
+ gossip_err("%s: posix_acl_update_mode err: %d\n",
__func__,
error);
return error;
@@ -90,8 +87,6 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
SetModeFlag(orangefs_inode);
inode->i_mode = mode;
mark_inode_dirty_sync(inode);
- if (error == 0)
- acl = NULL;
}
break;
case ACL_TYPE_DEFAULT:
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index 00235bf644dc..5355efba4bc8 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -73,7 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
}
}
- dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ orangefs_set_timeout(dentry);
ret = 1;
out_release_op:
op_release(new_op);
@@ -94,8 +94,9 @@ out_drop:
static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
int ret;
+ unsigned long time = (unsigned long) dentry->d_fsdata;
- if (time_before(jiffies, dentry->d_time))
+ if (time_before(jiffies, time))
return 1;
if (flags & LOOKUP_RCU)
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index a287a66d94e3..516ffb4dc9a0 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -11,14 +11,19 @@
#include "orangefs-kernel.h"
#include "orangefs-dev-proto.h"
#include "orangefs-bufmap.h"
+#include "orangefs-debugfs.h"
#include <linux/debugfs.h>
#include <linux/slab.h>
/* this file implements the /dev/pvfs2-req device node */
+uint32_t orangefs_userspace_version;
+
static int open_access_count;
+static DEFINE_MUTEX(devreq_mutex);
+
#define DUMP_DEVICE_ERROR() \
do { \
gossip_err("*****************************************************\n");\
@@ -43,7 +48,7 @@ static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
{
int index = hash_func(op->tag, hash_table_size);
- list_add_tail(&op->list, &htable_ops_in_progress[index]);
+ list_add_tail(&op->list, &orangefs_htable_ops_in_progress[index]);
}
/*
@@ -57,20 +62,20 @@ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
index = hash_func(tag, hash_table_size);
- spin_lock(&htable_ops_in_progress_lock);
+ spin_lock(&orangefs_htable_ops_in_progress_lock);
list_for_each_entry_safe(op,
next,
- &htable_ops_in_progress[index],
+ &orangefs_htable_ops_in_progress[index],
list) {
if (op->tag == tag && !op_state_purged(op) &&
!op_state_given_up(op)) {
list_del_init(&op->list);
- spin_unlock(&htable_ops_in_progress_lock);
+ spin_unlock(&orangefs_htable_ops_in_progress_lock);
return op;
}
}
- spin_unlock(&htable_ops_in_progress_lock);
+ spin_unlock(&orangefs_htable_ops_in_progress_lock);
return NULL;
}
@@ -276,11 +281,11 @@ restart:
if (ret != 0)
goto error;
- spin_lock(&htable_ops_in_progress_lock);
+ spin_lock(&orangefs_htable_ops_in_progress_lock);
spin_lock(&cur_op->lock);
if (unlikely(op_state_given_up(cur_op))) {
spin_unlock(&cur_op->lock);
- spin_unlock(&htable_ops_in_progress_lock);
+ spin_unlock(&orangefs_htable_ops_in_progress_lock);
complete(&cur_op->waitq);
goto restart;
}
@@ -298,7 +303,7 @@ restart:
current->comm);
orangefs_devreq_add_op(cur_op);
spin_unlock(&cur_op->lock);
- spin_unlock(&htable_ops_in_progress_lock);
+ spin_unlock(&orangefs_htable_ops_in_progress_lock);
/* The client only asks to read one size buffer. */
return MAX_DEV_REQ_UPSIZE;
@@ -387,6 +392,13 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
return -EPROTO;
}
+ if (!orangefs_userspace_version) {
+ orangefs_userspace_version = head.version;
+ } else if (orangefs_userspace_version != head.version) {
+ gossip_err("Error: userspace version changes\n");
+ return -EPROTO;
+ }
+
/* remove the op from the in progress hash table */
op = orangefs_devreq_remove_op(head.tag);
if (!op) {
@@ -527,6 +539,7 @@ static int orangefs_devreq_release(struct inode *inode, struct file *file)
gossip_debug(GOSSIP_DEV_DEBUG,
"pvfs2-client-core: device close complete\n");
open_access_count = 0;
+ orangefs_userspace_version = 0;
mutex_unlock(&devreq_mutex);
return 0;
}
@@ -576,8 +589,6 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE;
struct ORANGEFS_dev_map_desc user_desc;
int ret = 0;
- struct dev_mask_info_s mask_info = { 0 };
- struct dev_mask2_info_s mask2_info = { 0, 0 };
int upstream_kmod = 1;
struct orangefs_sb_info_s *orangefs_sb;
@@ -619,7 +630,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
* all of the remounts are serviced (to avoid ops between
* mounts to fail)
*/
- ret = mutex_lock_interruptible(&request_mutex);
+ ret = mutex_lock_interruptible(&orangefs_request_mutex);
if (ret < 0)
return ret;
gossip_debug(GOSSIP_DEV_DEBUG,
@@ -654,7 +665,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
gossip_debug(GOSSIP_DEV_DEBUG,
"%s: priority remount complete\n",
__func__);
- mutex_unlock(&request_mutex);
+ mutex_unlock(&orangefs_request_mutex);
return ret;
case ORANGEFS_DEV_UPSTREAM:
@@ -668,134 +679,11 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
return ret;
case ORANGEFS_DEV_CLIENT_MASK:
- ret = copy_from_user(&mask2_info,
- (void __user *)arg,
- sizeof(struct dev_mask2_info_s));
-
- if (ret != 0)
- return -EIO;
-
- client_debug_mask.mask1 = mask2_info.mask1_value;
- client_debug_mask.mask2 = mask2_info.mask2_value;
-
- pr_info("%s: client debug mask has been been received "
- ":%llx: :%llx:\n",
- __func__,
- (unsigned long long)client_debug_mask.mask1,
- (unsigned long long)client_debug_mask.mask2);
-
- return ret;
-
+ return orangefs_debugfs_new_client_mask((void __user *)arg);
case ORANGEFS_DEV_CLIENT_STRING:
- ret = copy_from_user(&client_debug_array_string,
- (void __user *)arg,
- ORANGEFS_MAX_DEBUG_STRING_LEN);
- /*
- * The real client-core makes an effort to ensure
- * that actual strings that aren't too long to fit in
- * this buffer is what we get here. We're going to use
- * string functions on the stuff we got, so we'll make
- * this extra effort to try and keep from
- * flowing out of this buffer when we use the string
- * functions, even if somehow the stuff we end up
- * with here is garbage.
- */
- client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
- '\0';
-
- if (ret != 0) {
- pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
- __func__);
- return -EIO;
- }
-
- pr_info("%s: client debug array string has been received.\n",
- __func__);
-
- if (!help_string_initialized) {
-
- /* Free the "we don't know yet" default string... */
- kfree(debug_help_string);
-
- /* build a proper debug help string */
- if (orangefs_prepare_debugfs_help_string(0)) {
- gossip_err("%s: no debug help string \n",
- __func__);
- return -EIO;
- }
-
- /* Replace the boilerplate boot-time debug-help file. */
- debugfs_remove(help_file_dentry);
-
- help_file_dentry =
- debugfs_create_file(
- ORANGEFS_KMOD_DEBUG_HELP_FILE,
- 0444,
- debug_dir,
- debug_help_string,
- &debug_help_fops);
-
- if (!help_file_dentry) {
- gossip_err("%s: debugfs_create_file failed for"
- " :%s:!\n",
- __func__,
- ORANGEFS_KMOD_DEBUG_HELP_FILE);
- return -EIO;
- }
- }
-
- debug_mask_to_string(&client_debug_mask, 1);
-
- debugfs_remove(client_debug_dentry);
-
- orangefs_client_debug_init();
-
- help_string_initialized++;
-
- return ret;
-
+ return orangefs_debugfs_new_client_string((void __user *)arg);
case ORANGEFS_DEV_DEBUG:
- ret = copy_from_user(&mask_info,
- (void __user *)arg,
- sizeof(mask_info));
-
- if (ret != 0)
- return -EIO;
-
- if (mask_info.mask_type == KERNEL_MASK) {
- if ((mask_info.mask_value == 0)
- && (kernel_mask_set_mod_init)) {
- /*
- * the kernel debug mask was set when the
- * kernel module was loaded; don't override
- * it if the client-core was started without
- * a value for ORANGEFS_KMODMASK.
- */
- return 0;
- }
- debug_mask_to_string(&mask_info.mask_value,
- mask_info.mask_type);
- gossip_debug_mask = mask_info.mask_value;
- pr_info("%s: kernel debug mask has been modified to "
- ":%s: :%llx:\n",
- __func__,
- kernel_debug_string,
- (unsigned long long)gossip_debug_mask);
- } else if (mask_info.mask_type == CLIENT_MASK) {
- debug_mask_to_string(&mask_info.mask_value,
- mask_info.mask_type);
- pr_info("%s: client debug mask has been modified to"
- ":%s: :%llx:\n",
- __func__,
- client_debug_string,
- llu(mask_info.mask_value));
- } else {
- gossip_lerr("Invalid mask type....\n");
- return -EINVAL;
- }
-
- return ret;
-
+ return orangefs_debugfs_new_debug((void __user *)arg);
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index 324f0af40d7b..284373a57a08 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -177,8 +177,8 @@ static int orangefs_readdir(struct file *file, struct dir_context *ctx)
}
gossip_debug(GOSSIP_DIR_DEBUG,
- "orangefs_readdir called on %s (pos=%llu)\n",
- dentry->d_name.name, llu(pos));
+ "orangefs_readdir called on %pd (pos=%llu)\n",
+ dentry, llu(pos));
memset(&readdir_response, 0, sizeof(readdir_response));
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
index 66b99210f1f9..3b8923f8bf21 100644
--- a/fs/orangefs/downcall.h
+++ b/fs/orangefs/downcall.h
@@ -83,7 +83,10 @@ struct orangefs_listxattr_response {
};
struct orangefs_param_response {
- __s64 value;
+ union {
+ __s64 value64;
+ __s32 value32[2];
+ } u;
};
#define PERF_COUNT_BUF_SIZE 4096
@@ -98,6 +101,11 @@ struct orangefs_fs_key_response {
char fs_key[FS_KEY_BUF_SIZE];
};
+/* 2.9.6 */
+struct orangefs_features_response {
+ __u64 features;
+};
+
struct orangefs_downcall_s {
__s32 type;
__s32 status;
@@ -119,6 +127,7 @@ struct orangefs_downcall_s {
struct orangefs_param_response param;
struct orangefs_perf_count_response perf_count;
struct orangefs_fs_key_response fs_key;
+ struct orangefs_features_response features;
} resp;
};
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 526040e09f78..02cc6139ec90 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -14,6 +14,32 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
+static int flush_racache(struct inode *inode)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op;
+ int ret;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "%s: %pU: Handle is %pU | fs_id %d\n", __func__,
+ get_khandle_from_ino(inode), &orangefs_inode->refn.khandle,
+ orangefs_inode->refn.fs_id);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH);
+ if (!new_op)
+ return -ENOMEM;
+ new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn;
+
+ ret = service_operation(new_op, "orangefs_flush_racache",
+ get_interruptible_flag(inode));
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n",
+ __func__, ret);
+
+ op_release(new_op);
+ return ret;
+}
+
/*
* Copy to client-core's address space from the buffers specified
* by the iovec upto total_size bytes.
@@ -358,7 +384,7 @@ out:
file_accessed(file);
} else {
SetMtimeFlag(orangefs_inode);
- inode->i_mtime = CURRENT_TIME;
+ inode->i_mtime = current_time(inode);
mark_inode_dirty_sync(inode);
}
}
@@ -386,7 +412,7 @@ ssize_t orangefs_inode_read(struct inode *inode,
size_t bufmap_size;
ssize_t ret = -EINVAL;
- g_orangefs_stats.reads++;
+ orangefs_stats.reads++;
bufmap_size = orangefs_bufmap_size_query();
if (count > bufmap_size) {
@@ -427,7 +453,7 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
- g_orangefs_stats.reads++;
+ orangefs_stats.reads++;
rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
iocb->ki_pos = pos;
@@ -488,7 +514,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
}
iocb->ki_pos = pos;
- g_orangefs_stats.writes++;
+ orangefs_stats.writes++;
out:
@@ -585,21 +611,30 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
static int orangefs_file_release(struct inode *inode, struct file *file)
{
gossip_debug(GOSSIP_FILE_DEBUG,
- "orangefs_file_release: called on %s\n",
- file->f_path.dentry->d_name.name);
+ "orangefs_file_release: called on %pD\n",
+ file);
orangefs_flush_inode(inode);
/*
- * remove all associated inode pages from the page cache and mmap
+ * remove all associated inode pages from the page cache and
* readahead cache (if any); this forces an expensive refresh of
* data for the next caller of mmap (or 'get_block' accesses)
*/
- if (file->f_path.dentry->d_inode &&
- file->f_path.dentry->d_inode->i_mapping &&
- mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
- truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
+ if (file_inode(file) &&
+ file_inode(file)->i_mapping &&
+ mapping_nrpages(&file_inode(file)->i_data)) {
+ if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "calling flush_racache on %pU\n",
+ get_khandle_from_ino(inode));
+ flush_racache(inode);
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "flush_racache finished\n");
+ }
+ truncate_inode_pages(file_inode(file)->i_mapping,
0);
+ }
return 0;
}
@@ -613,7 +648,7 @@ static int orangefs_fsync(struct file *file,
{
int ret = -EINVAL;
struct orangefs_inode_s *orangefs_inode =
- ORANGEFS_I(file->f_path.dentry->d_inode);
+ ORANGEFS_I(file_inode(file));
struct orangefs_kernel_op_s *new_op = NULL;
/* required call */
@@ -626,7 +661,7 @@ static int orangefs_fsync(struct file *file,
ret = service_operation(new_op,
"orangefs_fsync",
- get_interruptible_flag(file->f_path.dentry->d_inode));
+ get_interruptible_flag(file_inode(file)));
gossip_debug(GOSSIP_FILE_DEBUG,
"orangefs_fsync got return value of %d\n",
@@ -634,7 +669,7 @@ static int orangefs_fsync(struct file *file,
op_release(new_op);
- orangefs_flush_inode(file->f_path.dentry->d_inode);
+ orangefs_flush_inode(file_inode(file));
return ret;
}
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 28a0557a69be..ef3b4eb54cf2 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -129,8 +129,8 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
struct iov_iter *iter)
{
gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs_direct_IO: %s\n",
- iocb->ki_filp->f_path.dentry->d_name.name);
+ "orangefs_direct_IO: %pD\n",
+ iocb->ki_filp);
return -EINVAL;
}
@@ -216,10 +216,10 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
struct inode *inode = dentry->d_inode;
gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs_setattr: called on %s\n",
- dentry->d_name.name);
+ "orangefs_setattr: called on %pd\n",
+ dentry);
- ret = inode_change_ok(inode, iattr);
+ ret = setattr_prepare(dentry, iattr);
if (ret)
goto out;
@@ -259,8 +259,8 @@ int orangefs_getattr(struct vfsmount *mnt,
struct orangefs_inode_s *orangefs_inode = NULL;
gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs_getattr: called on %s\n",
- dentry->d_name.name);
+ "orangefs_getattr: called on %pd\n",
+ dentry);
ret = orangefs_inode_getattr(inode, 0, 0);
if (ret == 0) {
@@ -296,10 +296,7 @@ const struct inode_operations orangefs_file_inode_operations = {
.set_acl = orangefs_set_acl,
.setattr = orangefs_setattr,
.getattr = orangefs_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = orangefs_listxattr,
- .removexattr = generic_removexattr,
.permission = orangefs_permission,
};
@@ -438,7 +435,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_size = PAGE_SIZE;
inode->i_rdev = dev;
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 62c525936ee8..a290ff6ec756 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -24,9 +24,9 @@ static int orangefs_create(struct inode *dir,
struct inode *inode;
int ret;
- gossip_debug(GOSSIP_NAME_DEBUG, "%s: %s\n",
+ gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd\n",
__func__,
- dentry->d_name.name);
+ dentry);
new_op = op_alloc(ORANGEFS_VFS_OP_CREATE);
if (!new_op)
@@ -43,9 +43,9 @@ static int orangefs_create(struct inode *dir,
ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
gossip_debug(GOSSIP_NAME_DEBUG,
- "%s: %s: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n",
+ "%s: %pd: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n",
__func__,
- dentry->d_name.name,
+ dentry,
&new_op->downcall.resp.create.refn.khandle,
new_op->downcall.resp.create.refn.fs_id,
new_op,
@@ -57,39 +57,39 @@ static int orangefs_create(struct inode *dir,
inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0,
&new_op->downcall.resp.create.refn);
if (IS_ERR(inode)) {
- gossip_err("%s: Failed to allocate inode for file :%s:\n",
+ gossip_err("%s: Failed to allocate inode for file :%pd:\n",
__func__,
- dentry->d_name.name);
+ dentry);
ret = PTR_ERR(inode);
goto out;
}
gossip_debug(GOSSIP_NAME_DEBUG,
- "%s: Assigned inode :%pU: for file :%s:\n",
+ "%s: Assigned inode :%pU: for file :%pd:\n",
__func__,
get_khandle_from_ino(inode),
- dentry->d_name.name);
+ dentry);
d_instantiate(dentry, inode);
unlock_new_inode(inode);
- dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
- "%s: dentry instantiated for %s\n",
+ "%s: dentry instantiated for %pd\n",
__func__,
- dentry->d_name.name);
+ dentry);
SetMtimeFlag(parent);
- dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty_sync(dir);
ret = 0;
out:
op_release(new_op);
gossip_debug(GOSSIP_NAME_DEBUG,
- "%s: %s: returning %d\n",
+ "%s: %pd: returning %d\n",
__func__,
- dentry->d_name.name,
+ dentry,
ret);
return ret;
}
@@ -115,8 +115,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
* -EEXIST on O_EXCL opens, which is broken if we skip this lookup
* in the create path)
*/
- gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n",
- __func__, dentry->d_name.name);
+ gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %pd\n",
+ __func__, dentry);
if (dentry->d_name.len > (ORANGEFS_NAME_MAX - 1))
return ERR_PTR(-ENAMETOOLONG);
@@ -169,9 +169,9 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
gossip_debug(GOSSIP_NAME_DEBUG,
"orangefs_lookup: Adding *negative* dentry "
- "%p for %s\n",
+ "%p for %pd\n",
dentry,
- dentry->d_name.name);
+ dentry);
d_add(dentry, NULL);
res = NULL;
@@ -183,7 +183,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ orangefs_set_timeout(dentry);
inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
if (IS_ERR(inode)) {
@@ -224,10 +224,10 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
int ret;
gossip_debug(GOSSIP_NAME_DEBUG,
- "%s: called on %s\n"
+ "%s: called on %pd\n"
" (inode %pU): Parent is %pU | fs_id %d\n",
__func__,
- dentry->d_name.name,
+ dentry,
get_khandle_from_ino(inode),
&parent->refn.khandle,
parent->refn.fs_id);
@@ -254,7 +254,7 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
drop_nlink(inode);
SetMtimeFlag(parent);
- dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty_sync(dir);
}
return ret;
@@ -322,16 +322,16 @@ static int orangefs_symlink(struct inode *dir,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
- dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
- "Inode (Symlink) %pU -> %s\n",
+ "Inode (Symlink) %pU -> %pd\n",
get_khandle_from_ino(inode),
- dentry->d_name.name);
+ dentry);
SetMtimeFlag(parent);
- dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty_sync(dir);
ret = 0;
out:
@@ -386,20 +386,20 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
d_instantiate(dentry, inode);
unlock_new_inode(inode);
- dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
- "Inode (Directory) %pU -> %s\n",
+ "Inode (Directory) %pU -> %pd\n",
get_khandle_from_ino(inode),
- dentry->d_name.name);
+ dentry);
/*
* NOTE: we have no good way to keep nlink consistent for directories
* across clients; keep constant at 1.
*/
SetMtimeFlag(parent);
- dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty_sync(dir);
out:
op_release(new_op);
@@ -409,11 +409,15 @@ out:
static int orangefs_rename(struct inode *old_dir,
struct dentry *old_dentry,
struct inode *new_dir,
- struct dentry *new_dentry)
+ struct dentry *new_dentry,
+ unsigned int flags)
{
struct orangefs_kernel_op_s *new_op;
int ret;
+ if (flags)
+ return -EINVAL;
+
gossip_debug(GOSSIP_NAME_DEBUG,
"orangefs_rename: called (%pd2 => %pd2) ct=%d\n",
old_dentry, new_dentry, d_count(new_dentry));
@@ -443,7 +447,7 @@ static int orangefs_rename(struct inode *old_dir,
ret);
if (new_dentry->d_inode)
- new_dentry->d_inode->i_ctime = CURRENT_TIME;
+ new_dentry->d_inode->i_ctime = current_time(new_dentry->d_inode);
op_release(new_op);
return ret;
@@ -462,9 +466,6 @@ const struct inode_operations orangefs_dir_inode_operations = {
.rename = orangefs_rename,
.setattr = orangefs_setattr,
.getattr = orangefs_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
.listxattr = orangefs_listxattr,
.permission = orangefs_permission,
};
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index b6edbe9fb309..aa3830b741c7 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -73,8 +73,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op)
return "OP_STATFS";
else if (type == ORANGEFS_VFS_OP_TRUNCATE)
return "OP_TRUNCATE";
- else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH)
- return "OP_MMAP_RA_FLUSH";
+ else if (type == ORANGEFS_VFS_OP_RA_FLUSH)
+ return "OP_RA_FLUSH";
else if (type == ORANGEFS_VFS_OP_FS_MOUNT)
return "OP_FS_MOUNT";
else if (type == ORANGEFS_VFS_OP_FS_UMOUNT)
@@ -97,6 +97,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op)
return "OP_FSYNC";
else if (type == ORANGEFS_VFS_OP_FSKEY)
return "OP_FSKEY";
+ else if (type == ORANGEFS_VFS_OP_FEATURES)
+ return "OP_FEATURES";
}
return "OP_UNKNOWN?";
}
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 1714a737d556..eb09aa026723 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -43,36 +43,35 @@
#include "protocol.h"
#include "orangefs-kernel.h"
-static int orangefs_debug_disabled = 1;
-
-static int orangefs_debug_help_open(struct inode *, struct file *);
+#define DEBUG_HELP_STRING_SIZE 4096
+#define HELP_STRING_UNINITIALIZED \
+ "Client Debug Keywords are unknown until the first time\n" \
+ "the client is started after boot.\n"
+#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help"
+#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug"
+#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug"
+#define ORANGEFS_VERBOSE "verbose"
+#define ORANGEFS_ALL "all"
-const struct file_operations debug_help_fops = {
- .open = orangefs_debug_help_open,
- .read = seq_read,
- .release = seq_release,
- .llseek = seq_lseek,
+/*
+ * An array of client_debug_mask will be built to hold debug keyword/mask
+ * values fetched from userspace.
+ */
+struct client_debug_mask {
+ char *keyword;
+ __u64 mask1;
+ __u64 mask2;
};
+static int orangefs_kernel_debug_init(void);
+
+static int orangefs_debug_help_open(struct inode *, struct file *);
static void *help_start(struct seq_file *, loff_t *);
static void *help_next(struct seq_file *, void *, loff_t *);
static void help_stop(struct seq_file *, void *);
static int help_show(struct seq_file *, void *);
-static const struct seq_operations help_debug_ops = {
- .start = help_start,
- .next = help_next,
- .stop = help_stop,
- .show = help_show,
-};
-
-/*
- * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
- * ORANGEFS_KMOD_DEBUG_FILE.
- */
-static DEFINE_MUTEX(orangefs_debug_lock);
-
-int orangefs_debug_open(struct inode *, struct file *);
+static int orangefs_debug_open(struct inode *, struct file *);
static ssize_t orangefs_debug_read(struct file *,
char __user *,
@@ -84,6 +83,43 @@ static ssize_t orangefs_debug_write(struct file *,
size_t,
loff_t *);
+static int orangefs_prepare_cdm_array(char *);
+static void debug_mask_to_string(void *, int);
+static void do_k_string(void *, int);
+static void do_c_string(void *, int);
+static int keyword_is_amalgam(char *);
+static int check_amalgam_keyword(void *, int);
+static void debug_string_to_mask(char *, void *, int);
+static void do_c_mask(int, char *, struct client_debug_mask **);
+static void do_k_mask(int, char *, __u64 **);
+
+static char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none";
+static char *debug_help_string;
+static char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+static char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+
+static struct dentry *help_file_dentry;
+static struct dentry *client_debug_dentry;
+static struct dentry *debug_dir;
+
+static unsigned int kernel_mask_set_mod_init;
+static int orangefs_debug_disabled = 1;
+static int help_string_initialized;
+
+static const struct seq_operations help_debug_ops = {
+ .start = help_start,
+ .next = help_next,
+ .stop = help_stop,
+ .show = help_show,
+};
+
+const struct file_operations debug_help_fops = {
+ .open = orangefs_debug_help_open,
+ .read = seq_read,
+ .release = seq_release,
+ .llseek = seq_lseek,
+};
+
static const struct file_operations kernel_debug_fops = {
.open = orangefs_debug_open,
.read = orangefs_debug_read,
@@ -91,15 +127,55 @@ static const struct file_operations kernel_debug_fops = {
.llseek = generic_file_llseek,
};
+static int client_all_index;
+static int client_verbose_index;
+
+static struct client_debug_mask *cdm_array;
+static int cdm_element_count;
+
+static struct client_debug_mask client_debug_mask;
+
+/*
+ * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
+ * ORANGEFS_KMOD_DEBUG_FILE.
+ */
+static DEFINE_MUTEX(orangefs_debug_lock);
+
/*
* initialize kmod debug operations, create orangefs debugfs dir and
* ORANGEFS_KMOD_DEBUG_HELP_FILE.
*/
-int orangefs_debugfs_init(void)
+int orangefs_debugfs_init(int debug_mask)
{
-
int rc = -ENOMEM;
+ /* convert input debug mask to a 64-bit unsigned integer */
+ orangefs_gossip_debug_mask = (unsigned long long)debug_mask;
+
+ /*
+ * set the kernel's gossip debug string; invalid mask values will
+ * be ignored.
+ */
+ debug_mask_to_string(&orangefs_gossip_debug_mask, 0);
+
+ /* remove any invalid values from the mask */
+ debug_string_to_mask(kernel_debug_string, &orangefs_gossip_debug_mask,
+ 0);
+
+ /*
+ * if the mask has a non-zero value, then indicate that the mask
+ * was set when the kernel module was loaded. The orangefs dev ioctl
+ * command will look at this boolean to determine if the kernel's
+ * debug mask should be overwritten when the client-core is started.
+ */
+ if (orangefs_gossip_debug_mask != 0)
+ kernel_mask_set_mod_init = true;
+
+ pr_info("%s: called with debug mask: :%s: :%llx:\n",
+ __func__,
+ kernel_debug_string,
+ (unsigned long long)orangefs_gossip_debug_mask);
+
debug_dir = debugfs_create_dir("orangefs", NULL);
if (!debug_dir) {
pr_info("%s: debugfs_create_dir failed.\n", __func__);
@@ -117,13 +193,58 @@ int orangefs_debugfs_init(void)
}
orangefs_debug_disabled = 0;
+
+ rc = orangefs_kernel_debug_init();
+
+out:
+
+ return rc;
+}
+
+/*
+ * initialize the kernel-debug file.
+ */
+static int orangefs_kernel_debug_init(void)
+{
+ int rc = -ENOMEM;
+ struct dentry *ret;
+ char *k_buffer = NULL;
+
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
+
+ k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+ if (!k_buffer)
+ goto out;
+
+ if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+ strcpy(k_buffer, kernel_debug_string);
+ strcat(k_buffer, "\n");
+ } else {
+ strcpy(k_buffer, "none\n");
+ pr_info("%s: overflow 1!\n", __func__);
+ }
+
+ ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
+ 0444,
+ debug_dir,
+ k_buffer,
+ &kernel_debug_fops);
+ if (!ret) {
+ pr_info("%s: failed to create %s.\n",
+ __func__,
+ ORANGEFS_KMOD_DEBUG_FILE);
+ goto out;
+ }
+
rc = 0;
out:
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
return rc;
}
+
void orangefs_debugfs_cleanup(void)
{
debugfs_remove_recursive(debug_dir);
@@ -196,49 +317,6 @@ static int help_show(struct seq_file *m, void *v)
}
/*
- * initialize the kernel-debug file.
- */
-int orangefs_kernel_debug_init(void)
-{
- int rc = -ENOMEM;
- struct dentry *ret;
- char *k_buffer = NULL;
-
- gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
-
- k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!k_buffer)
- goto out;
-
- if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
- strcpy(k_buffer, kernel_debug_string);
- strcat(k_buffer, "\n");
- } else {
- strcpy(k_buffer, "none\n");
- pr_info("%s: overflow 1!\n", __func__);
- }
-
- ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
- 0444,
- debug_dir,
- k_buffer,
- &kernel_debug_fops);
- if (!ret) {
- pr_info("%s: failed to create %s.\n",
- __func__,
- ORANGEFS_KMOD_DEBUG_FILE);
- goto out;
- }
-
- rc = 0;
-
-out:
-
- gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
- return rc;
-}
-
-/*
* initialize the client-debug file.
*/
int orangefs_client_debug_init(void)
@@ -282,7 +360,7 @@ out:
}
/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
-int orangefs_debug_open(struct inode *inode, struct file *file)
+static int orangefs_debug_open(struct inode *inode, struct file *file)
{
int rc = -ENODEV;
@@ -350,8 +428,8 @@ static ssize_t orangefs_debug_write(struct file *file,
struct client_debug_mask c_mask = { NULL, 0, 0 };
gossip_debug(GOSSIP_DEBUGFS_DEBUG,
- "orangefs_debug_write: %s\n",
- file->f_path.dentry->d_name.name);
+ "orangefs_debug_write: %pD\n",
+ file);
/*
* Thwart users who try to jamb a ridiculous number
@@ -384,8 +462,8 @@ static ssize_t orangefs_debug_write(struct file *file,
*/
if (!strcmp(file->f_path.dentry->d_name.name,
ORANGEFS_KMOD_DEBUG_FILE)) {
- debug_string_to_mask(buf, &gossip_debug_mask, 0);
- debug_mask_to_string(&gossip_debug_mask, 0);
+ debug_string_to_mask(buf, &orangefs_gossip_debug_mask, 0);
+ debug_mask_to_string(&orangefs_gossip_debug_mask, 0);
debug_string = kernel_debug_string;
gossip_debug(GOSSIP_DEBUGFS_DEBUG,
"New kernel debug string is %s\n",
@@ -452,3 +530,546 @@ out:
kfree(buf);
return rc;
}
+
+/*
+ * After obtaining a string representation of the client's debug
+ * keywords and their associated masks, this function is called to build an
+ * array of these values.
+ */
+static int orangefs_prepare_cdm_array(char *debug_array_string)
+{
+ int i;
+ int rc = -EINVAL;
+ char *cds_head = NULL;
+ char *cds_delimiter = NULL;
+ int keyword_len = 0;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+ /*
+ * figure out how many elements the cdm_array needs.
+ */
+ for (i = 0; i < strlen(debug_array_string); i++)
+ if (debug_array_string[i] == '\n')
+ cdm_element_count++;
+
+ if (!cdm_element_count) {
+ pr_info("No elements in client debug array string!\n");
+ goto out;
+ }
+
+ cdm_array =
+ kzalloc(cdm_element_count * sizeof(struct client_debug_mask),
+ GFP_KERNEL);
+ if (!cdm_array) {
+ pr_info("malloc failed for cdm_array!\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ cds_head = debug_array_string;
+
+ for (i = 0; i < cdm_element_count; i++) {
+ cds_delimiter = strchr(cds_head, '\n');
+ *cds_delimiter = '\0';
+
+ keyword_len = strcspn(cds_head, " ");
+
+ cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL);
+ if (!cdm_array[i].keyword) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ sscanf(cds_head,
+ "%s %llx %llx",
+ cdm_array[i].keyword,
+ (unsigned long long *)&(cdm_array[i].mask1),
+ (unsigned long long *)&(cdm_array[i].mask2));
+
+ if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE))
+ client_verbose_index = i;
+
+ if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL))
+ client_all_index = i;
+
+ cds_head = cds_delimiter + 1;
+ }
+
+ rc = cdm_element_count;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+
+out:
+
+ return rc;
+
+}
+
+/*
+ * /sys/kernel/debug/orangefs/debug-help can be catted to
+ * see all the available kernel and client debug keywords.
+ *
+ * When the kernel boots, we have no idea what keywords the
+ * client supports, nor their associated masks.
+ *
+ * We pass through this function once at boot and stamp a
+ * boilerplate "we don't know" message for the client in the
+ * debug-help file. We pass through here again when the client
+ * starts and then we can fill out the debug-help file fully.
+ *
+ * The client might be restarted any number of times between
+ * reboots, we only build the debug-help file the first time.
+ */
+int orangefs_prepare_debugfs_help_string(int at_boot)
+{
+ int rc = -EINVAL;
+ int i;
+ int byte_count = 0;
+ char *client_title = "Client Debug Keywords:\n";
+ char *kernel_title = "Kernel Debug Keywords:\n";
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+ if (at_boot) {
+ byte_count += strlen(HELP_STRING_UNINITIALIZED);
+ client_title = HELP_STRING_UNINITIALIZED;
+ } else {
+ /*
+ * fill the client keyword/mask array and remember
+ * how many elements there were.
+ */
+ cdm_element_count =
+ orangefs_prepare_cdm_array(client_debug_array_string);
+ if (cdm_element_count <= 0)
+ goto out;
+
+ /* Count the bytes destined for debug_help_string. */
+ byte_count += strlen(client_title);
+
+ for (i = 0; i < cdm_element_count; i++) {
+ byte_count += strlen(cdm_array[i].keyword + 2);
+ if (byte_count >= DEBUG_HELP_STRING_SIZE) {
+ pr_info("%s: overflow 1!\n", __func__);
+ goto out;
+ }
+ }
+
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "%s: cdm_element_count:%d:\n",
+ __func__,
+ cdm_element_count);
+ }
+
+ byte_count += strlen(kernel_title);
+ for (i = 0; i < num_kmod_keyword_mask_map; i++) {
+ byte_count +=
+ strlen(s_kmod_keyword_mask_map[i].keyword + 2);
+ if (byte_count >= DEBUG_HELP_STRING_SIZE) {
+ pr_info("%s: overflow 2!\n", __func__);
+ goto out;
+ }
+ }
+
+ /* build debug_help_string. */
+ debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
+ if (!debug_help_string) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ strcat(debug_help_string, client_title);
+
+ if (!at_boot) {
+ for (i = 0; i < cdm_element_count; i++) {
+ strcat(debug_help_string, "\t");
+ strcat(debug_help_string, cdm_array[i].keyword);
+ strcat(debug_help_string, "\n");
+ }
+ }
+
+ strcat(debug_help_string, "\n");
+ strcat(debug_help_string, kernel_title);
+
+ for (i = 0; i < num_kmod_keyword_mask_map; i++) {
+ strcat(debug_help_string, "\t");
+ strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
+ strcat(debug_help_string, "\n");
+ }
+
+ rc = 0;
+
+out:
+
+ return rc;
+
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ */
+static void debug_mask_to_string(void *mask, int type)
+{
+ int i;
+ int len = 0;
+ char *debug_string;
+ int element_count = 0;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+ if (type) {
+ debug_string = client_debug_string;
+ element_count = cdm_element_count;
+ } else {
+ debug_string = kernel_debug_string;
+ element_count = num_kmod_keyword_mask_map;
+ }
+
+ memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
+
+ /*
+ * Some keywords, like "all" or "verbose", are amalgams of
+ * numerous other keywords. Make a special check for those
+ * before grinding through the whole mask only to find out
+ * later...
+ */
+ if (check_amalgam_keyword(mask, type))
+ goto out;
+
+ /* Build the debug string. */
+ for (i = 0; i < element_count; i++)
+ if (type)
+ do_c_string(mask, i);
+ else
+ do_k_string(mask, i);
+
+ len = strlen(debug_string);
+
+ if ((len) && (type))
+ client_debug_string[len - 1] = '\0';
+ else if (len)
+ kernel_debug_string[len - 1] = '\0';
+ else if (type)
+ strcpy(client_debug_string, "none");
+ else
+ strcpy(kernel_debug_string, "none");
+
+out:
+gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string);
+
+ return;
+
+}
+
+static void do_k_string(void *k_mask, int index)
+{
+ __u64 *mask = (__u64 *) k_mask;
+
+ if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword))
+ goto out;
+
+ if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
+ if ((strlen(kernel_debug_string) +
+ strlen(s_kmod_keyword_mask_map[index].keyword))
+ < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
+ strcat(kernel_debug_string,
+ s_kmod_keyword_mask_map[index].keyword);
+ strcat(kernel_debug_string, ",");
+ } else {
+ gossip_err("%s: overflow!\n", __func__);
+ strcpy(kernel_debug_string, ORANGEFS_ALL);
+ goto out;
+ }
+ }
+
+out:
+
+ return;
+}
+
+static void do_c_string(void *c_mask, int index)
+{
+ struct client_debug_mask *mask = (struct client_debug_mask *) c_mask;
+
+ if (keyword_is_amalgam(cdm_array[index].keyword))
+ goto out;
+
+ if ((mask->mask1 & cdm_array[index].mask1) ||
+ (mask->mask2 & cdm_array[index].mask2)) {
+ if ((strlen(client_debug_string) +
+ strlen(cdm_array[index].keyword) + 1)
+ < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
+ strcat(client_debug_string,
+ cdm_array[index].keyword);
+ strcat(client_debug_string, ",");
+ } else {
+ gossip_err("%s: overflow!\n", __func__);
+ strcpy(client_debug_string, ORANGEFS_ALL);
+ goto out;
+ }
+ }
+out:
+ return;
+}
+
+static int keyword_is_amalgam(char *keyword)
+{
+ int rc = 0;
+
+ if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE)))
+ rc = 1;
+
+ return rc;
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ *
+ * return 1 if we found an amalgam.
+ */
+static int check_amalgam_keyword(void *mask, int type)
+{
+ __u64 *k_mask;
+ struct client_debug_mask *c_mask;
+ int k_all_index = num_kmod_keyword_mask_map - 1;
+ int rc = 0;
+
+ if (type) {
+ c_mask = (struct client_debug_mask *) mask;
+
+ if ((c_mask->mask1 == cdm_array[client_all_index].mask1) &&
+ (c_mask->mask2 == cdm_array[client_all_index].mask2)) {
+ strcpy(client_debug_string, ORANGEFS_ALL);
+ rc = 1;
+ goto out;
+ }
+
+ if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) &&
+ (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) {
+ strcpy(client_debug_string, ORANGEFS_VERBOSE);
+ rc = 1;
+ goto out;
+ }
+
+ } else {
+ k_mask = (__u64 *) mask;
+
+ if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) {
+ strcpy(kernel_debug_string, ORANGEFS_ALL);
+ rc = 1;
+ goto out;
+ }
+ }
+
+out:
+
+ return rc;
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ */
+static void debug_string_to_mask(char *debug_string, void *mask, int type)
+{
+ char *unchecked_keyword;
+ int i;
+ char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL);
+ char *original_pointer;
+ int element_count = 0;
+ struct client_debug_mask *c_mask = NULL;
+ __u64 *k_mask = NULL;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+ if (type) {
+ c_mask = (struct client_debug_mask *)mask;
+ element_count = cdm_element_count;
+ } else {
+ k_mask = (__u64 *)mask;
+ *k_mask = 0;
+ element_count = num_kmod_keyword_mask_map;
+ }
+
+ original_pointer = strsep_fodder;
+ while ((unchecked_keyword = strsep(&strsep_fodder, ",")))
+ if (strlen(unchecked_keyword)) {
+ for (i = 0; i < element_count; i++)
+ if (type)
+ do_c_mask(i,
+ unchecked_keyword,
+ &c_mask);
+ else
+ do_k_mask(i,
+ unchecked_keyword,
+ &k_mask);
+ }
+
+ kfree(original_pointer);
+}
+
+static void do_c_mask(int i, char *unchecked_keyword,
+ struct client_debug_mask **sane_mask)
+{
+
+ if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) {
+ (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1;
+ (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2;
+ }
+}
+
+static void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask)
+{
+
+ if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword))
+ **sane_mask = (**sane_mask) |
+ s_kmod_keyword_mask_map[i].mask_val;
+}
+
+int orangefs_debugfs_new_client_mask(void __user *arg)
+{
+ struct dev_mask2_info_s mask2_info = {0};
+ int ret;
+
+ ret = copy_from_user(&mask2_info,
+ (void __user *)arg,
+ sizeof(struct dev_mask2_info_s));
+
+ if (ret != 0)
+ return -EIO;
+
+ client_debug_mask.mask1 = mask2_info.mask1_value;
+ client_debug_mask.mask2 = mask2_info.mask2_value;
+
+ pr_info("%s: client debug mask has been been received "
+ ":%llx: :%llx:\n",
+ __func__,
+ (unsigned long long)client_debug_mask.mask1,
+ (unsigned long long)client_debug_mask.mask2);
+
+ return ret;
+}
+
+int orangefs_debugfs_new_client_string(void __user *arg)
+{
+ int ret;
+
+ ret = copy_from_user(&client_debug_array_string,
+ (void __user *)arg,
+ ORANGEFS_MAX_DEBUG_STRING_LEN);
+ if (ret != 0)
+ return -EIO;
+
+ /*
+ * The real client-core makes an effort to ensure
+ * that actual strings that aren't too long to fit in
+ * this buffer is what we get here. We're going to use
+ * string functions on the stuff we got, so we'll make
+ * this extra effort to try and keep from
+ * flowing out of this buffer when we use the string
+ * functions, even if somehow the stuff we end up
+ * with here is garbage.
+ */
+ client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
+ '\0';
+
+ if (ret != 0) {
+ pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
+ __func__);
+ return -EIO;
+ }
+
+ pr_info("%s: client debug array string has been received.\n",
+ __func__);
+
+ if (!help_string_initialized) {
+
+ /* Free the "we don't know yet" default string... */
+ kfree(debug_help_string);
+
+ /* build a proper debug help string */
+ if (orangefs_prepare_debugfs_help_string(0)) {
+ gossip_err("%s: no debug help string \n",
+ __func__);
+ return -EIO;
+ }
+
+ /* Replace the boilerplate boot-time debug-help file. */
+ debugfs_remove(help_file_dentry);
+
+ help_file_dentry =
+ debugfs_create_file(
+ ORANGEFS_KMOD_DEBUG_HELP_FILE,
+ 0444,
+ debug_dir,
+ debug_help_string,
+ &debug_help_fops);
+
+ if (!help_file_dentry) {
+ gossip_err("%s: debugfs_create_file failed for"
+ " :%s:!\n",
+ __func__,
+ ORANGEFS_KMOD_DEBUG_HELP_FILE);
+ return -EIO;
+ }
+ }
+
+ debug_mask_to_string(&client_debug_mask, 1);
+
+ debugfs_remove(client_debug_dentry);
+
+ orangefs_client_debug_init();
+
+ help_string_initialized++;
+
+ return ret;
+}
+
+int orangefs_debugfs_new_debug(void __user *arg)
+{
+ struct dev_mask_info_s mask_info = {0};
+ int ret;
+
+ ret = copy_from_user(&mask_info,
+ (void __user *)arg,
+ sizeof(mask_info));
+
+ if (ret != 0)
+ return -EIO;
+
+ if (mask_info.mask_type == KERNEL_MASK) {
+ if ((mask_info.mask_value == 0)
+ && (kernel_mask_set_mod_init)) {
+ /*
+ * the kernel debug mask was set when the
+ * kernel module was loaded; don't override
+ * it if the client-core was started without
+ * a value for ORANGEFS_KMODMASK.
+ */
+ return 0;
+ }
+ debug_mask_to_string(&mask_info.mask_value,
+ mask_info.mask_type);
+ orangefs_gossip_debug_mask = mask_info.mask_value;
+ pr_info("%s: kernel debug mask has been modified to "
+ ":%s: :%llx:\n",
+ __func__,
+ kernel_debug_string,
+ (unsigned long long)orangefs_gossip_debug_mask);
+ } else if (mask_info.mask_type == CLIENT_MASK) {
+ debug_mask_to_string(&mask_info.mask_value,
+ mask_info.mask_type);
+ pr_info("%s: client debug mask has been modified to"
+ ":%s: :%llx:\n",
+ __func__,
+ client_debug_string,
+ llu(mask_info.mask_value));
+ } else {
+ gossip_lerr("Invalid mask type....\n");
+ return -EINVAL;
+ }
+
+ return ret;
+}
diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h
index e4828c0e3ef9..803517269ba6 100644
--- a/fs/orangefs/orangefs-debugfs.h
+++ b/fs/orangefs/orangefs-debugfs.h
@@ -1,3 +1,7 @@
-int orangefs_debugfs_init(void);
-int orangefs_kernel_debug_init(void);
+int orangefs_debugfs_init(int);
void orangefs_debugfs_cleanup(void);
+int orangefs_client_debug_init(void);
+int orangefs_prepare_debugfs_help_string(int);
+int orangefs_debugfs_new_client_mask(void __user *);
+int orangefs_debugfs_new_client_string(void __user *);
+int orangefs_debugfs_new_debug(void __user *);
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
index 9eac9d9a3f3a..a3d84ffee905 100644
--- a/fs/orangefs/orangefs-dev-proto.h
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -28,7 +28,7 @@
#define ORANGEFS_VFS_OP_RENAME 0xFF00000A
#define ORANGEFS_VFS_OP_STATFS 0xFF00000B
#define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C
-#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH 0xFF00000D
+#define ORANGEFS_VFS_OP_RA_FLUSH 0xFF00000D
#define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E
#define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F
#define ORANGEFS_VFS_OP_GETXATTR 0xFF000010
@@ -41,6 +41,10 @@
#define ORANGEFS_VFS_OP_FSYNC 0xFF00EE01
#define ORANGEFS_VFS_OP_FSKEY 0xFF00EE02
#define ORANGEFS_VFS_OP_READDIRPLUS 0xFF00EE03
+#define ORANGEFS_VFS_OP_FEATURES 0xFF00EE05 /* 2.9.6 */
+
+/* features is a 64-bit unsigned bitmask */
+#define ORANGEFS_FEATURE_READAHEAD 1
/*
* Misc constants. Please retain them as multiples of 8!
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 633c07a6e3d8..3bf803d732c5 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -100,16 +100,6 @@ enum orangefs_vfs_op_states {
};
/*
- * An array of client_debug_mask will be built to hold debug keyword/mask
- * values fetched from userspace.
- */
-struct client_debug_mask {
- char *keyword;
- __u64 mask1;
- __u64 mask2;
-};
-
-/*
* orangefs kernel memory related flags
*/
@@ -119,29 +109,6 @@ struct client_debug_mask {
#define ORANGEFS_CACHE_CREATE_FLAGS 0
#endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */
-/* these functions are defined in orangefs-utils.c */
-int orangefs_prepare_cdm_array(char *debug_array_string);
-int orangefs_prepare_debugfs_help_string(int);
-
-/* defined in orangefs-debugfs.c */
-int orangefs_client_debug_init(void);
-
-void debug_string_to_mask(char *, void *, int);
-void do_c_mask(int, char *, struct client_debug_mask **);
-void do_k_mask(int, char *, __u64 **);
-
-void debug_mask_to_string(void *, int);
-void do_k_string(void *, int);
-void do_c_string(void *, int);
-int check_amalgam_keyword(void *, int);
-int keyword_is_amalgam(char *);
-
-/*these variables are defined in orangefs-mod.c */
-extern char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-extern char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-extern char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-extern unsigned int kernel_mask_set_mod_init;
-
extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
extern const struct xattr_handler *orangefs_xattr_handlers[];
@@ -331,7 +298,7 @@ struct orangefs_stats {
unsigned long writes;
};
-extern struct orangefs_stats g_orangefs_stats;
+extern struct orangefs_stats orangefs_stats;
/*
* NOTE: See Documentation/filesystems/porting for information
@@ -447,6 +414,8 @@ void purge_waiting_ops(void);
/*
* defined in super.c
*/
+extern uint64_t orangefs_features;
+
struct dentry *orangefs_mount(struct file_system_type *fst,
int flags,
const char *devname,
@@ -506,6 +475,8 @@ ssize_t orangefs_inode_read(struct inode *inode,
/*
* defined in devorangefs-req.c
*/
+extern uint32_t orangefs_userspace_version;
+
int orangefs_dev_init(void);
void orangefs_dev_cleanup(void);
int is_daemon_in_service(void);
@@ -543,20 +514,18 @@ bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);
int orangefs_normalize_to_errno(__s32 error_code);
-extern struct mutex devreq_mutex;
-extern struct mutex request_mutex;
-extern int debug;
+extern struct mutex orangefs_request_mutex;
extern int op_timeout_secs;
extern int slot_timeout_secs;
-extern int dcache_timeout_msecs;
-extern int getattr_timeout_msecs;
+extern int orangefs_dcache_timeout_msecs;
+extern int orangefs_getattr_timeout_msecs;
extern struct list_head orangefs_superblocks;
extern spinlock_t orangefs_superblocks_lock;
extern struct list_head orangefs_request_list;
extern spinlock_t orangefs_request_list_lock;
extern wait_queue_head_t orangefs_request_list_waitq;
-extern struct list_head *htable_ops_in_progress;
-extern spinlock_t htable_ops_in_progress_lock;
+extern struct list_head *orangefs_htable_ops_in_progress;
+extern spinlock_t orangefs_htable_ops_in_progress_lock;
extern int hash_table_size;
extern const struct address_space_operations orangefs_address_operations;
@@ -611,4 +580,11 @@ static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size)
#endif
}
+static inline void orangefs_set_timeout(struct dentry *dentry)
+{
+ unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
+
+ dentry->d_fsdata = (void *) time;
+}
+
#endif /* __ORANGEFSKERNEL_H */
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index e9fd5755c05f..2e5b03065f34 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -21,34 +21,17 @@
* global variables declared here
*/
-/* array of client debug keyword/mask values */
-struct client_debug_mask *cdm_array;
-int cdm_element_count;
-
-char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none";
-char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-
-char *debug_help_string;
-int help_string_initialized;
-struct dentry *help_file_dentry;
-struct dentry *client_debug_dentry;
-struct dentry *debug_dir;
-int client_verbose_index;
-int client_all_index;
-struct orangefs_stats g_orangefs_stats;
+struct orangefs_stats orangefs_stats;
/* the size of the hash tables for ops in progress */
int hash_table_size = 509;
static ulong module_parm_debug_mask;
-__u64 gossip_debug_mask;
-struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
-unsigned int kernel_mask_set_mod_init; /* implicitly false */
+__u64 orangefs_gossip_debug_mask;
int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
-int dcache_timeout_msecs = 50;
-int getattr_timeout_msecs = 50;
+int orangefs_dcache_timeout_msecs = 50;
+int orangefs_getattr_timeout_msecs = 50;
MODULE_LICENSE("GPL");
MODULE_AUTHOR("ORANGEFS Development Team");
@@ -71,20 +54,17 @@ module_param(module_parm_debug_mask, ulong, 0644);
module_param(op_timeout_secs, int, 0);
module_param(slot_timeout_secs, int, 0);
-/* synchronizes the request device file */
-DEFINE_MUTEX(devreq_mutex);
-
/*
* Blocks non-priority requests from being queued for servicing. This
* could be used for protecting the request list data structure, but
* for now it's only being used to stall the op addition to the request
* list
*/
-DEFINE_MUTEX(request_mutex);
+DEFINE_MUTEX(orangefs_request_mutex);
/* hash table for storing operations waiting for matching downcall */
-struct list_head *htable_ops_in_progress;
-DEFINE_SPINLOCK(htable_ops_in_progress_lock);
+struct list_head *orangefs_htable_ops_in_progress;
+DEFINE_SPINLOCK(orangefs_htable_ops_in_progress_lock);
/* list for queueing upcall operations */
LIST_HEAD(orangefs_request_list);
@@ -100,32 +80,6 @@ static int __init orangefs_init(void)
int ret = -1;
__u32 i = 0;
- /* convert input debug mask to a 64-bit unsigned integer */
- gossip_debug_mask = (unsigned long long) module_parm_debug_mask;
-
- /*
- * set the kernel's gossip debug string; invalid mask values will
- * be ignored.
- */
- debug_mask_to_string(&gossip_debug_mask, 0);
-
- /* remove any invalid values from the mask */
- debug_string_to_mask(kernel_debug_string, &gossip_debug_mask, 0);
-
- /*
- * if the mask has a non-zero value, then indicate that the mask
- * was set when the kernel module was loaded. The orangefs dev ioctl
- * command will look at this boolean to determine if the kernel's
- * debug mask should be overwritten when the client-core is started.
- */
- if (gossip_debug_mask != 0)
- kernel_mask_set_mod_init = true;
-
- pr_info("%s: called with debug mask: :%s: :%llx:\n",
- __func__,
- kernel_debug_string,
- (unsigned long long)gossip_debug_mask);
-
ret = bdi_init(&orangefs_backing_dev_info);
if (ret)
@@ -146,9 +100,9 @@ static int __init orangefs_init(void)
if (ret < 0)
goto cleanup_op;
- htable_ops_in_progress =
+ orangefs_htable_ops_in_progress =
kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL);
- if (!htable_ops_in_progress) {
+ if (!orangefs_htable_ops_in_progress) {
gossip_err("Failed to initialize op hashtable");
ret = -ENOMEM;
goto cleanup_inode;
@@ -156,7 +110,7 @@ static int __init orangefs_init(void)
/* initialize a doubly linked at each hash table index */
for (i = 0; i < hash_table_size; i++)
- INIT_LIST_HEAD(&htable_ops_in_progress[i]);
+ INIT_LIST_HEAD(&orangefs_htable_ops_in_progress[i]);
ret = fsid_key_table_initialize();
if (ret < 0)
@@ -179,14 +133,10 @@ static int __init orangefs_init(void)
if (ret)
goto cleanup_key_table;
- ret = orangefs_debugfs_init();
+ ret = orangefs_debugfs_init(module_parm_debug_mask);
if (ret)
goto debugfs_init_failed;
- ret = orangefs_kernel_debug_init();
- if (ret)
- goto kernel_debug_init_failed;
-
ret = orangefs_sysfs_init();
if (ret)
goto sysfs_init_failed;
@@ -214,8 +164,6 @@ cleanup_device:
sysfs_init_failed:
-kernel_debug_init_failed:
-
debugfs_init_failed:
orangefs_debugfs_cleanup();
@@ -223,7 +171,7 @@ cleanup_key_table:
fsid_key_table_finalize();
cleanup_progress_table:
- kfree(htable_ops_in_progress);
+ kfree(orangefs_htable_ops_in_progress);
cleanup_inode:
orangefs_inode_cache_finalize();
@@ -250,12 +198,12 @@ static void __exit orangefs_exit(void)
orangefs_dev_cleanup();
BUG_ON(!list_empty(&orangefs_request_list));
for (i = 0; i < hash_table_size; i++)
- BUG_ON(!list_empty(&htable_ops_in_progress[i]));
+ BUG_ON(!list_empty(&orangefs_htable_ops_in_progress[i]));
orangefs_inode_cache_finalize();
op_cache_finalize();
- kfree(htable_ops_in_progress);
+ kfree(orangefs_htable_ops_in_progress);
bdi_destroy(&orangefs_backing_dev_info);
@@ -274,10 +222,10 @@ void purge_inprogress_ops(void)
struct orangefs_kernel_op_s *op;
struct orangefs_kernel_op_s *next;
- spin_lock(&htable_ops_in_progress_lock);
+ spin_lock(&orangefs_htable_ops_in_progress_lock);
list_for_each_entry_safe(op,
next,
- &htable_ops_in_progress[i],
+ &orangefs_htable_ops_in_progress[i],
list) {
set_op_state_purged(op);
gossip_debug(GOSSIP_DEV_DEBUG,
@@ -287,7 +235,7 @@ void purge_inprogress_ops(void)
op->op_state,
current->comm);
}
- spin_unlock(&htable_ops_in_progress_lock);
+ spin_unlock(&orangefs_htable_ops_in_progress_lock);
}
}
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 375708c2db87..a799546a67f7 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -73,6 +73,24 @@
* Description:
* Time getattr is valid in milliseconds.
*
+ * What: /sys/fs/orangefs/readahead_count
+ * Date: Aug 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Readahead cache buffer count.
+ *
+ * What: /sys/fs/orangefs/readahead_size
+ * Date: Aug 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Readahead cache buffer size.
+ *
+ * What: /sys/fs/orangefs/readahead_count_size
+ * Date: Aug 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Readahead cache buffer count and size.
+ *
* What: /sys/fs/orangefs/acache/...
* Date: Jun 2015
* Contact: Martin Brandenburg <martin@omnibond.com>
@@ -121,159 +139,34 @@
#define PC_KOBJ_ID "pc"
#define STATS_KOBJ_ID "stats"
-struct orangefs_obj {
- struct kobject kobj;
- int op_timeout_secs;
- int perf_counter_reset;
- int perf_history_size;
- int perf_time_interval_secs;
- int slot_timeout_secs;
- int dcache_timeout_msecs;
- int getattr_timeout_msecs;
-};
-
-struct acache_orangefs_obj {
- struct kobject kobj;
- int hard_limit;
- int reclaim_percentage;
- int soft_limit;
- int timeout_msecs;
-};
-
-struct capcache_orangefs_obj {
- struct kobject kobj;
- int hard_limit;
- int reclaim_percentage;
- int soft_limit;
- int timeout_secs;
-};
-
-struct ccache_orangefs_obj {
- struct kobject kobj;
- int hard_limit;
- int reclaim_percentage;
- int soft_limit;
- int timeout_secs;
-};
-
-struct ncache_orangefs_obj {
- struct kobject kobj;
- int hard_limit;
- int reclaim_percentage;
- int soft_limit;
- int timeout_msecs;
-};
-
-struct pc_orangefs_obj {
- struct kobject kobj;
- char *acache;
- char *capcache;
- char *ncache;
-};
-
-struct stats_orangefs_obj {
- struct kobject kobj;
- int reads;
- int writes;
-};
+/*
+ * Every item calls orangefs_attr_show and orangefs_attr_store through
+ * orangefs_sysfs_ops. They look at the orangefs_attributes further below to
+ * call one of sysfs_int_show, sysfs_int_store, sysfs_service_op_show, or
+ * sysfs_service_op_store.
+ */
struct orangefs_attribute {
struct attribute attr;
- ssize_t (*show)(struct orangefs_obj *orangefs_obj,
+ ssize_t (*show)(struct kobject *kobj,
struct orangefs_attribute *attr,
char *buf);
- ssize_t (*store)(struct orangefs_obj *orangefs_obj,
+ ssize_t (*store)(struct kobject *kobj,
struct orangefs_attribute *attr,
const char *buf,
size_t count);
};
-struct acache_orangefs_attribute {
- struct attribute attr;
- ssize_t (*show)(struct acache_orangefs_obj *acache_orangefs_obj,
- struct acache_orangefs_attribute *attr,
- char *buf);
- ssize_t (*store)(struct acache_orangefs_obj *acache_orangefs_obj,
- struct acache_orangefs_attribute *attr,
- const char *buf,
- size_t count);
-};
-
-struct capcache_orangefs_attribute {
- struct attribute attr;
- ssize_t (*show)(struct capcache_orangefs_obj *capcache_orangefs_obj,
- struct capcache_orangefs_attribute *attr,
- char *buf);
- ssize_t (*store)(struct capcache_orangefs_obj *capcache_orangefs_obj,
- struct capcache_orangefs_attribute *attr,
- const char *buf,
- size_t count);
-};
-
-struct ccache_orangefs_attribute {
- struct attribute attr;
- ssize_t (*show)(struct ccache_orangefs_obj *ccache_orangefs_obj,
- struct ccache_orangefs_attribute *attr,
- char *buf);
- ssize_t (*store)(struct ccache_orangefs_obj *ccache_orangefs_obj,
- struct ccache_orangefs_attribute *attr,
- const char *buf,
- size_t count);
-};
-
-struct ncache_orangefs_attribute {
- struct attribute attr;
- ssize_t (*show)(struct ncache_orangefs_obj *ncache_orangefs_obj,
- struct ncache_orangefs_attribute *attr,
- char *buf);
- ssize_t (*store)(struct ncache_orangefs_obj *ncache_orangefs_obj,
- struct ncache_orangefs_attribute *attr,
- const char *buf,
- size_t count);
-};
-
-struct pc_orangefs_attribute {
- struct attribute attr;
- ssize_t (*show)(struct pc_orangefs_obj *pc_orangefs_obj,
- struct pc_orangefs_attribute *attr,
- char *buf);
- ssize_t (*store)(struct pc_orangefs_obj *pc_orangefs_obj,
- struct pc_orangefs_attribute *attr,
- const char *buf,
- size_t count);
-};
-
-struct stats_orangefs_attribute {
- struct attribute attr;
- ssize_t (*show)(struct stats_orangefs_obj *stats_orangefs_obj,
- struct stats_orangefs_attribute *attr,
- char *buf);
- ssize_t (*store)(struct stats_orangefs_obj *stats_orangefs_obj,
- struct stats_orangefs_attribute *attr,
- const char *buf,
- size_t count);
-};
-
static ssize_t orangefs_attr_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
{
struct orangefs_attribute *attribute;
- struct orangefs_obj *orangefs_obj;
- int rc;
attribute = container_of(attr, struct orangefs_attribute, attr);
- orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
-
- if (!attribute->show) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->show(orangefs_obj, attribute, buf);
-
-out:
- return rc;
+ if (!attribute->show)
+ return -EIO;
+ return attribute->show(kobj, attribute, buf);
}
static ssize_t orangefs_attr_store(struct kobject *kobj,
@@ -282,24 +175,15 @@ static ssize_t orangefs_attr_store(struct kobject *kobj,
size_t len)
{
struct orangefs_attribute *attribute;
- struct orangefs_obj *orangefs_obj;
- int rc;
- gossip_debug(GOSSIP_SYSFS_DEBUG,
- "orangefs_attr_store: start\n");
+ if (!strcmp(kobj->name, PC_KOBJ_ID) ||
+ !strcmp(kobj->name, STATS_KOBJ_ID))
+ return -EPERM;
attribute = container_of(attr, struct orangefs_attribute, attr);
- orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
-
- if (!attribute->store) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->store(orangefs_obj, attribute, buf, len);
-
-out:
- return rc;
+ if (!attribute->store)
+ return -EIO;
+ return attribute->store(kobj, attribute, buf, len);
}
static const struct sysfs_ops orangefs_sysfs_ops = {
@@ -307,402 +191,58 @@ static const struct sysfs_ops orangefs_sysfs_ops = {
.store = orangefs_attr_store,
};
-static ssize_t acache_orangefs_attr_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct acache_orangefs_attribute *attribute;
- struct acache_orangefs_obj *acache_orangefs_obj;
- int rc;
-
- attribute = container_of(attr, struct acache_orangefs_attribute, attr);
- acache_orangefs_obj =
- container_of(kobj, struct acache_orangefs_obj, kobj);
-
- if (!attribute->show) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->show(acache_orangefs_obj, attribute, buf);
-
-out:
- return rc;
-}
-
-static ssize_t acache_orangefs_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf,
- size_t len)
-{
- struct acache_orangefs_attribute *attribute;
- struct acache_orangefs_obj *acache_orangefs_obj;
- int rc;
-
- gossip_debug(GOSSIP_SYSFS_DEBUG,
- "acache_orangefs_attr_store: start\n");
-
- attribute = container_of(attr, struct acache_orangefs_attribute, attr);
- acache_orangefs_obj =
- container_of(kobj, struct acache_orangefs_obj, kobj);
-
- if (!attribute->store) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->store(acache_orangefs_obj, attribute, buf, len);
-
-out:
- return rc;
-}
-
-static const struct sysfs_ops acache_orangefs_sysfs_ops = {
- .show = acache_orangefs_attr_show,
- .store = acache_orangefs_attr_store,
-};
-
-static ssize_t capcache_orangefs_attr_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct capcache_orangefs_attribute *attribute;
- struct capcache_orangefs_obj *capcache_orangefs_obj;
- int rc;
-
- attribute =
- container_of(attr, struct capcache_orangefs_attribute, attr);
- capcache_orangefs_obj =
- container_of(kobj, struct capcache_orangefs_obj, kobj);
-
- if (!attribute->show) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->show(capcache_orangefs_obj, attribute, buf);
-
-out:
- return rc;
-}
-
-static ssize_t capcache_orangefs_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf,
- size_t len)
-{
- struct capcache_orangefs_attribute *attribute;
- struct capcache_orangefs_obj *capcache_orangefs_obj;
- int rc;
-
- gossip_debug(GOSSIP_SYSFS_DEBUG,
- "capcache_orangefs_attr_store: start\n");
-
- attribute =
- container_of(attr, struct capcache_orangefs_attribute, attr);
- capcache_orangefs_obj =
- container_of(kobj, struct capcache_orangefs_obj, kobj);
-
- if (!attribute->store) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->store(capcache_orangefs_obj, attribute, buf, len);
-
-out:
- return rc;
-}
-
-static const struct sysfs_ops capcache_orangefs_sysfs_ops = {
- .show = capcache_orangefs_attr_show,
- .store = capcache_orangefs_attr_store,
-};
-
-static ssize_t ccache_orangefs_attr_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct ccache_orangefs_attribute *attribute;
- struct ccache_orangefs_obj *ccache_orangefs_obj;
- int rc;
-
- attribute =
- container_of(attr, struct ccache_orangefs_attribute, attr);
- ccache_orangefs_obj =
- container_of(kobj, struct ccache_orangefs_obj, kobj);
-
- if (!attribute->show) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->show(ccache_orangefs_obj, attribute, buf);
-
-out:
- return rc;
-}
-
-static ssize_t ccache_orangefs_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf,
- size_t len)
-{
- struct ccache_orangefs_attribute *attribute;
- struct ccache_orangefs_obj *ccache_orangefs_obj;
- int rc;
-
- gossip_debug(GOSSIP_SYSFS_DEBUG,
- "ccache_orangefs_attr_store: start\n");
-
- attribute =
- container_of(attr, struct ccache_orangefs_attribute, attr);
- ccache_orangefs_obj =
- container_of(kobj, struct ccache_orangefs_obj, kobj);
-
- if (!attribute->store) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->store(ccache_orangefs_obj, attribute, buf, len);
-
-out:
- return rc;
-}
-
-static const struct sysfs_ops ccache_orangefs_sysfs_ops = {
- .show = ccache_orangefs_attr_show,
- .store = ccache_orangefs_attr_store,
-};
-
-static ssize_t ncache_orangefs_attr_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct ncache_orangefs_attribute *attribute;
- struct ncache_orangefs_obj *ncache_orangefs_obj;
- int rc;
-
- attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
- ncache_orangefs_obj =
- container_of(kobj, struct ncache_orangefs_obj, kobj);
-
- if (!attribute->show) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->show(ncache_orangefs_obj, attribute, buf);
-
-out:
- return rc;
-}
-
-static ssize_t ncache_orangefs_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf,
- size_t len)
-{
- struct ncache_orangefs_attribute *attribute;
- struct ncache_orangefs_obj *ncache_orangefs_obj;
- int rc;
-
- gossip_debug(GOSSIP_SYSFS_DEBUG,
- "ncache_orangefs_attr_store: start\n");
-
- attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
- ncache_orangefs_obj =
- container_of(kobj, struct ncache_orangefs_obj, kobj);
-
- if (!attribute->store) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->store(ncache_orangefs_obj, attribute, buf, len);
-
-out:
- return rc;
-}
-
-static const struct sysfs_ops ncache_orangefs_sysfs_ops = {
- .show = ncache_orangefs_attr_show,
- .store = ncache_orangefs_attr_store,
-};
-
-static ssize_t pc_orangefs_attr_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct pc_orangefs_attribute *attribute;
- struct pc_orangefs_obj *pc_orangefs_obj;
- int rc;
-
- attribute = container_of(attr, struct pc_orangefs_attribute, attr);
- pc_orangefs_obj =
- container_of(kobj, struct pc_orangefs_obj, kobj);
-
- if (!attribute->show) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->show(pc_orangefs_obj, attribute, buf);
-
-out:
- return rc;
-}
-
-static const struct sysfs_ops pc_orangefs_sysfs_ops = {
- .show = pc_orangefs_attr_show,
-};
-
-static ssize_t stats_orangefs_attr_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct stats_orangefs_attribute *attribute;
- struct stats_orangefs_obj *stats_orangefs_obj;
- int rc;
-
- attribute = container_of(attr, struct stats_orangefs_attribute, attr);
- stats_orangefs_obj =
- container_of(kobj, struct stats_orangefs_obj, kobj);
-
- if (!attribute->show) {
- rc = -EIO;
- goto out;
- }
-
- rc = attribute->show(stats_orangefs_obj, attribute, buf);
-
-out:
- return rc;
-}
-
-static const struct sysfs_ops stats_orangefs_sysfs_ops = {
- .show = stats_orangefs_attr_show,
-};
-
-static void orangefs_release(struct kobject *kobj)
-{
- struct orangefs_obj *orangefs_obj;
-
- orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
- kfree(orangefs_obj);
-}
-
-static void acache_orangefs_release(struct kobject *kobj)
-{
- struct acache_orangefs_obj *acache_orangefs_obj;
-
- acache_orangefs_obj =
- container_of(kobj, struct acache_orangefs_obj, kobj);
- kfree(acache_orangefs_obj);
-}
-
-static void capcache_orangefs_release(struct kobject *kobj)
-{
- struct capcache_orangefs_obj *capcache_orangefs_obj;
-
- capcache_orangefs_obj =
- container_of(kobj, struct capcache_orangefs_obj, kobj);
- kfree(capcache_orangefs_obj);
-}
-
-static void ccache_orangefs_release(struct kobject *kobj)
-{
- struct ccache_orangefs_obj *ccache_orangefs_obj;
-
- ccache_orangefs_obj =
- container_of(kobj, struct ccache_orangefs_obj, kobj);
- kfree(ccache_orangefs_obj);
-}
-
-static void ncache_orangefs_release(struct kobject *kobj)
-{
- struct ncache_orangefs_obj *ncache_orangefs_obj;
-
- ncache_orangefs_obj =
- container_of(kobj, struct ncache_orangefs_obj, kobj);
- kfree(ncache_orangefs_obj);
-}
-
-static void pc_orangefs_release(struct kobject *kobj)
-{
- struct pc_orangefs_obj *pc_orangefs_obj;
-
- pc_orangefs_obj =
- container_of(kobj, struct pc_orangefs_obj, kobj);
- kfree(pc_orangefs_obj);
-}
-
-static void stats_orangefs_release(struct kobject *kobj)
-{
- struct stats_orangefs_obj *stats_orangefs_obj;
-
- stats_orangefs_obj =
- container_of(kobj, struct stats_orangefs_obj, kobj);
- kfree(stats_orangefs_obj);
-}
-
-static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
+static ssize_t sysfs_int_show(struct kobject *kobj,
+ struct orangefs_attribute *attr, char *buf)
{
int rc = -EIO;
- struct orangefs_attribute *orangefs_attr;
- struct stats_orangefs_attribute *stats_orangefs_attr;
- gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj_id);
+ gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n",
+ kobj->name);
- if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
- orangefs_attr = (struct orangefs_attribute *)attr;
-
- if (!strcmp(orangefs_attr->attr.name, "op_timeout_secs")) {
+ if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, "op_timeout_secs")) {
rc = scnprintf(buf,
PAGE_SIZE,
"%d\n",
op_timeout_secs);
goto out;
- } else if (!strcmp(orangefs_attr->attr.name,
+ } else if (!strcmp(attr->attr.name,
"slot_timeout_secs")) {
rc = scnprintf(buf,
PAGE_SIZE,
"%d\n",
slot_timeout_secs);
goto out;
- } else if (!strcmp(orangefs_attr->attr.name,
+ } else if (!strcmp(attr->attr.name,
"dcache_timeout_msecs")) {
rc = scnprintf(buf,
PAGE_SIZE,
"%d\n",
- dcache_timeout_msecs);
+ orangefs_dcache_timeout_msecs);
goto out;
- } else if (!strcmp(orangefs_attr->attr.name,
+ } else if (!strcmp(attr->attr.name,
"getattr_timeout_msecs")) {
rc = scnprintf(buf,
PAGE_SIZE,
"%d\n",
- getattr_timeout_msecs);
+ orangefs_getattr_timeout_msecs);
goto out;
} else {
goto out;
}
- } else if (!strcmp(kobj_id, STATS_KOBJ_ID)) {
- stats_orangefs_attr = (struct stats_orangefs_attribute *)attr;
-
- if (!strcmp(stats_orangefs_attr->attr.name, "reads")) {
+ } else if (!strcmp(kobj->name, STATS_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, "reads")) {
rc = scnprintf(buf,
PAGE_SIZE,
"%lu\n",
- g_orangefs_stats.reads);
+ orangefs_stats.reads);
goto out;
- } else if (!strcmp(stats_orangefs_attr->attr.name, "writes")) {
+ } else if (!strcmp(attr->attr.name, "writes")) {
rc = scnprintf(buf,
PAGE_SIZE,
"%lu\n",
- g_orangefs_stats.writes);
+ orangefs_stats.writes);
goto out;
} else {
goto out;
@@ -714,45 +254,13 @@ out:
return rc;
}
-static ssize_t int_orangefs_show(struct orangefs_obj *orangefs_obj,
- struct orangefs_attribute *attr,
- char *buf)
-{
- int rc;
-
- gossip_debug(GOSSIP_SYSFS_DEBUG,
- "int_orangefs_show:start attr->attr.name:%s:\n",
- attr->attr.name);
-
- rc = sysfs_int_show(ORANGEFS_KOBJ_ID, buf, (void *) attr);
-
- return rc;
-}
-
-static ssize_t int_stats_show(struct stats_orangefs_obj *stats_orangefs_obj,
- struct stats_orangefs_attribute *attr,
- char *buf)
-{
- int rc;
-
- gossip_debug(GOSSIP_SYSFS_DEBUG,
- "int_stats_show:start attr->attr.name:%s:\n",
- attr->attr.name);
-
- rc = sysfs_int_show(STATS_KOBJ_ID, buf, (void *) attr);
-
- return rc;
-}
-
-static ssize_t int_store(struct orangefs_obj *orangefs_obj,
- struct orangefs_attribute *attr,
- const char *buf,
- size_t count)
+static ssize_t sysfs_int_store(struct kobject *kobj,
+ struct orangefs_attribute *attr, const char *buf, size_t count)
{
int rc = 0;
gossip_debug(GOSSIP_SYSFS_DEBUG,
- "int_store: start attr->attr.name:%s: buf:%s:\n",
+ "sysfs_int_store: start attr->attr.name:%s: buf:%s:\n",
attr->attr.name, buf);
if (!strcmp(attr->attr.name, "op_timeout_secs")) {
@@ -762,10 +270,10 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj,
rc = kstrtoint(buf, 0, &slot_timeout_secs);
goto out;
} else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
- rc = kstrtoint(buf, 0, &dcache_timeout_msecs);
+ rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs);
goto out;
} else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) {
- rc = kstrtoint(buf, 0, &getattr_timeout_msecs);
+ rc = kstrtoint(buf, 0, &orangefs_getattr_timeout_msecs);
goto out;
} else {
goto out;
@@ -783,24 +291,19 @@ out:
/*
* obtain attribute values from userspace with a service operation.
*/
-static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
+static ssize_t sysfs_service_op_show(struct kobject *kobj,
+ struct orangefs_attribute *attr, char *buf)
{
struct orangefs_kernel_op_s *new_op = NULL;
int rc = 0;
char *ser_op_type = NULL;
- struct orangefs_attribute *orangefs_attr;
- struct acache_orangefs_attribute *acache_attr;
- struct capcache_orangefs_attribute *capcache_attr;
- struct ccache_orangefs_attribute *ccache_attr;
- struct ncache_orangefs_attribute *ncache_attr;
- struct pc_orangefs_attribute *pc_attr;
__u32 op_alloc_type;
gossip_debug(GOSSIP_SYSFS_DEBUG,
"sysfs_service_op_show: id:%s:\n",
- kobj_id);
+ kobj->name);
- if (strcmp(kobj_id, PC_KOBJ_ID))
+ if (strcmp(kobj->name, PC_KOBJ_ID))
op_alloc_type = ORANGEFS_VFS_OP_PARAM;
else
op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT;
@@ -818,124 +321,135 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
goto out;
}
- if (strcmp(kobj_id, PC_KOBJ_ID))
+ if (strcmp(kobj->name, PC_KOBJ_ID))
new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET;
- if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
- orangefs_attr = (struct orangefs_attribute *)attr;
+ if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
+ /* Drop unsupported requests first. */
+ if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) &&
+ (!strcmp(attr->attr.name, "readahead_count") ||
+ !strcmp(attr->attr.name, "readahead_size") ||
+ !strcmp(attr->attr.name, "readahead_count_size"))) {
+ rc = -EINVAL;
+ goto out;
+ }
- if (!strcmp(orangefs_attr->attr.name, "perf_history_size"))
+ if (!strcmp(attr->attr.name, "perf_history_size"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
- else if (!strcmp(orangefs_attr->attr.name,
+ else if (!strcmp(attr->attr.name,
"perf_time_interval_secs"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
- else if (!strcmp(orangefs_attr->attr.name,
+ else if (!strcmp(attr->attr.name,
"perf_counter_reset"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
- } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
- acache_attr = (struct acache_orangefs_attribute *)attr;
+ else if (!strcmp(attr->attr.name,
+ "readahead_count"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT;
+
+ else if (!strcmp(attr->attr.name,
+ "readahead_size"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE;
- if (!strcmp(acache_attr->attr.name, "timeout_msecs"))
+ else if (!strcmp(attr->attr.name,
+ "readahead_count_size"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
+ } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, "timeout_msecs"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
- if (!strcmp(acache_attr->attr.name, "hard_limit"))
+ if (!strcmp(attr->attr.name, "hard_limit"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
- if (!strcmp(acache_attr->attr.name, "soft_limit"))
+ if (!strcmp(attr->attr.name, "soft_limit"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
- if (!strcmp(acache_attr->attr.name, "reclaim_percentage"))
+ if (!strcmp(attr->attr.name, "reclaim_percentage"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
- } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
- capcache_attr = (struct capcache_orangefs_attribute *)attr;
-
- if (!strcmp(capcache_attr->attr.name, "timeout_secs"))
+ } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, "timeout_secs"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
- if (!strcmp(capcache_attr->attr.name, "hard_limit"))
+ if (!strcmp(attr->attr.name, "hard_limit"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
- if (!strcmp(capcache_attr->attr.name, "soft_limit"))
+ if (!strcmp(attr->attr.name, "soft_limit"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
- if (!strcmp(capcache_attr->attr.name, "reclaim_percentage"))
+ if (!strcmp(attr->attr.name, "reclaim_percentage"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
- } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
- ccache_attr = (struct ccache_orangefs_attribute *)attr;
-
- if (!strcmp(ccache_attr->attr.name, "timeout_secs"))
+ } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, "timeout_secs"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
- if (!strcmp(ccache_attr->attr.name, "hard_limit"))
+ if (!strcmp(attr->attr.name, "hard_limit"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
- if (!strcmp(ccache_attr->attr.name, "soft_limit"))
+ if (!strcmp(attr->attr.name, "soft_limit"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
- if (!strcmp(ccache_attr->attr.name, "reclaim_percentage"))
+ if (!strcmp(attr->attr.name, "reclaim_percentage"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
- } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
- ncache_attr = (struct ncache_orangefs_attribute *)attr;
-
- if (!strcmp(ncache_attr->attr.name, "timeout_msecs"))
+ } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, "timeout_msecs"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
- if (!strcmp(ncache_attr->attr.name, "hard_limit"))
+ if (!strcmp(attr->attr.name, "hard_limit"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
- if (!strcmp(ncache_attr->attr.name, "soft_limit"))
+ if (!strcmp(attr->attr.name, "soft_limit"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
- if (!strcmp(ncache_attr->attr.name, "reclaim_percentage"))
+ if (!strcmp(attr->attr.name, "reclaim_percentage"))
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
- } else if (!strcmp(kobj_id, PC_KOBJ_ID)) {
- pc_attr = (struct pc_orangefs_attribute *)attr;
-
- if (!strcmp(pc_attr->attr.name, ACACHE_KOBJ_ID))
+ } else if (!strcmp(kobj->name, PC_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, ACACHE_KOBJ_ID))
new_op->upcall.req.perf_count.type =
ORANGEFS_PERF_COUNT_REQUEST_ACACHE;
- if (!strcmp(pc_attr->attr.name, CAPCACHE_KOBJ_ID))
+ if (!strcmp(attr->attr.name, CAPCACHE_KOBJ_ID))
new_op->upcall.req.perf_count.type =
ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE;
- if (!strcmp(pc_attr->attr.name, NCACHE_KOBJ_ID))
+ if (!strcmp(attr->attr.name, NCACHE_KOBJ_ID))
new_op->upcall.req.perf_count.type =
ORANGEFS_PERF_COUNT_REQUEST_NCACHE;
} else {
gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n",
- kobj_id);
+ kobj->name);
rc = -EINVAL;
goto out;
}
- if (strcmp(kobj_id, PC_KOBJ_ID))
+ if (strcmp(kobj->name, PC_KOBJ_ID))
ser_op_type = "orangefs_param";
else
ser_op_type = "orangefs_perf_count";
@@ -948,11 +462,18 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
out:
if (!rc) {
- if (strcmp(kobj_id, PC_KOBJ_ID)) {
- rc = scnprintf(buf,
- PAGE_SIZE,
- "%d\n",
- (int)new_op->downcall.resp.param.value);
+ if (strcmp(kobj->name, PC_KOBJ_ID)) {
+ if (new_op->upcall.req.param.op ==
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) {
+ rc = scnprintf(buf, PAGE_SIZE, "%d %d\n",
+ (int)new_op->downcall.resp.param.u.
+ value32[0],
+ (int)new_op->downcall.resp.param.u.
+ value32[1]);
+ } else {
+ rc = scnprintf(buf, PAGE_SIZE, "%d\n",
+ (int)new_op->downcall.resp.param.u.value64);
+ }
} else {
rc = scnprintf(
buf,
@@ -968,77 +489,6 @@ out:
}
-static ssize_t service_orangefs_show(struct orangefs_obj *orangefs_obj,
- struct orangefs_attribute *attr,
- char *buf)
-{
- int rc = 0;
-
- rc = sysfs_service_op_show(ORANGEFS_KOBJ_ID, buf, (void *)attr);
-
- return rc;
-}
-
-static ssize_t
- service_acache_show(struct acache_orangefs_obj *acache_orangefs_obj,
- struct acache_orangefs_attribute *attr,
- char *buf)
-{
- int rc = 0;
-
- rc = sysfs_service_op_show(ACACHE_KOBJ_ID, buf, (void *)attr);
-
- return rc;
-}
-
-static ssize_t service_capcache_show(struct capcache_orangefs_obj
- *capcache_orangefs_obj,
- struct capcache_orangefs_attribute *attr,
- char *buf)
-{
- int rc = 0;
-
- rc = sysfs_service_op_show(CAPCACHE_KOBJ_ID, buf, (void *)attr);
-
- return rc;
-}
-
-static ssize_t service_ccache_show(struct ccache_orangefs_obj
- *ccache_orangefs_obj,
- struct ccache_orangefs_attribute *attr,
- char *buf)
-{
- int rc = 0;
-
- rc = sysfs_service_op_show(CCACHE_KOBJ_ID, buf, (void *)attr);
-
- return rc;
-}
-
-static ssize_t
- service_ncache_show(struct ncache_orangefs_obj *ncache_orangefs_obj,
- struct ncache_orangefs_attribute *attr,
- char *buf)
-{
- int rc = 0;
-
- rc = sysfs_service_op_show(NCACHE_KOBJ_ID, buf, (void *)attr);
-
- return rc;
-}
-
-static ssize_t
- service_pc_show(struct pc_orangefs_obj *pc_orangefs_obj,
- struct pc_orangefs_attribute *attr,
- char *buf)
-{
- int rc = 0;
-
- rc = sysfs_service_op_show(PC_KOBJ_ID, buf, (void *)attr);
-
- return rc;
-}
-
/*
* pass attribute values back to userspace with a service operation.
*
@@ -1050,20 +500,16 @@ static ssize_t
* We want to return 1 if we think everything went OK, and
* EINVAL if not.
*/
-static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
+static ssize_t sysfs_service_op_store(struct kobject *kobj,
+ struct orangefs_attribute *attr, const char *buf, size_t count)
{
struct orangefs_kernel_op_s *new_op = NULL;
int val = 0;
int rc = 0;
- struct orangefs_attribute *orangefs_attr;
- struct acache_orangefs_attribute *acache_attr;
- struct capcache_orangefs_attribute *capcache_attr;
- struct ccache_orangefs_attribute *ccache_attr;
- struct ncache_orangefs_attribute *ncache_attr;
gossip_debug(GOSSIP_SYSFS_DEBUG,
"sysfs_service_op_store: id:%s:\n",
- kobj_id);
+ kobj->name);
new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
if (!new_op)
@@ -1079,16 +525,29 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
}
/*
- * The value we want to send back to userspace is in buf.
+ * The value we want to send back to userspace is in buf, unless this
+ * there are two parameters, which is specially handled below.
*/
- rc = kstrtoint(buf, 0, &val);
- if (rc)
- goto out;
+ if (strcmp(kobj->name, ORANGEFS_KOBJ_ID) ||
+ strcmp(attr->attr.name, "readahead_count_size")) {
+ rc = kstrtoint(buf, 0, &val);
+ if (rc)
+ goto out;
+ }
- if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
- orangefs_attr = (struct orangefs_attribute *)attr;
+ new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
+
+ if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
+ /* Drop unsupported requests first. */
+ if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) &&
+ (!strcmp(attr->attr.name, "readahead_count") ||
+ !strcmp(attr->attr.name, "readahead_size") ||
+ !strcmp(attr->attr.name, "readahead_count_size"))) {
+ rc = -EINVAL;
+ goto out;
+ }
- if (!strcmp(orangefs_attr->attr.name, "perf_history_size")) {
+ if (!strcmp(attr->attr.name, "perf_history_size")) {
if (val > 0) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
@@ -1096,7 +555,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(orangefs_attr->attr.name,
+ } else if (!strcmp(attr->attr.name,
"perf_time_interval_secs")) {
if (val > 0) {
new_op->upcall.req.param.op =
@@ -1105,7 +564,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(orangefs_attr->attr.name,
+ } else if (!strcmp(attr->attr.name,
"perf_counter_reset")) {
if ((val == 0) || (val == 1)) {
new_op->upcall.req.param.op =
@@ -1114,12 +573,55 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
+ } else if (!strcmp(attr->attr.name,
+ "readahead_count")) {
+ if ((val >= 0)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(attr->attr.name,
+ "readahead_size")) {
+ if ((val >= 0)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(attr->attr.name,
+ "readahead_count_size")) {
+ int val1, val2;
+ rc = sscanf(buf, "%d %d", &val1, &val2);
+ if (rc < 2) {
+ rc = 0;
+ goto out;
+ }
+ if ((val1 >= 0) && (val2 >= 0)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ new_op->upcall.req.param.u.value32[0] = val1;
+ new_op->upcall.req.param.u.value32[1] = val2;
+ goto value_set;
+ } else if (!strcmp(attr->attr.name,
+ "perf_counter_reset")) {
+ if ((val > 0)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
+ } else {
+ rc = 0;
+ goto out;
+ }
}
- } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
- acache_attr = (struct acache_orangefs_attribute *)attr;
-
- if (!strcmp(acache_attr->attr.name, "hard_limit")) {
+ } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, "hard_limit")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
@@ -1127,7 +629,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(acache_attr->attr.name, "soft_limit")) {
+ } else if (!strcmp(attr->attr.name, "soft_limit")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
@@ -1135,7 +637,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(acache_attr->attr.name,
+ } else if (!strcmp(attr->attr.name,
"reclaim_percentage")) {
if ((val > -1) && (val < 101)) {
new_op->upcall.req.param.op =
@@ -1144,7 +646,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(acache_attr->attr.name, "timeout_msecs")) {
+ } else if (!strcmp(attr->attr.name, "timeout_msecs")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
@@ -1154,10 +656,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
}
}
- } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
- capcache_attr = (struct capcache_orangefs_attribute *)attr;
-
- if (!strcmp(capcache_attr->attr.name, "hard_limit")) {
+ } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, "hard_limit")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
@@ -1165,7 +665,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(capcache_attr->attr.name, "soft_limit")) {
+ } else if (!strcmp(attr->attr.name, "soft_limit")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
@@ -1173,7 +673,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(capcache_attr->attr.name,
+ } else if (!strcmp(attr->attr.name,
"reclaim_percentage")) {
if ((val > -1) && (val < 101)) {
new_op->upcall.req.param.op =
@@ -1182,7 +682,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(capcache_attr->attr.name, "timeout_secs")) {
+ } else if (!strcmp(attr->attr.name, "timeout_secs")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
@@ -1192,10 +692,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
}
}
- } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
- ccache_attr = (struct ccache_orangefs_attribute *)attr;
-
- if (!strcmp(ccache_attr->attr.name, "hard_limit")) {
+ } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, "hard_limit")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
@@ -1203,7 +701,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(ccache_attr->attr.name, "soft_limit")) {
+ } else if (!strcmp(attr->attr.name, "soft_limit")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
@@ -1211,7 +709,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(ccache_attr->attr.name,
+ } else if (!strcmp(attr->attr.name,
"reclaim_percentage")) {
if ((val > -1) && (val < 101)) {
new_op->upcall.req.param.op =
@@ -1220,7 +718,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(ccache_attr->attr.name, "timeout_secs")) {
+ } else if (!strcmp(attr->attr.name, "timeout_secs")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
@@ -1230,10 +728,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
}
}
- } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
- ncache_attr = (struct ncache_orangefs_attribute *)attr;
-
- if (!strcmp(ncache_attr->attr.name, "hard_limit")) {
+ } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) {
+ if (!strcmp(attr->attr.name, "hard_limit")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
@@ -1241,7 +737,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(ncache_attr->attr.name, "soft_limit")) {
+ } else if (!strcmp(attr->attr.name, "soft_limit")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
@@ -1249,7 +745,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(ncache_attr->attr.name,
+ } else if (!strcmp(attr->attr.name,
"reclaim_percentage")) {
if ((val > -1) && (val < 101)) {
new_op->upcall.req.param.op =
@@ -1258,7 +754,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
- } else if (!strcmp(ncache_attr->attr.name, "timeout_msecs")) {
+ } else if (!strcmp(attr->attr.name, "timeout_msecs")) {
if (val > -1) {
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
@@ -1270,14 +766,13 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
} else {
gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n",
- kobj_id);
+ kobj->name);
rc = -EINVAL;
goto out;
}
- new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
-
- new_op->upcall.req.param.value = val;
+ new_op->upcall.req.param.u.value64 = val;
+value_set:
/*
* The service_operation will return a errno return code on
@@ -1290,7 +785,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc);
rc = 0;
} else {
- rc = 1;
+ rc = count;
}
out:
@@ -1302,127 +797,56 @@ out:
return rc;
}
-static ssize_t
- service_orangefs_store(struct orangefs_obj *orangefs_obj,
- struct orangefs_attribute *attr,
- const char *buf,
- size_t count)
-{
- int rc = 0;
-
- rc = sysfs_service_op_store(ORANGEFS_KOBJ_ID, buf, (void *) attr);
-
- /* rc should have an errno value if the service_op went bad. */
- if (rc == 1)
- rc = count;
-
- return rc;
-}
-
-static ssize_t
- service_acache_store(struct acache_orangefs_obj *acache_orangefs_obj,
- struct acache_orangefs_attribute *attr,
- const char *buf,
- size_t count)
-{
- int rc = 0;
-
- rc = sysfs_service_op_store(ACACHE_KOBJ_ID, buf, (void *) attr);
-
- /* rc should have an errno value if the service_op went bad. */
- if (rc == 1)
- rc = count;
-
- return rc;
-}
-
-static ssize_t
- service_capcache_store(struct capcache_orangefs_obj
- *capcache_orangefs_obj,
- struct capcache_orangefs_attribute *attr,
- const char *buf,
- size_t count)
-{
- int rc = 0;
-
- rc = sysfs_service_op_store(CAPCACHE_KOBJ_ID, buf, (void *) attr);
-
- /* rc should have an errno value if the service_op went bad. */
- if (rc == 1)
- rc = count;
-
- return rc;
-}
-
-static ssize_t service_ccache_store(struct ccache_orangefs_obj
- *ccache_orangefs_obj,
- struct ccache_orangefs_attribute *attr,
- const char *buf,
- size_t count)
-{
- int rc = 0;
-
- rc = sysfs_service_op_store(CCACHE_KOBJ_ID, buf, (void *) attr);
-
- /* rc should have an errno value if the service_op went bad. */
- if (rc == 1)
- rc = count;
-
- return rc;
-}
-
-static ssize_t
- service_ncache_store(struct ncache_orangefs_obj *ncache_orangefs_obj,
- struct ncache_orangefs_attribute *attr,
- const char *buf,
- size_t count)
-{
- int rc = 0;
-
- rc = sysfs_service_op_store(NCACHE_KOBJ_ID, buf, (void *) attr);
-
- /* rc should have an errno value if the service_op went bad. */
- if (rc == 1)
- rc = count;
-
- return rc;
-}
-
static struct orangefs_attribute op_timeout_secs_attribute =
- __ATTR(op_timeout_secs, 0664, int_orangefs_show, int_store);
+ __ATTR(op_timeout_secs, 0664, sysfs_int_show, sysfs_int_store);
static struct orangefs_attribute slot_timeout_secs_attribute =
- __ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
+ __ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store);
static struct orangefs_attribute dcache_timeout_msecs_attribute =
- __ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store);
+ __ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
static struct orangefs_attribute getattr_timeout_msecs_attribute =
- __ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store);
+ __ATTR(getattr_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
+
+static struct orangefs_attribute readahead_count_attribute =
+ __ATTR(readahead_count, 0664, sysfs_service_op_show,
+ sysfs_service_op_store);
+
+static struct orangefs_attribute readahead_size_attribute =
+ __ATTR(readahead_size, 0664, sysfs_service_op_show,
+ sysfs_service_op_store);
+
+static struct orangefs_attribute readahead_count_size_attribute =
+ __ATTR(readahead_count_size, 0664, sysfs_service_op_show,
+ sysfs_service_op_store);
static struct orangefs_attribute perf_counter_reset_attribute =
__ATTR(perf_counter_reset,
0664,
- service_orangefs_show,
- service_orangefs_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
static struct orangefs_attribute perf_history_size_attribute =
__ATTR(perf_history_size,
0664,
- service_orangefs_show,
- service_orangefs_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
static struct orangefs_attribute perf_time_interval_secs_attribute =
__ATTR(perf_time_interval_secs,
0664,
- service_orangefs_show,
- service_orangefs_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
static struct attribute *orangefs_default_attrs[] = {
&op_timeout_secs_attribute.attr,
&slot_timeout_secs_attribute.attr,
&dcache_timeout_msecs_attribute.attr,
&getattr_timeout_msecs_attribute.attr,
+ &readahead_count_attribute.attr,
+ &readahead_size_attribute.attr,
+ &readahead_count_size_attribute.attr,
&perf_counter_reset_attribute.attr,
&perf_history_size_attribute.attr,
&perf_time_interval_secs_attribute.attr,
@@ -1431,33 +855,32 @@ static struct attribute *orangefs_default_attrs[] = {
static struct kobj_type orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
- .release = orangefs_release,
.default_attrs = orangefs_default_attrs,
};
-static struct acache_orangefs_attribute acache_hard_limit_attribute =
+static struct orangefs_attribute acache_hard_limit_attribute =
__ATTR(hard_limit,
0664,
- service_acache_show,
- service_acache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct acache_orangefs_attribute acache_reclaim_percent_attribute =
+static struct orangefs_attribute acache_reclaim_percent_attribute =
__ATTR(reclaim_percentage,
0664,
- service_acache_show,
- service_acache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct acache_orangefs_attribute acache_soft_limit_attribute =
+static struct orangefs_attribute acache_soft_limit_attribute =
__ATTR(soft_limit,
0664,
- service_acache_show,
- service_acache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct acache_orangefs_attribute acache_timeout_msecs_attribute =
+static struct orangefs_attribute acache_timeout_msecs_attribute =
__ATTR(timeout_msecs,
0664,
- service_acache_show,
- service_acache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
static struct attribute *acache_orangefs_default_attrs[] = {
&acache_hard_limit_attribute.attr,
@@ -1468,34 +891,33 @@ static struct attribute *acache_orangefs_default_attrs[] = {
};
static struct kobj_type acache_orangefs_ktype = {
- .sysfs_ops = &acache_orangefs_sysfs_ops,
- .release = acache_orangefs_release,
+ .sysfs_ops = &orangefs_sysfs_ops,
.default_attrs = acache_orangefs_default_attrs,
};
-static struct capcache_orangefs_attribute capcache_hard_limit_attribute =
+static struct orangefs_attribute capcache_hard_limit_attribute =
__ATTR(hard_limit,
0664,
- service_capcache_show,
- service_capcache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct capcache_orangefs_attribute capcache_reclaim_percent_attribute =
+static struct orangefs_attribute capcache_reclaim_percent_attribute =
__ATTR(reclaim_percentage,
0664,
- service_capcache_show,
- service_capcache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct capcache_orangefs_attribute capcache_soft_limit_attribute =
+static struct orangefs_attribute capcache_soft_limit_attribute =
__ATTR(soft_limit,
0664,
- service_capcache_show,
- service_capcache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct capcache_orangefs_attribute capcache_timeout_secs_attribute =
+static struct orangefs_attribute capcache_timeout_secs_attribute =
__ATTR(timeout_secs,
0664,
- service_capcache_show,
- service_capcache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
static struct attribute *capcache_orangefs_default_attrs[] = {
&capcache_hard_limit_attribute.attr,
@@ -1506,34 +928,33 @@ static struct attribute *capcache_orangefs_default_attrs[] = {
};
static struct kobj_type capcache_orangefs_ktype = {
- .sysfs_ops = &capcache_orangefs_sysfs_ops,
- .release = capcache_orangefs_release,
+ .sysfs_ops = &orangefs_sysfs_ops,
.default_attrs = capcache_orangefs_default_attrs,
};
-static struct ccache_orangefs_attribute ccache_hard_limit_attribute =
+static struct orangefs_attribute ccache_hard_limit_attribute =
__ATTR(hard_limit,
0664,
- service_ccache_show,
- service_ccache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct ccache_orangefs_attribute ccache_reclaim_percent_attribute =
+static struct orangefs_attribute ccache_reclaim_percent_attribute =
__ATTR(reclaim_percentage,
0664,
- service_ccache_show,
- service_ccache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct ccache_orangefs_attribute ccache_soft_limit_attribute =
+static struct orangefs_attribute ccache_soft_limit_attribute =
__ATTR(soft_limit,
0664,
- service_ccache_show,
- service_ccache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct ccache_orangefs_attribute ccache_timeout_secs_attribute =
+static struct orangefs_attribute ccache_timeout_secs_attribute =
__ATTR(timeout_secs,
0664,
- service_ccache_show,
- service_ccache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
static struct attribute *ccache_orangefs_default_attrs[] = {
&ccache_hard_limit_attribute.attr,
@@ -1544,34 +965,33 @@ static struct attribute *ccache_orangefs_default_attrs[] = {
};
static struct kobj_type ccache_orangefs_ktype = {
- .sysfs_ops = &ccache_orangefs_sysfs_ops,
- .release = ccache_orangefs_release,
+ .sysfs_ops = &orangefs_sysfs_ops,
.default_attrs = ccache_orangefs_default_attrs,
};
-static struct ncache_orangefs_attribute ncache_hard_limit_attribute =
+static struct orangefs_attribute ncache_hard_limit_attribute =
__ATTR(hard_limit,
0664,
- service_ncache_show,
- service_ncache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct ncache_orangefs_attribute ncache_reclaim_percent_attribute =
+static struct orangefs_attribute ncache_reclaim_percent_attribute =
__ATTR(reclaim_percentage,
0664,
- service_ncache_show,
- service_ncache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct ncache_orangefs_attribute ncache_soft_limit_attribute =
+static struct orangefs_attribute ncache_soft_limit_attribute =
__ATTR(soft_limit,
0664,
- service_ncache_show,
- service_ncache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
-static struct ncache_orangefs_attribute ncache_timeout_msecs_attribute =
+static struct orangefs_attribute ncache_timeout_msecs_attribute =
__ATTR(timeout_msecs,
0664,
- service_ncache_show,
- service_ncache_store);
+ sysfs_service_op_show,
+ sysfs_service_op_store);
static struct attribute *ncache_orangefs_default_attrs[] = {
&ncache_hard_limit_attribute.attr,
@@ -1582,27 +1002,26 @@ static struct attribute *ncache_orangefs_default_attrs[] = {
};
static struct kobj_type ncache_orangefs_ktype = {
- .sysfs_ops = &ncache_orangefs_sysfs_ops,
- .release = ncache_orangefs_release,
+ .sysfs_ops = &orangefs_sysfs_ops,
.default_attrs = ncache_orangefs_default_attrs,
};
-static struct pc_orangefs_attribute pc_acache_attribute =
+static struct orangefs_attribute pc_acache_attribute =
__ATTR(acache,
0664,
- service_pc_show,
+ sysfs_service_op_show,
NULL);
-static struct pc_orangefs_attribute pc_capcache_attribute =
+static struct orangefs_attribute pc_capcache_attribute =
__ATTR(capcache,
0664,
- service_pc_show,
+ sysfs_service_op_show,
NULL);
-static struct pc_orangefs_attribute pc_ncache_attribute =
+static struct orangefs_attribute pc_ncache_attribute =
__ATTR(ncache,
0664,
- service_pc_show,
+ sysfs_service_op_show,
NULL);
static struct attribute *pc_orangefs_default_attrs[] = {
@@ -1613,21 +1032,20 @@ static struct attribute *pc_orangefs_default_attrs[] = {
};
static struct kobj_type pc_orangefs_ktype = {
- .sysfs_ops = &pc_orangefs_sysfs_ops,
- .release = pc_orangefs_release,
+ .sysfs_ops = &orangefs_sysfs_ops,
.default_attrs = pc_orangefs_default_attrs,
};
-static struct stats_orangefs_attribute stats_reads_attribute =
+static struct orangefs_attribute stats_reads_attribute =
__ATTR(reads,
0664,
- int_stats_show,
+ sysfs_int_show,
NULL);
-static struct stats_orangefs_attribute stats_writes_attribute =
+static struct orangefs_attribute stats_writes_attribute =
__ATTR(writes,
0664,
- int_stats_show,
+ sysfs_int_show,
NULL);
static struct attribute *stats_orangefs_default_attrs[] = {
@@ -1637,18 +1055,17 @@ static struct attribute *stats_orangefs_default_attrs[] = {
};
static struct kobj_type stats_orangefs_ktype = {
- .sysfs_ops = &stats_orangefs_sysfs_ops,
- .release = stats_orangefs_release,
+ .sysfs_ops = &orangefs_sysfs_ops,
.default_attrs = stats_orangefs_default_attrs,
};
-static struct orangefs_obj *orangefs_obj;
-static struct acache_orangefs_obj *acache_orangefs_obj;
-static struct capcache_orangefs_obj *capcache_orangefs_obj;
-static struct ccache_orangefs_obj *ccache_orangefs_obj;
-static struct ncache_orangefs_obj *ncache_orangefs_obj;
-static struct pc_orangefs_obj *pc_orangefs_obj;
-static struct stats_orangefs_obj *stats_orangefs_obj;
+static struct kobject *orangefs_obj;
+static struct kobject *acache_orangefs_obj;
+static struct kobject *capcache_orangefs_obj;
+static struct kobject *ccache_orangefs_obj;
+static struct kobject *ncache_orangefs_obj;
+static struct kobject *pc_orangefs_obj;
+static struct kobject *stats_orangefs_obj;
int orangefs_sysfs_init(void)
{
@@ -1661,7 +1078,7 @@ int orangefs_sysfs_init(void)
if (!orangefs_obj)
goto out;
- rc = kobject_init_and_add(&orangefs_obj->kobj,
+ rc = kobject_init_and_add(orangefs_obj,
&orangefs_ktype,
fs_kobj,
ORANGEFS_KOBJ_ID);
@@ -1669,7 +1086,7 @@ int orangefs_sysfs_init(void)
if (rc)
goto ofs_obj_bail;
- kobject_uevent(&orangefs_obj->kobj, KOBJ_ADD);
+ kobject_uevent(orangefs_obj, KOBJ_ADD);
/* create /sys/fs/orangefs/acache. */
acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL);
@@ -1678,15 +1095,15 @@ int orangefs_sysfs_init(void)
goto ofs_obj_bail;
}
- rc = kobject_init_and_add(&acache_orangefs_obj->kobj,
+ rc = kobject_init_and_add(acache_orangefs_obj,
&acache_orangefs_ktype,
- &orangefs_obj->kobj,
+ orangefs_obj,
ACACHE_KOBJ_ID);
if (rc)
goto acache_obj_bail;
- kobject_uevent(&acache_orangefs_obj->kobj, KOBJ_ADD);
+ kobject_uevent(acache_orangefs_obj, KOBJ_ADD);
/* create /sys/fs/orangefs/capcache. */
capcache_orangefs_obj =
@@ -1696,14 +1113,14 @@ int orangefs_sysfs_init(void)
goto acache_obj_bail;
}
- rc = kobject_init_and_add(&capcache_orangefs_obj->kobj,
+ rc = kobject_init_and_add(capcache_orangefs_obj,
&capcache_orangefs_ktype,
- &orangefs_obj->kobj,
+ orangefs_obj,
CAPCACHE_KOBJ_ID);
if (rc)
goto capcache_obj_bail;
- kobject_uevent(&capcache_orangefs_obj->kobj, KOBJ_ADD);
+ kobject_uevent(capcache_orangefs_obj, KOBJ_ADD);
/* create /sys/fs/orangefs/ccache. */
ccache_orangefs_obj =
@@ -1713,14 +1130,14 @@ int orangefs_sysfs_init(void)
goto capcache_obj_bail;
}
- rc = kobject_init_and_add(&ccache_orangefs_obj->kobj,
+ rc = kobject_init_and_add(ccache_orangefs_obj,
&ccache_orangefs_ktype,
- &orangefs_obj->kobj,
+ orangefs_obj,
CCACHE_KOBJ_ID);
if (rc)
goto ccache_obj_bail;
- kobject_uevent(&ccache_orangefs_obj->kobj, KOBJ_ADD);
+ kobject_uevent(ccache_orangefs_obj, KOBJ_ADD);
/* create /sys/fs/orangefs/ncache. */
ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL);
@@ -1729,15 +1146,15 @@ int orangefs_sysfs_init(void)
goto ccache_obj_bail;
}
- rc = kobject_init_and_add(&ncache_orangefs_obj->kobj,
+ rc = kobject_init_and_add(ncache_orangefs_obj,
&ncache_orangefs_ktype,
- &orangefs_obj->kobj,
+ orangefs_obj,
NCACHE_KOBJ_ID);
if (rc)
goto ncache_obj_bail;
- kobject_uevent(&ncache_orangefs_obj->kobj, KOBJ_ADD);
+ kobject_uevent(ncache_orangefs_obj, KOBJ_ADD);
/* create /sys/fs/orangefs/perf_counters. */
pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL);
@@ -1746,15 +1163,15 @@ int orangefs_sysfs_init(void)
goto ncache_obj_bail;
}
- rc = kobject_init_and_add(&pc_orangefs_obj->kobj,
+ rc = kobject_init_and_add(pc_orangefs_obj,
&pc_orangefs_ktype,
- &orangefs_obj->kobj,
+ orangefs_obj,
"perf_counters");
if (rc)
goto pc_obj_bail;
- kobject_uevent(&pc_orangefs_obj->kobj, KOBJ_ADD);
+ kobject_uevent(pc_orangefs_obj, KOBJ_ADD);
/* create /sys/fs/orangefs/stats. */
stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL);
@@ -1763,37 +1180,31 @@ int orangefs_sysfs_init(void)
goto pc_obj_bail;
}
- rc = kobject_init_and_add(&stats_orangefs_obj->kobj,
+ rc = kobject_init_and_add(stats_orangefs_obj,
&stats_orangefs_ktype,
- &orangefs_obj->kobj,
+ orangefs_obj,
STATS_KOBJ_ID);
if (rc)
goto stats_obj_bail;
- kobject_uevent(&stats_orangefs_obj->kobj, KOBJ_ADD);
+ kobject_uevent(stats_orangefs_obj, KOBJ_ADD);
goto out;
stats_obj_bail:
- kobject_put(&stats_orangefs_obj->kobj);
-
+ kobject_put(stats_orangefs_obj);
pc_obj_bail:
- kobject_put(&pc_orangefs_obj->kobj);
-
+ kobject_put(pc_orangefs_obj);
ncache_obj_bail:
- kobject_put(&ncache_orangefs_obj->kobj);
-
+ kobject_put(ncache_orangefs_obj);
ccache_obj_bail:
- kobject_put(&ccache_orangefs_obj->kobj);
-
+ kobject_put(ccache_orangefs_obj);
capcache_obj_bail:
- kobject_put(&capcache_orangefs_obj->kobj);
-
+ kobject_put(capcache_orangefs_obj);
acache_obj_bail:
- kobject_put(&acache_orangefs_obj->kobj);
-
+ kobject_put(acache_orangefs_obj);
ofs_obj_bail:
- kobject_put(&orangefs_obj->kobj);
+ kobject_put(orangefs_obj);
out:
return rc;
}
@@ -1801,13 +1212,11 @@ out:
void orangefs_sysfs_exit(void)
{
gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n");
-
- kobject_put(&acache_orangefs_obj->kobj);
- kobject_put(&capcache_orangefs_obj->kobj);
- kobject_put(&ccache_orangefs_obj->kobj);
- kobject_put(&ncache_orangefs_obj->kobj);
- kobject_put(&pc_orangefs_obj->kobj);
- kobject_put(&stats_orangefs_obj->kobj);
-
- kobject_put(&orangefs_obj->kobj);
+ kobject_put(acache_orangefs_obj);
+ kobject_put(capcache_orangefs_obj);
+ kobject_put(ccache_orangefs_obj);
+ kobject_put(ncache_orangefs_obj);
+ kobject_put(pc_orangefs_obj);
+ kobject_put(stats_orangefs_obj);
+ kobject_put(orangefs_obj);
}
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index d13c7291fd05..06af81f71e10 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -50,7 +50,7 @@ __s32 fsid_of_op(struct orangefs_kernel_op_s *op)
case ORANGEFS_VFS_OP_TRUNCATE:
fsid = op->upcall.req.truncate.refn.fs_id;
break;
- case ORANGEFS_VFS_OP_MMAP_RA_FLUSH:
+ case ORANGEFS_VFS_OP_RA_FLUSH:
fsid = op->upcall.req.ra_cache_flush.refn.fs_id;
break;
case ORANGEFS_VFS_OP_FS_UMOUNT:
@@ -347,7 +347,8 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
- orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000;
+ orangefs_inode->getattr_time = jiffies +
+ orangefs_getattr_timeout_msecs*HZ/1000;
ret = 0;
out:
op_release(new_op);
@@ -656,401 +657,3 @@ __s32 ORANGEFS_util_translate_mode(int mode)
return ret;
}
#undef NUM_MODES
-
-/*
- * After obtaining a string representation of the client's debug
- * keywords and their associated masks, this function is called to build an
- * array of these values.
- */
-int orangefs_prepare_cdm_array(char *debug_array_string)
-{
- int i;
- int rc = -EINVAL;
- char *cds_head = NULL;
- char *cds_delimiter = NULL;
- int keyword_len = 0;
-
- gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
- /*
- * figure out how many elements the cdm_array needs.
- */
- for (i = 0; i < strlen(debug_array_string); i++)
- if (debug_array_string[i] == '\n')
- cdm_element_count++;
-
- if (!cdm_element_count) {
- pr_info("No elements in client debug array string!\n");
- goto out;
- }
-
- cdm_array =
- kzalloc(cdm_element_count * sizeof(struct client_debug_mask),
- GFP_KERNEL);
- if (!cdm_array) {
- pr_info("malloc failed for cdm_array!\n");
- rc = -ENOMEM;
- goto out;
- }
-
- cds_head = debug_array_string;
-
- for (i = 0; i < cdm_element_count; i++) {
- cds_delimiter = strchr(cds_head, '\n');
- *cds_delimiter = '\0';
-
- keyword_len = strcspn(cds_head, " ");
-
- cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL);
- if (!cdm_array[i].keyword) {
- rc = -ENOMEM;
- goto out;
- }
-
- sscanf(cds_head,
- "%s %llx %llx",
- cdm_array[i].keyword,
- (unsigned long long *)&(cdm_array[i].mask1),
- (unsigned long long *)&(cdm_array[i].mask2));
-
- if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE))
- client_verbose_index = i;
-
- if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL))
- client_all_index = i;
-
- cds_head = cds_delimiter + 1;
- }
-
- rc = cdm_element_count;
-
- gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc);
-
-out:
-
- return rc;
-
-}
-
-/*
- * /sys/kernel/debug/orangefs/debug-help can be catted to
- * see all the available kernel and client debug keywords.
- *
- * When the kernel boots, we have no idea what keywords the
- * client supports, nor their associated masks.
- *
- * We pass through this function once at boot and stamp a
- * boilerplate "we don't know" message for the client in the
- * debug-help file. We pass through here again when the client
- * starts and then we can fill out the debug-help file fully.
- *
- * The client might be restarted any number of times between
- * reboots, we only build the debug-help file the first time.
- */
-int orangefs_prepare_debugfs_help_string(int at_boot)
-{
- int rc = -EINVAL;
- int i;
- int byte_count = 0;
- char *client_title = "Client Debug Keywords:\n";
- char *kernel_title = "Kernel Debug Keywords:\n";
-
- gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
- if (at_boot) {
- byte_count += strlen(HELP_STRING_UNINITIALIZED);
- client_title = HELP_STRING_UNINITIALIZED;
- } else {
- /*
- * fill the client keyword/mask array and remember
- * how many elements there were.
- */
- cdm_element_count =
- orangefs_prepare_cdm_array(client_debug_array_string);
- if (cdm_element_count <= 0)
- goto out;
-
- /* Count the bytes destined for debug_help_string. */
- byte_count += strlen(client_title);
-
- for (i = 0; i < cdm_element_count; i++) {
- byte_count += strlen(cdm_array[i].keyword + 2);
- if (byte_count >= DEBUG_HELP_STRING_SIZE) {
- pr_info("%s: overflow 1!\n", __func__);
- goto out;
- }
- }
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "%s: cdm_element_count:%d:\n",
- __func__,
- cdm_element_count);
- }
-
- byte_count += strlen(kernel_title);
- for (i = 0; i < num_kmod_keyword_mask_map; i++) {
- byte_count +=
- strlen(s_kmod_keyword_mask_map[i].keyword + 2);
- if (byte_count >= DEBUG_HELP_STRING_SIZE) {
- pr_info("%s: overflow 2!\n", __func__);
- goto out;
- }
- }
-
- /* build debug_help_string. */
- debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
- if (!debug_help_string) {
- rc = -ENOMEM;
- goto out;
- }
-
- strcat(debug_help_string, client_title);
-
- if (!at_boot) {
- for (i = 0; i < cdm_element_count; i++) {
- strcat(debug_help_string, "\t");
- strcat(debug_help_string, cdm_array[i].keyword);
- strcat(debug_help_string, "\n");
- }
- }
-
- strcat(debug_help_string, "\n");
- strcat(debug_help_string, kernel_title);
-
- for (i = 0; i < num_kmod_keyword_mask_map; i++) {
- strcat(debug_help_string, "\t");
- strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
- strcat(debug_help_string, "\n");
- }
-
- rc = 0;
-
-out:
-
- return rc;
-
-}
-
-/*
- * kernel = type 0
- * client = type 1
- */
-void debug_mask_to_string(void *mask, int type)
-{
- int i;
- int len = 0;
- char *debug_string;
- int element_count = 0;
-
- gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
- if (type) {
- debug_string = client_debug_string;
- element_count = cdm_element_count;
- } else {
- debug_string = kernel_debug_string;
- element_count = num_kmod_keyword_mask_map;
- }
-
- memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
-
- /*
- * Some keywords, like "all" or "verbose", are amalgams of
- * numerous other keywords. Make a special check for those
- * before grinding through the whole mask only to find out
- * later...
- */
- if (check_amalgam_keyword(mask, type))
- goto out;
-
- /* Build the debug string. */
- for (i = 0; i < element_count; i++)
- if (type)
- do_c_string(mask, i);
- else
- do_k_string(mask, i);
-
- len = strlen(debug_string);
-
- if ((len) && (type))
- client_debug_string[len - 1] = '\0';
- else if (len)
- kernel_debug_string[len - 1] = '\0';
- else if (type)
- strcpy(client_debug_string, "none");
- else
- strcpy(kernel_debug_string, "none");
-
-out:
-gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string);
-
- return;
-
-}
-
-void do_k_string(void *k_mask, int index)
-{
- __u64 *mask = (__u64 *) k_mask;
-
- if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword))
- goto out;
-
- if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
- if ((strlen(kernel_debug_string) +
- strlen(s_kmod_keyword_mask_map[index].keyword))
- < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
- strcat(kernel_debug_string,
- s_kmod_keyword_mask_map[index].keyword);
- strcat(kernel_debug_string, ",");
- } else {
- gossip_err("%s: overflow!\n", __func__);
- strcpy(kernel_debug_string, ORANGEFS_ALL);
- goto out;
- }
- }
-
-out:
-
- return;
-}
-
-void do_c_string(void *c_mask, int index)
-{
- struct client_debug_mask *mask = (struct client_debug_mask *) c_mask;
-
- if (keyword_is_amalgam(cdm_array[index].keyword))
- goto out;
-
- if ((mask->mask1 & cdm_array[index].mask1) ||
- (mask->mask2 & cdm_array[index].mask2)) {
- if ((strlen(client_debug_string) +
- strlen(cdm_array[index].keyword) + 1)
- < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
- strcat(client_debug_string,
- cdm_array[index].keyword);
- strcat(client_debug_string, ",");
- } else {
- gossip_err("%s: overflow!\n", __func__);
- strcpy(client_debug_string, ORANGEFS_ALL);
- goto out;
- }
- }
-out:
- return;
-}
-
-int keyword_is_amalgam(char *keyword)
-{
- int rc = 0;
-
- if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE)))
- rc = 1;
-
- return rc;
-}
-
-/*
- * kernel = type 0
- * client = type 1
- *
- * return 1 if we found an amalgam.
- */
-int check_amalgam_keyword(void *mask, int type)
-{
- __u64 *k_mask;
- struct client_debug_mask *c_mask;
- int k_all_index = num_kmod_keyword_mask_map - 1;
- int rc = 0;
-
- if (type) {
- c_mask = (struct client_debug_mask *) mask;
-
- if ((c_mask->mask1 == cdm_array[client_all_index].mask1) &&
- (c_mask->mask2 == cdm_array[client_all_index].mask2)) {
- strcpy(client_debug_string, ORANGEFS_ALL);
- rc = 1;
- goto out;
- }
-
- if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) &&
- (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) {
- strcpy(client_debug_string, ORANGEFS_VERBOSE);
- rc = 1;
- goto out;
- }
-
- } else {
- k_mask = (__u64 *) mask;
-
- if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) {
- strcpy(kernel_debug_string, ORANGEFS_ALL);
- rc = 1;
- goto out;
- }
- }
-
-out:
-
- return rc;
-}
-
-/*
- * kernel = type 0
- * client = type 1
- */
-void debug_string_to_mask(char *debug_string, void *mask, int type)
-{
- char *unchecked_keyword;
- int i;
- char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL);
- char *original_pointer;
- int element_count = 0;
- struct client_debug_mask *c_mask;
- __u64 *k_mask;
-
- gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
- if (type) {
- c_mask = (struct client_debug_mask *)mask;
- element_count = cdm_element_count;
- } else {
- k_mask = (__u64 *)mask;
- *k_mask = 0;
- element_count = num_kmod_keyword_mask_map;
- }
-
- original_pointer = strsep_fodder;
- while ((unchecked_keyword = strsep(&strsep_fodder, ",")))
- if (strlen(unchecked_keyword)) {
- for (i = 0; i < element_count; i++)
- if (type)
- do_c_mask(i,
- unchecked_keyword,
- &c_mask);
- else
- do_k_mask(i,
- unchecked_keyword,
- &k_mask);
- }
-
- kfree(original_pointer);
-}
-
-void do_c_mask(int i,
- char *unchecked_keyword,
- struct client_debug_mask **sane_mask)
-{
-
- if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) {
- (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1;
- (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2;
- }
-}
-
-void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask)
-{
-
- if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword))
- **sane_mask = (**sane_mask) |
- s_kmod_keyword_mask_map[i].mask_val;
-}
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 3d7418c728f5..971307ad69be 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -4,26 +4,6 @@
#include <linux/slab.h>
#include <linux/ioctl.h>
-extern struct client_debug_mask *cdm_array;
-extern char *debug_help_string;
-extern int help_string_initialized;
-extern struct dentry *debug_dir;
-extern struct dentry *help_file_dentry;
-extern struct dentry *client_debug_dentry;
-extern const struct file_operations debug_help_fops;
-extern int client_all_index;
-extern int client_verbose_index;
-extern int cdm_element_count;
-#define DEBUG_HELP_STRING_SIZE 4096
-#define HELP_STRING_UNINITIALIZED \
- "Client Debug Keywords are unknown until the first time\n" \
- "the client is started after boot.\n"
-#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help"
-#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug"
-#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug"
-#define ORANGEFS_VERBOSE "verbose"
-#define ORANGEFS_ALL "all"
-
/* pvfs2-config.h ***********************************************************/
#define ORANGEFS_VERSION_MAJOR 2
#define ORANGEFS_VERSION_MINOR 9
@@ -426,13 +406,12 @@ do { \
printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
} while (0)
#else
-extern __u64 gossip_debug_mask;
-extern struct client_debug_mask client_debug_mask;
+extern __u64 orangefs_gossip_debug_mask;
/* try to avoid function call overhead by checking masks in macro */
#define gossip_debug(mask, fmt, ...) \
do { \
- if (gossip_debug_mask & (mask)) \
+ if (orangefs_gossip_debug_mask & (mask)) \
printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
} while (0)
#endif /* GOSSIP_DISABLE_DEBUG */
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index b9da9a0281c9..c48859f16e7b 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -33,6 +33,7 @@ static const match_table_t tokens = {
{ Opt_err, NULL }
};
+uint64_t orangefs_features;
static int parse_mount_options(struct super_block *sb, char *options,
int silent)
@@ -249,6 +250,19 @@ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb)
}
op_release(new_op);
+
+ if (orangefs_userspace_version >= 20906) {
+ new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES);
+ if (!new_op)
+ return -ENOMEM;
+ new_op->upcall.req.features.features = 0;
+ ret = service_operation(new_op, "orangefs_features", 0);
+ orangefs_features = new_op->downcall.resp.features.features;
+ op_release(new_op);
+ } else {
+ orangefs_features = 0;
+ }
+
return ret;
}
@@ -492,6 +506,19 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks);
spin_unlock(&orangefs_superblocks_lock);
op_release(new_op);
+
+ if (orangefs_userspace_version >= 20906) {
+ new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES);
+ if (!new_op)
+ return ERR_PTR(-ENOMEM);
+ new_op->upcall.req.features.features = 0;
+ ret = service_operation(new_op, "orangefs_features", 0);
+ orangefs_features = new_op->downcall.resp.features.features;
+ op_release(new_op);
+ } else {
+ orangefs_features = 0;
+ }
+
return dget(sb->s_root);
free_op:
@@ -530,8 +557,8 @@ void orangefs_kill_sb(struct super_block *sb)
* make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us
* gets completed before we free the dang thing.
*/
- mutex_lock(&request_mutex);
- mutex_unlock(&request_mutex);
+ mutex_lock(&orangefs_request_mutex);
+ mutex_unlock(&orangefs_request_mutex);
/* free the orangefs superblock private data */
kfree(ORANGEFS_SB(sb));
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
index 8fecf823f5ba..10b0b06e075e 100644
--- a/fs/orangefs/symlink.c
+++ b/fs/orangefs/symlink.c
@@ -14,6 +14,5 @@ const struct inode_operations orangefs_symlink_inode_operations = {
.setattr = orangefs_setattr,
.getattr = orangefs_getattr,
.listxattr = orangefs_listxattr,
- .setxattr = generic_setxattr,
.permission = orangefs_permission,
};
diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h
index 001b20239407..af0b0e36d559 100644
--- a/fs/orangefs/upcall.h
+++ b/fs/orangefs/upcall.h
@@ -98,7 +98,7 @@ struct orangefs_truncate_request_s {
__s64 size;
};
-struct orangefs_mmap_ra_cache_flush_request_s {
+struct orangefs_ra_cache_flush_request_s {
struct orangefs_object_kref refn;
};
@@ -179,12 +179,18 @@ enum orangefs_param_request_op {
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23,
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24,
ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25,
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE = 26,
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT = 27,
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE = 28,
};
struct orangefs_param_request_s {
enum orangefs_param_request_type type;
enum orangefs_param_request_op op;
- __s64 value;
+ union {
+ __s64 value64;
+ __s32 value32[2];
+ } u;
char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN];
};
@@ -204,6 +210,11 @@ struct orangefs_fs_key_request_s {
__s32 __pad1;
};
+/* 2.9.6 */
+struct orangefs_features_request_s {
+ __u64 features;
+};
+
struct orangefs_upcall_s {
__s32 type;
__u32 uid;
@@ -228,7 +239,7 @@ struct orangefs_upcall_s {
struct orangefs_rename_request_s rename;
struct orangefs_statfs_request_s statfs;
struct orangefs_truncate_request_s truncate;
- struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush;
+ struct orangefs_ra_cache_flush_request_s ra_cache_flush;
struct orangefs_fs_mount_request_s fs_mount;
struct orangefs_fs_umount_request_s fs_umount;
struct orangefs_getxattr_request_s getxattr;
@@ -240,6 +251,7 @@ struct orangefs_upcall_s {
struct orangefs_param_request_s param;
struct orangefs_perf_count_request_s perf_count;
struct orangefs_fs_key_request_s fs_key;
+ struct orangefs_features_request_s features;
} req;
};
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index 31635bc303fe..abcfa3fa9992 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -87,9 +87,9 @@ retry_servicing:
*/
if (!(flags & ORANGEFS_OP_NO_MUTEX)) {
if (flags & ORANGEFS_OP_INTERRUPTIBLE)
- ret = mutex_lock_interruptible(&request_mutex);
+ ret = mutex_lock_interruptible(&orangefs_request_mutex);
else
- ret = mutex_lock_killable(&request_mutex);
+ ret = mutex_lock_killable(&orangefs_request_mutex);
/*
* check to see if we were interrupted while waiting for
* mutex
@@ -129,7 +129,7 @@ retry_servicing:
spin_unlock(&orangefs_request_list_lock);
if (!(flags & ORANGEFS_OP_NO_MUTEX))
- mutex_unlock(&request_mutex);
+ mutex_unlock(&orangefs_request_mutex);
ret = wait_for_matching_downcall(op, timeout,
flags & ORANGEFS_OP_INTERRUPTIBLE);
@@ -272,9 +272,9 @@ static void
} else if (op_state_in_progress(op)) {
/* op must be removed from the in progress htable */
spin_unlock(&op->lock);
- spin_lock(&htable_ops_in_progress_lock);
+ spin_lock(&orangefs_htable_ops_in_progress_lock);
list_del_init(&op->list);
- spin_unlock(&htable_ops_in_progress_lock);
+ spin_unlock(&orangefs_htable_ops_in_progress_lock);
gossip_debug(GOSSIP_WAIT_DEBUG,
"Interrupted: Removed op %p"
" from htable_ops_in_progress\n",
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 2a9f07f06d10..74a81b1daaac 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -73,6 +73,9 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
"%s: name %s, buffer_size %zd\n",
__func__, name, size);
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
gossip_err("Invalid key length (%d)\n",
(int)strlen(name));
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 43fdc2765aea..aeb60f791418 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -57,9 +57,10 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
+ size_t slen;
- if (!old->d_inode->i_op->getxattr ||
- !new->d_inode->i_op->getxattr)
+ if (!(old->d_inode->i_opflags & IOP_XATTR) ||
+ !(new->d_inode->i_opflags & IOP_XATTR))
return 0;
list_size = vfs_listxattr(old, NULL, 0);
@@ -79,7 +80,16 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
goto out;
}
- for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
+ for (name = buf; list_size; name += slen) {
+ slen = strnlen(name, list_size) + 1;
+
+ /* underlying fs providing us with an broken xattr list? */
+ if (WARN_ON(slen > list_size)) {
+ error = -EIO;
+ break;
+ }
+ list_size -= slen;
+
if (ovl_is_private_xattr(name))
continue;
retry:
@@ -105,6 +115,13 @@ retry:
goto retry;
}
+ error = security_inode_copy_up_xattr(name);
+ if (error < 0 && error != -EOPNOTSUPP)
+ break;
+ if (error == 1) {
+ error = 0;
+ continue; /* Discard */
+ }
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
@@ -167,40 +184,6 @@ out_fput:
return error;
}
-static char *ovl_read_symlink(struct dentry *realdentry)
-{
- int res;
- char *buf;
- struct inode *inode = realdentry->d_inode;
- mm_segment_t old_fs;
-
- res = -EINVAL;
- if (!inode->i_op->readlink)
- goto err;
-
- res = -ENOMEM;
- buf = (char *) __get_free_page(GFP_KERNEL);
- if (!buf)
- goto err;
-
- old_fs = get_fs();
- set_fs(get_ds());
- /* The cast to a user pointer is valid due to the set_fs() */
- res = inode->i_op->readlink(realdentry,
- (char __user *)buf, PAGE_SIZE - 1);
- set_fs(old_fs);
- if (res < 0) {
- free_page((unsigned long) buf);
- goto err;
- }
- buf[res] = '\0';
-
- return buf;
-
-err:
- return ERR_PTR(res);
-}
-
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
@@ -248,6 +231,8 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *upper = NULL;
umode_t mode = stat->mode;
int err;
+ const struct cred *old_creds = NULL;
+ struct cred *new_creds = NULL;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
@@ -260,10 +245,23 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
if (IS_ERR(upper))
goto out1;
+ err = security_inode_copy_up(dentry, &new_creds);
+ if (err < 0)
+ goto out2;
+
+ if (new_creds)
+ old_creds = override_creds(new_creds);
+
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
+
+ if (new_creds) {
+ revert_creds(old_creds);
+ put_cred(new_creds);
+ }
+
if (err)
goto out2;
@@ -332,19 +330,20 @@ out_cleanup:
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat)
{
+ DEFINE_DELAYED_CALL(done);
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
+ struct dentry *lowerdentry = lowerpath->dentry;
struct dentry *upperdir;
struct dentry *upperdentry;
- const struct cred *old_cred;
- char *link = NULL;
+ const char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
- ovl_do_check_copy_up(lowerpath->dentry);
+ ovl_do_check_copy_up(lowerdentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
@@ -354,13 +353,11 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
return err;
if (S_ISLNK(stat->mode)) {
- link = ovl_read_symlink(lowerpath->dentry);
+ link = vfs_get_link(lowerdentry, &done);
if (IS_ERR(link))
return PTR_ERR(link);
}
- old_cred = ovl_override_creds(dentry->d_sb);
-
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
@@ -381,19 +378,16 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
}
out_unlock:
unlock_rename(workdir, upperdir);
- revert_creds(old_cred);
-
- if (link)
- free_page((unsigned long) link);
+ do_delayed_call(&done);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
- int err;
+ int err = 0;
+ const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
- err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
@@ -425,6 +419,7 @@ int ovl_copy_up(struct dentry *dentry)
dput(parent);
dput(next);
}
+ revert_creds(old_cred);
return err;
}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 1560fdc09a5f..306b6c161840 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -14,6 +14,7 @@
#include <linux/cred.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
+#include <linux/atomic.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
@@ -37,8 +38,10 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
+ static atomic_t temp_id = ATOMIC_INIT(0);
- snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
+ /* counter is allowed to wrap, since temp dentries are ephemeral */
+ snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id));
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
@@ -489,6 +492,15 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
if (override_cred) {
override_cred->fsuid = inode->i_uid;
override_cred->fsgid = inode->i_gid;
+ if (!hardlink) {
+ err = security_dentry_create_files_as(dentry,
+ stat->mode, &dentry->d_name, old_cred,
+ override_cred);
+ if (err) {
+ put_cred(override_cred);
+ goto out_revert_creds;
+ }
+ }
put_cred(override_creds(override_cred));
put_cred(override_cred);
@@ -499,6 +511,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
err = ovl_create_over_whiteout(dentry, inode, stat,
link, hardlink);
}
+out_revert_creds:
revert_creds(old_cred);
if (!err) {
struct inode *realinode = d_inode(ovl_dentry_upper(dentry));
@@ -996,17 +1009,14 @@ const struct inode_operations ovl_dir_inode_operations = {
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
- .rename2 = ovl_rename2,
+ .rename = ovl_rename2,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ovl_listxattr,
- .removexattr = generic_removexattr,
.get_acl = ovl_get_acl,
.update_time = ovl_update_time,
};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c75625c1efa3..c58f01babf30 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -19,6 +19,7 @@ static int ovl_copy_up_truncate(struct dentry *dentry)
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
+ const struct cred *old_cred;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
@@ -26,12 +27,14 @@ static int ovl_copy_up_truncate(struct dentry *dentry)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
- err = vfs_getattr(&lowerpath, &stat);
- if (err)
- goto out_dput_parent;
- stat.size = 0;
- err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = vfs_getattr(&lowerpath, &stat);
+ if (!err) {
+ stat.size = 0;
+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
+ }
+ revert_creds(old_cred);
out_dput_parent:
dput(parent);
@@ -53,7 +56,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
* inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
* check for a swapfile (which this won't be anyway).
*/
- err = inode_change_ok(dentry->d_inode, attr);
+ err = setattr_prepare(dentry, attr);
if (err)
return err;
@@ -153,45 +156,18 @@ static const char *ovl_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct dentry *realdentry;
- struct inode *realinode;
const struct cred *old_cred;
const char *p;
if (!dentry)
return ERR_PTR(-ECHILD);
- realdentry = ovl_dentry_real(dentry);
- realinode = realdentry->d_inode;
-
- if (WARN_ON(!realinode->i_op->get_link))
- return ERR_PTR(-EPERM);
-
old_cred = ovl_override_creds(dentry->d_sb);
- p = realinode->i_op->get_link(realdentry, realinode, done);
+ p = vfs_get_link(ovl_dentry_real(dentry), done);
revert_creds(old_cred);
return p;
}
-static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
-{
- struct path realpath;
- struct inode *realinode;
- const struct cred *old_cred;
- int err;
-
- ovl_path_real(dentry, &realpath);
- realinode = realpath.dentry->d_inode;
-
- if (!realinode->i_op->readlink)
- return -EINVAL;
-
- old_cred = ovl_override_creds(dentry->d_sb);
- err = realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
- revert_creds(old_cred);
- return err;
-}
-
bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PREFIX,
@@ -367,10 +343,7 @@ static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ovl_listxattr,
- .removexattr = generic_removexattr,
.get_acl = ovl_get_acl,
.update_time = ovl_update_time,
};
@@ -378,12 +351,9 @@ static const struct inode_operations ovl_file_inode_operations = {
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.get_link = ovl_get_link,
- .readlink = ovl_readlink,
+ .readlink = generic_readlink,
.getattr = ovl_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ovl_listxattr,
- .removexattr = generic_removexattr,
.update_time = ovl_update_time,
};
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 5813ccff8cd9..e218e741cb99 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -114,13 +114,13 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
{
int err;
- pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
+ pr_debug("rename(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
- pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
+ pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e2a94a26767b..bcf3965be819 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -273,12 +273,11 @@ static bool ovl_is_opaquedir(struct dentry *dentry)
{
int res;
char val;
- struct inode *inode = dentry->d_inode;
- if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
+ if (!d_is_dir(dentry))
return false;
- res = inode->i_op->getxattr(dentry, inode, OVL_XATTR_OPAQUE, &val, 1);
+ res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1);
if (res == 1 && val == 'y')
return true;
@@ -419,16 +418,12 @@ static bool ovl_dentry_weird(struct dentry *dentry)
DCACHE_OP_COMPARE);
}
-static inline struct dentry *ovl_lookup_real(struct super_block *ovl_sb,
- struct dentry *dir,
+static inline struct dentry *ovl_lookup_real(struct dentry *dir,
const struct qstr *name)
{
- const struct cred *old_cred;
struct dentry *dentry;
- old_cred = ovl_override_creds(ovl_sb);
dentry = lookup_one_len_unlocked(name->name, dir, name->len);
- revert_creds(old_cred);
if (IS_ERR(dentry)) {
if (PTR_ERR(dentry) == -ENOENT)
@@ -469,6 +464,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct ovl_entry *oe;
+ const struct cred *old_cred;
struct ovl_entry *poe = dentry->d_parent->d_fsdata;
struct path *stack = NULL;
struct dentry *upperdir, *upperdentry = NULL;
@@ -479,9 +475,10 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int i;
int err;
+ old_cred = ovl_override_creds(dentry->d_sb);
upperdir = ovl_upperdentry_dereference(poe);
if (upperdir) {
- this = ovl_lookup_real(dentry->d_sb, upperdir, &dentry->d_name);
+ this = ovl_lookup_real(upperdir, &dentry->d_name);
err = PTR_ERR(this);
if (IS_ERR(this))
goto out;
@@ -514,8 +511,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
bool opaque = false;
struct path lowerpath = poe->lowerstack[i];
- this = ovl_lookup_real(dentry->d_sb,
- lowerpath.dentry, &dentry->d_name);
+ this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
err = PTR_ERR(this);
if (IS_ERR(this)) {
/*
@@ -588,6 +584,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
ovl_copyattr(realdentry->d_inode, inode);
}
+ revert_creds(old_cred);
oe->opaque = upperopaque;
oe->__upperdentry = upperdentry;
memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
@@ -606,6 +603,7 @@ out_put:
out_put_upper:
dput(upperdentry);
out:
+ revert_creds(old_cred);
return ERR_PTR(err);
}
@@ -834,6 +832,19 @@ retry:
if (err)
goto out_dput;
+ /*
+ * Try to remove POSIX ACL xattrs from workdir. We are good if:
+ *
+ * a) success (there was a POSIX ACL xattr and was removed)
+ * b) -ENODATA (there was no POSIX ACL xattr)
+ * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported)
+ *
+ * There are various other error values that could effectively
+ * mean that the xattr doesn't exist (e.g. -ERANGE is returned
+ * if the xattr name is too long), but the set of filesystems
+ * allowed as upper are limited to "normal" ones, where checking
+ * for the above two errors is sufficient.
+ */
err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
@@ -1292,6 +1303,12 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!oe)
goto out_put_cred;
+ sb->s_magic = OVERLAYFS_SUPER_MAGIC;
+ sb->s_op = &ovl_super_operations;
+ sb->s_xattr = ovl_xattr_handlers;
+ sb->s_fs_info = ufs;
+ sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK;
+
root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR));
if (!root_dentry)
goto out_free_oe;
@@ -1315,12 +1332,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
ovl_inode_init(d_inode(root_dentry), realinode, !!upperpath.dentry);
ovl_copyattr(realinode, d_inode(root_dentry));
- sb->s_magic = OVERLAYFS_SUPER_MAGIC;
- sb->s_op = &ovl_super_operations;
- sb->s_xattr = ovl_xattr_handlers;
sb->s_root = root_dentry;
- sb->s_fs_info = ufs;
- sb->s_flags |= MS_POSIXACL;
return 0;
diff --git a/fs/pipe.c b/fs/pipe.c
index 4ebe6b2e5217..8e0d9f26dfad 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -267,7 +267,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (bufs) {
int curbuf = pipe->curbuf;
struct pipe_buffer *buf = pipe->bufs + curbuf;
- const struct pipe_buf_operations *ops = buf->ops;
size_t chars = buf->len;
size_t written;
int error;
@@ -275,7 +274,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (chars > total_len)
chars = total_len;
- error = ops->confirm(pipe, buf);
+ error = pipe_buf_confirm(pipe, buf);
if (error) {
if (!ret)
ret = error;
@@ -299,8 +298,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
}
if (!buf->len) {
- buf->ops = NULL;
- ops->release(pipe, buf);
+ pipe_buf_release(pipe, buf);
curbuf = (curbuf + 1) & (pipe->buffers - 1);
pipe->curbuf = curbuf;
pipe->nrbufs = --bufs;
@@ -383,11 +381,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
(pipe->buffers - 1);
struct pipe_buffer *buf = pipe->bufs + lastbuf;
- const struct pipe_buf_operations *ops = buf->ops;
int offset = buf->offset + buf->len;
- if (ops->can_merge && offset + chars <= PAGE_SIZE) {
- ret = ops->confirm(pipe, buf);
+ if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) {
+ ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;
@@ -604,54 +601,63 @@ pipe_fasync(int fd, struct file *filp, int on)
return retval;
}
-static void account_pipe_buffers(struct pipe_inode_info *pipe,
+static unsigned long account_pipe_buffers(struct user_struct *user,
unsigned long old, unsigned long new)
{
- atomic_long_add(new - old, &pipe->user->pipe_bufs);
+ return atomic_long_add_return(new - old, &user->pipe_bufs);
}
-static bool too_many_pipe_buffers_soft(struct user_struct *user)
+static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
- return pipe_user_pages_soft &&
- atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft;
+ return pipe_user_pages_soft && user_bufs >= pipe_user_pages_soft;
}
-static bool too_many_pipe_buffers_hard(struct user_struct *user)
+static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
- return pipe_user_pages_hard &&
- atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
+ return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard;
}
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
+ unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
+ struct user_struct *user = get_current_user();
+ unsigned long user_bufs;
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
- if (pipe) {
- unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
- struct user_struct *user = get_current_user();
-
- if (!too_many_pipe_buffers_hard(user)) {
- if (too_many_pipe_buffers_soft(user))
- pipe_bufs = 1;
- pipe->bufs = kcalloc(pipe_bufs,
- sizeof(struct pipe_buffer),
- GFP_KERNEL_ACCOUNT);
- }
+ if (pipe == NULL)
+ goto out_free_uid;
- if (pipe->bufs) {
- init_waitqueue_head(&pipe->wait);
- pipe->r_counter = pipe->w_counter = 1;
- pipe->buffers = pipe_bufs;
- pipe->user = user;
- account_pipe_buffers(pipe, 0, pipe_bufs);
- mutex_init(&pipe->mutex);
- return pipe;
- }
- free_uid(user);
- kfree(pipe);
+ if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE))
+ pipe_bufs = pipe_max_size >> PAGE_SHIFT;
+
+ user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
+
+ if (too_many_pipe_buffers_soft(user_bufs)) {
+ user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
+ pipe_bufs = 1;
+ }
+
+ if (too_many_pipe_buffers_hard(user_bufs))
+ goto out_revert_acct;
+
+ pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
+ GFP_KERNEL_ACCOUNT);
+
+ if (pipe->bufs) {
+ init_waitqueue_head(&pipe->wait);
+ pipe->r_counter = pipe->w_counter = 1;
+ pipe->buffers = pipe_bufs;
+ pipe->user = user;
+ mutex_init(&pipe->mutex);
+ return pipe;
}
+out_revert_acct:
+ (void) account_pipe_buffers(user, pipe_bufs, 0);
+ kfree(pipe);
+out_free_uid:
+ free_uid(user);
return NULL;
}
@@ -659,12 +665,12 @@ void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
- account_pipe_buffers(pipe, pipe->buffers, 0);
+ (void) account_pipe_buffers(pipe->user, pipe->buffers, 0);
free_uid(pipe->user);
for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
- buf->ops->release(pipe, buf);
+ pipe_buf_release(pipe, buf);
}
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
@@ -716,7 +722,7 @@ static struct inode * get_pipe_inode(void)
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
return inode;
@@ -1011,12 +1017,54 @@ const struct file_operations pipefifo_fops = {
};
/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+ unsigned long nr_pages;
+
+ nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
* Allocate a new array of pipe buffers and copy the info over. Returns the
* pipe size if successful, or return -ERROR on error.
*/
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
struct pipe_buffer *bufs;
+ unsigned int size, nr_pages;
+ unsigned long user_bufs;
+ long ret = 0;
+
+ size = round_pipe_size(arg);
+ nr_pages = size >> PAGE_SHIFT;
+
+ if (!nr_pages)
+ return -EINVAL;
+
+ /*
+ * If trying to increase the pipe capacity, check that an
+ * unprivileged user is not trying to exceed various limits
+ * (soft limit check here, hard limit check just below).
+ * Decreasing the pipe capacity is always permitted, even
+ * if the user is currently over a limit.
+ */
+ if (nr_pages > pipe->buffers &&
+ size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
+
+ if (nr_pages > pipe->buffers &&
+ (too_many_pipe_buffers_hard(user_bufs) ||
+ too_many_pipe_buffers_soft(user_bufs)) &&
+ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_revert_acct;
+ }
/*
* We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
@@ -1024,13 +1072,17 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
* again like we would do for growing. If the pipe currently
* contains more buffers than arg, then return busy.
*/
- if (nr_pages < pipe->nrbufs)
- return -EBUSY;
+ if (nr_pages < pipe->nrbufs) {
+ ret = -EBUSY;
+ goto out_revert_acct;
+ }
bufs = kcalloc(nr_pages, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (unlikely(!bufs))
- return -ENOMEM;
+ if (unlikely(!bufs)) {
+ ret = -ENOMEM;
+ goto out_revert_acct;
+ }
/*
* The pipe array wraps around, so just start the new one at zero
@@ -1053,24 +1105,15 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
}
- account_pipe_buffers(pipe, pipe->buffers, nr_pages);
pipe->curbuf = 0;
kfree(pipe->bufs);
pipe->bufs = bufs;
pipe->buffers = nr_pages;
return nr_pages * PAGE_SIZE;
-}
-
-/*
- * Currently we rely on the pipe array holding a power-of-2 number
- * of pages.
- */
-static inline unsigned int round_pipe_size(unsigned int size)
-{
- unsigned long nr_pages;
- nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+out_revert_acct:
+ (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
+ return ret;
}
/*
@@ -1112,28 +1155,9 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
__pipe_lock(pipe);
switch (cmd) {
- case F_SETPIPE_SZ: {
- unsigned int size, nr_pages;
-
- size = round_pipe_size(arg);
- nr_pages = size >> PAGE_SHIFT;
-
- ret = -EINVAL;
- if (!nr_pages)
- goto out;
-
- if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
- ret = -EPERM;
- goto out;
- } else if ((too_many_pipe_buffers_hard(pipe->user) ||
- too_many_pipe_buffers_soft(pipe->user)) &&
- !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
- ret = -EPERM;
- goto out;
- }
- ret = pipe_set_size(pipe, nr_pages);
+ case F_SETPIPE_SZ:
+ ret = pipe_set_size(pipe, arg);
break;
- }
case F_GETPIPE_SZ:
ret = pipe->buffers * PAGE_SIZE;
break;
@@ -1142,7 +1166,6 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
break;
}
-out:
__pipe_unlock(pipe);
return ret;
}
diff --git a/fs/pnode.c b/fs/pnode.c
index 99899705b105..234a9ac49958 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -259,7 +259,7 @@ static int propagate_one(struct mount *m)
read_sequnlock_excl(&mount_lock);
}
hlist_add_head(&child->mnt_hash, list);
- return 0;
+ return count_mounts(m->mnt_ns, child);
}
/*
diff --git a/fs/pnode.h b/fs/pnode.h
index 0fcdbe7ca648..550f5a8b4fcf 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -52,4 +52,5 @@ void mnt_set_mountpoint(struct mount *, struct mountpoint *,
struct mount *copy_tree(struct mount *, struct dentry *, int);
bool is_path_reachable(struct mount *, struct dentry *,
const struct path *root);
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt);
#endif /* _LINUX_PNODE_H */
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 59d47ab0791a..595522022aca 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -598,13 +598,14 @@ posix_acl_create(struct inode *dir, umode_t *mode,
if (IS_ERR(p))
return PTR_ERR(p);
+ ret = -ENOMEM;
clone = posix_acl_clone(p, GFP_NOFS);
if (!clone)
- goto no_mem;
+ goto err_release;
ret = posix_acl_create_masq(clone, mode);
if (ret < 0)
- goto no_mem_clone;
+ goto err_release_clone;
if (ret == 0)
posix_acl_release(clone);
@@ -618,14 +619,45 @@ posix_acl_create(struct inode *dir, umode_t *mode,
return 0;
-no_mem_clone:
+err_release_clone:
posix_acl_release(clone);
-no_mem:
+err_release:
posix_acl_release(p);
- return -ENOMEM;
+ return ret;
}
EXPORT_SYMBOL_GPL(posix_acl_create);
+/**
+ * posix_acl_update_mode - update mode in set_acl
+ *
+ * Update the file mode when setting an ACL: compute the new file permission
+ * bits based on the ACL. In addition, if the ACL is equivalent to the new
+ * file mode, set *acl to NULL to indicate that no ACL should be set.
+ *
+ * As with chmod, clear the setgit bit if the caller is not in the owning group
+ * or capable of CAP_FSETID (see inode_change_ok).
+ *
+ * Called from set_acl inode operations.
+ */
+int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
+ struct posix_acl **acl)
+{
+ umode_t mode = inode->i_mode;
+ int error;
+
+ error = posix_acl_equiv_mode(*acl, &mode);
+ if (error < 0)
+ return error;
+ if (error == 0)
+ *acl = NULL;
+ if (!in_group_p(inode->i_gid) &&
+ !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+ mode &= ~S_ISGID;
+ *mode_p = mode;
+ return 0;
+}
+EXPORT_SYMBOL(posix_acl_update_mode);
+
/*
* Fix up the uids and gids in posix acl extended attributes in place.
*/
@@ -633,15 +665,15 @@ static void posix_acl_fix_xattr_userns(
struct user_namespace *to, struct user_namespace *from,
void *value, size_t size)
{
- posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
- posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
+ struct posix_acl_xattr_header *header = value;
+ struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
int count;
kuid_t uid;
kgid_t gid;
if (!value)
return;
- if (size < sizeof(posix_acl_xattr_header))
+ if (size < sizeof(struct posix_acl_xattr_header))
return;
if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
return;
@@ -691,15 +723,15 @@ struct posix_acl *
posix_acl_from_xattr(struct user_namespace *user_ns,
const void *value, size_t size)
{
- posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
- posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
+ const struct posix_acl_xattr_header *header = value;
+ const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end;
int count;
struct posix_acl *acl;
struct posix_acl_entry *acl_e;
if (!value)
return NULL;
- if (size < sizeof(posix_acl_xattr_header))
+ if (size < sizeof(struct posix_acl_xattr_header))
return ERR_PTR(-EINVAL);
if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
return ERR_PTR(-EOPNOTSUPP);
@@ -760,8 +792,8 @@ int
posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
void *buffer, size_t size)
{
- posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
- posix_acl_xattr_entry *ext_entry;
+ struct posix_acl_xattr_header *ext_acl = buffer;
+ struct posix_acl_xattr_entry *ext_entry;
int real_size, n;
real_size = posix_acl_xattr_size(acl->a_count);
@@ -770,7 +802,7 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
if (real_size > size)
return -ERANGE;
- ext_entry = ext_acl->a_entries;
+ ext_entry = (void *)(ext_acl + 1);
ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
for (n=0; n < acl->a_count; n++, ext_entry++) {
@@ -897,7 +929,7 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
acl = NULL;
}
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_time(inode);
set_cached_acl(inode, type, acl);
return 0;
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 88c7de12197b..81818adb8e9e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -186,51 +186,45 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
task_unlock(p);
rcu_read_unlock();
- seq_printf(m,
- "State:\t%s\n"
- "Tgid:\t%d\n"
- "Ngid:\t%d\n"
- "Pid:\t%d\n"
- "PPid:\t%d\n"
- "TracerPid:\t%d\n"
- "Uid:\t%d\t%d\t%d\t%d\n"
- "Gid:\t%d\t%d\t%d\t%d\n"
- "FDSize:\t%d\nGroups:\t",
- get_task_state(p),
- tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid,
- from_kuid_munged(user_ns, cred->uid),
- from_kuid_munged(user_ns, cred->euid),
- from_kuid_munged(user_ns, cred->suid),
- from_kuid_munged(user_ns, cred->fsuid),
- from_kgid_munged(user_ns, cred->gid),
- from_kgid_munged(user_ns, cred->egid),
- from_kgid_munged(user_ns, cred->sgid),
- from_kgid_munged(user_ns, cred->fsgid),
- max_fds);
-
+ seq_printf(m, "State:\t%s", get_task_state(p));
+
+ seq_put_decimal_ull(m, "\nTgid:\t", tgid);
+ seq_put_decimal_ull(m, "\nNgid:\t", ngid);
+ seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns));
+ seq_put_decimal_ull(m, "\nPPid:\t", ppid);
+ seq_put_decimal_ull(m, "\nTracerPid:\t", tpid);
+ seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid));
+ seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid));
+ seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid));
+ seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid));
+ seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid));
+ seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid));
+ seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid));
+ seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid));
+ seq_put_decimal_ull(m, "\nFDSize:\t", max_fds);
+
+ seq_puts(m, "\nGroups:\t");
group_info = cred->group_info;
for (g = 0; g < group_info->ngroups; g++)
- seq_printf(m, "%d ",
- from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
+ seq_put_decimal_ull(m, g ? " " : "",
+ from_kgid_munged(user_ns, group_info->gid[g]));
put_cred(cred);
+ /* Trailing space shouldn't have been added in the first place. */
+ seq_putc(m, ' ');
#ifdef CONFIG_PID_NS
seq_puts(m, "\nNStgid:");
for (g = ns->level; g <= pid->level; g++)
- seq_printf(m, "\t%d",
- task_tgid_nr_ns(p, pid->numbers[g].ns));
+ seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns));
seq_puts(m, "\nNSpid:");
for (g = ns->level; g <= pid->level; g++)
- seq_printf(m, "\t%d",
- task_pid_nr_ns(p, pid->numbers[g].ns));
+ seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns));
seq_puts(m, "\nNSpgid:");
for (g = ns->level; g <= pid->level; g++)
- seq_printf(m, "\t%d",
- task_pgrp_nr_ns(p, pid->numbers[g].ns));
+ seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns));
seq_puts(m, "\nNSsid:");
for (g = ns->level; g <= pid->level; g++)
- seq_printf(m, "\t%d",
- task_session_nr_ns(p, pid->numbers[g].ns));
+ seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
#endif
seq_putc(m, '\n');
}
@@ -299,11 +293,12 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
unlock_task_sighand(p, &flags);
}
- seq_printf(m, "Threads:\t%d\n", num_threads);
- seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
+ seq_put_decimal_ull(m, "Threads:\t", num_threads);
+ seq_put_decimal_ull(m, "\nSigQ:\t", qsize);
+ seq_put_decimal_ull(m, "/", qlim);
/* render them all */
- render_sigset_t(m, "SigPnd:\t", &pending);
+ render_sigset_t(m, "\nSigPnd:\t", &pending);
render_sigset_t(m, "ShdPnd:\t", &shpending);
render_sigset_t(m, "SigBlk:\t", &blocked);
render_sigset_t(m, "SigIgn:\t", &ignored);
@@ -348,17 +343,17 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
- seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
+ seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode);
+ seq_putc(m, '\n');
#endif
}
static inline void task_context_switch_counts(struct seq_file *m,
struct task_struct *p)
{
- seq_printf(m, "voluntary_ctxt_switches:\t%lu\n"
- "nonvoluntary_ctxt_switches:\t%lu\n",
- p->nvcsw,
- p->nivcsw);
+ seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", p->nvcsw);
+ seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", p->nivcsw);
+ seq_putc(m, '\n');
}
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
@@ -417,10 +412,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
mm = get_task_mm(task);
if (mm) {
vsize = task_vsize(mm);
- if (permitted) {
- eip = KSTK_EIP(task);
- esp = KSTK_ESP(task);
- }
+ /*
+ * esp and eip are intentionally zeroed out. There is no
+ * non-racy way to read them without freezing the task.
+ * Programs that need reliable values can use ptrace(2).
+ */
}
get_task_comm(tcomm, task);
@@ -490,41 +486,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
start_time = nsec_to_clock_t(task->real_start_time);
seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
- seq_put_decimal_ll(m, ' ', ppid);
- seq_put_decimal_ll(m, ' ', pgid);
- seq_put_decimal_ll(m, ' ', sid);
- seq_put_decimal_ll(m, ' ', tty_nr);
- seq_put_decimal_ll(m, ' ', tty_pgrp);
- seq_put_decimal_ull(m, ' ', task->flags);
- seq_put_decimal_ull(m, ' ', min_flt);
- seq_put_decimal_ull(m, ' ', cmin_flt);
- seq_put_decimal_ull(m, ' ', maj_flt);
- seq_put_decimal_ull(m, ' ', cmaj_flt);
- seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
- seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
- seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
- seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
- seq_put_decimal_ll(m, ' ', priority);
- seq_put_decimal_ll(m, ' ', nice);
- seq_put_decimal_ll(m, ' ', num_threads);
- seq_put_decimal_ull(m, ' ', 0);
- seq_put_decimal_ull(m, ' ', start_time);
- seq_put_decimal_ull(m, ' ', vsize);
- seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0);
- seq_put_decimal_ull(m, ' ', rsslim);
- seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
- seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
- seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
- seq_put_decimal_ull(m, ' ', esp);
- seq_put_decimal_ull(m, ' ', eip);
+ seq_put_decimal_ll(m, " ", ppid);
+ seq_put_decimal_ll(m, " ", pgid);
+ seq_put_decimal_ll(m, " ", sid);
+ seq_put_decimal_ll(m, " ", tty_nr);
+ seq_put_decimal_ll(m, " ", tty_pgrp);
+ seq_put_decimal_ull(m, " ", task->flags);
+ seq_put_decimal_ull(m, " ", min_flt);
+ seq_put_decimal_ull(m, " ", cmin_flt);
+ seq_put_decimal_ull(m, " ", maj_flt);
+ seq_put_decimal_ull(m, " ", cmaj_flt);
+ seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime));
+ seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime));
+ seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime));
+ seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime));
+ seq_put_decimal_ll(m, " ", priority);
+ seq_put_decimal_ll(m, " ", nice);
+ seq_put_decimal_ll(m, " ", num_threads);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", start_time);
+ seq_put_decimal_ull(m, " ", vsize);
+ seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0);
+ seq_put_decimal_ull(m, " ", rsslim);
+ seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0);
+ seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0);
+ seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0);
+ seq_put_decimal_ull(m, " ", esp);
+ seq_put_decimal_ull(m, " ", eip);
/* The signal information here is obsolete.
* It must be decimal for Linux 2.0 compatibility.
* Use /proc/#/status for real-time signals.
*/
- seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL);
/*
* We used to output the absolute kernel address, but that's an
@@ -538,31 +534,31 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
else
seq_puts(m, " 0");
- seq_put_decimal_ull(m, ' ', 0);
- seq_put_decimal_ull(m, ' ', 0);
- seq_put_decimal_ll(m, ' ', task->exit_signal);
- seq_put_decimal_ll(m, ' ', task_cpu(task));
- seq_put_decimal_ull(m, ' ', task->rt_priority);
- seq_put_decimal_ull(m, ' ', task->policy);
- seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
- seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
- seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ll(m, " ", task->exit_signal);
+ seq_put_decimal_ll(m, " ", task_cpu(task));
+ seq_put_decimal_ull(m, " ", task->rt_priority);
+ seq_put_decimal_ull(m, " ", task->policy);
+ seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task));
+ seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime));
+ seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime));
if (mm && permitted) {
- seq_put_decimal_ull(m, ' ', mm->start_data);
- seq_put_decimal_ull(m, ' ', mm->end_data);
- seq_put_decimal_ull(m, ' ', mm->start_brk);
- seq_put_decimal_ull(m, ' ', mm->arg_start);
- seq_put_decimal_ull(m, ' ', mm->arg_end);
- seq_put_decimal_ull(m, ' ', mm->env_start);
- seq_put_decimal_ull(m, ' ', mm->env_end);
+ seq_put_decimal_ull(m, " ", mm->start_data);
+ seq_put_decimal_ull(m, " ", mm->end_data);
+ seq_put_decimal_ull(m, " ", mm->start_brk);
+ seq_put_decimal_ull(m, " ", mm->arg_start);
+ seq_put_decimal_ull(m, " ", mm->arg_end);
+ seq_put_decimal_ull(m, " ", mm->env_start);
+ seq_put_decimal_ull(m, " ", mm->env_end);
} else
- seq_printf(m, " 0 0 0 0 0 0 0");
+ seq_puts(m, " 0 0 0 0 0 0 0");
if (permitted)
- seq_put_decimal_ll(m, ' ', task->exit_code);
+ seq_put_decimal_ll(m, " ", task->exit_code);
else
- seq_put_decimal_ll(m, ' ', 0);
+ seq_puts(m, " 0");
seq_putc(m, '\n');
if (mm)
@@ -598,13 +594,13 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
* seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
* size, resident, shared, text, data);
*/
- seq_put_decimal_ull(m, 0, size);
- seq_put_decimal_ull(m, ' ', resident);
- seq_put_decimal_ull(m, ' ', shared);
- seq_put_decimal_ull(m, ' ', text);
- seq_put_decimal_ull(m, ' ', 0);
- seq_put_decimal_ull(m, ' ', data);
- seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, "", size);
+ seq_put_decimal_ull(m, " ", resident);
+ seq_put_decimal_ull(m, " ", shared);
+ seq_put_decimal_ull(m, " ", text);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", data);
+ seq_put_decimal_ull(m, " ", 0);
seq_putc(m, '\n');
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ac0df4dde823..ca651ac00660 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -400,23 +400,6 @@ static const struct file_operations proc_pid_cmdline_ops = {
.llseek = generic_file_llseek,
};
-static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task)
-{
- struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
- if (mm && !IS_ERR(mm)) {
- unsigned int nwords = 0;
- do {
- nwords += 2;
- } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
- seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0]));
- mmput(mm);
- return 0;
- } else
- return PTR_ERR(mm);
-}
-
-
#ifdef CONFIG_KALLSYMS
/*
* Provides a wchan file via kallsyms in a proper one-value-per-file format.
@@ -483,7 +466,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
save_stack_trace_tsk(task, &trace);
for (i = 0; i < trace.nr_entries; i++) {
- seq_printf(m, "[<%pK>] %pS\n",
+ seq_printf(m, "[<%pK>] %pB\n",
(void *)entries[i], (void *)entries[i]);
}
unlock_trace(task);
@@ -709,7 +692,7 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_MODE)
return -EPERM;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
@@ -849,6 +832,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
unsigned long addr = *ppos;
ssize_t copied;
char *page;
+ unsigned int flags;
if (!mm)
return 0;
@@ -861,6 +845,11 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
if (!atomic_inc_not_zero(&mm->mm_users))
goto free;
+ /* Maybe we should limit FOLL_FORCE to actual ptrace users? */
+ flags = FOLL_FORCE;
+ if (write)
+ flags |= FOLL_WRITE;
+
while (count > 0) {
int this_len = min_t(int, count, PAGE_SIZE);
@@ -869,7 +858,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
break;
}
- this_len = access_remote_vm(mm, addr, page, this_len, write);
+ this_len = access_remote_vm(mm, addr, page, this_len, flags);
if (!this_len) {
if (!copied)
copied = -EIO;
@@ -981,8 +970,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
max_len = min_t(size_t, PAGE_SIZE, count);
this_len = min(max_len, this_len);
- retval = access_remote_vm(mm, (env_start + src),
- page, this_len, 0);
+ retval = access_remote_vm(mm, (env_start + src), page, this_len, 0);
if (retval <= 0) {
ret = retval;
@@ -1014,6 +1002,33 @@ static const struct file_operations proc_environ_operations = {
.release = mem_release,
};
+static int auxv_open(struct inode *inode, struct file *file)
+{
+ return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
+}
+
+static ssize_t auxv_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct mm_struct *mm = file->private_data;
+ unsigned int nwords = 0;
+
+ if (!mm)
+ return 0;
+ do {
+ nwords += 2;
+ } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
+ return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
+ nwords * sizeof(mm->saved_auxv[0]));
+}
+
+static const struct file_operations proc_auxv_operations = {
+ .open = auxv_open,
+ .read = auxv_read,
+ .llseek = generic_file_llseek,
+ .release = mem_release,
+};
+
static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
loff_t *ppos)
{
@@ -1664,7 +1679,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *t
/* Common stuff */
ei = PROC_I(inode);
inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_op = &proc_def_inode_operations;
/*
@@ -2280,16 +2295,27 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
if (!p)
return -ESRCH;
- if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
- task_lock(p);
- if (slack_ns == 0)
- p->timer_slack_ns = p->default_timer_slack_ns;
- else
- p->timer_slack_ns = slack_ns;
- task_unlock(p);
- } else
- count = -EPERM;
+ if (p != current) {
+ if (!capable(CAP_SYS_NICE)) {
+ count = -EPERM;
+ goto out;
+ }
+
+ err = security_task_setscheduler(p);
+ if (err) {
+ count = err;
+ goto out;
+ }
+ }
+
+ task_lock(p);
+ if (slack_ns == 0)
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ else
+ p->timer_slack_ns = slack_ns;
+ task_unlock(p);
+out:
put_task_struct(p);
return count;
@@ -2299,19 +2325,28 @@ static int timerslack_ns_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
struct task_struct *p;
- int err = 0;
+ int err = 0;
p = get_proc_task(inode);
if (!p)
return -ESRCH;
- if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
- task_lock(p);
- seq_printf(m, "%llu\n", p->timer_slack_ns);
- task_unlock(p);
- } else
- err = -EPERM;
+ if (p != current) {
+ if (!capable(CAP_SYS_NICE)) {
+ err = -EPERM;
+ goto out;
+ }
+ err = security_task_getscheduler(p);
+ if (err)
+ goto out;
+ }
+
+ task_lock(p);
+ seq_printf(m, "%llu\n", p->timer_slack_ns);
+ task_unlock(p);
+
+out:
put_task_struct(p);
return err;
@@ -2822,7 +2857,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
REG("environ", S_IRUSR, proc_environ_operations),
- ONE("auxv", S_IRUSR, proc_pid_auxv),
+ REG("auxv", S_IRUSR, proc_auxv_operations),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
@@ -3210,7 +3245,7 @@ static const struct pid_entry tid_base_stuff[] = {
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
REG("environ", S_IRUSR, proc_environ_operations),
- ONE("auxv", S_IRUSR, proc_pid_auxv),
+ REG("auxv", S_IRUSR, proc_auxv_operations),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 01df23cc81f6..d21dafef3102 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -31,7 +31,7 @@ static int seq_show(struct seq_file *m, void *v)
put_task_struct(task);
if (files) {
- int fd = proc_fd(m->private);
+ unsigned int fd = proc_fd(m->private);
spin_lock(&files->file_lock);
file = fcheck_files(files, fd);
@@ -86,7 +86,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
struct task_struct *task;
const struct cred *cred;
struct inode *inode;
- int fd;
+ unsigned int fd;
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -158,7 +158,7 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
}
if (files) {
- int fd = proc_fd(d_inode(dentry));
+ unsigned int fd = proc_fd(d_inode(dentry));
struct file *fd_file;
spin_lock(&files->file_lock);
@@ -253,7 +253,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
continue;
rcu_read_unlock();
- len = snprintf(name, sizeof(name), "%d", fd);
+ len = snprintf(name, sizeof(name), "%u", fd);
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
(void *)(unsigned long)fd))
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
index 7c047f256ae2..46dafadd0083 100644
--- a/fs/proc/fd.h
+++ b/fs/proc/fd.h
@@ -11,7 +11,7 @@ extern const struct inode_operations proc_fdinfo_inode_operations;
extern int proc_fd_permission(struct inode *inode, int mask);
-static inline int proc_fd(struct inode *inode)
+static inline unsigned int proc_fd(struct inode *inode)
{
return PROC_I(inode)->fd;
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index c633476616e0..5f2dc2032c79 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -105,7 +105,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
struct proc_dir_entry *de = PDE(inode);
int error;
- error = inode_change_ok(inode, iattr);
+ error = setattr_prepare(dentry, iattr);
if (error)
return error;
@@ -390,6 +390,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
atomic_set(&ent->count, 1);
spin_lock_init(&ent->pde_unload_lock);
INIT_LIST_HEAD(&ent->pde_openers);
+ proc_set_user(ent, (*parent)->uid, (*parent)->gid);
+
out:
return ent;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index c1b72388e571..e69ebe648a34 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -68,7 +68,6 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->sysctl_entry = NULL;
ei->ns_ops = NULL;
inode = &ei->vfs_inode;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
return inode;
}
@@ -421,7 +420,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
if (inode) {
inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
PROC_I(inode)->pde = de;
if (is_empty_pde(de)) {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7931c558c192..5378441ec1b7 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -60,7 +60,7 @@ union proc_op {
struct proc_inode {
struct pid *pid;
- int fd;
+ unsigned int fd;
union proc_op op;
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b9a8c813e5e6..8a428498d6b2 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -23,6 +23,25 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
{
}
+static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
+{
+ char v[32];
+ static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '};
+ int len;
+
+ len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10));
+
+ seq_write(m, s, 16);
+
+ if (len > 0) {
+ if (len < 8)
+ seq_write(m, blanks, 8 - len);
+
+ seq_write(m, v, len);
+ }
+ seq_write(m, " kB\n", 4);
+}
+
static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
@@ -32,10 +51,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
unsigned long pages[NR_LRU_LISTS];
int lru;
-/*
- * display in kilobytes.
- */
-#define K(x) ((x) << (PAGE_SHIFT - 10))
si_meminfo(&i);
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
@@ -50,136 +65,100 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
available = si_mem_available();
- /*
- * Tagged format, for easy grepping and expansion.
- */
- seq_printf(m,
- "MemTotal: %8lu kB\n"
- "MemFree: %8lu kB\n"
- "MemAvailable: %8lu kB\n"
- "Buffers: %8lu kB\n"
- "Cached: %8lu kB\n"
- "SwapCached: %8lu kB\n"
- "Active: %8lu kB\n"
- "Inactive: %8lu kB\n"
- "Active(anon): %8lu kB\n"
- "Inactive(anon): %8lu kB\n"
- "Active(file): %8lu kB\n"
- "Inactive(file): %8lu kB\n"
- "Unevictable: %8lu kB\n"
- "Mlocked: %8lu kB\n"
-#ifdef CONFIG_HIGHMEM
- "HighTotal: %8lu kB\n"
- "HighFree: %8lu kB\n"
- "LowTotal: %8lu kB\n"
- "LowFree: %8lu kB\n"
-#endif
-#ifndef CONFIG_MMU
- "MmapCopy: %8lu kB\n"
-#endif
- "SwapTotal: %8lu kB\n"
- "SwapFree: %8lu kB\n"
- "Dirty: %8lu kB\n"
- "Writeback: %8lu kB\n"
- "AnonPages: %8lu kB\n"
- "Mapped: %8lu kB\n"
- "Shmem: %8lu kB\n"
- "Slab: %8lu kB\n"
- "SReclaimable: %8lu kB\n"
- "SUnreclaim: %8lu kB\n"
- "KernelStack: %8lu kB\n"
- "PageTables: %8lu kB\n"
-#ifdef CONFIG_QUICKLIST
- "Quicklists: %8lu kB\n"
-#endif
- "NFS_Unstable: %8lu kB\n"
- "Bounce: %8lu kB\n"
- "WritebackTmp: %8lu kB\n"
- "CommitLimit: %8lu kB\n"
- "Committed_AS: %8lu kB\n"
- "VmallocTotal: %8lu kB\n"
- "VmallocUsed: %8lu kB\n"
- "VmallocChunk: %8lu kB\n"
-#ifdef CONFIG_MEMORY_FAILURE
- "HardwareCorrupted: %5lu kB\n"
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- "AnonHugePages: %8lu kB\n"
- "ShmemHugePages: %8lu kB\n"
- "ShmemPmdMapped: %8lu kB\n"
-#endif
-#ifdef CONFIG_CMA
- "CmaTotal: %8lu kB\n"
- "CmaFree: %8lu kB\n"
-#endif
- ,
- K(i.totalram),
- K(i.freeram),
- K(available),
- K(i.bufferram),
- K(cached),
- K(total_swapcache_pages()),
- K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
- K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
- K(pages[LRU_ACTIVE_ANON]),
- K(pages[LRU_INACTIVE_ANON]),
- K(pages[LRU_ACTIVE_FILE]),
- K(pages[LRU_INACTIVE_FILE]),
- K(pages[LRU_UNEVICTABLE]),
- K(global_page_state(NR_MLOCK)),
+ show_val_kb(m, "MemTotal: ", i.totalram);
+ show_val_kb(m, "MemFree: ", i.freeram);
+ show_val_kb(m, "MemAvailable: ", available);
+ show_val_kb(m, "Buffers: ", i.bufferram);
+ show_val_kb(m, "Cached: ", cached);
+ show_val_kb(m, "SwapCached: ", total_swapcache_pages());
+ show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] +
+ pages[LRU_ACTIVE_FILE]);
+ show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] +
+ pages[LRU_INACTIVE_FILE]);
+ show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]);
+ show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
+ show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
+ show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
+ show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
+ show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK));
+
#ifdef CONFIG_HIGHMEM
- K(i.totalhigh),
- K(i.freehigh),
- K(i.totalram-i.totalhigh),
- K(i.freeram-i.freehigh),
+ show_val_kb(m, "HighTotal: ", i.totalhigh);
+ show_val_kb(m, "HighFree: ", i.freehigh);
+ show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh);
+ show_val_kb(m, "LowFree: ", i.freeram - i.freehigh);
#endif
+
#ifndef CONFIG_MMU
- K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
+ show_val_kb(m, "MmapCopy: ",
+ (unsigned long)atomic_long_read(&mmap_pages_allocated));
#endif
- K(i.totalswap),
- K(i.freeswap),
- K(global_node_page_state(NR_FILE_DIRTY)),
- K(global_node_page_state(NR_WRITEBACK)),
- K(global_node_page_state(NR_ANON_MAPPED)),
- K(global_node_page_state(NR_FILE_MAPPED)),
- K(i.sharedram),
- K(global_page_state(NR_SLAB_RECLAIMABLE) +
- global_page_state(NR_SLAB_UNRECLAIMABLE)),
- K(global_page_state(NR_SLAB_RECLAIMABLE)),
- K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
- global_page_state(NR_KERNEL_STACK_KB),
- K(global_page_state(NR_PAGETABLE)),
+
+ show_val_kb(m, "SwapTotal: ", i.totalswap);
+ show_val_kb(m, "SwapFree: ", i.freeswap);
+ show_val_kb(m, "Dirty: ",
+ global_node_page_state(NR_FILE_DIRTY));
+ show_val_kb(m, "Writeback: ",
+ global_node_page_state(NR_WRITEBACK));
+ show_val_kb(m, "AnonPages: ",
+ global_node_page_state(NR_ANON_MAPPED));
+ show_val_kb(m, "Mapped: ",
+ global_node_page_state(NR_FILE_MAPPED));
+ show_val_kb(m, "Shmem: ", i.sharedram);
+ show_val_kb(m, "Slab: ",
+ global_page_state(NR_SLAB_RECLAIMABLE) +
+ global_page_state(NR_SLAB_UNRECLAIMABLE));
+
+ show_val_kb(m, "SReclaimable: ",
+ global_page_state(NR_SLAB_RECLAIMABLE));
+ show_val_kb(m, "SUnreclaim: ",
+ global_page_state(NR_SLAB_UNRECLAIMABLE));
+ seq_printf(m, "KernelStack: %8lu kB\n",
+ global_page_state(NR_KERNEL_STACK_KB));
+ show_val_kb(m, "PageTables: ",
+ global_page_state(NR_PAGETABLE));
#ifdef CONFIG_QUICKLIST
- K(quicklist_total_size()),
+ show_val_kb(m, "Quicklists: ", quicklist_total_size());
#endif
- K(global_node_page_state(NR_UNSTABLE_NFS)),
- K(global_page_state(NR_BOUNCE)),
- K(global_node_page_state(NR_WRITEBACK_TEMP)),
- K(vm_commit_limit()),
- K(committed),
- (unsigned long)VMALLOC_TOTAL >> 10,
- 0ul, // used to be vmalloc 'used'
- 0ul // used to be vmalloc 'largest_chunk'
+
+ show_val_kb(m, "NFS_Unstable: ",
+ global_node_page_state(NR_UNSTABLE_NFS));
+ show_val_kb(m, "Bounce: ",
+ global_page_state(NR_BOUNCE));
+ show_val_kb(m, "WritebackTmp: ",
+ global_node_page_state(NR_WRITEBACK_TEMP));
+ show_val_kb(m, "CommitLimit: ", vm_commit_limit());
+ show_val_kb(m, "Committed_AS: ", committed);
+ seq_printf(m, "VmallocTotal: %8lu kB\n",
+ (unsigned long)VMALLOC_TOTAL >> 10);
+ show_val_kb(m, "VmallocUsed: ", 0ul);
+ show_val_kb(m, "VmallocChunk: ", 0ul);
+
#ifdef CONFIG_MEMORY_FAILURE
- , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
+ seq_printf(m, "HardwareCorrupted: %5lu kB\n",
+ atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
- , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
- , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
+ show_val_kb(m, "AnonHugePages: ",
+ global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+ show_val_kb(m, "ShmemHugePages: ",
+ global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+ show_val_kb(m, "ShmemPmdMapped: ",
+ global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
#endif
+
#ifdef CONFIG_CMA
- , K(totalcma_pages)
- , K(global_page_state(NR_FREE_CMA_PAGES))
+ show_val_kb(m, "CmaTotal: ", totalcma_pages);
+ show_val_kb(m, "CmaFree: ",
+ global_page_state(NR_FREE_CMA_PAGES));
#endif
- );
hugetlb_report_meminfo(m);
arch_report_meminfo(m);
return 0;
-#undef K
}
static int meminfo_proc_open(struct inode *inode, struct file *file)
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index c8bbc68cdb05..7ae6b1da7cab 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -21,6 +21,7 @@
#include <linux/bitops.h>
#include <linux/mount.h>
#include <linux/nsproxy.h>
+#include <linux/uidgid.h>
#include <net/net_namespace.h>
#include <linux/seq_file.h>
@@ -185,6 +186,8 @@ const struct file_operations proc_net_operations = {
static __net_init int proc_net_ns_init(struct net *net)
{
struct proc_dir_entry *netd, *net_statd;
+ kuid_t uid;
+ kgid_t gid;
int err;
err = -ENOMEM;
@@ -199,6 +202,16 @@ static __net_init int proc_net_ns_init(struct net *net)
netd->parent = &proc_root;
memcpy(netd->name, "net", 4);
+ uid = make_kuid(net->user_ns, 0);
+ if (!uid_valid(uid))
+ uid = netd->uid;
+
+ gid = make_kgid(net->user_ns, 0);
+ if (!gid_valid(gid))
+ gid = netd->gid;
+
+ proc_set_user(netd, uid, gid);
+
err = -EEXIST;
net_statd = proc_net_mkdir(net, "stat", netd);
if (!net_statd)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1b93650dda2f..55313d994895 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -72,7 +72,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
static void drop_sysctl_table(struct ctl_table_header *header);
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry, struct nsproxy *namespaces);
+ struct ctl_table **pentry);
static int insert_links(struct ctl_table_header *head);
static void put_links(struct ctl_table_header *header);
@@ -319,11 +319,11 @@ static void sysctl_head_finish(struct ctl_table_header *head)
}
static struct ctl_table_set *
-lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+lookup_header_set(struct ctl_table_root *root)
{
struct ctl_table_set *set = &root->default_set;
if (root->lookup)
- set = root->lookup(root, namespaces);
+ set = root->lookup(root);
return set;
}
@@ -430,6 +430,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i
static struct inode *proc_sys_make_inode(struct super_block *sb,
struct ctl_table_header *head, struct ctl_table *table)
{
+ struct ctl_table_root *root = head->root;
struct inode *inode;
struct proc_inode *ei;
@@ -444,7 +445,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
ei->sysctl = head;
ei->sysctl_entry = table;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_mode = table->mode;
if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
@@ -457,6 +458,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
if (is_empty_dir(head))
make_empty_dir_inode(inode);
}
+
+ if (root->set_ownership)
+ root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
+
out:
return inode;
}
@@ -491,7 +496,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
goto out;
if (S_ISLNK(p->mode)) {
- ret = sysctl_follow_link(&h, &p, current->nsproxy);
+ ret = sysctl_follow_link(&h, &p);
err = ERR_PTR(ret);
if (ret)
goto out;
@@ -659,7 +664,7 @@ static bool proc_sys_link_fill_cache(struct file *file,
if (S_ISLNK(table->mode)) {
/* It is not an error if we can not follow the link ignore it */
- int err = sysctl_follow_link(&head, &table, current->nsproxy);
+ int err = sysctl_follow_link(&head, &table);
if (err)
goto out;
}
@@ -754,7 +759,7 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
return -EPERM;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
@@ -976,7 +981,7 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
}
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry, struct nsproxy *namespaces)
+ struct ctl_table **pentry)
{
struct ctl_table_header *head;
struct ctl_table_root *root;
@@ -988,7 +993,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
ret = 0;
spin_lock(&sysctl_lock);
root = (*pentry)->data;
- set = lookup_header_set(root, namespaces);
+ set = lookup_header_set(root);
dir = xlate_dir(set, (*phead)->parent);
if (IS_ERR(dir))
ret = PTR_ERR(dir);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index b6a8d3529fea..40245954c450 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -56,7 +56,7 @@ int proc_setup_self(struct super_block *s)
struct inode *inode = new_inode_pseudo(s);
if (inode) {
inode->i_ino = self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7907e456ac4f..d700c42b3572 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -115,17 +115,16 @@ static int show_stat(struct seq_file *p, void *v)
}
sum += arch_irq_stat();
- seq_puts(p, "cpu ");
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+ seq_put_decimal_ull(p, "cpu ", cputime64_to_clock_t(user));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice));
seq_putc(p, '\n');
for_each_online_cpu(i) {
@@ -141,23 +140,23 @@ static int show_stat(struct seq_file *p, void *v)
guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice));
seq_putc(p, '\n');
}
- seq_printf(p, "intr %llu", (unsigned long long)sum);
+ seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
/* sum again ? it could be updated? */
for_each_irq_nr(j)
- seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j));
+ seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
seq_printf(p,
"\nctxt %llu\n"
@@ -171,10 +170,10 @@ static int show_stat(struct seq_file *p, void *v)
nr_running(),
nr_iowait());
- seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
+ seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);
for (i = 0; i < NR_SOFTIRQS; i++)
- seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);
+ seq_put_decimal_ull(p, " ", per_softirq_sums[i]);
seq_putc(p, '\n');
return 0;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f6fa99eca515..35b92d81692f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -147,7 +147,7 @@ m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
{
if (m->count < m->size) /* vma is copied successfully */
- m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
+ m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
}
static void *m_start(struct seq_file *m, loff_t *ppos)
@@ -175,8 +175,10 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
priv->tail_vma = get_gate_vma(mm);
if (last_addr) {
- vma = find_vma(mm, last_addr);
- if (vma && (vma = m_next_vma(priv, vma)))
+ vma = find_vma(mm, last_addr - 1);
+ if (vma && vma->vm_start <= last_addr)
+ vma = m_next_vma(priv, vma);
+ if (vma)
return vma;
}
@@ -264,24 +266,15 @@ static int do_maps_open(struct inode *inode, struct file *file,
* /proc/PID/maps that is the stack of the main task.
*/
static int is_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma, int is_pid)
+ struct vm_area_struct *vma)
{
- int stack = 0;
-
- if (is_pid) {
- stack = vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack;
- } else {
- struct inode *inode = priv->inode;
- struct task_struct *task;
-
- rcu_read_lock();
- task = pid_task(proc_pid(inode), PIDTYPE_PID);
- if (task)
- stack = vma_is_stack_for_task(vma, task);
- rcu_read_unlock();
- }
- return stack;
+ /*
+ * We make no effort to guess what a given thread considers to be
+ * its "stack". It's not even well-defined for programs written
+ * languages like Go.
+ */
+ return vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack;
}
static void
@@ -352,7 +345,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
- if (is_stack(priv, vma, is_pid))
+ if (is_stack(priv, vma))
name = "[stack]";
}
@@ -1070,7 +1063,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
}
mmu_notifier_invalidate_range_start(mm, 0, -1);
}
- walk_page_range(0, ~0UL, &clear_refs_walk);
+ walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
@@ -1667,7 +1660,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
seq_file_path(m, file, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_puts(m, " heap");
- } else if (is_stack(proc_priv, vma, is_pid)) {
+ } else if (is_stack(proc_priv, vma)) {
seq_puts(m, " stack");
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index faacb0c0d857..37175621e890 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -124,25 +124,17 @@ unsigned long task_statm(struct mm_struct *mm,
}
static int is_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma, int is_pid)
+ struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
- int stack = 0;
-
- if (is_pid) {
- stack = vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack;
- } else {
- struct inode *inode = priv->inode;
- struct task_struct *task;
-
- rcu_read_lock();
- task = pid_task(proc_pid(inode), PIDTYPE_PID);
- if (task)
- stack = vma_is_stack_for_task(vma, task);
- rcu_read_unlock();
- }
- return stack;
+
+ /*
+ * We make no effort to guess what a given thread considers to be
+ * its "stack". It's not even well-defined for programs written
+ * languages like Go.
+ */
+ return vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack;
}
/*
@@ -184,7 +176,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
if (file) {
seq_pad(m, ' ');
seq_file_path(m, file, "");
- } else if (mm && is_stack(priv, vma, is_pid)) {
+ } else if (mm && is_stack(priv, vma)) {
seq_pad(m, ' ');
seq_printf(m, "[stack]");
}
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index e58a31e8fb2a..595b90a9766c 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -58,7 +58,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *inode = new_inode_pseudo(s);
if (inode) {
inode->i_ino = thread_self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index ec9ddef5ae75..1781dc50762e 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -230,7 +230,7 @@ static struct inode *pstore_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
}
return inode;
}
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 16ecca5b72d8..14984d902a99 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -623,6 +623,40 @@ static int pstore_write_compat(enum pstore_type_id type,
size, psi);
}
+static int pstore_write_buf_user_compat(enum pstore_type_id type,
+ enum kmsg_dump_reason reason,
+ u64 *id, unsigned int part,
+ const char __user *buf,
+ bool compressed, size_t size,
+ struct pstore_info *psi)
+{
+ unsigned long flags = 0;
+ size_t i, bufsize = size;
+ long ret = 0;
+
+ if (unlikely(!access_ok(VERIFY_READ, buf, size)))
+ return -EFAULT;
+ if (bufsize > psinfo->bufsize)
+ bufsize = psinfo->bufsize;
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
+ for (i = 0; i < size; ) {
+ size_t c = min(size - i, bufsize);
+
+ ret = __copy_from_user(psinfo->buf, buf + i, c);
+ if (unlikely(ret != 0)) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = psi->write_buf(type, reason, id, part, psinfo->buf,
+ compressed, c, psi);
+ if (unlikely(ret < 0))
+ break;
+ i += c;
+ }
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+ return unlikely(ret < 0) ? ret : size;
+}
+
/*
* platform specific persistent storage driver registers with
* us here. If pstore is already mounted, call the platform
@@ -645,6 +679,8 @@ int pstore_register(struct pstore_info *psi)
if (!psi->write)
psi->write = pstore_write_compat;
+ if (!psi->write_buf_user)
+ psi->write_buf_user = pstore_write_buf_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
spin_unlock(&pstore_lock);
@@ -659,13 +695,14 @@ int pstore_register(struct pstore_info *psi)
if (pstore_is_mounted())
pstore_get_records(0);
- pstore_register_kmsg();
-
- if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
+ if (psi->flags & PSTORE_FLAGS_DMESG)
+ pstore_register_kmsg();
+ if (psi->flags & PSTORE_FLAGS_CONSOLE)
pstore_register_console();
+ if (psi->flags & PSTORE_FLAGS_FTRACE)
pstore_register_ftrace();
+ if (psi->flags & PSTORE_FLAGS_PMSG)
pstore_register_pmsg();
- }
if (pstore_update_ms >= 0) {
pstore_timer.expires = jiffies +
@@ -689,12 +726,14 @@ EXPORT_SYMBOL_GPL(pstore_register);
void pstore_unregister(struct pstore_info *psi)
{
- if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
+ if (psi->flags & PSTORE_FLAGS_PMSG)
pstore_unregister_pmsg();
+ if (psi->flags & PSTORE_FLAGS_FTRACE)
pstore_unregister_ftrace();
+ if (psi->flags & PSTORE_FLAGS_CONSOLE)
pstore_unregister_console();
- }
- pstore_unregister_kmsg();
+ if (psi->flags & PSTORE_FLAGS_DMESG)
+ pstore_unregister_kmsg();
free_buf_for_compression();
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index 7de20cd3797f..78f6176c020f 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -19,48 +19,25 @@
#include "internal.h"
static DEFINE_MUTEX(pmsg_lock);
-#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE)
static ssize_t write_pmsg(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- size_t i, buffer_size;
- char *buffer;
+ u64 id;
+ int ret;
if (!count)
return 0;
+ /* check outside lock, page in any data. write_buf_user also checks */
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
- buffer_size = count;
- if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
- buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
- buffer = vmalloc(buffer_size);
- if (!buffer)
- return -ENOMEM;
-
mutex_lock(&pmsg_lock);
- for (i = 0; i < count; ) {
- size_t c = min(count - i, buffer_size);
- u64 id;
- long ret;
-
- ret = __copy_from_user(buffer, buf + i, c);
- if (unlikely(ret != 0)) {
- mutex_unlock(&pmsg_lock);
- vfree(buffer);
- return -EFAULT;
- }
- psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c,
- psinfo);
-
- i += c;
- }
-
+ ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count,
+ psinfo);
mutex_unlock(&pmsg_lock);
- vfree(buffer);
- return count;
+ return ret ? ret : count;
}
static const struct file_operations pmsg_fops = {
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 7a034d62cf8c..6ad831b9d1b8 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -331,6 +331,24 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
return 0;
}
+static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type,
+ enum kmsg_dump_reason reason,
+ u64 *id, unsigned int part,
+ const char __user *buf,
+ bool compressed, size_t size,
+ struct pstore_info *psi)
+{
+ if (type == PSTORE_TYPE_PMSG) {
+ struct ramoops_context *cxt = psi->data;
+
+ if (!cxt->mprz)
+ return -ENOMEM;
+ return persistent_ram_write_user(cxt->mprz, buf, size);
+ }
+
+ return -EINVAL;
+}
+
static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
struct timespec time, struct pstore_info *psi)
{
@@ -369,6 +387,7 @@ static struct ramoops_context oops_cxt = {
.open = ramoops_pstore_open,
.read = ramoops_pstore_read,
.write_buf = ramoops_pstore_write_buf,
+ .write_buf_user = ramoops_pstore_write_buf_user,
.erase = ramoops_pstore_erase,
},
};
@@ -377,13 +396,14 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
{
int i;
- cxt->max_dump_cnt = 0;
if (!cxt->przs)
return;
- for (i = 0; !IS_ERR_OR_NULL(cxt->przs[i]); i++)
+ for (i = 0; i < cxt->max_dump_cnt; i++)
persistent_ram_free(cxt->przs[i]);
+
kfree(cxt->przs);
+ cxt->max_dump_cnt = 0;
}
static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
@@ -408,7 +428,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
GFP_KERNEL);
if (!cxt->przs) {
dev_err(dev, "failed to initialize a prz array for dumps\n");
- goto fail_prz;
+ goto fail_mem;
}
for (i = 0; i < cxt->max_dump_cnt; i++) {
@@ -419,6 +439,11 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
err = PTR_ERR(cxt->przs[i]);
dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
cxt->record_size, (unsigned long long)*paddr, err);
+
+ while (i > 0) {
+ i--;
+ persistent_ram_free(cxt->przs[i]);
+ }
goto fail_prz;
}
*paddr += cxt->record_size;
@@ -426,7 +451,9 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
return 0;
fail_prz:
- ramoops_free_przs(cxt);
+ kfree(cxt->przs);
+fail_mem:
+ cxt->max_dump_cnt = 0;
return err;
}
@@ -608,12 +635,20 @@ static int ramoops_probe(struct platform_device *pdev)
cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */
cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize);
cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL);
- spin_lock_init(&cxt->pstore.buf_lock);
if (!cxt->pstore.buf) {
pr_err("cannot allocate pstore buffer\n");
err = -ENOMEM;
goto fail_clear;
}
+ spin_lock_init(&cxt->pstore.buf_lock);
+
+ cxt->pstore.flags = PSTORE_FLAGS_DMESG;
+ if (cxt->console_size)
+ cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
+ if (cxt->ftrace_size)
+ cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
+ if (cxt->pmsg_size)
+ cxt->pstore.flags |= PSTORE_FLAGS_PMSG;
err = pstore_register(&cxt->pstore);
if (err) {
@@ -659,7 +694,6 @@ static int ramoops_remove(struct platform_device *pdev)
struct ramoops_context *cxt = &oops_cxt;
pstore_unregister(&cxt->pstore);
- cxt->max_dump_cnt = 0;
kfree(cxt->pstore.buf);
cxt->pstore.bufsize = 0;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 76c3f80efdfa..3975deec02f8 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -17,15 +17,16 @@
#include <linux/device.h>
#include <linux/err.h>
#include <linux/errno.h>
-#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/io.h>
+#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/memblock.h>
+#include <linux/pstore_ram.h>
#include <linux/rslib.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/vmalloc.h>
-#include <linux/pstore_ram.h>
#include <asm/page.h>
struct persistent_ram_buffer {
@@ -47,43 +48,10 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
return atomic_read(&prz->buffer->start);
}
-/* increase and wrap the start pointer, returning the old value */
-static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
-{
- int old;
- int new;
-
- do {
- old = atomic_read(&prz->buffer->start);
- new = old + a;
- while (unlikely(new >= prz->buffer_size))
- new -= prz->buffer_size;
- } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old);
-
- return old;
-}
-
-/* increase the size counter until it hits the max size */
-static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a)
-{
- size_t old;
- size_t new;
-
- if (atomic_read(&prz->buffer->size) == prz->buffer_size)
- return;
-
- do {
- old = atomic_read(&prz->buffer->size);
- new = old + a;
- if (new > prz->buffer_size)
- new = prz->buffer_size;
- } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
-}
-
static DEFINE_RAW_SPINLOCK(buffer_lock);
/* increase and wrap the start pointer, returning the old value */
-static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
+static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
{
int old;
int new;
@@ -103,7 +71,7 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
}
/* increase the size counter until it hits the max size */
-static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a)
+static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
{
size_t old;
size_t new;
@@ -124,9 +92,6 @@ exit:
raw_spin_unlock_irqrestore(&buffer_lock, flags);
}
-static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic;
-static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic;
-
static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
uint8_t *data, size_t len, uint8_t *ecc)
{
@@ -299,10 +264,20 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz,
const void *s, unsigned int start, unsigned int count)
{
struct persistent_ram_buffer *buffer = prz->buffer;
- memcpy(buffer->data + start, s, count);
+ memcpy_toio(buffer->data + start, s, count);
persistent_ram_update_ecc(prz, start, count);
}
+static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz,
+ const void __user *s, unsigned int start, unsigned int count)
+{
+ struct persistent_ram_buffer *buffer = prz->buffer;
+ int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ?
+ -EFAULT : 0;
+ persistent_ram_update_ecc(prz, start, count);
+ return ret;
+}
+
void persistent_ram_save_old(struct persistent_ram_zone *prz)
{
struct persistent_ram_buffer *buffer = prz->buffer;
@@ -322,8 +297,8 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz)
}
prz->old_log_size = size;
- memcpy(prz->old_log, &buffer->data[start], size - start);
- memcpy(prz->old_log + size - start, &buffer->data[0], start);
+ memcpy_fromio(prz->old_log, &buffer->data[start], size - start);
+ memcpy_fromio(prz->old_log + size - start, &buffer->data[0], start);
}
int notrace persistent_ram_write(struct persistent_ram_zone *prz,
@@ -356,6 +331,38 @@ int notrace persistent_ram_write(struct persistent_ram_zone *prz,
return count;
}
+int notrace persistent_ram_write_user(struct persistent_ram_zone *prz,
+ const void __user *s, unsigned int count)
+{
+ int rem, ret = 0, c = count;
+ size_t start;
+
+ if (unlikely(!access_ok(VERIFY_READ, s, count)))
+ return -EFAULT;
+ if (unlikely(c > prz->buffer_size)) {
+ s += c - prz->buffer_size;
+ c = prz->buffer_size;
+ }
+
+ buffer_size_add(prz, c);
+
+ start = buffer_start_add(prz, c);
+
+ rem = prz->buffer_size - start;
+ if (unlikely(rem < c)) {
+ ret = persistent_ram_update_user(prz, s, start, rem);
+ s += rem;
+ c -= rem;
+ start = 0;
+ }
+ if (likely(!ret))
+ ret = persistent_ram_update_user(prz, s, start, c);
+
+ persistent_ram_update_header_ecc(prz);
+
+ return unlikely(ret) ? ret : count;
+}
+
size_t persistent_ram_old_size(struct persistent_ram_zone *prz)
{
return prz->old_log_size;
@@ -426,9 +433,6 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size,
return NULL;
}
- buffer_start_add = buffer_start_add_locked;
- buffer_size_add = buffer_size_add_locked;
-
if (memtype)
va = ioremap(start, size);
else
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 35df08ee9c97..2d445425aad7 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -341,6 +341,7 @@ static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs)
struct qc_state state;
int ret;
+ memset(&state, 0, sizeof (struct qc_state));
ret = sb->s_qcop->get_state(sb, &state);
if (ret < 0)
return ret;
@@ -365,17 +366,19 @@ static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs)
fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;
- if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) {
+
+ /* Inodes may be allocated even if inactive; copy out if present */
+ if (state.s_state[USRQUOTA].ino) {
fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino;
fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks;
fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents;
}
- if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) {
+ if (state.s_state[GRPQUOTA].ino) {
fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino;
fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks;
fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents;
}
- if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) {
+ if (state.s_state[PRJQUOTA].ino) {
/*
* Q_XGETQSTAT doesn't have room for both group and project
* quotas. So, allow the project quota values to be copied out
@@ -411,6 +414,7 @@ static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs)
struct qc_state state;
int ret;
+ memset(&state, 0, sizeof (struct qc_state));
ret = sb->s_qcop->get_state(sb, &state);
if (ret < 0)
return ret;
@@ -435,17 +439,19 @@ static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs)
fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;
- if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) {
+
+ /* Inodes may be allocated even if inactive; copy out if present */
+ if (state.s_state[USRQUOTA].ino) {
fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino;
fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks;
fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents;
}
- if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) {
+ if (state.s_state[GRPQUOTA].ino) {
fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino;
fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks;
fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents;
}
- if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) {
+ if (state.s_state[PRJQUOTA].ino) {
fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino;
fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks;
fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index be3ddd189cd4..2bcbf4e77982 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -169,7 +169,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
int ret = 0;
/* POSIX UID/GID verification for setting inode attributes */
- ret = inode_change_ok(inode, ia);
+ ret = setattr_prepare(dentry, ia);
if (ret)
return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 1ab6e6c2e60e..8621c039b536 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -61,7 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
inode->i_mapping->a_ops = &ramfs_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
@@ -100,7 +100,7 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
}
return error;
}
@@ -130,7 +130,7 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
if (!error) {
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
} else
iput(inode);
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 66215a7b17cf..190e0d362581 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -730,6 +730,35 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
/* A write operation does a read from user space and vice versa */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
+/**
+ * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
+ * into the kernel and check that it is valid.
+ *
+ * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
+ * @uvector: Pointer to the userspace array.
+ * @nr_segs: Number of elements in userspace array.
+ * @fast_segs: Number of elements in @fast_pointer.
+ * @fast_pointer: Pointer to (usually small on-stack) kernel array.
+ * @ret_pointer: (output parameter) Pointer to a variable that will point to
+ * either @fast_pointer, a newly allocated kernel array, or NULL,
+ * depending on which array was used.
+ *
+ * This function copies an array of &struct iovec of @nr_segs from
+ * userspace into the kernel and checks that each element is valid (e.g.
+ * it does not point to a kernel address or cause overflow by being too
+ * large, etc.).
+ *
+ * As an optimization, the caller may provide a pointer to a small
+ * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
+ * (the size of this array, or 0 if unused, should be given in @fast_segs).
+ *
+ * @ret_pointer will always point to the array that was used, so the
+ * caller must take care not to call kfree() on it e.g. in case the
+ * @fast_pointer array was used and it was allocated on the stack.
+ *
+ * Return: The total number of bytes covered by the iovec array on success
+ * or a negative error code on error.
+ */
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
unsigned long nr_segs, unsigned long fast_segs,
struct iovec *fast_pointer,
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 90f815bdfa8a..2f8c5c9bdaf6 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -260,10 +260,7 @@ const struct file_operations reiserfs_file_operations = {
const struct inode_operations reiserfs_file_inode_operations = {
.setattr = reiserfs_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = reiserfs_listxattr,
- .removexattr = generic_removexattr,
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index c2c59f9ff04b..58b2dedb2a3a 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2005,7 +2005,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
if (S_ISLNK(inode->i_mode))
inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_size = i_size;
inode->i_blocks = 0;
inode->i_bytes = 0;
@@ -3312,7 +3312,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
unsigned int ia_valid;
int error;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 2f1ddc908013..1f4692a505a0 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -94,7 +94,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
sd_attrs_to_i_attrs(flags, inode);
REISERFS_I(inode)->i_attrs = flags;
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
setflags_out:
mnt_drop_write_file(filp);
@@ -115,7 +115,7 @@ setflags_out:
err = -EFAULT;
goto setversion_out;
}
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
setversion_out:
mnt_drop_write_file(filp);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 8a36696d6df9..e6a2b406af36 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -570,7 +570,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
}
dir->i_size += paste_size;
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
if (!S_ISDIR(inode->i_mode) && visible)
/* reiserfs_mkdir or reiserfs_rename will do that by itself */
reiserfs_update_sd(th, dir);
@@ -963,7 +963,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_nlink);
clear_nlink(inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(dir);
reiserfs_update_sd(&th, inode);
DEC_DIR_INODE_NLINK(dir)
@@ -1067,11 +1067,11 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
inc_nlink(inode);
goto end_unlink;
}
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
reiserfs_update_sd(&th, inode);
dir->i_size -= (de.de_entrylen + DEH_SIZE);
- dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
reiserfs_update_sd(&th, dir);
if (!savelink)
@@ -1246,7 +1246,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
return err ? err : retval;
}
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
reiserfs_update_sd(&th, inode);
ihold(inode);
@@ -1306,7 +1306,8 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
* get_empty_nodes or its clones
*/
static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
int retval;
INITIALIZE_PATH(old_entry_path);
@@ -1321,6 +1322,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
unsigned long savelink = 1;
struct timespec ctime;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
/*
* three balancings: (1) old name removal, (2) new name insertion
* and (3) maybe "save" link insertion
@@ -1567,7 +1571,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
journal_mark_dirty(&th, old_de.de_bh);
- ctime = CURRENT_TIME_SEC;
+ ctime = current_time(old_dir);
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
/*
@@ -1650,10 +1654,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
.mknod = reiserfs_mknod,
.rename = reiserfs_rename,
.setattr = reiserfs_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = reiserfs_listxattr,
- .removexattr = generic_removexattr,
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
@@ -1667,10 +1668,7 @@ const struct inode_operations reiserfs_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = page_get_link,
.setattr = reiserfs_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = reiserfs_listxattr,
- .removexattr = generic_removexattr,
.permission = reiserfs_permission,
};
@@ -1679,10 +1677,7 @@ const struct inode_operations reiserfs_symlink_inode_operations = {
*/
const struct inode_operations reiserfs_special_inode_operations = {
.setattr = reiserfs_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = reiserfs_listxattr,
- .removexattr = generic_removexattr,
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 4032d1e87c8f..a97e352d05d3 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1987,8 +1987,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
pathrelse(&s_search_path);
if (update_timestamps) {
- inode->i_mtime = CURRENT_TIME_SEC;
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = current_time(inode);
}
reiserfs_update_sd(th, inode);
@@ -2012,8 +2012,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
update_and_out:
if (update_timestamps) {
/* this is truncate, not file closing */
- inode->i_mtime = CURRENT_TIME_SEC;
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = current_time(inode);
}
reiserfs_update_sd(th, inode);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7a4a85a6821e..0a6ad4e71e88 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -190,7 +190,15 @@ static int remove_save_link_only(struct super_block *s,
static int reiserfs_quota_on_mount(struct super_block *, int);
#endif
-/* look for uncompleted unlinks and truncates and complete them */
+/*
+ * Look for uncompleted unlinks and truncates and complete them
+ *
+ * Called with superblock write locked. If quotas are enabled, we have to
+ * release/retake lest we call dquot_quota_on_mount(), proceed to
+ * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per
+ * cpu worklets to complete flush_async_commits() that in turn wait for the
+ * superblock write lock.
+ */
static int finish_unfinished(struct super_block *s)
{
INITIALIZE_PATH(path);
@@ -237,7 +245,9 @@ static int finish_unfinished(struct super_block *s)
quota_enabled[i] = 0;
continue;
}
+ reiserfs_write_unlock(s);
ret = reiserfs_quota_on_mount(s, i);
+ reiserfs_write_lock(s);
if (ret < 0)
reiserfs_warning(s, "reiserfs-2500",
"cannot turn on journaled "
@@ -2512,7 +2522,7 @@ out:
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
inode->i_version++;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index a33812ae9fad..e87aa21c30de 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -450,13 +450,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
static void update_ctime(struct inode *inode)
{
- struct timespec now = current_fs_time(inode->i_sb);
+ struct timespec now = current_time(inode);
if (inode_unhashed(inode) || !inode->i_nlink ||
timespec_equal(&inode->i_ctime, &now))
return;
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
}
@@ -575,7 +575,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
new_size = buffer_size + sizeof(struct reiserfs_xattr_header);
if (!err && new_size < i_size_read(d_inode(dentry))) {
struct iattr newattrs = {
- .ia_ctime = current_fs_time(inode->i_sb),
+ .ia_ctime = current_time(inode),
.ia_size = new_size,
.ia_valid = ATTR_SIZE | ATTR_CTIME,
};
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index dbed42f755e0..3d2256a425ee 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -242,13 +242,9 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
- error = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (error < 0)
+ error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (error)
return error;
- else {
- if (error == 0)
- acl = NULL;
- }
}
break;
case ACL_TYPE_DEFAULT:
@@ -277,7 +273,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
if (error == -ENODATA) {
error = 0;
if (type == ACL_TYPE_ACCESS) {
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
}
}
diff --git a/fs/select.c b/fs/select.c
index 8ed9da50896a..3d4f85defeab 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -29,6 +29,7 @@
#include <linux/sched/rt.h>
#include <linux/freezer.h>
#include <net/busy_poll.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
@@ -554,7 +555,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
fd_set_bits fds;
void *bits;
int ret, max_fds;
- unsigned int size;
+ size_t size, alloc_size;
struct fdtable *fdt;
/* Allocate small arguments on the stack to save memory and be faster */
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
@@ -581,7 +582,14 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
if (size > sizeof(stack_fds) / 6) {
/* Not enough space in on-stack array; must use kmalloc */
ret = -ENOMEM;
- bits = kmalloc(6 * size, GFP_KERNEL);
+ if (size > (SIZE_MAX / 6))
+ goto out_nofds;
+
+ alloc_size = 6 * size;
+ bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
+ if (!bits && alloc_size > PAGE_SIZE)
+ bits = vmalloc(alloc_size);
+
if (!bits)
goto out_nofds;
}
@@ -618,7 +626,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
out:
if (bits != stack_fds)
- kfree(bits);
+ kvfree(bits);
out_nofds:
return ret;
}
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 6dc4296eed62..368bfb92b115 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -679,11 +679,11 @@ EXPORT_SYMBOL(seq_puts);
/*
* A helper routine for putting decimal numbers without rich format of printf().
* only 'unsigned long long' is supported.
- * This routine will put one byte delimiter + number into seq_file.
+ * This routine will put strlen(delimiter) + number into seq_file.
* This routine is very quick when you show lots of numbers.
* In usual cases, it will be better to use seq_printf(). It's easier to read.
*/
-void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
unsigned long long num)
{
int len;
@@ -691,8 +691,15 @@ void seq_put_decimal_ull(struct seq_file *m, char delimiter,
if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */
goto overflow;
- if (delimiter)
- m->buf[m->count++] = delimiter;
+ len = strlen(delimiter);
+ if (m->count + len >= m->size)
+ goto overflow;
+
+ memcpy(m->buf + m->count, delimiter, len);
+ m->count += len;
+
+ if (m->count + 1 >= m->size)
+ goto overflow;
if (num < 10) {
m->buf[m->count++] = num + '0';
@@ -702,6 +709,7 @@ void seq_put_decimal_ull(struct seq_file *m, char delimiter,
len = num_to_str(m->buf + m->count, m->size - m->count, num);
if (!len)
goto overflow;
+
m->count += len;
return;
@@ -710,19 +718,42 @@ overflow:
}
EXPORT_SYMBOL(seq_put_decimal_ull);
-void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num)
+void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num)
{
+ int len;
+
+ if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */
+ goto overflow;
+
+ len = strlen(delimiter);
+ if (m->count + len >= m->size)
+ goto overflow;
+
+ memcpy(m->buf + m->count, delimiter, len);
+ m->count += len;
+
+ if (m->count + 2 >= m->size)
+ goto overflow;
+
if (num < 0) {
- if (m->count + 3 >= m->size) {
- seq_set_overflow(m);
- return;
- }
- if (delimiter)
- m->buf[m->count++] = delimiter;
+ m->buf[m->count++] = '-';
num = -num;
- delimiter = '-';
}
- seq_put_decimal_ull(m, delimiter, num);
+
+ if (num < 10) {
+ m->buf[m->count++] = num + '0';
+ return;
+ }
+
+ len = num_to_str(m->buf + m->count, m->size - m->count, num);
+ if (!len)
+ goto overflow;
+
+ m->count += len;
+ return;
+
+overflow:
+ seq_set_overflow(m);
}
EXPORT_SYMBOL(seq_put_decimal_ll);
diff --git a/fs/splice.c b/fs/splice.c
index dd9bf7e410d2..153d4f3bd441 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -183,82 +183,39 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
struct splice_pipe_desc *spd)
{
unsigned int spd_pages = spd->nr_pages;
- int ret, do_wakeup, page_nr;
+ int ret = 0, page_nr = 0;
if (!spd_pages)
return 0;
- ret = 0;
- do_wakeup = 0;
- page_nr = 0;
-
- pipe_lock(pipe);
-
- for (;;) {
- if (!pipe->readers) {
- send_sig(SIGPIPE, current, 0);
- if (!ret)
- ret = -EPIPE;
- break;
- }
-
- if (pipe->nrbufs < pipe->buffers) {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
- struct pipe_buffer *buf = pipe->bufs + newbuf;
-
- buf->page = spd->pages[page_nr];
- buf->offset = spd->partial[page_nr].offset;
- buf->len = spd->partial[page_nr].len;
- buf->private = spd->partial[page_nr].private;
- buf->ops = spd->ops;
- if (spd->flags & SPLICE_F_GIFT)
- buf->flags |= PIPE_BUF_FLAG_GIFT;
-
- pipe->nrbufs++;
- page_nr++;
- ret += buf->len;
-
- if (pipe->files)
- do_wakeup = 1;
+ if (unlikely(!pipe->readers)) {
+ send_sig(SIGPIPE, current, 0);
+ ret = -EPIPE;
+ goto out;
+ }
- if (!--spd->nr_pages)
- break;
- if (pipe->nrbufs < pipe->buffers)
- continue;
+ while (pipe->nrbufs < pipe->buffers) {
+ int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+ struct pipe_buffer *buf = pipe->bufs + newbuf;
- break;
- }
+ buf->page = spd->pages[page_nr];
+ buf->offset = spd->partial[page_nr].offset;
+ buf->len = spd->partial[page_nr].len;
+ buf->private = spd->partial[page_nr].private;
+ buf->ops = spd->ops;
- if (spd->flags & SPLICE_F_NONBLOCK) {
- if (!ret)
- ret = -EAGAIN;
- break;
- }
+ pipe->nrbufs++;
+ page_nr++;
+ ret += buf->len;
- if (signal_pending(current)) {
- if (!ret)
- ret = -ERESTARTSYS;
+ if (!--spd->nr_pages)
break;
- }
-
- if (do_wakeup) {
- smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible_sync(&pipe->wait);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- do_wakeup = 0;
- }
-
- pipe->waiting_writers++;
- pipe_wait(pipe);
- pipe->waiting_writers--;
}
- pipe_unlock(pipe);
-
- if (do_wakeup)
- wakeup_pipe_readers(pipe);
+ if (!ret)
+ ret = -EAGAIN;
+out:
while (page_nr < spd_pages)
spd->spd_release(spd, page_nr++);
@@ -266,6 +223,26 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
}
EXPORT_SYMBOL_GPL(splice_to_pipe);
+ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+{
+ int ret;
+
+ if (unlikely(!pipe->readers)) {
+ send_sig(SIGPIPE, current, 0);
+ ret = -EPIPE;
+ } else if (pipe->nrbufs == pipe->buffers) {
+ ret = -EAGAIN;
+ } else {
+ int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+ pipe->bufs[newbuf] = *buf;
+ pipe->nrbufs++;
+ return buf->len;
+ }
+ pipe_buf_release(pipe, buf);
+ return ret;
+}
+EXPORT_SYMBOL(add_to_pipe);
+
void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
put_page(spd->pages[i]);
@@ -303,207 +280,6 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
kfree(spd->partial);
}
-static int
-__generic_file_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
-{
- struct address_space *mapping = in->f_mapping;
- unsigned int loff, nr_pages, req_pages;
- struct page *pages[PIPE_DEF_BUFFERS];
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct page *page;
- pgoff_t index, end_index;
- loff_t isize;
- int error, page_nr;
- struct splice_pipe_desc spd = {
- .pages = pages,
- .partial = partial,
- .nr_pages_max = PIPE_DEF_BUFFERS,
- .flags = flags,
- .ops = &page_cache_pipe_buf_ops,
- .spd_release = spd_release_page,
- };
-
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
- index = *ppos >> PAGE_SHIFT;
- loff = *ppos & ~PAGE_MASK;
- req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT;
- nr_pages = min(req_pages, spd.nr_pages_max);
-
- /*
- * Lookup the (hopefully) full range of pages we need.
- */
- spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
- index += spd.nr_pages;
-
- /*
- * If find_get_pages_contig() returned fewer pages than we needed,
- * readahead/allocate the rest and fill in the holes.
- */
- if (spd.nr_pages < nr_pages)
- page_cache_sync_readahead(mapping, &in->f_ra, in,
- index, req_pages - spd.nr_pages);
-
- error = 0;
- while (spd.nr_pages < nr_pages) {
- /*
- * Page could be there, find_get_pages_contig() breaks on
- * the first hole.
- */
- page = find_get_page(mapping, index);
- if (!page) {
- /*
- * page didn't exist, allocate one.
- */
- page = page_cache_alloc_cold(mapping);
- if (!page)
- break;
-
- error = add_to_page_cache_lru(page, mapping, index,
- mapping_gfp_constraint(mapping, GFP_KERNEL));
- if (unlikely(error)) {
- put_page(page);
- if (error == -EEXIST)
- continue;
- break;
- }
- /*
- * add_to_page_cache() locks the page, unlock it
- * to avoid convoluting the logic below even more.
- */
- unlock_page(page);
- }
-
- spd.pages[spd.nr_pages++] = page;
- index++;
- }
-
- /*
- * Now loop over the map and see if we need to start IO on any
- * pages, fill in the partial map, etc.
- */
- index = *ppos >> PAGE_SHIFT;
- nr_pages = spd.nr_pages;
- spd.nr_pages = 0;
- for (page_nr = 0; page_nr < nr_pages; page_nr++) {
- unsigned int this_len;
-
- if (!len)
- break;
-
- /*
- * this_len is the max we'll use from this page
- */
- this_len = min_t(unsigned long, len, PAGE_SIZE - loff);
- page = spd.pages[page_nr];
-
- if (PageReadahead(page))
- page_cache_async_readahead(mapping, &in->f_ra, in,
- page, index, req_pages - page_nr);
-
- /*
- * If the page isn't uptodate, we may need to start io on it
- */
- if (!PageUptodate(page)) {
- lock_page(page);
-
- /*
- * Page was truncated, or invalidated by the
- * filesystem. Redo the find/create, but this time the
- * page is kept locked, so there's no chance of another
- * race with truncate/invalidate.
- */
- if (!page->mapping) {
- unlock_page(page);
-retry_lookup:
- page = find_or_create_page(mapping, index,
- mapping_gfp_mask(mapping));
-
- if (!page) {
- error = -ENOMEM;
- break;
- }
- put_page(spd.pages[page_nr]);
- spd.pages[page_nr] = page;
- }
- /*
- * page was already under io and is now done, great
- */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto fill_it;
- }
-
- /*
- * need to read in the page
- */
- error = mapping->a_ops->readpage(in, page);
- if (unlikely(error)) {
- /*
- * Re-lookup the page
- */
- if (error == AOP_TRUNCATED_PAGE)
- goto retry_lookup;
-
- break;
- }
- }
-fill_it:
- /*
- * i_size must be checked after PageUptodate.
- */
- isize = i_size_read(mapping->host);
- end_index = (isize - 1) >> PAGE_SHIFT;
- if (unlikely(!isize || index > end_index))
- break;
-
- /*
- * if this is the last page, see if we need to shrink
- * the length and stop
- */
- if (end_index == index) {
- unsigned int plen;
-
- /*
- * max good bytes in this page
- */
- plen = ((isize - 1) & ~PAGE_MASK) + 1;
- if (plen <= loff)
- break;
-
- /*
- * force quit after adding this page
- */
- this_len = min(this_len, plen - loff);
- len = this_len;
- }
-
- spd.partial[page_nr].offset = loff;
- spd.partial[page_nr].len = this_len;
- len -= this_len;
- loff = 0;
- spd.nr_pages++;
- index++;
- }
-
- /*
- * Release any pages at the end, if we quit early. 'page_nr' is how far
- * we got, 'nr_pages' is how many pages are in the map.
- */
- while (page_nr < nr_pages)
- put_page(spd.pages[page_nr++]);
- in->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
-
- if (spd.nr_pages)
- error = splice_to_pipe(pipe, &spd);
-
- splice_shrink_spd(&spd);
- return error;
-}
-
/**
* generic_file_splice_read - splice data from file to a pipe
* @in: file to splice from
@@ -514,39 +290,47 @@ fill_it:
*
* Description:
* Will read pages from given file and fill them into a pipe. Can be
- * used as long as the address_space operations for the source implements
- * a readpage() hook.
+ * used as long as it has more or less sane ->read_iter().
*
*/
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
- loff_t isize, left;
- int ret;
-
- if (IS_DAX(in->f_mapping->host))
- return default_file_splice_read(in, ppos, pipe, len, flags);
+ struct iov_iter to;
+ struct kiocb kiocb;
+ loff_t isize;
+ int idx, ret;
isize = i_size_read(in->f_mapping->host);
if (unlikely(*ppos >= isize))
return 0;
- left = isize - *ppos;
- if (unlikely(left < len))
- len = left;
-
- ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
+ iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len);
+ idx = to.idx;
+ init_sync_kiocb(&kiocb, in);
+ kiocb.ki_pos = *ppos;
+ ret = in->f_op->read_iter(&kiocb, &to);
if (ret > 0) {
- *ppos += ret;
+ *ppos = kiocb.ki_pos;
file_accessed(in);
+ } else if (ret < 0) {
+ to.idx = idx;
+ to.iov_offset = 0;
+ iov_iter_advance(&to, 0); /* to free what was emitted */
+ /*
+ * callers of ->splice_read() expect -EAGAIN on
+ * "can't put anything in there", rather than -EFAULT.
+ */
+ if (ret == -EFAULT)
+ ret = -EAGAIN;
}
return ret;
}
EXPORT_SYMBOL(generic_file_splice_read);
-static const struct pipe_buf_operations default_pipe_buf_ops = {
+const struct pipe_buf_operations default_pipe_buf_ops = {
.can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
@@ -570,7 +354,7 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = {
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);
-static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
+static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
unsigned long vlen, loff_t offset)
{
mm_segment_t old_fs;
@@ -602,102 +386,70 @@ ssize_t kernel_write(struct file *file, const char *buf, size_t count,
}
EXPORT_SYMBOL(kernel_write);
-ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
+static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
+ struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
+ struct iov_iter to;
+ struct page **pages;
unsigned int nr_pages;
- unsigned int nr_freed;
- size_t offset;
- struct page *pages[PIPE_DEF_BUFFERS];
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
+ size_t offset, dummy, copied = 0;
ssize_t res;
- size_t this_len;
- int error;
int i;
- struct splice_pipe_desc spd = {
- .pages = pages,
- .partial = partial,
- .nr_pages_max = PIPE_DEF_BUFFERS,
- .flags = flags,
- .ops = &default_pipe_buf_ops,
- .spd_release = spd_release_page,
- };
- if (splice_grow_spd(pipe, &spd))
+ if (pipe->nrbufs == pipe->buffers)
+ return -EAGAIN;
+
+ /*
+ * Try to keep page boundaries matching to source pagecache ones -
+ * it probably won't be much help, but...
+ */
+ offset = *ppos & ~PAGE_MASK;
+
+ iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);
+
+ res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy);
+ if (res <= 0)
return -ENOMEM;
- res = -ENOMEM;
+ nr_pages = res / PAGE_SIZE;
+
vec = __vec;
- if (spd.nr_pages_max > PIPE_DEF_BUFFERS) {
- vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL);
- if (!vec)
- goto shrink_ret;
+ if (nr_pages > PIPE_DEF_BUFFERS) {
+ vec = kmalloc(nr_pages * sizeof(struct kvec), GFP_KERNEL);
+ if (unlikely(!vec)) {
+ res = -ENOMEM;
+ goto out;
+ }
}
- offset = *ppos & ~PAGE_MASK;
- nr_pages = (len + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
- for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
- struct page *page;
+ pipe->bufs[to.idx].offset = offset;
+ pipe->bufs[to.idx].len -= offset;
- page = alloc_page(GFP_USER);
- error = -ENOMEM;
- if (!page)
- goto err;
-
- this_len = min_t(size_t, len, PAGE_SIZE - offset);
- vec[i].iov_base = (void __user *) page_address(page);
+ for (i = 0; i < nr_pages; i++) {
+ size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
+ vec[i].iov_base = page_address(pages[i]) + offset;
vec[i].iov_len = this_len;
- spd.pages[i] = page;
- spd.nr_pages++;
len -= this_len;
offset = 0;
}
- res = kernel_readv(in, vec, spd.nr_pages, *ppos);
- if (res < 0) {
- error = res;
- goto err;
- }
-
- error = 0;
- if (!res)
- goto err;
-
- nr_freed = 0;
- for (i = 0; i < spd.nr_pages; i++) {
- this_len = min_t(size_t, vec[i].iov_len, res);
- spd.partial[i].offset = 0;
- spd.partial[i].len = this_len;
- if (!this_len) {
- __free_page(spd.pages[i]);
- spd.pages[i] = NULL;
- nr_freed++;
- }
- res -= this_len;
- }
- spd.nr_pages -= nr_freed;
-
- res = splice_to_pipe(pipe, &spd);
- if (res > 0)
+ res = kernel_readv(in, vec, nr_pages, *ppos);
+ if (res > 0) {
+ copied = res;
*ppos += res;
+ }
-shrink_ret:
if (vec != __vec)
kfree(vec);
- splice_shrink_spd(&spd);
+out:
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+ kvfree(pages);
+ iov_iter_advance(&to, copied); /* truncates and discards */
return res;
-
-err:
- for (i = 0; i < spd.nr_pages; i++)
- __free_page(spd.pages[i]);
-
- res = error;
- goto shrink_ret;
}
-EXPORT_SYMBOL(default_file_splice_read);
/*
* Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
@@ -757,13 +509,12 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
while (pipe->nrbufs) {
struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
- const struct pipe_buf_operations *ops = buf->ops;
sd->len = buf->len;
if (sd->len > sd->total_len)
sd->len = sd->total_len;
- ret = buf->ops->confirm(pipe, buf);
+ ret = pipe_buf_confirm(pipe, buf);
if (unlikely(ret)) {
if (ret == -ENODATA)
ret = 0;
@@ -783,8 +534,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
sd->total_len -= ret;
if (!buf->len) {
- buf->ops = NULL;
- ops->release(pipe, buf);
+ pipe_buf_release(pipe, buf);
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
pipe->nrbufs--;
if (pipe->files)
@@ -1003,7 +753,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
if (idx == pipe->buffers - 1)
idx = -1;
- ret = buf->ops->confirm(pipe, buf);
+ ret = pipe_buf_confirm(pipe, buf);
if (unlikely(ret)) {
if (ret == -ENODATA)
ret = 0;
@@ -1030,11 +780,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
while (ret) {
struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
if (ret >= buf->len) {
- const struct pipe_buf_operations *ops = buf->ops;
ret -= buf->len;
buf->len = 0;
- buf->ops = NULL;
- ops->release(pipe, buf);
+ pipe_buf_release(pipe, buf);
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
pipe->nrbufs--;
if (pipe->files)
@@ -1273,10 +1021,8 @@ out_release:
for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
- if (buf->ops) {
- buf->ops->release(pipe, buf);
- buf->ops = NULL;
- }
+ if (buf->ops)
+ pipe_buf_release(pipe, buf);
}
if (!bytes)
@@ -1342,6 +1088,20 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
}
EXPORT_SYMBOL(do_splice_direct);
+static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
+{
+ while (pipe->nrbufs == pipe->buffers) {
+ if (flags & SPLICE_F_NONBLOCK)
+ return -EAGAIN;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ pipe->waiting_writers++;
+ pipe_wait(pipe);
+ pipe->waiting_writers--;
+ }
+ return 0;
+}
+
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
size_t len, unsigned int flags);
@@ -1424,8 +1184,13 @@ static long do_splice(struct file *in, loff_t __user *off_in,
offset = in->f_pos;
}
- ret = do_splice_to(in, &offset, opipe, len, flags);
-
+ pipe_lock(opipe);
+ ret = wait_for_space(opipe, flags);
+ if (!ret)
+ ret = do_splice_to(in, &offset, opipe, len, flags);
+ pipe_unlock(opipe);
+ if (ret > 0)
+ wakeup_pipe_readers(opipe);
if (!off_in)
in->f_pos = offset;
else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
@@ -1437,106 +1202,50 @@ static long do_splice(struct file *in, loff_t __user *off_in,
return -EINVAL;
}
-/*
- * Map an iov into an array of pages and offset/length tupples. With the
- * partial_page structure, we can map several non-contiguous ranges into
- * our ones pages[] map instead of splitting that operation into pieces.
- * Could easily be exported as a generic helper for other users, in which
- * case one would probably want to add a 'max_nr_pages' parameter as well.
- */
-static int get_iovec_page_array(const struct iovec __user *iov,
- unsigned int nr_vecs, struct page **pages,
- struct partial_page *partial, bool aligned,
- unsigned int pipe_buffers)
+static int iter_to_pipe(struct iov_iter *from,
+ struct pipe_inode_info *pipe,
+ unsigned flags)
{
- int buffers = 0, error = 0;
-
- while (nr_vecs) {
- unsigned long off, npages;
- struct iovec entry;
- void __user *base;
- size_t len;
- int i;
-
- error = -EFAULT;
- if (copy_from_user(&entry, iov, sizeof(entry)))
- break;
-
- base = entry.iov_base;
- len = entry.iov_len;
-
- /*
- * Sanity check this iovec. 0 read succeeds.
- */
- error = 0;
- if (unlikely(!len))
- break;
- error = -EFAULT;
- if (!access_ok(VERIFY_READ, base, len))
- break;
-
- /*
- * Get this base offset and number of pages, then map
- * in the user pages.
- */
- off = (unsigned long) base & ~PAGE_MASK;
-
- /*
- * If asked for alignment, the offset must be zero and the
- * length a multiple of the PAGE_SIZE.
- */
- error = -EINVAL;
- if (aligned && (off || len & ~PAGE_MASK))
- break;
-
- npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (npages > pipe_buffers - buffers)
- npages = pipe_buffers - buffers;
-
- error = get_user_pages_fast((unsigned long)base, npages,
- 0, &pages[buffers]);
-
- if (unlikely(error <= 0))
+ struct pipe_buffer buf = {
+ .ops = &user_page_pipe_buf_ops,
+ .flags = flags
+ };
+ size_t total = 0;
+ int ret = 0;
+ bool failed = false;
+
+ while (iov_iter_count(from) && !failed) {
+ struct page *pages[16];
+ ssize_t copied;
+ size_t start;
+ int n;
+
+ copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
+ if (copied <= 0) {
+ ret = copied;
break;
-
- /*
- * Fill this contiguous range into the partial page map.
- */
- for (i = 0; i < error; i++) {
- const int plen = min_t(size_t, len, PAGE_SIZE - off);
-
- partial[buffers].offset = off;
- partial[buffers].len = plen;
-
- off = 0;
- len -= plen;
- buffers++;
}
- /*
- * We didn't complete this iov, stop here since it probably
- * means we have to move some of this into a pipe to
- * be able to continue.
- */
- if (len)
- break;
-
- /*
- * Don't continue if we mapped fewer pages than we asked for,
- * or if we mapped the max number of pages that we have
- * room for.
- */
- if (error < npages || buffers == pipe_buffers)
- break;
-
- nr_vecs--;
- iov++;
+ for (n = 0; copied; n++, start = 0) {
+ int size = min_t(int, copied, PAGE_SIZE - start);
+ if (!failed) {
+ buf.page = pages[n];
+ buf.offset = start;
+ buf.len = size;
+ ret = add_to_pipe(pipe, &buf);
+ if (unlikely(ret < 0)) {
+ failed = true;
+ } else {
+ iov_iter_advance(from, ret);
+ total += ret;
+ }
+ } else {
+ put_page(pages[n]);
+ }
+ copied -= size;
+ }
}
-
- if (buffers)
- return buffers;
-
- return error;
+ return total ? total : ret;
}
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
@@ -1590,38 +1299,36 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
* as splice-from-memory, where the regular splice is splice-from-file (or
* to file). In both cases the output is a pipe, naturally.
*/
-static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
+static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov,
unsigned long nr_segs, unsigned int flags)
{
struct pipe_inode_info *pipe;
- struct page *pages[PIPE_DEF_BUFFERS];
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct splice_pipe_desc spd = {
- .pages = pages,
- .partial = partial,
- .nr_pages_max = PIPE_DEF_BUFFERS,
- .flags = flags,
- .ops = &user_page_pipe_buf_ops,
- .spd_release = spd_release_page,
- };
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter from;
long ret;
+ unsigned buf_flag = 0;
+
+ if (flags & SPLICE_F_GIFT)
+ buf_flag = PIPE_BUF_FLAG_GIFT;
pipe = get_pipe_info(file);
if (!pipe)
return -EBADF;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
- spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
- spd.partial, false,
- spd.nr_pages_max);
- if (spd.nr_pages <= 0)
- ret = spd.nr_pages;
- else
- ret = splice_to_pipe(pipe, &spd);
+ ret = import_iovec(WRITE, uiov, nr_segs,
+ ARRAY_SIZE(iovstack), &iov, &from);
+ if (ret < 0)
+ return ret;
- splice_shrink_spd(&spd);
+ pipe_lock(pipe);
+ ret = wait_for_space(pipe, flags);
+ if (!ret)
+ ret = iter_to_pipe(&from, pipe, buf_flag);
+ pipe_unlock(pipe);
+ if (ret > 0)
+ wakeup_pipe_readers(pipe);
+ kfree(iov);
return ret;
}
@@ -1876,7 +1583,7 @@ retry:
* Get a reference to this pipe buffer,
* so we can copy the contents over.
*/
- ibuf->ops->get(ipipe, ibuf);
+ pipe_buf_get(ipipe, ibuf);
*obuf = *ibuf;
/*
@@ -1948,7 +1655,7 @@ static int link_pipe(struct pipe_inode_info *ipipe,
* Get a reference to this pipe buffer,
* so we can copy the contents over.
*/
- ibuf->ops->get(ipipe, ibuf);
+ pipe_buf_get(ipipe, ibuf);
obuf = opipe->bufs + nbuf;
*obuf = *ibuf;
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 0927b1e80ab6..e9793b1e49a5 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -425,7 +425,6 @@ failed_read:
const struct inode_operations squashfs_inode_ops = {
- .getxattr = generic_getxattr,
.listxattr = squashfs_listxattr
};
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 67cad77fefb4..40c10d9974c9 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -247,6 +247,5 @@ failed:
const struct inode_operations squashfs_dir_inode_ops = {
.lookup = squashfs_lookup,
- .getxattr = generic_getxattr,
.listxattr = squashfs_listxattr
};
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index d688ef42a6a1..79b9c31a0c8f 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -120,7 +120,6 @@ const struct address_space_operations squashfs_symlink_aops = {
const struct inode_operations squashfs_symlink_inode_ops = {
.readlink = generic_readlink,
.get_link = page_get_link,
- .getxattr = generic_getxattr,
.listxattr = squashfs_listxattr
};
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index c83f5d9ec125..afe70f815e3d 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -42,6 +42,5 @@ static inline int squashfs_xattr_lookup(struct super_block *sb,
return 0;
}
#define squashfs_listxattr NULL
-#define generic_getxattr NULL
#define squashfs_xattr_handlers NULL
#endif
diff --git a/fs/super.c b/fs/super.c
index c2ff475c1711..c183835566c1 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1269,25 +1269,34 @@ EXPORT_SYMBOL(__sb_start_write);
static void sb_wait_write(struct super_block *sb, int level)
{
percpu_down_write(sb->s_writers.rw_sem + level-1);
- /*
- * We are going to return to userspace and forget about this lock, the
- * ownership goes to the caller of thaw_super() which does unlock.
- *
- * FIXME: we should do this before return from freeze_super() after we
- * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super()
- * should re-acquire these locks before s_op->unfreeze_fs(sb). However
- * this leads to lockdep false-positives, so currently we do the early
- * release right after acquire.
- */
- percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
}
-static void sb_freeze_unlock(struct super_block *sb)
+/*
+ * We are going to return to userspace and forget about these locks, the
+ * ownership goes to the caller of thaw_super() which does unlock().
+ */
+static void lockdep_sb_freeze_release(struct super_block *sb)
+{
+ int level;
+
+ for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+ percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
+}
+
+/*
+ * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb).
+ */
+static void lockdep_sb_freeze_acquire(struct super_block *sb)
{
int level;
for (level = 0; level < SB_FREEZE_LEVELS; ++level)
percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
+}
+
+static void sb_freeze_unlock(struct super_block *sb)
+{
+ int level;
for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
percpu_up_write(sb->s_writers.rw_sem + level);
@@ -1379,10 +1388,11 @@ int freeze_super(struct super_block *sb)
}
}
/*
- * This is just for debugging purposes so that fs can warn if it
- * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
+ * For debugging purposes so that fs can warn if it sees write activity
+ * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
*/
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
+ lockdep_sb_freeze_release(sb);
up_write(&sb->s_umount);
return 0;
}
@@ -1399,7 +1409,7 @@ int thaw_super(struct super_block *sb)
int error;
down_write(&sb->s_umount);
- if (sb->s_writers.frozen == SB_UNFROZEN) {
+ if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
up_write(&sb->s_umount);
return -EINVAL;
}
@@ -1409,11 +1419,14 @@ int thaw_super(struct super_block *sb)
goto out;
}
+ lockdep_sb_freeze_acquire(sb);
+
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
if (error) {
printk(KERN_ERR
"VFS:Filesystem thaw failed\n");
+ lockdep_sb_freeze_release(sb);
up_write(&sb->s_umount);
return error;
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 94374e435025..2b67bda2021b 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -21,14 +21,14 @@ DEFINE_SPINLOCK(sysfs_symlink_target_lock);
void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
{
- char *buf, *path = NULL;
+ char *buf;
buf = kzalloc(PATH_MAX, GFP_KERNEL);
if (buf)
- path = kernfs_path(parent, buf, PATH_MAX);
+ kernfs_path(parent, buf, PATH_MAX);
WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n",
- path, name);
+ buf, name);
kfree(buf);
}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index dc1358b5ec95..ac2de0ed69ad 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -233,8 +233,8 @@ void sysfs_remove_group(struct kobject *kobj,
kn = kernfs_find_and_get(parent, grp->name);
if (!kn) {
WARN(!kn, KERN_WARNING
- "sysfs group %p not found for kobject '%s'\n",
- grp, kobject_name(kobj));
+ "sysfs group '%s' not found for kobject '%s'\n",
+ grp->name, kobject_name(kobj));
return;
}
} else {
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 2661b77fc8a7..5bdae85ceef7 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -215,7 +215,7 @@ got_it:
memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
out_page:
dir_put_page(page);
@@ -239,7 +239,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
de->inode = 0;
err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
dir_put_page(page);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
mark_inode_dirty(inode);
return err;
}
@@ -337,7 +337,7 @@ void sysv_set_link(struct sysv_dir_entry *de, struct page *page,
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
dir_put_page(page);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
}
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 82ddc09061e2..7ba997e31aeb 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -33,7 +33,7 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *inode = d_inode(dentry);
int error;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index f9db4eb31db4..53f1b78996dd 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -164,7 +164,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
dirty_sb(sb);
inode_init_owner(inode, dir, mode);
inode->i_ino = fs16_to_cpu(sbi, ino);
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_blocks = 0;
memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
SYSV_I(inode)->i_dir_start_lookup = 0;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 2fde40acf024..08d3e630b49c 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -178,7 +178,7 @@ static inline int splice_branch(struct inode *inode,
*where->p = where->key;
write_unlock(&pointers_lock);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
/* had we spliced it onto indirect block? */
if (where->bh)
@@ -418,7 +418,7 @@ do_indirects:
}
n++;
}
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
if (IS_SYNC(inode))
sysv_sync_inode (inode);
else
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index a42de45ce40d..d8817f139763 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -120,7 +120,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -206,7 +206,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
* higher-level routines.
*/
static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
- struct inode * new_dir, struct dentry * new_dentry)
+ struct inode * new_dir, struct dentry * new_dentry,
+ unsigned int flags)
{
struct inode * old_inode = d_inode(old_dentry);
struct inode * new_inode = d_inode(new_dentry);
@@ -216,6 +217,9 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
struct sysv_dir_entry * old_de;
int err = -ENOENT;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
old_de = sysv_find_entry(old_dentry, &old_page);
if (!old_de)
goto out;
@@ -240,7 +244,7 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
if (!new_de)
goto out_dir;
sysv_set_link(new_de, new_page, old_inode);
- new_inode->i_ctime = CURRENT_TIME_SEC;
+ new_inode->i_ctime = current_time(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index ad40b64c5e2f..21d36d284735 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -133,7 +133,7 @@ static struct inode *tracefs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
}
return inode;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 4b86d3a738e1..bd4a5e8ce441 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -301,6 +301,95 @@ out_budg:
return err;
}
+static int do_tmpfile(struct inode *dir, struct dentry *dentry,
+ umode_t mode, struct inode **whiteout)
+{
+ struct inode *inode;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1};
+ struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
+ struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir);
+ int err, instantiated = 0;
+
+ /*
+ * Budget request settings: new dirty inode, new direntry,
+ * budget for dirtied inode will be released via writeback.
+ */
+
+ dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
+ dentry, mode, dir->i_ino);
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ err = ubifs_budget_space(c, &ino_req);
+ if (err) {
+ ubifs_release_budget(c, &req);
+ return err;
+ }
+
+ inode = ubifs_new_inode(c, dir, mode);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_budg;
+ }
+ ui = ubifs_inode(inode);
+
+ if (whiteout) {
+ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
+ ubifs_assert(inode->i_op == &ubifs_file_inode_operations);
+ }
+
+ err = ubifs_init_security(dir, inode, &dentry->d_name);
+ if (err)
+ goto out_inode;
+
+ mutex_lock(&ui->ui_mutex);
+ insert_inode_hash(inode);
+
+ if (whiteout) {
+ mark_inode_dirty(inode);
+ drop_nlink(inode);
+ *whiteout = inode;
+ } else {
+ d_tmpfile(dentry, inode);
+ }
+ ubifs_assert(ui->dirty);
+
+ instantiated = 1;
+ mutex_unlock(&ui->ui_mutex);
+
+ mutex_lock(&dir_ui->ui_mutex);
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ if (err)
+ goto out_cancel;
+ mutex_unlock(&dir_ui->ui_mutex);
+
+ ubifs_release_budget(c, &req);
+
+ return 0;
+
+out_cancel:
+ mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
+ make_bad_inode(inode);
+ if (!instantiated)
+ iput(inode);
+out_budg:
+ ubifs_release_budget(c, &req);
+ if (!instantiated)
+ ubifs_release_budget(c, &ino_req);
+ ubifs_err(c, "cannot create temporary file, error %d", err);
+ return err;
+}
+
+static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry,
+ umode_t mode)
+{
+ return do_tmpfile(dir, dentry, mode, NULL);
+}
+
/**
* vfs_dent_type - get VFS directory entry type.
* @type: UBIFS directory entry type
@@ -350,7 +439,7 @@ static unsigned int vfs_dent_type(uint8_t type)
*/
static int ubifs_readdir(struct file *file, struct dir_context *ctx)
{
- int err;
+ int err = 0;
struct qstr nm;
union ubifs_key key;
struct ubifs_dent_node *dent;
@@ -452,14 +541,12 @@ out:
kfree(file->private_data);
file->private_data = NULL;
- if (err != -ENOENT) {
+ if (err != -ENOENT)
ubifs_err(c, "cannot find next direntry, error %d", err);
- return err;
- }
/* 2 is a special value indicating that there are no more direntries */
ctx->pos = 2;
- return 0;
+ return err;
}
/* Free saved readdir() state when the directory is closed */
@@ -927,37 +1014,43 @@ out_budg:
}
/**
- * lock_3_inodes - a wrapper for locking three UBIFS inodes.
+ * lock_4_inodes - a wrapper for locking three UBIFS inodes.
* @inode1: first inode
* @inode2: second inode
* @inode3: third inode
+ * @inode4: fouth inode
*
* This function is used for 'ubifs_rename()' and @inode1 may be the same as
- * @inode2 whereas @inode3 may be %NULL.
+ * @inode2 whereas @inode3 and @inode4 may be %NULL.
*
* We do not implement any tricks to guarantee strict lock ordering, because
* VFS has already done it for us on the @i_mutex. So this is just a simple
* wrapper function.
*/
-static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
- struct inode *inode3)
+static void lock_4_inodes(struct inode *inode1, struct inode *inode2,
+ struct inode *inode3, struct inode *inode4)
{
mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
if (inode2 != inode1)
mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
if (inode3)
mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
+ if (inode4)
+ mutex_lock_nested(&ubifs_inode(inode4)->ui_mutex, WB_MUTEX_4);
}
/**
- * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
+ * unlock_4_inodes - a wrapper for unlocking three UBIFS inodes for rename.
* @inode1: first inode
* @inode2: second inode
* @inode3: third inode
+ * @inode4: fouth inode
*/
-static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
- struct inode *inode3)
+static void unlock_4_inodes(struct inode *inode1, struct inode *inode2,
+ struct inode *inode3, struct inode *inode4)
{
+ if (inode4)
+ mutex_unlock(&ubifs_inode(inode4)->ui_mutex);
if (inode3)
mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
if (inode1 != inode2)
@@ -965,13 +1058,16 @@ static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
}
-static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct ubifs_info *c = old_dir->i_sb->s_fs_info;
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
+ struct inode *whiteout = NULL;
struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode);
+ struct ubifs_inode *whiteout_ui = NULL;
int err, release, sync = 0, move = (new_dir != old_dir);
int is_dir = S_ISDIR(old_inode->i_mode);
int unlink = !!new_inode;
@@ -984,6 +1080,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct timespec time;
unsigned int uninitialized_var(saved_nlink);
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
/*
* Budget request settings: deletion direntry, new direntry, removing
* the old inode, and changing old and new parent directory inodes.
@@ -993,15 +1092,13 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
* separately.
*/
- dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu",
+ dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x",
old_dentry, old_inode->i_ino, old_dir->i_ino,
- new_dentry, new_dir->i_ino);
- ubifs_assert(inode_is_locked(old_dir));
- ubifs_assert(inode_is_locked(new_dir));
+ new_dentry, new_dir->i_ino, flags);
+
if (unlink)
ubifs_assert(inode_is_locked(new_inode));
-
if (unlink && is_dir) {
err = check_dir_empty(c, new_inode);
if (err)
@@ -1017,7 +1114,32 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
return err;
}
- lock_3_inodes(old_dir, new_dir, new_inode);
+ if (flags & RENAME_WHITEOUT) {
+ union ubifs_dev_desc *dev = NULL;
+
+ dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
+ if (!dev) {
+ ubifs_release_budget(c, &req);
+ ubifs_release_budget(c, &ino_req);
+ return -ENOMEM;
+ }
+
+ err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout);
+ if (err) {
+ ubifs_release_budget(c, &req);
+ ubifs_release_budget(c, &ino_req);
+ kfree(dev);
+ return err;
+ }
+
+ whiteout->i_state |= I_LINKABLE;
+ whiteout_ui = ubifs_inode(whiteout);
+ whiteout_ui->data = dev;
+ whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0));
+ ubifs_assert(!whiteout_ui->dirty);
+ }
+
+ lock_4_inodes(old_dir, new_dir, new_inode, whiteout);
/*
* Like most other Unix systems, set the @i_ctime for inodes on a
@@ -1087,12 +1209,34 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlink && IS_SYNC(new_inode))
sync = 1;
}
- err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry,
+
+ if (whiteout) {
+ struct ubifs_budget_req wht_req = { .dirtied_ino = 1,
+ .dirtied_ino_d = \
+ ALIGN(ubifs_inode(whiteout)->data_len, 8) };
+
+ err = ubifs_budget_space(c, &wht_req);
+ if (err) {
+ ubifs_release_budget(c, &req);
+ ubifs_release_budget(c, &ino_req);
+ kfree(whiteout_ui->data);
+ whiteout_ui->data_len = 0;
+ iput(whiteout);
+ return err;
+ }
+
+ inc_nlink(whiteout);
+ mark_inode_dirty(whiteout);
+ whiteout->i_state &= ~I_LINKABLE;
+ iput(whiteout);
+ }
+
+ err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, whiteout,
sync);
if (err)
goto out_cancel;
- unlock_3_inodes(old_dir, new_dir, new_inode);
+ unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
ubifs_release_budget(c, &req);
mutex_lock(&old_inode_ui->ui_mutex);
@@ -1125,12 +1269,74 @@ out_cancel:
inc_nlink(old_dir);
}
}
- unlock_3_inodes(old_dir, new_dir, new_inode);
+ if (whiteout) {
+ drop_nlink(whiteout);
+ iput(whiteout);
+ }
+ unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
ubifs_release_budget(c, &ino_req);
ubifs_release_budget(c, &req);
return err;
}
+static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct ubifs_info *c = old_dir->i_sb->s_fs_info;
+ struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
+ .dirtied_ino = 2 };
+ int sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
+ struct inode *fst_inode = d_inode(old_dentry);
+ struct inode *snd_inode = d_inode(new_dentry);
+ struct timespec time;
+ int err;
+
+ ubifs_assert(fst_inode && snd_inode);
+
+ lock_4_inodes(old_dir, new_dir, NULL, NULL);
+
+ time = ubifs_current_time(old_dir);
+ fst_inode->i_ctime = time;
+ snd_inode->i_ctime = time;
+ old_dir->i_mtime = old_dir->i_ctime = time;
+ new_dir->i_mtime = new_dir->i_ctime = time;
+
+ if (old_dir != new_dir) {
+ if (S_ISDIR(fst_inode->i_mode) && !S_ISDIR(snd_inode->i_mode)) {
+ inc_nlink(new_dir);
+ drop_nlink(old_dir);
+ }
+ else if (!S_ISDIR(fst_inode->i_mode) && S_ISDIR(snd_inode->i_mode)) {
+ drop_nlink(new_dir);
+ inc_nlink(old_dir);
+ }
+ }
+
+ err = ubifs_jnl_xrename(c, old_dir, old_dentry, new_dir, new_dentry,
+ sync);
+
+ unlock_4_inodes(old_dir, new_dir, NULL, NULL);
+ ubifs_release_budget(c, &req);
+
+ return err;
+}
+
+static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ if (flags & ~(RENAME_NOREPLACE | RENAME_WHITEOUT | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ ubifs_assert(inode_is_locked(old_dir));
+ ubifs_assert(inode_is_locked(new_dir));
+
+ if (flags & RENAME_EXCHANGE)
+ return ubifs_xrename(old_dir, old_dentry, new_dir, new_dentry);
+
+ return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+}
+
int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
@@ -1182,13 +1388,11 @@ const struct inode_operations ubifs_dir_inode_operations = {
.rename = ubifs_rename,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ubifs_listxattr,
- .removexattr = generic_removexattr,
#ifdef CONFIG_UBIFS_ATIME_SUPPORT
.update_time = ubifs_update_time,
#endif
+ .tmpfile = ubifs_tmpfile,
};
const struct file_operations ubifs_dir_operations = {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 7bbf420d1289..b4fbeefba246 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1262,7 +1262,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
dbg_gen("ino %lu, mode %#x, ia_valid %#x",
inode->i_ino, inode->i_mode, attr->ia_valid);
- err = inode_change_ok(inode, attr);
+ err = setattr_prepare(dentry, attr);
if (err)
return err;
@@ -1397,7 +1397,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time,
#endif
/**
- * update_ctime - update mtime and ctime of an inode.
+ * update_mctime - update mtime and ctime of an inode.
* @inode: inode to update
*
* This function updates mtime and ctime of the inode if it is not equivalent to
@@ -1621,10 +1621,7 @@ const struct address_space_operations ubifs_file_address_operations = {
const struct inode_operations ubifs_file_inode_operations = {
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ubifs_listxattr,
- .removexattr = generic_removexattr,
#ifdef CONFIG_UBIFS_ATIME_SUPPORT
.update_time = ubifs_update_time,
#endif
@@ -1635,10 +1632,7 @@ const struct inode_operations ubifs_symlink_inode_operations = {
.get_link = simple_get_link,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ubifs_listxattr,
- .removexattr = generic_removexattr,
#ifdef CONFIG_UBIFS_ATIME_SUPPORT
.update_time = ubifs_update_time,
#endif
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 821b34816976..e845c64b6ce1 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -113,7 +113,7 @@ static int switch_gc_head(struct ubifs_info *c)
* data_nodes_cmp - compare 2 data nodes.
* @priv: UBIFS file-system description object
* @a: first data node
- * @a: second data node
+ * @b: second data node
*
* This function compares data nodes @a and @b. Returns %1 if @a has greater
* inode or block number, and %-1 otherwise.
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 0b9da5b6e0f9..91bc76dc559e 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -908,6 +908,147 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
}
/**
+ * ubifs_jnl_xrename - cross rename two directory entries.
+ * @c: UBIFS file-system description object
+ * @fst_dir: parent inode of 1st directory entry to exchange
+ * @fst_dentry: 1st directory entry to exchange
+ * @snd_dir: parent inode of 2nd directory entry to exchange
+ * @snd_dentry: 2nd directory entry to exchange
+ * @sync: non-zero if the write-buffer has to be synchronized
+ *
+ * This function implements the cross rename operation which may involve
+ * writing 2 inodes and 2 directory entries. It marks the written inodes as clean
+ * and returns zero on success. In case of failure, a negative error code is
+ * returned.
+ */
+int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
+ const struct dentry *fst_dentry,
+ const struct inode *snd_dir,
+ const struct dentry *snd_dentry, int sync)
+{
+ union ubifs_key key;
+ struct ubifs_dent_node *dent1, *dent2;
+ int err, dlen1, dlen2, lnum, offs, len, plen = UBIFS_INO_NODE_SZ;
+ int aligned_dlen1, aligned_dlen2;
+ int twoparents = (fst_dir != snd_dir);
+ const struct inode *fst_inode = d_inode(fst_dentry);
+ const struct inode *snd_inode = d_inode(snd_dentry);
+ void *p;
+
+ dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu",
+ fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino);
+
+ ubifs_assert(ubifs_inode(fst_dir)->data_len == 0);
+ ubifs_assert(ubifs_inode(snd_dir)->data_len == 0);
+ ubifs_assert(mutex_is_locked(&ubifs_inode(fst_dir)->ui_mutex));
+ ubifs_assert(mutex_is_locked(&ubifs_inode(snd_dir)->ui_mutex));
+
+ dlen1 = UBIFS_DENT_NODE_SZ + snd_dentry->d_name.len + 1;
+ dlen2 = UBIFS_DENT_NODE_SZ + fst_dentry->d_name.len + 1;
+ aligned_dlen1 = ALIGN(dlen1, 8);
+ aligned_dlen2 = ALIGN(dlen2, 8);
+
+ len = aligned_dlen1 + aligned_dlen2 + ALIGN(plen, 8);
+ if (twoparents)
+ len += plen;
+
+ dent1 = kmalloc(len, GFP_NOFS);
+ if (!dent1)
+ return -ENOMEM;
+
+ /* Make reservation before allocating sequence numbers */
+ err = make_reservation(c, BASEHD, len);
+ if (err)
+ goto out_free;
+
+ /* Make new dent for 1st entry */
+ dent1->ch.node_type = UBIFS_DENT_NODE;
+ dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, &snd_dentry->d_name);
+ dent1->inum = cpu_to_le64(fst_inode->i_ino);
+ dent1->type = get_dent_type(fst_inode->i_mode);
+ dent1->nlen = cpu_to_le16(snd_dentry->d_name.len);
+ memcpy(dent1->name, snd_dentry->d_name.name, snd_dentry->d_name.len);
+ dent1->name[snd_dentry->d_name.len] = '\0';
+ zero_dent_node_unused(dent1);
+ ubifs_prep_grp_node(c, dent1, dlen1, 0);
+
+ /* Make new dent for 2nd entry */
+ dent2 = (void *)dent1 + aligned_dlen1;
+ dent2->ch.node_type = UBIFS_DENT_NODE;
+ dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, &fst_dentry->d_name);
+ dent2->inum = cpu_to_le64(snd_inode->i_ino);
+ dent2->type = get_dent_type(snd_inode->i_mode);
+ dent2->nlen = cpu_to_le16(fst_dentry->d_name.len);
+ memcpy(dent2->name, fst_dentry->d_name.name, fst_dentry->d_name.len);
+ dent2->name[fst_dentry->d_name.len] = '\0';
+ zero_dent_node_unused(dent2);
+ ubifs_prep_grp_node(c, dent2, dlen2, 0);
+
+ p = (void *)dent2 + aligned_dlen2;
+ if (!twoparents)
+ pack_inode(c, p, fst_dir, 1);
+ else {
+ pack_inode(c, p, fst_dir, 0);
+ p += ALIGN(plen, 8);
+ pack_inode(c, p, snd_dir, 1);
+ }
+
+ err = write_head(c, BASEHD, dent1, len, &lnum, &offs, sync);
+ if (err)
+ goto out_release;
+ if (!sync) {
+ struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
+
+ ubifs_wbuf_add_ino_nolock(wbuf, fst_dir->i_ino);
+ ubifs_wbuf_add_ino_nolock(wbuf, snd_dir->i_ino);
+ }
+ release_head(c, BASEHD);
+
+ dent_key_init(c, &key, snd_dir->i_ino, &snd_dentry->d_name);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &snd_dentry->d_name);
+ if (err)
+ goto out_ro;
+
+ offs += aligned_dlen1;
+ dent_key_init(c, &key, fst_dir->i_ino, &fst_dentry->d_name);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &fst_dentry->d_name);
+ if (err)
+ goto out_ro;
+
+ offs += aligned_dlen2;
+
+ ino_key_init(c, &key, fst_dir->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+ if (err)
+ goto out_ro;
+
+ if (twoparents) {
+ offs += ALIGN(plen, 8);
+ ino_key_init(c, &key, snd_dir->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+ if (err)
+ goto out_ro;
+ }
+
+ finish_reservation(c);
+
+ mark_inode_clean(c, ubifs_inode(fst_dir));
+ if (twoparents)
+ mark_inode_clean(c, ubifs_inode(snd_dir));
+ kfree(dent1);
+ return 0;
+
+out_release:
+ release_head(c, BASEHD);
+out_ro:
+ ubifs_ro_mode(c, err);
+ finish_reservation(c);
+out_free:
+ kfree(dent1);
+ return err;
+}
+
+/**
* ubifs_jnl_rename - rename a directory entry.
* @c: UBIFS file-system description object
* @old_dir: parent inode of directory entry to rename
@@ -917,14 +1058,15 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
* @sync: non-zero if the write-buffer has to be synchronized
*
* This function implements the re-name operation which may involve writing up
- * to 3 inodes and 2 directory entries. It marks the written inodes as clean
+ * to 4 inodes and 2 directory entries. It marks the written inodes as clean
* and returns zero on success. In case of failure, a negative error code is
* returned.
*/
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
const struct dentry *old_dentry,
const struct inode *new_dir,
- const struct dentry *new_dentry, int sync)
+ const struct dentry *new_dentry,
+ const struct inode *whiteout, int sync)
{
void *p;
union ubifs_key key;
@@ -958,7 +1100,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
aligned_dlen1 = ALIGN(dlen1, 8);
aligned_dlen2 = ALIGN(dlen2, 8);
len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
- if (old_dir != new_dir)
+ if (move)
len += plen;
dent = kmalloc(len, GFP_NOFS);
if (!dent)
@@ -980,13 +1122,19 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
zero_dent_node_unused(dent);
ubifs_prep_grp_node(c, dent, dlen1, 0);
- /* Make deletion dent */
dent2 = (void *)dent + aligned_dlen1;
dent2->ch.node_type = UBIFS_DENT_NODE;
dent_key_init_flash(c, &dent2->key, old_dir->i_ino,
&old_dentry->d_name);
- dent2->inum = 0;
- dent2->type = DT_UNKNOWN;
+
+ if (whiteout) {
+ dent2->inum = cpu_to_le64(whiteout->i_ino);
+ dent2->type = get_dent_type(whiteout->i_mode);
+ } else {
+ /* Make deletion dent */
+ dent2->inum = 0;
+ dent2->type = DT_UNKNOWN;
+ }
dent2->nlen = cpu_to_le16(old_dentry->d_name.len);
memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len);
dent2->name[old_dentry->d_name.len] = '\0';
@@ -1035,16 +1183,26 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
if (err)
goto out_ro;
- err = ubifs_add_dirt(c, lnum, dlen2);
- if (err)
- goto out_ro;
+ offs += aligned_dlen1;
+ if (whiteout) {
+ dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &old_dentry->d_name);
+ if (err)
+ goto out_ro;
- dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
- err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
- if (err)
- goto out_ro;
+ ubifs_delete_orphan(c, whiteout->i_ino);
+ } else {
+ err = ubifs_add_dirt(c, lnum, dlen2);
+ if (err)
+ goto out_ro;
+
+ dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
+ err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
+ if (err)
+ goto out_ro;
+ }
- offs += aligned_dlen1 + aligned_dlen2;
+ offs += aligned_dlen2;
if (new_inode) {
ino_key_init(c, &key, new_inode->i_ino);
err = ubifs_tnc_add(c, &key, lnum, offs, ilen);
@@ -1058,7 +1216,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
if (err)
goto out_ro;
- if (old_dir != new_dir) {
+ if (move) {
offs += ALIGN(plen, 8);
ino_key_init(c, &key, new_dir->i_ino);
err = ubifs_tnc_add(c, &key, lnum, offs, plen);
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index a0011aa3a779..6c3a1abd0e22 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -636,7 +636,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
/**
* ubifs_get_lp_stats - get lprops statistics.
* @c: UBIFS file-system description object
- * @st: return statistics
+ * @lst: return statistics
*/
void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst)
{
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index ce89bdc3eb02..235654c2fe89 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -34,7 +34,6 @@ static int dbg_populate_lsave(struct ubifs_info *c);
/**
* first_dirty_cnode - find first dirty cnode.
- * @c: UBIFS file-system description object
* @nnode: nnode at which to start
*
* This function returns the first dirty cnode or %NULL if there is not one.
@@ -1623,7 +1622,6 @@ static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum,
* dbg_check_ltab_lnum - check the ltab for a LPT LEB number.
* @c: the UBIFS file-system description object
* @lnum: LEB number where node was written
- * @offs: offset where node was written
*
* This function returns %0 on success and a negative error code on failure.
*/
@@ -1870,7 +1868,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
}
/**
- * ubifs_dump_lpt_leb - dump an LPT LEB.
+ * dump_lpt_leb - dump an LPT LEB.
* @c: UBIFS file-system description object
* @lnum: LEB number to dump
*
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 3ca4540130b5..fb0f44cd1e28 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -267,7 +267,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
* replay_entries_cmp - compare 2 replay entries.
* @priv: UBIFS file-system description object
* @a: first replay entry
- * @a: second replay entry
+ * @b: second replay entry
*
* This is a comparios function for 'list_sort()' which compares 2 replay
* entries @a and @b by comparing their sequence numer. Returns %1 if @a has
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4617d459022a..096035eb29d0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -157,6 +157,7 @@ enum {
WB_MUTEX_1 = 0,
WB_MUTEX_2 = 1,
WB_MUTEX_3 = 2,
+ WB_MUTEX_4 = 3,
};
/*
@@ -1520,10 +1521,15 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
const union ubifs_key *key, const void *buf, int len);
int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);
+int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
+ const struct dentry *fst_dentry,
+ const struct inode *snd_dir,
+ const struct dentry *snd_dentry, int sync);
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
const struct dentry *old_dentry,
const struct inode *new_dir,
- const struct dentry *new_dentry, int sync);
+ const struct dentry *new_dentry,
+ const struct inode *whiteout, int sync);
int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
loff_t old_size, loff_t new_size);
int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 11a004114eba..d9f9615bfd71 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -172,6 +172,7 @@ out_cancel:
host_ui->xattr_cnt -= 1;
host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
host_ui->xattr_size -= CALC_XATTR_BYTES(size);
+ host_ui->xattr_names -= nm->len;
mutex_unlock(&host_ui->ui_mutex);
out_free:
make_bad_inode(inode);
@@ -200,6 +201,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
struct ubifs_inode *host_ui = ubifs_inode(host);
struct ubifs_inode *ui = ubifs_inode(inode);
void *buf = NULL;
+ int old_size;
struct ubifs_budget_req req = { .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
@@ -217,12 +219,13 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
kfree(ui->data);
ui->data = buf;
inode->i_size = ui->ui_size = size;
+ old_size = ui->data_len;
ui->data_len = size;
mutex_unlock(&ui->ui_mutex);
mutex_lock(&host_ui->ui_mutex);
host->i_ctime = ubifs_current_time(host);
- host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
+ host_ui->xattr_size -= CALC_XATTR_BYTES(old_size);
host_ui->xattr_size += CALC_XATTR_BYTES(size);
/*
@@ -241,7 +244,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
out_cancel:
host_ui->xattr_size -= CALC_XATTR_BYTES(size);
- host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
+ host_ui->xattr_size += CALC_XATTR_BYTES(old_size);
mutex_unlock(&host_ui->ui_mutex);
make_bad_inode(inode);
out_free:
@@ -476,6 +479,7 @@ out_cancel:
host_ui->xattr_cnt += 1;
host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
+ host_ui->xattr_names += nm->len;
mutex_unlock(&host_ui->ui_mutex);
ubifs_release_budget(c, &req);
make_bad_inode(inode);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 632570617327..dbcb3a4a0cb9 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -94,7 +94,7 @@ static int udf_adinicb_write_begin(struct file *file,
return -ENOMEM;
*pagep = page;
- if (!PageUptodate(page) && len != PAGE_SIZE)
+ if (!PageUptodate(page))
__udf_adinicb_readpage(page);
return 0;
}
@@ -105,11 +105,25 @@ static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return 0;
}
+static int udf_adinicb_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t last_pos = pos + copied;
+ if (last_pos > inode->i_size)
+ i_size_write(inode, last_pos);
+ set_page_dirty(page);
+ unlock_page(page);
+ put_page(page);
+ return copied;
+}
+
const struct address_space_operations udf_adinicb_aops = {
.readpage = udf_adinicb_readpage,
.writepage = udf_adinicb_writepage,
.write_begin = udf_adinicb_write_begin,
- .write_end = simple_write_end,
+ .write_end = udf_adinicb_write_end,
.direct_IO = udf_adinicb_direct_IO,
};
@@ -247,7 +261,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *inode = d_inode(dentry);
int error;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index e77db621ec89..c1ed18a10ce4 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -121,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
inode->i_mtime = inode->i_atime = inode->i_ctime =
- iinfo->i_crtime = current_fs_time(inode->i_sb);
+ iinfo->i_crtime = current_time(inode);
if (unlikely(insert_inode_locked(inode) < 0)) {
make_bad_inode(inode);
iput(inode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 55aa587bbc38..aad46401ede5 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -886,7 +886,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
*new = 1;
iinfo->i_next_alloc_block = block;
iinfo->i_next_alloc_goal = newblocknum;
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
if (IS_SYNC(inode))
udf_sync_inode(inode);
@@ -1268,7 +1268,7 @@ set_size:
up_write(&iinfo->i_data_sem);
}
update_time:
- inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
if (IS_SYNC(inode))
udf_sync_inode(inode);
else
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index c3e5c9679371..2d65e280748b 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -616,7 +616,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
- dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
mark_inode_dirty(dir);
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
@@ -730,7 +730,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY;
udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
inc_nlink(dir);
- dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
mark_inode_dirty(dir);
unlock_new_inode(inode);
d_instantiate(dentry, inode);
@@ -845,7 +845,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_size = 0;
inode_dec_link_count(dir);
inode->i_ctime = dir->i_ctime = dir->i_mtime =
- current_fs_time(dir->i_sb);
+ current_time(inode);
mark_inode_dirty(dir);
end_rmdir:
@@ -888,7 +888,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
retval = udf_delete_entry(dir, fi, &fibh, &cfi);
if (retval)
goto end_unlink;
- dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
mark_inode_dirty(dir);
inode_dec_link_count(inode);
inode->i_ctime = dir->i_ctime;
@@ -1079,9 +1079,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
brelse(fibh.ebh);
brelse(fibh.sbh);
inc_nlink(inode);
- inode->i_ctime = current_fs_time(inode->i_sb);
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
- dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
mark_inode_dirty(dir);
ihold(inode);
d_instantiate(dentry, inode);
@@ -1093,7 +1093,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
* higher-level routines.
*/
static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
@@ -1105,6 +1106,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct kernel_lb_addr tloc;
struct udf_inode_info *old_iinfo = UDF_I(old_inode);
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
if (IS_ERR(ofi)) {
retval = PTR_ERR(ofi);
@@ -1172,7 +1176,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = current_fs_time(old_inode->i_sb);
+ old_inode->i_ctime = current_time(old_inode);
mark_inode_dirty(old_inode);
/*
@@ -1188,11 +1192,11 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
udf_delete_entry(old_dir, ofi, &ofibh, &ocfi);
if (new_inode) {
- new_inode->i_ctime = current_fs_time(new_inode->i_sb);
+ new_inode->i_ctime = current_time(new_inode);
inode_dec_link_count(new_inode);
}
- old_dir->i_ctime = old_dir->i_mtime = current_fs_time(old_dir->i_sb);
- new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb);
+ old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+ new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir);
mark_inode_dirty(old_dir);
mark_inode_dirty(new_dir);
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index fa3bda1a860f..de01b8f2aa78 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -100,7 +100,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
err = ufs_commit_chunk(page, pos, len);
ufs_put_page(page);
if (update_times)
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
}
@@ -389,7 +389,7 @@ got_it:
ufs_set_de_type(sb, de, inode->i_mode);
err = ufs_commit_chunk(page, pos, rec_len);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
/* OFFSET_CACHE */
@@ -530,7 +530,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
err = ufs_commit_chunk(page, pos, to - from);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
mark_inode_dirty(inode);
out:
ufs_put_page(page);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index fd0203ce1f7f..9774555b3721 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -290,7 +290,7 @@ cg_found:
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_generation = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
ufsi->i_flags = UFS_I(dir)->i_flags;
ufsi->i_lastfrag = 0;
ufsi->i_shadow = 0;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 9f49431e798d..190d64be22ed 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -293,7 +293,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
if (new)
*new = 1;
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
if (IS_SYNC(inode))
ufs_sync_inode (inode);
mark_inode_dirty(inode);
@@ -375,7 +375,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block,
mark_buffer_dirty(bh);
if (IS_SYNC(inode))
sync_dirty_buffer(bh);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
out:
brelse (bh);
@@ -1185,7 +1185,7 @@ static int ufs_truncate(struct inode *inode, loff_t size)
truncate_setsize(inode, size);
__ufs_truncate_blocks(inode);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
out:
UFSD("EXIT: err %d\n", err);
@@ -1208,7 +1208,7 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
unsigned int ia_valid = attr->ia_valid;
int error;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(dentry, attr);
if (error)
return error;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index a1559f762805..8eca4eda8450 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -153,7 +153,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
struct inode *inode = d_inode(old_dentry);
int error;
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -245,7 +245,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
}
static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
@@ -255,6 +256,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ufs_dir_entry *old_de;
int err = -ENOENT;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
@@ -279,7 +283,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!new_de)
goto out_dir;
ufs_set_link(new_dir, new_de, new_page, old_inode, 1);
- new_inode->i_ctime = CURRENT_TIME_SEC;
+ new_inode->i_ctime = current_time(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
@@ -295,7 +299,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = CURRENT_TIME_SEC;
+ old_inode->i_ctime = current_time(old_inode);
ufs_delete_entry(old_dir, old_de, old_page);
mark_inode_dirty(old_inode);
diff --git a/fs/utimes.c b/fs/utimes.c
index 794f5f5b1fb5..22307cdf7014 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -81,27 +81,13 @@ static int utimes_common(struct path *path, struct timespec *times)
newattrs.ia_valid |= ATTR_MTIME_SET;
}
/*
- * Tell inode_change_ok(), that this is an explicit time
+ * Tell setattr_prepare(), that this is an explicit time
* update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET
* were used.
*/
newattrs.ia_valid |= ATTR_TIMES_SET;
} else {
- /*
- * If times is NULL (or both times are UTIME_NOW),
- * then we need to check permissions, because
- * inode_change_ok() won't do it.
- */
- error = -EPERM;
- if (IS_IMMUTABLE(inode))
- goto mnt_drop_write_and_out;
-
- error = -EACCES;
- if (!inode_owner_or_capable(inode)) {
- error = inode_permission(inode, MAY_WRITE);
- if (error)
- goto mnt_drop_write_and_out;
- }
+ newattrs.ia_valid |= ATTR_TOUCH;
}
retry_deleg:
inode_lock(inode);
@@ -113,7 +99,6 @@ retry_deleg:
goto retry_deleg;
}
-mnt_drop_write_and_out:
mnt_drop_write(path->mnt);
out:
return error;
diff --git a/fs/xattr.c b/fs/xattr.c
index c243905835ab..3368659c471e 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -24,6 +24,59 @@
#include <asm/uaccess.h>
+static const char *
+strcmp_prefix(const char *a, const char *a_prefix)
+{
+ while (*a_prefix && *a == *a_prefix) {
+ a++;
+ a_prefix++;
+ }
+ return *a_prefix ? NULL : a;
+}
+
+/*
+ * In order to implement different sets of xattr operations for each xattr
+ * prefix, a filesystem should create a null-terminated array of struct
+ * xattr_handler (one for each prefix) and hang a pointer to it off of the
+ * s_xattr field of the superblock.
+ */
+#define for_each_xattr_handler(handlers, handler) \
+ if (handlers) \
+ for ((handler) = *(handlers)++; \
+ (handler) != NULL; \
+ (handler) = *(handlers)++)
+
+/*
+ * Find the xattr_handler with the matching prefix.
+ */
+static const struct xattr_handler *
+xattr_resolve_name(struct inode *inode, const char **name)
+{
+ const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+ const struct xattr_handler *handler;
+
+ if (!(inode->i_opflags & IOP_XATTR)) {
+ if (unlikely(is_bad_inode(inode)))
+ return ERR_PTR(-EIO);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ for_each_xattr_handler(handlers, handler) {
+ const char *n;
+
+ n = strcmp_prefix(*name, xattr_prefix(handler));
+ if (n) {
+ if (!handler->prefix ^ !*n) {
+ if (*n)
+ continue;
+ return ERR_PTR(-EINVAL);
+ }
+ *name = n;
+ return handler;
+ }
+ }
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
/*
* Check permissions for extended attribute access. This is a bit complicated
* because different namespaces have very different rules.
@@ -80,6 +133,23 @@ xattr_permission(struct inode *inode, const char *name, int mask)
return inode_permission(inode, mask);
}
+int
+__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ const struct xattr_handler *handler;
+
+ handler = xattr_resolve_name(inode, &name);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ if (!handler->set)
+ return -EOPNOTSUPP;
+ if (size == 0)
+ value = ""; /* empty EA, do not remove */
+ return handler->set(handler, dentry, inode, name, value, size, flags);
+}
+EXPORT_SYMBOL(__vfs_setxattr);
+
/**
* __vfs_setxattr_noperm - perform setxattr operation without performing
* permission checks.
@@ -106,8 +176,8 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
if (issec)
inode->i_flags &= ~S_NOSEC;
- if (inode->i_op->setxattr) {
- error = inode->i_op->setxattr(dentry, inode, name, value, size, flags);
+ if (inode->i_opflags & IOP_XATTR) {
+ error = __vfs_setxattr(dentry, inode, name, value, size, flags);
if (!error) {
fsnotify_xattr(dentry);
security_inode_post_setxattr(dentry, name, value,
@@ -115,6 +185,9 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
}
} else if (issec) {
const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+
+ if (unlikely(is_bad_inode(inode)))
+ return -EIO;
error = security_inode_setsecurity(inode, suffix, value,
size, flags);
if (!error)
@@ -188,6 +261,7 @@ ssize_t
vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
size_t xattr_size, gfp_t flags)
{
+ const struct xattr_handler *handler;
struct inode *inode = dentry->d_inode;
char *value = *xattr_value;
int error;
@@ -196,10 +270,12 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
if (error)
return error;
- if (!inode->i_op->getxattr)
+ handler = xattr_resolve_name(inode, &name);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ if (!handler->get)
return -EOPNOTSUPP;
-
- error = inode->i_op->getxattr(dentry, inode, name, NULL, 0);
+ error = handler->get(handler, dentry, inode, name, NULL, 0);
if (error < 0)
return error;
@@ -210,12 +286,27 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
memset(value, 0, error + 1);
}
- error = inode->i_op->getxattr(dentry, inode, name, value, error);
+ error = handler->get(handler, dentry, inode, name, value, error);
*xattr_value = value;
return error;
}
ssize_t
+__vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name,
+ void *value, size_t size)
+{
+ const struct xattr_handler *handler;
+
+ handler = xattr_resolve_name(inode, &name);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ if (!handler->get)
+ return -EOPNOTSUPP;
+ return handler->get(handler, dentry, inode, name, value, size);
+}
+EXPORT_SYMBOL(__vfs_getxattr);
+
+ssize_t
vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
{
struct inode *inode = dentry->d_inode;
@@ -242,28 +333,24 @@ vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
return ret;
}
nolsm:
- if (inode->i_op->getxattr)
- error = inode->i_op->getxattr(dentry, inode, name, value, size);
- else
- error = -EOPNOTSUPP;
-
- return error;
+ return __vfs_getxattr(dentry, inode, name, value, size);
}
EXPORT_SYMBOL_GPL(vfs_getxattr);
ssize_t
-vfs_listxattr(struct dentry *d, char *list, size_t size)
+vfs_listxattr(struct dentry *dentry, char *list, size_t size)
{
+ struct inode *inode = d_inode(dentry);
ssize_t error;
- error = security_inode_listxattr(d);
+ error = security_inode_listxattr(dentry);
if (error)
return error;
- error = -EOPNOTSUPP;
- if (d->d_inode->i_op->listxattr) {
- error = d->d_inode->i_op->listxattr(d, list, size);
+ if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) {
+ error = -EOPNOTSUPP;
+ error = inode->i_op->listxattr(dentry, list, size);
} else {
- error = security_inode_listsecurity(d->d_inode, list, size);
+ error = security_inode_listsecurity(inode, list, size);
if (size && error > size)
error = -ERANGE;
}
@@ -272,14 +359,26 @@ vfs_listxattr(struct dentry *d, char *list, size_t size)
EXPORT_SYMBOL_GPL(vfs_listxattr);
int
+__vfs_removexattr(struct dentry *dentry, const char *name)
+{
+ struct inode *inode = d_inode(dentry);
+ const struct xattr_handler *handler;
+
+ handler = xattr_resolve_name(inode, &name);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ if (!handler->set)
+ return -EOPNOTSUPP;
+ return handler->set(handler, dentry, inode, name, NULL, 0, XATTR_REPLACE);
+}
+EXPORT_SYMBOL(__vfs_removexattr);
+
+int
vfs_removexattr(struct dentry *dentry, const char *name)
{
struct inode *inode = dentry->d_inode;
int error;
- if (!inode->i_op->removexattr)
- return -EOPNOTSUPP;
-
error = xattr_permission(inode, name, MAY_WRITE);
if (error)
return error;
@@ -289,7 +388,7 @@ vfs_removexattr(struct dentry *dentry, const char *name)
if (error)
goto out;
- error = inode->i_op->removexattr(dentry, name);
+ error = __vfs_removexattr(dentry, name);
if (!error) {
fsnotify_xattr(dentry);
@@ -641,76 +740,6 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
return error;
}
-
-static const char *
-strcmp_prefix(const char *a, const char *a_prefix)
-{
- while (*a_prefix && *a == *a_prefix) {
- a++;
- a_prefix++;
- }
- return *a_prefix ? NULL : a;
-}
-
-/*
- * In order to implement different sets of xattr operations for each xattr
- * prefix with the generic xattr API, a filesystem should create a
- * null-terminated array of struct xattr_handler (one for each prefix) and
- * hang a pointer to it off of the s_xattr field of the superblock.
- *
- * The generic_fooxattr() functions will use this list to dispatch xattr
- * operations to the correct xattr_handler.
- */
-#define for_each_xattr_handler(handlers, handler) \
- if (handlers) \
- for ((handler) = *(handlers)++; \
- (handler) != NULL; \
- (handler) = *(handlers)++)
-
-/*
- * Find the xattr_handler with the matching prefix.
- */
-static const struct xattr_handler *
-xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
-{
- const struct xattr_handler *handler;
-
- if (!*name)
- return ERR_PTR(-EINVAL);
-
- for_each_xattr_handler(handlers, handler) {
- const char *n;
-
- n = strcmp_prefix(*name, xattr_prefix(handler));
- if (n) {
- if (!handler->prefix ^ !*n) {
- if (*n)
- continue;
- return ERR_PTR(-EINVAL);
- }
- *name = n;
- return handler;
- }
- }
- return ERR_PTR(-EOPNOTSUPP);
-}
-
-/*
- * Find the handler for the prefix and dispatch its get() operation.
- */
-ssize_t
-generic_getxattr(struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- const struct xattr_handler *handler;
-
- handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (IS_ERR(handler))
- return PTR_ERR(handler);
- return handler->get(handler, dentry, inode,
- name, buffer, size);
-}
-
/*
* Combine the results of the list() operation from every xattr_handler in the
* list.
@@ -747,44 +776,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
}
return size;
}
-
-/*
- * Find the handler for the prefix and dispatch its set() operation.
- */
-int
-generic_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- const struct xattr_handler *handler;
-
- if (size == 0)
- value = ""; /* empty EA, do not remove */
- handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (IS_ERR(handler))
- return PTR_ERR(handler);
- return handler->set(handler, dentry, inode, name, value, size, flags);
-}
-
-/*
- * Find the handler for the prefix and dispatch its set() operation to remove
- * any associated extended attribute.
- */
-int
-generic_removexattr(struct dentry *dentry, const char *name)
-{
- const struct xattr_handler *handler;
-
- handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (IS_ERR(handler))
- return PTR_ERR(handler);
- return handler->set(handler, dentry, d_inode(dentry), name, NULL,
- 0, XATTR_REPLACE);
-}
-
-EXPORT_SYMBOL(generic_getxattr);
EXPORT_SYMBOL(generic_listxattr);
-EXPORT_SYMBOL(generic_setxattr);
-EXPORT_SYMBOL(generic_removexattr);
/**
* xattr_full_name - Compute full attribute name from suffix
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index fc593c869493..26ef1958b65b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -52,8 +52,11 @@ xfs-y += $(addprefix libxfs/, \
xfs_inode_fork.o \
xfs_inode_buf.o \
xfs_log_rlimit.o \
+ xfs_ag_resv.o \
xfs_rmap.o \
xfs_rmap_btree.o \
+ xfs_refcount.o \
+ xfs_refcount_btree.o \
xfs_sb.o \
xfs_symlink_remote.o \
xfs_trans_resv.o \
@@ -87,6 +90,7 @@ xfs-y += xfs_aops.o \
xfs_message.o \
xfs_mount.o \
xfs_mru_cache.o \
+ xfs_reflink.o \
xfs_stats.o \
xfs_super.o \
xfs_symlink.o \
@@ -99,16 +103,20 @@ xfs-y += xfs_aops.o \
# low-level transaction/log code
xfs-y += xfs_log.o \
xfs_log_cil.o \
+ xfs_bmap_item.o \
xfs_buf_item.o \
xfs_extfree_item.o \
xfs_icreate_item.o \
xfs_inode_item.o \
+ xfs_refcount_item.o \
xfs_rmap_item.o \
xfs_log_recover.o \
xfs_trans_ail.o \
+ xfs_trans_bmap.o \
xfs_trans_buf.o \
xfs_trans_extfree.o \
xfs_trans_inode.o \
+ xfs_trans_refcount.o \
xfs_trans_rmap.o \
# optional features
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
new file mode 100644
index 000000000000..e5ebc3770460
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
+
+/*
+ * Per-AG Block Reservations
+ *
+ * For some kinds of allocation group metadata structures, it is advantageous
+ * to reserve a small number of blocks in each AG so that future expansions of
+ * that data structure do not encounter ENOSPC because errors during a btree
+ * split cause the filesystem to go offline.
+ *
+ * Prior to the introduction of reflink, this wasn't an issue because the free
+ * space btrees maintain a reserve of space (the AGFL) to handle any expansion
+ * that may be necessary; and allocations of other metadata (inodes, BMBT,
+ * dir/attr) aren't restricted to a single AG. However, with reflink it is
+ * possible to allocate all the space in an AG, have subsequent reflink/CoW
+ * activity expand the refcount btree, and discover that there's no space left
+ * to handle that expansion. Since we can calculate the maximum size of the
+ * refcount btree, we can reserve space for it and avoid ENOSPC.
+ *
+ * Handling per-AG reservations consists of three changes to the allocator's
+ * behavior: First, because these reservations are always needed, we decrease
+ * the ag_max_usable counter to reflect the size of the AG after the reserved
+ * blocks are taken. Second, the reservations must be reflected in the
+ * fdblocks count to maintain proper accounting. Third, each AG must maintain
+ * its own reserved block counter so that we can calculate the amount of space
+ * that must remain free to maintain the reservations. Fourth, the "remaining
+ * reserved blocks" count must be used when calculating the length of the
+ * longest free extent in an AG and to clamp maxlen in the per-AG allocation
+ * functions. In other words, we maintain a virtual allocation via in-core
+ * accounting tricks so that we don't have to clean up after a crash. :)
+ *
+ * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
+ * values via struct xfs_alloc_arg or directly to the xfs_free_extent
+ * function. It might seem a little funny to maintain a reservoir of blocks
+ * to feed another reservoir, but the AGFL only holds enough blocks to get
+ * through the next transaction. The per-AG reservation is to ensure (we
+ * hope) that each AG never runs out of blocks. Each data structure wanting
+ * to use the reservation system should update ask/used in xfs_ag_resv_init.
+ */
+
+/*
+ * Are we critically low on blocks? For now we'll define that as the number
+ * of blocks we can get our hands on being less than 10% of what we reserved
+ * or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_ag_resv_critical(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ xfs_extlen_t avail;
+ xfs_extlen_t orig;
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved;
+ orig = pag->pag_meta_resv.ar_asked;
+ break;
+ case XFS_AG_RESV_AGFL:
+ avail = pag->pagf_freeblks + pag->pagf_flcount -
+ pag->pag_meta_resv.ar_reserved;
+ orig = pag->pag_agfl_resv.ar_asked;
+ break;
+ default:
+ ASSERT(0);
+ return false;
+ }
+
+ trace_xfs_ag_resv_critical(pag, type, avail);
+
+ /* Critically low if less than 10% or max btree height remains. */
+ return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
+ pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL,
+ XFS_RANDOM_AG_RESV_CRITICAL);
+}
+
+/*
+ * How many blocks are reserved but not used, and therefore must not be
+ * allocated away?
+ */
+xfs_extlen_t
+xfs_ag_resv_needed(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ xfs_extlen_t len;
+
+ len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved;
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_AGFL:
+ len -= xfs_perag_resv(pag, type)->ar_reserved;
+ break;
+ case XFS_AG_RESV_NONE:
+ /* empty */
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ trace_xfs_ag_resv_needed(pag, type, len);
+
+ return len;
+}
+
+/* Clean out a reservation */
+static int
+__xfs_ag_resv_free(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_ag_resv *resv;
+ xfs_extlen_t oldresv;
+ int error;
+
+ trace_xfs_ag_resv_free(pag, type, 0);
+
+ resv = xfs_perag_resv(pag, type);
+ pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+ /*
+ * AGFL blocks are always considered "free", so whatever
+ * was reserved at mount time must be given back at umount.
+ */
+ if (type == XFS_AG_RESV_AGFL)
+ oldresv = resv->ar_orig_reserved;
+ else
+ oldresv = resv->ar_reserved;
+ error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+ resv->ar_reserved = 0;
+ resv->ar_asked = 0;
+
+ if (error)
+ trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/* Free a per-AG reservation. */
+int
+xfs_ag_resv_free(
+ struct xfs_perag *pag)
+{
+ int error;
+ int err2;
+
+ error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL);
+ err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
+ if (err2 && !error)
+ error = err2;
+ return error;
+}
+
+static int
+__xfs_ag_resv_init(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ xfs_extlen_t ask,
+ xfs_extlen_t used)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_ag_resv *resv;
+ int error;
+
+ resv = xfs_perag_resv(pag, type);
+ if (used > ask)
+ ask = used;
+ resv->ar_asked = ask;
+ resv->ar_reserved = resv->ar_orig_reserved = ask - used;
+ mp->m_ag_max_usable -= ask;
+
+ trace_xfs_ag_resv_init(pag, type, ask);
+
+ error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
+ if (error)
+ trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+ error, _RET_IP_);
+
+ return error;
+}
+
+/* Create a per-AG block reservation. */
+int
+xfs_ag_resv_init(
+ struct xfs_perag *pag)
+{
+ xfs_extlen_t ask;
+ xfs_extlen_t used;
+ int error = 0;
+
+ /* Create the metadata reservation. */
+ if (pag->pag_meta_resv.ar_asked == 0) {
+ ask = used = 0;
+
+ error = xfs_refcountbt_calc_reserves(pag->pag_mount,
+ pag->pag_agno, &ask, &used);
+ if (error)
+ goto out;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+ ask, used);
+ if (error)
+ goto out;
+ }
+
+ /* Create the AGFL metadata reservation */
+ if (pag->pag_agfl_resv.ar_asked == 0) {
+ ask = used = 0;
+
+ error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
+ &ask, &used);
+ if (error)
+ goto out;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
+ if (error)
+ goto out;
+ }
+
+out:
+ return error;
+}
+
+/* Allocate a block from the reservation. */
+void
+xfs_ag_resv_alloc_extent(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ struct xfs_alloc_arg *args)
+{
+ struct xfs_ag_resv *resv;
+ xfs_extlen_t len;
+ uint field;
+
+ trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_AGFL:
+ resv = xfs_perag_resv(pag, type);
+ break;
+ default:
+ ASSERT(0);
+ /* fall through */
+ case XFS_AG_RESV_NONE:
+ field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+ XFS_TRANS_SB_FDBLOCKS;
+ xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
+ return;
+ }
+
+ len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
+ resv->ar_reserved -= len;
+ if (type == XFS_AG_RESV_AGFL)
+ return;
+ /* Allocations of reserved blocks only need on-disk sb updates... */
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
+ /* ...but non-reserved blocks need in-core and on-disk updates. */
+ if (args->len > len)
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
+ -((int64_t)args->len - len));
+}
+
+/* Free a block to the reservation. */
+void
+xfs_ag_resv_free_extent(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ struct xfs_trans *tp,
+ xfs_extlen_t len)
+{
+ xfs_extlen_t leftover;
+ struct xfs_ag_resv *resv;
+
+ trace_xfs_ag_resv_free_extent(pag, type, len);
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_AGFL:
+ resv = xfs_perag_resv(pag, type);
+ break;
+ default:
+ ASSERT(0);
+ /* fall through */
+ case XFS_AG_RESV_NONE:
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+ return;
+ }
+
+ leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
+ resv->ar_reserved += leftover;
+ if (type == XFS_AG_RESV_AGFL)
+ return;
+ /* Freeing into the reserved pool only requires on-disk update... */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
+ /* ...but freeing beyond that requires in-core and on-disk update. */
+ if (len > leftover)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
+}
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
new file mode 100644
index 000000000000..8d6c687deef3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_AG_RESV_H__
+#define __XFS_AG_RESV_H__
+
+int xfs_ag_resv_free(struct xfs_perag *pag);
+int xfs_ag_resv_init(struct xfs_perag *pag);
+
+bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag,
+ enum xfs_ag_resv_type type);
+
+void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+ struct xfs_alloc_arg *args);
+void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+ struct xfs_trans *tp, xfs_extlen_t len);
+
+#endif /* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 05b5243d89f6..effb64cf714f 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -37,6 +37,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_ag_resv.h"
struct workqueue_struct *xfs_alloc_wq;
@@ -51,10 +52,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+unsigned int
+xfs_refc_block(
+ struct xfs_mount *mp)
+{
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return XFS_RMAP_BLOCK(mp) + 1;
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ return XFS_FIBT_BLOCK(mp) + 1;
+ return XFS_IBT_BLOCK(mp) + 1;
+}
+
xfs_extlen_t
xfs_prealloc_blocks(
struct xfs_mount *mp)
{
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ return xfs_refc_block(mp) + 1;
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
return XFS_RMAP_BLOCK(mp) + 1;
if (xfs_sb_version_hasfinobt(&mp->m_sb))
@@ -74,14 +88,8 @@ xfs_prealloc_blocks(
* extents need to be actually allocated. To get around this, we explicitly set
* aside a few blocks which will not be reserved in delayed allocation.
*
- * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
- * and 4 more to handle a potential split of the file's bmap btree.
- *
- * When rmap is enabled, we must also be able to handle two rmap btree inserts
- * to record both the file data extent and a new bmbt block. The bmbt block
- * might not be in the same AG as the file data extent. In the worst case
- * the bmap btree splits multiple levels and all the new blocks come from
- * different AGs, so set aside enough to handle rmap btree splits in all AGs.
+ * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
+ * potential split of the file's bmap btree.
*/
unsigned int
xfs_alloc_set_aside(
@@ -90,8 +98,6 @@ xfs_alloc_set_aside(
unsigned int blocks;
blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
- blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
return blocks;
}
@@ -122,6 +128,8 @@ xfs_alloc_ag_max_usable(
blocks++; /* finobt root block */
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
blocks++; /* rmap root block */
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ blocks++; /* refcount root block */
return mp->m_sb.sb_agblocks - blocks;
}
@@ -265,7 +273,7 @@ xfs_alloc_compute_diff(
xfs_agblock_t wantbno, /* target starting block */
xfs_extlen_t wantlen, /* target length */
xfs_extlen_t alignment, /* target alignment */
- char userdata, /* are we allocating data? */
+ int datatype, /* are we allocating data? */
xfs_agblock_t freebno, /* freespace's starting block */
xfs_extlen_t freelen, /* freespace's length */
xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -276,6 +284,7 @@ xfs_alloc_compute_diff(
xfs_extlen_t newlen1=0; /* length with newbno1 */
xfs_extlen_t newlen2=0; /* length with newbno2 */
xfs_agblock_t wantend; /* end of target extent */
+ bool userdata = xfs_alloc_is_userdata(datatype);
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
@@ -680,12 +689,29 @@ xfs_alloc_ag_vextent(
xfs_alloc_arg_t *args) /* argument structure for allocation */
{
int error=0;
+ xfs_extlen_t reservation;
+ xfs_extlen_t oldmax;
ASSERT(args->minlen > 0);
ASSERT(args->maxlen > 0);
ASSERT(args->minlen <= args->maxlen);
ASSERT(args->mod < args->prod);
ASSERT(args->alignment > 0);
+
+ /*
+ * Clamp maxlen to the amount of free space minus any reservations
+ * that have been made.
+ */
+ oldmax = args->maxlen;
+ reservation = xfs_ag_resv_needed(args->pag, args->resv);
+ if (args->maxlen > args->pag->pagf_freeblks - reservation)
+ args->maxlen = args->pag->pagf_freeblks - reservation;
+ if (args->maxlen == 0) {
+ args->agbno = NULLAGBLOCK;
+ args->maxlen = oldmax;
+ return 0;
+ }
+
/*
* Branch to correct routine based on the type.
*/
@@ -705,12 +731,14 @@ xfs_alloc_ag_vextent(
/* NOTREACHED */
}
+ args->maxlen = oldmax;
+
if (error || args->agbno == NULLAGBLOCK)
return error;
ASSERT(args->len >= args->minlen);
ASSERT(args->len <= args->maxlen);
- ASSERT(!args->wasfromfl || !args->isfl);
+ ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
ASSERT(args->agbno % args->alignment == 0);
/* if not file data, insert new block into the reverse map btree */
@@ -732,12 +760,7 @@ xfs_alloc_ag_vextent(
args->agbno, args->len));
}
- if (!args->isfl) {
- xfs_trans_mod_sb(args->tp, args->wasdel ?
- XFS_TRANS_SB_RES_FDBLOCKS :
- XFS_TRANS_SB_FDBLOCKS,
- -((long)(args->len)));
- }
+ xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
XFS_STATS_INC(args->mp, xs_allocx);
XFS_STATS_ADD(args->mp, xs_allocb, args->len);
@@ -917,7 +940,7 @@ xfs_alloc_find_best_extent(
sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
args->alignment,
- args->userdata, *sbnoa,
+ args->datatype, *sbnoa,
*slena, &new);
/*
@@ -1101,7 +1124,7 @@ restart:
if (args->len < blen)
continue;
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->userdata, ltbnoa,
+ args->alignment, args->datatype, ltbnoa,
ltlena, &ltnew);
if (ltnew != NULLAGBLOCK &&
(args->len > blen || ltdiff < bdiff)) {
@@ -1254,7 +1277,7 @@ restart:
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->userdata, ltbnoa,
+ args->alignment, args->datatype, ltbnoa,
ltlena, &ltnew);
error = xfs_alloc_find_best_extent(args,
@@ -1271,7 +1294,7 @@ restart:
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->userdata, gtbnoa,
+ args->alignment, args->datatype, gtbnoa,
gtlena, &gtnew);
error = xfs_alloc_find_best_extent(args,
@@ -1331,7 +1354,7 @@ restart:
}
rlen = args->len;
(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
- args->userdata, ltbnoa, ltlena, &ltnew);
+ args->datatype, ltbnoa, ltlena, &ltnew);
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
@@ -1583,6 +1606,7 @@ xfs_alloc_ag_vextent_small(
int *stat) /* status: 0-freelist, 1-normal/none */
{
struct xfs_owner_info oinfo;
+ struct xfs_perag *pag;
int error;
xfs_agblock_t fbno;
xfs_extlen_t flen;
@@ -1600,7 +1624,8 @@ xfs_alloc_ag_vextent_small(
* to respect minleft even when pulling from the
* freelist.
*/
- else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+ else if (args->minlen == 1 && args->alignment == 1 &&
+ args->resv != XFS_AG_RESV_AGFL &&
(be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
> args->minleft)) {
error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
@@ -1608,9 +1633,9 @@ xfs_alloc_ag_vextent_small(
goto error0;
if (fbno != NULLAGBLOCK) {
xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
- args->userdata);
+ xfs_alloc_allow_busy_reuse(args->datatype));
- if (args->userdata) {
+ if (xfs_alloc_is_userdata(args->datatype)) {
xfs_buf_t *bp;
bp = xfs_btree_get_bufs(args->mp, args->tp,
@@ -1629,13 +1654,18 @@ xfs_alloc_ag_vextent_small(
/*
* If we're feeding an AGFL block to something that
* doesn't live in the free space, we need to clear
- * out the OWN_AG rmap.
+ * out the OWN_AG rmap and add the block back to
+ * the AGFL per-AG reservation.
*/
xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
error = xfs_rmap_free(args->tp, args->agbp, args->agno,
fbno, 1, &oinfo);
if (error)
goto error0;
+ pag = xfs_perag_get(args->mp, args->agno);
+ xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL,
+ args->tp, 1);
+ xfs_perag_put(pag);
*stat = 0;
return 0;
@@ -1683,7 +1713,7 @@ xfs_free_ag_extent(
xfs_agblock_t bno,
xfs_extlen_t len,
struct xfs_owner_info *oinfo,
- int isfl)
+ enum xfs_ag_resv_type type)
{
xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
@@ -1911,21 +1941,22 @@ xfs_free_ag_extent(
*/
pag = xfs_perag_get(mp, agno);
error = xfs_alloc_update_counters(tp, pag, agbp, len);
+ xfs_ag_resv_free_extent(pag, type, tp, len);
xfs_perag_put(pag);
if (error)
goto error0;
- if (!isfl)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
XFS_STATS_INC(mp, xs_freex);
XFS_STATS_ADD(mp, xs_freeb, len);
- trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+ trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+ haveleft, haveright);
return 0;
error0:
- trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+ trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+ -1, -1);
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
if (cnt_cur)
@@ -1950,21 +1981,43 @@ xfs_alloc_compute_maxlevels(
}
/*
- * Find the length of the longest extent in an AG.
+ * Find the length of the longest extent in an AG. The 'need' parameter
+ * specifies how much space we're going to need for the AGFL and the
+ * 'reserved' parameter tells us how many blocks in this AG are reserved for
+ * other callers.
*/
xfs_extlen_t
xfs_alloc_longest_free_extent(
struct xfs_mount *mp,
struct xfs_perag *pag,
- xfs_extlen_t need)
+ xfs_extlen_t need,
+ xfs_extlen_t reserved)
{
xfs_extlen_t delta = 0;
+ /*
+ * If the AGFL needs a recharge, we'll have to subtract that from the
+ * longest extent.
+ */
if (need > pag->pagf_flcount)
delta = need - pag->pagf_flcount;
+ /*
+ * If we cannot maintain others' reservations with space from the
+ * not-longest freesp extents, we'll have to subtract /that/ from
+ * the longest extent too.
+ */
+ if (pag->pagf_freeblks - pag->pagf_longest < reserved)
+ delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
+
+ /*
+ * If the longest extent is long enough to satisfy all the
+ * reservations and AGFL rules in place, we can return this extent.
+ */
if (pag->pagf_longest > delta)
return pag->pagf_longest - delta;
+
+ /* Otherwise, let the caller try for 1 block if there's space. */
return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
@@ -2004,20 +2057,24 @@ xfs_alloc_space_available(
{
struct xfs_perag *pag = args->pag;
xfs_extlen_t longest;
+ xfs_extlen_t reservation; /* blocks that are still reserved */
int available;
if (flags & XFS_ALLOC_FLAG_FREEING)
return true;
+ reservation = xfs_ag_resv_needed(pag, args->resv);
+
/* do we have enough contiguous free space for the allocation? */
- longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+ longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
+ reservation);
if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
return false;
- /* do have enough free space remaining for the allocation? */
+ /* do we have enough free space remaining for the allocation? */
available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
- min_free - args->total);
- if (available < (int)args->minleft)
+ reservation - min_free - args->total);
+ if (available < (int)args->minleft || available <= 0)
return false;
return true;
@@ -2058,7 +2115,7 @@ xfs_alloc_fix_freelist(
* somewhere else if we are not being asked to try harder at this
* point
*/
- if (pag->pagf_metadata && args->userdata &&
+ if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) &&
(flags & XFS_ALLOC_FLAG_TRYLOCK)) {
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
goto out_agbp_relse;
@@ -2124,7 +2181,7 @@ xfs_alloc_fix_freelist(
if (error)
goto out_agbp_relse;
error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
- &targs.oinfo, 1);
+ &targs.oinfo, XFS_AG_RESV_AGFL);
if (error)
goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
@@ -2135,7 +2192,7 @@ xfs_alloc_fix_freelist(
targs.mp = mp;
targs.agbp = agbp;
targs.agno = args->agno;
- targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+ targs.alignment = targs.minlen = targs.prod = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
@@ -2146,6 +2203,7 @@ xfs_alloc_fix_freelist(
while (pag->pagf_flcount < need) {
targs.agbno = 0;
targs.maxlen = need - pag->pagf_flcount;
+ targs.resv = XFS_AG_RESV_AGFL;
/* Allocate as many blocks as possible at once. */
error = xfs_alloc_ag_vextent(&targs);
@@ -2278,6 +2336,9 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_btreeblks),
offsetof(xfs_agf_t, agf_uuid),
offsetof(xfs_agf_t, agf_rmap_blocks),
+ offsetof(xfs_agf_t, agf_refcount_blocks),
+ offsetof(xfs_agf_t, agf_refcount_root),
+ offsetof(xfs_agf_t, agf_refcount_level),
/* needed so that we don't log the whole rest of the structure: */
offsetof(xfs_agf_t, agf_spare64),
sizeof(xfs_agf_t)
@@ -2415,6 +2476,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
return false;
+ if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)
+ return false;
+
return true;;
}
@@ -2535,6 +2600,7 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
pag->pagf_levels[XFS_BTNUM_RMAPi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
spin_lock_init(&pag->pagb_lock);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
@@ -2633,7 +2699,7 @@ xfs_alloc_vextent(
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
- if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
+ if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
(mp->m_flags & XFS_MOUNT_32BITINODES)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
@@ -2766,7 +2832,7 @@ xfs_alloc_vextent(
#endif
/* Zero the extent if we were asked to do so */
- if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {
error = xfs_zero_extent(args->ip, args->fsbno, args->len);
if (error)
goto error0;
@@ -2825,7 +2891,8 @@ xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
- struct xfs_owner_info *oinfo) /* extent owner */
+ struct xfs_owner_info *oinfo, /* extent owner */
+ enum xfs_ag_resv_type type) /* block reservation type */
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agbp;
@@ -2834,6 +2901,7 @@ xfs_free_extent(
int error;
ASSERT(len != 0);
+ ASSERT(type != XFS_AG_RESV_AGFL);
if (XFS_TEST_ERROR(false, mp,
XFS_ERRTAG_FREE_EXTENT,
@@ -2851,7 +2919,7 @@ xfs_free_extent(
agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
err);
- error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0);
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
if (error)
goto err;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6fe2d6b7cfe9..7c404a6b0ae3 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -85,20 +85,33 @@ typedef struct xfs_alloc_arg {
xfs_extlen_t len; /* output: actual size of extent */
xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
xfs_alloctype_t otype; /* original allocation type */
+ int datatype; /* mask defining data type treatment */
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
- char isfl; /* set if is freelist blocks - !acctg */
- char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
struct xfs_owner_info oinfo; /* owner of blocks being allocated */
+ enum xfs_ag_resv_type resv; /* block reservation to use */
} xfs_alloc_arg_t;
/*
- * Defines for userdata
+ * Defines for datatype
*/
#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
+#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */
+
+static inline bool
+xfs_alloc_is_userdata(int datatype)
+{
+ return (datatype & ~XFS_ALLOC_NOBUSY) != 0;
+}
+
+static inline bool
+xfs_alloc_allow_busy_reuse(int datatype)
+{
+ return (datatype & XFS_ALLOC_NOBUSY) == 0;
+}
/* freespace limit calculations */
#define XFS_ALLOC_AGFL_RESERVE 4
@@ -106,7 +119,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
- struct xfs_perag *pag, xfs_extlen_t need);
+ struct xfs_perag *pag, xfs_extlen_t need,
+ xfs_extlen_t reserved);
unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
@@ -184,7 +198,8 @@ xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
- struct xfs_owner_info *oinfo);/* extent owner */
+ struct xfs_owner_info *oinfo, /* extent owner */
+ enum xfs_ag_resv_type type); /* block reservation type */
int /* error */
xfs_alloc_lookup_ge(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b060bca93402..c6eb21940783 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -47,6 +47,8 @@
#include "xfs_attr_leaf.h"
#include "xfs_filestream.h"
#include "xfs_rmap.h"
+#include "xfs_ag_resv.h"
+#include "xfs_refcount.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -139,7 +141,8 @@ xfs_bmbt_lookup_ge(
*/
static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
{
- return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ return whichfork != XFS_COW_FORK &&
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_NEXTENTS(ip, whichfork) >
XFS_IFORK_MAXEXT(ip, whichfork);
}
@@ -149,7 +152,8 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
*/
static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
{
- return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+ return whichfork != XFS_COW_FORK &&
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
XFS_IFORK_NEXTENTS(ip, whichfork) <=
XFS_IFORK_MAXEXT(ip, whichfork);
}
@@ -639,6 +643,7 @@ xfs_bmap_btree_to_extents(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
rblock = ifp->if_broot;
@@ -705,6 +710,7 @@ xfs_bmap_extents_to_btree(
xfs_bmbt_ptr_t *pp; /* root block address pointer */
mp = ip->i_mount;
+ ASSERT(whichfork != XFS_COW_FORK);
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
@@ -747,6 +753,7 @@ xfs_bmap_extents_to_btree(
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
} else if (dfops->dop_low) {
+try_another_ag:
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = *firstblock;
} else {
@@ -761,6 +768,21 @@ xfs_bmap_extents_to_btree(
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}
+
+ /*
+ * During a CoW operation, the allocation and bmbt updates occur in
+ * different transactions. The mapping code tries to put new bmbt
+ * blocks near extents being mapped, but the only way to guarantee this
+ * is if the alloc and the mapping happen in a single transaction that
+ * has a block reservation. That isn't the case here, so if we run out
+ * of space we'll try again with another AG.
+ */
+ if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+ args.fsbno == NULLFSBLOCK &&
+ args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+ dfops->dop_low = true;
+ goto try_another_ag;
+ }
/*
* Allocation can't fail, the space was reserved.
*/
@@ -836,6 +858,7 @@ xfs_bmap_local_to_extents_empty(
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(whichfork != XFS_COW_FORK);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
ASSERT(ifp->if_bytes == 0);
ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
@@ -895,6 +918,7 @@ xfs_bmap_local_to_extents(
* file currently fits in an inode.
*/
if (*firstblock == NULLFSBLOCK) {
+try_another_ag:
args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
@@ -907,6 +931,19 @@ xfs_bmap_local_to_extents(
if (error)
goto done;
+ /*
+ * During a CoW operation, the allocation and bmbt updates occur in
+ * different transactions. The mapping code tries to put new bmbt
+ * blocks near extents being mapped, but the only way to guarantee this
+ * is if the alloc and the mapping happen in a single transaction that
+ * has a block reservation. That isn't the case here, so if we run out
+ * of space we'll try again with another AG.
+ */
+ if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) &&
+ args.fsbno == NULLFSBLOCK &&
+ args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+ goto try_another_ag;
+ }
/* Can't fail, the space was reserved. */
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
@@ -1388,7 +1425,7 @@ xfs_bmap_search_multi_extents(
* Else, *lastxp will be set to the index of the found
* entry; *gotp will contain the entry.
*/
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmbt_rec_host_t * /* pointer to found extent entry */
xfs_bmap_search_extents(
xfs_inode_t *ip, /* incore inode pointer */
xfs_fileoff_t bno, /* block number searched for */
@@ -1669,7 +1706,8 @@ xfs_bmap_one_block(
*/
STATIC int /* error */
xfs_bmap_add_extent_delay_real(
- struct xfs_bmalloca *bma)
+ struct xfs_bmalloca *bma,
+ int whichfork)
{
struct xfs_bmbt_irec *new = &bma->got;
int diff; /* temp value */
@@ -1687,11 +1725,14 @@ xfs_bmap_add_extent_delay_real(
xfs_filblks_t temp=0; /* value for da_new calculations */
xfs_filblks_t temp2=0;/* value for da_new calculations */
int tmp_rval; /* partial logging flags */
- int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp;
+ xfs_extnum_t *nextents;
mp = bma->ip->i_mount;
ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ ASSERT(whichfork != XFS_ATTR_FORK);
+ nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents :
+ &bma->ip->i_d.di_nextents);
ASSERT(bma->idx >= 0);
ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
@@ -1705,6 +1746,9 @@ xfs_bmap_add_extent_delay_real(
#define RIGHT r[1]
#define PREV r[2]
+ if (whichfork == XFS_COW_FORK)
+ state |= BMAP_COWFORK;
+
/*
* Set up a bunch of variables to make the tests simpler.
*/
@@ -1791,7 +1835,7 @@ xfs_bmap_add_extent_delay_real(
trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
- bma->ip->i_d.di_nextents--;
+ (*nextents)--;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -1893,7 +1937,7 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_set_startblock(ep, new->br_startblock);
trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- bma->ip->i_d.di_nextents++;
+ (*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -1963,7 +2007,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
xfs_bmbt_set_blockcount(ep, temp);
xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
- bma->ip->i_d.di_nextents++;
+ (*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2047,7 +2091,7 @@ xfs_bmap_add_extent_delay_real(
trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
- bma->ip->i_d.di_nextents++;
+ (*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2116,7 +2160,7 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_blockcount = temp2;
/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
- bma->ip->i_d.di_nextents++;
+ (*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2214,7 +2258,8 @@ xfs_bmap_add_extent_delay_real(
xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
- bma->logflags |= rval;
+ if (whichfork != XFS_COW_FORK)
+ bma->logflags |= rval;
return error;
#undef LEFT
#undef RIGHT
@@ -2758,6 +2803,7 @@ done:
STATIC void
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork,
xfs_extnum_t *idx, /* extent number to update/insert */
xfs_bmbt_irec_t *new) /* new data to add to file extents */
{
@@ -2769,8 +2815,10 @@ xfs_bmap_add_extent_hole_delay(
int state; /* state bits, accessed thru macros */
xfs_filblks_t temp=0; /* temp for indirect calculations */
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ ifp = XFS_IFORK_PTR(ip, whichfork);
state = 0;
+ if (whichfork == XFS_COW_FORK)
+ state |= BMAP_COWFORK;
ASSERT(isnullstartblock(new->br_startblock));
/*
@@ -2788,7 +2836,7 @@ xfs_bmap_add_extent_hole_delay(
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
- if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
@@ -2922,6 +2970,7 @@ xfs_bmap_add_extent_hole_real(
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
!(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ ASSERT(whichfork != XFS_COW_FORK);
XFS_STATS_INC(mp, xs_add_exlist);
@@ -3347,7 +3396,8 @@ xfs_bmap_adjacent(
mp = ap->ip->i_mount;
nullfb = *ap->firstblock == NULLFSBLOCK;
- rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
+ rt = XFS_IS_REALTIME_INODE(ap->ip) &&
+ xfs_alloc_is_userdata(ap->datatype);
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
/*
* If allocating at eof, and there's a previous real block,
@@ -3501,7 +3551,8 @@ xfs_bmap_longest_free_extent(
}
longest = xfs_alloc_longest_free_extent(mp, pag,
- xfs_alloc_min_freelist(mp, pag));
+ xfs_alloc_min_freelist(mp, pag),
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
if (*blen < longest)
*blen = longest;
@@ -3622,7 +3673,7 @@ xfs_bmap_btalloc(
{
xfs_mount_t *mp; /* mount point structure */
xfs_alloctype_t atype = 0; /* type for allocation routines */
- xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_extlen_t align = 0; /* minimum allocation alignment */
xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
xfs_agnumber_t ag;
xfs_alloc_arg_t args;
@@ -3645,7 +3696,10 @@ xfs_bmap_btalloc(
else if (mp->m_dalign)
stripe_align = mp->m_dalign;
- align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
+ if (ap->flags & XFS_BMAPI_COWFORK)
+ align = xfs_get_cowextsz_hint(ap->ip);
+ else if (xfs_alloc_is_userdata(ap->datatype))
+ align = xfs_get_extsz_hint(ap->ip);
if (unlikely(align)) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 0, ap->eof, 0, ap->conv,
@@ -3658,7 +3712,8 @@ xfs_bmap_btalloc(
nullfb = *ap->firstblock == NULLFSBLOCK;
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
if (nullfb) {
- if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
+ if (xfs_alloc_is_userdata(ap->datatype) &&
+ xfs_inode_is_filestream(ap->ip)) {
ag = xfs_filestream_lookup_ag(ap->ip);
ag = (ag != NULLAGNUMBER) ? ag : 0;
ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
@@ -3698,7 +3753,8 @@ xfs_bmap_btalloc(
* enough for the request. If one isn't found, then adjust
* the minimum allocation size to the largest space found.
*/
- if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+ if (xfs_alloc_is_userdata(ap->datatype) &&
+ xfs_inode_is_filestream(ap->ip))
error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
else
error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
@@ -3781,9 +3837,9 @@ xfs_bmap_btalloc(
}
args.minleft = ap->minleft;
args.wasdel = ap->wasdel;
- args.isfl = 0;
- args.userdata = ap->userdata;
- if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+ args.resv = XFS_AG_RESV_NONE;
+ args.datatype = ap->datatype;
+ if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
args.ip = ap->ip;
error = xfs_alloc_vextent(&args);
@@ -3850,7 +3906,8 @@ xfs_bmap_btalloc(
ASSERT(nullfb || fb_agno == args.agno ||
(ap->dfops->dop_low && fb_agno < args.agno));
ap->length = args.len;
- ap->ip->i_d.di_nblocks += args.len;
+ if (!(ap->flags & XFS_BMAPI_COWFORK))
+ ap->ip->i_d.di_nblocks += args.len;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel)
ap->ip->i_delayed_blks -= args.len;
@@ -3870,6 +3927,60 @@ xfs_bmap_btalloc(
}
/*
+ * For a remap operation, just "allocate" an extent at the address that the
+ * caller passed in, and ensure that the AGFL is the right size. The caller
+ * will then map the "allocated" extent into the file somewhere.
+ */
+STATIC int
+xfs_bmap_remap_alloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_trans *tp = ap->tp;
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agblock_t bno;
+ struct xfs_alloc_arg args;
+ int error;
+
+ /*
+ * validate that the block number is legal - the enables us to detect
+ * and handle a silent filesystem corruption rather than crashing.
+ */
+ memset(&args, 0, sizeof(struct xfs_alloc_arg));
+ args.tp = ap->tp;
+ args.mp = ap->tp->t_mountp;
+ bno = *ap->firstblock;
+ args.agno = XFS_FSB_TO_AGNO(mp, bno);
+ args.agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ if (args.agno >= mp->m_sb.sb_agcount ||
+ args.agbno >= mp->m_sb.sb_agblocks)
+ return -EFSCORRUPTED;
+
+ /* "Allocate" the extent from the range we passed in. */
+ trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length);
+ ap->blkno = bno;
+ ap->ip->i_d.di_nblocks += ap->length;
+ xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+
+ /* Fix the freelist, like a real allocator does. */
+ args.datatype = ap->datatype;
+ args.pag = xfs_perag_get(args.mp, args.agno);
+ ASSERT(args.pag);
+
+ /*
+ * The freelist fixing code will decline the allocation if
+ * the size and shape of the free space doesn't allow for
+ * allocating the extent and updating all the metadata that
+ * happens during an allocation. We're remapping, not
+ * allocating, so skip that check by pretending to be freeing.
+ */
+ error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+ xfs_perag_put(args.pag);
+ if (error)
+ trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_);
+ return error;
+}
+
+/*
* xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
* It figures out where to ask the underlying allocator to put the new extent.
*/
@@ -3877,11 +3988,47 @@ STATIC int
xfs_bmap_alloc(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
- if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
+ if (ap->flags & XFS_BMAPI_REMAP)
+ return xfs_bmap_remap_alloc(ap);
+ if (XFS_IS_REALTIME_INODE(ap->ip) &&
+ xfs_alloc_is_userdata(ap->datatype))
return xfs_bmap_rtalloc(ap);
return xfs_bmap_btalloc(ap);
}
+/* Trim extent to fit a logical block range. */
+void
+xfs_trim_extent(
+ struct xfs_bmbt_irec *irec,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len)
+{
+ xfs_fileoff_t distance;
+ xfs_fileoff_t end = bno + len;
+
+ if (irec->br_startoff + irec->br_blockcount <= bno ||
+ irec->br_startoff >= end) {
+ irec->br_blockcount = 0;
+ return;
+ }
+
+ if (irec->br_startoff < bno) {
+ distance = bno - irec->br_startoff;
+ if (isnullstartblock(irec->br_startblock))
+ irec->br_startblock = DELAYSTARTBLOCK;
+ if (irec->br_startblock != DELAYSTARTBLOCK &&
+ irec->br_startblock != HOLESTARTBLOCK)
+ irec->br_startblock += distance;
+ irec->br_startoff += distance;
+ irec->br_blockcount -= distance;
+ }
+
+ if (end < irec->br_startoff + irec->br_blockcount) {
+ distance = irec->br_startoff + irec->br_blockcount - end;
+ irec->br_blockcount -= distance;
+ }
+}
+
/*
* Trim the returned map to the required bounds
*/
@@ -4005,12 +4152,11 @@ xfs_bmapi_read(
int error;
int eof;
int n = 0;
- int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ int whichfork = xfs_bmapi_whichfork(flags);
ASSERT(*nmap >= 1);
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
- XFS_BMAPI_IGSTATE)));
+ XFS_BMAPI_IGSTATE|XFS_BMAPI_COWFORK)));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
if (unlikely(XFS_TEST_ERROR(
@@ -4028,6 +4174,16 @@ xfs_bmapi_read(
ifp = XFS_IFORK_PTR(ip, whichfork);
+ /* No CoW fork? Return a hole. */
+ if (whichfork == XFS_COW_FORK && !ifp) {
+ mval->br_startoff = bno;
+ mval->br_startblock = HOLESTARTBLOCK;
+ mval->br_blockcount = len;
+ mval->br_state = XFS_EXT_NORM;
+ *nmap = 1;
+ return 0;
+ }
+
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, whichfork);
if (error)
@@ -4074,9 +4230,10 @@ xfs_bmapi_read(
return 0;
}
-STATIC int
+int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
+ int whichfork,
xfs_fileoff_t aoff,
xfs_filblks_t len,
struct xfs_bmbt_irec *got,
@@ -4085,7 +4242,7 @@ xfs_bmapi_reserve_delalloc(
int eof)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
char rt = XFS_IS_REALTIME_INODE(ip);
@@ -4097,7 +4254,10 @@ xfs_bmapi_reserve_delalloc(
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
/* Figure out the extent size, adjust alen */
- extsz = xfs_get_extsz_hint(ip);
+ if (whichfork == XFS_COW_FORK)
+ extsz = xfs_get_cowextsz_hint(ip);
+ else
+ extsz = xfs_get_extsz_hint(ip);
if (extsz) {
error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
1, 0, &aoff, &alen);
@@ -4144,7 +4304,7 @@ xfs_bmapi_reserve_delalloc(
got->br_startblock = nullstartblock(indlen);
got->br_blockcount = alen;
got->br_state = XFS_EXT_NORM;
- xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+ xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
/*
* Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
@@ -4170,98 +4330,12 @@ out_unreserve_quota:
return error;
}
-/*
- * Map file blocks to filesystem blocks, adding delayed allocations as needed.
- */
-int
-xfs_bmapi_delay(
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting file offs. mapped */
- xfs_filblks_t len, /* length to map in file */
- struct xfs_bmbt_irec *mval, /* output: map values */
- int *nmap, /* i/o: mval size/count */
- int flags) /* XFS_BMAPI_... */
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- struct xfs_bmbt_irec got; /* current file extent record */
- struct xfs_bmbt_irec prev; /* previous file extent record */
- xfs_fileoff_t obno; /* old block number (offset) */
- xfs_fileoff_t end; /* end of mapped file region */
- xfs_extnum_t lastx; /* last useful extent number */
- int eof; /* we've hit the end of extents */
- int n = 0; /* current extent index */
- int error = 0;
-
- ASSERT(*nmap >= 1);
- ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
- ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- XFS_STATS_INC(mp, xs_blk_mapw);
-
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
- if (error)
- return error;
- }
-
- xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
- end = bno + len;
- obno = bno;
-
- while (bno < end && n < *nmap) {
- if (eof || got.br_startoff > bno) {
- error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
- &prev, &lastx, eof);
- if (error) {
- if (n == 0) {
- *nmap = 0;
- return error;
- }
- break;
- }
- }
-
- /* set up the extent map to return. */
- xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
- xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
- /* If we're done, stop now. */
- if (bno >= end || n >= *nmap)
- break;
-
- /* Else go on to the next record. */
- prev = got;
- if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
- else
- eof = 1;
- }
-
- *nmap = n;
- return 0;
-}
-
-
static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
{
struct xfs_mount *mp = bma->ip->i_mount;
- int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ int whichfork = xfs_bmapi_whichfork(bma->flags);
struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
@@ -4287,15 +4361,21 @@ xfs_bmapi_allocate(
}
/*
- * Indicate if this is the first user data in the file, or just any
- * user data. And if it is userdata, indicate whether it needs to
- * be initialised to zero during allocation.
+ * Set the data type being allocated. For the data fork, the first data
+ * in the file is treated differently to all other allocations. For the
+ * attribute fork, we only need to ensure the allocated range is not on
+ * the busy list.
*/
if (!(bma->flags & XFS_BMAPI_METADATA)) {
- bma->userdata = (bma->offset == 0) ?
- XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+ bma->datatype = XFS_ALLOC_NOBUSY;
+ if (whichfork == XFS_DATA_FORK) {
+ if (bma->offset == 0)
+ bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+ else
+ bma->datatype |= XFS_ALLOC_USERDATA;
+ }
if (bma->flags & XFS_BMAPI_ZERO)
- bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
+ bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
}
bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4350,7 +4430,7 @@ xfs_bmapi_allocate(
bma->got.br_state = XFS_EXT_UNWRITTEN;
if (bma->wasdel)
- error = xfs_bmap_add_extent_delay_real(bma);
+ error = xfs_bmap_add_extent_delay_real(bma, whichfork);
else
error = xfs_bmap_add_extent_hole_real(bma, whichfork);
@@ -4380,8 +4460,7 @@ xfs_bmapi_convert_unwritten(
xfs_filblks_t len,
int flags)
{
- int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ int whichfork = xfs_bmapi_whichfork(flags);
struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
@@ -4397,6 +4476,8 @@ xfs_bmapi_convert_unwritten(
(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
return 0;
+ ASSERT(whichfork != XFS_COW_FORK);
+
/*
* Modify (by adding) the state flag, if writing.
*/
@@ -4503,8 +4584,7 @@ xfs_bmapi_write(
orig_mval = mval;
orig_nmap = *nmap;
#endif
- whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ whichfork = xfs_bmapi_whichfork(flags);
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
@@ -4513,6 +4593,11 @@ xfs_bmapi_write(
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
+ ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
+ ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
+ ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK);
+ ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK);
/* zeroing is for currently only for data extents, not metadata */
ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4565,7 +4650,7 @@ xfs_bmapi_write(
bma.tp = tp;
bma.ip = ip;
bma.total = total;
- bma.userdata = 0;
+ bma.datatype = 0;
bma.dfops = dfops;
bma.firstblock = firstblock;
@@ -4574,6 +4659,14 @@ xfs_bmapi_write(
wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
/*
+ * Make sure we only reflink into a hole.
+ */
+ if (flags & XFS_BMAPI_REMAP)
+ ASSERT(inhole);
+ if (flags & XFS_BMAPI_COWFORK)
+ ASSERT(!inhole);
+
+ /*
* First, deal with the hole before the allocated space
* that we found, if any.
*/
@@ -4603,6 +4696,17 @@ xfs_bmapi_write(
goto error0;
if (bma.blkno == NULLFSBLOCK)
break;
+
+ /*
+ * If this is a CoW allocation, record the data in
+ * the refcount btree for orphan recovery.
+ */
+ if (whichfork == XFS_COW_FORK) {
+ error = xfs_refcount_alloc_cow_extent(mp, dfops,
+ bma.blkno, bma.length);
+ if (error)
+ goto error0;
+ }
}
/* Deal with the allocated space we found. */
@@ -4755,6 +4859,219 @@ xfs_bmap_split_indlen(
return stolen;
}
+int
+xfs_bmap_del_extent_delay(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_extnum_t *idx,
+ struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec new;
+ int64_t da_old, da_new, da_diff = 0;
+ xfs_fileoff_t del_endoff, got_endoff;
+ xfs_filblks_t got_indlen, new_indlen, stolen;
+ int error = 0, state = 0;
+ bool isrt;
+
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got->br_startoff + got->br_blockcount;
+ da_old = startblockval(got->br_startblock);
+ da_new = 0;
+
+ ASSERT(*idx >= 0);
+ ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(del->br_blockcount > 0);
+ ASSERT(got->br_startoff <= del->br_startoff);
+ ASSERT(got_endoff >= del_endoff);
+
+ if (isrt) {
+ int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
+
+ do_div(rtexts, mp->m_sb.sb_rextsize);
+ xfs_mod_frextents(mp, rtexts);
+ }
+
+ /*
+ * Update the inode delalloc counter now and wait to update the
+ * sb counters as we might have to borrow some blocks for the
+ * indirect block accounting.
+ */
+ xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0,
+ isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ ip->i_delayed_blks -= del->br_blockcount;
+
+ if (whichfork == XFS_COW_FORK)
+ state |= BMAP_COWFORK;
+
+ if (got->br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_CONTIG;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_CONTIG;
+
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, *idx, 1, state);
+ --*idx;
+ break;
+ case BMAP_LEFT_CONTIG:
+ /*
+ * Deleting the first part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ got->br_startoff = del_endoff;
+ got->br_blockcount -= del->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
+ got->br_blockcount), da_old);
+ got->br_startblock = nullstartblock((int)da_new);
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ break;
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * Deleting the last part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ got->br_blockcount = got->br_blockcount - del->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
+ got->br_blockcount), da_old);
+ got->br_startblock = nullstartblock((int)da_new);
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ break;
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ *
+ * Distribute the original indlen reservation across the two new
+ * extents. Steal blocks from the deleted extent if necessary.
+ * Stealing blocks simply fudges the fdblocks accounting below.
+ * Warn if either of the new indlen reservations is zero as this
+ * can lead to delalloc problems.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+
+ got->br_blockcount = del->br_startoff - got->br_startoff;
+ got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount);
+
+ new.br_blockcount = got_endoff - del_endoff;
+ new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount);
+
+ WARN_ON_ONCE(!got_indlen || !new_indlen);
+ stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen,
+ del->br_blockcount);
+
+ got->br_startblock = nullstartblock((int)got_indlen);
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_);
+
+ new.br_startoff = del_endoff;
+ new.br_state = got->br_state;
+ new.br_startblock = nullstartblock((int)new_indlen);
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 1, &new, state);
+
+ da_new = got_indlen + new_indlen - stolen;
+ del->br_blockcount -= stolen;
+ break;
+ }
+
+ ASSERT(da_old >= da_new);
+ da_diff = da_old - da_new;
+ if (!isrt)
+ da_diff += del->br_blockcount;
+ if (da_diff)
+ xfs_mod_fdblocks(mp, da_diff, false);
+ return error;
+}
+
+void
+xfs_bmap_del_extent_cow(
+ struct xfs_inode *ip,
+ xfs_extnum_t *idx,
+ struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec new;
+ xfs_fileoff_t del_endoff, got_endoff;
+ int state = BMAP_COWFORK;
+
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got->br_startoff + got->br_blockcount;
+
+ ASSERT(*idx >= 0);
+ ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(del->br_blockcount > 0);
+ ASSERT(got->br_startoff <= del->br_startoff);
+ ASSERT(got_endoff >= del_endoff);
+ ASSERT(!isnullstartblock(got->br_startblock));
+
+ if (got->br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_CONTIG;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_CONTIG;
+
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, *idx, 1, state);
+ --*idx;
+ break;
+ case BMAP_LEFT_CONTIG:
+ /*
+ * Deleting the first part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ got->br_startoff = del_endoff;
+ got->br_blockcount -= del->br_blockcount;
+ got->br_startblock = del->br_startblock + del->br_blockcount;
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ break;
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * Deleting the last part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ got->br_blockcount -= del->br_blockcount;
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ break;
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ got->br_blockcount = del->br_startoff - got->br_startoff;
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ new.br_startoff = del_endoff;
+ new.br_blockcount = got_endoff - del_endoff;
+ new.br_state = got->br_state;
+ new.br_startblock = del->br_startblock + del->br_blockcount;
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 1, &new, state);
+ break;
+ }
+}
+
/*
* Called by xfs_bmapi to update file extent records and the btree
* after removing space (or undoing a delayed allocation).
@@ -4768,7 +5085,8 @@ xfs_bmap_del_extent(
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
+ int whichfork, /* data or attr fork */
+ int bflags) /* bmapi flags */
{
xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
@@ -4797,6 +5115,8 @@ xfs_bmap_del_extent(
if (whichfork == XFS_ATTR_FORK)
state |= BMAP_ATTRFORK;
+ else if (whichfork == XFS_COW_FORK)
+ state |= BMAP_COWFORK;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
@@ -4877,6 +5197,7 @@ xfs_bmap_del_extent(
/*
* Matches the whole extent. Delete the entry.
*/
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_iext_remove(ip, *idx, 1,
whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
--*idx;
@@ -5060,9 +5381,16 @@ xfs_bmap_del_extent(
/*
* If we need to, add to list of extents to delete.
*/
- if (do_fx)
- xfs_bmap_add_free(mp, dfops, del->br_startblock,
- del->br_blockcount, NULL);
+ if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
+ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
+ error = xfs_refcount_decrease_extent(mp, dfops, del);
+ if (error)
+ goto done;
+ } else
+ xfs_bmap_add_free(mp, dfops, del->br_startblock,
+ del->br_blockcount, NULL);
+ }
+
/*
* Adjust inode # blocks in the file.
*/
@@ -5071,7 +5399,7 @@ xfs_bmap_del_extent(
/*
* Adjust quota data.
*/
- if (qfield)
+ if (qfield && !(bflags & XFS_BMAPI_REMAP))
xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
/*
@@ -5093,17 +5421,16 @@ done:
* *done is set.
*/
int /* error */
-xfs_bunmapi(
+__xfs_bunmapi(
xfs_trans_t *tp, /* transaction pointer */
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t bno, /* starting offset to unmap */
- xfs_filblks_t len, /* length to unmap in file */
+ xfs_filblks_t *rlen, /* i/o: amount remaining */
int flags, /* misc flags */
xfs_extnum_t nexts, /* number of extents max */
xfs_fsblock_t *firstblock, /* first allocated block
controls a.g. for allocs */
- struct xfs_defer_ops *dfops, /* i/o: list extents to free */
- int *done) /* set if not done yet */
+ struct xfs_defer_ops *dfops) /* i/o: deferred updates */
{
xfs_btree_cur_t *cur; /* bmap btree cursor */
xfs_bmbt_irec_t del; /* extent being deleted */
@@ -5125,11 +5452,12 @@ xfs_bunmapi(
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
+ xfs_filblks_t len = *rlen; /* length to unmap in file */
trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
- whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ whichfork = xfs_bmapi_whichfork(flags);
+ ASSERT(whichfork != XFS_COW_FORK);
ifp = XFS_IFORK_PTR(ip, whichfork);
if (unlikely(
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5151,7 +5479,7 @@ xfs_bunmapi(
return error;
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
if (nextents == 0) {
- *done = 1;
+ *rlen = 0;
return 0;
}
XFS_STATS_INC(mp, xs_blk_unmap);
@@ -5396,7 +5724,7 @@ xfs_bunmapi(
cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del,
- &tmp_logflags, whichfork);
+ &tmp_logflags, whichfork, flags);
logflags |= tmp_logflags;
if (error)
goto error0;
@@ -5422,7 +5750,10 @@ nodelete:
extno++;
}
}
- *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+ if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0)
+ *rlen = 0;
+ else
+ *rlen = bno - start + 1;
/*
* Convert to a btree if necessary.
@@ -5478,6 +5809,27 @@ error0:
return error;
}
+/* Unmap a range of a file. */
+int
+xfs_bunmapi(
+ xfs_trans_t *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ int flags,
+ xfs_extnum_t nexts,
+ xfs_fsblock_t *firstblock,
+ struct xfs_defer_ops *dfops,
+ int *done)
+{
+ int error;
+
+ error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts, firstblock,
+ dfops);
+ *done = (len == 0);
+ return error;
+}
+
/*
* Determine whether an extent shift can be accomplished by a merge with the
* extent that precedes the target hole of the shift.
@@ -6057,3 +6409,146 @@ out:
xfs_trans_cancel(tp);
return error;
}
+
+/* Deferred mapping is only for real extents in the data fork. */
+static bool
+xfs_bmap_is_update_needed(
+ struct xfs_bmbt_irec *bmap)
+{
+ return bmap->br_startblock != HOLESTARTBLOCK &&
+ bmap->br_startblock != DELAYSTARTBLOCK;
+}
+
+/* Record a bmap intent. */
+static int
+__xfs_bmap_add(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ enum xfs_bmap_intent_type type,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *bmap)
+{
+ int error;
+ struct xfs_bmap_intent *bi;
+
+ trace_xfs_bmap_defer(mp,
+ XFS_FSB_TO_AGNO(mp, bmap->br_startblock),
+ type,
+ XFS_FSB_TO_AGBNO(mp, bmap->br_startblock),
+ ip->i_ino, whichfork,
+ bmap->br_startoff,
+ bmap->br_blockcount,
+ bmap->br_state);
+
+ bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS);
+ INIT_LIST_HEAD(&bi->bi_list);
+ bi->bi_type = type;
+ bi->bi_owner = ip;
+ bi->bi_whichfork = whichfork;
+ bi->bi_bmap = *bmap;
+
+ error = xfs_defer_join(dfops, bi->bi_owner);
+ if (error) {
+ kmem_free(bi);
+ return error;
+ }
+
+ xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
+ return 0;
+}
+
+/* Map an extent into a file. */
+int
+xfs_bmap_map_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_bmap_is_update_needed(PREV))
+ return 0;
+
+ return __xfs_bmap_add(mp, dfops, XFS_BMAP_MAP, ip,
+ XFS_DATA_FORK, PREV);
+}
+
+/* Unmap an extent out of a file. */
+int
+xfs_bmap_unmap_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_bmap_is_update_needed(PREV))
+ return 0;
+
+ return __xfs_bmap_add(mp, dfops, XFS_BMAP_UNMAP, ip,
+ XFS_DATA_FORK, PREV);
+}
+
+/*
+ * Process one of the deferred bmap operations. We pass back the
+ * btree cursor to maintain our lock on the bmapbt between calls.
+ */
+int
+xfs_bmap_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip,
+ enum xfs_bmap_intent_type type,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state)
+{
+ struct xfs_bmbt_irec bmap;
+ int nimaps = 1;
+ xfs_fsblock_t firstfsb;
+ int flags = XFS_BMAPI_REMAP;
+ int done;
+ int error = 0;
+
+ bmap.br_startblock = startblock;
+ bmap.br_startoff = startoff;
+ bmap.br_blockcount = blockcount;
+ bmap.br_state = state;
+
+ trace_xfs_bmap_deferred(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+ ip->i_ino, whichfork, startoff, blockcount, state);
+
+ if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+ return -EFSCORRUPTED;
+ if (whichfork == XFS_ATTR_FORK)
+ flags |= XFS_BMAPI_ATTRFORK;
+
+ if (XFS_TEST_ERROR(false, tp->t_mountp,
+ XFS_ERRTAG_BMAP_FINISH_ONE,
+ XFS_RANDOM_BMAP_FINISH_ONE))
+ return -EIO;
+
+ switch (type) {
+ case XFS_BMAP_MAP:
+ firstfsb = bmap.br_startblock;
+ error = xfs_bmapi_write(tp, ip, bmap.br_startoff,
+ bmap.br_blockcount, flags, &firstfsb,
+ bmap.br_blockcount, &bmap, &nimaps,
+ dfops);
+ break;
+ case XFS_BMAP_UNMAP:
+ error = xfs_bunmapi(tp, ip, bmap.br_startoff,
+ bmap.br_blockcount, flags, 1, &firstfsb,
+ dfops, &done);
+ ASSERT(done);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 254034f96941..7cae6ec27fa6 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -54,7 +54,7 @@ struct xfs_bmalloca {
bool wasdel; /* replacing a delayed allocation */
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
- char userdata;/* userdata mask */
+ int datatype;/* data type being allocated */
int flags;
};
@@ -97,6 +97,19 @@ struct xfs_extent_free_item
*/
#define XFS_BMAPI_ZERO 0x080
+/*
+ * Map the inode offset to the block given in ap->firstblock. Primarily
+ * used for reflink. The range must be in a hole, and this flag cannot be
+ * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork.
+ *
+ * For bunmapi, this flag unmaps the range without adjusting quota, reducing
+ * refcount, or freeing the blocks.
+ */
+#define XFS_BMAPI_REMAP 0x100
+
+/* Map something in the CoW fork. */
+#define XFS_BMAPI_COWFORK 0x200
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -105,12 +118,24 @@ struct xfs_extent_free_item
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
{ XFS_BMAPI_CONVERT, "CONVERT" }, \
- { XFS_BMAPI_ZERO, "ZERO" }
+ { XFS_BMAPI_ZERO, "ZERO" }, \
+ { XFS_BMAPI_REMAP, "REMAP" }, \
+ { XFS_BMAPI_COWFORK, "COWFORK" }
static inline int xfs_bmapi_aflag(int w)
{
- return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
+ return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK :
+ (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
+}
+
+static inline int xfs_bmapi_whichfork(int bmapi_flags)
+{
+ if (bmapi_flags & XFS_BMAPI_COWFORK)
+ return XFS_COW_FORK;
+ else if (bmapi_flags & XFS_BMAPI_ATTRFORK)
+ return XFS_ATTR_FORK;
+ return XFS_DATA_FORK;
}
/*
@@ -131,13 +156,15 @@ static inline int xfs_bmapi_aflag(int w)
#define BMAP_LEFT_VALID (1 << 6)
#define BMAP_RIGHT_VALID (1 << 7)
#define BMAP_ATTRFORK (1 << 8)
+#define BMAP_COWFORK (1 << 9)
#define XFS_BMAP_EXT_FLAGS \
{ BMAP_LEFT_CONTIG, "LC" }, \
{ BMAP_RIGHT_CONTIG, "RC" }, \
{ BMAP_LEFT_FILLING, "LF" }, \
{ BMAP_RIGHT_FILLING, "RF" }, \
- { BMAP_ATTRFORK, "ATTR" }
+ { BMAP_ATTRFORK, "ATTR" }, \
+ { BMAP_COWFORK, "COW" }
/*
@@ -163,6 +190,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
#endif
+void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
+ xfs_filblks_t len);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
@@ -181,18 +210,24 @@ int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
int *nmap, int flags);
-int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
- xfs_filblks_t len, struct xfs_bmbt_irec *mval,
- int *nmap, int flags);
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
xfs_fsblock_t *firstblock, xfs_extlen_t total,
struct xfs_bmbt_irec *mval, int *nmap,
struct xfs_defer_ops *dfops);
+int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags,
+ xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+ struct xfs_defer_ops *dfops);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
struct xfs_defer_ops *dfops, int *done);
+int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
+ xfs_extnum_t *idx, struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del);
+void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx,
+ struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del);
int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
@@ -202,5 +237,35 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_defer_ops *dfops, enum shift_direction direction,
int num_exts);
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+struct xfs_bmbt_rec_host *
+ xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
+ int fork, int *eofp, xfs_extnum_t *lastxp,
+ struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
+int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_fileoff_t aoff, xfs_filblks_t len,
+ struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev,
+ xfs_extnum_t *lastx, int eof);
+
+enum xfs_bmap_intent_type {
+ XFS_BMAP_MAP = 1,
+ XFS_BMAP_UNMAP,
+};
+
+struct xfs_bmap_intent {
+ struct list_head bi_list;
+ enum xfs_bmap_intent_type bi_type;
+ struct xfs_inode *bi_owner;
+ int bi_whichfork;
+ struct xfs_bmbt_irec bi_bmap;
+};
+
+int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, enum xfs_bmap_intent_type type,
+ int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount, xfs_exntst_t state);
+int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
+int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cd85274e810c..8007d2ba9aef 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -453,6 +453,7 @@ xfs_bmbt_alloc_block(
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
+try_another_ag:
args.type = XFS_ALLOCTYPE_START_BNO;
/*
* Make sure there is sufficient room left in the AG to
@@ -482,6 +483,22 @@ xfs_bmbt_alloc_block(
if (error)
goto error0;
+ /*
+ * During a CoW operation, the allocation and bmbt updates occur in
+ * different transactions. The mapping code tries to put new bmbt
+ * blocks near extents being mapped, but the only way to guarantee this
+ * is if the alloc and the mapping happen in a single transaction that
+ * has a block reservation. That isn't the case here, so if we run out
+ * of space we'll try again with another AG.
+ */
+ if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+ args.fsbno == NULLFSBLOCK &&
+ args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+ cur->bc_private.b.dfops->dop_low = true;
+ args.fsbno = cur->bc_private.b.firstblock;
+ goto try_another_ag;
+ }
+
if (args.fsbno == NULLFSBLOCK && args.minleft) {
/*
* Could not find an AG with enough free space to satisfy
@@ -777,6 +794,7 @@ xfs_bmbt_init_cursor(
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_btree_cur *cur;
+ ASSERT(whichfork != XFS_COW_FORK);
cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 08569792fe20..0e80993c8a59 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -45,9 +45,10 @@ kmem_zone_t *xfs_btree_cur_zone;
*/
static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
- XFS_FIBT_MAGIC },
+ XFS_FIBT_MAGIC, 0 },
{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
- XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
+ XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
+ XFS_REFC_CRC_MAGIC }
};
#define xfs_btree_magic(cur) \
xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
@@ -1216,6 +1217,9 @@ xfs_btree_set_refs(
case XFS_BTNUM_RMAP:
xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
break;
+ case XFS_BTNUM_REFC:
+ xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
+ break;
default:
ASSERT(0);
}
@@ -2070,7 +2074,7 @@ __xfs_btree_updkeys(
struct xfs_buf *bp0,
bool force_all)
{
- union xfs_btree_bigkey key; /* keys from current level */
+ union xfs_btree_key key; /* keys from current level */
union xfs_btree_key *lkey; /* keys from the next level up */
union xfs_btree_key *hkey;
union xfs_btree_key *nlkey; /* keys from the next level up */
@@ -2086,7 +2090,7 @@ __xfs_btree_updkeys(
trace_xfs_btree_updkeys(cur, level, bp0);
- lkey = (union xfs_btree_key *)&key;
+ lkey = &key;
hkey = xfs_btree_high_key_from_key(cur, lkey);
xfs_btree_get_keys(cur, block, lkey);
for (level++; level < cur->bc_nlevels; level++) {
@@ -3226,7 +3230,7 @@ xfs_btree_insrec(
struct xfs_buf *bp; /* buffer for block */
union xfs_btree_ptr nptr; /* new block ptr */
struct xfs_btree_cur *ncur; /* new btree cursor */
- union xfs_btree_bigkey nkey; /* new block key */
+ union xfs_btree_key nkey; /* new block key */
union xfs_btree_key *lkey;
int optr; /* old key/record index */
int ptr; /* key/record index */
@@ -3241,7 +3245,7 @@ xfs_btree_insrec(
XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
ncur = NULL;
- lkey = (union xfs_btree_key *)&nkey;
+ lkey = &nkey;
/*
* If we have an external root pointer, and we've made it to the
@@ -3444,14 +3448,14 @@ xfs_btree_insert(
union xfs_btree_ptr nptr; /* new block number (split result) */
struct xfs_btree_cur *ncur; /* new cursor (split result) */
struct xfs_btree_cur *pcur; /* previous level's cursor */
- union xfs_btree_bigkey bkey; /* key of block to insert */
+ union xfs_btree_key bkey; /* key of block to insert */
union xfs_btree_key *key;
union xfs_btree_rec rec; /* record to insert */
level = 0;
ncur = NULL;
pcur = cur;
- key = (union xfs_btree_key *)&bkey;
+ key = &bkey;
xfs_btree_set_ptr_null(cur, &nptr);
@@ -4797,3 +4801,50 @@ xfs_btree_query_range(
return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
fn, priv);
}
+
+/*
+ * Calculate the number of blocks needed to store a given number of records
+ * in a short-format (per-AG metadata) btree.
+ */
+xfs_extlen_t
+xfs_btree_calc_size(
+ struct xfs_mount *mp,
+ uint *limits,
+ unsigned long long len)
+{
+ int level;
+ int maxrecs;
+ xfs_extlen_t rval;
+
+ maxrecs = limits[0];
+ for (level = 0, rval = 0; len > 1; level++) {
+ len += maxrecs - 1;
+ do_div(len, maxrecs);
+ maxrecs = limits[1];
+ rval += len;
+ }
+ return rval;
+}
+
+static int
+xfs_btree_count_blocks_helper(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *data)
+{
+ xfs_extlen_t *blocks = data;
+ (*blocks)++;
+
+ return 0;
+}
+
+/* Count the blocks in a btree and return the result in *blocks. */
+int
+xfs_btree_count_blocks(
+ struct xfs_btree_cur *cur,
+ xfs_extlen_t *blocks)
+{
+ *blocks = 0;
+ return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
+ blocks);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 04d0865e5e6d..c2b01d1c79ee 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -37,30 +37,19 @@ union xfs_btree_ptr {
__be64 l; /* long form ptr */
};
-union xfs_btree_key {
- struct xfs_bmbt_key bmbt;
- xfs_bmdr_key_t bmbr; /* bmbt root block */
- xfs_alloc_key_t alloc;
- struct xfs_inobt_key inobt;
- struct xfs_rmap_key rmap;
-};
-
/*
- * In-core key that holds both low and high keys for overlapped btrees.
- * The two keys are packed next to each other on disk, so do the same
- * in memory. Preserve the existing xfs_btree_key as a single key to
- * avoid the mental model breakage that would happen if we passed a
- * bigkey into a function that operates on a single key.
+ * The in-core btree key. Overlapping btrees actually store two keys
+ * per pointer, so we reserve enough memory to hold both. The __*bigkey
+ * items should never be accessed directly.
*/
-union xfs_btree_bigkey {
+union xfs_btree_key {
struct xfs_bmbt_key bmbt;
xfs_bmdr_key_t bmbr; /* bmbt root block */
xfs_alloc_key_t alloc;
struct xfs_inobt_key inobt;
- struct {
- struct xfs_rmap_key rmap;
- struct xfs_rmap_key rmap_hi;
- };
+ struct xfs_rmap_key rmap;
+ struct xfs_rmap_key __rmap_bigkey[2];
+ struct xfs_refcount_key refc;
};
union xfs_btree_rec {
@@ -69,6 +58,7 @@ union xfs_btree_rec {
struct xfs_alloc_rec alloc;
struct xfs_inobt_rec inobt;
struct xfs_rmap_rec rmap;
+ struct xfs_refcount_rec refc;
};
/*
@@ -84,6 +74,7 @@ union xfs_btree_rec {
#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
+#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi)
/*
* For logging record fields.
@@ -117,6 +108,7 @@ do { \
case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \
+ case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -139,6 +131,8 @@ do { \
__XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
case XFS_BTNUM_RMAP: \
__XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \
+ case XFS_BTNUM_REFC: \
+ __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -229,6 +223,15 @@ union xfs_btree_irec {
struct xfs_bmbt_irec b;
struct xfs_inobt_rec_incore i;
struct xfs_rmap_irec r;
+ struct xfs_refcount_irec rc;
+};
+
+/* Per-AG btree private information. */
+union xfs_btree_cur_private {
+ struct {
+ unsigned long nr_ops; /* # record updates */
+ int shape_changes; /* # of extent splits */
+ } refc;
};
/*
@@ -255,6 +258,7 @@ typedef struct xfs_btree_cur
struct xfs_buf *agbp; /* agf/agi buffer pointer */
struct xfs_defer_ops *dfops; /* deferred updates */
xfs_agnumber_t agno; /* ag number */
+ union xfs_btree_cur_private priv;
} a;
struct { /* needed for BMAP */
struct xfs_inode *ip; /* pointer to our inode */
@@ -513,6 +517,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
unsigned long len);
+xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits,
+ unsigned long long len);
/* return codes */
#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
@@ -529,4 +535,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
xfs_btree_visit_blocks_fn fn, void *data);
+int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index c221d0ecd52e..613c5cf19436 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -81,6 +81,10 @@
* - For each work item attached to the log intent item,
* * Perform the described action.
* * Attach the work item to the log done item.
+ * * If the result of doing the work was -EAGAIN, ->finish work
+ * wants a new transaction. See the "Requesting a Fresh
+ * Transaction while Finishing Deferred Work" section below for
+ * details.
*
* The key here is that we must log an intent item for all pending
* work items every time we roll the transaction, and that we must log
@@ -88,6 +92,34 @@
* we can perform complex remapping operations, chaining intent items
* as needed.
*
+ * Requesting a Fresh Transaction while Finishing Deferred Work
+ *
+ * If ->finish_item decides that it needs a fresh transaction to
+ * finish the work, it must ask its caller (xfs_defer_finish) for a
+ * continuation. The most likely cause of this circumstance are the
+ * refcount adjust functions deciding that they've logged enough items
+ * to be at risk of exceeding the transaction reservation.
+ *
+ * To get a fresh transaction, we want to log the existing log done
+ * item to prevent the log intent item from replaying, immediately log
+ * a new log intent item with the unfinished work items, roll the
+ * transaction, and re-call ->finish_item wherever it left off. The
+ * log done item and the new log intent item must be in the same
+ * transaction or atomicity cannot be guaranteed; defer_finish ensures
+ * that this happens.
+ *
+ * This requires some coordination between ->finish_item and
+ * defer_finish. Upon deciding to request a new transaction,
+ * ->finish_item should update the current work item to reflect the
+ * unfinished work. Next, it should reset the log done item's list
+ * count to the number of items finished, and return -EAGAIN.
+ * defer_finish sees the -EAGAIN, logs the new log intent item
+ * with the remaining work items, and leaves the xfs_defer_pending
+ * item at the head of the dop_work queue. Then it rolls the
+ * transaction and picks up processing where it left off. It is
+ * required that ->finish_item must be careful to leave enough
+ * transaction reservation to fit the new log intent item.
+ *
* This is an example of remapping the extent (E, E+B) into file X at
* offset A and dealing with the extent (C, C+B) already being mapped
* there:
@@ -104,21 +136,26 @@
* | Intent to add rmap (X, E, A, B) |
* +-------------------------------------------------+
* | Reduce refcount for extent (C, B) | t2
- * | Done reducing refcount for extent (C, B) |
+ * | Done reducing refcount for extent (C, 9) |
+ * | Intent to reduce refcount for extent (C+9, B-9) |
+ * | (ran out of space after 9 refcount updates) |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C+9, B+9) | t3
+ * | Done reducing refcount for extent (C+9, B-9) |
* | Increase refcount for extent (E, B) |
* | Done increasing refcount for extent (E, B) |
* | Intent to free extent (C, B) |
* | Intent to free extent (F, 1) (refcountbt block) |
* | Intent to remove rmap (F, 1, REFC) |
* +-------------------------------------------------+
- * | Remove rmap (X, C, A, B) | t3
+ * | Remove rmap (X, C, A, B) | t4
* | Done removing rmap (X, C, A, B) |
* | Add rmap (X, E, A, B) |
* | Done adding rmap (X, E, A, B) |
* | Remove rmap (F, 1, REFC) |
* | Done removing rmap (F, 1, REFC) |
* +-------------------------------------------------+
- * | Free extent (C, B) | t4
+ * | Free extent (C, B) | t5
* | Done freeing extent (C, B) |
* | Free extent (D, 1) |
* | Done freeing extent (D, 1) |
@@ -141,6 +178,9 @@
* - Intent to free extent (C, B)
* - Intent to free extent (F, 1) (refcountbt block)
* - Intent to remove rmap (F, 1, REFC)
+ *
+ * Note that the continuation requested between t2 and t3 is likely to
+ * reoccur.
*/
static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
@@ -323,7 +363,16 @@ xfs_defer_finish(
dfp->dfp_count--;
error = dfp->dfp_type->finish_item(*tp, dop, li,
dfp->dfp_done, &state);
- if (error) {
+ if (error == -EAGAIN) {
+ /*
+ * Caller wants a fresh transaction;
+ * put the work item back on the list
+ * and jump out.
+ */
+ list_add(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+ break;
+ } else if (error) {
/*
* Clean up after ourselves and jump out.
* xfs_defer_cancel will take care of freeing
@@ -335,9 +384,25 @@ xfs_defer_finish(
goto out;
}
}
- /* Done with the dfp, free it. */
- list_del(&dfp->dfp_list);
- kmem_free(dfp);
+ if (error == -EAGAIN) {
+ /*
+ * Caller wants a fresh transaction, so log a
+ * new log intent item to replace the old one
+ * and roll the transaction. See "Requesting
+ * a Fresh Transaction while Finishing
+ * Deferred Work" above.
+ */
+ dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
+ dfp->dfp_count);
+ dfp->dfp_done = NULL;
+ list_for_each(li, &dfp->dfp_work)
+ dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
+ li);
+ } else {
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
+ kmem_free(dfp);
+ }
if (cleanup_fn)
cleanup_fn(*tp, state, error);
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index e96533d178cf..f6e93ef0bffe 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -51,6 +51,8 @@ struct xfs_defer_pending {
* find all the space it needs.
*/
enum xfs_defer_ops_type {
+ XFS_DEFER_OPS_TYPE_BMAP,
+ XFS_DEFER_OPS_TYPE_REFCOUNT,
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
XFS_DEFER_OPS_TYPE_MAX,
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 3cc3cf767474..ac9a003dd29a 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -191,8 +191,7 @@ xfs_dquot_buf_verify_crc(
if (mp->m_quotainfo)
ndquots = mp->m_quotainfo->qi_dqperchunk;
else
- ndquots = xfs_calc_dquots_per_chunk(
- XFS_BB_TO_FSB(mp, bp->b_length));
+ ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
for (i = 0; i < ndquots; i++, d++) {
if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 270fb5cf4fa1..6b7579e7b60a 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -456,9 +456,11 @@ xfs_sb_has_compat_feature(
#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
+#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
- XFS_SB_FEAT_RO_COMPAT_RMAPBT)
+ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
+ XFS_SB_FEAT_RO_COMPAT_REFLINK)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
@@ -546,6 +548,12 @@ static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp)
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);
}
+static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
+}
+
/*
* end of superblock version macros
*/
@@ -641,14 +649,17 @@ typedef struct xfs_agf {
uuid_t agf_uuid; /* uuid of filesystem */
__be32 agf_rmap_blocks; /* rmapbt blocks used */
- __be32 agf_padding; /* padding */
+ __be32 agf_refcount_blocks; /* refcountbt blocks used */
+
+ __be32 agf_refcount_root; /* refcount tree root block */
+ __be32 agf_refcount_level; /* refcount btree levels */
/*
* reserve some contiguous space for future logged fields before we add
* the unlogged fields. This makes the range logging via flags and
* structure offsets much simpler.
*/
- __be64 agf_spare64[15];
+ __be64 agf_spare64[14];
/* unlogged fields, written during buffer writeback. */
__be64 agf_lsn; /* last write sequence */
@@ -674,8 +685,11 @@ typedef struct xfs_agf {
#define XFS_AGF_BTREEBLKS 0x00000800
#define XFS_AGF_UUID 0x00001000
#define XFS_AGF_RMAP_BLOCKS 0x00002000
-#define XFS_AGF_SPARE64 0x00004000
-#define XFS_AGF_NUM_BITS 15
+#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000
+#define XFS_AGF_REFCOUNT_ROOT 0x00008000
+#define XFS_AGF_REFCOUNT_LEVEL 0x00010000
+#define XFS_AGF_SPARE64 0x00020000
+#define XFS_AGF_NUM_BITS 18
#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
#define XFS_AGF_FLAGS \
@@ -693,6 +707,9 @@ typedef struct xfs_agf {
{ XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
{ XFS_AGF_UUID, "UUID" }, \
{ XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \
+ { XFS_AGF_REFCOUNT_BLOCKS, "REFCOUNT_BLOCKS" }, \
+ { XFS_AGF_REFCOUNT_ROOT, "REFCOUNT_ROOT" }, \
+ { XFS_AGF_REFCOUNT_LEVEL, "REFCOUNT_LEVEL" }, \
{ XFS_AGF_SPARE64, "SPARE64" }
/* disk block (xfs_daddr_t) in the AG */
@@ -848,7 +865,6 @@ typedef struct xfs_timestamp {
* padding field for v3 inodes.
*/
#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
typedef struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
__be16 di_mode; /* mode and type of file */
@@ -885,7 +901,8 @@ typedef struct xfs_dinode {
__be64 di_changecount; /* number of attribute changes */
__be64 di_lsn; /* flush sequence */
__be64 di_flags2; /* more random flags */
- __u8 di_pad2[16]; /* more padding for future expansion */
+ __be32 di_cowextsize; /* basic cow extent size for file */
+ __u8 di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
xfs_timestamp_t di_crtime; /* time created */
@@ -1041,9 +1058,14 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
* 16 bits of the XFS_XFLAG_s range.
*/
#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
+#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
+#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
-#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX)
+#define XFS_DIFLAG2_ANY \
+ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)
/*
* Inode number format:
@@ -1353,7 +1375,9 @@ struct xfs_owner_info {
#define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */
#define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */
#define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */
-#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */
+#define XFS_RMAP_OWN_REFC (-8ULL) /* refcount tree */
+#define XFS_RMAP_OWN_COW (-9ULL) /* cow allocations */
+#define XFS_RMAP_OWN_MIN (-10ULL) /* guard */
#define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63)))
@@ -1434,6 +1458,62 @@ typedef __be32 xfs_rmap_ptr_t;
XFS_IBT_BLOCK(mp) + 1)
/*
+ * Reference Count Btree format definitions
+ *
+ */
+#define XFS_REFC_CRC_MAGIC 0x52334643 /* 'R3FC' */
+
+unsigned int xfs_refc_block(struct xfs_mount *mp);
+
+/*
+ * Data record/key structure
+ *
+ * Each record associates a range of physical blocks (starting at
+ * rc_startblock and ending rc_blockcount blocks later) with a reference
+ * count (rc_refcount). Extents that are being used to stage a copy on
+ * write (CoW) operation are recorded in the refcount btree with a
+ * refcount of 1. All other records must have a refcount > 1 and must
+ * track an extent mapped only by file data forks.
+ *
+ * Extents with a single owner (attributes, metadata, non-shared file
+ * data) are not tracked here. Free space is also not tracked here.
+ * This is consistent with pre-reflink XFS.
+ */
+
+/*
+ * Extents that are being used to stage a copy on write are stored
+ * in the refcount btree with a refcount of 1 and the upper bit set
+ * on the startblock. This speeds up mount time deletion of stale
+ * staging extents because they're all at the right side of the tree.
+ */
+#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31))
+#define REFCNTBT_COWFLAG_BITLEN 1
+#define REFCNTBT_AGBLOCK_BITLEN 31
+
+struct xfs_refcount_rec {
+ __be32 rc_startblock; /* starting block number */
+ __be32 rc_blockcount; /* count of blocks */
+ __be32 rc_refcount; /* number of inodes linked here */
+};
+
+struct xfs_refcount_key {
+ __be32 rc_startblock; /* starting block number */
+};
+
+struct xfs_refcount_irec {
+ xfs_agblock_t rc_startblock; /* starting block number */
+ xfs_extlen_t rc_blockcount; /* count of free blocks */
+ xfs_nlink_t rc_refcount; /* number of inodes linked here */
+};
+
+#define MAXREFCOUNT ((xfs_nlink_t)~0U)
+#define MAXREFCEXTLEN ((xfs_extlen_t)~0U)
+
+/* btree pointer type */
+typedef __be32 xfs_refcount_ptr_t;
+
+
+/*
* BMAP Btree format definitions
*
* This includes both the root block definition that sits inside an inode fork
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 79455058b752..b72dc821d78b 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -81,14 +81,16 @@ struct getbmapx {
#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
+#define BMV_IF_COWFORK 0x20 /* return CoW fork rather than data */
#define BMV_IF_VALID \
(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
- BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
+ BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK)
/* bmv_oflags values - returned for each non-header segment */
#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
#define BMV_OF_LAST 0x4 /* segment is the last in the file */
+#define BMV_OF_SHARED 0x8 /* segment shared with another file */
/*
* Structure for XFS_IOC_FSSETDM.
@@ -206,7 +208,8 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
-#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* Reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_REFLINK 0x100000 /* files can share blocks */
/*
* Minimum and maximum sizes need for growth checks.
@@ -275,7 +278,8 @@ typedef struct xfs_bstat {
#define bs_projid bs_projid_lo /* (previously just bs_projid) */
__u16 bs_forkoff; /* inode fork offset in bytes */
__u16 bs_projid_hi; /* higher part of project id */
- unsigned char bs_pad[10]; /* pad space, unused */
+ unsigned char bs_pad[6]; /* pad space, unused */
+ __u32 bs_cowextsize; /* cow extent size */
__u32 bs_dmevmask; /* DMIG event mask */
__u16 bs_dmstate; /* DMIG state info */
__u16 bs_aextents; /* attribute number of extents */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 31ca2208c03d..eab68ae2e011 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -132,7 +132,7 @@ xfs_inobt_free_block(
xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
return xfs_free_extent(cur->bc_tp,
XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
- &oinfo);
+ &oinfo, XFS_AG_RESV_NONE);
}
STATIC int
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 4b9769e23c83..134424fac434 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -57,6 +57,17 @@ xfs_inobp_check(
}
#endif
+bool
+xfs_dinode_good_version(
+ struct xfs_mount *mp,
+ __u8 version)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return version == 3;
+
+ return version == 1 || version == 2;
+}
+
/*
* If we are doing readahead on an inode buffer, we might be in log recovery
* reading an inode allocation buffer that hasn't yet been replayed, and hence
@@ -91,7 +102,7 @@ xfs_inode_buf_verify(
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
- XFS_DINODE_GOOD_VERSION(dip->di_version);
+ xfs_dinode_good_version(mp, dip->di_version);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP,
XFS_RANDOM_ITOBP_INOTOBP))) {
@@ -256,6 +267,7 @@ xfs_inode_from_disk(
to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
to->di_flags2 = be64_to_cpu(from->di_flags2);
+ to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
}
}
@@ -305,7 +317,7 @@ xfs_inode_to_disk(
to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
to->di_flags2 = cpu_to_be64(from->di_flags2);
-
+ to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
to->di_lsn = cpu_to_be64(lsn);
memset(to->di_pad2, 0, sizeof(to->di_pad2));
@@ -357,6 +369,7 @@ xfs_log_dinode_to_disk(
to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
to->di_flags2 = cpu_to_be64(from->di_flags2);
+ to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(from->di_lsn);
memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
@@ -373,6 +386,9 @@ xfs_dinode_verify(
struct xfs_inode *ip,
struct xfs_dinode *dip)
{
+ uint16_t flags;
+ uint64_t flags2;
+
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
return false;
@@ -389,6 +405,23 @@ xfs_dinode_verify(
return false;
if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
return false;
+
+ flags = be16_to_cpu(dip->di_flags);
+ flags2 = be64_to_cpu(dip->di_flags2);
+
+ /* don't allow reflink/cowextsize if we don't have reflink */
+ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) &&
+ !xfs_sb_version_hasreflink(&mp->m_sb))
+ return false;
+
+ /* don't let reflink and realtime mix */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+ return false;
+
+ /* don't let reflink and dax mix */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX))
+ return false;
+
return true;
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 7c4dd321b215..3cfe12a4f58a 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -47,6 +47,7 @@ struct xfs_icdinode {
__uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
__uint64_t di_flags2; /* more random flags */
+ __uint32_t di_cowextsize; /* basic cow extent size for file */
xfs_ictimestamp_t di_crtime; /* time created */
};
@@ -73,6 +74,8 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
struct xfs_dinode *to);
+bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version);
+
#if defined(DEBUG)
void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#else
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index bbcc8c7a44b3..5dd56d3dbb3a 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -121,6 +121,26 @@ xfs_iformat_fork(
return -EFSCORRUPTED;
}
+ if (unlikely(xfs_is_reflink_inode(ip) &&
+ (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) {
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %llu, wrong file type for reflink.",
+ ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat(reflink)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+ return -EFSCORRUPTED;
+ }
+
+ if (unlikely(xfs_is_reflink_inode(ip) &&
+ (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) {
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %llu, has reflink+realtime flag set.",
+ ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat(reflink)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+ return -EFSCORRUPTED;
+ }
+
switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFIFO:
case S_IFCHR:
@@ -186,9 +206,14 @@ xfs_iformat_fork(
XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
return -EFSCORRUPTED;
}
- if (error) {
+ if (error)
return error;
+
+ if (xfs_is_reflink_inode(ip)) {
+ ASSERT(ip->i_cowfp == NULL);
+ xfs_ifork_init_cow(ip);
}
+
if (!XFS_DFORK_Q(dip))
return 0;
@@ -208,7 +233,8 @@ xfs_iformat_fork(
XFS_CORRUPTION_ERROR("xfs_iformat(8)",
XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
- return -EFSCORRUPTED;
+ error = -EFSCORRUPTED;
+ break;
}
error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
@@ -226,6 +252,9 @@ xfs_iformat_fork(
if (error) {
kmem_zone_free(xfs_ifork_zone, ip->i_afp);
ip->i_afp = NULL;
+ if (ip->i_cowfp)
+ kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
+ ip->i_cowfp = NULL;
xfs_idestroy_fork(ip, XFS_DATA_FORK);
}
return error;
@@ -740,6 +769,9 @@ xfs_idestroy_fork(
if (whichfork == XFS_ATTR_FORK) {
kmem_zone_free(xfs_ifork_zone, ip->i_afp);
ip->i_afp = NULL;
+ } else if (whichfork == XFS_COW_FORK) {
+ kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
+ ip->i_cowfp = NULL;
}
}
@@ -927,6 +959,19 @@ xfs_iext_get_ext(
}
}
+/* Convert bmap state flags to an inode fork. */
+struct xfs_ifork *
+xfs_iext_state_to_fork(
+ struct xfs_inode *ip,
+ int state)
+{
+ if (state & BMAP_COWFORK)
+ return ip->i_cowfp;
+ else if (state & BMAP_ATTRFORK)
+ return ip->i_afp;
+ return &ip->i_df;
+}
+
/*
* Insert new item(s) into the extent records for incore inode
* fork 'ifp'. 'count' new items are inserted at index 'idx'.
@@ -939,7 +984,7 @@ xfs_iext_insert(
xfs_bmbt_irec_t *new, /* items to insert */
int state) /* type of extent conversion */
{
- xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+ xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state);
xfs_extnum_t i; /* extent record index */
trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
@@ -1189,7 +1234,7 @@ xfs_iext_remove(
int ext_diff, /* number of extents to remove */
int state) /* type of extent conversion */
{
- xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+ xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state);
xfs_extnum_t nextents; /* number of extents in file */
int new_size; /* size of extents after removal */
@@ -1934,3 +1979,20 @@ xfs_iext_irec_update_extoffs(
ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
}
}
+
+/*
+ * Initialize an inode's copy-on-write fork.
+ */
+void
+xfs_ifork_init_cow(
+ struct xfs_inode *ip)
+{
+ if (ip->i_cowfp)
+ return;
+
+ ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
+ KM_SLEEP | KM_NOFS);
+ ip->i_cowfp->if_flags = XFS_IFEXTENTS;
+ ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
+ ip->i_cnextents = 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index f95e072ae646..c9476f50e32d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -92,7 +92,9 @@ typedef struct xfs_ifork {
#define XFS_IFORK_PTR(ip,w) \
((w) == XFS_DATA_FORK ? \
&(ip)->i_df : \
- (ip)->i_afp)
+ ((w) == XFS_ATTR_FORK ? \
+ (ip)->i_afp : \
+ (ip)->i_cowfp))
#define XFS_IFORK_DSIZE(ip) \
(XFS_IFORK_Q(ip) ? \
XFS_IFORK_BOFF(ip) : \
@@ -105,26 +107,38 @@ typedef struct xfs_ifork {
#define XFS_IFORK_SIZE(ip,w) \
((w) == XFS_DATA_FORK ? \
XFS_IFORK_DSIZE(ip) : \
- XFS_IFORK_ASIZE(ip))
+ ((w) == XFS_ATTR_FORK ? \
+ XFS_IFORK_ASIZE(ip) : \
+ 0))
#define XFS_IFORK_FORMAT(ip,w) \
((w) == XFS_DATA_FORK ? \
(ip)->i_d.di_format : \
- (ip)->i_d.di_aformat)
+ ((w) == XFS_ATTR_FORK ? \
+ (ip)->i_d.di_aformat : \
+ (ip)->i_cformat))
#define XFS_IFORK_FMT_SET(ip,w,n) \
((w) == XFS_DATA_FORK ? \
((ip)->i_d.di_format = (n)) : \
- ((ip)->i_d.di_aformat = (n)))
+ ((w) == XFS_ATTR_FORK ? \
+ ((ip)->i_d.di_aformat = (n)) : \
+ ((ip)->i_cformat = (n))))
#define XFS_IFORK_NEXTENTS(ip,w) \
((w) == XFS_DATA_FORK ? \
(ip)->i_d.di_nextents : \
- (ip)->i_d.di_anextents)
+ ((w) == XFS_ATTR_FORK ? \
+ (ip)->i_d.di_anextents : \
+ (ip)->i_cnextents))
#define XFS_IFORK_NEXT_SET(ip,w,n) \
((w) == XFS_DATA_FORK ? \
((ip)->i_d.di_nextents = (n)) : \
- ((ip)->i_d.di_anextents = (n)))
+ ((w) == XFS_ATTR_FORK ? \
+ ((ip)->i_d.di_anextents = (n)) : \
+ ((ip)->i_cnextents = (n))))
#define XFS_IFORK_MAXEXT(ip, w) \
(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
+
int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
struct xfs_inode_log_item *, int);
@@ -169,4 +183,6 @@ void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
extern struct kmem_zone *xfs_ifork_zone;
+extern void xfs_ifork_init_cow(struct xfs_inode *ip);
+
#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index a6eed43fa7cd..083cdd6d6c28 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -112,7 +112,11 @@ static inline uint xlog_get_cycle(char *ptr)
#define XLOG_REG_TYPE_ICREATE 20
#define XLOG_REG_TYPE_RUI_FORMAT 21
#define XLOG_REG_TYPE_RUD_FORMAT 22
-#define XLOG_REG_TYPE_MAX 22
+#define XLOG_REG_TYPE_CUI_FORMAT 23
+#define XLOG_REG_TYPE_CUD_FORMAT 24
+#define XLOG_REG_TYPE_BUI_FORMAT 25
+#define XLOG_REG_TYPE_BUD_FORMAT 26
+#define XLOG_REG_TYPE_MAX 26
/*
* Flags to log operation header
@@ -231,6 +235,10 @@ typedef struct xfs_trans_header {
#define XFS_LI_ICREATE 0x123f
#define XFS_LI_RUI 0x1240 /* rmap update intent */
#define XFS_LI_RUD 0x1241
+#define XFS_LI_CUI 0x1242 /* refcount update intent */
+#define XFS_LI_CUD 0x1243
+#define XFS_LI_BUI 0x1244 /* bmbt update intent */
+#define XFS_LI_BUD 0x1245
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -242,7 +250,11 @@ typedef struct xfs_trans_header {
{ XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \
{ XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \
{ XFS_LI_RUI, "XFS_LI_RUI" }, \
- { XFS_LI_RUD, "XFS_LI_RUD" }
+ { XFS_LI_RUD, "XFS_LI_RUD" }, \
+ { XFS_LI_CUI, "XFS_LI_CUI" }, \
+ { XFS_LI_CUD, "XFS_LI_CUD" }, \
+ { XFS_LI_BUI, "XFS_LI_BUI" }, \
+ { XFS_LI_BUD, "XFS_LI_BUD" }
/*
* Inode Log Item Format definitions.
@@ -411,7 +423,8 @@ struct xfs_log_dinode {
__uint64_t di_changecount; /* number of attribute changes */
xfs_lsn_t di_lsn; /* flush sequence */
__uint64_t di_flags2; /* more random flags */
- __uint8_t di_pad2[16]; /* more padding for future expansion */
+ __uint32_t di_cowextsize; /* basic cow extent size for file */
+ __uint8_t di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
xfs_ictimestamp_t di_crtime; /* time created */
@@ -622,8 +635,11 @@ struct xfs_map_extent {
/* rmap me_flags: upper bits are flags, lower byte is type code */
#define XFS_RMAP_EXTENT_MAP 1
+#define XFS_RMAP_EXTENT_MAP_SHARED 2
#define XFS_RMAP_EXTENT_UNMAP 3
+#define XFS_RMAP_EXTENT_UNMAP_SHARED 4
#define XFS_RMAP_EXTENT_CONVERT 5
+#define XFS_RMAP_EXTENT_CONVERT_SHARED 6
#define XFS_RMAP_EXTENT_ALLOC 7
#define XFS_RMAP_EXTENT_FREE 8
#define XFS_RMAP_EXTENT_TYPE_MASK 0xFF
@@ -647,9 +663,17 @@ struct xfs_rui_log_format {
__uint16_t rui_size; /* size of this item */
__uint32_t rui_nextents; /* # extents to free */
__uint64_t rui_id; /* rui identifier */
- struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */
+ struct xfs_map_extent rui_extents[]; /* array of extents to rmap */
};
+static inline size_t
+xfs_rui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_rui_log_format) +
+ nr * sizeof(struct xfs_map_extent);
+}
+
/*
* This is the structure used to lay out an rud log item in the
* log. The rud_extents array is a variable size array whose
@@ -663,6 +687,102 @@ struct xfs_rud_log_format {
};
/*
+ * CUI/CUD (refcount update) log format definitions
+ */
+struct xfs_phys_extent {
+ __uint64_t pe_startblock;
+ __uint32_t pe_len;
+ __uint32_t pe_flags;
+};
+
+/* refcount pe_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_refcount_intent_type. */
+#define XFS_REFCOUNT_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_REFCOUNT_EXTENT_FLAGS (XFS_REFCOUNT_EXTENT_TYPE_MASK)
+
+/*
+ * This is the structure used to lay out a cui log item in the
+ * log. The cui_extents field is a variable size array whose
+ * size is given by cui_nextents.
+ */
+struct xfs_cui_log_format {
+ __uint16_t cui_type; /* cui log item type */
+ __uint16_t cui_size; /* size of this item */
+ __uint32_t cui_nextents; /* # extents to free */
+ __uint64_t cui_id; /* cui identifier */
+ struct xfs_phys_extent cui_extents[]; /* array of extents */
+};
+
+static inline size_t
+xfs_cui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_cui_log_format) +
+ nr * sizeof(struct xfs_phys_extent);
+}
+
+/*
+ * This is the structure used to lay out a cud log item in the
+ * log. The cud_extents array is a variable size array whose
+ * size is given by cud_nextents;
+ */
+struct xfs_cud_log_format {
+ __uint16_t cud_type; /* cud log item type */
+ __uint16_t cud_size; /* size of this item */
+ __uint32_t __pad;
+ __uint64_t cud_cui_id; /* id of corresponding cui */
+};
+
+/*
+ * BUI/BUD (inode block mapping) log format definitions
+ */
+
+/* bmbt me_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_bmap_intent_type. */
+#define XFS_BMAP_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31)
+#define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30)
+
+#define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \
+ XFS_BMAP_EXTENT_ATTR_FORK | \
+ XFS_BMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out an bui log item in the
+ * log. The bui_extents field is a variable size array whose
+ * size is given by bui_nextents.
+ */
+struct xfs_bui_log_format {
+ __uint16_t bui_type; /* bui log item type */
+ __uint16_t bui_size; /* size of this item */
+ __uint32_t bui_nextents; /* # extents to free */
+ __uint64_t bui_id; /* bui identifier */
+ struct xfs_map_extent bui_extents[]; /* array of extents to bmap */
+};
+
+static inline size_t
+xfs_bui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_bui_log_format) +
+ nr * sizeof(struct xfs_map_extent);
+}
+
+/*
+ * This is the structure used to lay out an bud log item in the
+ * log. The bud_extents array is a variable size array whose
+ * size is given by bud_nextents;
+ */
+struct xfs_bud_log_format {
+ __uint16_t bud_type; /* bud log item type */
+ __uint16_t bud_size; /* size of this item */
+ __uint32_t __pad;
+ __uint64_t bud_bui_id; /* id of corresponding bui */
+};
+
+/*
* Dquot Log format definitions.
*
* The first two fields must be the type and size fitting into
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
new file mode 100644
index 000000000000..b177ef33cd4c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -0,0 +1,1698 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_refcount.h"
+#include "xfs_rmap.h"
+
+/* Allowable refcount adjustment amounts. */
+enum xfs_refc_adjust_op {
+ XFS_REFCOUNT_ADJUST_INCREASE = 1,
+ XFS_REFCOUNT_ADJUST_DECREASE = -1,
+ XFS_REFCOUNT_ADJUST_COW_ALLOC = 0,
+ XFS_REFCOUNT_ADJUST_COW_FREE = -1,
+};
+
+STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen,
+ struct xfs_defer_ops *dfops);
+STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen,
+ struct xfs_defer_ops *dfops);
+
+/*
+ * Look up the first record less than or equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_le(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ XFS_LOOKUP_LE);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Look up the first record greater than or equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_ge(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ XFS_LOOKUP_GE);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/* Convert on-disk record to in-core format. */
+static inline void
+xfs_refcount_btrec_to_irec(
+ union xfs_btree_rec *rec,
+ struct xfs_refcount_irec *irec)
+{
+ irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock);
+ irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount);
+ irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_refcount_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec,
+ int *stat)
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (!error && *stat == 1) {
+ xfs_refcount_btrec_to_irec(rec, irec);
+ trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno,
+ irec);
+ }
+ return error;
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len, refcount].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_update(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec)
+{
+ union xfs_btree_rec rec;
+ int error;
+
+ trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec);
+ rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock);
+ rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
+ rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
+ error = xfs_btree_update(cur, &rec);
+ if (error)
+ trace_xfs_refcount_update_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Insert the record referred to by cur to the value given
+ * by [bno, len, refcount].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_insert(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec,
+ int *i)
+{
+ int error;
+
+ trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec);
+ cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
+ cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
+ cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
+ error = xfs_btree_insert(cur, i);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+out_error:
+ if (error)
+ trace_xfs_refcount_insert_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remove the record referred to by cur, then set the pointer to the spot
+ * where the record could be re-inserted, in case we want to increment or
+ * decrement the cursor.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_delete(
+ struct xfs_btree_cur *cur,
+ int *i)
+{
+ struct xfs_refcount_irec irec;
+ int found_rec;
+ int error;
+
+ error = xfs_refcount_get_rec(cur, &irec, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec);
+ error = xfs_btree_delete(cur, i);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+ if (error)
+ goto out_error;
+ error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec);
+out_error:
+ if (error)
+ trace_xfs_refcount_delete_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Adjusting the Reference Count
+ *
+ * As stated elsewhere, the reference count btree (refcbt) stores
+ * >1 reference counts for extents of physical blocks. In this
+ * operation, we're either raising or lowering the reference count of
+ * some subrange stored in the tree:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+-----+ +--+--------+---------
+ * 2 | | 3 | 4 | |17| 55 | 10
+ * ----+ +---+-----+ +--+--------+---------
+ * X axis is physical blocks number;
+ * reference counts are the numbers inside the rectangles
+ *
+ * The first thing we need to do is to ensure that there are no
+ * refcount extents crossing either boundary of the range to be
+ * adjusted. For any extent that does cross a boundary, split it into
+ * two extents so that we can increment the refcount of one of the
+ * pieces later:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+-----+ +--+--------+----+----
+ * 2 | | 3 | 2 | |17| 55 | 10 | 10
+ * ----+ +---+-----+ +--+--------+----+----
+ *
+ * For this next step, let's assume that all the physical blocks in
+ * the adjustment range are mapped to a file and are therefore in use
+ * at least once. Therefore, we can infer that any gap in the
+ * refcount tree within the adjustment range represents a physical
+ * extent with refcount == 1:
+ *
+ * <------ adjustment range ------>
+ * ----+---+---+-----+-+--+--------+----+----
+ * 2 |"1"| 3 | 2 |1|17| 55 | 10 | 10
+ * ----+---+---+-----+-+--+--------+----+----
+ * ^
+ *
+ * For each extent that falls within the interval range, figure out
+ * which extent is to the left or the right of that extent. Now we
+ * have a left, current, and right extent. If the new reference count
+ * of the center extent enables us to merge left, center, and right
+ * into one record covering all three, do so. If the center extent is
+ * at the left end of the range, abuts the left extent, and its new
+ * reference count matches the left extent's record, then merge them.
+ * If the center extent is at the right end of the range, abuts the
+ * right extent, and the reference counts match, merge those. In the
+ * example, we can left merge (assuming an increment operation):
+ *
+ * <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ * 2 | 3 | 2 |1|17| 55 | 10 | 10
+ * --------+---+-----+-+--+--------+----+----
+ * ^
+ *
+ * For all other extents within the range, adjust the reference count
+ * or delete it if the refcount falls below 2. If we were
+ * incrementing, the end result looks like this:
+ *
+ * <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ * 2 | 4 | 3 |2|18| 56 | 11 | 10
+ * --------+---+-----+-+--+--------+----+----
+ *
+ * The result of a decrement operation looks as such:
+ *
+ * <------ adjustment range ------>
+ * ----+ +---+ +--+--------+----+----
+ * 2 | | 2 | |16| 54 | 9 | 10
+ * ----+ +---+ +--+--------+----+----
+ * DDDD 111111DD
+ *
+ * The blocks marked "D" are freed; the blocks marked "1" are only
+ * referenced once and therefore the record is removed from the
+ * refcount btree.
+ */
+
+/* Next block after this extent. */
+static inline xfs_agblock_t
+xfs_refc_next(
+ struct xfs_refcount_irec *rc)
+{
+ return rc->rc_startblock + rc->rc_blockcount;
+}
+
+/*
+ * Split a refcount extent that crosses agbno.
+ */
+STATIC int
+xfs_refcount_split_extent(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ bool *shape_changed)
+{
+ struct xfs_refcount_irec rcext, tmp;
+ int found_rec;
+ int error;
+
+ *shape_changed = false;
+ error = xfs_refcount_lookup_le(cur, agbno, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &rcext, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
+ return 0;
+
+ *shape_changed = true;
+ trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno,
+ &rcext, agbno);
+
+ /* Establish the right extent. */
+ tmp = rcext;
+ tmp.rc_startblock = agbno;
+ tmp.rc_blockcount -= (agbno - rcext.rc_startblock);
+ error = xfs_refcount_update(cur, &tmp);
+ if (error)
+ goto out_error;
+
+ /* Insert the left extent. */
+ tmp = rcext;
+ tmp.rc_blockcount = agbno - rcext.rc_startblock;
+ error = xfs_refcount_insert(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ return error;
+
+out_error:
+ trace_xfs_refcount_split_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge the left, center, and right extents.
+ */
+STATIC int
+xfs_refcount_merge_center_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *center,
+ struct xfs_refcount_irec *right,
+ unsigned long long extlen,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_center_extents(cur->bc_mp,
+ cur->bc_private.a.agno, left, center, right);
+
+ /*
+ * Make sure the center and right extents are not in the btree.
+ * If the center extent was synthesized, the first delete call
+ * removes the right extent and we skip the second deletion.
+ * If center and right were in the btree, then the first delete
+ * call removes the center and the second one removes the right
+ * extent.
+ */
+ error = xfs_refcount_lookup_ge(cur, center->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ if (center->rc_refcount > 1) {
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+ }
+
+ /* Enlarge the left extent. */
+ error = xfs_refcount_lookup_le(cur, left->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ left->rc_blockcount = extlen;
+ error = xfs_refcount_update(cur, left);
+ if (error)
+ goto out_error;
+
+ *aglen = 0;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge with the left extent.
+ */
+STATIC int
+xfs_refcount_merge_left_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *cleft,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_left_extent(cur->bc_mp,
+ cur->bc_private.a.agno, left, cleft);
+
+ /* If the extent at agbno (cleft) wasn't synthesized, remove it. */
+ if (cleft->rc_refcount > 1) {
+ error = xfs_refcount_lookup_le(cur, cleft->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+ }
+
+ /* Enlarge the left extent. */
+ error = xfs_refcount_lookup_le(cur, left->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ left->rc_blockcount += cleft->rc_blockcount;
+ error = xfs_refcount_update(cur, left);
+ if (error)
+ goto out_error;
+
+ *agbno += cleft->rc_blockcount;
+ *aglen -= cleft->rc_blockcount;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Merge with the right extent.
+ */
+STATIC int
+xfs_refcount_merge_right_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *right,
+ struct xfs_refcount_irec *cright,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen)
+{
+ int error;
+ int found_rec;
+
+ trace_xfs_refcount_merge_right_extent(cur->bc_mp,
+ cur->bc_private.a.agno, cright, right);
+
+ /*
+ * If the extent ending at agbno+aglen (cright) wasn't synthesized,
+ * remove it.
+ */
+ if (cright->rc_refcount > 1) {
+ error = xfs_refcount_lookup_le(cur, cright->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+ }
+
+ /* Enlarge the right extent. */
+ error = xfs_refcount_lookup_le(cur, right->rc_startblock,
+ &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ right->rc_startblock -= cright->rc_blockcount;
+ right->rc_blockcount += cright->rc_blockcount;
+ error = xfs_refcount_update(cur, right);
+ if (error)
+ goto out_error;
+
+ *aglen -= cright->rc_blockcount;
+ return error;
+
+out_error:
+ trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+#define XFS_FIND_RCEXT_SHARED 1
+#define XFS_FIND_RCEXT_COW 2
+/*
+ * Find the left extent and the one after it (cleft). This function assumes
+ * that we've already split any extent crossing agbno.
+ */
+STATIC int
+xfs_refcount_find_left_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *left,
+ struct xfs_refcount_irec *cleft,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ int flags)
+{
+ struct xfs_refcount_irec tmp;
+ int error;
+ int found_rec;
+
+ left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK;
+ error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ if (xfs_refc_next(&tmp) != agbno)
+ return 0;
+ if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+ return 0;
+ if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+ return 0;
+ /* We have a left extent; retrieve (or invent) the next right one */
+ *left = tmp;
+
+ error = xfs_btree_increment(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+ if (found_rec) {
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+
+ /* if tmp starts at the end of our range, just use that */
+ if (tmp.rc_startblock == agbno)
+ *cleft = tmp;
+ else {
+ /*
+ * There's a gap in the refcntbt at the start of the
+ * range we're interested in (refcount == 1) so
+ * synthesize the implied extent and pass it back.
+ * We assume here that the agbno/aglen range was
+ * passed in from a data fork extent mapping and
+ * therefore is allocated to exactly one owner.
+ */
+ cleft->rc_startblock = agbno;
+ cleft->rc_blockcount = min(aglen,
+ tmp.rc_startblock - agbno);
+ cleft->rc_refcount = 1;
+ }
+ } else {
+ /*
+ * No extents, so pretend that there's one covering the whole
+ * range.
+ */
+ cleft->rc_startblock = agbno;
+ cleft->rc_blockcount = aglen;
+ cleft->rc_refcount = 1;
+ }
+ trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno,
+ left, cleft, agbno);
+ return error;
+
+out_error:
+ trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Find the right extent and the one before it (cright). This function
+ * assumes that we've already split any extents crossing agbno + aglen.
+ */
+STATIC int
+xfs_refcount_find_right_extents(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *right,
+ struct xfs_refcount_irec *cright,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ int flags)
+{
+ struct xfs_refcount_irec tmp;
+ int error;
+ int found_rec;
+
+ right->rc_startblock = cright->rc_startblock = NULLAGBLOCK;
+ error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec)
+ return 0;
+
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+ if (tmp.rc_startblock != agbno + aglen)
+ return 0;
+ if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+ return 0;
+ if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+ return 0;
+ /* We have a right extent; retrieve (or invent) the next left one */
+ *right = tmp;
+
+ error = xfs_btree_decrement(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+ if (found_rec) {
+ error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+ out_error);
+
+ /* if tmp ends at the end of our range, just use that */
+ if (xfs_refc_next(&tmp) == agbno + aglen)
+ *cright = tmp;
+ else {
+ /*
+ * There's a gap in the refcntbt at the end of the
+ * range we're interested in (refcount == 1) so
+ * create the implied extent and pass it back.
+ * We assume here that the agbno/aglen range was
+ * passed in from a data fork extent mapping and
+ * therefore is allocated to exactly one owner.
+ */
+ cright->rc_startblock = max(agbno, xfs_refc_next(&tmp));
+ cright->rc_blockcount = right->rc_startblock -
+ cright->rc_startblock;
+ cright->rc_refcount = 1;
+ }
+ } else {
+ /*
+ * No extents, so pretend that there's one covering the whole
+ * range.
+ */
+ cright->rc_startblock = agbno;
+ cright->rc_blockcount = aglen;
+ cright->rc_refcount = 1;
+ }
+ trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno,
+ cright, right, agbno + aglen);
+ return error;
+
+out_error:
+ trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/* Is this extent valid? */
+static inline bool
+xfs_refc_valid(
+ struct xfs_refcount_irec *rc)
+{
+ return rc->rc_startblock != NULLAGBLOCK;
+}
+
+/*
+ * Try to merge with any extents on the boundaries of the adjustment range.
+ */
+STATIC int
+xfs_refcount_merge_extents(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen,
+ enum xfs_refc_adjust_op adjust,
+ int flags,
+ bool *shape_changed)
+{
+ struct xfs_refcount_irec left = {0}, cleft = {0};
+ struct xfs_refcount_irec cright = {0}, right = {0};
+ int error;
+ unsigned long long ulen;
+ bool cequal;
+
+ *shape_changed = false;
+ /*
+ * Find the extent just below agbno [left], just above agbno [cleft],
+ * just below (agbno + aglen) [cright], and just above (agbno + aglen)
+ * [right].
+ */
+ error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno,
+ *aglen, flags);
+ if (error)
+ return error;
+ error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno,
+ *aglen, flags);
+ if (error)
+ return error;
+
+ /* No left or right extent to merge; exit. */
+ if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right))
+ return 0;
+
+ cequal = (cleft.rc_startblock == cright.rc_startblock) &&
+ (cleft.rc_blockcount == cright.rc_blockcount);
+
+ /* Try to merge left, cleft, and right. cleft must == cright. */
+ ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
+ right.rc_blockcount;
+ if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
+ xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
+ left.rc_refcount == cleft.rc_refcount + adjust &&
+ right.rc_refcount == cleft.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ return xfs_refcount_merge_center_extents(cur, &left, &cleft,
+ &right, ulen, agbno, aglen);
+ }
+
+ /* Try to merge left and cleft. */
+ ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
+ if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
+ left.rc_refcount == cleft.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
+ agbno, aglen);
+ if (error)
+ return error;
+
+ /*
+ * If we just merged left + cleft and cleft == cright,
+ * we no longer have a cright to merge with right. We're done.
+ */
+ if (cequal)
+ return 0;
+ }
+
+ /* Try to merge cright and right. */
+ ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
+ if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
+ right.rc_refcount == cright.rc_refcount + adjust &&
+ ulen < MAXREFCEXTLEN) {
+ *shape_changed = true;
+ return xfs_refcount_merge_right_extent(cur, &right, &cright,
+ agbno, aglen);
+ }
+
+ return error;
+}
+
+/*
+ * While we're adjusting the refcounts records of an extent, we have
+ * to keep an eye on the number of extents we're dirtying -- run too
+ * many in a single transaction and we'll exceed the transaction's
+ * reservation and crash the fs. Each record adds 12 bytes to the
+ * log (plus any key updates) so we'll conservatively assume 24 bytes
+ * per record. We must also leave space for btree splits on both ends
+ * of the range and space for the CUD and a new CUI.
+ *
+ * XXX: This is a pretty hand-wavy estimate. The penalty for guessing
+ * true incorrectly is a shutdown FS; the penalty for guessing false
+ * incorrectly is more transaction rolls than might be necessary.
+ * Be conservative here.
+ */
+static bool
+xfs_refcount_still_have_space(
+ struct xfs_btree_cur *cur)
+{
+ unsigned long overhead;
+
+ overhead = cur->bc_private.a.priv.refc.shape_changes *
+ xfs_allocfree_log_count(cur->bc_mp, 1);
+ overhead *= cur->bc_mp->m_sb.sb_blocksize;
+
+ /*
+ * Only allow 2 refcount extent updates per transaction if the
+ * refcount continue update "error" has been injected.
+ */
+ if (cur->bc_private.a.priv.refc.nr_ops > 2 &&
+ XFS_TEST_ERROR(false, cur->bc_mp,
+ XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE,
+ XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE))
+ return false;
+
+ if (cur->bc_private.a.priv.refc.nr_ops == 0)
+ return true;
+ else if (overhead > cur->bc_tp->t_log_res)
+ return false;
+ return cur->bc_tp->t_log_res - overhead >
+ cur->bc_private.a.priv.refc.nr_ops * 32;
+}
+
+/*
+ * Adjust the refcounts of middle extents. At this point we should have
+ * split extents that crossed the adjustment range; merged with adjacent
+ * extents; and updated agbno/aglen to reflect the merges. Therefore,
+ * all we have to do is update the extents inside [agbno, agbno + aglen].
+ */
+STATIC int
+xfs_refcount_adjust_extents(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen,
+ enum xfs_refc_adjust_op adj,
+ struct xfs_defer_ops *dfops,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_refcount_irec ext, tmp;
+ int error;
+ int found_rec, found_tmp;
+ xfs_fsblock_t fsbno;
+
+ /* Merging did all the work already. */
+ if (*aglen == 0)
+ return 0;
+
+ error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec);
+ if (error)
+ goto out_error;
+
+ while (*aglen > 0 && xfs_refcount_still_have_space(cur)) {
+ error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec) {
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_blockcount = 0;
+ ext.rc_refcount = 0;
+ }
+
+ /*
+ * Deal with a hole in the refcount tree; if a file maps to
+ * these blocks and there's no refcountbt record, pretend that
+ * there is one with refcount == 1.
+ */
+ if (ext.rc_startblock != *agbno) {
+ tmp.rc_startblock = *agbno;
+ tmp.rc_blockcount = min(*aglen,
+ ext.rc_startblock - *agbno);
+ tmp.rc_refcount = 1 + adj;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_private.a.agno, &tmp);
+
+ /*
+ * Either cover the hole (increment) or
+ * delete the range (decrement).
+ */
+ if (tmp.rc_refcount) {
+ error = xfs_refcount_insert(cur, &tmp,
+ &found_tmp);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ found_tmp == 1, out_error);
+ cur->bc_private.a.priv.refc.nr_ops++;
+ } else {
+ fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+ cur->bc_private.a.agno,
+ tmp.rc_startblock);
+ xfs_bmap_add_free(cur->bc_mp, dfops, fsbno,
+ tmp.rc_blockcount, oinfo);
+ }
+
+ (*agbno) += tmp.rc_blockcount;
+ (*aglen) -= tmp.rc_blockcount;
+
+ error = xfs_refcount_lookup_ge(cur, *agbno,
+ &found_rec);
+ if (error)
+ goto out_error;
+ }
+
+ /* Stop if there's nothing left to modify */
+ if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
+ break;
+
+ /*
+ * Adjust the reference count and either update the tree
+ * (incr) or free the blocks (decr).
+ */
+ if (ext.rc_refcount == MAXREFCOUNT)
+ goto skip;
+ ext.rc_refcount += adj;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_private.a.agno, &ext);
+ if (ext.rc_refcount > 1) {
+ error = xfs_refcount_update(cur, &ext);
+ if (error)
+ goto out_error;
+ cur->bc_private.a.priv.refc.nr_ops++;
+ } else if (ext.rc_refcount == 1) {
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ found_rec == 1, out_error);
+ cur->bc_private.a.priv.refc.nr_ops++;
+ goto advloop;
+ } else {
+ fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+ cur->bc_private.a.agno,
+ ext.rc_startblock);
+ xfs_bmap_add_free(cur->bc_mp, dfops, fsbno,
+ ext.rc_blockcount, oinfo);
+ }
+
+skip:
+ error = xfs_btree_increment(cur, 0, &found_rec);
+ if (error)
+ goto out_error;
+
+advloop:
+ (*agbno) += ext.rc_blockcount;
+ (*aglen) -= ext.rc_blockcount;
+ }
+
+ return error;
+out_error:
+ trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/* Adjust the reference count of a range of AG blocks. */
+STATIC int
+xfs_refcount_adjust(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ xfs_agblock_t *new_agbno,
+ xfs_extlen_t *new_aglen,
+ enum xfs_refc_adjust_op adj,
+ struct xfs_defer_ops *dfops,
+ struct xfs_owner_info *oinfo)
+{
+ bool shape_changed;
+ int shape_changes = 0;
+ int error;
+
+ *new_agbno = agbno;
+ *new_aglen = aglen;
+ if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
+ trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno,
+ agbno, aglen);
+ else
+ trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno,
+ agbno, aglen);
+
+ /*
+ * Ensure that no rcextents cross the boundary of the adjustment range.
+ */
+ error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+
+ error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+
+ /*
+ * Try to merge with the left or right extents of the range.
+ */
+ error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj,
+ XFS_FIND_RCEXT_SHARED, &shape_changed);
+ if (error)
+ goto out_error;
+ if (shape_changed)
+ shape_changes++;
+ if (shape_changes)
+ cur->bc_private.a.priv.refc.shape_changes++;
+
+ /* Now that we've taken care of the ends, adjust the middle extents */
+ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen,
+ adj, dfops, oinfo);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/* Clean up after calling xfs_refcount_finish_one. */
+void
+xfs_refcount_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
+{
+ struct xfs_buf *agbp;
+
+ if (rcur == NULL)
+ return;
+ agbp = rcur->bc_private.a.agbp;
+ xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ if (error)
+ xfs_trans_brelse(tp, agbp);
+}
+
+/*
+ * Process one of the deferred refcount operations. We pass back the
+ * btree cursor to maintain our lock on the btree between calls.
+ * This saves time and eliminates a buffer deadlock between the
+ * superblock and the AGF because we'll always grab them in the same
+ * order.
+ */
+int
+xfs_refcount_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dfops,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount,
+ xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *rcur;
+ struct xfs_buf *agbp = NULL;
+ int error = 0;
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+ xfs_agblock_t new_agbno;
+ unsigned long nr_ops = 0;
+ int shape_changes = 0;
+
+ agno = XFS_FSB_TO_AGNO(mp, startblock);
+ ASSERT(agno != NULLAGNUMBER);
+ bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+ trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock),
+ type, XFS_FSB_TO_AGBNO(mp, startblock),
+ blockcount);
+
+ if (XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_REFCOUNT_FINISH_ONE,
+ XFS_RANDOM_REFCOUNT_FINISH_ONE))
+ return -EIO;
+
+ /*
+ * If we haven't gotten a cursor or the cursor AG doesn't match
+ * the startblock, get one now.
+ */
+ rcur = *pcur;
+ if (rcur != NULL && rcur->bc_private.a.agno != agno) {
+ nr_ops = rcur->bc_private.a.priv.refc.nr_ops;
+ shape_changes = rcur->bc_private.a.priv.refc.shape_changes;
+ xfs_refcount_finish_one_cleanup(tp, rcur, 0);
+ rcur = NULL;
+ *pcur = NULL;
+ }
+ if (rcur == NULL) {
+ error = xfs_alloc_read_agf(tp->t_mountp, tp, agno,
+ XFS_ALLOC_FLAG_FREEING, &agbp);
+ if (error)
+ return error;
+ if (!agbp)
+ return -EFSCORRUPTED;
+
+ rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, dfops);
+ if (!rcur) {
+ error = -ENOMEM;
+ goto out_cur;
+ }
+ rcur->bc_private.a.priv.refc.nr_ops = nr_ops;
+ rcur->bc_private.a.priv.refc.shape_changes = shape_changes;
+ }
+ *pcur = rcur;
+
+ switch (type) {
+ case XFS_REFCOUNT_INCREASE:
+ error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+ new_len, XFS_REFCOUNT_ADJUST_INCREASE, dfops, NULL);
+ *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+ break;
+ case XFS_REFCOUNT_DECREASE:
+ error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+ new_len, XFS_REFCOUNT_ADJUST_DECREASE, dfops, NULL);
+ *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+ break;
+ case XFS_REFCOUNT_ALLOC_COW:
+ *new_fsb = startblock + blockcount;
+ *new_len = 0;
+ error = __xfs_refcount_cow_alloc(rcur, bno, blockcount, dfops);
+ break;
+ case XFS_REFCOUNT_FREE_COW:
+ *new_fsb = startblock + blockcount;
+ *new_len = 0;
+ error = __xfs_refcount_cow_free(rcur, bno, blockcount, dfops);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+ if (!error && *new_len > 0)
+ trace_xfs_refcount_finish_one_leftover(mp, agno, type,
+ bno, blockcount, new_agbno, *new_len);
+ return error;
+
+out_cur:
+ xfs_trans_brelse(tp, agbp);
+
+ return error;
+}
+
+/*
+ * Record a refcount intent for later processing.
+ */
+static int
+__xfs_refcount_add(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount)
+{
+ struct xfs_refcount_intent *ri;
+
+ trace_xfs_refcount_defer(mp, XFS_FSB_TO_AGNO(mp, startblock),
+ type, XFS_FSB_TO_AGBNO(mp, startblock),
+ blockcount);
+
+ ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
+ KM_SLEEP | KM_NOFS);
+ INIT_LIST_HEAD(&ri->ri_list);
+ ri->ri_type = type;
+ ri->ri_startblock = startblock;
+ ri->ri_blockcount = blockcount;
+
+ xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
+ return 0;
+}
+
+/*
+ * Increase the reference count of the blocks backing a file's extent.
+ */
+int
+xfs_refcount_increase_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_INCREASE,
+ PREV->br_startblock, PREV->br_blockcount);
+}
+
+/*
+ * Decrease the reference count of the blocks backing a file's extent.
+ */
+int
+xfs_refcount_decrease_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_DECREASE,
+ PREV->br_startblock, PREV->br_blockcount);
+}
+
+/*
+ * Given an AG extent, find the lowest-numbered run of shared blocks
+ * within that range and return the range in fbno/flen. If
+ * find_end_of_shared is set, return the longest contiguous extent of
+ * shared blocks; if not, just return the first extent we find. If no
+ * shared blocks are found, fbno and flen will be set to NULLAGBLOCK
+ * and 0, respectively.
+ */
+int
+xfs_refcount_find_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ xfs_agblock_t *fbno,
+ xfs_extlen_t *flen,
+ bool find_end_of_shared)
+{
+ struct xfs_refcount_irec tmp;
+ int i;
+ int have;
+ int error;
+
+ trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno,
+ agbno, aglen);
+
+ /* By default, skip the whole range */
+ *fbno = NULLAGBLOCK;
+ *flen = 0;
+
+ /* Try to find a refcount extent that crosses the start */
+ error = xfs_refcount_lookup_le(cur, agbno, &have);
+ if (error)
+ goto out_error;
+ if (!have) {
+ /* No left extent, look at the next one */
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ goto done;
+ }
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+
+ /* If the extent ends before the start, look at the next one */
+ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) {
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ goto done;
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+ }
+
+ /* If the extent starts after the range we want, bail out */
+ if (tmp.rc_startblock >= agbno + aglen)
+ goto done;
+
+ /* We found the start of a shared extent! */
+ if (tmp.rc_startblock < agbno) {
+ tmp.rc_blockcount -= (agbno - tmp.rc_startblock);
+ tmp.rc_startblock = agbno;
+ }
+
+ *fbno = tmp.rc_startblock;
+ *flen = min(tmp.rc_blockcount, agbno + aglen - *fbno);
+ if (!find_end_of_shared)
+ goto done;
+
+ /* Otherwise, find the end of this shared extent */
+ while (*fbno + *flen < agbno + aglen) {
+ error = xfs_btree_increment(cur, 0, &have);
+ if (error)
+ goto out_error;
+ if (!have)
+ break;
+ error = xfs_refcount_get_rec(cur, &tmp, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+ if (tmp.rc_startblock >= agbno + aglen ||
+ tmp.rc_startblock != *fbno + *flen)
+ break;
+ *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno);
+ }
+
+done:
+ trace_xfs_refcount_find_shared_result(cur->bc_mp,
+ cur->bc_private.a.agno, *fbno, *flen);
+
+out_error:
+ if (error)
+ trace_xfs_refcount_find_shared_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Recovering CoW Blocks After a Crash
+ *
+ * Due to the way that the copy on write mechanism works, there's a window of
+ * opportunity in which we can lose track of allocated blocks during a crash.
+ * Because CoW uses delayed allocation in the in-core CoW fork, writeback
+ * causes blocks to be allocated and stored in the CoW fork. The blocks are
+ * no longer in the free space btree but are not otherwise recorded anywhere
+ * until the write completes and the blocks are mapped into the file. A crash
+ * in between allocation and remapping results in the replacement blocks being
+ * lost. This situation is exacerbated by the CoW extent size hint because
+ * allocations can hang around for long time.
+ *
+ * However, there is a place where we can record these allocations before they
+ * become mappings -- the reference count btree. The btree does not record
+ * extents with refcount == 1, so we can record allocations with a refcount of
+ * 1. Blocks being used for CoW writeout cannot be shared, so there should be
+ * no conflict with shared block records. These mappings should be created
+ * when we allocate blocks to the CoW fork and deleted when they're removed
+ * from the CoW fork.
+ *
+ * Minor nit: records for in-progress CoW allocations and records for shared
+ * extents must never be merged, to preserve the property that (except for CoW
+ * allocations) there are no refcount btree entries with refcount == 1. The
+ * only time this could potentially happen is when unsharing a block that's
+ * adjacent to CoW allocations, so we must be careful to avoid this.
+ *
+ * At mount time we recover lost CoW allocations by searching the refcount
+ * btree for these refcount == 1 mappings. These represent CoW allocations
+ * that were in progress at the time the filesystem went down, so we can free
+ * them to get the space back.
+ *
+ * This mechanism is superior to creating EFIs for unmapped CoW extents for
+ * several reasons -- first, EFIs pin the tail of the log and would have to be
+ * periodically relogged to avoid filling up the log. Second, CoW completions
+ * will have to file an EFD and create new EFIs for whatever remains in the
+ * CoW fork; this partially takes care of (1) but extent-size reservations
+ * will have to periodically relog even if there's no writeout in progress.
+ * This can happen if the CoW extent size hint is set, which you really want.
+ * Third, EFIs cannot currently be automatically relogged into newer
+ * transactions to advance the log tail. Fourth, stuffing the log full of
+ * EFIs places an upper bound on the number of CoW allocations that can be
+ * held filesystem-wide at any given time. Recording them in the refcount
+ * btree doesn't require us to maintain any state in memory and doesn't pin
+ * the log.
+ */
+/*
+ * Adjust the refcounts of CoW allocations. These allocations are "magic"
+ * in that they're not referenced anywhere else in the filesystem, so we
+ * stash them in the refcount btree with a refcount of 1 until either file
+ * remapping (or CoW cancellation) happens.
+ */
+STATIC int
+xfs_refcount_adjust_cow_extents(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ enum xfs_refc_adjust_op adj,
+ struct xfs_defer_ops *dfops,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_refcount_irec ext, tmp;
+ int error;
+ int found_rec, found_tmp;
+
+ if (aglen == 0)
+ return 0;
+
+ /* Find any overlapping refcount records */
+ error = xfs_refcount_lookup_ge(cur, agbno, &found_rec);
+ if (error)
+ goto out_error;
+ error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+ if (error)
+ goto out_error;
+ if (!found_rec) {
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks +
+ XFS_REFC_COW_START;
+ ext.rc_blockcount = 0;
+ ext.rc_refcount = 0;
+ }
+
+ switch (adj) {
+ case XFS_REFCOUNT_ADJUST_COW_ALLOC:
+ /* Adding a CoW reservation, there should be nothing here. */
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ ext.rc_startblock >= agbno + aglen, out_error);
+
+ tmp.rc_startblock = agbno;
+ tmp.rc_blockcount = aglen;
+ tmp.rc_refcount = 1;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_private.a.agno, &tmp);
+
+ error = xfs_refcount_insert(cur, &tmp,
+ &found_tmp);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ found_tmp == 1, out_error);
+ break;
+ case XFS_REFCOUNT_ADJUST_COW_FREE:
+ /* Removing a CoW reservation, there should be one extent. */
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ ext.rc_startblock == agbno, out_error);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ ext.rc_blockcount == aglen, out_error);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ ext.rc_refcount == 1, out_error);
+
+ ext.rc_refcount = 0;
+ trace_xfs_refcount_modify_extent(cur->bc_mp,
+ cur->bc_private.a.agno, &ext);
+ error = xfs_refcount_delete(cur, &found_rec);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+ found_rec == 1, out_error);
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ return error;
+out_error:
+ trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Add or remove refcount btree entries for CoW reservations.
+ */
+STATIC int
+xfs_refcount_adjust_cow(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ enum xfs_refc_adjust_op adj,
+ struct xfs_defer_ops *dfops)
+{
+ bool shape_changed;
+ int error;
+
+ agbno += XFS_REFC_COW_START;
+
+ /*
+ * Ensure that no rcextents cross the boundary of the adjustment range.
+ */
+ error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+ if (error)
+ goto out_error;
+
+ error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+ if (error)
+ goto out_error;
+
+ /*
+ * Try to merge with the left or right extents of the range.
+ */
+ error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj,
+ XFS_FIND_RCEXT_COW, &shape_changed);
+ if (error)
+ goto out_error;
+
+ /* Now that we've taken care of the ends, adjust the middle extents */
+ error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj,
+ dfops, NULL);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Record a CoW allocation in the refcount btree.
+ */
+STATIC int
+__xfs_refcount_cow_alloc(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ struct xfs_defer_ops *dfops)
+{
+ int error;
+
+ trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno,
+ agbno, aglen);
+
+ /* Add refcount btree reservation */
+ error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+ XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops);
+ if (error)
+ return error;
+
+ /* Add rmap entry */
+ if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) {
+ error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops,
+ rcur->bc_private.a.agno,
+ agbno, aglen, XFS_RMAP_OWN_COW);
+ if (error)
+ return error;
+ }
+
+ return error;
+}
+
+/*
+ * Remove a CoW allocation from the refcount btree.
+ */
+STATIC int
+__xfs_refcount_cow_free(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ struct xfs_defer_ops *dfops)
+{
+ int error;
+
+ trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno,
+ agbno, aglen);
+
+ /* Remove refcount btree reservation */
+ error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+ XFS_REFCOUNT_ADJUST_COW_FREE, dfops);
+ if (error)
+ return error;
+
+ /* Remove rmap entry */
+ if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) {
+ error = xfs_rmap_free_extent(rcur->bc_mp, dfops,
+ rcur->bc_private.a.agno,
+ agbno, aglen, XFS_RMAP_OWN_COW);
+ if (error)
+ return error;
+ }
+
+ return error;
+}
+
+/* Record a CoW staging extent in the refcount btree. */
+int
+xfs_refcount_alloc_cow_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_fsblock_t fsb,
+ xfs_extlen_t len)
+{
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW,
+ fsb, len);
+}
+
+/* Forget a CoW staging event in the refcount btree. */
+int
+xfs_refcount_free_cow_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_fsblock_t fsb,
+ xfs_extlen_t len)
+{
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW,
+ fsb, len);
+}
+
+struct xfs_refcount_recovery {
+ struct list_head rr_list;
+ struct xfs_refcount_irec rr_rrec;
+};
+
+/* Stuff an extent on the recovery list. */
+STATIC int
+xfs_refcount_recover_extent(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct list_head *debris = priv;
+ struct xfs_refcount_recovery *rr;
+
+ if (be32_to_cpu(rec->refc.rc_refcount) != 1)
+ return -EFSCORRUPTED;
+
+ rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP);
+ xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
+ list_add_tail(&rr->rr_list, debris);
+
+ return 0;
+}
+
+/* Find and remove leftover CoW reservations. */
+int
+xfs_refcount_recover_cow_leftovers(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_trans *tp;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ struct xfs_refcount_recovery *rr, *n;
+ struct list_head debris;
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t fsb;
+ xfs_agblock_t agbno;
+ int error;
+
+ if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START)
+ return -EOPNOTSUPP;
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error)
+ return error;
+ cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
+
+ /* Find all the leftover CoW staging extents. */
+ INIT_LIST_HEAD(&debris);
+ memset(&low, 0, sizeof(low));
+ memset(&high, 0, sizeof(high));
+ low.rc.rc_startblock = XFS_REFC_COW_START;
+ high.rc.rc_startblock = -1U;
+ error = xfs_btree_query_range(cur, &low, &high,
+ xfs_refcount_recover_extent, &debris);
+ if (error)
+ goto out_cursor;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_buf_relse(agbp);
+
+ /* Now iterate the list to free the leftovers */
+ list_for_each_entry(rr, &debris, rr_list) {
+ /* Set up transaction. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ goto out_free;
+
+ trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec);
+
+ /* Free the orphan record */
+ xfs_defer_init(&dfops, &fsb);
+ agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
+ fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
+ error = xfs_refcount_free_cow_extent(mp, &dfops, fsb,
+ rr->rr_rrec.rc_blockcount);
+ if (error)
+ goto out_defer;
+
+ /* Free the block. */
+ xfs_bmap_add_free(mp, &dfops, fsb,
+ rr->rr_rrec.rc_blockcount, NULL);
+
+ error = xfs_defer_finish(&tp, &dfops, NULL);
+ if (error)
+ goto out_defer;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_free;
+ }
+
+out_free:
+ /* Free the leftover list */
+ list_for_each_entry_safe(rr, n, &debris, rr_list) {
+ list_del(&rr->rr_list);
+ kmem_free(rr);
+ }
+ return error;
+
+out_cursor:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_buf_relse(agbp);
+ goto out_free;
+
+out_defer:
+ xfs_defer_cancel(&dfops);
+ xfs_trans_cancel(tp);
+ goto out_free;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
new file mode 100644
index 000000000000..098dc668ab2c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_REFCOUNT_H__
+#define __XFS_REFCOUNT_H__
+
+extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
+ xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
+ xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec, int *stat);
+
+enum xfs_refcount_intent_type {
+ XFS_REFCOUNT_INCREASE = 1,
+ XFS_REFCOUNT_DECREASE,
+ XFS_REFCOUNT_ALLOC_COW,
+ XFS_REFCOUNT_FREE_COW,
+};
+
+struct xfs_refcount_intent {
+ struct list_head ri_list;
+ enum xfs_refcount_intent_type ri_type;
+ xfs_fsblock_t ri_startblock;
+ xfs_extlen_t ri_blockcount;
+};
+
+extern int xfs_refcount_increase_extent(struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec);
+extern int xfs_refcount_decrease_extent(struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec);
+
+extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur, int error);
+extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+ struct xfs_defer_ops *dfops, enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock, xfs_extlen_t blockcount,
+ xfs_fsblock_t *new_fsb, xfs_extlen_t *new_len,
+ struct xfs_btree_cur **pcur);
+
+extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
+ xfs_extlen_t *flen, bool find_end_of_shared);
+
+extern int xfs_refcount_alloc_cow_extent(struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
+extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
+extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
+ xfs_agnumber_t agno);
+
+#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
new file mode 100644
index 000000000000..453bb2757ec2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_rmap.h"
+
+static struct xfs_btree_cur *
+xfs_refcountbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_private.a.dfops);
+}
+
+STATIC void
+xfs_refcountbt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_refcount_root = ptr->s;
+ be32_add_cpu(&agf->agf_refcount_level, inc);
+ pag->pagf_refcount_level += inc;
+ xfs_perag_put(pag);
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp,
+ XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL);
+}
+
+STATIC int
+xfs_refcountbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_alloc_arg args; /* block allocation args */
+ int error; /* error return value */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+ xfs_refc_block(args.mp));
+ args.firstblock = args.fsbno;
+ xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC);
+ args.minlen = args.maxlen = args.prod = 1;
+ args.resv = XFS_AG_RESV_METADATA;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto out_error;
+ trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+ args.agbno, 1);
+ if (args.fsbno == NULLFSBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.agno == cur->bc_private.a.agno);
+ ASSERT(args.len == 1);
+
+ new->s = cpu_to_be32(args.agbno);
+ be32_add_cpu(&agf->agf_refcount_blocks, 1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out_error:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+STATIC int
+xfs_refcountbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+ struct xfs_owner_info oinfo;
+ int error;
+
+ trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+ XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
+ be32_add_cpu(&agf->agf_refcount_blocks, -1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+ error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo,
+ XFS_AG_RESV_METADATA);
+ if (error)
+ return error;
+
+ return error;
+}
+
+STATIC int
+xfs_refcountbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_refc_mnr[level != 0];
+}
+
+STATIC int
+xfs_refcountbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_refc_mxr[level != 0];
+}
+
+STATIC void
+xfs_refcountbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->refc.rc_startblock = rec->refc.rc_startblock;
+}
+
+STATIC void
+xfs_refcountbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->refc.rc_startblock);
+ x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
+ key->refc.rc_startblock = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_refcountbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock);
+ rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
+ rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
+}
+
+STATIC void
+xfs_refcountbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(agf->agf_refcount_root != 0);
+
+ ptr->s = agf->agf_refcount_root;
+}
+
+STATIC __int64_t
+xfs_refcountbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ struct xfs_refcount_irec *rec = &cur->bc_rec.rc;
+ struct xfs_refcount_key *kp = &key->refc;
+
+ return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock;
+}
+
+STATIC __int64_t
+xfs_refcountbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC bool
+xfs_refcountbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ unsigned int level;
+
+ if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+ return false;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return false;
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
+ return false;
+
+ level = be16_to_cpu(block->bb_level);
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_refcount_level)
+ return false;
+ } else if (level >= mp->m_refc_maxlevels)
+ return false;
+
+ return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]);
+}
+
+STATIC void
+xfs_refcountbt_read_verify(
+ struct xfs_buf *bp)
+{
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, -EFSBADCRC);
+ else if (!xfs_refcountbt_verify(bp))
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
+ }
+}
+
+STATIC void
+xfs_refcountbt_write_verify(
+ struct xfs_buf *bp)
+{
+ if (!xfs_refcountbt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
+ .name = "xfs_refcountbt",
+ .verify_read = xfs_refcountbt_read_verify,
+ .verify_write = xfs_refcountbt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_refcountbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->refc.rc_startblock) <
+ be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC int
+xfs_refcountbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->refc.rc_startblock) +
+ be32_to_cpu(r1->refc.rc_blockcount) <=
+ be32_to_cpu(r2->refc.rc_startblock);
+}
+#endif
+
+static const struct xfs_btree_ops xfs_refcountbt_ops = {
+ .rec_len = sizeof(struct xfs_refcount_rec),
+ .key_len = sizeof(struct xfs_refcount_key),
+
+ .dup_cursor = xfs_refcountbt_dup_cursor,
+ .set_root = xfs_refcountbt_set_root,
+ .alloc_block = xfs_refcountbt_alloc_block,
+ .free_block = xfs_refcountbt_free_block,
+ .get_minrecs = xfs_refcountbt_get_minrecs,
+ .get_maxrecs = xfs_refcountbt_get_maxrecs,
+ .init_key_from_rec = xfs_refcountbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur,
+ .key_diff = xfs_refcountbt_key_diff,
+ .buf_ops = &xfs_refcountbt_buf_ops,
+ .diff_two_keys = xfs_refcountbt_diff_two_keys,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_refcountbt_keys_inorder,
+ .recs_inorder = xfs_refcountbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new refcount btree cursor.
+ */
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ struct xfs_defer_ops *dfops)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_btree_cur *cur;
+
+ ASSERT(agno != NULLAGNUMBER);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = XFS_BTNUM_REFC;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_ops = &xfs_refcountbt_ops;
+
+ cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
+ cur->bc_private.a.dfops = dfops;
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ cur->bc_private.a.priv.refc.nr_ops = 0;
+ cur->bc_private.a.priv.refc.shape_changes = 0;
+
+ return cur;
+}
+
+/*
+ * Calculate the number of records in a refcount btree block.
+ */
+int
+xfs_refcountbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ bool leaf)
+{
+ blocklen -= XFS_REFCOUNT_BLOCK_LEN;
+
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_refcount_ptr_t));
+}
+
+/* Compute the maximum height of a refcount btree. */
+void
+xfs_refcountbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp,
+ mp->m_refc_mnr, mp->m_sb.sb_agblocks);
+}
+
+/* Calculate the refcount btree size for some records. */
+xfs_extlen_t
+xfs_refcountbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp, mp->m_refc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+xfs_extlen_t
+xfs_refcountbt_max_size(
+ struct xfs_mount *mp)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_refc_mxr[0] == 0)
+ return 0;
+
+ return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_refcountbt_calc_reserves(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agf *agf;
+ xfs_extlen_t tree_len;
+ int error;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ *ask += xfs_refcountbt_max_size(mp);
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error)
+ return error;
+
+ agf = XFS_BUF_TO_AGF(agbp);
+ tree_len = be32_to_cpu(agf->agf_refcount_blocks);
+ xfs_buf_relse(agbp);
+
+ *used += tree_len;
+
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
new file mode 100644
index 000000000000..3be7768bd51a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_REFCOUNT_BTREE_H__
+#define __XFS_REFCOUNT_BTREE_H__
+
+/*
+ * Reference Count Btree on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/*
+ * Btree block header size
+ */
+#define XFS_REFCOUNT_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_REFCOUNT_REC_ADDR(block, index) \
+ ((struct xfs_refcount_rec *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct xfs_refcount_rec))))
+
+#define XFS_REFCOUNT_KEY_ADDR(block, index) \
+ ((struct xfs_refcount_key *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ ((index) - 1) * sizeof(struct xfs_refcount_key)))
+
+#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_refcount_ptr_t *) \
+ ((char *)(block) + \
+ XFS_REFCOUNT_BLOCK_LEN + \
+ (maxrecs) * sizeof(struct xfs_refcount_key) + \
+ ((index) - 1) * sizeof(xfs_refcount_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno,
+ struct xfs_defer_ops *dfops);
+extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen,
+ bool leaf);
+extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
+
+extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp);
+
+extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
+ xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+
+#endif /* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 73d05407d663..3a8cc7139912 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -148,6 +148,37 @@ done:
return error;
}
+STATIC int
+xfs_rmap_delete(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int i;
+ int error;
+
+ trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+ len, owner, offset, flags);
+
+ error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+
+ error = xfs_btree_delete(rcur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+done:
+ if (error)
+ trace_xfs_rmap_delete_error(rcur->bc_mp,
+ rcur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
static int
xfs_rmap_btrec_to_irec(
union xfs_btree_rec *rec,
@@ -180,6 +211,160 @@ xfs_rmap_get_rec(
return xfs_rmap_btrec_to_irec(rec, irec);
}
+struct xfs_find_left_neighbor_info {
+ struct xfs_rmap_irec high;
+ struct xfs_rmap_irec *irec;
+ int *stat;
+};
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_find_left_neighbor_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_find_left_neighbor_info *info = priv;
+
+ trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
+ cur->bc_private.a.agno, rec->rm_startblock,
+ rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+ rec->rm_flags);
+
+ if (rec->rm_owner != info->high.rm_owner)
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+
+ *info->irec = *rec;
+ *info->stat = 1;
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and adjacent physical and logical
+ * block ranges.
+ */
+int
+xfs_rmap_find_left_neighbor(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ struct xfs_find_left_neighbor_info info;
+ int error;
+
+ *stat = 0;
+ if (bno == 0)
+ return 0;
+ info.high.rm_startblock = bno - 1;
+ info.high.rm_owner = owner;
+ if (!XFS_RMAP_NON_INODE_OWNER(owner) &&
+ !(flags & XFS_RMAP_BMBT_BLOCK)) {
+ if (offset == 0)
+ return 0;
+ info.high.rm_offset = offset - 1;
+ } else
+ info.high.rm_offset = 0;
+ info.high.rm_flags = flags;
+ info.high.rm_blockcount = 0;
+ info.irec = irec;
+ info.stat = stat;
+
+ trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
+ cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_find_left_neighbor_helper, &info);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ error = 0;
+ if (*stat)
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner,
+ irec->rm_offset, irec->rm_flags);
+ return error;
+}
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_lookup_le_range_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_find_left_neighbor_info *info = priv;
+
+ trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
+ cur->bc_private.a.agno, rec->rm_startblock,
+ rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+ rec->rm_flags);
+
+ if (rec->rm_owner != info->high.rm_owner)
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ (rec->rm_offset > info->high.rm_offset ||
+ rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+
+ *info->irec = *rec;
+ *info->stat = 1;
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and overlapping physical and logical
+ * block ranges. This is the overlapping-interval version of
+ * xfs_rmap_lookup_le.
+ */
+int
+xfs_rmap_lookup_le_range(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ struct xfs_find_left_neighbor_info info;
+ int error;
+
+ info.high.rm_startblock = bno;
+ info.high.rm_owner = owner;
+ if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK))
+ info.high.rm_offset = offset;
+ else
+ info.high.rm_offset = 0;
+ info.high.rm_flags = flags;
+ info.high.rm_blockcount = 0;
+ *stat = 0;
+ info.irec = irec;
+ info.stat = stat;
+
+ trace_xfs_rmap_lookup_le_range(cur->bc_mp,
+ cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_lookup_le_range_helper, &info);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ error = 0;
+ if (*stat)
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_private.a.agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner,
+ irec->rm_offset, irec->rm_flags);
+ return error;
+}
+
/*
* Find the extent in the rmap btree and remove it.
*
@@ -1093,11 +1278,704 @@ done:
return error;
}
+/*
+ * Convert an unwritten extent to a real extent or vice versa. If there is no
+ * possibility of overlapping extents, delegate to the simpler convert
+ * function.
+ */
+STATIC int
+xfs_rmap_convert_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, prev is 2 */
+ /* new is 3 */
+ uint64_t owner;
+ uint64_t offset;
+ uint64_t new_endoff;
+ unsigned int oldext;
+ unsigned int newext;
+ unsigned int flags = 0;
+ int i;
+ int state = 0;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+ oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+ new_endoff = offset + len;
+ trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * For the initial lookup, look for and exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+ &PREV, &i);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+
+ ASSERT(PREV.rm_offset <= offset);
+ ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+ ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+ newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.rm_offset == offset)
+ state |= RMAP_LEFT_FILLING;
+ if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+ state |= RMAP_RIGHT_FILLING;
+
+ /* Is there a left record that abuts our range? */
+ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext,
+ &LEFT, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_LEFT_VALID;
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
+ done);
+ if (xfs_rmap_is_mergeable(&LEFT, owner, newext))
+ state |= RMAP_LEFT_CONTIG;
+ }
+
+ /* Is there a right record that abuts our range? */
+ error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+ newext, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_RIGHT_VALID;
+ error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
+ done);
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+ state |= RMAP_RIGHT_CONTIG;
+ }
+
+ /* check that left + prev + right is not too long */
+ if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+ (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+ (unsigned long)LEFT.rm_blockcount + len +
+ RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+ state &= ~RMAP_RIGHT_CONTIG;
+
+ trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+ _RET_IP_);
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (error)
+ goto done;
+ error = xfs_rmap_delete(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ error = xfs_rmap_delete(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount += PREV.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (error)
+ goto done;
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount += RIGHT.rm_blockcount;
+ NEW.rm_flags = RIGHT.rm_flags;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount = offset - NEW.rm_offset;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ NEW = RIGHT;
+ error = xfs_rmap_delete(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ NEW.rm_offset = offset;
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+ if (error)
+ goto done;
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ /* new right extent - oldext */
+ NEW.rm_startblock = bno + len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = new_endoff;
+ NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+ new_endoff;
+ NEW.rm_flags = PREV.rm_flags;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ if (error)
+ goto done;
+ /* new left extent - oldext */
+ NEW = PREV;
+ error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner,
+ NEW.rm_offset, NEW.rm_flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW.rm_blockcount = offset - NEW.rm_offset;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /* new middle extent - newext */
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount = len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_CONTIG:
+ case RMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+done:
+ if (error)
+ trace_xfs_rmap_convert_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
#undef NEW
#undef LEFT
#undef RIGHT
#undef PREV
+/*
+ * Find an extent in the rmap btree and unmap it. For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so we can call the regular _free function.
+ */
+STATIC int
+xfs_rmap_unmap_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * We should always have a left record because there's a static record
+ * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+ * will not ever be removed from the tree.
+ */
+ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+ &ltrec, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ ltoff = ltrec.rm_offset;
+
+ /* Make sure the extent we found covers the entire freeing range. */
+ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
+ ltrec.rm_startblock + ltrec.rm_blockcount >=
+ bno + len, out_error);
+
+ /* Make sure the owner matches what we expect to find in the tree. */
+ XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error);
+
+ /* Make sure the unwritten flag matches. */
+ XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
+ (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
+
+ /* Check the offset. */
+ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error);
+ XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount,
+ out_error);
+
+ if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+ /* Exact match, simply remove the record from rmap tree. */
+ error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock == bno) {
+ /*
+ * Overlap left hand side of extent: move the start, trim the
+ * length and update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+
+ /* Delete prev rmap. */
+ error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+
+ /* Add an rmap at the new offset. */
+ ltrec.rm_startblock += len;
+ ltrec.rm_blockcount -= len;
+ ltrec.rm_offset += len;
+ error = xfs_rmap_insert(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+ /*
+ * Overlap right hand side of extent: trim the length and
+ * update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ ltrec.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * Overlap middle of extent: trim the length of the existing
+ * record to the length of the new left-extent size, increment
+ * the insertion position so we can insert a new record
+ * containing the remaining right-extent space.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrr| |rrrr|
+ * bno len
+ */
+ xfs_extlen_t orig_len = ltrec.rm_blockcount;
+
+ /* Shrink the left side of the rmap */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+
+ /* Add an rmap at the new offset */
+ error = xfs_rmap_insert(cur, bno + len,
+ orig_len - len - ltrec.rm_blockcount,
+ ltrec.rm_owner, offset + len,
+ ltrec.rm_flags);
+ if (error)
+ goto out_error;
+ }
+
+ trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_unmap_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Find an extent in the rmap btree and map it. For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so we can call the regular _alloc function.
+ */
+STATIC int
+xfs_rmap_map_shared(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /* Is there a left record that abuts our range? */
+ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags,
+ &ltrec, &have_lt);
+ if (error)
+ goto out_error;
+ if (have_lt &&
+ !xfs_rmap_is_mergeable(&ltrec, owner, flags))
+ have_lt = 0;
+
+ /* Is there a right record that abuts our range? */
+ error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+ flags, &have_gt);
+ if (error)
+ goto out_error;
+ if (have_gt) {
+ error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+
+ if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+ have_gt = 0;
+ }
+
+ if (have_lt &&
+ ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+ ltrec.rm_offset + ltrec.rm_blockcount == offset) {
+ /*
+ * Left edge contiguous, merge into left record.
+ *
+ * ltbno ltlen
+ * orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount += len;
+ if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ offset + len == gtrec.rm_offset) {
+ /*
+ * Right edge also contiguous, delete right record
+ * and merge into left record.
+ *
+ * ltbno ltlen gtbno gtlen
+ * orig: |ooooooooo| |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+ */
+ ltrec.rm_blockcount += gtrec.rm_blockcount;
+ error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+ }
+
+ /* Point the cursor back to the left record and update. */
+ error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ offset + len == gtrec.rm_offset) {
+ /*
+ * Right edge contiguous, merge into right record.
+ *
+ * gtbno gtlen
+ * Orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * Result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ /* Delete the old record. */
+ error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+
+ /* Move the start and re-add it. */
+ gtrec.rm_startblock = bno;
+ gtrec.rm_blockcount += len;
+ gtrec.rm_offset = offset;
+ error = xfs_rmap_insert(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * No contiguous edge with identical owner, insert
+ * new record at current cursor position.
+ */
+ error = xfs_rmap_insert(cur, bno, len, owner, offset, flags);
+ if (error)
+ goto out_error;
+ }
+
+ trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_map_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
struct xfs_rmap_query_range_info {
xfs_rmap_query_range_fn fn;
void *priv;
@@ -1237,15 +2115,27 @@ xfs_rmap_finish_one(
case XFS_RMAP_MAP:
error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);
break;
+ case XFS_RMAP_MAP_SHARED:
+ error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
case XFS_RMAP_FREE:
case XFS_RMAP_UNMAP:
error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten,
&oinfo);
break;
+ case XFS_RMAP_UNMAP_SHARED:
+ error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
case XFS_RMAP_CONVERT:
error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten,
&oinfo);
break;
+ case XFS_RMAP_CONVERT_SHARED:
+ error = xfs_rmap_convert_shared(rcur, bno, blockcount,
+ !unwritten, &oinfo);
+ break;
default:
ASSERT(0);
error = -EFSCORRUPTED;
@@ -1263,9 +2153,10 @@ out_cur:
*/
static bool
xfs_rmap_update_is_needed(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ int whichfork)
{
- return xfs_sb_version_hasrmapbt(&mp->m_sb);
+ return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK;
}
/*
@@ -1311,10 +2202,11 @@ xfs_rmap_map_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_rmap_update_is_needed(mp))
+ if (!xfs_rmap_update_is_needed(mp, whichfork))
return 0;
- return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino,
+ return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+ XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
whichfork, PREV);
}
@@ -1327,10 +2219,11 @@ xfs_rmap_unmap_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_rmap_update_is_needed(mp))
+ if (!xfs_rmap_update_is_needed(mp, whichfork))
return 0;
- return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino,
+ return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+ XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
whichfork, PREV);
}
@@ -1343,10 +2236,11 @@ xfs_rmap_convert_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_rmap_update_is_needed(mp))
+ if (!xfs_rmap_update_is_needed(mp, whichfork))
return 0;
- return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino,
+ return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+ XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
whichfork, PREV);
}
@@ -1362,7 +2256,7 @@ xfs_rmap_alloc_extent(
{
struct xfs_bmbt_irec bmap;
- if (!xfs_rmap_update_is_needed(mp))
+ if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))
return 0;
bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
@@ -1386,7 +2280,7 @@ xfs_rmap_free_extent(
{
struct xfs_bmbt_irec bmap;
- if (!xfs_rmap_update_is_needed(mp))
+ if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))
return 0;
bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 71cf99a4acba..789930599339 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -206,4 +206,11 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
xfs_fsblock_t startblock, xfs_filblks_t blockcount,
xfs_exntst_t state, struct xfs_btree_cur **pcur);
+int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
+int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
+
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 17b8eeb34ac8..83e672ff7577 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -35,6 +35,7 @@
#include "xfs_cksum.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
+#include "xfs_ag_resv.h"
/*
* Reverse map btree.
@@ -512,6 +513,83 @@ void
xfs_rmapbt_compute_maxlevels(
struct xfs_mount *mp)
{
- mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
- mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+ /*
+ * On a non-reflink filesystem, the maximum number of rmap
+ * records is the number of blocks in the AG, hence the max
+ * rmapbt height is log_$maxrecs($agblocks). However, with
+ * reflink each AG block can have up to 2^32 (per the refcount
+ * record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records.
+ *
+ * That effectively means that the max rmapbt height must be
+ * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
+ * blocks to feed the rmapbt long before the rmapbt reaches
+ * maximum height. The reflink code uses ag_resv_critical to
+ * disallow reflinking when less than 10% of the per-AG metadata
+ * block reservation since the fallback is a regular file copy.
+ */
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
+ else
+ mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
+ mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+}
+
+/* Calculate the refcount btree size for some records. */
+xfs_extlen_t
+xfs_rmapbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+xfs_extlen_t
+xfs_rmapbt_max_size(
+ struct xfs_mount *mp)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_rmap_mxr[0] == 0)
+ return 0;
+
+ return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_rmapbt_calc_reserves(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agf *agf;
+ xfs_extlen_t pool_len;
+ xfs_extlen_t tree_len;
+ int error;
+
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return 0;
+
+ /* Reserve 1% of the AG or enough for 1 block per record. */
+ pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp));
+ *ask += pool_len;
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error)
+ return error;
+
+ agf = XFS_BUF_TO_AGF(agbp);
+ tree_len = be32_to_cpu(agf->agf_rmap_blocks);
+ xfs_buf_relse(agbp);
+
+ *used += tree_len;
+
+ return error;
}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index e73a55357dab..2a9ac472fb15 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -58,4 +58,11 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);
extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
+extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp);
+
+extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp,
+ xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+
#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4aecc5fefe96..a70aec910626 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -38,6 +38,8 @@
#include "xfs_ialloc_btree.h"
#include "xfs_log.h"
#include "xfs_rmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -737,6 +739,13 @@ xfs_sb_mount_common(
mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
+ mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize,
+ true);
+ mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize,
+ false);
+ mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
+ mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
+
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
sbp->sb_inopblock);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 0c5b30bd884c..c6f4eb46fe26 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
@@ -122,6 +123,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
#define XFS_INO_REF 2
#define XFS_ATTR_BTREE_REF 1
#define XFS_DQUOT_REF 1
+#define XFS_REFC_BTREE_REF 1
/*
* Flags for xfs_trans_ichgtime().
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 301ef2f4dbd6..b456cca1bfb2 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -67,13 +67,14 @@ xfs_calc_buf_res(
* Per-extent log reservation for the btree changes involved in freeing or
* allocating an extent. In classic XFS there were two trees that will be
* modified (bnobt + cntbt). With rmap enabled, there are three trees
- * (rmapbt). The number of blocks reserved is based on the formula:
+ * (rmapbt). With reflink, there are four trees (refcountbt). The number of
+ * blocks reserved is based on the formula:
*
* num trees * ((2 blocks/level * max depth) - 1)
*
* Keep in mind that max depth is calculated separately for each type of tree.
*/
-static uint
+uint
xfs_allocfree_log_count(
struct xfs_mount *mp,
uint num_ops)
@@ -83,6 +84,8 @@ xfs_allocfree_log_count(
blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
return blocks;
}
@@ -809,11 +812,18 @@ xfs_trans_resv_calc(
* require a permanent reservation on space.
*/
resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ else
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
- resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ resp->tr_itruncate.tr_logcount =
+ XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+ else
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
@@ -870,7 +880,10 @@ xfs_trans_resv_calc(
resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ else
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
/*
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0eb46ed6d404..b7e5357d060a 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -87,6 +87,7 @@ struct xfs_trans_resv {
#define XFS_DEFAULT_LOG_COUNT 1
#define XFS_DEFAULT_PERM_LOG_COUNT 2
#define XFS_ITRUNCATE_LOG_COUNT 2
+#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
#define XFS_INACTIVE_LOG_COUNT 2
#define XFS_CREATE_LOG_COUNT 2
#define XFS_CREATE_TMPFILE_LOG_COUNT 2
@@ -96,11 +97,13 @@ struct xfs_trans_resv {
#define XFS_LINK_LOG_COUNT 2
#define XFS_RENAME_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT 2
+#define XFS_WRITE_LOG_COUNT_REFLINK 8
#define XFS_ADDAFORK_LOG_COUNT 2
#define XFS_ATTRINVAL_LOG_COUNT 1
#define XFS_ATTRSET_LOG_COUNT 3
#define XFS_ATTRRM_LOG_COUNT 3
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 41e0428d8175..7917f6e44286 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -21,6 +21,8 @@
/*
* Components of space reservations.
*/
+#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
+ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \
(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1)
@@ -28,6 +30,13 @@
(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
XFS_EXTENTADD_SPACE_RES(mp,w))
+#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
+ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+ XFS_EXTENTADD_SPACE_RES(mp,w) + \
+ ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+ (mp)->m_rmap_maxlevels)
#define XFS_DAENTER_1B(mp,w) \
((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
#define XFS_DAENTER_DBS(mp,w) \
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 3d503647f26b..8d74870468c2 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -90,6 +90,7 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
*/
#define XFS_DATA_FORK 0
#define XFS_ATTR_FORK 1
+#define XFS_COW_FORK 2
/*
* Min numbers of data/attr fork btree root pointers.
@@ -109,7 +110,7 @@ typedef enum {
typedef enum {
XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
- XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX
+ XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
} xfs_btnum_t;
struct xfs_name {
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b6e527b8eccb..b468e041f207 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -236,7 +236,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
iattr.ia_mode = mode;
- iattr.ia_ctime = current_fs_time(inode->i_sb);
+ iattr.ia_ctime = current_time(inode);
error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
}
@@ -257,16 +257,11 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return error;
if (type == ACL_TYPE_ACCESS) {
- umode_t mode = inode->i_mode;
- error = posix_acl_equiv_mode(acl, &mode);
-
- if (error <= 0) {
- acl = NULL;
-
- if (error < 0)
- return error;
- }
+ umode_t mode;
+ error = posix_acl_update_mode(inode, &mode, &acl);
+ if (error)
+ return error;
error = xfs_set_mode(inode, mode);
if (error)
return error;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 7575cfc3ad15..3e57a56cf829 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,7 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
@@ -39,6 +40,7 @@
/* flags for direct write completions */
#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
#define XFS_DIO_FLAG_APPEND (1 << 1)
+#define XFS_DIO_FLAG_COW (1 << 2)
/*
* structure owned by writepages passed to individual writepage calls
@@ -200,7 +202,7 @@ xfs_setfilesize_trans_alloc(
* Update on-disk file size now that data has been written to disk.
*/
STATIC int
-xfs_setfilesize(
+__xfs_setfilesize(
struct xfs_inode *ip,
struct xfs_trans *tp,
xfs_off_t offset,
@@ -225,6 +227,23 @@ xfs_setfilesize(
return xfs_trans_commit(tp);
}
+int
+xfs_setfilesize(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ size_t size)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ return __xfs_setfilesize(ip, tp, offset, size);
+}
+
STATIC int
xfs_setfilesize_ioend(
struct xfs_ioend *ioend,
@@ -247,7 +266,7 @@ xfs_setfilesize_ioend(
return error;
}
- return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+ return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}
/*
@@ -270,6 +289,25 @@ xfs_end_io(
error = -EIO;
/*
+ * For a CoW extent, we need to move the mapping from the CoW fork
+ * to the data fork. If instead an error happened, just dump the
+ * new blocks.
+ */
+ if (ioend->io_type == XFS_IO_COW) {
+ if (error)
+ goto done;
+ if (ioend->io_bio->bi_error) {
+ error = xfs_reflink_cancel_cow_range(ip,
+ ioend->io_offset, ioend->io_size);
+ goto done;
+ }
+ error = xfs_reflink_end_cow(ip, ioend->io_offset,
+ ioend->io_size);
+ if (error)
+ goto done;
+ }
+
+ /*
* For unwritten extents we need to issue transactions to convert a
* range to normal written extens after the data I/O has finished.
* Detecting and handling completion IO errors is done individually
@@ -284,7 +322,8 @@ xfs_end_io(
} else if (ioend->io_append_trans) {
error = xfs_setfilesize_ioend(ioend, error);
} else {
- ASSERT(!xfs_ioend_is_append(ioend));
+ ASSERT(!xfs_ioend_is_append(ioend) ||
+ ioend->io_type == XFS_IO_COW);
}
done:
@@ -298,7 +337,7 @@ xfs_end_bio(
struct xfs_ioend *ioend = bio->bi_private;
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- if (ioend->io_type == XFS_IO_UNWRITTEN)
+ if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
else if (ioend->io_append_trans)
queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -324,6 +363,7 @@ xfs_map_blocks(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
+ ASSERT(type != XFS_IO_COW);
if (type == XFS_IO_UNWRITTEN)
bmapi_flags |= XFS_BMAPI_IGSTATE;
@@ -338,6 +378,13 @@ xfs_map_blocks(
offset_fsb = XFS_B_TO_FSBT(mp, offset);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
imap, &nimaps, bmapi_flags);
+ /*
+ * Truncate an overwrite extent if there's a pending CoW
+ * reservation before the end of this extent. This forces us
+ * to come back to writepage to take care of the CoW.
+ */
+ if (nimaps && type == XFS_IO_OVERWRITE)
+ xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (error)
@@ -345,7 +392,8 @@ xfs_map_blocks(
if (type == XFS_IO_DELALLOC &&
(!nimaps || isnullstartblock(imap->br_startblock))) {
- error = xfs_iomap_write_allocate(ip, offset, imap);
+ error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset,
+ imap);
if (!error)
trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
return error;
@@ -720,6 +768,56 @@ out_invalidate:
return;
}
+static int
+xfs_map_cow(
+ struct xfs_writepage_ctx *wpc,
+ struct inode *inode,
+ loff_t offset,
+ unsigned int *new_type)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_bmbt_irec imap;
+ bool is_cow = false, need_alloc = false;
+ int error;
+
+ /*
+ * If we already have a valid COW mapping keep using it.
+ */
+ if (wpc->io_type == XFS_IO_COW) {
+ wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
+ if (wpc->imap_valid) {
+ *new_type = XFS_IO_COW;
+ return 0;
+ }
+ }
+
+ /*
+ * Else we need to check if there is a COW mapping at this offset.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!is_cow)
+ return 0;
+
+ /*
+ * And if the COW mapping has a delayed extent here we need to
+ * allocate real space for it now.
+ */
+ if (need_alloc) {
+ error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
+ &imap);
+ if (error)
+ return error;
+ }
+
+ wpc->io_type = *new_type = XFS_IO_COW;
+ wpc->imap_valid = true;
+ wpc->imap = imap;
+ return 0;
+}
+
/*
* We implement an immediate ioend submission policy here to avoid needing to
* chain multiple ioends and hence nest mempool allocations which can violate
@@ -752,6 +850,7 @@ xfs_writepage_map(
int error = 0;
int count = 0;
int uptodate = 1;
+ unsigned int new_type;
bh = head = page_buffers(page);
offset = page_offset(page);
@@ -772,22 +871,13 @@ xfs_writepage_map(
continue;
}
- if (buffer_unwritten(bh)) {
- if (wpc->io_type != XFS_IO_UNWRITTEN) {
- wpc->io_type = XFS_IO_UNWRITTEN;
- wpc->imap_valid = false;
- }
- } else if (buffer_delay(bh)) {
- if (wpc->io_type != XFS_IO_DELALLOC) {
- wpc->io_type = XFS_IO_DELALLOC;
- wpc->imap_valid = false;
- }
- } else if (buffer_uptodate(bh)) {
- if (wpc->io_type != XFS_IO_OVERWRITE) {
- wpc->io_type = XFS_IO_OVERWRITE;
- wpc->imap_valid = false;
- }
- } else {
+ if (buffer_unwritten(bh))
+ new_type = XFS_IO_UNWRITTEN;
+ else if (buffer_delay(bh))
+ new_type = XFS_IO_DELALLOC;
+ else if (buffer_uptodate(bh))
+ new_type = XFS_IO_OVERWRITE;
+ else {
if (PageUptodate(page))
ASSERT(buffer_mapped(bh));
/*
@@ -800,6 +890,17 @@ xfs_writepage_map(
continue;
}
+ if (xfs_is_reflink_inode(XFS_I(inode))) {
+ error = xfs_map_cow(wpc, inode, offset, &new_type);
+ if (error)
+ goto out;
+ }
+
+ if (wpc->io_type != new_type) {
+ wpc->io_type = new_type;
+ wpc->imap_valid = false;
+ }
+
if (wpc->imap_valid)
wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
offset);
@@ -1090,18 +1191,24 @@ xfs_map_direct(
struct inode *inode,
struct buffer_head *bh_result,
struct xfs_bmbt_irec *imap,
- xfs_off_t offset)
+ xfs_off_t offset,
+ bool is_cow)
{
uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
xfs_off_t size = bh_result->b_size;
trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
- ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
+ ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
+ XFS_IO_OVERWRITE, imap);
if (ISUNWRITTEN(imap)) {
*flags |= XFS_DIO_FLAG_UNWRITTEN;
set_buffer_defer_completion(bh_result);
- } else if (offset + size > i_size_read(inode) || offset + size < 0) {
+ } else if (is_cow) {
+ *flags |= XFS_DIO_FLAG_COW;
+ set_buffer_defer_completion(bh_result);
+ }
+ if (offset + size > i_size_read(inode) || offset + size < 0) {
*flags |= XFS_DIO_FLAG_APPEND;
set_buffer_defer_completion(bh_result);
}
@@ -1147,6 +1254,44 @@ xfs_map_trim_size(
bh_result->b_size = mapping_size;
}
+/* Bounce unaligned directio writes to the page cache. */
+static int
+xfs_bounce_unaligned_dio_write(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_bmbt_irec irec;
+ xfs_fileoff_t delta;
+ bool shared;
+ bool x;
+ int error;
+
+ irec = *imap;
+ if (offset_fsb > irec.br_startoff) {
+ delta = offset_fsb - irec.br_startoff;
+ irec.br_blockcount -= delta;
+ irec.br_startblock += delta;
+ irec.br_startoff = offset_fsb;
+ }
+ error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
+ if (error)
+ return error;
+
+ /*
+ * We're here because we're trying to do a directio write to a
+ * region that isn't aligned to a filesystem block. If any part
+ * of the extent is shared, fall back to buffered mode to handle
+ * the RMW. This is done by returning -EREMCHG ("remote addr
+ * changed"), which is caught further up the call stack.
+ */
+ if (shared) {
+ trace_xfs_reflink_bounce_dio_write(ip, imap);
+ return -EREMCHG;
+ }
+ return 0;
+}
+
STATIC int
__xfs_get_blocks(
struct inode *inode,
@@ -1166,6 +1311,8 @@ __xfs_get_blocks(
xfs_off_t offset;
ssize_t size;
int new = 0;
+ bool is_cow = false;
+ bool need_alloc = false;
BUG_ON(create && !direct);
@@ -1191,8 +1338,26 @@ __xfs_get_blocks(
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
- error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
- &imap, &nimaps, XFS_BMAPI_ENTIRE);
+ if (create && direct && xfs_is_reflink_inode(ip))
+ is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap,
+ &need_alloc);
+ if (!is_cow) {
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap, &nimaps, XFS_BMAPI_ENTIRE);
+ /*
+ * Truncate an overwrite extent if there's a pending CoW
+ * reservation before the end of this extent. This
+ * forces us to come back to get_blocks to take care of
+ * the CoW.
+ */
+ if (create && direct && nimaps &&
+ imap.br_startblock != HOLESTARTBLOCK &&
+ imap.br_startblock != DELAYSTARTBLOCK &&
+ !ISUNWRITTEN(&imap))
+ xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
+ &imap);
+ }
+ ASSERT(!need_alloc);
if (error)
goto out_unlock;
@@ -1244,6 +1409,13 @@ __xfs_get_blocks(
if (imap.br_startblock != HOLESTARTBLOCK &&
imap.br_startblock != DELAYSTARTBLOCK &&
(create || !ISUNWRITTEN(&imap))) {
+ if (create && direct && !is_cow) {
+ error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
+ &imap);
+ if (error)
+ return error;
+ }
+
xfs_map_buffer(inode, bh_result, &imap, offset);
if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
@@ -1252,7 +1424,8 @@ __xfs_get_blocks(
if (dax_fault)
ASSERT(!ISUNWRITTEN(&imap));
else
- xfs_map_direct(inode, bh_result, &imap, offset);
+ xfs_map_direct(inode, bh_result, &imap, offset,
+ is_cow);
}
}
@@ -1336,13 +1509,12 @@ xfs_end_io_direct_write(
{
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
uintptr_t flags = (uintptr_t)private;
int error = 0;
trace_xfs_end_io_direct_write(ip, offset, size);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
if (size <= 0)
@@ -1375,19 +1547,17 @@ xfs_end_io_direct_write(
i_size_write(inode, offset + size);
spin_unlock(&ip->i_flags_lock);
+ if (flags & XFS_DIO_FLAG_COW)
+ error = xfs_reflink_end_cow(ip, offset, size);
if (flags & XFS_DIO_FLAG_UNWRITTEN) {
trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
error = xfs_iomap_write_unwritten(ip, offset, size);
- } else if (flags & XFS_DIO_FLAG_APPEND) {
- struct xfs_trans *tp;
-
+ }
+ if (flags & XFS_DIO_FLAG_APPEND) {
trace_xfs_end_io_direct_write_append(ip, offset, size);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
- &tp);
- if (!error)
- error = xfs_setfilesize(ip, tp, offset, size);
+ error = xfs_setfilesize(ip, offset, size);
}
return error;
@@ -1414,6 +1584,17 @@ xfs_vm_bmap(
trace_xfs_vm_bmap(XFS_I(inode));
xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+ /*
+ * The swap code (ab-)uses ->bmap to get a block mapping and then
+ * bypasseѕ the file system for actual I/O. We really can't allow
+ * that on reflinks inodes, so we have to skip out here. And yes,
+ * 0 is the magic code for a bmap error..
+ */
+ if (xfs_is_reflink_inode(ip)) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+ }
filemap_write_and_wait(mapping);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return generic_block_bmap(mapping, block, xfs_get_blocks);
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index bf2d9a141a73..b3c6634f9518 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -28,13 +28,15 @@ enum {
XFS_IO_DELALLOC, /* covers delalloc region */
XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
XFS_IO_OVERWRITE, /* covers already allocated extent */
+ XFS_IO_COW, /* covers copy-on-write extent */
};
#define XFS_IO_TYPES \
{ XFS_IO_INVALID, "invalid" }, \
{ XFS_IO_DELALLOC, "delalloc" }, \
{ XFS_IO_UNWRITTEN, "unwritten" }, \
- { XFS_IO_OVERWRITE, "overwrite" }
+ { XFS_IO_OVERWRITE, "overwrite" }, \
+ { XFS_IO_COW, "CoW" }
/*
* Structure for buffered I/O completions.
@@ -62,6 +64,7 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private);
+int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
extern void xfs_count_page_state(struct page *, int *, int *);
extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
new file mode 100644
index 000000000000..9bf57c76623b
--- /dev/null
+++ b/fs/xfs/xfs_bmap_item.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_trace.h"
+
+
+kmem_zone_t *xfs_bui_zone;
+kmem_zone_t *xfs_bud_zone;
+
+static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_bui_log_item, bui_item);
+}
+
+void
+xfs_bui_item_free(
+ struct xfs_bui_log_item *buip)
+{
+ kmem_zone_free(xfs_bui_zone, buip);
+}
+
+STATIC void
+xfs_bui_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_bui_log_item *buip = BUI_ITEM(lip);
+
+ *nvecs += 1;
+ *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given bui log item. We use only 1 iovec, and we point that
+ * at the bui_log_format structure embedded in the bui item.
+ * It is at this point that we assert that all of the extent
+ * slots in the bui item have been filled.
+ */
+STATIC void
+xfs_bui_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_bui_log_item *buip = BUI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ ASSERT(atomic_read(&buip->bui_next_extent) ==
+ buip->bui_format.bui_nextents);
+
+ buip->bui_format.bui_type = XFS_LI_BUI;
+ buip->bui_format.bui_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUI_FORMAT, &buip->bui_format,
+ xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents));
+}
+
+/*
+ * Pinning has no meaning for an bui item, so just return.
+ */
+STATIC void
+xfs_bui_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * The unpin operation is the last place an BUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the BUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the BUI to either construct
+ * and commit the BUD or drop the BUD's reference in the event of error. Simply
+ * drop the log's BUI reference now that the log is done with it.
+ */
+STATIC void
+xfs_bui_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_bui_log_item *buip = BUI_ITEM(lip);
+
+ xfs_bui_release(buip);
+}
+
+/*
+ * BUI items have no locking or pushing. However, since BUIs are pulled from
+ * the AIL when their corresponding BUDs are committed to disk, their situation
+ * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log. This should help in getting the BUI out of
+ * the AIL.
+ */
+STATIC uint
+xfs_bui_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ return XFS_ITEM_PINNED;
+}
+
+/*
+ * The BUI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an BUD isn't going to be
+ * constructed and thus we free the BUI here directly.
+ */
+STATIC void
+xfs_bui_item_unlock(
+ struct xfs_log_item *lip)
+{
+ if (lip->li_flags & XFS_LI_ABORTED)
+ xfs_bui_item_free(BUI_ITEM(lip));
+}
+
+/*
+ * The BUI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_bui_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ return lsn;
+}
+
+/*
+ * The BUI dependency tracking op doesn't do squat. It can't because
+ * it doesn't know where the free extent is coming from. The dependency
+ * tracking has to be handled by the "enclosing" metadata object. For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_bui_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all bui log items.
+ */
+static const struct xfs_item_ops xfs_bui_item_ops = {
+ .iop_size = xfs_bui_item_size,
+ .iop_format = xfs_bui_item_format,
+ .iop_pin = xfs_bui_item_pin,
+ .iop_unpin = xfs_bui_item_unpin,
+ .iop_unlock = xfs_bui_item_unlock,
+ .iop_committed = xfs_bui_item_committed,
+ .iop_push = xfs_bui_item_push,
+ .iop_committing = xfs_bui_item_committing,
+};
+
+/*
+ * Allocate and initialize an bui item with the given number of extents.
+ */
+struct xfs_bui_log_item *
+xfs_bui_init(
+ struct xfs_mount *mp)
+
+{
+ struct xfs_bui_log_item *buip;
+
+ buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP);
+
+ xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
+ buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
+ buip->bui_format.bui_id = (uintptr_t)(void *)buip;
+ atomic_set(&buip->bui_next_extent, 0);
+ atomic_set(&buip->bui_refcount, 2);
+
+ return buip;
+}
+
+/*
+ * Freeing the BUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the BUI may not yet have been placed in the AIL
+ * when called by xfs_bui_release() from BUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the BUI.
+ */
+void
+xfs_bui_release(
+ struct xfs_bui_log_item *buip)
+{
+ if (atomic_dec_and_test(&buip->bui_refcount)) {
+ xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_bui_item_free(buip);
+ }
+}
+
+static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_bud_log_item, bud_item);
+}
+
+STATIC void
+xfs_bud_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_bud_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given bud log item. We use only 1 iovec, and we point that
+ * at the bud_log_format structure embedded in the bud item.
+ * It is at this point that we assert that all of the extent
+ * slots in the bud item have been filled.
+ */
+STATIC void
+xfs_bud_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_bud_log_item *budp = BUD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ budp->bud_format.bud_type = XFS_LI_BUD;
+ budp->bud_format.bud_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUD_FORMAT, &budp->bud_format,
+ sizeof(struct xfs_bud_log_format));
+}
+
+/*
+ * Pinning has no meaning for an bud item, so just return.
+ */
+STATIC void
+xfs_bud_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an bud item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_bud_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+/*
+ * There isn't much you can do to push on an bud item. It is simply stuck
+ * waiting for the log to be flushed to disk.
+ */
+STATIC uint
+xfs_bud_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ return XFS_ITEM_PINNED;
+}
+
+/*
+ * The BUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the BUI and free the
+ * BUD.
+ */
+STATIC void
+xfs_bud_item_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_bud_log_item *budp = BUD_ITEM(lip);
+
+ if (lip->li_flags & XFS_LI_ABORTED) {
+ xfs_bui_release(budp->bud_buip);
+ kmem_zone_free(xfs_bud_zone, budp);
+ }
+}
+
+/*
+ * When the bud item is committed to disk, all we need to do is delete our
+ * reference to our partner bui item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from
+ * further referencing this item.
+ */
+STATIC xfs_lsn_t
+xfs_bud_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_bud_log_item *budp = BUD_ITEM(lip);
+
+ /*
+ * Drop the BUI reference regardless of whether the BUD has been
+ * aborted. Once the BUD transaction is constructed, it is the sole
+ * responsibility of the BUD to release the BUI (even if the BUI is
+ * aborted due to log I/O error).
+ */
+ xfs_bui_release(budp->bud_buip);
+ kmem_zone_free(xfs_bud_zone, budp);
+
+ return (xfs_lsn_t)-1;
+}
+
+/*
+ * The BUD dependency tracking op doesn't do squat. It can't because
+ * it doesn't know where the free extent is coming from. The dependency
+ * tracking has to be handled by the "enclosing" metadata object. For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_bud_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all bud log items.
+ */
+static const struct xfs_item_ops xfs_bud_item_ops = {
+ .iop_size = xfs_bud_item_size,
+ .iop_format = xfs_bud_item_format,
+ .iop_pin = xfs_bud_item_pin,
+ .iop_unpin = xfs_bud_item_unpin,
+ .iop_unlock = xfs_bud_item_unlock,
+ .iop_committed = xfs_bud_item_committed,
+ .iop_push = xfs_bud_item_push,
+ .iop_committing = xfs_bud_item_committing,
+};
+
+/*
+ * Allocate and initialize an bud item with the given number of extents.
+ */
+struct xfs_bud_log_item *
+xfs_bud_init(
+ struct xfs_mount *mp,
+ struct xfs_bui_log_item *buip)
+
+{
+ struct xfs_bud_log_item *budp;
+
+ budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP);
+ xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops);
+ budp->bud_buip = buip;
+ budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
+
+ return budp;
+}
+
+/*
+ * Process a bmap update intent item that was recovered from the log.
+ * We need to update some inode's bmbt.
+ */
+int
+xfs_bui_recover(
+ struct xfs_mount *mp,
+ struct xfs_bui_log_item *buip)
+{
+ int error = 0;
+ unsigned int bui_type;
+ struct xfs_map_extent *bmap;
+ xfs_fsblock_t startblock_fsb;
+ xfs_fsblock_t inode_fsb;
+ bool op_ok;
+ struct xfs_bud_log_item *budp;
+ enum xfs_bmap_intent_type type;
+ int whichfork;
+ xfs_exntst_t state;
+ struct xfs_trans *tp;
+ struct xfs_inode *ip = NULL;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t firstfsb;
+
+ ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
+
+ /* Only one mapping operation per BUI... */
+ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
+ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+ xfs_bui_release(buip);
+ return -EIO;
+ }
+
+ /*
+ * First check the validity of the extent described by the
+ * BUI. If anything is bad, then toss the BUI.
+ */
+ bmap = &buip->bui_format.bui_extents[0];
+ startblock_fsb = XFS_BB_TO_FSB(mp,
+ XFS_FSB_TO_DADDR(mp, bmap->me_startblock));
+ inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp,
+ XFS_INO_TO_FSB(mp, bmap->me_owner)));
+ switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+ op_ok = true;
+ break;
+ default:
+ op_ok = false;
+ break;
+ }
+ if (!op_ok || startblock_fsb == 0 ||
+ bmap->me_len == 0 ||
+ inode_fsb == 0 ||
+ startblock_fsb >= mp->m_sb.sb_dblocks ||
+ bmap->me_len >= mp->m_sb.sb_agblocks ||
+ inode_fsb >= mp->m_sb.sb_dblocks ||
+ (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) {
+ /*
+ * This will pull the BUI from the AIL and
+ * free the memory associated with it.
+ */
+ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+ xfs_bui_release(buip);
+ return -EIO;
+ }
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error)
+ return error;
+ budp = xfs_trans_get_bud(tp, buip);
+
+ /* Grab the inode. */
+ error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip);
+ if (error)
+ goto err_inode;
+
+ if (VFS_I(ip)->i_nlink == 0)
+ xfs_iflags_set(ip, XFS_IRECOVERY);
+ xfs_defer_init(&dfops, &firstfsb);
+
+ /* Process deferred bmap item. */
+ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+ switch (bui_type) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+ type = bui_type;
+ break;
+ default:
+ error = -EFSCORRUPTED;
+ goto err_dfops;
+ }
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
+ ip, whichfork, bmap->me_startoff,
+ bmap->me_startblock, bmap->me_len,
+ state);
+ if (error)
+ goto err_dfops;
+
+ /* Finish transaction, free inodes. */
+ error = xfs_defer_finish(&tp, &dfops, NULL);
+ if (error)
+ goto err_dfops;
+
+ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ IRELE(ip);
+
+ return error;
+
+err_dfops:
+ xfs_defer_cancel(&dfops);
+err_inode:
+ xfs_trans_cancel(tp);
+ if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ IRELE(ip);
+ }
+ return error;
+}
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
new file mode 100644
index 000000000000..c867daae4a3c
--- /dev/null
+++ b/fs/xfs/xfs_bmap_item.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_BMAP_ITEM_H__
+#define __XFS_BMAP_ITEM_H__
+
+/*
+ * There are (currently) two pairs of bmap btree redo item types: map & unmap.
+ * The common abbreviations for these are BUI (bmap update intent) and BUD
+ * (bmap update done). The redo item type is encoded in the flags field of
+ * each xfs_map_extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated bmbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * bmbt metadata updates in the non-first transaction.
+ */
+
+/* kernel only BUI/BUD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_BUI_MAX_FAST_EXTENTS 1
+
+/*
+ * Define BUI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define XFS_BUI_RECOVERED 1
+
+/*
+ * This is the "bmap update intent" log item. It is used to log the fact that
+ * some reverse mappings need to change. It is used in conjunction with the
+ * "bmap update done" log item described below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_bui_log_item {
+ struct xfs_log_item bui_item;
+ atomic_t bui_refcount;
+ atomic_t bui_next_extent;
+ unsigned long bui_flags; /* misc flags */
+ struct xfs_bui_log_format bui_format;
+};
+
+static inline size_t
+xfs_bui_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_bui_log_item, bui_format) +
+ xfs_bui_log_format_sizeof(nr);
+}
+
+/*
+ * This is the "bmap update done" log item. It is used to log the fact that
+ * some bmbt updates mentioned in an earlier bui item have been performed.
+ */
+struct xfs_bud_log_item {
+ struct xfs_log_item bud_item;
+ struct xfs_bui_log_item *bud_buip;
+ struct xfs_bud_log_format bud_format;
+};
+
+extern struct kmem_zone *xfs_bui_zone;
+extern struct kmem_zone *xfs_bud_zone;
+
+struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
+struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *,
+ struct xfs_bui_log_item *);
+void xfs_bui_item_free(struct xfs_bui_log_item *);
+void xfs_bui_release(struct xfs_bui_log_item *);
+int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip);
+
+#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 4ece4f2ffc72..552465e011ec 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -42,6 +42,9 @@
#include "xfs_icache.h"
#include "xfs_log.h"
#include "xfs_rmap_btree.h"
+#include "xfs_iomap.h"
+#include "xfs_reflink.h"
+#include "xfs_refcount.h"
/* Kernel only BMAP related definitions and functions */
@@ -182,7 +185,7 @@ xfs_bmap_rtalloc(
XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
/* Zero the extent if we were asked to do so */
- if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) {
error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
if (error)
return error;
@@ -389,11 +392,13 @@ xfs_bmap_count_blocks(
STATIC int
xfs_getbmapx_fix_eof_hole(
xfs_inode_t *ip, /* xfs incore inode pointer */
+ int whichfork,
struct getbmapx *out, /* output structure */
int prealloced, /* this is a file with
* preallocated data space */
__int64_t end, /* last block requested */
- xfs_fsblock_t startblock)
+ xfs_fsblock_t startblock,
+ bool moretocome)
{
__int64_t fixlen;
xfs_mount_t *mp; /* file system mount point */
@@ -418,8 +423,9 @@ xfs_getbmapx_fix_eof_hole(
else
out->bmv_block = xfs_fsb_to_db(ip, startblock);
fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!moretocome &&
+ xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
(lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
out->bmv_oflags |= BMV_OF_LAST;
}
@@ -427,6 +433,81 @@ xfs_getbmapx_fix_eof_hole(
return 1;
}
+/* Adjust the reported bmap around shared/unshared extent transitions. */
+STATIC int
+xfs_getbmap_adjust_shared(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *map,
+ struct getbmapx *out,
+ struct xfs_bmbt_irec *next_map)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t ebno;
+ xfs_extlen_t elen;
+ xfs_extlen_t nlen;
+ int error;
+
+ next_map->br_startblock = NULLFSBLOCK;
+ next_map->br_startoff = NULLFILEOFF;
+ next_map->br_blockcount = 0;
+
+ /* Only written data blocks can be shared. */
+ if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK ||
+ map->br_startblock == DELAYSTARTBLOCK ||
+ map->br_startblock == HOLESTARTBLOCK ||
+ ISUNWRITTEN(map))
+ return 0;
+
+ agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
+ error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount,
+ &ebno, &elen, true);
+ if (error)
+ return error;
+
+ if (ebno == NULLAGBLOCK) {
+ /* No shared blocks at all. */
+ return 0;
+ } else if (agbno == ebno) {
+ /*
+ * Shared extent at (agbno, elen). Shrink the reported
+ * extent length and prepare to move the start of map[i]
+ * to agbno+elen, with the aim of (re)formatting the new
+ * map[i] the next time through the inner loop.
+ */
+ out->bmv_length = XFS_FSB_TO_BB(mp, elen);
+ out->bmv_oflags |= BMV_OF_SHARED;
+ if (elen != map->br_blockcount) {
+ *next_map = *map;
+ next_map->br_startblock += elen;
+ next_map->br_startoff += elen;
+ next_map->br_blockcount -= elen;
+ }
+ map->br_blockcount -= elen;
+ } else {
+ /*
+ * There's an unshared extent (agbno, ebno - agbno)
+ * followed by shared extent at (ebno, elen). Shrink
+ * the reported extent length to cover only the unshared
+ * extent and prepare to move up the start of map[i] to
+ * ebno, with the aim of (re)formatting the new map[i]
+ * the next time through the inner loop.
+ */
+ *next_map = *map;
+ nlen = ebno - agbno;
+ out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
+ next_map->br_startblock += nlen;
+ next_map->br_startoff += nlen;
+ next_map->br_blockcount -= nlen;
+ map->br_blockcount -= nlen;
+ }
+
+ return 0;
+}
+
/*
* Get inode's extents as described in bmv, and format for output.
* Calls formatter to fill the user's buffer until all extents
@@ -459,12 +540,28 @@ xfs_getbmap(
int iflags; /* interface flags */
int bmapi_flags; /* flags for xfs_bmapi */
int cur_ext = 0;
+ struct xfs_bmbt_irec inject_map;
mp = ip->i_mount;
iflags = bmv->bmv_iflags;
- whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
- if (whichfork == XFS_ATTR_FORK) {
+#ifndef DEBUG
+ /* Only allow CoW fork queries if we're debugging. */
+ if (iflags & BMV_IF_COWFORK)
+ return -EINVAL;
+#endif
+ if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
+ return -EINVAL;
+
+ if (iflags & BMV_IF_ATTRFORK)
+ whichfork = XFS_ATTR_FORK;
+ else if (iflags & BMV_IF_COWFORK)
+ whichfork = XFS_COW_FORK;
+ else
+ whichfork = XFS_DATA_FORK;
+
+ switch (whichfork) {
+ case XFS_ATTR_FORK:
if (XFS_IFORK_Q(ip)) {
if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
@@ -480,7 +577,20 @@ xfs_getbmap(
prealloced = 0;
fixlen = 1LL << 32;
- } else {
+ break;
+ case XFS_COW_FORK:
+ if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
+ return -EINVAL;
+
+ if (xfs_get_cowextsz_hint(ip)) {
+ prealloced = 1;
+ fixlen = mp->m_super->s_maxbytes;
+ } else {
+ prealloced = 0;
+ fixlen = XFS_ISIZE(ip);
+ }
+ break;
+ default:
if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
@@ -494,6 +604,7 @@ xfs_getbmap(
prealloced = 0;
fixlen = XFS_ISIZE(ip);
}
+ break;
}
if (bmv->bmv_length == -1) {
@@ -520,7 +631,8 @@ xfs_getbmap(
return -ENOMEM;
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- if (whichfork == XFS_DATA_FORK) {
+ switch (whichfork) {
+ case XFS_DATA_FORK:
if (!(iflags & BMV_IF_DELALLOC) &&
(ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -538,8 +650,14 @@ xfs_getbmap(
}
lock = xfs_ilock_data_map_shared(ip);
- } else {
+ break;
+ case XFS_COW_FORK:
+ lock = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lock);
+ break;
+ case XFS_ATTR_FORK:
lock = xfs_ilock_attr_map_shared(ip);
+ break;
}
/*
@@ -581,7 +699,8 @@ xfs_getbmap(
goto out_free_map;
ASSERT(nmap <= subnex);
- for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+ for (i = 0; i < nmap && nexleft && bmv->bmv_length &&
+ cur_ext < bmv->bmv_count; i++) {
out[cur_ext].bmv_oflags = 0;
if (map[i].br_state == XFS_EXT_UNWRITTEN)
out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
@@ -614,9 +733,16 @@ xfs_getbmap(
goto out_free_map;
}
- if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
- prealloced, bmvend,
- map[i].br_startblock))
+ /* Is this a shared block? */
+ error = xfs_getbmap_adjust_shared(ip, whichfork,
+ &map[i], &out[cur_ext], &inject_map);
+ if (error)
+ goto out_free_map;
+
+ if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
+ &out[cur_ext], prealloced, bmvend,
+ map[i].br_startblock,
+ inject_map.br_startblock != NULLFSBLOCK))
goto out_free_map;
bmv->bmv_offset =
@@ -636,11 +762,16 @@ xfs_getbmap(
continue;
}
- nexleft--;
+ if (inject_map.br_startblock != NULLFSBLOCK) {
+ map[i] = inject_map;
+ i--;
+ } else
+ nexleft--;
bmv->bmv_entries++;
cur_ext++;
}
- } while (nmap && nexleft && bmv->bmv_length);
+ } while (nmap && nexleft && bmv->bmv_length &&
+ cur_ext < bmv->bmv_count);
out_free_map:
kmem_free(map);
@@ -1433,8 +1564,8 @@ xfs_insert_file_space(
*/
static int
xfs_swap_extents_check_format(
- xfs_inode_t *ip, /* target inode */
- xfs_inode_t *tip) /* tmp inode */
+ struct xfs_inode *ip, /* target inode */
+ struct xfs_inode *tip) /* tmp inode */
{
/* Should never get a local format */
@@ -1450,6 +1581,13 @@ xfs_swap_extents_check_format(
return -EINVAL;
/*
+ * If we have to use the (expensive) rmap swap method, we can
+ * handle any number of extents and any format.
+ */
+ if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
+ return 0;
+
+ /*
* if the target inode is in extent form and the temp inode is in btree
* form then we will end up with the target inode in the wrong format
* as we already know there are less extents in the temp inode.
@@ -1518,125 +1656,161 @@ xfs_swap_extent_flush(
return 0;
}
-int
-xfs_swap_extents(
- xfs_inode_t *ip, /* target inode */
- xfs_inode_t *tip, /* tmp inode */
- xfs_swapext_t *sxp)
+/*
+ * Move extents from one file to another, when rmap is enabled.
+ */
+STATIC int
+xfs_swap_extent_rmap(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ struct xfs_inode *tip)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_trans_t *tp;
- xfs_bstat_t *sbp = &sxp->sx_stat;
- xfs_ifork_t *tempifp, *ifp, *tifp;
- int src_log_flags, target_log_flags;
- int error = 0;
- int aforkblks = 0;
- int taforkblks = 0;
- __uint64_t tmp;
- int lock_flags;
-
- /* XXX: we can't do this with rmap, will fix later */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
- return -EOPNOTSUPP;
-
- tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
- if (!tempifp) {
- error = -ENOMEM;
- goto out;
- }
+ struct xfs_bmbt_irec irec;
+ struct xfs_bmbt_irec uirec;
+ struct xfs_bmbt_irec tirec;
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ xfs_filblks_t count_fsb;
+ xfs_fsblock_t firstfsb;
+ struct xfs_defer_ops dfops;
+ int error;
+ xfs_filblks_t ilen;
+ xfs_filblks_t rlen;
+ int nimaps;
+ __uint64_t tip_flags2;
/*
- * Lock the inodes against other IO, page faults and truncate to
- * begin with. Then we can ensure the inodes are flushed and have no
- * page cache safely. Once we have done this we can take the ilocks and
- * do the rest of the checks.
+ * If the source file has shared blocks, we must flag the donor
+ * file as having shared blocks so that we get the shared-block
+ * rmap functions when we go to fix up the rmaps. The flags
+ * will be switch for reals later.
*/
- lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
- xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
-
- /* Verify that both files have the same format */
- if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
- error = -EINVAL;
- goto out_unlock;
- }
+ tip_flags2 = tip->i_d.di_flags2;
+ if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)
+ tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+
+ offset_fsb = 0;
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
+ count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+
+ while (count_fsb) {
+ /* Read extent from the donor file */
+ nimaps = 1;
+ error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
+ &nimaps, 0);
+ if (error)
+ goto out;
+ ASSERT(nimaps == 1);
+ ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);
+
+ trace_xfs_swap_extent_rmap_remap(tip, &tirec);
+ ilen = tirec.br_blockcount;
+
+ /* Unmap the old blocks in the source file. */
+ while (tirec.br_blockcount) {
+ xfs_defer_init(&dfops, &firstfsb);
+ trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);
+
+ /* Read extent from the source file */
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, tirec.br_startoff,
+ tirec.br_blockcount, &irec,
+ &nimaps, 0);
+ if (error)
+ goto out_defer;
+ ASSERT(nimaps == 1);
+ ASSERT(tirec.br_startoff == irec.br_startoff);
+ trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);
+
+ /* Trim the extent. */
+ uirec = tirec;
+ uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
+ tirec.br_blockcount,
+ irec.br_blockcount);
+ trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
+
+ /* Remove the mapping from the donor file. */
+ error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops,
+ tip, &uirec);
+ if (error)
+ goto out_defer;
- /* Verify both files are either real-time or non-realtime */
- if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
- error = -EINVAL;
- goto out_unlock;
- }
+ /* Remove the mapping from the source file. */
+ error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops,
+ ip, &irec);
+ if (error)
+ goto out_defer;
- error = xfs_swap_extent_flush(ip);
- if (error)
- goto out_unlock;
- error = xfs_swap_extent_flush(tip);
- if (error)
- goto out_unlock;
+ /* Map the donor file's blocks into the source file. */
+ error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops,
+ ip, &uirec);
+ if (error)
+ goto out_defer;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
- if (error)
- goto out_unlock;
+ /* Map the source file's blocks into the donor file. */
+ error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops,
+ tip, &irec);
+ if (error)
+ goto out_defer;
- /*
- * Lock and join the inodes to the tansaction so that transaction commit
- * or cancel will unlock the inodes from this point onwards.
- */
- xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
- lock_flags |= XFS_ILOCK_EXCL;
- xfs_trans_ijoin(tp, ip, lock_flags);
- xfs_trans_ijoin(tp, tip, lock_flags);
+ error = xfs_defer_finish(tpp, &dfops, ip);
+ if (error)
+ goto out_defer;
+ tirec.br_startoff += rlen;
+ if (tirec.br_startblock != HOLESTARTBLOCK &&
+ tirec.br_startblock != DELAYSTARTBLOCK)
+ tirec.br_startblock += rlen;
+ tirec.br_blockcount -= rlen;
+ }
- /* Verify all data are being swapped */
- if (sxp->sx_offset != 0 ||
- sxp->sx_length != ip->i_d.di_size ||
- sxp->sx_length != tip->i_d.di_size) {
- error = -EFAULT;
- goto out_trans_cancel;
+ /* Roll on... */
+ count_fsb -= ilen;
+ offset_fsb += ilen;
}
- trace_xfs_swap_extent_before(ip, 0);
- trace_xfs_swap_extent_before(tip, 1);
+ tip->i_d.di_flags2 = tip_flags2;
+ return 0;
- /* check inode formats now that data is flushed */
- error = xfs_swap_extents_check_format(ip, tip);
- if (error) {
- xfs_notice(mp,
- "%s: inode 0x%llx format is incompatible for exchanging.",
- __func__, ip->i_ino);
- goto out_trans_cancel;
- }
+out_defer:
+ xfs_defer_cancel(&dfops);
+out:
+ trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
+ tip->i_d.di_flags2 = tip_flags2;
+ return error;
+}
+
+/* Swap the extents of two files by swapping data forks. */
+STATIC int
+xfs_swap_extent_forks(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_inode *tip,
+ int *src_log_flags,
+ int *target_log_flags)
+{
+ struct xfs_ifork tempifp, *ifp, *tifp;
+ int aforkblks = 0;
+ int taforkblks = 0;
+ __uint64_t tmp;
+ int error;
- /*
- * Compare the current change & modify times with that
- * passed in. If they differ, we abort this swap.
- * This is the mechanism used to ensure the calling
- * process that the file was not changed out from
- * under it.
- */
- if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
- (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
- (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
- (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
- error = -EBUSY;
- goto out_trans_cancel;
- }
/*
* Count the number of extended attribute blocks
*/
if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
- error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+ error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK,
+ &aforkblks);
if (error)
- goto out_trans_cancel;
+ return error;
}
if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
(tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
- &taforkblks);
+ &taforkblks);
if (error)
- goto out_trans_cancel;
+ return error;
}
/*
@@ -1645,31 +1819,23 @@ xfs_swap_extents(
* buffers, and so the validation done on read will expect the owner
* field to be correctly set. Once we change the owners, we can swap the
* inode forks.
- *
- * Note the trickiness in setting the log flags - we set the owner log
- * flag on the opposite inode (i.e. the inode we are setting the new
- * owner to be) because once we swap the forks and log that, log
- * recovery is going to see the fork as owned by the swapped inode,
- * not the pre-swapped inodes.
*/
- src_log_flags = XFS_ILOG_CORE;
- target_log_flags = XFS_ILOG_CORE;
if (ip->i_d.di_version == 3 &&
ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
- target_log_flags |= XFS_ILOG_DOWNER;
+ (*target_log_flags) |= XFS_ILOG_DOWNER;
error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
tip->i_ino, NULL);
if (error)
- goto out_trans_cancel;
+ return error;
}
if (tip->i_d.di_version == 3 &&
tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
- src_log_flags |= XFS_ILOG_DOWNER;
+ (*src_log_flags) |= XFS_ILOG_DOWNER;
error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
ip->i_ino, NULL);
if (error)
- goto out_trans_cancel;
+ return error;
}
/*
@@ -1677,9 +1843,9 @@ xfs_swap_extents(
*/
ifp = &ip->i_df;
tifp = &tip->i_df;
- *tempifp = *ifp; /* struct copy */
+ tempifp = *ifp; /* struct copy */
*ifp = *tifp; /* struct copy */
- *tifp = *tempifp; /* struct copy */
+ *tifp = tempifp; /* struct copy */
/*
* Fix the on-disk inode values
@@ -1719,12 +1885,12 @@ xfs_swap_extents(
ifp->if_u1.if_extents =
ifp->if_u2.if_inline_ext;
}
- src_log_flags |= XFS_ILOG_DEXT;
+ (*src_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
ASSERT(ip->i_d.di_version < 3 ||
- (src_log_flags & XFS_ILOG_DOWNER));
- src_log_flags |= XFS_ILOG_DBROOT;
+ (*src_log_flags & XFS_ILOG_DOWNER));
+ (*src_log_flags) |= XFS_ILOG_DBROOT;
break;
}
@@ -1738,15 +1904,166 @@ xfs_swap_extents(
tifp->if_u1.if_extents =
tifp->if_u2.if_inline_ext;
}
- target_log_flags |= XFS_ILOG_DEXT;
+ (*target_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
- target_log_flags |= XFS_ILOG_DBROOT;
+ (*target_log_flags) |= XFS_ILOG_DBROOT;
ASSERT(tip->i_d.di_version < 3 ||
- (target_log_flags & XFS_ILOG_DOWNER));
+ (*target_log_flags & XFS_ILOG_DOWNER));
break;
}
+ return 0;
+}
+
+int
+xfs_swap_extents(
+ struct xfs_inode *ip, /* target inode */
+ struct xfs_inode *tip, /* tmp inode */
+ struct xfs_swapext *sxp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_bstat *sbp = &sxp->sx_stat;
+ int src_log_flags, target_log_flags;
+ int error = 0;
+ int lock_flags;
+ struct xfs_ifork *cowfp;
+ __uint64_t f;
+ int resblks;
+
+ /*
+ * Lock the inodes against other IO, page faults and truncate to
+ * begin with. Then we can ensure the inodes are flushed and have no
+ * page cache safely. Once we have done this we can take the ilocks and
+ * do the rest of the checks.
+ */
+ lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+ xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
+
+ /* Verify that both files have the same format */
+ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Verify both files are either real-time or non-realtime */
+ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
+ error = xfs_swap_extent_flush(ip);
+ if (error)
+ goto out_unlock;
+ error = xfs_swap_extent_flush(tip);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Extent "swapping" with rmap requires a permanent reservation and
+ * a block reservation because it's really just a remap operation
+ * performed with log redo items!
+ */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ /*
+ * Conceptually this shouldn't affect the shape of either
+ * bmbt, but since we atomically move extents one by one,
+ * we reserve enough space to rebuild both trees.
+ */
+ resblks = XFS_SWAP_RMAP_SPACE_RES(mp,
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK),
+ XFS_DATA_FORK) +
+ XFS_SWAP_RMAP_SPACE_RES(mp,
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
+ XFS_DATA_FORK);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+ 0, 0, &tp);
+ } else
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
+ 0, 0, &tp);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Lock and join the inodes to the tansaction so that transaction commit
+ * or cancel will unlock the inodes from this point onwards.
+ */
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+ lock_flags |= XFS_ILOCK_EXCL;
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, tip, 0);
+
+
+ /* Verify all data are being swapped */
+ if (sxp->sx_offset != 0 ||
+ sxp->sx_length != ip->i_d.di_size ||
+ sxp->sx_length != tip->i_d.di_size) {
+ error = -EFAULT;
+ goto out_trans_cancel;
+ }
+
+ trace_xfs_swap_extent_before(ip, 0);
+ trace_xfs_swap_extent_before(tip, 1);
+
+ /* check inode formats now that data is flushed */
+ error = xfs_swap_extents_check_format(ip, tip);
+ if (error) {
+ xfs_notice(mp,
+ "%s: inode 0x%llx format is incompatible for exchanging.",
+ __func__, ip->i_ino);
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Compare the current change & modify times with that
+ * passed in. If they differ, we abort this swap.
+ * This is the mechanism used to ensure the calling
+ * process that the file was not changed out from
+ * under it.
+ */
+ if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+ error = -EBUSY;
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Note the trickiness in setting the log flags - we set the owner log
+ * flag on the opposite inode (i.e. the inode we are setting the new
+ * owner to be) because once we swap the forks and log that, log
+ * recovery is going to see the fork as owned by the swapped inode,
+ * not the pre-swapped inodes.
+ */
+ src_log_flags = XFS_ILOG_CORE;
+ target_log_flags = XFS_ILOG_CORE;
+
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ error = xfs_swap_extent_rmap(&tp, ip, tip);
+ else
+ error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
+ &target_log_flags);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Do we have to swap reflink flags? */
+ if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^
+ (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) {
+ f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
+ ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
+ tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
+ cowfp = ip->i_cowfp;
+ ip->i_cowfp = tip->i_cowfp;
+ tip->i_cowfp = cowfp;
+ xfs_inode_set_cowblocks_tag(ip);
+ xfs_inode_set_cowblocks_tag(tip);
+ }
+
xfs_trans_log_inode(tp, ip, src_log_flags);
xfs_trans_log_inode(tp, tip, target_log_flags);
@@ -1761,16 +2078,16 @@ xfs_swap_extents(
trace_xfs_swap_extent_after(ip, 0);
trace_xfs_swap_extent_after(tip, 1);
-out:
- kmem_free(tempifp);
- return error;
-out_unlock:
xfs_iunlock(ip, lock_flags);
xfs_iunlock(tip, lock_flags);
- goto out;
+ return error;
out_trans_cancel:
xfs_trans_cancel(tp);
- goto out;
+
+out_unlock:
+ xfs_iunlock(ip, lock_flags);
+ xfs_iunlock(tip, lock_flags);
+ return error;
}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index e455f9098d49..2975cb2319f4 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -865,7 +865,7 @@ xfs_buf_item_log_segment(
*/
if (bit) {
end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
- mask = ((1 << (end_bit - bit)) - 1) << bit;
+ mask = ((1U << (end_bit - bit)) - 1) << bit;
*wordp |= mask;
wordp++;
bits_set = end_bit - bit;
@@ -888,7 +888,7 @@ xfs_buf_item_log_segment(
*/
end_bit = bits_to_set - bits_set;
if (end_bit) {
- mask = (1 << end_bit) - 1;
+ mask = (1U << end_bit) - 1;
*wordp |= mask;
}
}
@@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error(
bp->b_last_error != bp->b_error) {
bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
bp->b_last_error = bp->b_error;
- if (cfg->retry_timeout && !bp->b_first_retry_time)
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ !bp->b_first_retry_time)
bp->b_first_retry_time = jiffies;
xfs_buf_ioerror(bp, 0);
@@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error(
if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
++bp->b_retries > cfg->max_retries)
goto permanent_error;
- if (cfg->retry_timeout &&
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
goto permanent_error;
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index f44f79996978..29816981b50a 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -84,7 +84,8 @@ xfs_dir2_sf_getdents(
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+ if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count))
+ return -EFSCORRUPTED;
/*
* If the block number in the offset is out of range, we're done.
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 3d224702fbc0..05f8666733a0 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -92,7 +92,11 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_ERRTAG_BMAPIFORMAT 21
#define XFS_ERRTAG_FREE_EXTENT 22
#define XFS_ERRTAG_RMAP_FINISH_ONE 23
-#define XFS_ERRTAG_MAX 24
+#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24
+#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
+#define XFS_ERRTAG_BMAP_FINISH_ONE 26
+#define XFS_ERRTAG_AG_RESV_CRITICAL 27
+#define XFS_ERRTAG_MAX 28
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -121,6 +125,10 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
#define XFS_RANDOM_FREE_EXTENT 1
#define XFS_RANDOM_RMAP_FINISH_ONE 1
+#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
+#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
+#define XFS_RANDOM_BMAP_FINISH_ONE 1
+#define XFS_RANDOM_AG_RESV_CRITICAL 4
#ifdef DEBUG
extern int xfs_error_test_active;
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index c263e079273e..162dc186cf04 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -384,7 +384,7 @@ restart:
* If this is a metadata allocation, try to reuse the busy
* extent instead of trimming the allocation.
*/
- if (!args->userdata &&
+ if (!xfs_alloc_is_userdata(args->datatype) &&
!(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
if (!xfs_extent_busy_update_extent(args->mp, args->pag,
busyp, fbno, flen,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e612a0233710..6e4f7f900fea 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
+#include "xfs_reflink.h"
#include <linux/dcache.h>
#include <linux/falloc.h>
@@ -248,6 +249,7 @@ xfs_file_dio_aio_read(
struct xfs_inode *ip = XFS_I(inode);
loff_t isize = i_size_read(inode);
size_t count = iov_iter_count(to);
+ loff_t end = iocb->ki_pos + count - 1;
struct iov_iter data;
struct xfs_buftarg *target;
ssize_t ret = 0;
@@ -269,61 +271,35 @@ xfs_file_dio_aio_read(
return -EINVAL;
}
- /*
- * Locking is a bit tricky here. If we take an exclusive lock for direct
- * IO, we effectively serialise all new concurrent read IO to this file
- * and block it behind IO that is currently in progress because IO in
- * progress holds the IO lock shared. We only need to hold the lock
- * exclusive to blow away the page cache, so only take lock exclusively
- * if the page cache needs invalidation. This allows the normal direct
- * IO case of no page cache pages to proceeed concurrently without
- * serialisation.
- */
+ file_accessed(iocb->ki_filp);
+
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
if (mapping->nrpages) {
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
- xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+ if (ret)
+ goto out_unlock;
/*
- * The generic dio code only flushes the range of the particular
- * I/O. Because we take an exclusive lock here, this whole
- * sequence is considerably more expensive for us. This has a
- * noticeable performance impact for any file with cached pages,
- * even when outside of the range of the particular I/O.
- *
- * Hence, amortize the cost of the lock against a full file
- * flush and reduce the chances of repeated iolock cycles going
- * forward.
+ * Invalidate whole pages. This can return an error if we fail
+ * to invalidate a page, but this should never happen on XFS.
+ * Warn if it does fail.
*/
- if (mapping->nrpages) {
- ret = filemap_write_and_wait(mapping);
- if (ret) {
- xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
- return ret;
- }
-
- /*
- * Invalidate whole pages. This can return an error if
- * we fail to invalidate a page, but this should never
- * happen on XFS. Warn if it does fail.
- */
- ret = invalidate_inode_pages2(mapping);
- WARN_ON_ONCE(ret);
- ret = 0;
- }
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
data = *to;
ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
xfs_get_blocks_direct, NULL, NULL, 0);
- if (ret > 0) {
+ if (ret >= 0) {
iocb->ki_pos += ret;
iov_iter_advance(to, ret);
}
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
- file_accessed(iocb->ki_filp);
+out_unlock:
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -332,10 +308,7 @@ xfs_file_dax_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct iov_iter data = *to;
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
size_t count = iov_iter_count(to);
ssize_t ret = 0;
@@ -345,11 +318,7 @@ xfs_file_dax_read(
return 0; /* skip atime */
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
- if (ret > 0) {
- iocb->ki_pos += ret;
- iov_iter_advance(to, ret);
- }
+ ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
file_accessed(iocb->ki_filp);
@@ -399,45 +368,6 @@ xfs_file_read_iter(
return ret;
}
-STATIC ssize_t
-xfs_file_splice_read(
- struct file *infilp,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t count,
- unsigned int flags)
-{
- struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
- ssize_t ret;
-
- XFS_STATS_INC(ip->i_mount, xs_read_calls);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- trace_xfs_file_splice_read(ip, count, *ppos);
-
- /*
- * DAX inodes cannot ues the page cache for splice, so we have to push
- * them through the VFS IO path. This means it goes through
- * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we
- * cannot lock the splice operation at this level for DAX inodes.
- */
- if (IS_DAX(VFS_I(ip))) {
- ret = default_file_splice_read(infilp, ppos, pipe, count,
- flags);
- goto out;
- }
-
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
-out:
- if (ret > 0)
- XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
- return ret;
-}
-
/*
* Zero any on disk space between the current EOF and the new, larger EOF.
*
@@ -614,61 +544,49 @@ xfs_file_dio_aio_write(
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- /* "unaligned" here means not aligned to a filesystem block */
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + count) & mp->m_blockmask))
- unaligned_io = 1;
-
/*
- * We don't need to take an exclusive lock unless there page cache needs
- * to be invalidated or unaligned IO is being executed. We don't need to
- * consider the EOF extension case here because
- * xfs_file_aio_write_checks() will relock the inode as necessary for
- * EOF zeroing cases and fill out the new inode size as appropriate.
+ * Don't take the exclusive iolock here unless the I/O is unaligned to
+ * the file system block size. We don't need to consider the EOF
+ * extension case here because xfs_file_aio_write_checks() will relock
+ * the inode as necessary for EOF zeroing cases and fill out the new
+ * inode size as appropriate.
*/
- if (unaligned_io || mapping->nrpages)
+ if ((iocb->ki_pos & mp->m_blockmask) ||
+ ((iocb->ki_pos + count) & mp->m_blockmask)) {
+ unaligned_io = 1;
iolock = XFS_IOLOCK_EXCL;
- else
+ } else {
iolock = XFS_IOLOCK_SHARED;
- xfs_rw_ilock(ip, iolock);
-
- /*
- * Recheck if there are cached pages that need invalidate after we got
- * the iolock to protect against other threads adding new pages while
- * we were waiting for the iolock.
- */
- if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, iolock);
- iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, iolock);
}
+ xfs_rw_ilock(ip, iolock);
+
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
count = iov_iter_count(from);
end = iocb->ki_pos + count - 1;
- /*
- * See xfs_file_dio_aio_read() for why we do a full-file flush here.
- */
if (mapping->nrpages) {
- ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
if (ret)
goto out;
+
/*
* Invalidate whole pages. This can return an error if we fail
* to invalidate a page, but this should never happen on XFS.
* Warn if it does fail.
*/
- ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
WARN_ON_ONCE(ret);
ret = 0;
}
/*
* If we are doing unaligned IO, wait for all other IO to drain,
- * otherwise demote the lock if we had to flush cached pages
+ * otherwise demote the lock if we had to take the exclusive lock
+ * for other reasons in xfs_file_aio_write_checks.
*/
if (unaligned_io)
inode_dio_wait(inode);
@@ -679,6 +597,13 @@ xfs_file_dio_aio_write(
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+ /* If this is a block-aligned directio CoW, remap immediately. */
+ if (xfs_is_reflink_inode(ip) && !unaligned_io) {
+ ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
+ if (ret)
+ goto out;
+ }
+
data = *from;
ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
xfs_get_blocks_direct, xfs_end_io_direct_write,
@@ -711,70 +636,32 @@ xfs_file_dax_write(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0;
- int unaligned_io = 0;
- int iolock;
- struct iov_iter data;
+ int iolock = XFS_IOLOCK_EXCL;
+ ssize_t ret, error = 0;
+ size_t count;
+ loff_t pos;
- /* "unaligned" here means not aligned to a filesystem block */
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
- unaligned_io = 1;
- iolock = XFS_IOLOCK_EXCL;
- } else if (mapping->nrpages) {
- iolock = XFS_IOLOCK_EXCL;
- } else {
- iolock = XFS_IOLOCK_SHARED;
- }
xfs_rw_ilock(ip, iolock);
-
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
- /*
- * Yes, even DAX files can have page cache attached to them: A zeroed
- * page is inserted into the pagecache when we have to serve a write
- * fault on a hole. It should never be dirtied and can simply be
- * dropped from the pagecache once we get real data for the page.
- *
- * XXX: This is racy against mmap, and there's nothing we can do about
- * it. dax_do_io() should really do this invalidation internally as
- * it will know if we've allocated over a holei for this specific IO and
- * if so it needs to update the mapping tree and invalidate existing
- * PTEs over the newly allocated range. Remove this invalidation when
- * dax_do_io() is fixed up.
- */
- if (mapping->nrpages) {
- loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+ pos = iocb->ki_pos;
+ count = iov_iter_count(from);
- ret = invalidate_inode_pages2_range(mapping,
- iocb->ki_pos >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
- WARN_ON_ONCE(ret);
- }
+ trace_xfs_file_dax_write(ip, count, pos);
- if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
- iolock = XFS_IOLOCK_SHARED;
+ ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+ if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+ i_size_write(inode, iocb->ki_pos);
+ error = xfs_setfilesize(ip, pos, ret);
}
- trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
-
- data = *from;
- ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
- xfs_end_io_direct_write, 0);
- if (ret > 0) {
- iocb->ki_pos += ret;
- iov_iter_advance(from, ret);
- }
out:
xfs_rw_iunlock(ip, iolock);
- return ret;
+ return error ? error : ret;
}
STATIC ssize_t
@@ -818,6 +705,9 @@ write_retry:
enospc = xfs_inode_free_quota_eofblocks(ip);
if (enospc)
goto write_retry;
+ enospc = xfs_inode_free_quota_cowblocks(ip);
+ if (enospc)
+ goto write_retry;
} else if (ret == -ENOSPC && !enospc) {
struct xfs_eofblocks eofb = {0};
@@ -857,10 +747,20 @@ xfs_file_write_iter(
if (IS_DAX(inode))
ret = xfs_file_dax_write(iocb, from);
- else if (iocb->ki_flags & IOCB_DIRECT)
+ else if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * Allow a directio write to fall back to a buffered
+ * write *only* in the case that we're doing a reflink
+ * CoW. In all other directio scenarios we do not
+ * allow an operation to fall back to buffered mode.
+ */
ret = xfs_file_dio_aio_write(iocb, from);
- else
+ if (ret == -EREMCHG)
+ goto buffered;
+ } else {
+buffered:
ret = xfs_file_buffered_aio_write(iocb, from);
+ }
if (ret > 0) {
XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
@@ -874,7 +774,7 @@ xfs_file_write_iter(
#define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
- FALLOC_FL_INSERT_RANGE)
+ FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
STATIC long
xfs_file_fallocate(
@@ -964,9 +864,15 @@ xfs_file_fallocate(
if (mode & FALLOC_FL_ZERO_RANGE)
error = xfs_zero_file_space(ip, offset, len);
- else
+ else {
+ if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ error = xfs_reflink_unshare(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
+ }
if (error)
goto out_unlock;
}
@@ -984,7 +890,7 @@ xfs_file_fallocate(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = xfs_setattr_size(ip, &iattr);
+ error = xfs_vn_setattr_size(file_dentry(file), &iattr);
if (error)
goto out_unlock;
}
@@ -1003,6 +909,61 @@ out_unlock:
return error;
}
+STATIC ssize_t
+xfs_file_copy_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ size_t len,
+ unsigned int flags)
+{
+ int error;
+
+ error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+ if (error)
+ return error;
+ return len;
+}
+
+STATIC int
+xfs_file_clone_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len)
+{
+ return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+}
+
+#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+STATIC ssize_t
+xfs_file_dedupe_range(
+ struct file *src_file,
+ u64 loff,
+ u64 len,
+ struct file *dst_file,
+ u64 dst_loff)
+{
+ int error;
+
+ /*
+ * Limit the total length we will dedupe for each operation.
+ * This is intended to bound the total time spent in this
+ * ioctl to something sane.
+ */
+ if (len > XFS_MAX_DEDUPE_LEN)
+ len = XFS_MAX_DEDUPE_LEN;
+
+ error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
+ len, true);
+ if (error)
+ return error;
+ return len;
+}
STATIC int
xfs_file_open(
@@ -1513,7 +1474,7 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
+ ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
} else {
ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
@@ -1547,7 +1508,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
+ ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1652,7 +1613,7 @@ const struct file_operations xfs_file_operations = {
.llseek = xfs_file_llseek,
.read_iter = xfs_file_read_iter,
.write_iter = xfs_file_write_iter,
- .splice_read = xfs_file_splice_read,
+ .splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
@@ -1662,7 +1623,11 @@ const struct file_operations xfs_file_operations = {
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
+ .get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
+ .copy_file_range = xfs_file_copy_range,
+ .clone_file_range = xfs_file_clone_range,
+ .dedupe_file_range = xfs_file_dedupe_range,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 4a33a3304369..043ca3808ea2 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -30,6 +30,7 @@
#include "xfs_mru_cache.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
+#include "xfs_ag_resv.h"
struct xfs_fstrm_item {
struct xfs_mru_cache_elem mru;
@@ -198,7 +199,8 @@ xfs_filestream_pick_ag(
}
longest = xfs_alloc_longest_free_extent(mp, pag,
- xfs_alloc_min_freelist(mp, pag));
+ xfs_alloc_min_freelist(mp, pag),
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
if (((minlen && longest >= minlen) ||
(!minlen && pag->pagf_freeblks >= minfree)) &&
(!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
@@ -369,7 +371,8 @@ xfs_filestream_new_ag(
struct xfs_mount *mp = ip->i_mount;
xfs_extlen_t minlen = ap->length;
xfs_agnumber_t startag = 0;
- int flags, err = 0;
+ int flags = 0;
+ int err = 0;
struct xfs_mru_cache_elem *mru;
*agp = NULLAGNUMBER;
@@ -385,8 +388,10 @@ xfs_filestream_new_ag(
startag = (item->ag + 1) % mp->m_sb.sb_agcount;
}
- flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
- (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0);
+ if (xfs_alloc_is_userdata(ap->datatype))
+ flags |= XFS_PICK_USERDATA;
+ if (ap->dfops->dop_low)
+ flags |= XFS_PICK_LOWSPACE;
err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 0b7f986745c1..93d12fa2670d 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -43,6 +43,7 @@
#include "xfs_log.h"
#include "xfs_filestream.h"
#include "xfs_rmap.h"
+#include "xfs_ag_resv.h"
/*
* File system operations
@@ -108,7 +109,9 @@ xfs_fs_geometry(
(xfs_sb_version_hassparseinodes(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_SPINODES : 0) |
(xfs_sb_version_hasrmapbt(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_RMAPBT : 0);
+ XFS_FSOP_GEOM_FLAGS_RMAPBT : 0) |
+ (xfs_sb_version_hasreflink(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_REFLINK : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -259,6 +262,12 @@ xfs_growfs_data_private(
agf->agf_longest = cpu_to_be32(tmpsize);
if (xfs_sb_version_hascrc(&mp->m_sb))
uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ agf->agf_refcount_root = cpu_to_be32(
+ xfs_refc_block(mp));
+ agf->agf_refcount_level = cpu_to_be32(1);
+ agf->agf_refcount_blocks = cpu_to_be32(1);
+ }
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
@@ -450,6 +459,17 @@ xfs_growfs_data_private(
rrec->rm_offset = 0;
be16_add_cpu(&block->bb_numrecs, 1);
+ /* account for refc btree root */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ rrec = XFS_RMAP_REC_ADDR(block, 5);
+ rrec->rm_startblock = cpu_to_be32(
+ xfs_refc_block(mp));
+ rrec->rm_blockcount = cpu_to_be32(1);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -507,6 +527,28 @@ xfs_growfs_data_private(
goto error0;
}
+ /*
+ * refcount btree root block
+ */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_refcountbt_buf_ops);
+ if (!bp) {
+ error = -ENOMEM;
+ goto error0;
+ }
+
+ xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC,
+ 0, 0, agno,
+ XFS_BTREE_CRC_BLOCKS);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+ }
}
xfs_trans_agblocks_delta(tp, nfree);
/*
@@ -553,7 +595,7 @@ xfs_growfs_data_private(
error = xfs_free_extent(tp,
XFS_AGB_TO_FSB(mp, agno,
be32_to_cpu(agf->agf_length) - new),
- new, &oinfo);
+ new, &oinfo, XFS_AG_RESV_NONE);
if (error)
goto error0;
}
@@ -589,6 +631,11 @@ xfs_growfs_data_private(
xfs_set_low_space_thresholds(mp);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+ /* Reserve AG metadata blocks. */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error != -ENOSPC)
+ goto out;
+
/* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) {
error = 0;
@@ -639,6 +686,8 @@ xfs_growfs_data_private(
continue;
}
}
+
+ out:
return saved_error ? saved_error : error;
error0:
@@ -948,3 +997,59 @@ xfs_do_force_shutdown(
"Please umount the filesystem and rectify the problem(s)");
}
}
+
+/*
+ * Reserve free space for per-AG metadata.
+ */
+int
+xfs_fs_reserve_ag_blocks(
+ struct xfs_mount *mp)
+{
+ xfs_agnumber_t agno;
+ struct xfs_perag *pag;
+ int error = 0;
+ int err2;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ pag = xfs_perag_get(mp, agno);
+ err2 = xfs_ag_resv_init(pag);
+ xfs_perag_put(pag);
+ if (err2 && !error)
+ error = err2;
+ }
+
+ if (error && error != -ENOSPC) {
+ xfs_warn(mp,
+ "Error %d reserving per-AG metadata reserve pool.", error);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+
+ return error;
+}
+
+/*
+ * Free space reserved for per-AG metadata.
+ */
+int
+xfs_fs_unreserve_ag_blocks(
+ struct xfs_mount *mp)
+{
+ xfs_agnumber_t agno;
+ struct xfs_perag *pag;
+ int error = 0;
+ int err2;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ pag = xfs_perag_get(mp, agno);
+ err2 = xfs_ag_resv_free(pag);
+ xfs_perag_put(pag);
+ if (err2 && !error)
+ error = err2;
+ }
+
+ if (error)
+ xfs_warn(mp,
+ "Error %d freeing per-AG metadata reserve pool.", error);
+
+ return error;
+}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index f32713f14f9a..f34915898fea 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -26,4 +26,7 @@ extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
xfs_fsop_resblks_t *outval);
extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
+extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
+extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 4d41b241298f..687a4b01fc53 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,8 +21,8 @@
/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
* other XFS code uses these values. Times are measured in centisecs (i.e.
- * 100ths of a second) with the exception of eofb_timer, which is measured in
- * seconds.
+ * 100ths of a second) with the exception of eofb_timer and cowb_timer, which
+ * are measured in seconds.
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
@@ -42,6 +42,7 @@ xfs_param_t xfs_params = {
.inherit_nodfrg = { 0, 1, 1 },
.fstrm_timer = { 1, 30*100, 3600*100},
.eofb_timer = { 1, 300, 3600*24},
+ .cowb_timer = { 1, 1800, 3600*24},
};
struct xfs_globals xfs_globals = {
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index fb39a66914dd..f295049db681 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,6 +33,7 @@
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
+#include "xfs_reflink.h"
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -76,6 +77,9 @@ xfs_inode_alloc(
ip->i_mount = mp;
memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
ip->i_afp = NULL;
+ ip->i_cowfp = NULL;
+ ip->i_cnextents = 0;
+ ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
ip->i_flags = 0;
ip->i_delayed_blks = 0;
@@ -101,6 +105,8 @@ xfs_inode_free_callback(
if (ip->i_afp)
xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+ if (ip->i_cowfp)
+ xfs_idestroy_fork(ip, XFS_COW_FORK);
if (ip->i_itemp) {
ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
@@ -787,6 +793,33 @@ xfs_eofblocks_worker(
xfs_queue_eofblocks(mp);
}
+/*
+ * Background scanning to trim preallocated CoW space. This is queued
+ * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
+ * (We'll just piggyback on the post-EOF prealloc space workqueue.)
+ */
+STATIC void
+xfs_queue_cowblocks(
+ struct xfs_mount *mp)
+{
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
+ queue_delayed_work(mp->m_eofblocks_workqueue,
+ &mp->m_cowblocks_work,
+ msecs_to_jiffies(xfs_cowb_secs * 1000));
+ rcu_read_unlock();
+}
+
+void
+xfs_cowblocks_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_cowblocks_work);
+ xfs_icache_free_cowblocks(mp, NULL);
+ xfs_queue_cowblocks(mp);
+}
+
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
@@ -1343,18 +1376,30 @@ xfs_inode_free_eofblocks(
return ret;
}
-int
-xfs_icache_free_eofblocks(
+static int
+__xfs_icache_free_eofblocks(
struct xfs_mount *mp,
- struct xfs_eofblocks *eofb)
+ struct xfs_eofblocks *eofb,
+ int (*execute)(struct xfs_inode *ip, int flags,
+ void *args),
+ int tag)
{
int flags = SYNC_TRYLOCK;
if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
flags = SYNC_WAIT;
- return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
- eofb, XFS_ICI_EOFBLOCKS_TAG);
+ return xfs_inode_ag_iterator_tag(mp, execute, flags,
+ eofb, tag);
+}
+
+int
+xfs_icache_free_eofblocks(
+ struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb)
+{
+ return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
+ XFS_ICI_EOFBLOCKS_TAG);
}
/*
@@ -1363,9 +1408,11 @@ xfs_icache_free_eofblocks(
* failure. We make a best effort by including each quota under low free space
* conditions (less than 1% free space) in the scan.
*/
-int
-xfs_inode_free_quota_eofblocks(
- struct xfs_inode *ip)
+static int
+__xfs_inode_free_quota_eofblocks(
+ struct xfs_inode *ip,
+ int (*execute)(struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb))
{
int scan = 0;
struct xfs_eofblocks eofb = {0};
@@ -1401,41 +1448,58 @@ xfs_inode_free_quota_eofblocks(
}
if (scan)
- xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+ execute(ip->i_mount, &eofb);
return scan;
}
-void
-xfs_inode_set_eofblocks_tag(
- xfs_inode_t *ip)
+int
+xfs_inode_free_quota_eofblocks(
+ struct xfs_inode *ip)
+{
+ return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
+}
+
+static void
+__xfs_inode_set_eofblocks_tag(
+ xfs_inode_t *ip,
+ void (*execute)(struct xfs_mount *mp),
+ void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int error, unsigned long caller_ip),
+ int tag)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
int tagged;
+ /*
+ * Don't bother locking the AG and looking up in the radix trees
+ * if we already know that we have the tag set.
+ */
+ if (ip->i_flags & XFS_IEOFBLOCKS)
+ return;
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags |= XFS_IEOFBLOCKS;
+ spin_unlock(&ip->i_flags_lock);
+
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
- trace_xfs_inode_set_eofblocks_tag(ip);
- tagged = radix_tree_tagged(&pag->pag_ici_root,
- XFS_ICI_EOFBLOCKS_TAG);
+ tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
- XFS_ICI_EOFBLOCKS_TAG);
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
if (!tagged) {
/* propagate the eofblocks tag up into the perag radix tree */
spin_lock(&ip->i_mount->m_perag_lock);
radix_tree_tag_set(&ip->i_mount->m_perag_tree,
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- XFS_ICI_EOFBLOCKS_TAG);
+ tag);
spin_unlock(&ip->i_mount->m_perag_lock);
/* kick off background trimming */
- xfs_queue_eofblocks(ip->i_mount);
+ execute(ip->i_mount);
- trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
- -1, _RET_IP_);
+ set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
}
spin_unlock(&pag->pag_ici_lock);
@@ -1443,31 +1507,166 @@ xfs_inode_set_eofblocks_tag(
}
void
-xfs_inode_clear_eofblocks_tag(
+xfs_inode_set_eofblocks_tag(
xfs_inode_t *ip)
{
+ trace_xfs_inode_set_eofblocks_tag(ip);
+ return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks,
+ trace_xfs_perag_set_eofblocks,
+ XFS_ICI_EOFBLOCKS_TAG);
+}
+
+static void
+__xfs_inode_clear_eofblocks_tag(
+ xfs_inode_t *ip,
+ void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int error, unsigned long caller_ip),
+ int tag)
+{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags &= ~XFS_IEOFBLOCKS;
+ spin_unlock(&ip->i_flags_lock);
+
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
- trace_xfs_inode_clear_eofblocks_tag(ip);
radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
- XFS_ICI_EOFBLOCKS_TAG);
- if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
+ if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
/* clear the eofblocks tag from the perag radix tree */
spin_lock(&ip->i_mount->m_perag_lock);
radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- XFS_ICI_EOFBLOCKS_TAG);
+ tag);
spin_unlock(&ip->i_mount->m_perag_lock);
- trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
- -1, _RET_IP_);
+ clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
}
spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
}
+void
+xfs_inode_clear_eofblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_clear_eofblocks_tag(ip);
+ return __xfs_inode_clear_eofblocks_tag(ip,
+ trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
+}
+
+/*
+ * Automatic CoW Reservation Freeing
+ *
+ * These functions automatically garbage collect leftover CoW reservations
+ * that were made on behalf of a cowextsize hint when we start to run out
+ * of quota or when the reservations sit around for too long. If the file
+ * has dirty pages or is undergoing writeback, its CoW reservations will
+ * be retained.
+ *
+ * The actual garbage collection piggybacks off the same code that runs
+ * the speculative EOF preallocation garbage collector.
+ */
+STATIC int
+xfs_inode_free_cowblocks(
+ struct xfs_inode *ip,
+ int flags,
+ void *args)
+{
+ int ret;
+ struct xfs_eofblocks *eofb = args;
+ bool need_iolock = true;
+ int match;
+
+ ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+
+ if (!xfs_reflink_has_real_cow_blocks(ip)) {
+ trace_xfs_inode_free_cowblocks_invalid(ip);
+ xfs_inode_clear_cowblocks_tag(ip);
+ return 0;
+ }
+
+ /*
+ * If the mapping is dirty or under writeback we cannot touch the
+ * CoW fork. Leave it alone if we're in the midst of a directio.
+ */
+ if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
+ mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
+ atomic_read(&VFS_I(ip)->i_dio_count))
+ return 0;
+
+ if (eofb) {
+ if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
+ match = xfs_inode_match_id_union(ip, eofb);
+ else
+ match = xfs_inode_match_id(ip, eofb);
+ if (!match)
+ return 0;
+
+ /* skip the inode if the file size is too small */
+ if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+ XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ return 0;
+
+ /*
+ * A scan owner implies we already hold the iolock. Skip it in
+ * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+ * the possibility of EAGAIN being returned.
+ */
+ if (eofb->eof_scan_owner == ip->i_ino)
+ need_iolock = false;
+ }
+
+ /* Free the CoW blocks */
+ if (need_iolock) {
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ }
+
+ ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+
+ if (need_iolock) {
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ }
+
+ return ret;
+}
+
+int
+xfs_icache_free_cowblocks(
+ struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb)
+{
+ return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
+ XFS_ICI_COWBLOCKS_TAG);
+}
+
+int
+xfs_inode_free_quota_cowblocks(
+ struct xfs_inode *ip)
+{
+ return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
+}
+
+void
+xfs_inode_set_cowblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_set_cowblocks_tag(ip);
+ return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks,
+ trace_xfs_perag_set_cowblocks,
+ XFS_ICI_COWBLOCKS_TAG);
+}
+
+void
+xfs_inode_clear_cowblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_clear_cowblocks_tag(ip);
+ return __xfs_inode_clear_eofblocks_tag(ip,
+ trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
+}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 05bac99bef75..a1e02f4708ab 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,6 +40,7 @@ struct xfs_eofblocks {
in xfs_inode_ag_iterator */
#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
+#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */
/*
* Flags for xfs_iget()
@@ -70,6 +71,12 @@ int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
void xfs_eofblocks_worker(struct work_struct *);
void xfs_queue_eofblocks(struct xfs_mount *);
+void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
+int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *);
+int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip);
+void xfs_cowblocks_worker(struct work_struct *);
+
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e08eaea6327b..4e560e6a12c1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -49,6 +49,7 @@
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
kmem_zone_t *xfs_inode_zone;
@@ -77,6 +78,29 @@ xfs_get_extsz_hint(
}
/*
+ * Helper function to extract CoW extent size hint from inode.
+ * Between the extent size hint and the CoW extent size hint, we
+ * return the greater of the two. If the value is zero (automatic),
+ * use the default size.
+ */
+xfs_extlen_t
+xfs_get_cowextsz_hint(
+ struct xfs_inode *ip)
+{
+ xfs_extlen_t a, b;
+
+ a = 0;
+ if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
+ a = ip->i_d.di_cowextsize;
+ b = xfs_get_extsz_hint(ip);
+
+ a = max(a, b);
+ if (a == 0)
+ return XFS_DEFAULT_COWEXTSZ_HINT;
+ return a;
+}
+
+/*
* These two are wrapper routines around the xfs_ilock() routine used to
* centralize some grungy code. They are used in places that wish to lock the
* inode solely for reading the extents. The reason these places can't just
@@ -651,6 +675,8 @@ _xfs_dic2xflags(
if (di_flags2 & XFS_DIFLAG2_ANY) {
if (di_flags2 & XFS_DIFLAG2_DAX)
flags |= FS_XFLAG_DAX;
+ if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
+ flags |= FS_XFLAG_COWEXTSIZE;
}
if (has_attr)
@@ -821,7 +847,7 @@ xfs_ialloc(
ip->i_d.di_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
- tv = current_fs_time(mp->m_super);
+ tv = current_time(inode);
inode->i_mtime = tv;
inode->i_atime = tv;
inode->i_ctime = tv;
@@ -834,6 +860,7 @@ xfs_ialloc(
if (ip->i_d.di_version == 3) {
inode->i_version = 1;
ip->i_d.di_flags2 = 0;
+ ip->i_d.di_cowextsize = 0;
ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec;
ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec;
}
@@ -896,6 +923,15 @@ xfs_ialloc(
ip->i_d.di_flags |= di_flags;
ip->i_d.di_flags2 |= di_flags2;
}
+ if (pip &&
+ (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
+ pip->i_d.di_version == 3 &&
+ ip->i_d.di_version == 3) {
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
+ }
+ }
/* FALLTHROUGH */
case S_IFLNK:
ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
@@ -1586,6 +1622,20 @@ xfs_itruncate_extents(
goto out;
}
+ /* Remove all pending CoW reservations. */
+ error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block,
+ last_block);
+ if (error)
+ goto out;
+
+ /*
+ * Clear the reflink flag if we truncated everything.
+ */
+ if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
+ ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ xfs_inode_clear_cowblocks_tag(ip);
+ }
+
/*
* Always re-log the inode so that our permanent transaction can keep
* on rolling it forward in the log.
@@ -1710,7 +1760,7 @@ xfs_inactive_truncate(
/*
* Log the inode size first to prevent stale data exposure in the event
* of a system crash before the truncate completes. See the related
- * comment in xfs_setattr_size() for details.
+ * comment in xfs_vn_setattr_size() for details.
*/
ip->i_d.di_size = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1850,6 +1900,7 @@ xfs_inactive(
}
mp = ip->i_mount;
+ ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
/* If this is a read-only mount, don't do this (would generate I/O) */
if (mp->m_flags & XFS_MOUNT_RDONLY)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e1a411e08f00..f14c1de2549d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -47,6 +47,7 @@ typedef struct xfs_inode {
/* Extent information. */
xfs_ifork_t *i_afp; /* attribute fork pointer */
+ xfs_ifork_t *i_cowfp; /* copy on write extents */
xfs_ifork_t i_df; /* data fork */
/* operations vectors */
@@ -65,6 +66,9 @@ typedef struct xfs_inode {
struct xfs_icdinode i_d; /* most of ondisk inode */
+ xfs_extnum_t i_cnextents; /* # of extents in cow fork */
+ unsigned int i_cformat; /* format of cow fork */
+
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
} xfs_inode_t;
@@ -202,6 +206,11 @@ xfs_get_initial_prid(struct xfs_inode *dp)
return XFS_PROJID_DEFAULT;
}
+static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
+{
+ return ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
+}
+
/*
* In-core inode flags.
*/
@@ -216,6 +225,13 @@ xfs_get_initial_prid(struct xfs_inode *dp)
#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */
+#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */
+/*
+ * If this unlinked inode is in the middle of recovery, don't let drop_inode
+ * truncate and free the inode. This can happen if we iget the inode during
+ * log recovery to replay a bmap operation on the inode.
+ */
+#define XFS_IRECOVERY (1 << 11)
/*
* Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -410,6 +426,7 @@ int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
+xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
xfs_nlink_t, xfs_dev_t, prid_t, int,
@@ -473,4 +490,7 @@ do { \
extern struct kmem_zone *xfs_inode_zone;
+/* The default CoW extent size hint. */
+#define XFS_DEFAULT_COWEXTSZ_HINT 32
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 892c2aced207..9610e9c00952 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -368,7 +368,7 @@ xfs_inode_to_log_dinode(
to->di_crtime.t_sec = from->di_crtime.t_sec;
to->di_crtime.t_nsec = from->di_crtime.t_nsec;
to->di_flags2 = from->di_flags2;
-
+ to->di_cowextsize = from->di_cowextsize;
to->di_ino = ip->i_ino;
to->di_lsn = lsn;
memset(to->di_pad2, 0, sizeof(to->di_pad2));
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 96a70fd1f5d6..c245bed3249b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -720,7 +720,7 @@ xfs_ioc_space(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = bf->l_start;
- error = xfs_setattr_size(ip, &iattr);
+ error = xfs_vn_setattr_size(file_dentry(filp), &iattr);
break;
default:
ASSERT(0);
@@ -903,6 +903,8 @@ xfs_ioc_fsgetxattr(
xfs_ilock(ip, XFS_ILOCK_SHARED);
fa.fsx_xflags = xfs_ip2xflags(ip);
fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
+ fa.fsx_cowextsize = ip->i_d.di_cowextsize <<
+ ip->i_mount->m_sb.sb_blocklog;
fa.fsx_projid = xfs_get_projid(ip);
if (attr) {
@@ -973,12 +975,13 @@ xfs_set_diflags(
if (ip->i_d.di_version < 3)
return;
- di_flags2 = 0;
+ di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
+ if (xflags & FS_XFLAG_COWEXTSIZE)
+ di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
ip->i_d.di_flags2 = di_flags2;
-
}
STATIC void
@@ -1031,6 +1034,14 @@ xfs_ioctl_setattr_xflags(
return -EINVAL;
}
+ /* Clear reflink if we are actually able to set the rt flag. */
+ if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
+ ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+
+ /* Don't allow us to set DAX mode for a reflinked file for now. */
+ if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
+ return -EINVAL;
+
/*
* Can't modify an immutable/append-only file unless
* we have appropriate permission.
@@ -1219,6 +1230,56 @@ xfs_ioctl_setattr_check_extsize(
return 0;
}
+/*
+ * CoW extent size hint validation rules are:
+ *
+ * 1. CoW extent size hint can only be set if reflink is enabled on the fs.
+ * The inode does not have to have any shared blocks, but it must be a v3.
+ * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files;
+ * for a directory, the hint is propagated to new files.
+ * 3. Can be changed on files & directories at any time.
+ * 4. CoW extsize hint of 0 turns off hints, clears inode flags.
+ * 5. Extent size must be a multiple of the appropriate block size.
+ * 6. The extent size hint must be limited to half the AG size to avoid
+ * alignment extending the extent beyond the limits of the AG.
+ */
+static int
+xfs_ioctl_setattr_check_cowextsize(
+ struct xfs_inode *ip,
+ struct fsxattr *fa)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
+ return 0;
+
+ if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) ||
+ ip->i_d.di_version != 3)
+ return -EINVAL;
+
+ if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode))
+ return -EINVAL;
+
+ if (fa->fsx_cowextsize != 0) {
+ xfs_extlen_t size;
+ xfs_fsblock_t cowextsize_fsb;
+
+ cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
+ if (cowextsize_fsb > MAXEXTLEN)
+ return -EINVAL;
+
+ size = mp->m_sb.sb_blocksize;
+ if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
+ return -EINVAL;
+
+ if (fa->fsx_cowextsize % size)
+ return -EINVAL;
+ } else
+ fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+
+ return 0;
+}
+
static int
xfs_ioctl_setattr_check_projid(
struct xfs_inode *ip,
@@ -1311,6 +1372,10 @@ xfs_ioctl_setattr(
if (code)
goto error_trans_cancel;
+ code = xfs_ioctl_setattr_check_cowextsize(ip, fa);
+ if (code)
+ goto error_trans_cancel;
+
code = xfs_ioctl_setattr_xflags(tp, ip, fa);
if (code)
goto error_trans_cancel;
@@ -1346,6 +1411,12 @@ xfs_ioctl_setattr(
ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
else
ip->i_d.di_extsize = 0;
+ if (ip->i_d.di_version == 3 &&
+ (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+ ip->i_d.di_cowextsize = fa->fsx_cowextsize >>
+ mp->m_sb.sb_blocklog;
+ else
+ ip->i_d.di_cowextsize = 0;
code = xfs_trans_commit(tp);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2af0dda1c978..436e109bb01e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -38,21 +39,45 @@
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
+#include "xfs_reflink.h"
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log)
-#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
-STATIC int
-xfs_iomap_eof_align_last_fsb(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- xfs_extlen_t extsize,
- xfs_fileoff_t *last_fsb)
+void
+xfs_bmbt_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ struct xfs_bmbt_irec *imap)
{
- xfs_extlen_t align = 0;
- int eof, error;
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (imap->br_startblock == HOLESTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_DELALLOC;
+ } else {
+ iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+ if (imap->br_state == XFS_EXT_UNWRITTEN)
+ iomap->type = IOMAP_UNWRITTEN;
+ else
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+ iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+ iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
+
+xfs_extlen_t
+xfs_eof_alignment(
+ struct xfs_inode *ip,
+ xfs_extlen_t extsize)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_extlen_t align = 0;
if (!XFS_IS_REALTIME_INODE(ip)) {
/*
@@ -83,8 +108,21 @@ xfs_iomap_eof_align_last_fsb(
align = extsize;
}
+ return align;
+}
+
+STATIC int
+xfs_iomap_eof_align_last_fsb(
+ struct xfs_inode *ip,
+ xfs_extlen_t extsize,
+ xfs_fileoff_t *last_fsb)
+{
+ xfs_extlen_t align = xfs_eof_alignment(ip, extsize);
+
if (align) {
xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align);
+ int eof, error;
+
error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
if (error)
return error;
@@ -154,7 +192,7 @@ xfs_iomap_write_direct(
*/
ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
XFS_IFEXTENTS);
- error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
+ error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb);
if (error)
goto out_unlock;
} else {
@@ -274,130 +312,6 @@ out_trans_cancel:
goto out_unlock;
}
-/*
- * If the caller is doing a write at the end of the file, then extend the
- * allocation out to the file system's write iosize. We clean up any extra
- * space left over when the file is closed in xfs_inactive().
- *
- * If we find we already have delalloc preallocation beyond EOF, don't do more
- * preallocation as it it not needed.
- */
-STATIC int
-xfs_iomap_eof_want_preallocate(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- xfs_off_t offset,
- size_t count,
- xfs_bmbt_irec_t *imap,
- int nimaps,
- int *prealloc)
-{
- xfs_fileoff_t start_fsb;
- xfs_filblks_t count_fsb;
- int n, error, imaps;
- int found_delalloc = 0;
-
- *prealloc = 0;
- if (offset + count <= XFS_ISIZE(ip))
- return 0;
-
- /*
- * If the file is smaller than the minimum prealloc and we are using
- * dynamic preallocation, don't do any preallocation at all as it is
- * likely this is the only write to the file that is going to be done.
- */
- if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
- XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
- return 0;
-
- /*
- * If there are any real blocks past eof, then don't
- * do any speculative allocation.
- */
- start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
- count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- while (count_fsb > 0) {
- imaps = nimaps;
- error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
- 0);
- if (error)
- return error;
- for (n = 0; n < imaps; n++) {
- if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
- (imap[n].br_startblock != DELAYSTARTBLOCK))
- return 0;
- start_fsb += imap[n].br_blockcount;
- count_fsb -= imap[n].br_blockcount;
-
- if (imap[n].br_startblock == DELAYSTARTBLOCK)
- found_delalloc = 1;
- }
- }
- if (!found_delalloc)
- *prealloc = 1;
- return 0;
-}
-
-/*
- * Determine the initial size of the preallocation. We are beyond the current
- * EOF here, but we need to take into account whether this is a sparse write or
- * an extending write when determining the preallocation size. Hence we need to
- * look up the extent that ends at the current write offset and use the result
- * to determine the preallocation size.
- *
- * If the extent is a hole, then preallocation is essentially disabled.
- * Otherwise we take the size of the preceeding data extent as the basis for the
- * preallocation size. If the size of the extent is greater than half the
- * maximum extent length, then use the current offset as the basis. This ensures
- * that for large files the preallocation size always extends to MAXEXTLEN
- * rather than falling short due to things like stripe unit/width alignment of
- * real extents.
- */
-STATIC xfs_fsblock_t
-xfs_iomap_eof_prealloc_initial_size(
- struct xfs_mount *mp,
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_bmbt_irec_t *imap,
- int nimaps)
-{
- xfs_fileoff_t start_fsb;
- int imaps = 1;
- int error;
-
- ASSERT(nimaps >= imaps);
-
- /* if we are using a specific prealloc size, return now */
- if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
- return 0;
-
- /* If the file is small, then use the minimum prealloc */
- if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
- return 0;
-
- /*
- * As we write multiple pages, the offset will always align to the
- * start of a page and hence point to a hole at EOF. i.e. if the size is
- * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
- * will return FSB 1. Hence if there are blocks in the file, we want to
- * point to the block prior to the EOF block and not the hole that maps
- * directly at @offset.
- */
- start_fsb = XFS_B_TO_FSB(mp, offset);
- if (start_fsb)
- start_fsb--;
- error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
- if (error)
- return 0;
-
- ASSERT(imaps == 1);
- if (imap[0].br_startblock == HOLESTARTBLOCK)
- return 0;
- if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
- return imap[0].br_blockcount << 1;
- return XFS_B_TO_FSB(mp, offset);
-}
-
STATIC bool
xfs_quota_need_throttle(
struct xfs_inode *ip,
@@ -459,27 +373,76 @@ xfs_quota_calc_throttle(
}
/*
+ * If we are doing a write at the end of the file and there are no allocations
+ * past this one, then extend the allocation out to the file system's write
+ * iosize.
+ *
* If we don't have a user specified preallocation size, dynamically increase
- * the preallocation size as the size of the file grows. Cap the maximum size
+ * the preallocation size as the size of the file grows. Cap the maximum size
* at a single extent or less if the filesystem is near full. The closer the
* filesystem is to full, the smaller the maximum prealocation.
+ *
+ * As an exception we don't do any preallocation at all if the file is smaller
+ * than the minimum preallocation and we are using the default dynamic
+ * preallocation scheme, as it is likely this is the only write to the file that
+ * is going to be done.
+ *
+ * We clean up any extra space left over when the file is closed in
+ * xfs_inactive().
*/
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
- struct xfs_mount *mp,
struct xfs_inode *ip,
- xfs_off_t offset,
- struct xfs_bmbt_irec *imap,
- int nimaps)
+ loff_t offset,
+ loff_t count,
+ xfs_extnum_t idx,
+ struct xfs_bmbt_irec *prev)
{
- xfs_fsblock_t alloc_blocks = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
int shift = 0;
int64_t freesp;
xfs_fsblock_t qblocks;
int qshift = 0;
+ xfs_fsblock_t alloc_blocks = 0;
- alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
- imap, nimaps);
+ if (offset + count <= XFS_ISIZE(ip))
+ return 0;
+
+ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+ (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)))
+ return 0;
+
+ /*
+ * If an explicit allocsize is set, the file is small, or we
+ * are writing behind a hole, then use the minimum prealloc:
+ */
+ if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
+ XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
+ idx == 0 ||
+ prev->br_startoff + prev->br_blockcount < offset_fsb)
+ return mp->m_writeio_blocks;
+
+ /*
+ * Determine the initial size of the preallocation. We are beyond the
+ * current EOF here, but we need to take into account whether this is
+ * a sparse write or an extending write when determining the
+ * preallocation size. Hence we need to look up the extent that ends
+ * at the current write offset and use the result to determine the
+ * preallocation size.
+ *
+ * If the extent is a hole, then preallocation is essentially disabled.
+ * Otherwise we take the size of the preceding data extent as the basis
+ * for the preallocation size. If the size of the extent is greater than
+ * half the maximum extent length, then use the current offset as the
+ * basis. This ensures that for large files the preallocation size
+ * always extends to MAXEXTLEN rather than falling short due to things
+ * like stripe unit/width alignment of real extents.
+ */
+ if (prev->br_blockcount <= (MAXEXTLEN >> 1))
+ alloc_blocks = prev->br_blockcount << 1;
+ else
+ alloc_blocks = XFS_B_TO_FSB(mp, offset);
if (!alloc_blocks)
goto check_writeio;
qblocks = alloc_blocks;
@@ -550,120 +513,156 @@ xfs_iomap_prealloc_size(
*/
while (alloc_blocks && alloc_blocks >= freesp)
alloc_blocks >>= 4;
-
check_writeio:
if (alloc_blocks < mp->m_writeio_blocks)
alloc_blocks = mp->m_writeio_blocks;
-
trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
mp->m_writeio_blocks);
-
return alloc_blocks;
}
-int
-xfs_iomap_write_delay(
- xfs_inode_t *ip,
- xfs_off_t offset,
- size_t count,
- xfs_bmbt_irec_t *ret_imap)
+static int
+xfs_file_iomap_begin_delay(
+ struct inode *inode,
+ loff_t offset,
+ loff_t count,
+ unsigned flags,
+ struct iomap *iomap)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb;
- xfs_fileoff_t last_fsb;
- xfs_off_t aligned_offset;
- xfs_fileoff_t ioalign;
- xfs_extlen_t extsz;
- int nimaps;
- xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
- int prealloc;
- int error;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t maxbytes_fsb =
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ xfs_fileoff_t end_fsb, orig_end_fsb;
+ int error = 0, eof = 0;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec prev;
+ xfs_extnum_t idx;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!XFS_IS_REALTIME_INODE(ip));
+ ASSERT(!xfs_get_extsz_hint(ip));
- /*
- * Make sure that the dquots are there. This doesn't hold
- * the ilock across a disk read.
- */
- error = xfs_qm_dqattach_locked(ip, 0);
- if (error)
- return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
- extsz = xfs_get_extsz_hint(ip);
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
- error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
- imap, XFS_WRITE_IMAPS, &prealloc);
- if (error)
- return error;
+ XFS_STATS_INC(mp, xs_blk_mapw);
-retry:
- if (prealloc) {
- xfs_fsblock_t alloc_blocks;
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+ }
- alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
- XFS_WRITE_IMAPS);
+ xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx,
+ &got, &prev);
+ if (!eof && got.br_startoff <= offset_fsb) {
+ if (xfs_is_reflink_inode(ip)) {
+ bool shared;
- aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
- ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
- last_fsb = ioalign + alloc_blocks;
- } else {
- last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
- }
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count),
+ maxbytes_fsb);
+ xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
+ error = xfs_reflink_reserve_cow(ip, &got, &shared);
+ if (error)
+ goto out_unlock;
+ }
- if (prealloc || extsz) {
- error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
- if (error)
- return error;
+ trace_xfs_iomap_found(ip, offset, count, 0, &got);
+ goto done;
}
+ error = xfs_qm_dqattach_locked(ip, 0);
+ if (error)
+ goto out_unlock;
+
/*
- * Make sure preallocation does not create extents beyond the range we
- * actually support in this filesystem.
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
+ * to keep the chunks of work done where somewhat symmetric with the
+ * work writeback does. This is a completely arbitrary number pulled
+ * out of thin air as a best guess for initial testing.
+ *
+ * Note that the values needs to be less than 32-bits wide until
+ * the lower level functions are updated.
*/
- if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes))
- last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = orig_end_fsb =
+ min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+ if (eof) {
+ xfs_fsblock_t prealloc_blocks;
+
+ prealloc_blocks =
+ xfs_iomap_prealloc_size(ip, offset, count, idx, &prev);
+ if (prealloc_blocks) {
+ xfs_extlen_t align;
+ xfs_off_t end_offset;
- ASSERT(last_fsb > offset_fsb);
+ end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
+ end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+ prealloc_blocks;
+
+ align = xfs_eof_alignment(ip, 0);
+ if (align)
+ end_fsb = roundup_64(end_fsb, align);
+
+ end_fsb = min(end_fsb, maxbytes_fsb);
+ ASSERT(end_fsb > offset_fsb);
+ }
+ }
- nimaps = XFS_WRITE_IMAPS;
- error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
- imap, &nimaps, XFS_BMAPI_ENTIRE);
+retry:
+ error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
+ end_fsb - offset_fsb, &got,
+ &prev, &idx, eof);
switch (error) {
case 0:
+ break;
case -ENOSPC:
case -EDQUOT:
- break;
- default:
- return error;
- }
-
- /*
- * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
- * without EOF preallocation.
- */
- if (nimaps == 0) {
+ /* retry without any preallocation */
trace_xfs_delalloc_enospc(ip, offset, count);
- if (prealloc) {
- prealloc = 0;
- error = 0;
+ if (end_fsb != orig_end_fsb) {
+ end_fsb = orig_end_fsb;
goto retry;
}
- return error ? error : -ENOSPC;
+ /*FALLTHRU*/
+ default:
+ goto out_unlock;
}
- if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
- return xfs_alert_fsblock_zero(ip, &imap[0]);
-
/*
* Tag the inode as speculatively preallocated so we can reclaim this
* space on demand, if necessary.
*/
- if (prealloc)
+ if (end_fsb != orig_end_fsb)
xfs_inode_set_eofblocks_tag(ip);
- *ret_imap = imap[0];
- return 0;
+ trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+done:
+ if (isnullstartblock(got.br_startblock))
+ got.br_startblock = DELAYSTARTBLOCK;
+
+ if (!got.br_startblock) {
+ error = xfs_alert_fsblock_zero(ip, &got);
+ if (error)
+ goto out_unlock;
+ }
+
+ xfs_bmbt_to_iomap(ip, iomap, &got);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
}
/*
@@ -679,6 +678,7 @@ retry:
int
xfs_iomap_write_allocate(
xfs_inode_t *ip,
+ int whichfork,
xfs_off_t offset,
xfs_bmbt_irec_t *imap)
{
@@ -691,8 +691,12 @@ xfs_iomap_write_allocate(
xfs_trans_t *tp;
int nimaps;
int error = 0;
+ int flags = 0;
int nres;
+ if (whichfork == XFS_COW_FORK)
+ flags |= XFS_BMAPI_COWFORK;
+
/*
* Make sure that the dquots are there.
*/
@@ -786,7 +790,7 @@ xfs_iomap_write_allocate(
* pointer that the caller gave to us.
*/
error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, 0, &first_block,
+ count_fsb, flags, &first_block,
nres, imap, &nimaps,
&dfops);
if (error)
@@ -947,37 +951,13 @@ error_on_bmapi_transaction:
return error;
}
-void
-xfs_bmbt_to_iomap(
- struct xfs_inode *ip,
- struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
-{
- struct xfs_mount *mp = ip->i_mount;
-
- if (imap->br_startblock == HOLESTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_DELALLOC;
- } else {
- iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
- if (imap->br_state == XFS_EXT_UNWRITTEN)
- iomap->type = IOMAP_UNWRITTEN;
- else
- iomap->type = IOMAP_MAPPED;
- }
- iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
- iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
- iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
-}
-
-static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool imap_needs_alloc(struct inode *inode,
+ struct xfs_bmbt_irec *imap, int nimaps)
{
return !nimaps ||
imap->br_startblock == HOLESTARTBLOCK ||
- imap->br_startblock == DELAYSTARTBLOCK;
+ imap->br_startblock == DELAYSTARTBLOCK ||
+ (IS_DAX(inode) && ISUNWRITTEN(imap));
}
static int
@@ -993,11 +973,29 @@ xfs_file_iomap_begin(
struct xfs_bmbt_irec imap;
xfs_fileoff_t offset_fsb, end_fsb;
int nimaps = 1, error = 0;
+ bool shared = false, trimmed = false;
+ unsigned lockmode;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if ((flags & IOMAP_WRITE) && !IS_DAX(inode) &&
+ !xfs_get_extsz_hint(ip)) {
+ /* Reserve delalloc blocks for regular writeback. */
+ return xfs_file_iomap_begin_delay(inode, offset, length, flags,
+ iomap);
+ }
+
+ /*
+ * COW writes will allocate delalloc space, so we need to make sure
+ * to take the lock exclusively here.
+ */
+ if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+ lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ } else {
+ lockmode = xfs_ilock_data_map_shared(ip);
+ }
ASSERT(offset <= mp->m_super->s_maxbytes);
if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
@@ -1006,13 +1004,28 @@ xfs_file_iomap_begin(
end_fsb = XFS_B_TO_FSB(mp, offset + length);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
- &nimaps, XFS_BMAPI_ENTIRE);
- if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
+ &nimaps, 0);
+ if (error)
+ goto out_unlock;
+
+ if (flags & IOMAP_REPORT) {
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
+ &trimmed);
+ if (error)
+ goto out_unlock;
+ }
+
+ if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+ error = xfs_reflink_reserve_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
- if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+ if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
/*
* We cap the maximum length we map here to MAX_WRITEBACK_PAGES
* pages to keep the chunks of work done where somewhat symmetric
@@ -1024,32 +1037,33 @@ xfs_file_iomap_begin(
* the lower level functions are updated.
*/
length = min_t(loff_t, length, 1024 * PAGE_SIZE);
- if (xfs_get_extsz_hint(ip)) {
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
- error = xfs_iomap_write_direct(ip, offset, length, &imap,
- nimaps);
- } else {
- error = xfs_iomap_write_delay(ip, offset, length, &imap);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
+ */
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
+ error = xfs_iomap_write_direct(ip, offset, length, &imap,
+ nimaps);
if (error)
return error;
+ iomap->flags = IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
} else {
ASSERT(nimaps);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
trace_xfs_iomap_found(ip, offset, length, 0, &imap);
}
xfs_bmbt_to_iomap(ip, iomap, &imap);
+ if (shared)
+ iomap->flags |= IOMAP_F_SHARED;
return 0;
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
}
static int
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index fb8aca3d69ab..6d45cf01fcff 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -25,14 +25,13 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
- struct xfs_bmbt_irec *);
-int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
+int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
struct xfs_bmbt_irec *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
struct xfs_bmbt_irec *);
+xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
extern struct iomap_ops xfs_iomap_ops;
extern struct iomap_ops xfs_xattr_iomap_ops;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index b24c3102fa93..405a65cd9d6b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -542,6 +542,28 @@ xfs_setattr_time(
inode->i_mtime = iattr->ia_mtime;
}
+static int
+xfs_vn_change_ok(
+ struct dentry *dentry,
+ struct iattr *iattr)
+{
+ struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount;
+
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return -EROFS;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ return setattr_prepare(dentry, iattr);
+}
+
+/*
+ * Set non-size attributes of an inode.
+ *
+ * Caution: The caller of this function is responsible for calling
+ * setattr_prepare() or otherwise verifying the change is fine.
+ */
int
xfs_setattr_nonsize(
struct xfs_inode *ip,
@@ -558,21 +580,6 @@ xfs_setattr_nonsize(
struct xfs_dquot *udqp = NULL, *gdqp = NULL;
struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL;
- trace_xfs_setattr(ip);
-
- /* If acls are being inherited, we already have this checked */
- if (!(flags & XFS_ATTR_NOACL)) {
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return -EROFS;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- error = inode_change_ok(inode, iattr);
- if (error)
- return error;
- }
-
ASSERT((mask & ATTR_SIZE) == 0);
/*
@@ -743,8 +750,27 @@ out_dqrele:
return error;
}
+int
+xfs_vn_setattr_nonsize(
+ struct dentry *dentry,
+ struct iattr *iattr)
+{
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ int error;
+
+ trace_xfs_setattr(ip);
+
+ error = xfs_vn_change_ok(dentry, iattr);
+ if (error)
+ return error;
+ return xfs_setattr_nonsize(ip, iattr, 0);
+}
+
/*
* Truncate file. Must have write permission and not be a directory.
+ *
+ * Caution: The caller of this function is responsible for calling
+ * setattr_prepare() or otherwise verifying the change is fine.
*/
int
xfs_setattr_size(
@@ -759,18 +785,6 @@ xfs_setattr_size(
uint lock_flags = 0;
bool did_zeroing = false;
- trace_xfs_setattr(ip);
-
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return -EROFS;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- error = inode_change_ok(inode, iattr);
- if (error)
- return error;
-
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
ASSERT(S_ISREG(inode->i_mode));
@@ -882,7 +896,7 @@ xfs_setattr_size(
if (newsize != oldsize &&
!(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
iattr->ia_ctime = iattr->ia_mtime =
- current_fs_time(inode->i_sb);
+ current_time(inode);
iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
}
@@ -942,16 +956,32 @@ out_trans_cancel:
goto out_unlock;
}
+int
+xfs_vn_setattr_size(
+ struct dentry *dentry,
+ struct iattr *iattr)
+{
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ int error;
+
+ trace_xfs_setattr(ip);
+
+ error = xfs_vn_change_ok(dentry, iattr);
+ if (error)
+ return error;
+ return xfs_setattr_size(ip, iattr);
+}
+
STATIC int
xfs_vn_setattr(
struct dentry *dentry,
struct iattr *iattr)
{
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error;
if (iattr->ia_valid & ATTR_SIZE) {
- uint iolock = XFS_IOLOCK_EXCL;
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ uint iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
error = xfs_break_layouts(d_inode(dentry), &iolock, true);
@@ -959,11 +989,11 @@ xfs_vn_setattr(
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
iolock |= XFS_MMAPLOCK_EXCL;
- error = xfs_setattr_size(ip, iattr);
+ error = xfs_vn_setattr_size(dentry, iattr);
}
xfs_iunlock(ip, iolock);
} else {
- error = xfs_setattr_nonsize(ip, iattr, 0);
+ error = xfs_vn_setattr_nonsize(dentry, iattr);
}
return error;
@@ -1036,9 +1066,6 @@ static const struct inode_operations xfs_inode_operations = {
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.fiemap = xfs_vn_fiemap,
.update_time = xfs_vn_update_time,
@@ -1059,14 +1086,11 @@ static const struct inode_operations xfs_dir_inode_operations = {
*/
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
- .rename2 = xfs_vn_rename,
+ .rename = xfs_vn_rename,
.get_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
.tmpfile = xfs_vn_tmpfile,
@@ -1087,14 +1111,11 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
*/
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
- .rename2 = xfs_vn_rename,
+ .rename = xfs_vn_rename,
.get_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
.tmpfile = xfs_vn_tmpfile,
@@ -1105,9 +1126,6 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.get_link = xfs_vn_get_link,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
};
@@ -1117,9 +1135,6 @@ static const struct inode_operations xfs_inline_symlink_inode_operations = {
.get_link = xfs_vn_get_link_inline,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
};
@@ -1144,6 +1159,7 @@ xfs_diflags_to_iflags(
inode->i_flags |= S_NOATIME;
if (S_ISREG(inode->i_mode) &&
ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE &&
+ !xfs_is_reflink_inode(ip) &&
(ip->i_mount->m_flags & XFS_MOUNT_DAX ||
ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
inode->i_flags |= S_DAX;
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index a0f84abb0d09..0259a383721a 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -33,6 +33,7 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
int flags);
-extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
+extern int xfs_vn_setattr_nonsize(struct dentry *dentry, struct iattr *vap);
+extern int xfs_vn_setattr_size(struct dentry *dentry, struct iattr *vap);
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index ce73eb34620d..66e881790c17 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -66,7 +66,7 @@ xfs_bulkstat_one_int(
if (!buffer || xfs_internal_inum(mp, ino))
return -EINVAL;
- buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
+ buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
if (!buf)
return -ENOMEM;
@@ -111,6 +111,12 @@ xfs_bulkstat_one_int(
buf->bs_aextents = dic->di_anextents;
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
+ if (dic->di_version == 3) {
+ if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
+ buf->bs_cowextsize = dic->di_cowextsize <<
+ mp->m_sb.sb_blocklog;
+ }
+
switch (dic->di_format) {
case XFS_DINODE_FMT_DEV:
buf->bs_rdev = ip->i_df.if_u2.if_rdev;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index b8d64d520e12..68640fb63a54 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -116,6 +116,7 @@ typedef __u32 xfs_nlink_t;
#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
#define xfs_eofb_secs xfs_params.eofb_timer.val
+#define xfs_cowb_secs xfs_params.cowb_timer.val
#define current_cpu() (raw_smp_processor_id())
#define current_pid() (current->pid)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 765f084759b5..2b6eec52178e 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -413,7 +413,8 @@ struct xlog {
/* log record crc error injection factor */
uint32_t l_badcrc_factor;
#endif
-
+ /* log recovery lsn tracking (for buffer submission */
+ xfs_lsn_t l_recovery_lsn;
};
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e8638fd2c0c3..9b3d7c76915d 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -44,6 +44,9 @@
#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_rmap_item.h"
+#include "xfs_buf_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_bmap_item.h"
#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
@@ -381,6 +384,15 @@ xlog_recover_iodone(
SHUTDOWN_META_IO_ERROR);
}
}
+
+ /*
+ * On v5 supers, a bli could be attached to update the metadata LSN.
+ * Clean it up.
+ */
+ if (bp->b_fspriv)
+ xfs_buf_item_relse(bp);
+ ASSERT(bp->b_fspriv == NULL);
+
bp->b_iodone = NULL;
xfs_buf_ioend(bp);
}
@@ -1914,6 +1926,10 @@ xlog_recover_reorder_trans(
case XFS_LI_EFI:
case XFS_LI_RUI:
case XFS_LI_RUD:
+ case XFS_LI_CUI:
+ case XFS_LI_CUD:
+ case XFS_LI_BUI:
+ case XFS_LI_BUD:
trace_xfs_log_recover_item_reorder_tail(log,
trans, item, pass);
list_move_tail(&item->ri_list, &inode_list);
@@ -2232,6 +2248,7 @@ xlog_recover_get_buf_lsn(
case XFS_ABTB_MAGIC:
case XFS_ABTC_MAGIC:
case XFS_RMAP_CRC_MAGIC:
+ case XFS_REFC_CRC_MAGIC:
case XFS_IBT_CRC_MAGIC:
case XFS_IBT_MAGIC: {
struct xfs_btree_block *btb = blk;
@@ -2360,12 +2377,14 @@ static void
xlog_recover_validate_buf_type(
struct xfs_mount *mp,
struct xfs_buf *bp,
- xfs_buf_log_format_t *buf_f)
+ xfs_buf_log_format_t *buf_f,
+ xfs_lsn_t current_lsn)
{
struct xfs_da_blkinfo *info = bp->b_addr;
__uint32_t magic32;
__uint16_t magic16;
__uint16_t magicda;
+ char *warnmsg = NULL;
/*
* We can only do post recovery validation on items on CRC enabled
@@ -2403,32 +2422,31 @@ xlog_recover_validate_buf_type(
case XFS_RMAP_CRC_MAGIC:
bp->b_ops = &xfs_rmapbt_buf_ops;
break;
+ case XFS_REFC_CRC_MAGIC:
+ bp->b_ops = &xfs_refcountbt_buf_ops;
+ break;
default:
- xfs_warn(mp, "Bad btree block magic!");
- ASSERT(0);
+ warnmsg = "Bad btree block magic!";
break;
}
break;
case XFS_BLFT_AGF_BUF:
if (magic32 != XFS_AGF_MAGIC) {
- xfs_warn(mp, "Bad AGF block magic!");
- ASSERT(0);
+ warnmsg = "Bad AGF block magic!";
break;
}
bp->b_ops = &xfs_agf_buf_ops;
break;
case XFS_BLFT_AGFL_BUF:
if (magic32 != XFS_AGFL_MAGIC) {
- xfs_warn(mp, "Bad AGFL block magic!");
- ASSERT(0);
+ warnmsg = "Bad AGFL block magic!";
break;
}
bp->b_ops = &xfs_agfl_buf_ops;
break;
case XFS_BLFT_AGI_BUF:
if (magic32 != XFS_AGI_MAGIC) {
- xfs_warn(mp, "Bad AGI block magic!");
- ASSERT(0);
+ warnmsg = "Bad AGI block magic!";
break;
}
bp->b_ops = &xfs_agi_buf_ops;
@@ -2438,8 +2456,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_GDQUOT_BUF:
#ifdef CONFIG_XFS_QUOTA
if (magic16 != XFS_DQUOT_MAGIC) {
- xfs_warn(mp, "Bad DQUOT block magic!");
- ASSERT(0);
+ warnmsg = "Bad DQUOT block magic!";
break;
}
bp->b_ops = &xfs_dquot_buf_ops;
@@ -2451,16 +2468,14 @@ xlog_recover_validate_buf_type(
break;
case XFS_BLFT_DINO_BUF:
if (magic16 != XFS_DINODE_MAGIC) {
- xfs_warn(mp, "Bad INODE block magic!");
- ASSERT(0);
+ warnmsg = "Bad INODE block magic!";
break;
}
bp->b_ops = &xfs_inode_buf_ops;
break;
case XFS_BLFT_SYMLINK_BUF:
if (magic32 != XFS_SYMLINK_MAGIC) {
- xfs_warn(mp, "Bad symlink block magic!");
- ASSERT(0);
+ warnmsg = "Bad symlink block magic!";
break;
}
bp->b_ops = &xfs_symlink_buf_ops;
@@ -2468,8 +2483,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DIR_BLOCK_BUF:
if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
magic32 != XFS_DIR3_BLOCK_MAGIC) {
- xfs_warn(mp, "Bad dir block magic!");
- ASSERT(0);
+ warnmsg = "Bad dir block magic!";
break;
}
bp->b_ops = &xfs_dir3_block_buf_ops;
@@ -2477,8 +2491,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DIR_DATA_BUF:
if (magic32 != XFS_DIR2_DATA_MAGIC &&
magic32 != XFS_DIR3_DATA_MAGIC) {
- xfs_warn(mp, "Bad dir data magic!");
- ASSERT(0);
+ warnmsg = "Bad dir data magic!";
break;
}
bp->b_ops = &xfs_dir3_data_buf_ops;
@@ -2486,8 +2499,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DIR_FREE_BUF:
if (magic32 != XFS_DIR2_FREE_MAGIC &&
magic32 != XFS_DIR3_FREE_MAGIC) {
- xfs_warn(mp, "Bad dir3 free magic!");
- ASSERT(0);
+ warnmsg = "Bad dir3 free magic!";
break;
}
bp->b_ops = &xfs_dir3_free_buf_ops;
@@ -2495,8 +2507,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DIR_LEAF1_BUF:
if (magicda != XFS_DIR2_LEAF1_MAGIC &&
magicda != XFS_DIR3_LEAF1_MAGIC) {
- xfs_warn(mp, "Bad dir leaf1 magic!");
- ASSERT(0);
+ warnmsg = "Bad dir leaf1 magic!";
break;
}
bp->b_ops = &xfs_dir3_leaf1_buf_ops;
@@ -2504,8 +2515,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DIR_LEAFN_BUF:
if (magicda != XFS_DIR2_LEAFN_MAGIC &&
magicda != XFS_DIR3_LEAFN_MAGIC) {
- xfs_warn(mp, "Bad dir leafn magic!");
- ASSERT(0);
+ warnmsg = "Bad dir leafn magic!";
break;
}
bp->b_ops = &xfs_dir3_leafn_buf_ops;
@@ -2513,8 +2523,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DA_NODE_BUF:
if (magicda != XFS_DA_NODE_MAGIC &&
magicda != XFS_DA3_NODE_MAGIC) {
- xfs_warn(mp, "Bad da node magic!");
- ASSERT(0);
+ warnmsg = "Bad da node magic!";
break;
}
bp->b_ops = &xfs_da3_node_buf_ops;
@@ -2522,24 +2531,21 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_ATTR_LEAF_BUF:
if (magicda != XFS_ATTR_LEAF_MAGIC &&
magicda != XFS_ATTR3_LEAF_MAGIC) {
- xfs_warn(mp, "Bad attr leaf magic!");
- ASSERT(0);
+ warnmsg = "Bad attr leaf magic!";
break;
}
bp->b_ops = &xfs_attr3_leaf_buf_ops;
break;
case XFS_BLFT_ATTR_RMT_BUF:
if (magic32 != XFS_ATTR3_RMT_MAGIC) {
- xfs_warn(mp, "Bad attr remote magic!");
- ASSERT(0);
+ warnmsg = "Bad attr remote magic!";
break;
}
bp->b_ops = &xfs_attr3_rmt_buf_ops;
break;
case XFS_BLFT_SB_BUF:
if (magic32 != XFS_SB_MAGIC) {
- xfs_warn(mp, "Bad SB block magic!");
- ASSERT(0);
+ warnmsg = "Bad SB block magic!";
break;
}
bp->b_ops = &xfs_sb_buf_ops;
@@ -2556,6 +2562,40 @@ xlog_recover_validate_buf_type(
xfs_blft_from_flags(buf_f));
break;
}
+
+ /*
+ * Nothing else to do in the case of a NULL current LSN as this means
+ * the buffer is more recent than the change in the log and will be
+ * skipped.
+ */
+ if (current_lsn == NULLCOMMITLSN)
+ return;
+
+ if (warnmsg) {
+ xfs_warn(mp, warnmsg);
+ ASSERT(0);
+ }
+
+ /*
+ * We must update the metadata LSN of the buffer as it is written out to
+ * ensure that older transactions never replay over this one and corrupt
+ * the buffer. This can occur if log recovery is interrupted at some
+ * point after the current transaction completes, at which point a
+ * subsequent mount starts recovery from the beginning.
+ *
+ * Write verifiers update the metadata LSN from log items attached to
+ * the buffer. Therefore, initialize a bli purely to carry the LSN to
+ * the verifier. We'll clean it up in our ->iodone() callback.
+ */
+ if (bp->b_ops) {
+ struct xfs_buf_log_item *bip;
+
+ ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
+ bp->b_iodone = xlog_recover_iodone;
+ xfs_buf_item_init(bp, mp);
+ bip = bp->b_fspriv;
+ bip->bli_item.li_lsn = current_lsn;
+ }
}
/*
@@ -2569,7 +2609,8 @@ xlog_recover_do_reg_buffer(
struct xfs_mount *mp,
xlog_recover_item_t *item,
struct xfs_buf *bp,
- xfs_buf_log_format_t *buf_f)
+ xfs_buf_log_format_t *buf_f,
+ xfs_lsn_t current_lsn)
{
int i;
int bit;
@@ -2642,7 +2683,7 @@ xlog_recover_do_reg_buffer(
/* Shouldn't be any more regions */
ASSERT(i == item->ri_total);
- xlog_recover_validate_buf_type(mp, bp, buf_f);
+ xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
}
/*
@@ -2685,7 +2726,7 @@ xlog_recover_do_dquot_buffer(
if (log->l_quotaoffs_flag & type)
return false;
- xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
return true;
}
@@ -2773,7 +2814,8 @@ xlog_recover_buffer_pass2(
*/
lsn = xlog_recover_get_buf_lsn(mp, bp);
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
- xlog_recover_validate_buf_type(mp, bp, buf_f);
+ trace_xfs_log_recover_buf_skip(log, buf_f);
+ xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
goto out_release;
}
@@ -2789,7 +2831,7 @@ xlog_recover_buffer_pass2(
if (!dirty)
goto out_release;
} else {
- xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
}
/*
@@ -3515,6 +3557,242 @@ xlog_recover_rud_pass2(
}
/*
+ * Copy an CUI format buffer from the given buf, and into the destination
+ * CUI format structure. The CUI/CUD items were designed not to need any
+ * special alignment handling.
+ */
+static int
+xfs_cui_copy_format(
+ struct xfs_log_iovec *buf,
+ struct xfs_cui_log_format *dst_cui_fmt)
+{
+ struct xfs_cui_log_format *src_cui_fmt;
+ uint len;
+
+ src_cui_fmt = buf->i_addr;
+ len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
+
+ if (buf->i_len == len) {
+ memcpy(dst_cui_fmt, src_cui_fmt, len);
+ return 0;
+ }
+ return -EFSCORRUPTED;
+}
+
+/*
+ * This routine is called to create an in-core extent refcount update
+ * item from the cui format structure which was logged on disk.
+ * It allocates an in-core cui, copies the extents from the format
+ * structure into it, and adds the cui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_cui_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_cui_log_format *cui_formatp;
+
+ cui_formatp = item->ri_buf[0].i_addr;
+
+ cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+ error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
+ if (error) {
+ xfs_cui_item_free(cuip);
+ return error;
+ }
+ atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
+
+ spin_lock(&log->l_ailp->xa_lock);
+ /*
+ * The CUI has two references. One for the CUD and one for CUI to ensure
+ * it makes it into the AIL. Insert the CUI into the AIL directly and
+ * drop the CUI reference. Note that xfs_trans_ail_update() drops the
+ * AIL lock.
+ */
+ xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
+ xfs_cui_release(cuip);
+ return 0;
+}
+
+
+/*
+ * This routine is called when an CUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding CUI if it
+ * was still in the log. To do this it searches the AIL for the CUI with an id
+ * equal to that in the CUD format structure. If we find it we drop the CUD
+ * reference, which removes the CUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_cud_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_cud_log_format *cud_formatp;
+ struct xfs_cui_log_item *cuip = NULL;
+ struct xfs_log_item *lip;
+ __uint64_t cui_id;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp = log->l_ailp;
+
+ cud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format))
+ return -EFSCORRUPTED;
+ cui_id = cud_formatp->cud_cui_id;
+
+ /*
+ * Search for the CUI with the id in the CUD format structure in the
+ * AIL.
+ */
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ if (lip->li_type == XFS_LI_CUI) {
+ cuip = (struct xfs_cui_log_item *)lip;
+ if (cuip->cui_format.cui_id == cui_id) {
+ /*
+ * Drop the CUD reference to the CUI. This
+ * removes the CUI from the AIL and frees it.
+ */
+ spin_unlock(&ailp->xa_lock);
+ xfs_cui_release(cuip);
+ spin_lock(&ailp->xa_lock);
+ break;
+ }
+ }
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->xa_lock);
+
+ return 0;
+}
+
+/*
+ * Copy an BUI format buffer from the given buf, and into the destination
+ * BUI format structure. The BUI/BUD items were designed not to need any
+ * special alignment handling.
+ */
+static int
+xfs_bui_copy_format(
+ struct xfs_log_iovec *buf,
+ struct xfs_bui_log_format *dst_bui_fmt)
+{
+ struct xfs_bui_log_format *src_bui_fmt;
+ uint len;
+
+ src_bui_fmt = buf->i_addr;
+ len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
+
+ if (buf->i_len == len) {
+ memcpy(dst_bui_fmt, src_bui_fmt, len);
+ return 0;
+ }
+ return -EFSCORRUPTED;
+}
+
+/*
+ * This routine is called to create an in-core extent bmap update
+ * item from the bui format structure which was logged on disk.
+ * It allocates an in-core bui, copies the extents from the format
+ * structure into it, and adds the bui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_bui_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_bui_log_item *buip;
+ struct xfs_bui_log_format *bui_formatp;
+
+ bui_formatp = item->ri_buf[0].i_addr;
+
+ if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
+ return -EFSCORRUPTED;
+ buip = xfs_bui_init(mp);
+ error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
+ if (error) {
+ xfs_bui_item_free(buip);
+ return error;
+ }
+ atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
+
+ spin_lock(&log->l_ailp->xa_lock);
+ /*
+ * The RUI has two references. One for the RUD and one for RUI to ensure
+ * it makes it into the AIL. Insert the RUI into the AIL directly and
+ * drop the RUI reference. Note that xfs_trans_ail_update() drops the
+ * AIL lock.
+ */
+ xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
+ xfs_bui_release(buip);
+ return 0;
+}
+
+
+/*
+ * This routine is called when an BUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding BUI if it
+ * was still in the log. To do this it searches the AIL for the BUI with an id
+ * equal to that in the BUD format structure. If we find it we drop the BUD
+ * reference, which removes the BUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_bud_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_bud_log_format *bud_formatp;
+ struct xfs_bui_log_item *buip = NULL;
+ struct xfs_log_item *lip;
+ __uint64_t bui_id;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp = log->l_ailp;
+
+ bud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format))
+ return -EFSCORRUPTED;
+ bui_id = bud_formatp->bud_bui_id;
+
+ /*
+ * Search for the BUI with the id in the BUD format structure in the
+ * AIL.
+ */
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ if (lip->li_type == XFS_LI_BUI) {
+ buip = (struct xfs_bui_log_item *)lip;
+ if (buip->bui_format.bui_id == bui_id) {
+ /*
+ * Drop the BUD reference to the BUI. This
+ * removes the BUI from the AIL and frees it.
+ */
+ spin_unlock(&ailp->xa_lock);
+ xfs_bui_release(buip);
+ spin_lock(&ailp->xa_lock);
+ break;
+ }
+ }
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->xa_lock);
+
+ return 0;
+}
+
+/*
* This routine is called when an inode create format structure is found in a
* committed transaction in the log. It's purpose is to initialise the inodes
* being allocated on disk. This requires us to get inode cluster buffers that
@@ -3741,6 +4019,10 @@ xlog_recover_ra_pass2(
case XFS_LI_QUOTAOFF:
case XFS_LI_RUI:
case XFS_LI_RUD:
+ case XFS_LI_CUI:
+ case XFS_LI_CUD:
+ case XFS_LI_BUI:
+ case XFS_LI_BUD:
default:
break;
}
@@ -3766,6 +4048,10 @@ xlog_recover_commit_pass1(
case XFS_LI_ICREATE:
case XFS_LI_RUI:
case XFS_LI_RUD:
+ case XFS_LI_CUI:
+ case XFS_LI_CUD:
+ case XFS_LI_BUI:
+ case XFS_LI_BUD:
/* nothing to do in pass 1 */
return 0;
default:
@@ -3800,6 +4086,14 @@ xlog_recover_commit_pass2(
return xlog_recover_rui_pass2(log, item, trans->r_lsn);
case XFS_LI_RUD:
return xlog_recover_rud_pass2(log, item);
+ case XFS_LI_CUI:
+ return xlog_recover_cui_pass2(log, item, trans->r_lsn);
+ case XFS_LI_CUD:
+ return xlog_recover_cud_pass2(log, item);
+ case XFS_LI_BUI:
+ return xlog_recover_bui_pass2(log, item, trans->r_lsn);
+ case XFS_LI_BUD:
+ return xlog_recover_bud_pass2(log, item);
case XFS_LI_DQUOT:
return xlog_recover_dquot_pass2(log, buffer_list, item,
trans->r_lsn);
@@ -3846,14 +4140,13 @@ STATIC int
xlog_recover_commit_trans(
struct xlog *log,
struct xlog_recover *trans,
- int pass)
+ int pass,
+ struct list_head *buffer_list)
{
int error = 0;
- int error2;
int items_queued = 0;
struct xlog_recover_item *item;
struct xlog_recover_item *next;
- LIST_HEAD (buffer_list);
LIST_HEAD (ra_list);
LIST_HEAD (done_list);
@@ -3876,7 +4169,7 @@ xlog_recover_commit_trans(
items_queued++;
if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
error = xlog_recover_items_pass2(log, trans,
- &buffer_list, &ra_list);
+ buffer_list, &ra_list);
list_splice_tail_init(&ra_list, &done_list);
items_queued = 0;
}
@@ -3894,15 +4187,14 @@ out:
if (!list_empty(&ra_list)) {
if (!error)
error = xlog_recover_items_pass2(log, trans,
- &buffer_list, &ra_list);
+ buffer_list, &ra_list);
list_splice_tail_init(&ra_list, &done_list);
}
if (!list_empty(&done_list))
list_splice_init(&done_list, &trans->r_itemq);
- error2 = xfs_buf_delwri_submit(&buffer_list);
- return error ? error : error2;
+ return error;
}
STATIC void
@@ -4085,7 +4377,8 @@ xlog_recovery_process_trans(
char *dp,
unsigned int len,
unsigned int flags,
- int pass)
+ int pass,
+ struct list_head *buffer_list)
{
int error = 0;
bool freeit = false;
@@ -4109,7 +4402,8 @@ xlog_recovery_process_trans(
error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
break;
case XLOG_COMMIT_TRANS:
- error = xlog_recover_commit_trans(log, trans, pass);
+ error = xlog_recover_commit_trans(log, trans, pass,
+ buffer_list);
/* success or fail, we are now done with this transaction. */
freeit = true;
break;
@@ -4191,10 +4485,12 @@ xlog_recover_process_ophdr(
struct xlog_op_header *ohead,
char *dp,
char *end,
- int pass)
+ int pass,
+ struct list_head *buffer_list)
{
struct xlog_recover *trans;
unsigned int len;
+ int error;
/* Do we understand who wrote this op? */
if (ohead->oh_clientid != XFS_TRANSACTION &&
@@ -4221,8 +4517,39 @@ xlog_recover_process_ophdr(
return 0;
}
+ /*
+ * The recovered buffer queue is drained only once we know that all
+ * recovery items for the current LSN have been processed. This is
+ * required because:
+ *
+ * - Buffer write submission updates the metadata LSN of the buffer.
+ * - Log recovery skips items with a metadata LSN >= the current LSN of
+ * the recovery item.
+ * - Separate recovery items against the same metadata buffer can share
+ * a current LSN. I.e., consider that the LSN of a recovery item is
+ * defined as the starting LSN of the first record in which its
+ * transaction appears, that a record can hold multiple transactions,
+ * and/or that a transaction can span multiple records.
+ *
+ * In other words, we are allowed to submit a buffer from log recovery
+ * once per current LSN. Otherwise, we may incorrectly skip recovery
+ * items and cause corruption.
+ *
+ * We don't know up front whether buffers are updated multiple times per
+ * LSN. Therefore, track the current LSN of each commit log record as it
+ * is processed and drain the queue when it changes. Use commit records
+ * because they are ordered correctly by the logging code.
+ */
+ if (log->l_recovery_lsn != trans->r_lsn &&
+ ohead->oh_flags & XLOG_COMMIT_TRANS) {
+ error = xfs_buf_delwri_submit(buffer_list);
+ if (error)
+ return error;
+ log->l_recovery_lsn = trans->r_lsn;
+ }
+
return xlog_recovery_process_trans(log, trans, dp, len,
- ohead->oh_flags, pass);
+ ohead->oh_flags, pass, buffer_list);
}
/*
@@ -4240,7 +4567,8 @@ xlog_recover_process_data(
struct hlist_head rhash[],
struct xlog_rec_header *rhead,
char *dp,
- int pass)
+ int pass,
+ struct list_head *buffer_list)
{
struct xlog_op_header *ohead;
char *end;
@@ -4254,6 +4582,7 @@ xlog_recover_process_data(
if (xlog_header_check_recover(log->l_mp, rhead))
return -EIO;
+ trace_xfs_log_recover_record(log, rhead, pass);
while ((dp < end) && num_logops) {
ohead = (struct xlog_op_header *)dp;
@@ -4262,7 +4591,7 @@ xlog_recover_process_data(
/* errors will abort recovery */
error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
- dp, end, pass);
+ dp, end, pass, buffer_list);
if (error)
return error;
@@ -4352,12 +4681,94 @@ xlog_recover_cancel_rui(
spin_lock(&ailp->xa_lock);
}
+/* Recover the CUI if necessary. */
+STATIC int
+xlog_recover_process_cui(
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_cui_log_item *cuip;
+ int error;
+
+ /*
+ * Skip CUIs that we've already processed.
+ */
+ cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
+ if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
+ return 0;
+
+ spin_unlock(&ailp->xa_lock);
+ error = xfs_cui_recover(mp, cuip);
+ spin_lock(&ailp->xa_lock);
+
+ return error;
+}
+
+/* Release the CUI since we're cancelling everything. */
+STATIC void
+xlog_recover_cancel_cui(
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_cui_log_item *cuip;
+
+ cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
+
+ spin_unlock(&ailp->xa_lock);
+ xfs_cui_release(cuip);
+ spin_lock(&ailp->xa_lock);
+}
+
+/* Recover the BUI if necessary. */
+STATIC int
+xlog_recover_process_bui(
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_bui_log_item *buip;
+ int error;
+
+ /*
+ * Skip BUIs that we've already processed.
+ */
+ buip = container_of(lip, struct xfs_bui_log_item, bui_item);
+ if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
+ return 0;
+
+ spin_unlock(&ailp->xa_lock);
+ error = xfs_bui_recover(mp, buip);
+ spin_lock(&ailp->xa_lock);
+
+ return error;
+}
+
+/* Release the BUI since we're cancelling everything. */
+STATIC void
+xlog_recover_cancel_bui(
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_bui_log_item *buip;
+
+ buip = container_of(lip, struct xfs_bui_log_item, bui_item);
+
+ spin_unlock(&ailp->xa_lock);
+ xfs_bui_release(buip);
+ spin_lock(&ailp->xa_lock);
+}
+
/* Is this log item a deferred action intent? */
static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
{
switch (lip->li_type) {
case XFS_LI_EFI:
case XFS_LI_RUI:
+ case XFS_LI_CUI:
+ case XFS_LI_BUI:
return true;
default:
return false;
@@ -4421,6 +4832,12 @@ xlog_recover_process_intents(
case XFS_LI_RUI:
error = xlog_recover_process_rui(log->l_mp, ailp, lip);
break;
+ case XFS_LI_CUI:
+ error = xlog_recover_process_cui(log->l_mp, ailp, lip);
+ break;
+ case XFS_LI_BUI:
+ error = xlog_recover_process_bui(log->l_mp, ailp, lip);
+ break;
}
if (error)
goto out;
@@ -4468,6 +4885,12 @@ xlog_recover_cancel_intents(
case XFS_LI_RUI:
xlog_recover_cancel_rui(log->l_mp, ailp, lip);
break;
+ case XFS_LI_CUI:
+ xlog_recover_cancel_cui(log->l_mp, ailp, lip);
+ break;
+ case XFS_LI_BUI:
+ xlog_recover_cancel_bui(log->l_mp, ailp, lip);
+ break;
}
lip = xfs_trans_ail_cursor_next(ailp, &cur);
@@ -4546,6 +4969,7 @@ xlog_recover_process_one_iunlink(
if (error)
goto fail_iput;
+ xfs_iflags_clear(ip, XFS_IRECOVERY);
ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(VFS_I(ip)->i_mode != 0);
@@ -4685,7 +5109,8 @@ xlog_recover_process(
struct hlist_head rhash[],
struct xlog_rec_header *rhead,
char *dp,
- int pass)
+ int pass,
+ struct list_head *buffer_list)
{
int error;
__le32 crc;
@@ -4732,7 +5157,8 @@ xlog_recover_process(
if (error)
return error;
- return xlog_recover_process_data(log, rhash, rhead, dp, pass);
+ return xlog_recover_process_data(log, rhash, rhead, dp, pass,
+ buffer_list);
}
STATIC int
@@ -4793,9 +5219,11 @@ xlog_do_recovery_pass(
char *offset;
xfs_buf_t *hbp, *dbp;
int error = 0, h_size, h_len;
+ int error2 = 0;
int bblks, split_bblks;
int hblks, split_hblks, wrapped_hblks;
struct hlist_head rhash[XLOG_RHASH_SIZE];
+ LIST_HEAD (buffer_list);
ASSERT(head_blk != tail_blk);
rhead_blk = 0;
@@ -4981,7 +5409,7 @@ xlog_do_recovery_pass(
}
error = xlog_recover_process(log, rhash, rhead, offset,
- pass);
+ pass, &buffer_list);
if (error)
goto bread_err2;
@@ -5012,7 +5440,8 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- error = xlog_recover_process(log, rhash, rhead, offset, pass);
+ error = xlog_recover_process(log, rhash, rhead, offset, pass,
+ &buffer_list);
if (error)
goto bread_err2;
@@ -5025,10 +5454,17 @@ xlog_do_recovery_pass(
bread_err1:
xlog_put_bp(hbp);
+ /*
+ * Submit buffers that have been added from the last record processed,
+ * regardless of error status.
+ */
+ if (!list_empty(&buffer_list))
+ error2 = xfs_buf_delwri_submit(&buffer_list);
+
if (error && first_bad)
*first_bad = rhead_blk;
- return error;
+ return error ? error : error2;
}
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index faeead671f9f..b341f10cf481 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,6 +43,8 @@
#include "xfs_icache.h"
#include "xfs_sysfs.h"
#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_reflink.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -684,6 +686,7 @@ xfs_mountfs(
xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
xfs_ialloc_compute_maxlevels(mp);
xfs_rmapbt_compute_maxlevels(mp);
+ xfs_refcountbt_compute_maxlevels(mp);
xfs_set_maxicount(mp);
@@ -923,6 +926,15 @@ xfs_mountfs(
}
/*
+ * During the second phase of log recovery, we need iget and
+ * iput to behave like they do for an active filesystem.
+ * xfs_fs_drop_inode needs to be able to prevent the deletion
+ * of inodes before we're done replaying log items on those
+ * inodes.
+ */
+ mp->m_super->s_flags |= MS_ACTIVE;
+
+ /*
* Finish recovering the file system. This part needed to be delayed
* until after the root and real-time bitmap inodes were consistently
* read in.
@@ -934,6 +946,20 @@ xfs_mountfs(
}
/*
+ * Now the log is fully replayed, we can transition to full read-only
+ * mode for read-only mounts. This will sync all the metadata and clean
+ * the log so that the recovery we just performed does not have to be
+ * replayed again on the next mount.
+ *
+ * We use the same quiesce mechanism as the rw->ro remount, as they are
+ * semantically identical operations.
+ */
+ if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
+ XFS_MOUNT_RDONLY) {
+ xfs_quiesce_attr(mp);
+ }
+
+ /*
* Complete the quota initialisation, post-log-replay component.
*/
if (quotamount) {
@@ -960,11 +986,30 @@ xfs_mountfs(
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
+
+ /* Recover any CoW blocks that never got remapped. */
+ error = xfs_reflink_recover_cow(mp);
+ if (error) {
+ xfs_err(mp,
+ "Error %d recovering leftover CoW allocations.", error);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ goto out_quota;
+ }
+
+ /* Reserve AG blocks for future btree expansion. */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error != -ENOSPC)
+ goto out_agresv;
}
return 0;
+ out_agresv:
+ xfs_fs_unreserve_ag_blocks(mp);
+ out_quota:
+ xfs_qm_unmount_quotas(mp);
out_rtunmount:
+ mp->m_super->s_flags &= ~MS_ACTIVE;
xfs_rtunmount_inodes(mp);
out_rele_rip:
IRELE(rip);
@@ -1005,7 +1050,9 @@ xfs_unmountfs(
int error;
cancel_delayed_work_sync(&mp->m_eofblocks_work);
+ cancel_delayed_work_sync(&mp->m_cowblocks_work);
+ xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
IRELE(mp->m_rootip);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b36676cde103..819b80b15bfb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -57,10 +57,16 @@ enum {
#define XFS_ERR_RETRY_FOREVER -1
+/*
+ * Although retry_timeout is in jiffies which is normally an unsigned long,
+ * we limit the retry timeout to 86400 seconds, or one day. So even a
+ * signed 32-bit long is sufficient for a HZ value up to 24855. Making it
+ * signed lets us store the special "-1" value, meaning retry forever.
+ */
struct xfs_error_cfg {
struct xfs_kobj kobj;
int max_retries;
- unsigned long retry_timeout; /* in jiffies, 0 = no timeout */
+ long retry_timeout; /* in jiffies, -1 = infinite */
};
typedef struct xfs_mount {
@@ -118,10 +124,13 @@ typedef struct xfs_mount {
uint m_inobt_mnr[2]; /* min inobt btree records */
uint m_rmap_mxr[2]; /* max rmap btree records */
uint m_rmap_mnr[2]; /* min rmap btree records */
+ uint m_refc_mxr[2]; /* max refc btree records */
+ uint m_refc_mnr[2]; /* min refc btree records */
uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
uint m_in_maxlevels; /* max inobt btree levels. */
uint m_rmap_maxlevels; /* max rmap btree levels */
+ uint m_refc_maxlevels; /* max refcount btree level */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
uint m_alloc_set_aside; /* space we can't use */
uint m_ag_max_usable; /* max space per AG */
@@ -155,6 +164,8 @@ typedef struct xfs_mount {
struct delayed_work m_reclaim_work; /* background inode reclaim */
struct delayed_work m_eofblocks_work; /* background eof blocks
trimming */
+ struct delayed_work m_cowblocks_work; /* background cow blocks
+ trimming */
bool m_update_sb; /* sb needs update in mount */
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
@@ -325,6 +336,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp)
}
#endif
+/* per-AG block reservation data structures*/
+enum xfs_ag_resv_type {
+ XFS_AG_RESV_NONE = 0,
+ XFS_AG_RESV_METADATA,
+ XFS_AG_RESV_AGFL,
+};
+
+struct xfs_ag_resv {
+ /* number of blocks originally reserved here */
+ xfs_extlen_t ar_orig_reserved;
+ /* number of blocks reserved here */
+ xfs_extlen_t ar_reserved;
+ /* number of blocks originally asked for */
+ xfs_extlen_t ar_asked;
+};
+
/*
* Per-ag incore structure, copies of information in agf and agi, to improve the
* performance of allocation group selection.
@@ -372,8 +399,31 @@ typedef struct xfs_perag {
/* for rcu-safe freeing */
struct rcu_head rcu_head;
int pagb_count; /* pagb slots in use */
+
+ /* Blocks reserved for all kinds of metadata. */
+ struct xfs_ag_resv pag_meta_resv;
+ /* Blocks reserved for just AGFL-based metadata. */
+ struct xfs_ag_resv pag_agfl_resv;
+
+ /* reference count */
+ __uint8_t pagf_refcount_level;
} xfs_perag_t;
+static inline struct xfs_ag_resv *
+xfs_perag_resv(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ return &pag->pag_meta_resv;
+ case XFS_AG_RESV_AGFL:
+ return &pag->pag_agfl_resv;
+ default:
+ return NULL;
+ }
+}
+
extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 69e2986a3776..0c381d71b242 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -49,6 +49,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24);
XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
@@ -56,6 +58,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4);
/* dir/attr trees */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 0f14b2e4bf6c..93a7aafa56d6 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -114,6 +114,13 @@ xfs_fs_map_blocks(
return -ENXIO;
/*
+ * The pNFS block layout spec actually supports reflink like
+ * functionality, but the Linux pNFS server doesn't implement it yet.
+ */
+ if (xfs_is_reflink_inode(ip))
+ return -ENXIO;
+
+ /*
* Lock out any other I/O before we flush and invalidate the pagecache,
* and then hand out a layout to the remote system. This is very
* similar to direct I/O, except that the synchronization is much more
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
new file mode 100644
index 000000000000..fe86a668a57e
--- /dev/null
+++ b/fs/xfs/xfs_refcount_item.c
@@ -0,0 +1,539 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_log.h"
+#include "xfs_refcount.h"
+
+
+kmem_zone_t *xfs_cui_zone;
+kmem_zone_t *xfs_cud_zone;
+
+static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_cui_log_item, cui_item);
+}
+
+void
+xfs_cui_item_free(
+ struct xfs_cui_log_item *cuip)
+{
+ if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
+ kmem_free(cuip);
+ else
+ kmem_zone_free(xfs_cui_zone, cuip);
+}
+
+STATIC void
+xfs_cui_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
+
+ *nvecs += 1;
+ *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given cui log item. We use only 1 iovec, and we point that
+ * at the cui_log_format structure embedded in the cui item.
+ * It is at this point that we assert that all of the extent
+ * slots in the cui item have been filled.
+ */
+STATIC void
+xfs_cui_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ ASSERT(atomic_read(&cuip->cui_next_extent) ==
+ cuip->cui_format.cui_nextents);
+
+ cuip->cui_format.cui_type = XFS_LI_CUI;
+ cuip->cui_format.cui_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format,
+ xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents));
+}
+
+/*
+ * Pinning has no meaning for an cui item, so just return.
+ */
+STATIC void
+xfs_cui_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * The unpin operation is the last place an CUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the CUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the CUI to either construct
+ * and commit the CUD or drop the CUD's reference in the event of error. Simply
+ * drop the log's CUI reference now that the log is done with it.
+ */
+STATIC void
+xfs_cui_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
+
+ xfs_cui_release(cuip);
+}
+
+/*
+ * CUI items have no locking or pushing. However, since CUIs are pulled from
+ * the AIL when their corresponding CUDs are committed to disk, their situation
+ * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log. This should help in getting the CUI out of
+ * the AIL.
+ */
+STATIC uint
+xfs_cui_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ return XFS_ITEM_PINNED;
+}
+
+/*
+ * The CUI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an CUD isn't going to be
+ * constructed and thus we free the CUI here directly.
+ */
+STATIC void
+xfs_cui_item_unlock(
+ struct xfs_log_item *lip)
+{
+ if (lip->li_flags & XFS_LI_ABORTED)
+ xfs_cui_item_free(CUI_ITEM(lip));
+}
+
+/*
+ * The CUI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_cui_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ return lsn;
+}
+
+/*
+ * The CUI dependency tracking op doesn't do squat. It can't because
+ * it doesn't know where the free extent is coming from. The dependency
+ * tracking has to be handled by the "enclosing" metadata object. For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_cui_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all cui log items.
+ */
+static const struct xfs_item_ops xfs_cui_item_ops = {
+ .iop_size = xfs_cui_item_size,
+ .iop_format = xfs_cui_item_format,
+ .iop_pin = xfs_cui_item_pin,
+ .iop_unpin = xfs_cui_item_unpin,
+ .iop_unlock = xfs_cui_item_unlock,
+ .iop_committed = xfs_cui_item_committed,
+ .iop_push = xfs_cui_item_push,
+ .iop_committing = xfs_cui_item_committing,
+};
+
+/*
+ * Allocate and initialize an cui item with the given number of extents.
+ */
+struct xfs_cui_log_item *
+xfs_cui_init(
+ struct xfs_mount *mp,
+ uint nextents)
+
+{
+ struct xfs_cui_log_item *cuip;
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
+ cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
+ KM_SLEEP);
+ else
+ cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP);
+
+ xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
+ cuip->cui_format.cui_nextents = nextents;
+ cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
+ atomic_set(&cuip->cui_next_extent, 0);
+ atomic_set(&cuip->cui_refcount, 2);
+
+ return cuip;
+}
+
+/*
+ * Freeing the CUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the CUI may not yet have been placed in the AIL
+ * when called by xfs_cui_release() from CUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the CUI.
+ */
+void
+xfs_cui_release(
+ struct xfs_cui_log_item *cuip)
+{
+ if (atomic_dec_and_test(&cuip->cui_refcount)) {
+ xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_cui_item_free(cuip);
+ }
+}
+
+static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_cud_log_item, cud_item);
+}
+
+STATIC void
+xfs_cud_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_cud_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given cud log item. We use only 1 iovec, and we point that
+ * at the cud_log_format structure embedded in the cud item.
+ * It is at this point that we assert that all of the extent
+ * slots in the cud item have been filled.
+ */
+STATIC void
+xfs_cud_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ cudp->cud_format.cud_type = XFS_LI_CUD;
+ cudp->cud_format.cud_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format,
+ sizeof(struct xfs_cud_log_format));
+}
+
+/*
+ * Pinning has no meaning for an cud item, so just return.
+ */
+STATIC void
+xfs_cud_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an cud item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_cud_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+/*
+ * There isn't much you can do to push on an cud item. It is simply stuck
+ * waiting for the log to be flushed to disk.
+ */
+STATIC uint
+xfs_cud_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ return XFS_ITEM_PINNED;
+}
+
+/*
+ * The CUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the CUI and free the
+ * CUD.
+ */
+STATIC void
+xfs_cud_item_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
+
+ if (lip->li_flags & XFS_LI_ABORTED) {
+ xfs_cui_release(cudp->cud_cuip);
+ kmem_zone_free(xfs_cud_zone, cudp);
+ }
+}
+
+/*
+ * When the cud item is committed to disk, all we need to do is delete our
+ * reference to our partner cui item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from
+ * further referencing this item.
+ */
+STATIC xfs_lsn_t
+xfs_cud_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
+
+ /*
+ * Drop the CUI reference regardless of whether the CUD has been
+ * aborted. Once the CUD transaction is constructed, it is the sole
+ * responsibility of the CUD to release the CUI (even if the CUI is
+ * aborted due to log I/O error).
+ */
+ xfs_cui_release(cudp->cud_cuip);
+ kmem_zone_free(xfs_cud_zone, cudp);
+
+ return (xfs_lsn_t)-1;
+}
+
+/*
+ * The CUD dependency tracking op doesn't do squat. It can't because
+ * it doesn't know where the free extent is coming from. The dependency
+ * tracking has to be handled by the "enclosing" metadata object. For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_cud_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all cud log items.
+ */
+static const struct xfs_item_ops xfs_cud_item_ops = {
+ .iop_size = xfs_cud_item_size,
+ .iop_format = xfs_cud_item_format,
+ .iop_pin = xfs_cud_item_pin,
+ .iop_unpin = xfs_cud_item_unpin,
+ .iop_unlock = xfs_cud_item_unlock,
+ .iop_committed = xfs_cud_item_committed,
+ .iop_push = xfs_cud_item_push,
+ .iop_committing = xfs_cud_item_committing,
+};
+
+/*
+ * Allocate and initialize an cud item with the given number of extents.
+ */
+struct xfs_cud_log_item *
+xfs_cud_init(
+ struct xfs_mount *mp,
+ struct xfs_cui_log_item *cuip)
+
+{
+ struct xfs_cud_log_item *cudp;
+
+ cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP);
+ xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops);
+ cudp->cud_cuip = cuip;
+ cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
+
+ return cudp;
+}
+
+/*
+ * Process a refcount update intent item that was recovered from the log.
+ * We need to update the refcountbt.
+ */
+int
+xfs_cui_recover(
+ struct xfs_mount *mp,
+ struct xfs_cui_log_item *cuip)
+{
+ int i;
+ int error = 0;
+ unsigned int refc_type;
+ struct xfs_phys_extent *refc;
+ xfs_fsblock_t startblock_fsb;
+ bool op_ok;
+ struct xfs_cud_log_item *cudp;
+ struct xfs_trans *tp;
+ struct xfs_btree_cur *rcur = NULL;
+ enum xfs_refcount_intent_type type;
+ xfs_fsblock_t firstfsb;
+ xfs_fsblock_t new_fsb;
+ xfs_extlen_t new_len;
+ struct xfs_bmbt_irec irec;
+ struct xfs_defer_ops dfops;
+ bool requeue_only = false;
+
+ ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
+
+ /*
+ * First check the validity of the extents described by the
+ * CUI. If any are bad, then assume that all are bad and
+ * just toss the CUI.
+ */
+ for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
+ refc = &cuip->cui_format.cui_extents[i];
+ startblock_fsb = XFS_BB_TO_FSB(mp,
+ XFS_FSB_TO_DADDR(mp, refc->pe_startblock));
+ switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) {
+ case XFS_REFCOUNT_INCREASE:
+ case XFS_REFCOUNT_DECREASE:
+ case XFS_REFCOUNT_ALLOC_COW:
+ case XFS_REFCOUNT_FREE_COW:
+ op_ok = true;
+ break;
+ default:
+ op_ok = false;
+ break;
+ }
+ if (!op_ok || startblock_fsb == 0 ||
+ refc->pe_len == 0 ||
+ startblock_fsb >= mp->m_sb.sb_dblocks ||
+ refc->pe_len >= mp->m_sb.sb_agblocks ||
+ (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) {
+ /*
+ * This will pull the CUI from the AIL and
+ * free the memory associated with it.
+ */
+ set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
+ xfs_cui_release(cuip);
+ return -EIO;
+ }
+ }
+
+ /*
+ * Under normal operation, refcount updates are deferred, so we
+ * wouldn't be adding them directly to a transaction. All
+ * refcount updates manage reservation usage internally and
+ * dynamically by deferring work that won't fit in the
+ * transaction. Normally, any work that needs to be deferred
+ * gets attached to the same defer_ops that scheduled the
+ * refcount update. However, we're in log recovery here, so we
+ * we create our own defer_ops and use that to finish up any
+ * work that doesn't fit.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error)
+ return error;
+ cudp = xfs_trans_get_cud(tp, cuip);
+
+ xfs_defer_init(&dfops, &firstfsb);
+ for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
+ refc = &cuip->cui_format.cui_extents[i];
+ refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
+ switch (refc_type) {
+ case XFS_REFCOUNT_INCREASE:
+ case XFS_REFCOUNT_DECREASE:
+ case XFS_REFCOUNT_ALLOC_COW:
+ case XFS_REFCOUNT_FREE_COW:
+ type = refc_type;
+ break;
+ default:
+ error = -EFSCORRUPTED;
+ goto abort_error;
+ }
+ if (requeue_only) {
+ new_fsb = refc->pe_startblock;
+ new_len = refc->pe_len;
+ } else
+ error = xfs_trans_log_finish_refcount_update(tp, cudp,
+ &dfops, type, refc->pe_startblock, refc->pe_len,
+ &new_fsb, &new_len, &rcur);
+ if (error)
+ goto abort_error;
+
+ /* Requeue what we didn't finish. */
+ if (new_len > 0) {
+ irec.br_startblock = new_fsb;
+ irec.br_blockcount = new_len;
+ switch (type) {
+ case XFS_REFCOUNT_INCREASE:
+ error = xfs_refcount_increase_extent(
+ tp->t_mountp, &dfops, &irec);
+ break;
+ case XFS_REFCOUNT_DECREASE:
+ error = xfs_refcount_decrease_extent(
+ tp->t_mountp, &dfops, &irec);
+ break;
+ case XFS_REFCOUNT_ALLOC_COW:
+ error = xfs_refcount_alloc_cow_extent(
+ tp->t_mountp, &dfops,
+ irec.br_startblock,
+ irec.br_blockcount);
+ break;
+ case XFS_REFCOUNT_FREE_COW:
+ error = xfs_refcount_free_cow_extent(
+ tp->t_mountp, &dfops,
+ irec.br_startblock,
+ irec.br_blockcount);
+ break;
+ default:
+ ASSERT(0);
+ }
+ if (error)
+ goto abort_error;
+ requeue_only = true;
+ }
+ }
+
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
+ if (error)
+ goto abort_error;
+ set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
+ error = xfs_trans_commit(tp);
+ return error;
+
+abort_error:
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+ xfs_defer_cancel(&dfops);
+ xfs_trans_cancel(tp);
+ return error;
+}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
new file mode 100644
index 000000000000..5b74dddfa64b
--- /dev/null
+++ b/fs/xfs/xfs_refcount_item.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_REFCOUNT_ITEM_H__
+#define __XFS_REFCOUNT_ITEM_H__
+
+/*
+ * There are (currently) two pairs of refcount btree redo item types:
+ * increase and decrease. The log items for these are CUI (refcount
+ * update intent) and CUD (refcount update done). The redo item type
+ * is encoded in the flags field of each xfs_map_extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same
+ * transaction that records the associated refcountbt updates.
+ *
+ * Should the system crash after the commit of the first transaction
+ * but before the commit of the final transaction in a series, log
+ * recovery will use the redo information recorded by the intent items
+ * to replay the refcountbt metadata updates.
+ */
+
+/* kernel only CUI/CUD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_CUI_MAX_FAST_EXTENTS 16
+
+/*
+ * Define CUI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define XFS_CUI_RECOVERED 1
+
+/*
+ * This is the "refcount update intent" log item. It is used to log
+ * the fact that some reverse mappings need to change. It is used in
+ * conjunction with the "refcount update done" log item described
+ * below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item;
+ * see the comments about that structure (in xfs_extfree_item.h) for
+ * more details.
+ */
+struct xfs_cui_log_item {
+ struct xfs_log_item cui_item;
+ atomic_t cui_refcount;
+ atomic_t cui_next_extent;
+ unsigned long cui_flags; /* misc flags */
+ struct xfs_cui_log_format cui_format;
+};
+
+static inline size_t
+xfs_cui_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_cui_log_item, cui_format) +
+ xfs_cui_log_format_sizeof(nr);
+}
+
+/*
+ * This is the "refcount update done" log item. It is used to log the
+ * fact that some refcountbt updates mentioned in an earlier cui item
+ * have been performed.
+ */
+struct xfs_cud_log_item {
+ struct xfs_log_item cud_item;
+ struct xfs_cui_log_item *cud_cuip;
+ struct xfs_cud_log_format cud_format;
+};
+
+extern struct kmem_zone *xfs_cui_zone;
+extern struct kmem_zone *xfs_cud_zone;
+
+struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
+struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *,
+ struct xfs_cui_log_item *);
+void xfs_cui_item_free(struct xfs_cui_log_item *);
+void xfs_cui_release(struct xfs_cui_log_item *);
+int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip);
+
+#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
new file mode 100644
index 000000000000..a279b4e7f5fe
--- /dev/null
+++ b/fs/xfs/xfs_reflink.c
@@ -0,0 +1,1733 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ioctl.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_icache.h"
+#include "xfs_pnfs.h"
+#include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_bit.h"
+#include "xfs_alloc.h"
+#include "xfs_quota_defs.h"
+#include "xfs_quota.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
+#include "xfs_iomap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_sb.h"
+#include "xfs_ag_resv.h"
+
+/*
+ * Copy on Write of Shared Blocks
+ *
+ * XFS must preserve "the usual" file semantics even when two files share
+ * the same physical blocks. This means that a write to one file must not
+ * alter the blocks in a different file; the way that we'll do that is
+ * through the use of a copy-on-write mechanism. At a high level, that
+ * means that when we want to write to a shared block, we allocate a new
+ * block, write the data to the new block, and if that succeeds we map the
+ * new block into the file.
+ *
+ * XFS provides a "delayed allocation" mechanism that defers the allocation
+ * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
+ * possible. This reduces fragmentation by enabling the filesystem to ask
+ * for bigger chunks less often, which is exactly what we want for CoW.
+ *
+ * The delalloc mechanism begins when the kernel wants to make a block
+ * writable (write_begin or page_mkwrite). If the offset is not mapped, we
+ * create a delalloc mapping, which is a regular in-core extent, but without
+ * a real startblock. (For delalloc mappings, the startblock encodes both
+ * a flag that this is a delalloc mapping, and a worst-case estimate of how
+ * many blocks might be required to put the mapping into the BMBT.) delalloc
+ * mappings are a reservation against the free space in the filesystem;
+ * adjacent mappings can also be combined into fewer larger mappings.
+ *
+ * When dirty pages are being written out (typically in writepage), the
+ * delalloc reservations are converted into real mappings by allocating
+ * blocks and replacing the delalloc mapping with real ones. A delalloc
+ * mapping can be replaced by several real ones if the free space is
+ * fragmented.
+ *
+ * We want to adapt the delalloc mechanism for copy-on-write, since the
+ * write paths are similar. The first two steps (creating the reservation
+ * and allocating the blocks) are exactly the same as delalloc except that
+ * the mappings must be stored in a separate CoW fork because we do not want
+ * to disturb the mapping in the data fork until we're sure that the write
+ * succeeded. IO completion in this case is the process of removing the old
+ * mapping from the data fork and moving the new mapping from the CoW fork to
+ * the data fork. This will be discussed shortly.
+ *
+ * For now, unaligned directio writes will be bounced back to the page cache.
+ * Block-aligned directio writes will use the same mechanism as buffered
+ * writes.
+ *
+ * CoW remapping must be done after the data block write completes,
+ * because we don't want to destroy the old data fork map until we're sure
+ * the new block has been written. Since the new mappings are kept in a
+ * separate fork, we can simply iterate these mappings to find the ones
+ * that cover the file blocks that we just CoW'd. For each extent, simply
+ * unmap the corresponding range in the data fork, map the new range into
+ * the data fork, and remove the extent from the CoW fork.
+ *
+ * Since the remapping operation can be applied to an arbitrary file
+ * range, we record the need for the remap step as a flag in the ioend
+ * instead of declaring a new IO type. This is required for direct io
+ * because we only have ioend for the whole dio, and we have to be able to
+ * remember the presence of unwritten blocks and CoW blocks with a single
+ * ioend structure. Better yet, the more ground we can cover with one
+ * ioend, the better.
+ */
+
+/*
+ * Given an AG extent, find the lowest-numbered run of shared blocks
+ * within that range and return the range in fbno/flen. If
+ * find_end_of_shared is true, return the longest contiguous extent of
+ * shared blocks. If there are no shared extents, fbno and flen will
+ * be set to NULLAGBLOCK and 0, respectively.
+ */
+int
+xfs_reflink_find_shared(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_extlen_t aglen,
+ xfs_agblock_t *fbno,
+ xfs_extlen_t *flen,
+ bool find_end_of_shared)
+{
+ struct xfs_buf *agbp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error)
+ return error;
+
+ cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
+
+ error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
+ find_end_of_shared);
+
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+ xfs_buf_relse(agbp);
+ return error;
+}
+
+/*
+ * Trim the mapping to the next block where there's a change in the
+ * shared/unshared status. More specifically, this means that we
+ * find the lowest-numbered extent of shared blocks that coincides with
+ * the given block mapping. If the shared extent overlaps the start of
+ * the mapping, trim the mapping to the end of the shared extent. If
+ * the shared region intersects the mapping, trim the mapping to the
+ * start of the shared extent. If there are no shared regions that
+ * overlap, just return the original extent.
+ */
+int
+xfs_reflink_trim_around_shared(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *irec,
+ bool *shared,
+ bool *trimmed)
+{
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error = 0;
+
+ /* Holes, unwritten, and delalloc extents cannot be shared */
+ if (!xfs_is_reflink_inode(ip) ||
+ ISUNWRITTEN(irec) ||
+ irec->br_startblock == HOLESTARTBLOCK ||
+ irec->br_startblock == DELAYSTARTBLOCK ||
+ isnullstartblock(irec->br_startblock)) {
+ *shared = false;
+ return 0;
+ }
+
+ trace_xfs_reflink_trim_around_shared(ip, irec);
+
+ agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
+ agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
+ aglen = irec->br_blockcount;
+
+ error = xfs_reflink_find_shared(ip->i_mount, agno, agbno,
+ aglen, &fbno, &flen, true);
+ if (error)
+ return error;
+
+ *shared = *trimmed = false;
+ if (fbno == NULLAGBLOCK) {
+ /* No shared blocks at all. */
+ return 0;
+ } else if (fbno == agbno) {
+ /*
+ * The start of this extent is shared. Truncate the
+ * mapping at the end of the shared region so that a
+ * subsequent iteration starts at the start of the
+ * unshared region.
+ */
+ irec->br_blockcount = flen;
+ *shared = true;
+ if (flen != aglen)
+ *trimmed = true;
+ return 0;
+ } else {
+ /*
+ * There's a shared extent midway through this extent.
+ * Truncate the mapping at the start of the shared
+ * extent so that a subsequent iteration starts at the
+ * start of the shared region.
+ */
+ irec->br_blockcount = fbno - agbno;
+ *trimmed = true;
+ return 0;
+ }
+}
+
+/*
+ * Trim the passed in imap to the next shared/unshared extent boundary, and
+ * if imap->br_startoff points to a shared extent reserve space for it in the
+ * COW fork. In this case *shared is set to true, else to false.
+ *
+ * Note that imap will always contain the block numbers for the existing blocks
+ * in the data fork, as the upper layers need them for read-modify-write
+ * operations.
+ */
+int
+xfs_reflink_reserve_cow(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ bool *shared)
+{
+ struct xfs_bmbt_irec got, prev;
+ xfs_fileoff_t end_fsb, orig_end_fsb;
+ int eof = 0, error = 0;
+ bool trimmed;
+ xfs_extnum_t idx;
+ xfs_extlen_t align;
+
+ /*
+ * Search the COW fork extent list first. This serves two purposes:
+ * first this implement the speculative preallocation using cowextisze,
+ * so that we also unshared block adjacent to shared blocks instead
+ * of just the shared blocks themselves. Second the lookup in the
+ * extent list is generally faster than going out to the shared extent
+ * tree.
+ */
+ xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx,
+ &got, &prev);
+ if (!eof && got.br_startoff <= imap->br_startoff) {
+ trace_xfs_reflink_cow_found(ip, imap);
+ xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+
+ *shared = true;
+ return 0;
+ }
+
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+ if (error)
+ return error;
+
+ /* Not shared? Just report the (potentially capped) extent. */
+ if (!*shared)
+ return 0;
+
+ /*
+ * Fork all the shared blocks from our write offset until the end of
+ * the extent.
+ */
+ error = xfs_qm_dqattach_locked(ip, 0);
+ if (error)
+ return error;
+
+ end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount;
+
+ align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip));
+ if (align)
+ end_fsb = roundup_64(end_fsb, align);
+
+retry:
+ error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
+ end_fsb - imap->br_startoff, &got, &prev, &idx, eof);
+ switch (error) {
+ case 0:
+ break;
+ case -ENOSPC:
+ case -EDQUOT:
+ /* retry without any preallocation */
+ trace_xfs_reflink_cow_enospc(ip, imap);
+ if (end_fsb != orig_end_fsb) {
+ end_fsb = orig_end_fsb;
+ goto retry;
+ }
+ /*FALLTHRU*/
+ default:
+ return error;
+ }
+
+ if (end_fsb != orig_end_fsb)
+ xfs_inode_set_cowblocks_tag(ip);
+
+ trace_xfs_reflink_cow_alloc(ip, &got);
+ return 0;
+}
+
+/* Allocate all CoW reservations covering a range of blocks in a file. */
+static int
+__xfs_reflink_allocate_cow(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ struct xfs_defer_ops dfops;
+ struct xfs_trans *tp;
+ xfs_fsblock_t first_block;
+ int nimaps = 1, error;
+ bool shared;
+
+ xfs_defer_init(&dfops, &first_block);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ /* Read extent from the source file. */
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
+ &imap, &nimaps, 0);
+ if (error)
+ goto out_unlock;
+ ASSERT(nimaps == 1);
+
+ error = xfs_reflink_reserve_cow(ip, &imap, &shared);
+ if (error)
+ goto out_trans_cancel;
+
+ if (!shared) {
+ *offset_fsb = imap.br_startoff + imap.br_blockcount;
+ goto out_trans_cancel;
+ }
+
+ xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
+ XFS_BMAPI_COWFORK, &first_block,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
+ &imap, &nimaps, &dfops);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_defer_finish(&tp, &dfops, NULL);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_trans_commit(tp);
+
+ *offset_fsb = imap.br_startoff + imap.br_blockcount;
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+out_trans_cancel:
+ xfs_defer_cancel(&dfops);
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+}
+
+/* Allocate all CoW reservations covering a part of a file. */
+int
+xfs_reflink_allocate_cow_range(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
+ int error;
+
+ ASSERT(xfs_is_reflink_inode(ip));
+
+ trace_xfs_reflink_allocate_cow_range(ip, offset, count);
+
+ /*
+ * Make sure that the dquots are there.
+ */
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ while (offset_fsb < end_fsb) {
+ error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb);
+ if (error) {
+ trace_xfs_reflink_allocate_cow_range_error(ip, error,
+ _RET_IP_);
+ break;
+ }
+ }
+
+ return error;
+}
+
+/*
+ * Find the CoW reservation (and whether or not it needs block allocation)
+ * for a given byte offset of a file.
+ */
+bool
+xfs_reflink_find_cow_mapping(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ struct xfs_bmbt_irec *imap,
+ bool *need_alloc)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_ifork *ifp;
+ struct xfs_bmbt_rec_host *gotp;
+ xfs_fileoff_t bno;
+ xfs_extnum_t idx;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
+ ASSERT(xfs_is_reflink_inode(ip));
+
+ /* Find the extent in the CoW fork. */
+ ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ bno = XFS_B_TO_FSBT(ip->i_mount, offset);
+ gotp = xfs_iext_bno_to_ext(ifp, bno, &idx);
+ if (!gotp)
+ return false;
+
+ xfs_bmbt_get_all(gotp, &irec);
+ if (bno >= irec.br_startoff + irec.br_blockcount ||
+ bno < irec.br_startoff)
+ return false;
+
+ trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
+ &irec);
+
+ /* If it's still delalloc, we must allocate later. */
+ *imap = irec;
+ *need_alloc = !!(isnullstartblock(irec.br_startblock));
+
+ return true;
+}
+
+/*
+ * Trim an extent to end at the next CoW reservation past offset_fsb.
+ */
+int
+xfs_reflink_trim_irec_to_next_cow(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_ifork *ifp;
+ struct xfs_bmbt_rec_host *gotp;
+ xfs_extnum_t idx;
+
+ if (!xfs_is_reflink_inode(ip))
+ return 0;
+
+ /* Find the extent in the CoW fork. */
+ ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
+ if (!gotp)
+ return 0;
+ xfs_bmbt_get_all(gotp, &irec);
+
+ /* This is the extent before; try sliding up one. */
+ if (irec.br_startoff < offset_fsb) {
+ idx++;
+ if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+ return 0;
+ gotp = xfs_iext_get_ext(ifp, idx);
+ xfs_bmbt_get_all(gotp, &irec);
+ }
+
+ if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
+ return 0;
+
+ imap->br_blockcount = irec.br_startoff - imap->br_startoff;
+ trace_xfs_reflink_trim_irec(ip, imap);
+
+ return 0;
+}
+
+/*
+ * Cancel all pending CoW reservations for some block range of an inode.
+ */
+int
+xfs_reflink_cancel_cow_blocks(
+ struct xfs_inode *ip,
+ struct xfs_trans **tpp,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec got, prev, del;
+ xfs_extnum_t idx;
+ xfs_fsblock_t firstfsb;
+ struct xfs_defer_ops dfops;
+ int error = 0, eof = 0;
+
+ if (!xfs_is_reflink_inode(ip))
+ return 0;
+
+ xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx,
+ &got, &prev);
+ if (eof)
+ return 0;
+
+ while (got.br_startoff < end_fsb) {
+ del = got;
+ xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
+ trace_xfs_reflink_cancel_cow(ip, &del);
+
+ if (isnullstartblock(del.br_startblock)) {
+ error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
+ &idx, &got, &del);
+ if (error)
+ break;
+ } else {
+ xfs_trans_ijoin(*tpp, ip, 0);
+ xfs_defer_init(&dfops, &firstfsb);
+
+ /* Free the CoW orphan record. */
+ error = xfs_refcount_free_cow_extent(ip->i_mount,
+ &dfops, del.br_startblock,
+ del.br_blockcount);
+ if (error)
+ break;
+
+ xfs_bmap_add_free(ip->i_mount, &dfops,
+ del.br_startblock, del.br_blockcount,
+ NULL);
+
+ /* Update quota accounting */
+ xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT,
+ -(long)del.br_blockcount);
+
+ /* Roll the transaction */
+ error = xfs_defer_finish(tpp, &dfops, ip);
+ if (error) {
+ xfs_defer_cancel(&dfops);
+ break;
+ }
+
+ /* Remove the mapping from the CoW fork. */
+ xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
+ }
+
+ if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec))
+ break;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
+ }
+
+ /* clear tag if cow fork is emptied */
+ if (!ifp->if_bytes)
+ xfs_inode_clear_cowblocks_tag(ip);
+
+ return error;
+}
+
+/*
+ * Cancel all pending CoW reservations for some byte range of an inode.
+ */
+int
+xfs_reflink_cancel_cow_range(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ struct xfs_trans *tp;
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ int error;
+
+ trace_xfs_reflink_cancel_cow_range(ip, offset, count);
+ ASSERT(xfs_is_reflink_inode(ip));
+
+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ if (count == NULLFILEOFF)
+ end_fsb = NULLFILEOFF;
+ else
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+ /* Start a rolling transaction to remove the mappings */
+ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
+ 0, 0, 0, &tp);
+ if (error)
+ goto out;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /* Scrape out the old CoW reservations */
+ error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb);
+ if (error)
+ goto out_cancel;
+
+ error = xfs_trans_commit(tp);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+ trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remap parts of a file's data fork after a successful CoW.
+ */
+int
+xfs_reflink_end_cow(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec got, prev, del;
+ struct xfs_trans *tp;
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ xfs_fsblock_t firstfsb;
+ struct xfs_defer_ops dfops;
+ int error, eof = 0;
+ unsigned int resblks;
+ xfs_filblks_t rlen;
+ xfs_extnum_t idx;
+
+ trace_xfs_reflink_end_cow(ip, offset, count);
+
+ /* No COW extents? That's easy! */
+ if (ifp->if_bytes == 0)
+ return 0;
+
+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+ /* Start a rolling transaction to switch the mappings */
+ resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
+ resblks, 0, 0, &tp);
+ if (error)
+ goto out;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx,
+ &got, &prev);
+
+ /* If there is a hole at end_fsb - 1 go to the previous extent */
+ if (eof || got.br_startoff > end_fsb) {
+ ASSERT(idx > 0);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got);
+ }
+
+ /* Walk backwards until we're out of the I/O range... */
+ while (got.br_startoff + got.br_blockcount > offset_fsb) {
+ del = got;
+ xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
+
+ /* Extent delete may have bumped idx forward */
+ if (!del.br_blockcount) {
+ idx--;
+ goto next_extent;
+ }
+
+ ASSERT(!isnullstartblock(got.br_startblock));
+
+ /* Unmap the old blocks in the data fork. */
+ xfs_defer_init(&dfops, &firstfsb);
+ rlen = del.br_blockcount;
+ error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1,
+ &firstfsb, &dfops);
+ if (error)
+ goto out_defer;
+
+ /* Trim the extent to whatever got unmapped. */
+ if (rlen) {
+ xfs_trim_extent(&del, del.br_startoff + rlen,
+ del.br_blockcount - rlen);
+ }
+ trace_xfs_reflink_cow_remap(ip, &del);
+
+ /* Free the CoW orphan record. */
+ error = xfs_refcount_free_cow_extent(tp->t_mountp, &dfops,
+ del.br_startblock, del.br_blockcount);
+ if (error)
+ goto out_defer;
+
+ /* Map the new blocks into the data fork. */
+ error = xfs_bmap_map_extent(tp->t_mountp, &dfops, ip, &del);
+ if (error)
+ goto out_defer;
+
+ /* Remove the mapping from the CoW fork. */
+ xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
+
+ error = xfs_defer_finish(&tp, &dfops, ip);
+ if (error)
+ goto out_defer;
+
+next_extent:
+ if (idx < 0)
+ break;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
+ }
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ goto out;
+ return 0;
+
+out_defer:
+ xfs_defer_cancel(&dfops);
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+ trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Free leftover CoW reservations that didn't get cleaned out.
+ */
+int
+xfs_reflink_recover_cow(
+ struct xfs_mount *mp)
+{
+ xfs_agnumber_t agno;
+ int error = 0;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ error = xfs_refcount_recover_cow_leftovers(mp, agno);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Reflinking (Block) Ranges of Two Files Together
+ *
+ * First, ensure that the reflink flag is set on both inodes. The flag is an
+ * optimization to avoid unnecessary refcount btree lookups in the write path.
+ *
+ * Now we can iteratively remap the range of extents (and holes) in src to the
+ * corresponding ranges in dest. Let drange and srange denote the ranges of
+ * logical blocks in dest and src touched by the reflink operation.
+ *
+ * While the length of drange is greater than zero,
+ * - Read src's bmbt at the start of srange ("imap")
+ * - If imap doesn't exist, make imap appear to start at the end of srange
+ * with zero length.
+ * - If imap starts before srange, advance imap to start at srange.
+ * - If imap goes beyond srange, truncate imap to end at the end of srange.
+ * - Punch (imap start - srange start + imap len) blocks from dest at
+ * offset (drange start).
+ * - If imap points to a real range of pblks,
+ * > Increase the refcount of the imap's pblks
+ * > Map imap's pblks into dest at the offset
+ * (drange start + imap start - srange start)
+ * - Advance drange and srange by (imap start - srange start + imap len)
+ *
+ * Finally, if the reflink made dest longer, update both the in-core and
+ * on-disk file sizes.
+ *
+ * ASCII Art Demonstration:
+ *
+ * Let's say we want to reflink this source file:
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS (src file)
+ * <-------------------->
+ *
+ * into this destination file:
+ *
+ * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
+ * <-------------------->
+ * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
+ * Observe that the range has different logical offsets in either file.
+ *
+ * Consider that the first extent in the source file doesn't line up with our
+ * reflink range. Unmapping and remapping are separate operations, so we can
+ * unmap more blocks from the destination file than we remap.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ * <------->
+ * --DDDDD---------DDDDD--DDD
+ * <------->
+ *
+ * Now remap the source extent into the destination file:
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ * <------->
+ * --DDDDD--SSSSSSSDDDDD--DDD
+ * <------->
+ *
+ * Do likewise with the second hole and extent in our range. Holes in the
+ * unmap range don't affect our operation.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ * <---->
+ * --DDDDD--SSSSSSS-SSSSS-DDD
+ * <---->
+ *
+ * Finally, unmap and remap part of the third extent. This will increase the
+ * size of the destination file.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ * <----->
+ * --DDDDD--SSSSSSS-SSSSS----SSS
+ * <----->
+ *
+ * Once we update the destination file's i_size, we're done.
+ */
+
+/*
+ * Ensure the reflink bit is set in both inodes.
+ */
+STATIC int
+xfs_reflink_set_inode_flag(
+ struct xfs_inode *src,
+ struct xfs_inode *dest)
+{
+ struct xfs_mount *mp = src->i_mount;
+ int error;
+ struct xfs_trans *tp;
+
+ if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
+ return 0;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ goto out_error;
+
+ /* Lock both files against IO */
+ if (src->i_ino == dest->i_ino)
+ xfs_ilock(src, XFS_ILOCK_EXCL);
+ else
+ xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL);
+
+ if (!xfs_is_reflink_inode(src)) {
+ trace_xfs_reflink_set_inode_flag(src);
+ xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
+ src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+ xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
+ xfs_ifork_init_cow(src);
+ } else
+ xfs_iunlock(src, XFS_ILOCK_EXCL);
+
+ if (src->i_ino == dest->i_ino)
+ goto commit_flags;
+
+ if (!xfs_is_reflink_inode(dest)) {
+ trace_xfs_reflink_set_inode_flag(dest);
+ xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+ dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+ xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+ xfs_ifork_init_cow(dest);
+ } else
+ xfs_iunlock(dest, XFS_ILOCK_EXCL);
+
+commit_flags:
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_error;
+ return error;
+
+out_error:
+ trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Update destination inode size & cowextsize hint, if necessary.
+ */
+STATIC int
+xfs_reflink_update_dest(
+ struct xfs_inode *dest,
+ xfs_off_t newlen,
+ xfs_extlen_t cowextsize)
+{
+ struct xfs_mount *mp = dest->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+ return 0;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ goto out_error;
+
+ xfs_ilock(dest, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+
+ if (newlen > i_size_read(VFS_I(dest))) {
+ trace_xfs_reflink_update_inode_size(dest, newlen);
+ i_size_write(VFS_I(dest), newlen);
+ dest->i_d.di_size = newlen;
+ }
+
+ if (cowextsize) {
+ dest->i_d.di_cowextsize = cowextsize;
+ dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ }
+
+ xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_error;
+ return error;
+
+out_error:
+ trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Do we have enough reserve in this AG to handle a reflink? The refcount
+ * btree already reserved all the space it needs, but the rmap btree can grow
+ * infinitely, so we won't allow more reflinks when the AG is down to the
+ * btree reserves.
+ */
+static int
+xfs_reflink_ag_has_free_space(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return 0;
+
+ pag = xfs_perag_get(mp, agno);
+ if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) ||
+ xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
+ error = -ENOSPC;
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Unmap a range of blocks from a file, then map other blocks into the hole.
+ * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
+ * The extent irec is mapped into dest at irec->br_startoff.
+ */
+STATIC int
+xfs_reflink_remap_extent(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *irec,
+ xfs_fileoff_t destoff,
+ xfs_off_t new_isize)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ xfs_fsblock_t firstfsb;
+ unsigned int resblks;
+ struct xfs_defer_ops dfops;
+ struct xfs_bmbt_irec uirec;
+ bool real_extent;
+ xfs_filblks_t rlen;
+ xfs_filblks_t unmap_len;
+ xfs_off_t newlen;
+ int error;
+
+ unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
+ trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
+
+ /* Only remap normal extents. */
+ real_extent = (irec->br_startblock != HOLESTARTBLOCK &&
+ irec->br_startblock != DELAYSTARTBLOCK &&
+ !ISUNWRITTEN(irec));
+
+ /* No reflinking if we're low on space */
+ if (real_extent) {
+ error = xfs_reflink_ag_has_free_space(mp,
+ XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+ if (error)
+ goto out;
+ }
+
+ /* Start a rolling transaction to switch the mappings */
+ resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ if (error)
+ goto out;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /* If we're not just clearing space, then do we have enough quota? */
+ if (real_extent) {
+ error = xfs_trans_reserve_quota_nblks(tp, ip,
+ irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out_cancel;
+ }
+
+ trace_xfs_reflink_remap(ip, irec->br_startoff,
+ irec->br_blockcount, irec->br_startblock);
+
+ /* Unmap the old blocks in the data fork. */
+ rlen = unmap_len;
+ while (rlen) {
+ xfs_defer_init(&dfops, &firstfsb);
+ error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1,
+ &firstfsb, &dfops);
+ if (error)
+ goto out_defer;
+
+ /*
+ * Trim the extent to whatever got unmapped.
+ * Remember, bunmapi works backwards.
+ */
+ uirec.br_startblock = irec->br_startblock + rlen;
+ uirec.br_startoff = irec->br_startoff + rlen;
+ uirec.br_blockcount = unmap_len - rlen;
+ unmap_len = rlen;
+
+ /* If this isn't a real mapping, we're done. */
+ if (!real_extent || uirec.br_blockcount == 0)
+ goto next_extent;
+
+ trace_xfs_reflink_remap(ip, uirec.br_startoff,
+ uirec.br_blockcount, uirec.br_startblock);
+
+ /* Update the refcount tree */
+ error = xfs_refcount_increase_extent(mp, &dfops, &uirec);
+ if (error)
+ goto out_defer;
+
+ /* Map the new blocks into the data fork. */
+ error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec);
+ if (error)
+ goto out_defer;
+
+ /* Update quota accounting. */
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+ uirec.br_blockcount);
+
+ /* Update dest isize if needed. */
+ newlen = XFS_FSB_TO_B(mp,
+ uirec.br_startoff + uirec.br_blockcount);
+ newlen = min_t(xfs_off_t, newlen, new_isize);
+ if (newlen > i_size_read(VFS_I(ip))) {
+ trace_xfs_reflink_update_inode_size(ip, newlen);
+ i_size_write(VFS_I(ip), newlen);
+ ip->i_d.di_size = newlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
+
+next_extent:
+ /* Process all the deferred stuff. */
+ error = xfs_defer_finish(&tp, &dfops, ip);
+ if (error)
+ goto out_defer;
+ }
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ goto out;
+ return 0;
+
+out_defer:
+ xfs_defer_cancel(&dfops);
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+ trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Iteratively remap one file's extents (and holes) to another's.
+ */
+STATIC int
+xfs_reflink_remap_blocks(
+ struct xfs_inode *src,
+ xfs_fileoff_t srcoff,
+ struct xfs_inode *dest,
+ xfs_fileoff_t destoff,
+ xfs_filblks_t len,
+ xfs_off_t new_isize)
+{
+ struct xfs_bmbt_irec imap;
+ int nimaps;
+ int error = 0;
+ xfs_filblks_t range_len;
+
+ /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
+ while (len) {
+ trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
+ dest, destoff);
+ /* Read extent from the source file */
+ nimaps = 1;
+ xfs_ilock(src, XFS_ILOCK_EXCL);
+ error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
+ xfs_iunlock(src, XFS_ILOCK_EXCL);
+ if (error)
+ goto err;
+ ASSERT(nimaps == 1);
+
+ trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+ &imap);
+
+ /* Translate imap into the destination file. */
+ range_len = imap.br_startoff + imap.br_blockcount - srcoff;
+ imap.br_startoff += destoff - srcoff;
+
+ /* Clear dest from destoff to the end of imap and map it in. */
+ error = xfs_reflink_remap_extent(dest, &imap, destoff,
+ new_isize);
+ if (error)
+ goto err;
+
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ goto err;
+ }
+
+ /* Advance drange/srange */
+ srcoff += range_len;
+ destoff += range_len;
+ len -= range_len;
+ }
+
+ return 0;
+
+err:
+ trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Read a page's worth of file data into the page cache. Return the page
+ * locked.
+ */
+static struct page *
+xfs_get_page(
+ struct inode *inode,
+ xfs_off_t offset)
+{
+ struct address_space *mapping;
+ struct page *page;
+ pgoff_t n;
+
+ n = offset >> PAGE_SHIFT;
+ mapping = inode->i_mapping;
+ page = read_mapping_page(mapping, n, NULL);
+ if (IS_ERR(page))
+ return page;
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ lock_page(page);
+ return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ */
+static int
+xfs_compare_extents(
+ struct inode *src,
+ xfs_off_t srcoff,
+ struct inode *dest,
+ xfs_off_t destoff,
+ xfs_off_t len,
+ bool *is_same)
+{
+ xfs_off_t src_poff;
+ xfs_off_t dest_poff;
+ void *src_addr;
+ void *dest_addr;
+ struct page *src_page;
+ struct page *dest_page;
+ xfs_off_t cmp_len;
+ bool same;
+ int error;
+
+ error = -EINVAL;
+ same = true;
+ while (len) {
+ src_poff = srcoff & (PAGE_SIZE - 1);
+ dest_poff = destoff & (PAGE_SIZE - 1);
+ cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+ cmp_len = min(cmp_len, len);
+ ASSERT(cmp_len > 0);
+
+ trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len,
+ XFS_I(dest), destoff);
+
+ src_page = xfs_get_page(src, srcoff);
+ if (IS_ERR(src_page)) {
+ error = PTR_ERR(src_page);
+ goto out_error;
+ }
+ dest_page = xfs_get_page(dest, destoff);
+ if (IS_ERR(dest_page)) {
+ error = PTR_ERR(dest_page);
+ unlock_page(src_page);
+ put_page(src_page);
+ goto out_error;
+ }
+ src_addr = kmap_atomic(src_page);
+ dest_addr = kmap_atomic(dest_page);
+
+ flush_dcache_page(src_page);
+ flush_dcache_page(dest_page);
+
+ if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+ same = false;
+
+ kunmap_atomic(dest_addr);
+ kunmap_atomic(src_addr);
+ unlock_page(dest_page);
+ unlock_page(src_page);
+ put_page(dest_page);
+ put_page(src_page);
+
+ if (!same)
+ break;
+
+ srcoff += cmp_len;
+ destoff += cmp_len;
+ len -= cmp_len;
+ }
+
+ *is_same = same;
+ return 0;
+
+out_error:
+ trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Link a range of blocks from one file to another.
+ */
+int
+xfs_reflink_remap_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ struct xfs_mount *mp = src->i_mount;
+ loff_t bs = inode_out->i_sb->s_blocksize;
+ bool same_inode = (inode_in == inode_out);
+ xfs_fileoff_t sfsbno, dfsbno;
+ xfs_filblks_t fsblen;
+ xfs_extlen_t cowextsize;
+ loff_t isize;
+ ssize_t ret;
+ loff_t blen;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /* Lock both files against IO */
+ if (same_inode) {
+ xfs_ilock(src, XFS_IOLOCK_EXCL);
+ xfs_ilock(src, XFS_MMAPLOCK_EXCL);
+ } else {
+ xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
+ xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
+ }
+
+ /* Don't touch certain kinds of inodes */
+ ret = -EPERM;
+ if (IS_IMMUTABLE(inode_out))
+ goto out_unlock;
+
+ ret = -ETXTBSY;
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ goto out_unlock;
+
+
+ /* Don't reflink dirs, pipes, sockets... */
+ ret = -EISDIR;
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ goto out_unlock;
+ ret = -EINVAL;
+ if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+ goto out_unlock;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ goto out_unlock;
+
+ /* Don't reflink realtime inodes */
+ if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+ goto out_unlock;
+
+ /* Don't share DAX file data for now. */
+ if (IS_DAX(inode_in) || IS_DAX(inode_out))
+ goto out_unlock;
+
+ /* Are we going all the way to the end? */
+ isize = i_size_read(inode_in);
+ if (isize == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (len == 0)
+ len = isize - pos_in;
+
+ /* Ensure offsets don't wrap and the input is inside i_size */
+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
+ pos_in + len > isize)
+ goto out_unlock;
+
+ /* Don't allow dedupe past EOF in the dest file */
+ if (is_dedupe) {
+ loff_t disize;
+
+ disize = i_size_read(inode_out);
+ if (pos_out >= disize || pos_out + len > disize)
+ goto out_unlock;
+ }
+
+ /* If we're linking to EOF, continue to the block boundary. */
+ if (pos_in + len == isize)
+ blen = ALIGN(isize, bs) - pos_in;
+ else
+ blen = len;
+
+ /* Only reflink if we're aligned to block boundaries */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+ !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+ goto out_unlock;
+
+ /* Don't allow overlapped reflink within the same file */
+ if (same_inode) {
+ if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+ goto out_unlock;
+ }
+
+ /* Wait for the completion of any pending IOs on both files */
+ inode_dio_wait(inode_in);
+ if (!same_inode)
+ inode_dio_wait(inode_out);
+
+ ret = filemap_write_and_wait_range(inode_in->i_mapping,
+ pos_in, pos_in + len - 1);
+ if (ret)
+ goto out_unlock;
+
+ ret = filemap_write_and_wait_range(inode_out->i_mapping,
+ pos_out, pos_out + len - 1);
+ if (ret)
+ goto out_unlock;
+
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
+ /*
+ * Check that the extents are the same.
+ */
+ if (is_dedupe) {
+ bool is_same = false;
+
+ ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out,
+ len, &is_same);
+ if (ret)
+ goto out_unlock;
+ if (!is_same) {
+ ret = -EBADE;
+ goto out_unlock;
+ }
+ }
+
+ ret = xfs_reflink_set_inode_flag(src, dest);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Invalidate the page cache so that we can clear any CoW mappings
+ * in the destination file.
+ */
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
+ PAGE_ALIGN(pos_out + len) - 1);
+
+ dfsbno = XFS_B_TO_FSBT(mp, pos_out);
+ sfsbno = XFS_B_TO_FSBT(mp, pos_in);
+ fsblen = XFS_B_TO_FSB(mp, len);
+ ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
+ pos_out + len);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Carry the cowextsize hint from src to dest if we're sharing the
+ * entire source file to the entire destination file, the source file
+ * has a cowextsize hint, and the destination file does not.
+ */
+ cowextsize = 0;
+ if (pos_in == 0 && len == i_size_read(inode_in) &&
+ (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ pos_out == 0 && len >= i_size_read(inode_out) &&
+ !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+ cowextsize = src->i_d.di_cowextsize;
+
+ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize);
+
+out_unlock:
+ xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(src, XFS_IOLOCK_EXCL);
+ if (src->i_ino != dest->i_ino) {
+ xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(dest, XFS_IOLOCK_EXCL);
+ }
+ if (ret)
+ trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+ return ret;
+}
+
+/*
+ * The user wants to preemptively CoW all shared blocks in this file,
+ * which enables us to turn off the reflink flag. Iterate all
+ * extents which are not prealloc/delalloc to see which ranges are
+ * mentioned in the refcount tree, then read those blocks into the
+ * pagecache, dirty them, fsync them back out, and then we can update
+ * the inode flag. What happens if we run out of memory? :)
+ */
+STATIC int
+xfs_reflink_dirty_extents(
+ struct xfs_inode *ip,
+ xfs_fileoff_t fbno,
+ xfs_filblks_t end,
+ xfs_off_t isize)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agblock_t rbno;
+ xfs_extlen_t rlen;
+ xfs_off_t fpos;
+ xfs_off_t flen;
+ struct xfs_bmbt_irec map[2];
+ int nmaps;
+ int error = 0;
+
+ while (end - fbno > 0) {
+ nmaps = 1;
+ /*
+ * Look for extents in the file. Skip holes, delalloc, or
+ * unwritten extents; they can't be reflinked.
+ */
+ error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
+ if (error)
+ goto out;
+ if (nmaps == 0)
+ break;
+ if (map[0].br_startblock == HOLESTARTBLOCK ||
+ map[0].br_startblock == DELAYSTARTBLOCK ||
+ ISUNWRITTEN(&map[0]))
+ goto next;
+
+ map[1] = map[0];
+ while (map[1].br_blockcount) {
+ agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
+ aglen = map[1].br_blockcount;
+
+ error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
+ &rbno, &rlen, true);
+ if (error)
+ goto out;
+ if (rbno == NULLAGBLOCK)
+ break;
+
+ /* Dirty the pages */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
+ (rbno - agbno));
+ flen = XFS_FSB_TO_B(mp, rlen);
+ if (fpos + flen > isize)
+ flen = isize - fpos;
+ error = iomap_file_dirty(VFS_I(ip), fpos, flen,
+ &xfs_iomap_ops);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ goto out;
+
+ map[1].br_blockcount -= (rbno - agbno + rlen);
+ map[1].br_startoff += (rbno - agbno + rlen);
+ map[1].br_startblock += (rbno - agbno + rlen);
+ }
+
+next:
+ fbno = map[0].br_startoff + map[0].br_blockcount;
+ }
+out:
+ return error;
+}
+
+/* Clear the inode reflink flag if there are no shared extents. */
+int
+xfs_reflink_clear_inode_flag(
+ struct xfs_inode *ip,
+ struct xfs_trans **tpp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t fbno;
+ xfs_filblks_t end;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agblock_t rbno;
+ xfs_extlen_t rlen;
+ struct xfs_bmbt_irec map;
+ int nmaps;
+ int error = 0;
+
+ ASSERT(xfs_is_reflink_inode(ip));
+
+ fbno = 0;
+ end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));
+ while (end - fbno > 0) {
+ nmaps = 1;
+ /*
+ * Look for extents in the file. Skip holes, delalloc, or
+ * unwritten extents; they can't be reflinked.
+ */
+ error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0);
+ if (error)
+ return error;
+ if (nmaps == 0)
+ break;
+ if (map.br_startblock == HOLESTARTBLOCK ||
+ map.br_startblock == DELAYSTARTBLOCK ||
+ ISUNWRITTEN(&map))
+ goto next;
+
+ agno = XFS_FSB_TO_AGNO(mp, map.br_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock);
+ aglen = map.br_blockcount;
+
+ error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
+ &rbno, &rlen, false);
+ if (error)
+ return error;
+ /* Is there still a shared block here? */
+ if (rbno != NULLAGBLOCK)
+ return 0;
+next:
+ fbno = map.br_startoff + map.br_blockcount;
+ }
+
+ /*
+ * We didn't find any shared blocks so turn off the reflink flag.
+ * First, get rid of any leftover CoW mappings.
+ */
+ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF);
+ if (error)
+ return error;
+
+ /* Clear the inode flag. */
+ trace_xfs_reflink_unset_inode_flag(ip);
+ ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ xfs_inode_clear_cowblocks_tag(ip);
+ xfs_trans_ijoin(*tpp, ip, 0);
+ xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+
+ return error;
+}
+
+/*
+ * Clear the inode reflink flag if there are no shared extents and the size
+ * hasn't changed.
+ */
+STATIC int
+xfs_reflink_try_clear_inode_flag(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error = 0;
+
+ /* Start a rolling transaction to remove the mappings */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_reflink_clear_inode_flag(ip, &tp);
+ if (error)
+ goto cancel;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+cancel:
+ xfs_trans_cancel(tp);
+out:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Pre-COW all shared blocks within a given byte range of a file and turn off
+ * the reflink flag if we unshare all of the file's blocks.
+ */
+int
+xfs_reflink_unshare(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t fbno;
+ xfs_filblks_t end;
+ xfs_off_t isize;
+ int error;
+
+ if (!xfs_is_reflink_inode(ip))
+ return 0;
+
+ trace_xfs_reflink_unshare(ip, offset, len);
+
+ inode_dio_wait(VFS_I(ip));
+
+ /* Try to CoW the selected ranges */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ fbno = XFS_B_TO_FSBT(mp, offset);
+ isize = i_size_read(VFS_I(ip));
+ end = XFS_B_TO_FSB(mp, offset + len);
+ error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
+ if (error)
+ goto out_unlock;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ /* Wait for the IO to finish */
+ error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (error)
+ goto out;
+
+ /* Turn off the reflink flag if possible. */
+ error = xfs_reflink_try_clear_inode_flag(ip);
+ if (error)
+ goto out;
+
+ return 0;
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+ trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Does this inode have any real CoW reservations?
+ */
+bool
+xfs_reflink_has_real_cow_blocks(
+ struct xfs_inode *ip)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_ifork *ifp;
+ struct xfs_bmbt_rec_host *gotp;
+ xfs_extnum_t idx;
+
+ if (!xfs_is_reflink_inode(ip))
+ return false;
+
+ /* Go find the old extent in the CoW fork. */
+ ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ gotp = xfs_iext_bno_to_ext(ifp, 0, &idx);
+ while (gotp) {
+ xfs_bmbt_get_all(gotp, &irec);
+
+ if (!isnullstartblock(irec.br_startblock))
+ return true;
+
+ /* Roll on... */
+ idx++;
+ if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+ break;
+ gotp = xfs_iext_get_ext(ifp, idx);
+ }
+
+ return false;
+}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
new file mode 100644
index 000000000000..fad11607c9ad
--- /dev/null
+++ b/fs/xfs/xfs_reflink.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_REFLINK_H
+#define __XFS_REFLINK_H 1
+
+extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
+ xfs_extlen_t *flen, bool find_maximal);
+extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
+ struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
+
+extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap, bool *shared);
+extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
+ xfs_off_t offset, xfs_off_t count);
+extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
+ struct xfs_bmbt_irec *imap, bool *need_alloc);
+extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap);
+
+extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
+ struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb);
+extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t count);
+extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t count);
+extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
+extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
+extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
+ struct xfs_trans **tpp);
+extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+
+extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip);
+
+#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 2500f28689d5..73c827831551 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -51,28 +51,16 @@ xfs_rui_item_free(
kmem_zone_free(xfs_rui_zone, ruip);
}
-/*
- * This returns the number of iovecs needed to log the given rui item.
- * We only need 1 iovec for an rui item. It just logs the rui_log_format
- * structure.
- */
-static inline int
-xfs_rui_item_sizeof(
- struct xfs_rui_log_item *ruip)
-{
- return sizeof(struct xfs_rui_log_format) +
- (ruip->rui_format.rui_nextents - 1) *
- sizeof(struct xfs_map_extent);
-}
-
STATIC void
xfs_rui_item_size(
struct xfs_log_item *lip,
int *nvecs,
int *nbytes)
{
+ struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+
*nvecs += 1;
- *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip));
+ *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
}
/*
@@ -97,7 +85,7 @@ xfs_rui_item_format(
ruip->rui_format.rui_size = 1;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
- xfs_rui_item_sizeof(ruip));
+ xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents));
}
/*
@@ -205,16 +193,12 @@ xfs_rui_init(
{
struct xfs_rui_log_item *ruip;
- uint size;
ASSERT(nextents > 0);
- if (nextents > XFS_RUI_MAX_FAST_EXTENTS) {
- size = (uint)(sizeof(struct xfs_rui_log_item) +
- ((nextents - 1) * sizeof(struct xfs_map_extent)));
- ruip = kmem_zalloc(size, KM_SLEEP);
- } else {
+ if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
+ ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP);
+ else
ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
- }
xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
ruip->rui_format.rui_nextents = nextents;
@@ -239,14 +223,12 @@ xfs_rui_copy_format(
uint len;
src_rui_fmt = buf->i_addr;
- len = sizeof(struct xfs_rui_log_format) +
- (src_rui_fmt->rui_nextents - 1) *
- sizeof(struct xfs_map_extent);
+ len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
if (buf->i_len != len)
return -EFSCORRUPTED;
- memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len);
+ memcpy(dst_rui_fmt, src_rui_fmt, len);
return 0;
}
@@ -459,8 +441,11 @@ xfs_rui_recover(
XFS_FSB_TO_DADDR(mp, rmap->me_startblock));
switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
case XFS_RMAP_EXTENT_MAP:
+ case XFS_RMAP_EXTENT_MAP_SHARED:
case XFS_RMAP_EXTENT_UNMAP:
+ case XFS_RMAP_EXTENT_UNMAP_SHARED:
case XFS_RMAP_EXTENT_CONVERT:
+ case XFS_RMAP_EXTENT_CONVERT_SHARED:
case XFS_RMAP_EXTENT_ALLOC:
case XFS_RMAP_EXTENT_FREE:
op_ok = true;
@@ -499,12 +484,21 @@ xfs_rui_recover(
case XFS_RMAP_EXTENT_MAP:
type = XFS_RMAP_MAP;
break;
+ case XFS_RMAP_EXTENT_MAP_SHARED:
+ type = XFS_RMAP_MAP_SHARED;
+ break;
case XFS_RMAP_EXTENT_UNMAP:
type = XFS_RMAP_UNMAP;
break;
+ case XFS_RMAP_EXTENT_UNMAP_SHARED:
+ type = XFS_RMAP_UNMAP_SHARED;
+ break;
case XFS_RMAP_EXTENT_CONVERT:
type = XFS_RMAP_CONVERT;
break;
+ case XFS_RMAP_EXTENT_CONVERT_SHARED:
+ type = XFS_RMAP_CONVERT_SHARED;
+ break;
case XFS_RMAP_EXTENT_ALLOC:
type = XFS_RMAP_ALLOC;
break;
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index aefcc3a318a5..340c968e1f9c 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -70,6 +70,14 @@ struct xfs_rui_log_item {
struct xfs_rui_log_format rui_format;
};
+static inline size_t
+xfs_rui_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_rui_log_item, rui_format) +
+ xfs_rui_log_format_sizeof(nr);
+}
+
/*
* This is the "rmap update done" log item. It is used to log the fact that
* some rmapbt updates mentioned in an earlier rui item have been performed.
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 6e812fe0fd43..12d48cd8f8a4 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -62,6 +62,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{ "ibt2", XFSSTAT_END_IBT_V2 },
{ "fibt2", XFSSTAT_END_FIBT_V2 },
{ "rmapbt", XFSSTAT_END_RMAP_V2 },
+ { "refcntbt", XFSSTAT_END_REFCOUNT },
/* we print both series of quota information together */
{ "qm", XFSSTAT_END_QM },
};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 657865f51e78..79ad2e69fc33 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -213,7 +213,23 @@ struct xfsstats {
__uint32_t xs_rmap_2_alloc;
__uint32_t xs_rmap_2_free;
__uint32_t xs_rmap_2_moves;
-#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_RMAP_V2+6)
+#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15)
+ __uint32_t xs_refcbt_2_lookup;
+ __uint32_t xs_refcbt_2_compare;
+ __uint32_t xs_refcbt_2_insrec;
+ __uint32_t xs_refcbt_2_delrec;
+ __uint32_t xs_refcbt_2_newroot;
+ __uint32_t xs_refcbt_2_killroot;
+ __uint32_t xs_refcbt_2_increment;
+ __uint32_t xs_refcbt_2_decrement;
+ __uint32_t xs_refcbt_2_lshift;
+ __uint32_t xs_refcbt_2_rshift;
+ __uint32_t xs_refcbt_2_split;
+ __uint32_t xs_refcbt_2_join;
+ __uint32_t xs_refcbt_2_alloc;
+ __uint32_t xs_refcbt_2_free;
+ __uint32_t xs_refcbt_2_moves;
+#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6)
__uint32_t xs_qm_dqreclaims;
__uint32_t xs_qm_dqreclaim_misses;
__uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fd6be45b3a1e..ade4691e3f74 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -47,6 +47,9 @@
#include "xfs_sysfs.h"
#include "xfs_ondisk.h"
#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_reflink.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -936,6 +939,7 @@ xfs_fs_destroy_inode(
struct inode *inode)
{
struct xfs_inode *ip = XFS_I(inode);
+ int error;
trace_xfs_destroy_inode(ip);
@@ -943,6 +947,14 @@ xfs_fs_destroy_inode(
XFS_STATS_INC(ip->i_mount, vn_rele);
XFS_STATS_INC(ip->i_mount, vn_remove);
+ if (xfs_is_reflink_inode(ip)) {
+ error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+ if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount))
+ xfs_warn(ip->i_mount,
+"Error %d while evicting CoW blocks for inode %llu.",
+ error, ip->i_ino);
+ }
+
xfs_inactive(ip);
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -1006,6 +1018,16 @@ xfs_fs_drop_inode(
{
struct xfs_inode *ip = XFS_I(inode);
+ /*
+ * If this unlinked inode is in the middle of recovery, don't
+ * drop the inode just yet; log recovery will take care of
+ * that. See the comment for this inode flag.
+ */
+ if (ip->i_flags & XFS_IRECOVERY) {
+ ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED);
+ return 0;
+ }
+
return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
}
@@ -1137,7 +1159,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
* Note: xfs_log_quiesce() stops background log work - the callers must ensure
* it is started again when appropriate.
*/
-static void
+void
xfs_quiesce_attr(
struct xfs_mount *mp)
{
@@ -1296,10 +1318,31 @@ xfs_fs_remount(
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
xfs_queue_eofblocks(mp);
+
+ /* Recover any CoW blocks that never got remapped. */
+ error = xfs_reflink_recover_cow(mp);
+ if (error) {
+ xfs_err(mp,
+ "Error %d recovering leftover CoW allocations.", error);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ /* Create the per-AG metadata reservation pool .*/
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error != -ENOSPC)
+ return error;
}
/* rw -> ro */
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+ /* Free the per-AG metadata reservation pool. */
+ error = xfs_fs_unreserve_ag_blocks(mp);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
/*
* Before we sync the metadata, we need to free up the reserve
* block pool so that the used block count in the superblock on
@@ -1490,6 +1533,7 @@ xfs_fs_fill_super(
atomic_set(&mp->m_active_trans, 0);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+ INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
mp->m_super = sb;
@@ -1572,6 +1616,9 @@ xfs_fs_fill_super(
"DAX unsupported by block device. Turning off DAX.");
mp->m_flags &= ~XFS_MOUNT_DAX;
}
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ xfs_alert(mp,
+ "DAX and reflink have not been tested together!");
}
if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
@@ -1585,6 +1632,10 @@ xfs_fs_fill_super(
"EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!");
}
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ xfs_alert(mp,
+ "EXPERIMENTAL reflink feature enabled. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1782,15 +1833,44 @@ xfs_init_zones(void)
if (!xfs_rud_zone)
goto out_destroy_icreate_zone;
- xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) +
- ((XFS_RUI_MAX_FAST_EXTENTS - 1) *
- sizeof(struct xfs_map_extent))),
+ xfs_rui_zone = kmem_zone_init(
+ xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
"xfs_rui_item");
if (!xfs_rui_zone)
goto out_destroy_rud_zone;
+ xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item),
+ "xfs_cud_item");
+ if (!xfs_cud_zone)
+ goto out_destroy_rui_zone;
+
+ xfs_cui_zone = kmem_zone_init(
+ xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS),
+ "xfs_cui_item");
+ if (!xfs_cui_zone)
+ goto out_destroy_cud_zone;
+
+ xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item),
+ "xfs_bud_item");
+ if (!xfs_bud_zone)
+ goto out_destroy_cui_zone;
+
+ xfs_bui_zone = kmem_zone_init(
+ xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS),
+ "xfs_bui_item");
+ if (!xfs_bui_zone)
+ goto out_destroy_bud_zone;
+
return 0;
+ out_destroy_bud_zone:
+ kmem_zone_destroy(xfs_bud_zone);
+ out_destroy_cui_zone:
+ kmem_zone_destroy(xfs_cui_zone);
+ out_destroy_cud_zone:
+ kmem_zone_destroy(xfs_cud_zone);
+ out_destroy_rui_zone:
+ kmem_zone_destroy(xfs_rui_zone);
out_destroy_rud_zone:
kmem_zone_destroy(xfs_rud_zone);
out_destroy_icreate_zone:
@@ -1833,6 +1913,10 @@ xfs_destroy_zones(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_zone_destroy(xfs_bui_zone);
+ kmem_zone_destroy(xfs_bud_zone);
+ kmem_zone_destroy(xfs_cui_zone);
+ kmem_zone_destroy(xfs_cud_zone);
kmem_zone_destroy(xfs_rui_zone);
kmem_zone_destroy(xfs_rud_zone);
kmem_zone_destroy(xfs_icreate_zone);
@@ -1886,6 +1970,8 @@ init_xfs_fs(void)
xfs_extent_free_init_defer_op();
xfs_rmap_update_init_defer_op();
+ xfs_refcount_update_init_defer_op();
+ xfs_bmap_update_init_defer_op();
xfs_dir_startup();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 529bce9fc37e..b6418abd85ad 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -61,6 +61,7 @@ struct xfs_mount;
struct xfs_buftarg;
struct block_device;
+extern void xfs_quiesce_attr(struct xfs_mount *mp);
extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index aed74d3f8da9..afe1f66aaa69 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -184,6 +184,15 @@ static struct ctl_table xfs_table[] = {
.extra1 = &xfs_params.eofb_timer.min,
.extra2 = &xfs_params.eofb_timer.max,
},
+ {
+ .procname = "speculative_cow_prealloc_lifetime",
+ .data = &xfs_params.cowb_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.cowb_timer.min,
+ .extra2 = &xfs_params.cowb_timer.max,
+ },
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
{
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index ffef45375754..984a3499cfe3 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -48,6 +48,7 @@ typedef struct xfs_param {
xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */
+ xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */
} xfs_param_t;
/*
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 79cfd3fc5324..276d3023d60f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -393,9 +393,15 @@ max_retries_show(
struct kobject *kobject,
char *buf)
{
+ int retries;
struct xfs_error_cfg *cfg = to_error_cfg(kobject);
- return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries);
+ if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
+ retries = -1;
+ else
+ retries = cfg->max_retries;
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", retries);
}
static ssize_t
@@ -415,7 +421,10 @@ max_retries_store(
if (val < -1)
return -EINVAL;
- cfg->max_retries = val;
+ if (val == -1)
+ cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+ else
+ cfg->max_retries = val;
return count;
}
XFS_SYSFS_ATTR_RW(max_retries);
@@ -425,10 +434,15 @@ retry_timeout_seconds_show(
struct kobject *kobject,
char *buf)
{
+ int timeout;
struct xfs_error_cfg *cfg = to_error_cfg(kobject);
- return snprintf(buf, PAGE_SIZE, "%ld\n",
- jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC);
+ if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
+ timeout = -1;
+ else
+ timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
}
static ssize_t
@@ -445,11 +459,16 @@ retry_timeout_seconds_store(
if (ret)
return ret;
- /* 1 day timeout maximum */
- if (val < 0 || val > 86400)
+ /* 1 day timeout maximum, -1 means infinite */
+ if (val < -1 || val > 86400)
return -EINVAL;
- cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+ if (val == -1)
+ cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+ else {
+ cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+ ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX);
+ }
return count;
}
XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
@@ -493,13 +512,13 @@ static struct attribute *xfs_error_attrs[] = {
};
-struct kobj_type xfs_error_cfg_ktype = {
+static struct kobj_type xfs_error_cfg_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_attrs = xfs_error_attrs,
};
-struct kobj_type xfs_error_ktype = {
+static struct kobj_type xfs_error_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
};
@@ -519,18 +538,19 @@ struct xfs_error_init {
static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
{ .name = "default",
.max_retries = XFS_ERR_RETRY_FOREVER,
- .retry_timeout = 0,
+ .retry_timeout = XFS_ERR_RETRY_FOREVER,
},
{ .name = "EIO",
.max_retries = XFS_ERR_RETRY_FOREVER,
- .retry_timeout = 0,
+ .retry_timeout = XFS_ERR_RETRY_FOREVER,
},
{ .name = "ENOSPC",
.max_retries = XFS_ERR_RETRY_FOREVER,
- .retry_timeout = 0,
+ .retry_timeout = XFS_ERR_RETRY_FOREVER,
},
{ .name = "ENODEV",
- .max_retries = 0,
+ .max_retries = 0, /* We can't recover from devices disappearing */
+ .retry_timeout = 0,
},
};
@@ -561,7 +581,10 @@ xfs_error_sysfs_init_class(
goto out_error;
cfg->max_retries = init[i].max_retries;
- cfg->retry_timeout = msecs_to_jiffies(
+ if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER)
+ cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+ else
+ cfg->retry_timeout = msecs_to_jiffies(
init[i].retry_timeout * MSEC_PER_SEC);
}
return 0;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index d303a665dba9..0907752be62d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -39,6 +39,7 @@ struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
struct xfs_btree_cur;
+struct xfs_refcount_irec;
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -135,6 +136,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks);
DECLARE_EVENT_CLASS(xfs_ag_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
@@ -268,10 +271,10 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
__field(unsigned long, caller_ip)
),
TP_fast_assign(
- struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ?
- ip->i_afp : &ip->i_df;
+ struct xfs_ifork *ifp;
struct xfs_bmbt_irec r;
+ ifp = xfs_iext_state_to_fork(ip, state);
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
@@ -686,6 +689,9 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
+DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
@@ -1170,7 +1176,6 @@ DEFINE_RW_EVENT(xfs_file_dax_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
-DEFINE_RW_EVENT(xfs_file_splice_read);
DECLARE_EVENT_CLASS(xfs_page_class,
TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
@@ -1570,14 +1575,15 @@ TRACE_EVENT(xfs_agf,
TRACE_EVENT(xfs_free_extent,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
- xfs_extlen_t len, bool isfl, int haveleft, int haveright),
- TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
+ xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft,
+ int haveright),
+ TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
- __field(int, isfl)
+ __field(int, resv)
__field(int, haveleft)
__field(int, haveright)
),
@@ -1586,16 +1592,16 @@ TRACE_EVENT(xfs_free_extent,
__entry->agno = agno;
__entry->agbno = agbno;
__entry->len = len;
- __entry->isfl = isfl;
+ __entry->resv = resv;
__entry->haveleft = haveleft;
__entry->haveright = haveright;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
+ TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
__entry->len,
- __entry->isfl,
+ __entry->resv,
__entry->haveleft ?
(__entry->haveright ? "both" : "left") :
(__entry->haveright ? "right" : "none"))
@@ -1622,8 +1628,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__field(short, otype)
__field(char, wasdel)
__field(char, wasfromfl)
- __field(char, isfl)
- __field(char, userdata)
+ __field(int, resv)
+ __field(int, datatype)
__field(xfs_fsblock_t, firstblock)
),
TP_fast_assign(
@@ -1643,14 +1649,14 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__entry->otype = args->otype;
__entry->wasdel = args->wasdel;
__entry->wasfromfl = args->wasfromfl;
- __entry->isfl = args->isfl;
- __entry->userdata = args->userdata;
+ __entry->resv = args->resv;
+ __entry->datatype = args->datatype;
__entry->firstblock = args->firstblock;
),
TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
"prod %u minleft %u total %u alignment %u minalignslop %u "
- "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
- "userdata %d firstblock 0x%llx",
+ "len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
+ "datatype 0x%x firstblock 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -1667,8 +1673,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
__entry->wasdel,
__entry->wasfromfl,
- __entry->isfl,
- __entry->userdata,
+ __entry->resv,
+ __entry->datatype,
(unsigned long long)__entry->firstblock)
)
@@ -1984,6 +1990,29 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+TRACE_EVENT(xfs_log_recover_record,
+ TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
+ TP_ARGS(log, rhead, pass),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_lsn_t, lsn)
+ __field(int, len)
+ __field(int, num_logops)
+ __field(int, pass)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->lsn = be64_to_cpu(rhead->h_lsn);
+ __entry->len = be32_to_cpu(rhead->h_len);
+ __entry->num_logops = be32_to_cpu(rhead->h_num_logops);
+ __entry->pass = pass;
+ ),
+ TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->lsn, __entry->len, __entry->num_logops,
+ __entry->pass)
+)
+
DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
TP_PROTO(struct xlog *log, struct xlog_recover *trans,
struct xlog_recover_item *item, int pass),
@@ -1992,6 +2021,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
__field(dev_t, dev)
__field(unsigned long, item)
__field(xlog_tid_t, tid)
+ __field(xfs_lsn_t, lsn)
__field(int, type)
__field(int, pass)
__field(int, count)
@@ -2001,15 +2031,17 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
__entry->dev = log->l_mp->m_super->s_dev;
__entry->item = (unsigned long)item;
__entry->tid = trans->r_log_tid;
+ __entry->lsn = trans->r_lsn;
__entry->type = ITEM_TYPE(item);
__entry->pass = pass;
__entry->count = item->ri_cnt;
__entry->total = item->ri_total;
),
- TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
- "item region count/total %d/%d",
+ TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, "
+ "item type %s item region count/total %d/%d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->tid,
+ __entry->lsn,
__entry->pass,
(void *)__entry->item,
__print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
@@ -2068,6 +2100,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
@@ -2554,10 +2587,794 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error);
DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error);
DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error);
+
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate);
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range);
DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
+/* deferred bmbt updates */
+#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT
+DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer);
+DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred);
+
+/* per-AG reservation */
+DECLARE_EVENT_CLASS(xfs_ag_resv_class,
+ TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv,
+ xfs_extlen_t len),
+ TP_ARGS(pag, resv, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, resv)
+ __field(xfs_extlen_t, freeblks)
+ __field(xfs_extlen_t, flcount)
+ __field(xfs_extlen_t, reserved)
+ __field(xfs_extlen_t, asked)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ struct xfs_ag_resv *r = xfs_perag_resv(pag, resv);
+
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->resv = resv;
+ __entry->freeblks = pag->pagf_freeblks;
+ __entry->flcount = pag->pagf_flcount;
+ __entry->reserved = r ? r->ar_reserved : 0;
+ __entry->asked = r ? r->ar_asked : 0;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->resv,
+ __entry->freeblks,
+ __entry->flcount,
+ __entry->reserved,
+ __entry->asked,
+ __entry->len)
+)
+#define DEFINE_AG_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_ag_resv_class, name, \
+ TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \
+ xfs_extlen_t len), \
+ TP_ARGS(pag, type, len))
+
+/* per-AG reservation tracepoints */
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_init);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
+
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
+
+/* refcount tracepoint classes */
+
+/* reuse the discard trace class for agbno/aglen-based traces */
+#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name)
+
+/* ag btree lookup tracepoint class */
+#define XFS_AG_BTREE_CMP_FORMAT_STR \
+ { XFS_LOOKUP_EQ, "eq" }, \
+ { XFS_LOOKUP_LE, "le" }, \
+ { XFS_LOOKUP_GE, "ge" }
+DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_lookup_t dir),
+ TP_ARGS(mp, agno, agbno, dir),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_lookup_t, dir)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->dir = dir;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR),
+ __entry->dir)
+)
+
+#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \
+DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_lookup_t dir), \
+ TP_ARGS(mp, agno, agbno, dir))
+
+/* single-rcext tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *irec),
+ TP_ARGS(mp, agno, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startblock = irec->rc_startblock;
+ __entry->blockcount = irec->rc_blockcount;
+ __entry->refcount = irec->rc_refcount;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->refcount)
+)
+
+#define DEFINE_REFCOUNT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ struct xfs_refcount_irec *irec), \
+ TP_ARGS(mp, agno, irec))
+
+/* single-rcext and an agbno tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *irec, xfs_agblock_t agbno),
+ TP_ARGS(mp, agno, irec, agbno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ __field(xfs_agblock_t, agbno)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startblock = irec->rc_startblock;
+ __entry->blockcount = irec->rc_blockcount;
+ __entry->refcount = irec->rc_refcount;
+ __entry->agbno = agbno;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->refcount,
+ __entry->agbno)
+)
+
+#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_extent_at_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \
+ TP_ARGS(mp, agno, irec, agbno))
+
+/* double-rcext tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2),
+ TP_ARGS(mp, agno, i1, i2),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, i1_startblock)
+ __field(xfs_extlen_t, i1_blockcount)
+ __field(xfs_nlink_t, i1_refcount)
+ __field(xfs_agblock_t, i2_startblock)
+ __field(xfs_extlen_t, i2_blockcount)
+ __field(xfs_nlink_t, i2_refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->i1_startblock = i1->rc_startblock;
+ __entry->i1_blockcount = i1->rc_blockcount;
+ __entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_startblock = i2->rc_startblock;
+ __entry->i2_blockcount = i2->rc_blockcount;
+ __entry->i2_refcount = i2->rc_refcount;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
+ "agbno %u len %u refcount %u\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->i1_startblock,
+ __entry->i1_blockcount,
+ __entry->i1_refcount,
+ __entry->i2_startblock,
+ __entry->i2_blockcount,
+ __entry->i2_refcount)
+)
+
+#define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_double_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \
+ TP_ARGS(mp, agno, i1, i2))
+
+/* double-rcext and an agbno tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
+ xfs_agblock_t agbno),
+ TP_ARGS(mp, agno, i1, i2, agbno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, i1_startblock)
+ __field(xfs_extlen_t, i1_blockcount)
+ __field(xfs_nlink_t, i1_refcount)
+ __field(xfs_agblock_t, i2_startblock)
+ __field(xfs_extlen_t, i2_blockcount)
+ __field(xfs_nlink_t, i2_refcount)
+ __field(xfs_agblock_t, agbno)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->i1_startblock = i1->rc_startblock;
+ __entry->i1_blockcount = i1->rc_blockcount;
+ __entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_startblock = i2->rc_startblock;
+ __entry->i2_blockcount = i2->rc_blockcount;
+ __entry->i2_refcount = i2->rc_refcount;
+ __entry->agbno = agbno;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
+ "agbno %u len %u refcount %u @ agbno %u\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->i1_startblock,
+ __entry->i1_blockcount,
+ __entry->i1_refcount,
+ __entry->i2_startblock,
+ __entry->i2_blockcount,
+ __entry->i2_refcount,
+ __entry->agbno)
+)
+
+#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
+ xfs_agblock_t agbno), \
+ TP_ARGS(mp, agno, i1, i2, agbno))
+
+/* triple-rcext tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
+ struct xfs_refcount_irec *i3),
+ TP_ARGS(mp, agno, i1, i2, i3),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, i1_startblock)
+ __field(xfs_extlen_t, i1_blockcount)
+ __field(xfs_nlink_t, i1_refcount)
+ __field(xfs_agblock_t, i2_startblock)
+ __field(xfs_extlen_t, i2_blockcount)
+ __field(xfs_nlink_t, i2_refcount)
+ __field(xfs_agblock_t, i3_startblock)
+ __field(xfs_extlen_t, i3_blockcount)
+ __field(xfs_nlink_t, i3_refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->i1_startblock = i1->rc_startblock;
+ __entry->i1_blockcount = i1->rc_blockcount;
+ __entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_startblock = i2->rc_startblock;
+ __entry->i2_blockcount = i2->rc_blockcount;
+ __entry->i2_refcount = i2->rc_refcount;
+ __entry->i3_startblock = i3->rc_startblock;
+ __entry->i3_blockcount = i3->rc_blockcount;
+ __entry->i3_refcount = i3->rc_refcount;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
+ "agbno %u len %u refcount %u -- "
+ "agbno %u len %u refcount %u\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->i1_startblock,
+ __entry->i1_blockcount,
+ __entry->i1_refcount,
+ __entry->i2_startblock,
+ __entry->i2_blockcount,
+ __entry->i2_refcount,
+ __entry->i3_startblock,
+ __entry->i3_blockcount,
+ __entry->i3_refcount)
+);
+
+#define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
+ struct xfs_refcount_irec *i3), \
+ TP_ARGS(mp, agno, i1, i2, i3))
+
+/* refcount btree tracepoints */
+DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block);
+DEFINE_BUSY_EVENT(xfs_refcountbt_free_block);
+DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error);
+
+/* refcount adjustment tracepoints */
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease);
+DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent);
+DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
+
+/* reflink helpers */
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
+#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
+
+TRACE_EVENT(xfs_refcount_finish_one_leftover,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int type, xfs_agblock_t agbno, xfs_extlen_t len,
+ xfs_agblock_t new_agbno, xfs_extlen_t new_len),
+ TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, type)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(xfs_agblock_t, new_agbno)
+ __field(xfs_extlen_t, new_len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->type = type;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->new_agbno = new_agbno;
+ __entry->new_len = new_len;
+ ),
+ TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->new_agbno,
+ __entry->new_len)
+);
+
+/* simple inode-based error/%ip tracepoint class */
+DECLARE_EVENT_CLASS(xfs_inode_error_class,
+ TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip),
+ TP_ARGS(ip, error, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, error)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->error = error;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ino %llx error %d caller %ps",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->error,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_INODE_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_inode_error_class, name, \
+ TP_PROTO(struct xfs_inode *ip, int error, \
+ unsigned long caller_ip), \
+ TP_ARGS(ip, error, caller_ip))
+
+/* reflink allocator */
+TRACE_EVENT(xfs_bmap_remap_alloc,
+ TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno,
+ xfs_extlen_t len),
+ TP_ARGS(ip, fsbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsblock_t, fsbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->fsbno = fsbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->fsbno,
+ __entry->len)
+);
+DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error);
+
+/* reflink tracepoint classes */
+
+/* two-file io tracepoint class */
+DECLARE_EVENT_CLASS(xfs_double_io_class,
+ TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len,
+ struct xfs_inode *dest, xfs_off_t doffset),
+ TP_ARGS(src, soffset, len, dest, doffset),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, src_ino)
+ __field(loff_t, src_isize)
+ __field(loff_t, src_disize)
+ __field(loff_t, src_offset)
+ __field(size_t, len)
+ __field(xfs_ino_t, dest_ino)
+ __field(loff_t, dest_isize)
+ __field(loff_t, dest_disize)
+ __field(loff_t, dest_offset)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(src)->i_sb->s_dev;
+ __entry->src_ino = src->i_ino;
+ __entry->src_isize = VFS_I(src)->i_size;
+ __entry->src_disize = src->i_d.di_size;
+ __entry->src_offset = soffset;
+ __entry->len = len;
+ __entry->dest_ino = dest->i_ino;
+ __entry->dest_isize = VFS_I(dest)->i_size;
+ __entry->dest_disize = dest->i_d.di_size;
+ __entry->dest_offset = doffset;
+ ),
+ TP_printk("dev %d:%d count %zd "
+ "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> "
+ "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->len,
+ __entry->src_ino,
+ __entry->src_isize,
+ __entry->src_disize,
+ __entry->src_offset,
+ __entry->dest_ino,
+ __entry->dest_isize,
+ __entry->dest_disize,
+ __entry->dest_offset)
+)
+
+#define DEFINE_DOUBLE_IO_EVENT(name) \
+DEFINE_EVENT(xfs_double_io_class, name, \
+ TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, \
+ struct xfs_inode *dest, xfs_off_t doffset), \
+ TP_ARGS(src, soffset, len, dest, doffset))
+
+/* two-file vfs io tracepoint class */
+DECLARE_EVENT_CLASS(xfs_double_vfs_io_class,
+ TP_PROTO(struct inode *src, u64 soffset, u64 len,
+ struct inode *dest, u64 doffset),
+ TP_ARGS(src, soffset, len, dest, doffset),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, src_ino)
+ __field(loff_t, src_isize)
+ __field(loff_t, src_offset)
+ __field(size_t, len)
+ __field(unsigned long, dest_ino)
+ __field(loff_t, dest_isize)
+ __field(loff_t, dest_offset)
+ ),
+ TP_fast_assign(
+ __entry->dev = src->i_sb->s_dev;
+ __entry->src_ino = src->i_ino;
+ __entry->src_isize = i_size_read(src);
+ __entry->src_offset = soffset;
+ __entry->len = len;
+ __entry->dest_ino = dest->i_ino;
+ __entry->dest_isize = i_size_read(dest);
+ __entry->dest_offset = doffset;
+ ),
+ TP_printk("dev %d:%d count %zd "
+ "ino 0x%lx isize 0x%llx offset 0x%llx -> "
+ "ino 0x%lx isize 0x%llx offset 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->len,
+ __entry->src_ino,
+ __entry->src_isize,
+ __entry->src_offset,
+ __entry->dest_ino,
+ __entry->dest_isize,
+ __entry->dest_offset)
+)
+
+#define DEFINE_DOUBLE_VFS_IO_EVENT(name) \
+DEFINE_EVENT(xfs_double_vfs_io_class, name, \
+ TP_PROTO(struct inode *src, u64 soffset, u64 len, \
+ struct inode *dest, u64 doffset), \
+ TP_ARGS(src, soffset, len, dest, doffset))
+
+/* CoW write tracepoint */
+DECLARE_EVENT_CLASS(xfs_copy_on_write_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk,
+ xfs_extlen_t len, xfs_fsblock_t new_pblk),
+ TP_ARGS(ip, lblk, pblk, len, new_pblk),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_fsblock_t, pblk)
+ __field(xfs_extlen_t, len)
+ __field(xfs_fsblock_t, new_pblk)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->lblk = lblk;
+ __entry->pblk = pblk;
+ __entry->len = len;
+ __entry->new_pblk = new_pblk;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx "
+ "len 0x%x new_pblk %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->lblk,
+ __entry->pblk,
+ __entry->len,
+ __entry->new_pblk)
+)
+
+#define DEFINE_COW_EVENT(name) \
+DEFINE_EVENT(xfs_copy_on_write_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \
+ xfs_extlen_t len, xfs_fsblock_t new_pblk), \
+ TP_ARGS(ip, lblk, pblk, len, new_pblk))
+
+/* inode/irec events */
+DECLARE_EVENT_CLASS(xfs_inode_irec_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_extlen_t, len)
+ __field(xfs_fsblock_t, pblk)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->lblk = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->pblk = irec->br_startblock;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->lblk,
+ __entry->len,
+ __entry->pblk)
+);
+#define DEFINE_INODE_IREC_EVENT(name) \
+DEFINE_EVENT(xfs_inode_irec_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, irec))
+
+/* refcount/reflink tracepoint definitions */
+
+/* reflink tracepoints */
+DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
+DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
+DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
+DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+TRACE_EVENT(xfs_reflink_remap_blocks_loop,
+ TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
+ xfs_filblks_t len, struct xfs_inode *dest,
+ xfs_fileoff_t doffset),
+ TP_ARGS(src, soffset, len, dest, doffset),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, src_ino)
+ __field(xfs_fileoff_t, src_lblk)
+ __field(xfs_filblks_t, len)
+ __field(xfs_ino_t, dest_ino)
+ __field(xfs_fileoff_t, dest_lblk)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(src)->i_sb->s_dev;
+ __entry->src_ino = src->i_ino;
+ __entry->src_lblk = soffset;
+ __entry->len = len;
+ __entry->dest_ino = dest->i_ino;
+ __entry->dest_lblk = doffset;
+ ),
+ TP_printk("dev %d:%d len 0x%llx "
+ "ino 0x%llx offset 0x%llx blocks -> "
+ "ino 0x%llx offset 0x%llx blocks",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->len,
+ __entry->src_ino,
+ __entry->src_lblk,
+ __entry->dest_ino,
+ __entry->dest_lblk)
+);
+TRACE_EVENT(xfs_reflink_punch_range,
+ TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
+ xfs_extlen_t len),
+ TP_ARGS(ip, lblk, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->lblk = lblk;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->lblk,
+ __entry->len)
+);
+TRACE_EVENT(xfs_reflink_remap,
+ TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
+ xfs_extlen_t len, xfs_fsblock_t new_pblk),
+ TP_ARGS(ip, lblk, len, new_pblk),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_extlen_t, len)
+ __field(xfs_fsblock_t, new_pblk)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->lblk = lblk;
+ __entry->len = len;
+ __entry->new_pblk = new_pblk;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->lblk,
+ __entry->len,
+ __entry->new_pblk)
+);
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
+
+/* dedupe tracepoints */
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
+
+/* ioctl tracepoints */
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink);
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range);
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same);
+TRACE_EVENT(xfs_ioctl_clone,
+ TP_PROTO(struct inode *src, struct inode *dest),
+ TP_ARGS(src, dest),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, src_ino)
+ __field(loff_t, src_isize)
+ __field(unsigned long, dest_ino)
+ __field(loff_t, dest_isize)
+ ),
+ TP_fast_assign(
+ __entry->dev = src->i_sb->s_dev;
+ __entry->src_ino = src->i_ino;
+ __entry->src_isize = i_size_read(src);
+ __entry->dest_ino = dest->i_ino;
+ __entry->dest_isize = i_size_read(dest);
+ ),
+ TP_printk("dev %d:%d "
+ "ino 0x%lx isize 0x%llx -> "
+ "ino 0x%lx isize 0x%llx\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->src_ino,
+ __entry->src_isize,
+ __entry->dest_ino,
+ __entry->dest_isize)
+);
+
+/* unshare tracepoints */
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block);
+DEFINE_PAGE_EVENT(xfs_reflink_unshare_page);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error);
+
+/* copy on write */
+DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
+
+DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
+DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
+
+DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
+DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec);
+
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
+
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
+
+DEFINE_COW_EVENT(xfs_reflink_fork_buf);
+DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error);
+
+DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error);
+
+/* rmap swapext tracepoints */
+DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
+DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
+DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 5f3d33d16e67..70f42ea86dfb 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -217,7 +217,7 @@ undo_log:
undo_blocks:
if (blocks > 0) {
- xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+ xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
@@ -318,7 +318,6 @@ xfs_trans_mod_sb(
* in-core superblock's counter. This should only
* be applied to the on-disk superblock.
*/
- ASSERT(delta < 0);
tp->t_res_fdblocks_delta += delta;
if (xfs_sb_version_haslazysbcount(&mp->m_sb))
flags &= ~XFS_TRANS_SB_DIRTY;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e2bf86aad33d..61b7fbdd3ebd 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -36,6 +36,11 @@ struct xfs_busy_extent;
struct xfs_rud_log_item;
struct xfs_rui_log_item;
struct xfs_btree_cur;
+struct xfs_cui_log_item;
+struct xfs_cud_log_item;
+struct xfs_defer_ops;
+struct xfs_bui_log_item;
+struct xfs_bud_log_item;
typedef struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
@@ -248,4 +253,28 @@ int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,
xfs_fsblock_t startblock, xfs_filblks_t blockcount,
xfs_exntst_t state, struct xfs_btree_cur **pcur);
+/* refcount updates */
+enum xfs_refcount_intent_type;
+
+void xfs_refcount_update_init_defer_op(void);
+struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp,
+ struct xfs_cui_log_item *cuip);
+int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp,
+ struct xfs_cud_log_item *cudp, struct xfs_defer_ops *dfops,
+ enum xfs_refcount_intent_type type, xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len, struct xfs_btree_cur **pcur);
+
+/* mapping updates */
+enum xfs_bmap_intent_type;
+
+void xfs_bmap_update_init_defer_op(void);
+struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp,
+ struct xfs_bui_log_item *buip);
+int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
+ struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
+ enum xfs_bmap_intent_type type, struct xfs_inode *ip,
+ int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount, xfs_exntst_t state);
+
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
new file mode 100644
index 000000000000..6408e7d7c08c
--- /dev/null
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_inode.h"
+
+/*
+ * This routine is called to allocate a "bmap update done"
+ * log item.
+ */
+struct xfs_bud_log_item *
+xfs_trans_get_bud(
+ struct xfs_trans *tp,
+ struct xfs_bui_log_item *buip)
+{
+ struct xfs_bud_log_item *budp;
+
+ budp = xfs_bud_init(tp->t_mountp, buip);
+ xfs_trans_add_item(tp, &budp->bud_item);
+ return budp;
+}
+
+/*
+ * Finish an bmap update and log it to the BUD. Note that the
+ * transaction is marked dirty regardless of whether the bmap update
+ * succeeds or fails to support the BUI/BUD lifecycle rules.
+ */
+int
+xfs_trans_log_finish_bmap_update(
+ struct xfs_trans *tp,
+ struct xfs_bud_log_item *budp,
+ struct xfs_defer_ops *dop,
+ enum xfs_bmap_intent_type type,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state)
+{
+ int error;
+
+ error = xfs_bmap_finish_one(tp, dop, ip, type, whichfork, startoff,
+ startblock, blockcount, state);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the BUI and frees the BUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+ return error;
+}
+
+/* Sort bmap intents by inode. */
+static int
+xfs_bmap_update_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_bmap_intent *ba;
+ struct xfs_bmap_intent *bb;
+
+ ba = container_of(a, struct xfs_bmap_intent, bi_list);
+ bb = container_of(b, struct xfs_bmap_intent, bi_list);
+ return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
+}
+
+/* Get an BUI. */
+STATIC void *
+xfs_bmap_update_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_bui_log_item *buip;
+
+ ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+ ASSERT(tp != NULL);
+
+ buip = xfs_bui_init(tp->t_mountp);
+ ASSERT(buip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &buip->bui_item);
+ return buip;
+}
+
+/* Set the map extent flags for this mapping. */
+static void
+xfs_trans_set_bmap_flags(
+ struct xfs_map_extent *bmap,
+ enum xfs_bmap_intent_type type,
+ int whichfork,
+ xfs_exntst_t state)
+{
+ bmap->me_flags = 0;
+ switch (type) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+ bmap->me_flags = type;
+ break;
+ default:
+ ASSERT(0);
+ }
+ if (state == XFS_EXT_UNWRITTEN)
+ bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+ if (whichfork == XFS_ATTR_FORK)
+ bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+}
+
+/* Log bmap updates in the intent item. */
+STATIC void
+xfs_bmap_update_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
+{
+ struct xfs_bui_log_item *buip = intent;
+ struct xfs_bmap_intent *bmap;
+ uint next_extent;
+ struct xfs_map_extent *map;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
+ ASSERT(next_extent < buip->bui_format.bui_nextents);
+ map = &buip->bui_format.bui_extents[next_extent];
+ map->me_owner = bmap->bi_owner->i_ino;
+ map->me_startblock = bmap->bi_bmap.br_startblock;
+ map->me_startoff = bmap->bi_bmap.br_startoff;
+ map->me_len = bmap->bi_bmap.br_blockcount;
+ xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
+ bmap->bi_bmap.br_state);
+}
+
+/* Get an BUD so we can process all the deferred rmap updates. */
+STATIC void *
+xfs_bmap_update_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_bud(tp, intent);
+}
+
+/* Process a deferred rmap update. */
+STATIC int
+xfs_bmap_update_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dop,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_bmap_intent *bmap;
+ int error;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+ error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
+ bmap->bi_type,
+ bmap->bi_owner, bmap->bi_whichfork,
+ bmap->bi_bmap.br_startoff,
+ bmap->bi_bmap.br_startblock,
+ bmap->bi_bmap.br_blockcount,
+ bmap->bi_bmap.br_state);
+ kmem_free(bmap);
+ return error;
+}
+
+/* Abort all pending BUIs. */
+STATIC void
+xfs_bmap_update_abort_intent(
+ void *intent)
+{
+ xfs_bui_release(intent);
+}
+
+/* Cancel a deferred rmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_bmap_intent *bmap;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+ kmem_free(bmap);
+}
+
+static const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+ .type = XFS_DEFER_OPS_TYPE_BMAP,
+ .max_items = XFS_BUI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_bmap_update_diff_items,
+ .create_intent = xfs_bmap_update_create_intent,
+ .abort_intent = xfs_bmap_update_abort_intent,
+ .log_item = xfs_bmap_update_log_item,
+ .create_done = xfs_bmap_update_create_done,
+ .finish_item = xfs_bmap_update_finish_item,
+ .cancel_item = xfs_bmap_update_cancel_item,
+};
+
+/* Register the deferred op type. */
+void
+xfs_bmap_update_init_defer_op(void)
+{
+ xfs_defer_init_op_type(&xfs_bmap_update_defer_type);
+}
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 459ddec137a4..ab438647592a 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -79,7 +79,8 @@ xfs_trans_free_extent(
trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
- error = xfs_free_extent(tp, start_block, ext_len, oinfo);
+ error = xfs_free_extent(tp, start_block, ext_len, oinfo,
+ XFS_AG_RESV_NONE);
/*
* Mark the transaction dirty, even on error. This ensures the
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 11a3af08b5c7..dab8daa676f9 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -73,7 +73,7 @@ xfs_trans_ichgtime(
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- tv = current_fs_time(inode->i_sb);
+ tv = current_time(inode);
if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
new file mode 100644
index 000000000000..94c1877af834
--- /dev/null
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_refcount_item.h"
+#include "xfs_alloc.h"
+#include "xfs_refcount.h"
+
+/*
+ * This routine is called to allocate a "refcount update done"
+ * log item.
+ */
+struct xfs_cud_log_item *
+xfs_trans_get_cud(
+ struct xfs_trans *tp,
+ struct xfs_cui_log_item *cuip)
+{
+ struct xfs_cud_log_item *cudp;
+
+ cudp = xfs_cud_init(tp->t_mountp, cuip);
+ xfs_trans_add_item(tp, &cudp->cud_item);
+ return cudp;
+}
+
+/*
+ * Finish an refcount update and log it to the CUD. Note that the
+ * transaction is marked dirty regardless of whether the refcount
+ * update succeeds or fails to support the CUI/CUD lifecycle rules.
+ */
+int
+xfs_trans_log_finish_refcount_update(
+ struct xfs_trans *tp,
+ struct xfs_cud_log_item *cudp,
+ struct xfs_defer_ops *dop,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount,
+ xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len,
+ struct xfs_btree_cur **pcur)
+{
+ int error;
+
+ error = xfs_refcount_finish_one(tp, dop, type, startblock,
+ blockcount, new_fsb, new_len, pcur);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the CUI and frees the CUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+ return error;
+}
+
+/* Sort refcount intents by AG. */
+static int
+xfs_refcount_update_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_mount *mp = priv;
+ struct xfs_refcount_intent *ra;
+ struct xfs_refcount_intent *rb;
+
+ ra = container_of(a, struct xfs_refcount_intent, ri_list);
+ rb = container_of(b, struct xfs_refcount_intent, ri_list);
+ return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+}
+
+/* Get an CUI. */
+STATIC void *
+xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_cui_log_item *cuip;
+
+ ASSERT(tp != NULL);
+ ASSERT(count > 0);
+
+ cuip = xfs_cui_init(tp->t_mountp, count);
+ ASSERT(cuip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ return cuip;
+}
+
+/* Set the phys extent flags for this reverse mapping. */
+static void
+xfs_trans_set_refcount_flags(
+ struct xfs_phys_extent *refc,
+ enum xfs_refcount_intent_type type)
+{
+ refc->pe_flags = 0;
+ switch (type) {
+ case XFS_REFCOUNT_INCREASE:
+ case XFS_REFCOUNT_DECREASE:
+ case XFS_REFCOUNT_ALLOC_COW:
+ case XFS_REFCOUNT_FREE_COW:
+ refc->pe_flags |= type;
+ break;
+ default:
+ ASSERT(0);
+ }
+}
+
+/* Log refcount updates in the intent item. */
+STATIC void
+xfs_refcount_update_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
+{
+ struct xfs_cui_log_item *cuip = intent;
+ struct xfs_refcount_intent *refc;
+ uint next_extent;
+ struct xfs_phys_extent *ext;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
+ ASSERT(next_extent < cuip->cui_format.cui_nextents);
+ ext = &cuip->cui_format.cui_extents[next_extent];
+ ext->pe_startblock = refc->ri_startblock;
+ ext->pe_len = refc->ri_blockcount;
+ xfs_trans_set_refcount_flags(ext, refc->ri_type);
+}
+
+/* Get an CUD so we can process all the deferred refcount updates. */
+STATIC void *
+xfs_refcount_update_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_cud(tp, intent);
+}
+
+/* Process a deferred refcount update. */
+STATIC int
+xfs_refcount_update_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dop,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_refcount_intent *refc;
+ xfs_fsblock_t new_fsb;
+ xfs_extlen_t new_aglen;
+ int error;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+ error = xfs_trans_log_finish_refcount_update(tp, done_item, dop,
+ refc->ri_type,
+ refc->ri_startblock,
+ refc->ri_blockcount,
+ &new_fsb, &new_aglen,
+ (struct xfs_btree_cur **)state);
+ /* Did we run out of reservation? Requeue what we didn't finish. */
+ if (!error && new_aglen > 0) {
+ ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
+ refc->ri_type == XFS_REFCOUNT_DECREASE);
+ refc->ri_startblock = new_fsb;
+ refc->ri_blockcount = new_aglen;
+ return -EAGAIN;
+ }
+ kmem_free(refc);
+ return error;
+}
+
+/* Clean up after processing deferred refcounts. */
+STATIC void
+xfs_refcount_update_finish_cleanup(
+ struct xfs_trans *tp,
+ void *state,
+ int error)
+{
+ struct xfs_btree_cur *rcur = state;
+
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+}
+
+/* Abort all pending CUIs. */
+STATIC void
+xfs_refcount_update_abort_intent(
+ void *intent)
+{
+ xfs_cui_release(intent);
+}
+
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_refcount_intent *refc;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+ kmem_free(refc);
+}
+
+static const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+ .type = XFS_DEFER_OPS_TYPE_REFCOUNT,
+ .max_items = XFS_CUI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_refcount_update_diff_items,
+ .create_intent = xfs_refcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+ .log_item = xfs_refcount_update_log_item,
+ .create_done = xfs_refcount_update_create_done,
+ .finish_item = xfs_refcount_update_finish_item,
+ .finish_cleanup = xfs_refcount_update_finish_cleanup,
+ .cancel_item = xfs_refcount_update_cancel_item,
+};
+
+/* Register the deferred op type. */
+void
+xfs_refcount_update_init_defer_op(void)
+{
+ xfs_defer_init_op_type(&xfs_refcount_update_defer_type);
+}
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index 5a50ef881568..9ead064b5e90 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -48,12 +48,21 @@ xfs_trans_set_rmap_flags(
case XFS_RMAP_MAP:
rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
break;
+ case XFS_RMAP_MAP_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+ break;
case XFS_RMAP_UNMAP:
rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
break;
+ case XFS_RMAP_UNMAP_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+ break;
case XFS_RMAP_CONVERT:
rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
break;
+ case XFS_RMAP_CONVERT_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+ break;
case XFS_RMAP_ALLOC:
rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
break;
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index ea62245fee26..62900938f26d 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -147,6 +147,7 @@ __xfs_xattr_put_listent(
arraytop = context->count + prefix_len + namelen + 1;
if (arraytop > context->firstu) {
context->count = -1; /* insufficient space */
+ context->seen_enough = 1;
return 0;
}
offset = (char *)context->alist + context->count;