summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig3
-rw-r--r--fs/9p/acl.c37
-rw-r--r--fs/9p/acl.h20
-rw-r--r--fs/9p/fid.c71
-rw-r--r--fs/9p/fid.h22
-rw-r--r--fs/9p/v9fs.c34
-rw-r--r--fs/9p/v9fs.h10
-rw-r--r--fs/9p/vfs_dentry.c18
-rw-r--r--fs/9p/vfs_dir.c92
-rw-r--r--fs/9p/vfs_file.c15
-rw-r--r--fs/9p/vfs_inode.c49
-rw-r--r--fs/9p/vfs_inode_dotl.c115
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/9p/xattr.c33
-rw-r--r--fs/9p/xattr.h2
-rw-r--r--fs/Kconfig11
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/Kconfig4
-rw-r--r--fs/adfs/dir.c2
-rw-r--r--fs/adfs/inode.c15
-rw-r--r--fs/affs/Kconfig4
-rw-r--r--fs/affs/amigaffs.c3
-rw-r--r--fs/affs/dir.c2
-rw-r--r--fs/affs/file.c18
-rw-r--r--fs/affs/inode.c5
-rw-r--r--fs/afs/Kconfig7
-rw-r--r--fs/afs/afs.h11
-rw-r--r--fs/afs/dir.c4
-rw-r--r--fs/afs/flock.c4
-rw-r--r--fs/afs/fsclient.c14
-rw-r--r--fs/afs/inode.c6
-rw-r--r--fs/afs/super.c6
-rw-r--r--fs/afs/write.c7
-rw-r--r--fs/aio.c10
-rw-r--r--fs/anon_inodes.c10
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c2
-rw-r--r--fs/autofs4/root.c10
-rw-r--r--fs/autofs4/waitq.c6
-rw-r--r--fs/befs/Kconfig4
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/Kconfig4
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/file.c15
-rw-r--r--fs/binfmt_aout.c4
-rw-r--r--fs/binfmt_elf.c20
-rw-r--r--fs/binfmt_elf_fdpic.c11
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/binfmt_misc.c9
-rw-r--r--fs/binfmt_script.c4
-rw-r--r--fs/bio.c2
-rw-r--r--fs/block_dev.c9
-rw-r--r--fs/btrfs/Kconfig6
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/backref.c5
-rw-r--r--fs/btrfs/backref.h2
-rw-r--r--fs/btrfs/btrfs_inode.h20
-rw-r--r--fs/btrfs/check-integrity.c3
-rw-r--r--fs/btrfs/compression.c4
-rw-r--r--fs/btrfs/ctree.c82
-rw-r--r--fs/btrfs/ctree.h150
-rw-r--r--fs/btrfs/delayed-inode.c147
-rw-r--r--fs/btrfs/delayed-inode.h1
-rw-r--r--fs/btrfs/delayed-ref.c82
-rw-r--r--fs/btrfs/delayed-ref.h52
-rw-r--r--fs/btrfs/dev-replace.c6
-rw-r--r--fs/btrfs/disk-io.c227
-rw-r--r--fs/btrfs/disk-io.h7
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c594
-rw-r--r--fs/btrfs/extent_io.c138
-rw-r--r--fs/btrfs/extent_io.h8
-rw-r--r--fs/btrfs/extent_map.c15
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file-item.c71
-rw-r--r--fs/btrfs/file.c100
-rw-r--r--fs/btrfs/free-space-cache.c82
-rw-r--r--fs/btrfs/inode.c1203
-rw-r--r--fs/btrfs/ioctl.c395
-rw-r--r--fs/btrfs/ioctl.h502
-rw-r--r--fs/btrfs/locking.c5
-rw-r--r--fs/btrfs/ordered-data.c111
-rw-r--r--fs/btrfs/ordered-data.h14
-rw-r--r--fs/btrfs/print-tree.c1
-rw-r--r--fs/btrfs/qgroup.c75
-rw-r--r--fs/btrfs/raid56.c2100
-rw-r--r--fs/btrfs/raid56.h51
-rw-r--r--fs/btrfs/relocation.c6
-rw-r--r--fs/btrfs/scrub.c35
-rw-r--r--fs/btrfs/send.c59
-rw-r--r--fs/btrfs/send.h1
-rw-r--r--fs/btrfs/super.c91
-rw-r--r--fs/btrfs/sysfs.c1
-rw-r--r--fs/btrfs/transaction.c198
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c19
-rw-r--r--fs/btrfs/tree-log.c176
-rw-r--r--fs/btrfs/ulist.c2
-rw-r--r--fs/btrfs/volumes.c658
-rw-r--r--fs/btrfs/volumes.h11
-rw-r--r--fs/buffer.c21
-rw-r--r--fs/cachefiles/interface.c57
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/key.c2
-rw-r--r--fs/cachefiles/namei.c3
-rw-r--r--fs/cachefiles/rdwr.c114
-rw-r--r--fs/cachefiles/xattr.c2
-rw-r--r--fs/ceph/Kconfig4
-rw-r--r--fs/ceph/addr.c110
-rw-r--r--fs/ceph/caps.c67
-rw-r--r--fs/ceph/dir.c6
-rw-r--r--fs/ceph/export.c4
-rw-r--r--fs/ceph/file.c91
-rw-r--r--fs/ceph/inode.c78
-rw-r--r--fs/ceph/ioctl.c22
-rw-r--r--fs/ceph/locks.c2
-rw-r--r--fs/ceph/mds_client.c48
-rw-r--r--fs/ceph/mds_client.h10
-rw-r--r--fs/ceph/mdsmap.c12
-rw-r--r--fs/ceph/strings.c4
-rw-r--r--fs/ceph/super.c11
-rw-r--r--fs/ceph/super.h14
-rw-r--r--fs/ceph/xattr.c214
-rw-r--r--fs/cifs/Kconfig8
-rw-r--r--fs/cifs/cifs_debug.h6
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifs_fs_sb.h8
-rw-r--r--fs/cifs/cifs_spnego.c6
-rw-r--r--fs/cifs/cifsacl.c47
-rw-r--r--fs/cifs/cifsfs.c27
-rw-r--r--fs/cifs/cifsglob.h24
-rw-r--r--fs/cifs/cifspdu.h1
-rw-r--r--fs/cifs/cifsproto.h9
-rw-r--r--fs/cifs/cifssmb.c15
-rw-r--r--fs/cifs/connect.c79
-rw-r--r--fs/cifs/dir.c18
-rw-r--r--fs/cifs/file.c187
-rw-r--r--fs/cifs/inode.c61
-rw-r--r--fs/cifs/ioctl.c2
-rw-r--r--fs/cifs/link.c2
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/cifs/readdir.c31
-rw-r--r--fs/cifs/smb1ops.c8
-rw-r--r--fs/cifs/smb2ops.c2
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/coda/cache.c4
-rw-r--r--fs/coda/coda_fs_i.h2
-rw-r--r--fs/coda/coda_linux.c8
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/coda/file.c12
-rw-r--r--fs/coda/inode.c8
-rw-r--r--fs/coda/pioctl.c2
-rw-r--r--fs/coda/psdev.c7
-rw-r--r--fs/coda/upcall.c10
-rw-r--r--fs/compat.c52
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/configfs/dir.c7
-rw-r--r--fs/coredump.c6
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/dcache.c124
-rw-r--r--fs/debugfs/inode.c3
-rw-r--r--fs/devpts/inode.c18
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/config.c2
-rw-r--r--fs/dlm/dlm_internal.h3
-rw-r--r--fs/dlm/lock.c33
-rw-r--r--fs/dlm/lockspace.c1
-rw-r--r--fs/dlm/lowcomms.c11
-rw-r--r--fs/dlm/recover.c52
-rw-r--r--fs/dlm/user.c8
-rw-r--r--fs/ecryptfs/Kconfig4
-rw-r--r--fs/ecryptfs/crypto.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h6
-rw-r--r--fs/ecryptfs/file.c4
-rw-r--r--fs/ecryptfs/inode.c3
-rw-r--r--fs/ecryptfs/kthread.c6
-rw-r--r--fs/ecryptfs/messaging.c6
-rw-r--r--fs/ecryptfs/mmap.c12
-rw-r--r--fs/ecryptfs/read_write.c6
-rw-r--r--fs/efs/Kconfig4
-rw-r--r--fs/efs/dir.c2
-rw-r--r--fs/eventpoll.c22
-rw-r--r--fs/exec.c59
-rw-r--r--fs/exofs/dir.c2
-rw-r--r--fs/exportfs/expfs.c7
-rw-r--r--fs/ext2/balloc.c28
-rw-r--r--fs/ext2/dir.c2
-rw-r--r--fs/ext2/inode.c12
-rw-r--r--fs/ext2/ioctl.c2
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/xattr.c4
-rw-r--r--fs/ext3/dir.c8
-rw-r--r--fs/ext3/inode.c16
-rw-r--r--fs/ext3/ioctl.c2
-rw-r--r--fs/ext3/namei.c5
-rw-r--r--fs/ext3/resize.c12
-rw-r--r--fs/ext3/super.c52
-rw-r--r--fs/ext3/xattr.c4
-rw-r--r--fs/ext4/Kconfig2
-rw-r--r--fs/ext4/acl.c7
-rw-r--r--fs/ext4/balloc.c15
-rw-r--r--fs/ext4/dir.c11
-rw-r--r--fs/ext4/ext4.h124
-rw-r--r--fs/ext4/ext4_extents.h6
-rw-r--r--fs/ext4/ext4_jbd2.c102
-rw-r--r--fs/ext4/ext4_jbd2.h51
-rw-r--r--fs/ext4/extents.c338
-rw-r--r--fs/ext4/extents_status.c620
-rw-r--r--fs/ext4/extents_status.h86
-rw-r--r--fs/ext4/file.c26
-rw-r--r--fs/ext4/fsync.c2
-rw-r--r--fs/ext4/hash.c6
-rw-r--r--fs/ext4/ialloc.c29
-rw-r--r--fs/ext4/indirect.c259
-rw-r--r--fs/ext4/inline.c14
-rw-r--r--fs/ext4/inode.c769
-rw-r--r--fs/ext4/ioctl.c15
-rw-r--r--fs/ext4/mballoc.c77
-rw-r--r--fs/ext4/mballoc.h4
-rw-r--r--fs/ext4/migrate.c15
-rw-r--r--fs/ext4/mmp.c4
-rw-r--r--fs/ext4/move_extent.c16
-rw-r--r--fs/ext4/namei.c504
-rw-r--r--fs/ext4/page-io.c85
-rw-r--r--fs/ext4/resize.c42
-rw-r--r--fs/ext4/super.c574
-rw-r--r--fs/ext4/xattr.c23
-rw-r--r--fs/ext4/xattr.h68
-rw-r--r--fs/f2fs/Kconfig53
-rw-r--r--fs/f2fs/Makefile7
-rw-r--r--fs/f2fs/acl.c412
-rw-r--r--fs/f2fs/acl.h57
-rw-r--r--fs/f2fs/checkpoint.c784
-rw-r--r--fs/f2fs/data.c718
-rw-r--r--fs/f2fs/debug.c355
-rw-r--r--fs/f2fs/dir.c671
-rw-r--r--fs/f2fs/f2fs.h1113
-rw-r--r--fs/f2fs/file.c671
-rw-r--r--fs/f2fs/gc.c698
-rw-r--r--fs/f2fs/gc.h96
-rw-r--r--fs/f2fs/hash.c101
-rw-r--r--fs/f2fs/inode.c259
-rw-r--r--fs/f2fs/namei.c503
-rw-r--r--fs/f2fs/node.c1756
-rw-r--r--fs/f2fs/node.h353
-rw-r--r--fs/f2fs/recovery.c375
-rw-r--r--fs/f2fs/segment.c1770
-rw-r--r--fs/f2fs/segment.h618
-rw-r--r--fs/f2fs/super.c749
-rw-r--r--fs/f2fs/xattr.c443
-rw-r--r--fs/f2fs/xattr.h145
-rw-r--r--fs/fat/dir.c11
-rw-r--r--fs/fat/fat.h2
-rw-r--r--fs/fat/file.c4
-rw-r--r--fs/fat/inode.c79
-rw-r--r--fs/fat/misc.c4
-rw-r--r--fs/fat/nfs.c3
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file.c4
-rw-r--r--fs/file_table.c35
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/fs-writeback.c60
-rw-r--r--fs/fs_struct.c6
-rw-r--r--fs/fscache/cache.c8
-rw-r--r--fs/fscache/cookie.c89
-rw-r--r--fs/fscache/internal.h15
-rw-r--r--fs/fscache/object-list.c2
-rw-r--r--fs/fscache/object.c101
-rw-r--r--fs/fscache/operation.c140
-rw-r--r--fs/fscache/page.c195
-rw-r--r--fs/fscache/stats.c17
-rw-r--r--fs/fuse/Kconfig16
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/cuse.c46
-rw-r--r--fs/fuse/dev.c133
-rw-r--r--fs/fuse/dir.c261
-rw-r--r--fs/fuse/file.c267
-rw-r--r--fs/fuse/fuse_i.h74
-rw-r--r--fs/fuse/inode.c18
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/aops.c17
-rw-r--r--fs/gfs2/bmap.c32
-rw-r--r--fs/gfs2/dir.c32
-rw-r--r--fs/gfs2/export.c4
-rw-r--r--fs/gfs2/file.c23
-rw-r--r--fs/gfs2/glock.c116
-rw-r--r--fs/gfs2/glops.c4
-rw-r--r--fs/gfs2/incore.h11
-rw-r--r--fs/gfs2/inode.c40
-rw-r--r--fs/gfs2/lock_dlm.c8
-rw-r--r--fs/gfs2/log.c76
-rw-r--r--fs/gfs2/log.h12
-rw-r--r--fs/gfs2/lops.c83
-rw-r--r--fs/gfs2/lops.h14
-rw-r--r--fs/gfs2/meta_io.c35
-rw-r--r--fs/gfs2/meta_io.h3
-rw-r--r--fs/gfs2/ops_fstype.c4
-rw-r--r--fs/gfs2/quota.c142
-rw-r--r--fs/gfs2/quota.h15
-rw-r--r--fs/gfs2/rgrp.c55
-rw-r--r--fs/gfs2/super.c76
-rw-r--r--fs/gfs2/super.h3
-rw-r--r--fs/gfs2/sys.c80
-rw-r--r--fs/gfs2/trans.c124
-rw-r--r--fs/gfs2/trans.h3
-rw-r--r--fs/gfs2/util.c3
-rw-r--r--fs/gfs2/xattr.c40
-rw-r--r--fs/hfs/Kconfig4
-rw-r--r--fs/hfs/dir.c2
-rw-r--r--fs/hfs/inode.c28
-rw-r--r--fs/hfsplus/Makefile4
-rw-r--r--fs/hfsplus/attributes.c399
-rw-r--r--fs/hfsplus/bfind.c93
-rw-r--r--fs/hfsplus/bitmap.c13
-rw-r--r--fs/hfsplus/bnode.c8
-rw-r--r--fs/hfsplus/brec.c23
-rw-r--r--fs/hfsplus/btree.c13
-rw-r--r--fs/hfsplus/catalog.c36
-rw-r--r--fs/hfsplus/dir.c57
-rw-r--r--fs/hfsplus/extents.c28
-rw-r--r--fs/hfsplus/hfsplus_fs.h54
-rw-r--r--fs/hfsplus/hfsplus_raw.h68
-rw-r--r--fs/hfsplus/inode.c47
-rw-r--r--fs/hfsplus/ioctl.c112
-rw-r--r--fs/hfsplus/super.c71
-rw-r--r--fs/hfsplus/unicode.c7
-rw-r--r--fs/hfsplus/xattr.c709
-rw-r--r--fs/hfsplus/xattr.h60
-rw-r--r--fs/hfsplus/xattr_security.c104
-rw-r--r--fs/hfsplus/xattr_trusted.c63
-rw-r--r--fs/hfsplus/xattr_user.c63
-rw-r--r--fs/hostfs/hostfs_kern.c10
-rw-r--r--fs/hpfs/dir.c4
-rw-r--r--fs/hpfs/file.c22
-rw-r--r--fs/hpfs/hpfs_fn.h1
-rw-r--r--fs/hpfs/inode.c7
-rw-r--r--fs/hppfs/hppfs.c8
-rw-r--r--fs/hugetlbfs/inode.c33
-rw-r--r--fs/inode.c21
-rw-r--r--fs/internal.h2
-rw-r--r--fs/ioctl.c12
-rw-r--r--fs/isofs/compress.c2
-rw-r--r--fs/isofs/dir.c2
-rw-r--r--fs/isofs/export.c4
-rw-r--r--fs/jbd/journal.c3
-rw-r--r--fs/jbd2/commit.c8
-rw-r--r--fs/jbd2/journal.c66
-rw-r--r--fs/jbd2/transaction.c61
-rw-r--r--fs/jffs2/Kconfig10
-rw-r--r--fs/jffs2/dir.c4
-rw-r--r--fs/jfs/file.c6
-rw-r--r--fs/jfs/inode.c20
-rw-r--r--fs/jfs/ioctl.c2
-rw-r--r--fs/jfs/jfs_dtree.c2
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/lockd/clntlock.c16
-rw-r--r--fs/lockd/clntproc.c11
-rw-r--r--fs/lockd/host.c30
-rw-r--r--fs/lockd/mon.c1
-rw-r--r--fs/lockd/svclock.c16
-rw-r--r--fs/lockd/svcsubs.c11
-rw-r--r--fs/locks.c24
-rw-r--r--fs/logfs/Kconfig4
-rw-r--r--fs/logfs/dir.c4
-rw-r--r--fs/logfs/file.c2
-rw-r--r--fs/logfs/readwrite.c10
-rw-r--r--fs/minix/dir.c2
-rw-r--r--fs/minix/file.c6
-rw-r--r--fs/minix/inode.c17
-rw-r--r--fs/namei.c189
-rw-r--r--fs/namespace.c65
-rw-r--r--fs/ncpfs/dir.c10
-rw-r--r--fs/ncpfs/inode.c63
-rw-r--r--fs/ncpfs/ioctl.c29
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/ncpfs/ncp_fs_sb.h6
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1
-rw-r--r--fs/nfs/cache_lib.c12
-rw-r--r--fs/nfs/cache_lib.h2
-rw-r--r--fs/nfs/callback_proc.c61
-rw-r--r--fs/nfs/client.c1
-rw-r--r--fs/nfs/delegation.c154
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c64
-rw-r--r--fs/nfs/dns_resolve.c67
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/fscache.c1
-rw-r--r--fs/nfs/fscache.h21
-rw-r--r--fs/nfs/getroot.c3
-rw-r--r--fs/nfs/idmap.c55
-rw-r--r--fs/nfs/inode.c43
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/namespace.c20
-rw-r--r--fs/nfs/nfs2xdr.c19
-rw-r--r--fs/nfs/nfs3proc.c2
-rw-r--r--fs/nfs/nfs3xdr.c18
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c76
-rw-r--r--fs/nfs/nfs4file.c4
-rw-r--r--fs/nfs/nfs4filelayout.c6
-rw-r--r--fs/nfs/nfs4filelayout.h2
-rw-r--r--fs/nfs/nfs4filelayoutdev.c1
-rw-r--r--fs/nfs/nfs4namespace.c1
-rw-r--r--fs/nfs/nfs4proc.c173
-rw-r--r--fs/nfs/nfs4state.c33
-rw-r--r--fs/nfs/nfs4super.c6
-rw-r--r--fs/nfs/nfs4xdr.c16
-rw-r--r--fs/nfs/objlayout/objio_osd.c1
-rw-r--r--fs/nfs/pnfs.c173
-rw-r--r--fs/nfs/pnfs.h13
-rw-r--r--fs/nfs/pnfs_dev.c9
-rw-r--r--fs/nfs/proc.c2
-rw-r--r--fs/nfs/read.c10
-rw-r--r--fs/nfs/super.c99
-rw-r--r--fs/nfs/unlink.c25
-rw-r--r--fs/nfs/write.c13
-rw-r--r--fs/nfs_common/nfsacl.c41
-rw-r--r--fs/nfsd/Kconfig4
-rw-r--r--fs/nfsd/acl.h2
-rw-r--r--fs/nfsd/auth.c12
-rw-r--r--fs/nfsd/auth.h6
-rw-r--r--fs/nfsd/cache.h17
-rw-r--r--fs/nfsd/export.c38
-rw-r--r--fs/nfsd/fault_inject.c113
-rw-r--r--fs/nfsd/fault_inject.h28
-rw-r--r--fs/nfsd/idmap.h8
-rw-r--r--fs/nfsd/netns.h66
-rw-r--r--fs/nfsd/nfs2acl.c25
-rw-r--r--fs/nfsd/nfs3acl.c2
-rw-r--r--fs/nfsd/nfs3proc.c11
-rw-r--r--fs/nfsd/nfs3xdr.c71
-rw-r--r--fs/nfsd/nfs4acl.c63
-rw-r--r--fs/nfsd/nfs4callback.c69
-rw-r--r--fs/nfsd/nfs4idmap.c54
-rw-r--r--fs/nfsd/nfs4proc.c81
-rw-r--r--fs/nfsd/nfs4recover.c571
-rw-r--r--fs/nfsd/nfs4state.c1120
-rw-r--r--fs/nfsd/nfs4xdr.c395
-rw-r--r--fs/nfsd/nfscache.c353
-rw-r--r--fs/nfsd/nfsctl.c167
-rw-r--r--fs/nfsd/nfsd.h42
-rw-r--r--fs/nfsd/nfsfh.c4
-rw-r--r--fs/nfsd/nfsproc.c12
-rw-r--r--fs/nfsd/nfssvc.c211
-rw-r--r--fs/nfsd/nfsxdr.c32
-rw-r--r--fs/nfsd/state.h64
-rw-r--r--fs/nfsd/vfs.c66
-rw-r--r--fs/nfsd/vfs.h8
-rw-r--r--fs/nfsd/xdr.h2
-rw-r--r--fs/nfsd/xdr3.h2
-rw-r--r--fs/nfsd/xdr4.h17
-rw-r--r--fs/nilfs2/Kconfig3
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/file.c5
-rw-r--r--fs/nilfs2/inode.c24
-rw-r--r--fs/nilfs2/ioctl.c7
-rw-r--r--fs/nilfs2/namei.c4
-rw-r--r--fs/nilfs2/nilfs.h1
-rw-r--r--fs/nilfs2/recovery.c3
-rw-r--r--fs/notify/dnotify/dnotify.c8
-rw-r--r--fs/notify/fanotify/fanotify.c6
-rw-r--r--fs/notify/fanotify/fanotify_user.c39
-rw-r--r--fs/notify/fdinfo.c4
-rw-r--r--fs/notify/fsnotify.c3
-rw-r--r--fs/notify/group.c47
-rw-r--r--fs/notify/inode_mark.c33
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c5
-rw-r--r--fs/notify/inotify/inotify_user.c62
-rw-r--r--fs/notify/mark.c91
-rw-r--r--fs/notify/notification.c1
-rw-r--r--fs/notify/vfsmount_mark.c33
-rw-r--r--fs/ntfs/dir.c2
-rw-r--r--fs/ntfs/file.c16
-rw-r--r--fs/ntfs/inode.c8
-rw-r--r--fs/ntfs/inode.h4
-rw-r--r--fs/ocfs2/acl.c31
-rw-r--r--fs/ocfs2/alloc.c3
-rw-r--r--fs/ocfs2/aops.c7
-rw-r--r--fs/ocfs2/cluster/heartbeat.c6
-rw-r--r--fs/ocfs2/cluster/tcp.c40
-rw-r--r--fs/ocfs2/dcache.c3
-rw-r--r--fs/ocfs2/dir.c5
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c4
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c6
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c6
-rw-r--r--fs/ocfs2/dlmglue.c13
-rw-r--r--fs/ocfs2/export.c4
-rw-r--r--fs/ocfs2/extent_map.c3
-rw-r--r--fs/ocfs2/file.c43
-rw-r--r--fs/ocfs2/inode.c12
-rw-r--r--fs/ocfs2/ioctl.c4
-rw-r--r--fs/ocfs2/journal.c10
-rw-r--r--fs/ocfs2/localalloc.c8
-rw-r--r--fs/ocfs2/mmap.c10
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/refcounttree.c6
-rw-r--r--fs/ocfs2/stack_o2cb.c2
-rw-r--r--fs/ocfs2/suballoc.c7
-rw-r--r--fs/ocfs2/suballoc.h2
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/sysfile.c3
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/omfs/dir.c4
-rw-r--r--fs/omfs/file.c22
-rw-r--r--fs/open.c147
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/pipe.c20
-rw-r--r--fs/proc/Makefile3
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/base.c55
-rw-r--r--fs/proc/generic.c80
-rw-r--r--fs/proc/inode.c44
-rw-r--r--fs/proc/internal.h3
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/meminfo.c6
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/proc_devtree.c13
-rw-r--r--fs/proc/proc_net.c16
-rw-r--r--fs/proc/proc_sysctl.c30
-rw-r--r--fs/proc/task_mmu.c8
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/proc/vmcore.c35
-rw-r--r--fs/pstore/inode.c18
-rw-r--r--fs/pstore/platform.c35
-rw-r--r--fs/pstore/ram.c24
-rw-r--r--fs/pstore/ram_core.c9
-rw-r--r--fs/qnx4/dir.c2
-rw-r--r--fs/qnx6/dir.c2
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c19
-rw-r--r--fs/readdir.c2
-rw-r--r--fs/reiserfs/file.c5
-rw-r--r--fs/reiserfs/inode.c19
-rw-r--r--fs/reiserfs/ioctl.c2
-rw-r--r--fs/reiserfs/procfs.c2
-rw-r--r--fs/reiserfs/reiserfs.h1
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/select.c1
-rw-r--r--fs/seq_file.c44
-rw-r--r--fs/splice.c11
-rw-r--r--fs/squashfs/dir.c2
-rw-r--r--fs/stat.c29
-rw-r--r--fs/statfs.c9
-rw-r--r--fs/super.c8
-rw-r--r--fs/sync.c2
-rw-r--r--fs/sysfs/bin.c9
-rw-r--r--fs/sysfs/group.c42
-rw-r--r--fs/sysfs/mount.c2
-rw-r--r--fs/sysfs/symlink.c45
-rw-r--r--fs/sysfs/sysfs.h2
-rw-r--r--fs/sysv/dir.c2
-rw-r--r--fs/sysv/file.c5
-rw-r--r--fs/sysv/itree.c17
-rw-r--r--fs/timerfd.c85
-rw-r--r--fs/ubifs/debug.c8
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c3
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/lpt_commit.c14
-rw-r--r--fs/ubifs/orphan.c12
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/ubifs.h6
-rw-r--r--fs/udf/dir.c2
-rw-r--r--fs/udf/file.c6
-rw-r--r--fs/udf/inode.c86
-rw-r--r--fs/udf/namei.c4
-rw-r--r--fs/udf/super.c14
-rw-r--r--fs/udf/udf_i.h16
-rw-r--r--fs/udf/udf_sb.h5
-rw-r--r--fs/udf/udfdecl.h5
-rw-r--r--fs/ufs/Kconfig2
-rw-r--r--fs/ufs/dir.c2
-rw-r--r--fs/ufs/inode.c15
-rw-r--r--fs/utimes.c6
-rw-r--r--fs/xattr.c72
-rw-r--r--fs/xfs/Kconfig4
-rw-r--r--fs/xfs/xfs_alloc.c2
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_attr.c9
-rw-r--r--fs/xfs/xfs_bmap.c124
-rw-r--r--fs/xfs/xfs_buf.c34
-rw-r--r--fs/xfs/xfs_buf.h6
-rw-r--r--fs/xfs/xfs_buf_item.c177
-rw-r--r--fs/xfs/xfs_buf_item.h16
-rw-r--r--fs/xfs/xfs_dfrag.c12
-rw-r--r--fs/xfs/xfs_dir2_block.c6
-rw-r--r--fs/xfs/xfs_dquot.c12
-rw-r--r--fs/xfs/xfs_export.c4
-rw-r--r--fs/xfs/xfs_file.c4
-rw-r--r--fs/xfs/xfs_fsops.c4
-rw-r--r--fs/xfs/xfs_ialloc.c4
-rw-r--r--fs/xfs/xfs_inode.c6
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_inode_item.c16
-rw-r--r--fs/xfs/xfs_inode_item.h4
-rw-r--r--fs/xfs/xfs_ioctl.c6
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c86
-rw-r--r--fs/xfs/xfs_log.c10
-rw-r--r--fs/xfs/xfs_log_recover.c3
-rw-r--r--fs/xfs/xfs_mount.c14
-rw-r--r--fs/xfs/xfs_mount.h9
-rw-r--r--fs/xfs/xfs_qm.c7
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c32
-rw-r--r--fs/xfs/xfs_super.c29
-rw-r--r--fs/xfs/xfs_trace.h1
-rw-r--r--fs/xfs/xfs_trans.c376
-rw-r--r--fs/xfs/xfs_trans.h18
-rw-r--r--fs/xfs/xfs_trans_ail.c14
-rw-r--r--fs/xfs/xfs_trans_buf.c27
-rw-r--r--fs/xfs/xfs_trans_dquot.c10
-rw-r--r--fs/xfs/xfs_trans_inode.c41
-rw-r--r--fs/xfs/xfs_types.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c12
622 files changed, 31456 insertions, 9179 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 0a93dc1cb4ac..55abfd62654a 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -11,8 +11,7 @@ config 9P_FS
if 9P_FS
config 9P_FSCACHE
- bool "Enable 9P client caching support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Enable 9P client caching support"
depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
help
Choose Y here to enable persistent, read-only local
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 15b679166201..7af425f53bee 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -23,6 +23,7 @@
#include "acl.h"
#include "v9fs.h"
#include "v9fs_vfs.h"
+#include "fid.h"
static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
{
@@ -113,16 +114,12 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type)
}
-static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
{
int retval;
char *name;
size_t size;
void *buffer;
- struct inode *inode = dentry->d_inode;
-
- set_cached_acl(inode, type, acl);
-
if (!acl)
return 0;
@@ -144,17 +141,16 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
default:
BUG();
}
- retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
+ retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0);
err_free_out:
kfree(buffer);
return retval;
}
-int v9fs_acl_chmod(struct dentry *dentry)
+int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
{
int retval = 0;
struct posix_acl *acl;
- struct inode *inode = dentry->d_inode;
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
@@ -163,25 +159,30 @@ int v9fs_acl_chmod(struct dentry *dentry)
retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
if (retval)
return retval;
- retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, acl);
+ set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+ retval = v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl);
posix_acl_release(acl);
}
return retval;
}
-int v9fs_set_create_acl(struct dentry *dentry,
- struct posix_acl **dpacl, struct posix_acl **pacl)
+int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid,
+ struct posix_acl *dacl, struct posix_acl *acl)
{
- if (dentry) {
- v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, *dpacl);
- v9fs_set_acl(dentry, ACL_TYPE_ACCESS, *pacl);
- }
- posix_acl_release(*dpacl);
- posix_acl_release(*pacl);
- *dpacl = *pacl = NULL;
+ set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+ set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+ v9fs_set_acl(fid, ACL_TYPE_DEFAULT, dacl);
+ v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl);
return 0;
}
+void v9fs_put_acl(struct posix_acl *dacl,
+ struct posix_acl *acl)
+{
+ posix_acl_release(dacl);
+ posix_acl_release(acl);
+}
+
int v9fs_acl_mode(struct inode *dir, umode_t *modep,
struct posix_acl **dpacl, struct posix_acl **pacl)
{
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index 559556411965..e4f7e882272b 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -17,27 +17,33 @@
#ifdef CONFIG_9P_FS_POSIX_ACL
extern int v9fs_get_acl(struct inode *, struct p9_fid *);
extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type);
-extern int v9fs_acl_chmod(struct dentry *);
-extern int v9fs_set_create_acl(struct dentry *,
- struct posix_acl **, struct posix_acl **);
+extern int v9fs_acl_chmod(struct inode *, struct p9_fid *);
+extern int v9fs_set_create_acl(struct inode *, struct p9_fid *,
+ struct posix_acl *, struct posix_acl *);
extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
struct posix_acl **dpacl, struct posix_acl **pacl);
+extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
#else
#define v9fs_iop_get_acl NULL
static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
{
return 0;
}
-static inline int v9fs_acl_chmod(struct dentry *dentry)
+static inline int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
{
return 0;
}
-static inline int v9fs_set_create_acl(struct dentry *dentry,
- struct posix_acl **dpacl,
- struct posix_acl **pacl)
+static inline int v9fs_set_create_acl(struct inode *inode,
+ struct p9_fid *fid,
+ struct posix_acl *dacl,
+ struct posix_acl *acl)
{
return 0;
}
+static inline void v9fs_put_acl(struct posix_acl *dacl,
+ struct posix_acl *acl)
+{
+}
static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep,
struct posix_acl **dpacl,
struct posix_acl **pacl)
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index da8eefbe830d..d51ec9fafcc8 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -41,29 +41,16 @@
*
*/
-int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
+static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
{
- struct v9fs_dentry *dent;
-
- p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n",
- fid->fid, dentry->d_name.name);
-
- dent = dentry->d_fsdata;
- if (!dent) {
- dent = kmalloc(sizeof(struct v9fs_dentry), GFP_KERNEL);
- if (!dent)
- return -ENOMEM;
-
- spin_lock_init(&dent->lock);
- INIT_LIST_HEAD(&dent->fidlist);
- dentry->d_fsdata = dent;
- }
-
- spin_lock(&dent->lock);
- list_add(&fid->dlist, &dent->fidlist);
- spin_unlock(&dent->lock);
+ hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata);
+}
- return 0;
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
+{
+ spin_lock(&dentry->d_lock);
+ __add_fid(dentry, fid);
+ spin_unlock(&dentry->d_lock);
}
/**
@@ -74,24 +61,25 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
*
*/
-static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
+static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
{
- struct v9fs_dentry *dent;
struct p9_fid *fid, *ret;
p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
- dentry->d_name.name, dentry, uid, any);
- dent = (struct v9fs_dentry *) dentry->d_fsdata;
+ dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid),
+ any);
ret = NULL;
- if (dent) {
- spin_lock(&dent->lock);
- list_for_each_entry(fid, &dent->fidlist, dlist) {
- if (any || fid->uid == uid) {
+ /* we'll recheck under lock if there's anything to look in */
+ if (dentry->d_fsdata) {
+ struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata;
+ spin_lock(&dentry->d_lock);
+ hlist_for_each_entry(fid, h, dlist) {
+ if (any || uid_eq(fid->uid, uid)) {
ret = fid;
break;
}
}
- spin_unlock(&dent->lock);
+ spin_unlock(&dentry->d_lock);
}
return ret;
@@ -126,7 +114,7 @@ err_out:
}
static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
- uid_t uid, int any)
+ kuid_t uid, int any)
{
struct dentry *ds;
char **wnames, *uname;
@@ -214,8 +202,17 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
}
kfree(wnames);
fid_out:
- if (!IS_ERR(fid))
- v9fs_fid_add(dentry, fid);
+ if (!IS_ERR(fid)) {
+ spin_lock(&dentry->d_lock);
+ if (d_unhashed(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ p9_client_clunk(fid);
+ fid = ERR_PTR(-ENOENT);
+ } else {
+ __add_fid(dentry, fid);
+ spin_unlock(&dentry->d_lock);
+ }
+ }
err_out:
up_read(&v9ses->rename_sem);
return fid;
@@ -233,7 +230,7 @@ err_out:
struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
{
- uid_t uid;
+ kuid_t uid;
int any, access;
struct v9fs_session_info *v9ses;
@@ -253,7 +250,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
break;
default:
- uid = ~0;
+ uid = INVALID_UID;
any = 0;
break;
}
@@ -272,7 +269,7 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
return ret;
}
-static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
+static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid)
{
struct p9_fid *fid, *ret;
@@ -289,7 +286,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
int err;
struct p9_fid *fid;
- fid = v9fs_fid_clone_with_uid(dentry, 0);
+ fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID);
if (IS_ERR(fid))
goto error_out;
/*
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index bb0b6e7f58fc..2b6787fcb626 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -23,28 +23,8 @@
#define FS_9P_FID_H
#include <linux/list.h>
-/**
- * struct v9fs_dentry - 9p private data stored in dentry d_fsdata
- * @lock: protects the fidlist
- * @fidlist: list of FIDs currently associated with this dentry
- *
- * This structure defines the 9p private data associated with
- * a particular dentry. In particular, this private data is used
- * to lookup which 9P FID handle should be used for a particular VFS
- * operation. FID handles are associated with dentries instead of
- * inodes in order to more closely map functionality to the Plan 9
- * expected behavior for FID reclaimation and tracking.
- *
- * See Also: Mapping FIDs to Linux VFS model in
- * Design and Implementation of the Linux 9P File System documentation
- */
-struct v9fs_dentry {
- spinlock_t lock; /* protect fidlist */
- struct list_head fidlist;
-};
-
struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
-int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
#endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index d934f04e7736..58e6cbce4156 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -161,7 +161,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
ret = r;
continue;
}
- v9ses->dfltuid = option;
+ v9ses->dfltuid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(v9ses->dfltuid)) {
+ p9_debug(P9_DEBUG_ERROR,
+ "uid field, but not a uid?\n");
+ ret = -EINVAL;
+ continue;
+ }
break;
case Opt_dfltgid:
r = match_int(&args[0], &option);
@@ -171,7 +177,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
ret = r;
continue;
}
- v9ses->dfltgid = option;
+ v9ses->dfltgid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(v9ses->dfltgid)) {
+ p9_debug(P9_DEBUG_ERROR,
+ "gid field, but not a gid?\n");
+ ret = -EINVAL;
+ continue;
+ }
break;
case Opt_afid:
r = match_int(&args[0], &option);
@@ -248,8 +260,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
else if (strcmp(s, "client") == 0) {
v9ses->flags |= V9FS_ACCESS_CLIENT;
} else {
+ uid_t uid;
v9ses->flags |= V9FS_ACCESS_SINGLE;
- v9ses->uid = simple_strtoul(s, &e, 10);
+ uid = simple_strtoul(s, &e, 10);
if (*e != '\0') {
ret = -EINVAL;
pr_info("Unknown access argument %s\n",
@@ -257,6 +270,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
kfree(s);
goto free_and_return;
}
+ v9ses->uid = make_kuid(current_user_ns(), uid);
+ if (!uid_valid(v9ses->uid)) {
+ ret = -EINVAL;
+ pr_info("Uknown uid %s\n", s);
+ kfree(s);
+ goto free_and_return;
+ }
}
kfree(s);
@@ -319,7 +339,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
list_add(&v9ses->slist, &v9fs_sessionlist);
spin_unlock(&v9fs_sessionlist_lock);
- v9ses->uid = ~0;
+ v9ses->uid = INVALID_UID;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
@@ -364,7 +384,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->flags &= ~V9FS_ACCESS_MASK;
v9ses->flags |= V9FS_ACCESS_ANY;
- v9ses->uid = ~0;
+ v9ses->uid = INVALID_UID;
}
if (!v9fs_proto_dotl(v9ses) ||
!((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
@@ -375,7 +395,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->flags &= ~V9FS_ACL_MASK;
}
- fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
+ fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID,
v9ses->aname);
if (IS_ERR(fid)) {
retval = PTR_ERR(fid);
@@ -387,7 +407,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE)
fid->uid = v9ses->uid;
else
- fid->uid = ~0;
+ fid->uid = INVALID_UID;
#ifdef CONFIG_9P_FSCACHE
/* register the session for caching */
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 34c59f14a1c9..a8e127c89627 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -109,9 +109,9 @@ struct v9fs_session_info {
char *uname; /* user name to mount as */
char *aname; /* name of remote hierarchy being mounted */
unsigned int maxdata; /* max data for client interface */
- unsigned int dfltuid; /* default uid/muid for legacy support */
- unsigned int dfltgid; /* default gid for legacy support */
- u32 uid; /* if ACCESS_SINGLE, the uid that has access */
+ kuid_t dfltuid; /* default uid/muid for legacy support */
+ kgid_t dfltgid; /* default gid for legacy support */
+ kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
struct backing_dev_info bdi;
@@ -165,8 +165,8 @@ extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
#define V9FS_PORT 564
#define V9FS_DEFUSER "nobody"
#define V9FS_DEFANAME ""
-#define V9FS_DEFUID (-2)
-#define V9FS_DEFGID (-2)
+#define V9FS_DEFUID KUIDT_INIT(-2)
+#define V9FS_DEFGID KGIDT_INIT(-2)
static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
{
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 64600b5d0522..f039b104a98e 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -83,21 +83,12 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
static void v9fs_dentry_release(struct dentry *dentry)
{
- struct v9fs_dentry *dent;
- struct p9_fid *temp, *current_fid;
-
+ struct hlist_node *p, *n;
p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
dentry->d_name.name, dentry);
- dent = dentry->d_fsdata;
- if (dent) {
- list_for_each_entry_safe(current_fid, temp, &dent->fidlist,
- dlist) {
- p9_client_clunk(current_fid);
- }
-
- kfree(dent);
- dentry->d_fsdata = NULL;
- }
+ hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
+ p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
+ dentry->d_fsdata = NULL;
}
static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
@@ -137,6 +128,7 @@ out_valid:
const struct dentry_operations v9fs_cached_dentry_operations = {
.d_revalidate = v9fs_lookup_revalidate,
+ .d_weak_revalidate = v9fs_lookup_revalidate,
.d_delete = v9fs_cached_dentry_delete,
.d_release = v9fs_dentry_release,
};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index ff911e779651..be1e34adc3c6 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -52,10 +52,9 @@
*/
struct p9_rdir {
- struct mutex mutex;
int head;
int tail;
- uint8_t *buf;
+ uint8_t buf[];
};
/**
@@ -93,33 +92,12 @@ static void p9stat_init(struct p9_wstat *stbuf)
*
*/
-static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
+static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
{
- struct p9_rdir *rdir;
- struct p9_fid *fid;
- int err = 0;
-
- fid = filp->private_data;
- if (!fid->rdir) {
- rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
-
- if (rdir == NULL) {
- err = -ENOMEM;
- goto exit;
- }
- spin_lock(&filp->f_dentry->d_lock);
- if (!fid->rdir) {
- rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir);
- mutex_init(&rdir->mutex);
- rdir->head = rdir->tail = 0;
- fid->rdir = (void *) rdir;
- rdir = NULL;
- }
- spin_unlock(&filp->f_dentry->d_lock);
- kfree(rdir);
- }
-exit:
- return err;
+ struct p9_fid *fid = filp->private_data;
+ if (!fid->rdir)
+ fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
+ return fid->rdir;
}
/**
@@ -145,20 +123,16 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
buflen = fid->clnt->msize - P9_IOHDRSZ;
- err = v9fs_alloc_rdir_buf(filp, buflen);
- if (err)
- goto exit;
- rdir = (struct p9_rdir *) fid->rdir;
+ rdir = v9fs_alloc_rdir_buf(filp, buflen);
+ if (!rdir)
+ return -ENOMEM;
- err = mutex_lock_interruptible(&rdir->mutex);
- if (err)
- return err;
- while (err == 0) {
+ while (1) {
if (rdir->tail == rdir->head) {
err = v9fs_file_readn(filp, rdir->buf, NULL,
buflen, filp->f_pos);
if (err <= 0)
- goto unlock_and_exit;
+ return err;
rdir->head = 0;
rdir->tail = err;
@@ -169,9 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
rdir->tail - rdir->head, &st);
if (err) {
p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
- err = -EIO;
p9stat_free(&st);
- goto unlock_and_exit;
+ return -EIO;
}
reclen = st.size+2;
@@ -180,19 +153,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
p9stat_free(&st);
- if (over) {
- err = 0;
- goto unlock_and_exit;
- }
+ if (over)
+ return 0;
+
rdir->head += reclen;
filp->f_pos += reclen;
}
}
-
-unlock_and_exit:
- mutex_unlock(&rdir->mutex);
-exit:
- return err;
}
/**
@@ -218,21 +185,16 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
buflen = fid->clnt->msize - P9_READDIRHDRSZ;
- err = v9fs_alloc_rdir_buf(filp, buflen);
- if (err)
- goto exit;
- rdir = (struct p9_rdir *) fid->rdir;
+ rdir = v9fs_alloc_rdir_buf(filp, buflen);
+ if (!rdir)
+ return -ENOMEM;
- err = mutex_lock_interruptible(&rdir->mutex);
- if (err)
- return err;
-
- while (err == 0) {
+ while (1) {
if (rdir->tail == rdir->head) {
err = p9_client_readdir(fid, rdir->buf, buflen,
filp->f_pos);
if (err <= 0)
- goto unlock_and_exit;
+ return err;
rdir->head = 0;
rdir->tail = err;
@@ -245,8 +207,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
&curdirent);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
- err = -EIO;
- goto unlock_and_exit;
+ return -EIO;
}
/* d_off in dirent structure tracks the offset into
@@ -261,20 +222,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
curdirent.d_type);
oldoffset = curdirent.d_off;
- if (over) {
- err = 0;
- goto unlock_and_exit;
- }
+ if (over)
+ return 0;
filp->f_pos = curdirent.d_off;
rdir->head += err;
}
}
-
-unlock_and_exit:
- mutex_unlock(&rdir->mutex);
-exit:
- return err;
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index c2483e97beee..d384a8b77ee8 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -80,10 +80,6 @@ int v9fs_file_open(struct inode *inode, struct file *file)
p9_client_clunk(fid);
return err;
}
- if (file->f_flags & O_TRUNC) {
- i_size_write(inode, 0);
- inode->i_blocks = 0;
- }
if ((file->f_flags & O_APPEND) &&
(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
generic_file_llseek(file, 0, SEEK_END);
@@ -133,7 +129,7 @@ out_error:
static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
{
int res = 0;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
@@ -302,7 +298,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int ret = -ENOLCK;
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
@@ -338,7 +334,7 @@ out_err:
static int v9fs_file_flock_dotl(struct file *filp, int cmd,
struct file_lock *fl)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int ret = -ENOLCK;
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
@@ -529,7 +525,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
if (!count)
goto out;
- retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode,
+ retval = v9fs_file_write_internal(file_inode(filp),
filp->private_data,
data, count, &origin, 1);
/* update offset on successful write */
@@ -604,7 +600,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct v9fs_inode *v9inode;
struct page *page = vmf->page;
struct file *filp = vma->vm_file;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
@@ -620,6 +616,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
lock_page(page);
if (page->mapping != inode->i_mapping)
goto out_unlock;
+ wait_for_stable_page(page);
return VM_FAULT_LOCKED;
out_unlock:
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 890bed538f9b..d86edc8d3fd0 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -192,9 +192,6 @@ int v9fs_uflags2omode(int uflags, int extended)
break;
}
- if (uflags & O_TRUNC)
- ret |= P9_OTRUNC;
-
if (extended) {
if (uflags & O_EXCL)
ret |= P9_OEXCL;
@@ -228,9 +225,9 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
wstat->uid = NULL;
wstat->gid = NULL;
wstat->muid = NULL;
- wstat->n_uid = ~0;
- wstat->n_gid = ~0;
- wstat->n_muid = ~0;
+ wstat->n_uid = INVALID_UID;
+ wstat->n_gid = INVALID_GID;
+ wstat->n_muid = INVALID_UID;
wstat->extension = NULL;
}
@@ -695,9 +692,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
"inode creation failed %d\n", err);
goto error;
}
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
+ v9fs_fid_add(dentry, fid);
d_instantiate(dentry, inode);
}
return ofid;
@@ -793,7 +788,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
struct p9_fid *dfid, *fid;
struct inode *inode;
char *name;
- int result = 0;
p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n",
dir, dentry->d_name.name, dentry, flags);
@@ -811,13 +805,11 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
name = (char *) dentry->d_name.name;
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
- result = PTR_ERR(fid);
- if (result == -ENOENT) {
- inode = NULL;
- goto inst_out;
+ if (fid == ERR_PTR(-ENOENT)) {
+ d_add(dentry, NULL);
+ return NULL;
}
-
- return ERR_PTR(result);
+ return ERR_CAST(fid);
}
/*
* Make sure we don't use a wrong inode due to parallel
@@ -829,14 +821,9 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
else
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
- result = PTR_ERR(inode);
- inode = NULL;
- goto error;
+ p9_client_clunk(fid);
+ return ERR_CAST(inode);
}
- result = v9fs_fid_add(dentry, fid);
- if (result < 0)
- goto error_iput;
-inst_out:
/*
* If we had a rename on the server and a parallel lookup
* for the new name, then make sure we instantiate with
@@ -845,15 +832,13 @@ inst_out:
* k/b.
*/
res = d_materialise_unique(dentry, inode);
- if (!IS_ERR(res))
- return res;
- result = PTR_ERR(res);
-error_iput:
- iput(inode);
-error:
- p9_client_clunk(fid);
-
- return ERR_PTR(result);
+ if (!res)
+ v9fs_fid_add(dentry, fid);
+ else if (!IS_ERR(res))
+ v9fs_fid_add(res, fid);
+ else
+ p9_client_clunk(fid);
+ return res;
}
static int
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 40895546e103..53687bbf2296 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -57,7 +57,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
* group of the new file system object.
*/
-static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
{
BUG_ON(dir_inode == NULL);
@@ -186,7 +186,6 @@ static int v9fs_mapped_dotl_flags(int flags)
{ O_CREAT, P9_DOTL_CREATE },
{ O_EXCL, P9_DOTL_EXCL },
{ O_NOCTTY, P9_DOTL_NOCTTY },
- { O_TRUNC, P9_DOTL_TRUNC },
{ O_APPEND, P9_DOTL_APPEND },
{ O_NONBLOCK, P9_DOTL_NONBLOCK },
{ O_DSYNC, P9_DOTL_DSYNC },
@@ -246,7 +245,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
int *opened)
{
int err = 0;
- gid_t gid;
+ kgid_t gid;
umode_t mode;
char *name = NULL;
struct p9_qid qid;
@@ -268,8 +267,14 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
}
/* Only creates */
- if (!(flags & O_CREAT) || dentry->d_inode)
- return finish_no_open(file, res);
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, res);
+ else if (dentry->d_inode) {
+ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+ return -EEXIST;
+ else
+ return finish_no_open(file, res);
+ }
v9ses = v9fs_inode2v9ses(dir);
@@ -325,13 +330,11 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
goto error;
}
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
- d_instantiate(dentry, inode);
-
/* Now set the ACL based on the default value */
- v9fs_set_create_acl(dentry, &dacl, &pacl);
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
+
+ v9fs_fid_add(dentry, fid);
+ d_instantiate(dentry, inode);
v9inode = V9FS_I(inode);
mutex_lock(&v9inode->v_mutex);
@@ -364,6 +367,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
#endif
*opened |= FILE_CREATED;
out:
+ v9fs_put_acl(dacl, pacl);
dput(res);
return err;
@@ -373,7 +377,6 @@ error:
err_clunk_old_fid:
if (ofid)
p9_client_clunk(ofid);
- v9fs_set_create_acl(NULL, &dacl, &pacl);
goto out;
}
@@ -391,7 +394,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
int err;
struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL, *dfid = NULL;
- gid_t gid;
+ kgid_t gid;
char *name;
umode_t mode;
struct inode *inode;
@@ -430,17 +433,17 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
if (err < 0)
goto error;
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
+ fid = NULL;
+ goto error;
+ }
+
/* instantiate inode and assign the unopened fid to the dentry */
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
- fid = NULL;
- goto error;
- }
-
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
@@ -448,11 +451,11 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
err);
goto error;
}
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
+ v9fs_fid_add(dentry, fid);
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
d_instantiate(dentry, inode);
fid = NULL;
+ err = 0;
} else {
/*
* Not in cached mode. No need to populate
@@ -464,16 +467,15 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
err = PTR_ERR(inode);
goto error;
}
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
d_instantiate(dentry, inode);
}
- /* Now set the ACL based on the default value */
- v9fs_set_create_acl(dentry, &dacl, &pacl);
inc_nlink(dir);
v9fs_invalidate_inode_attr(dir);
error:
if (fid)
p9_client_clunk(fid);
- v9fs_set_create_acl(NULL, &dacl, &pacl);
+ v9fs_put_acl(dacl, pacl);
return err;
}
@@ -567,10 +569,11 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
struct p9_iattr_dotl p9attr;
+ struct inode *inode = dentry->d_inode;
p9_debug(P9_DEBUG_VFS, "\n");
- retval = inode_change_ok(dentry->d_inode, iattr);
+ retval = inode_change_ok(inode, iattr);
if (retval)
return retval;
@@ -591,23 +594,23 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
return PTR_ERR(fid);
/* Write all dirty data */
- if (S_ISREG(dentry->d_inode->i_mode))
- filemap_write_and_wait(dentry->d_inode->i_mapping);
+ if (S_ISREG(inode->i_mode))
+ filemap_write_and_wait(inode->i_mapping);
retval = p9_client_setattr(fid, &p9attr);
if (retval < 0)
return retval;
if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(dentry->d_inode))
- truncate_setsize(dentry->d_inode, iattr->ia_size);
+ iattr->ia_size != i_size_read(inode))
+ truncate_setsize(inode, iattr->ia_size);
- v9fs_invalidate_inode_attr(dentry->d_inode);
- setattr_copy(dentry->d_inode, iattr);
- mark_inode_dirty(dentry->d_inode);
+ v9fs_invalidate_inode_attr(inode);
+ setattr_copy(inode, iattr);
+ mark_inode_dirty(inode);
if (iattr->ia_valid & ATTR_MODE) {
/* We also want to update ACL when we update mode bits */
- retval = v9fs_acl_chmod(dentry);
+ retval = v9fs_acl_chmod(inode, fid);
if (retval < 0)
return retval;
}
@@ -692,7 +695,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
const char *symname)
{
int err;
- gid_t gid;
+ kgid_t gid;
char *name;
struct p9_qid qid;
struct inode *inode;
@@ -741,11 +744,10 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
err);
goto error;
}
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
+ v9fs_fid_add(dentry, fid);
d_instantiate(dentry, inode);
fid = NULL;
+ err = 0;
} else {
/* Not in cached mode. No need to populate inode with stat */
inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
@@ -832,7 +834,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dev_t rdev)
{
int err;
- gid_t gid;
+ kgid_t gid;
char *name;
umode_t mode;
struct v9fs_session_info *v9ses;
@@ -875,17 +877,17 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
goto error;
v9fs_invalidate_inode_attr(dir);
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
+ fid = NULL;
+ goto error;
+ }
+
/* instantiate inode and assign the unopened fid to the dentry */
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
- fid = NULL;
- goto error;
- }
-
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
@@ -893,11 +895,11 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
err);
goto error;
}
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
+ v9fs_fid_add(dentry, fid);
d_instantiate(dentry, inode);
fid = NULL;
+ err = 0;
} else {
/*
* Not in cached mode. No need to populate inode with stat.
@@ -908,14 +910,13 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
err = PTR_ERR(inode);
goto error;
}
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
d_instantiate(dentry, inode);
}
- /* Now set the ACL based on the default value */
- v9fs_set_create_acl(dentry, &dacl, &pacl);
error:
if (fid)
p9_client_clunk(fid);
- v9fs_set_create_acl(NULL, &dacl, &pacl);
+ v9fs_put_acl(dacl, pacl);
return err;
}
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 137d50396898..91dad63e5a2d 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -363,5 +363,5 @@ struct file_system_type v9fs_fs_type = {
.mount = v9fs_mount,
.kill_sb = v9fs_kill_super,
.owner = THIS_MODULE,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT,
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
};
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 29653b70a9c3..c45e016b190f 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -111,19 +111,26 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
int v9fs_xattr_set(struct dentry *dentry, const char *name,
const void *value, size_t value_len, int flags)
{
+ struct p9_fid *fid = v9fs_fid_lookup(dentry);
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
+ return v9fs_fid_xattr_set(fid, name, value, value_len, flags);
+}
+
+int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
+ const void *value, size_t value_len, int flags)
+{
u64 offset = 0;
int retval, msize, write_count;
- struct p9_fid *fid = NULL;
p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
name, value_len, flags);
- fid = v9fs_fid_clone(dentry);
- if (IS_ERR(fid)) {
- retval = PTR_ERR(fid);
- fid = NULL;
- goto error;
- }
+ /* Clone it */
+ fid = p9_client_walk(fid, 0, NULL, 1);
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
+
/*
* On success fid points to xattr
*/
@@ -131,7 +138,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
if (retval < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
retval);
- goto error;
+ p9_client_clunk(fid);
+ return retval;
}
msize = fid->clnt->msize;
while (value_len) {
@@ -144,17 +152,12 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
if (write_count < 0) {
/* error in xattr write */
retval = write_count;
- goto error;
+ break;
}
offset += write_count;
value_len -= write_count;
}
- /* Total read xattr bytes */
- retval = offset;
-error:
- if (fid)
- retval = p9_client_clunk(fid);
- return retval;
+ return p9_client_clunk(fid);
}
ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index eaa837c53bd5..eec348a3df71 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -27,6 +27,8 @@ extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
void *, size_t);
extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
void *, size_t);
+extern int v9fs_fid_xattr_set(struct p9_fid *, const char *,
+ const void *, size_t, int);
extern int v9fs_xattr_set(struct dentry *, const char *,
const void *, size_t, int);
extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/Kconfig b/fs/Kconfig
index eaff24a19502..780725a463b1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -68,16 +68,6 @@ source "fs/quota/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
-config CUSE
- tristate "Character device in Userspace support"
- depends on FUSE_FS
- help
- This FUSE extension allows character devices to be
- implemented in userspace.
-
- If you want to develop or use userspace character device
- based on CUSE, answer Y or M.
-
config GENERIC_ACL
bool
select FS_POSIX_ACL
@@ -220,6 +210,7 @@ source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
+source "fs/f2fs/Kconfig"
endif # MISC_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 1d7af79288a0..9d53192236fc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_F2FS_FS) += f2fs/
obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a74605..c5a7787dd5e9 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,6 @@
config ADFS_FS
- tristate "ADFS file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "ADFS file system support"
+ depends on BLOCK
help
The Acorn Disc Filing System is the standard file system of the
RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index b3be2e7c5643..9cf874ce8336 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -19,7 +19,7 @@ static DEFINE_RWLOCK(adfs_dir_lock);
static int
adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct object_info obj;
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e9bad5093a3f..5f95d1ed9c6d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -45,6 +45,14 @@ static int adfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, adfs_get_block);
}
+static void adfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size)
+ truncate_pagecache(inode, to, inode->i_size);
+}
+
static int adfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -55,11 +63,8 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ adfs_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
index cfad9afb4762..a04d9e848d05 100644
--- a/fs/affs/Kconfig
+++ b/fs/affs/Kconfig
@@ -1,6 +1,6 @@
config AFFS_FS
- tristate "Amiga FFS file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "Amiga FFS file system support"
+ depends on BLOCK
help
The Fast File System (FFS) is the common file system used on hard
disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index eb82ee53ee0b..d9a43674cb94 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -125,9 +125,8 @@ static void
affs_fix_dcache(struct inode *inode, u32 entry_ino)
{
struct dentry *dentry;
- struct hlist_node *p;
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
if (entry_ino == (u32)(long)dentry->d_fsdata) {
dentry->d_fsdata = (void *)inode->i_ino;
break;
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 8ca8f3a55599..fd11a6d608ee 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -42,7 +42,7 @@ const struct inode_operations affs_dir_inode_operations = {
static int
affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
struct buffer_head *dir_bh;
struct buffer_head *fh_bh;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 2f4c935cb327..af3261b78102 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -39,7 +39,6 @@ const struct file_operations affs_file_operations = {
};
const struct inode_operations affs_file_inode_operations = {
- .truncate = affs_truncate,
.setattr = affs_notify_change,
};
@@ -402,6 +401,16 @@ static int affs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, affs_get_block);
}
+static void affs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ affs_truncate(inode);
+ }
+}
+
static int affs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -412,11 +421,8 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ affs_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 15c484268229..0e092d08680e 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -237,9 +237,12 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+
+ truncate_setsize(inode, attr->ia_size);
+ affs_truncate(inode);
}
setattr_copy(inode, attr);
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 8f975f25b486..ebba3b18e5da 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -1,6 +1,6 @@
config AFS_FS
- tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
+ tristate "Andrew File System support (AFS)"
+ depends on INET
select AF_RXRPC
select DNS_RESOLVER
help
@@ -22,8 +22,7 @@ config AFS_DEBUG
If unsure, say N.
config AFS_FSCACHE
- bool "Provide AFS client caching support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Provide AFS client caching support"
depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
help
Say Y here if you want AFS data to be cached locally on disk through
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index c548aa346f0d..3c462ff6db63 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -119,8 +119,8 @@ struct afs_file_status {
u64 size; /* file size */
afs_dataversion_t data_version; /* current data version */
u32 author; /* author ID */
- u32 owner; /* owner ID */
- u32 group; /* group ID */
+ kuid_t owner; /* owner ID */
+ kgid_t group; /* group ID */
afs_access_t caller_access; /* access rights for authenticated caller */
afs_access_t anon_access; /* access rights for unauthenticated caller */
umode_t mode; /* UNIX mode */
@@ -133,13 +133,6 @@ struct afs_file_status {
/*
* AFS file status change request
*/
-struct afs_store_status {
- u32 mask; /* which bits of the struct are set */
- u32 mtime_client; /* last time client changed data */
- u32 owner; /* owner ID */
- u32 group; /* group ID */
- umode_t mode; /* UNIX mode */
-};
#define AFS_SET_MTIME 0x01 /* set the mtime */
#define AFS_SET_OWNER 0x02 /* set the owner ID */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index db477906ba4f..7a465ed04444 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -393,12 +393,12 @@ static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
int ret;
_enter("{%Ld,{%lu}}",
- file->f_pos, file->f_path.dentry->d_inode->i_ino);
+ file->f_pos, file_inode(file)->i_ino);
ASSERT(file->private_data != NULL);
fpos = file->f_pos;
- ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos,
+ ret = afs_dir_iterate(file_inode(file), &fpos,
cookie, filldir, file->private_data);
file->f_pos = fpos;
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 757d664575dd..2497bf306c70 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -514,7 +514,7 @@ error:
*/
int afs_lock(struct file *file, int cmd, struct file_lock *fl)
{
- struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
_enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
vnode->fid.vid, vnode->fid.vnode, cmd,
@@ -537,7 +537,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
*/
int afs_flock(struct file *file, int cmd, struct file_lock *fl)
{
- struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
_enter("{%x:%u},%d,{t=%x,fl=%x}",
vnode->fid.vid, vnode->fid.vnode, cmd,
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index b960ff05ea0b..c2e930ec2888 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -42,6 +42,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
umode_t mode;
u64 data_version, size;
u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+ kuid_t owner;
+ kgid_t group;
#define EXTRACT(DST) \
do { \
@@ -56,7 +58,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
size = ntohl(*bp++);
data_version = ntohl(*bp++);
EXTRACT(status->author);
- EXTRACT(status->owner);
+ owner = make_kuid(&init_user_ns, ntohl(*bp++));
+ changed |= !uid_eq(owner, status->owner);
+ status->owner = owner;
EXTRACT(status->caller_access); /* call ticket dependent */
EXTRACT(status->anon_access);
EXTRACT(status->mode);
@@ -65,7 +69,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
bp++; /* seg size */
status->mtime_client = ntohl(*bp++);
status->mtime_server = ntohl(*bp++);
- EXTRACT(status->group);
+ group = make_kgid(&init_user_ns, ntohl(*bp++));
+ changed |= !gid_eq(group, status->group);
+ status->group = group;
bp++; /* sync counter */
data_version |= (u64) ntohl(*bp++) << 32;
EXTRACT(status->lock_count);
@@ -181,12 +187,12 @@ static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr)
if (attr->ia_valid & ATTR_UID) {
mask |= AFS_SET_OWNER;
- owner = attr->ia_uid;
+ owner = from_kuid(&init_user_ns, attr->ia_uid);
}
if (attr->ia_valid & ATTR_GID) {
mask |= AFS_SET_GROUP;
- group = attr->ia_gid;
+ group = from_kgid(&init_user_ns, attr->ia_gid);
}
if (attr->ia_valid & ATTR_MODE) {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 95cffd38239f..789bc253b5f6 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -69,7 +69,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
set_nlink(inode, vnode->status.nlink);
inode->i_uid = vnode->status.owner;
- inode->i_gid = 0;
+ inode->i_gid = GLOBAL_ROOT_GID;
inode->i_size = vnode->status.size;
inode->i_ctime.tv_sec = vnode->status.mtime_server;
inode->i_ctime.tv_nsec = 0;
@@ -175,8 +175,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
inode->i_op = &afs_autocell_inode_operations;
set_nlink(inode, 2);
- inode->i_uid = 0;
- inode->i_gid = 0;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
inode->i_ctime.tv_sec = get_seconds();
inode->i_ctime.tv_nsec = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 43165009428d..7c31ec399575 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -24,6 +24,8 @@
#include <linux/parser.h>
#include <linux/statfs.h>
#include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
#include "internal.h"
#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
@@ -363,6 +365,10 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
memset(&params, 0, sizeof(params));
+ ret = -EINVAL;
+ if (current->nsproxy->net_ns != &init_net)
+ goto error;
+
/* parse the options and device name */
if (options) {
ret = afs_parse_options(&params, options, &dev_name);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9aa52d93c73c..7e03eadb40c0 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -120,7 +120,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct afs_writeback *candidate, *wb;
- struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
struct page *page;
struct key *key = file->private_data;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
@@ -245,7 +245,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
loff_t i_size, maybe_i_size;
_enter("{%x:%u},{%lx}",
@@ -627,8 +627,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct dentry *dentry = iocb->ki_filp->f_path.dentry;
- struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
ssize_t result;
size_t count = iov_length(iov, nr_segs);
diff --git a/fs/aio.c b/fs/aio.c
index 71f613cf4a85..3f941f2a3059 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx)
struct aio_ring *ring;
struct aio_ring_info *info = &ctx->ring_info;
unsigned nr_events = ctx->max_reqs;
- unsigned long size;
+ unsigned long size, populate;
int nr_pages;
/* Compensate for the ring buffer's head/tail overlap entry */
@@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx)
down_write(&ctx->mm->mmap_sem);
info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
PROT_READ|PROT_WRITE,
- MAP_ANONYMOUS|MAP_PRIVATE, 0);
+ MAP_ANONYMOUS|MAP_PRIVATE, 0,
+ &populate);
if (IS_ERR((void *)info->mmap_base)) {
up_write(&ctx->mm->mmap_sem);
info->mmap_size = 0;
@@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx)
aio_free_ring(ctx);
return -EAGAIN;
}
+ if (populate)
+ mm_populate(info->mmap_base, populate);
ctx->user_id = info->mmap_base;
@@ -588,11 +591,10 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
struct mm_struct *mm = current->mm;
struct kioctx *ctx, *ret = NULL;
- struct hlist_node *n;
rcu_read_lock();
- hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
+ hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
/*
* RCU protects us against accessing freed memory but
* we have to be careful not to get a reference when the
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 28d39fb84ae3..47a65df8c871 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -131,7 +131,6 @@ struct file *anon_inode_getfile(const char *name,
struct qstr this;
struct path path;
struct file *file;
- int error;
if (IS_ERR(anon_inode_inode))
return ERR_PTR(-ENODEV);
@@ -143,7 +142,7 @@ struct file *anon_inode_getfile(const char *name,
* Link the inode to a directory entry by creating a unique name
* using the inode sequence number.
*/
- error = -ENOMEM;
+ file = ERR_PTR(-ENOMEM);
this.name = name;
this.len = strlen(name);
this.hash = 0;
@@ -160,15 +159,12 @@ struct file *anon_inode_getfile(const char *name,
d_instantiate(path.dentry, anon_inode_inode);
- error = -ENFILE;
file = alloc_file(&path, OPEN_FMODE(flags), fops);
- if (!file)
+ if (IS_ERR(file))
goto err_dput;
file->f_mapping = anon_inode_inode->i_mapping;
- file->f_pos = 0;
file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
- file->f_version = 0;
file->private_data = priv;
return file;
@@ -177,7 +173,7 @@ err_dput:
path_put(&path);
err_module:
module_put(fops->owner);
- return ERR_PTR(error);
+ return file;
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index b785e7707959..3f1128b37e46 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -273,7 +273,7 @@ static inline int autofs_prepare_pipe(struct file *pipe)
{
if (!pipe->f_op || !pipe->f_op->write)
return -EINVAL;
- if (!S_ISFIFO(pipe->f_dentry->d_inode->i_mode))
+ if (!S_ISFIFO(file_inode(pipe)->i_mode))
return -EINVAL;
/* We want a packet pipe */
pipe->f_flags |= O_DIRECT;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 9f68a37bb2b2..743c7c2c949d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -159,7 +159,7 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
struct inode *inode;
if (f) {
- inode = f->f_path.dentry->d_inode;
+ inode = file_inode(f);
sbi = autofs4_sbi(inode->i_sb);
}
return sbi;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c93447604da8..9bd16255dd9c 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -383,8 +383,10 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
goto done;
}
} else {
- if (!simple_empty(dentry))
+ if (!simple_empty(dentry)) {
+ spin_unlock(&sbi->fs_lock);
goto done;
+ }
}
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
@@ -587,7 +589,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
/* This allows root to remove symlinks */
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
if (atomic_dec_and_test(&ino->count)) {
p_ino = autofs4_dentry_ino(dentry->d_parent);
@@ -874,7 +876,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
static long autofs4_root_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
}
@@ -882,7 +884,7 @@ static long autofs4_root_ioctl(struct file *filp,
static long autofs4_root_compat_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int ret;
if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 03bc1d347d8e..3db70dae40d3 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -42,10 +42,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
while (wq) {
nwq = wq->next;
wq->status = -ENOENT; /* Magic is gone - report failure */
- if (wq->name.name) {
- kfree(wq->name.name);
- wq->name.name = NULL;
- }
+ kfree(wq->name.name);
+ wq->name.name = NULL;
wq->wait_ctr--;
wake_up_interruptible(&wq->queue);
wq = nwq;
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
index 7835d30f211f..edc5cc2aefad 100644
--- a/fs/befs/Kconfig
+++ b/fs/befs/Kconfig
@@ -1,6 +1,6 @@
config BEFS_FS
- tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "BeOS file system (BeFS) support (read only)"
+ depends on BLOCK
select NLS
help
The BeOS File System (BeFS) is the native file system of Be, Inc's
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 2b3bda8d5e68..c8f4e25eb9e2 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -213,7 +213,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
static int
befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
befs_off_t value;
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
index c2336c62024f..3728a6479c64 100644
--- a/fs/bfs/Kconfig
+++ b/fs/bfs/Kconfig
@@ -1,6 +1,6 @@
config BFS_FS
- tristate "BFS file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "BFS file system support"
+ depends on BLOCK
help
Boot File System (BFS) is a file system used under SCO UnixWare to
allow the bootloader access to the kernel image and other important
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 2785ef91191a..3f422f6bb5ca 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -28,7 +28,7 @@ static struct buffer_head *bfs_find_entry(struct inode *dir,
static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
{
- struct inode *dir = f->f_path.dentry->d_inode;
+ struct inode *dir = file_inode(f);
struct buffer_head *bh;
struct bfs_dirent *de;
struct bfs_sb_info *info = BFS_SB(dir->i_sb);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index f20e8a71062f..ad3ea1497cc3 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -161,6 +161,14 @@ static int bfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, bfs_get_block);
}
+static void bfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size)
+ truncate_pagecache(inode, to, inode->i_size);
+}
+
static int bfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -169,11 +177,8 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep,
bfs_get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ bfs_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 6043567b95c2..bbc8f8827eac 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -214,7 +214,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
N_TRSIZE(ex) || N_DRSIZE(ex) ||
- i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+ i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
return -ENOEXEC;
}
@@ -367,7 +367,7 @@ static int load_aout_library(struct file *file)
int retval;
struct exec ex;
- inode = file->f_path.dentry->d_inode;
+ inode = file_inode(file);
retval = -ENOEXEC;
error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0c42cdbabecf..3939829f6c5c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -33,6 +33,7 @@
#include <linux/elf.h>
#include <linux/utsname.h>
#include <linux/coredump.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -321,6 +322,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return 0;
}
+#ifndef elf_map
+
static unsigned long elf_map(struct file *filep, unsigned long addr,
struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
@@ -355,6 +358,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
+#endif /* !elf_map */
+
static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
@@ -1140,7 +1145,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
/* By default, dump shared memory if mapped from an anonymous file. */
if (vma->vm_flags & VM_SHARED) {
- if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ?
+ if (file_inode(vma->vm_file)->i_nlink == 0 ?
FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
goto whole;
return 0;
@@ -1248,7 +1253,7 @@ static int writenote(struct memelfnote *men, struct file *file,
#undef DUMP_WRITE
static void fill_elf_header(struct elfhdr *elf, int segs,
- u16 machine, u32 flags, u8 osabi)
+ u16 machine, u32 flags)
{
memset(elf, 0, sizeof(*elf));
@@ -1320,8 +1325,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
- cputime_to_timeval(p->utime, &prstatus->pr_utime);
- cputime_to_timeval(p->stime, &prstatus->pr_stime);
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+ cputime_to_timeval(utime, &prstatus->pr_utime);
+ cputime_to_timeval(stime, &prstatus->pr_stime);
}
cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
@@ -1630,7 +1638,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Initialize the ELF file header.
*/
fill_elf_header(elf, phdrs,
- view->e_machine, view->e_flags, view->ei_osabi);
+ view->e_machine, view->e_flags);
/*
* Allocate a structure for each thread.
@@ -1870,7 +1878,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
elf_core_copy_regs(&info->prstatus->pr_reg, regs);
/* Set up header */
- fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI);
+ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
/*
* Set up the notes in similar form to SVR4 core dumps made
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index dc84732e554f..9c13e023e2b7 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -909,7 +909,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
dynamic_error:
printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n",
- what, file->f_path.dentry->d_inode->i_ino);
+ what, file_inode(file)->i_ino);
return -ELIBBAD;
}
@@ -1219,7 +1219,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
/* By default, dump shared memory if mapped from an anonymous file. */
if (vma->vm_flags & VM_SHARED) {
- if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0) {
+ if (file_inode(vma->vm_file)->i_nlink == 0) {
dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags);
kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
vma->vm_flags, dump_ok ? "yes" : "no");
@@ -1375,8 +1375,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
- cputime_to_timeval(p->utime, &prstatus->pr_utime);
- cputime_to_timeval(p->stime, &prstatus->pr_stime);
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+ cputime_to_timeval(utime, &prstatus->pr_utime);
+ cputime_to_timeval(stime, &prstatus->pr_stime);
}
cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b56371981d16..2036d21baaef 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -438,7 +438,7 @@ static int load_flat_file(struct linux_binprm * bprm,
int ret;
hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */
- inode = bprm->file->f_path.dentry->d_inode;
+ inode = file_inode(bprm->file);
text_len = ntohl(hdr->data_start);
data_len = ntohl(hdr->data_end) - ntohl(hdr->data_start);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 9be335fb8a7c..fecbbf3f8ff2 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -172,7 +172,10 @@ static int load_misc_binary(struct linux_binprm *bprm)
goto _error;
bprm->argc ++;
- bprm->interp = iname; /* for binfmt_script */
+ /* Update interp in case binfmt_script needs it. */
+ retval = bprm_change_interp(iname, bprm);
+ if (retval < 0)
+ goto _error;
interp_file = open_exec (iname);
retval = PTR_ERR (interp_file);
@@ -528,7 +531,7 @@ static void kill_node(Node *e)
static ssize_t
bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
{
- Node *e = file->f_path.dentry->d_inode->i_private;
+ Node *e = file_inode(file)->i_private;
ssize_t res;
char *page;
@@ -547,7 +550,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
struct dentry *root;
- Node *e = file->f_path.dentry->d_inode->i_private;
+ Node *e = file_inode(file)->i_private;
int res = parse_command(buffer, count);
switch (res) {
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 1610a91637e5..5027a3e14922 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -80,7 +80,9 @@ static int load_script(struct linux_binprm *bprm)
retval = copy_strings_kernel(1, &i_name, bprm);
if (retval) return retval;
bprm->argc++;
- bprm->interp = interp;
+ retval = bprm_change_interp(interp, bprm);
+ if (retval < 0)
+ return retval;
/*
* OK, now restart the process with the interpreter's dentry.
diff --git a/fs/bio.c b/fs/bio.c
index b96fc6ce4855..bb5768f59b32 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1428,6 +1428,8 @@ void bio_endio(struct bio *bio, int error)
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
error = -EIO;
+ trace_block_bio_complete(bio, error);
+
if (bio->bi_end_io)
bio->bi_end_io(bio, error);
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 172f8491a2bd..aea605c98ba6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -318,7 +318,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
/*
* private llseek:
- * for a block special file file->f_path.dentry->d_inode->i_size is zero
+ * for a block special file file_inode(file)->i_size is zero
* so we compute the size by hand (just as in block_read/write above)
*/
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
@@ -994,6 +994,7 @@ int revalidate_disk(struct gendisk *disk)
mutex_lock(&bdev->bd_mutex);
check_disk_size_change(disk, bdev);
+ bdev->bd_invalidated = 0;
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
return ret;
@@ -1032,7 +1033,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
{
unsigned bsize = bdev_logical_block_size(bdev);
- bdev->bd_inode->i_size = size;
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ i_size_write(bdev->bd_inode, size);
+ mutex_unlock(&bdev->bd_inode->i_mutex);
while (bsize < PAGE_CACHE_SIZE) {
if (size & bsize)
break;
@@ -1117,7 +1120,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret && !bdev->bd_openers) {
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b60..9a8622a5b867 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,11 +1,13 @@
config BTRFS_FS
- tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
- depends on EXPERIMENTAL
+ tristate "Btrfs filesystem Unstable disk format"
select LIBCRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
select LZO_DECOMPRESS
+ select RAID6_PQ
+ select XOR_BLOCKS
+
help
Btrfs is a new filesystem with extents, writable snapshotting,
support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04edf69be875..bd605c87adfd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
err = __resolve_indirect_ref(fs_info, search_commit_root,
time_seq, ref, parents,
extent_item_pos);
- if (err) {
- if (ret == 0)
- ret = err;
+ if (err)
continue;
- }
/* we put the first parent into the ref at hand */
ULIST_ITER_INIT(&uiter);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index d61feca79455..310a7f6d09b1 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -19,7 +19,7 @@
#ifndef __BTRFS_BACKREF__
#define __BTRFS_BACKREF__
-#include "ioctl.h"
+#include <linux/btrfs.h>
#include "ulist.h"
#include "extent_io.h"
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242bc4f5..d9b97d4960e6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,8 @@
#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
#define BTRFS_INODE_NEEDS_FULL_SYNC 7
#define BTRFS_INODE_COPY_EVERYTHING 8
+#define BTRFS_INODE_IN_DELALLOC_LIST 9
+#define BTRFS_INODE_READDIO_NEED_LOCK 10
/* in memory btrfs inode */
struct btrfs_inode {
@@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
return 0;
}
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+ set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+ smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb__before_clear_bit();
+ clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags);
+}
+
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 11d47bfb62b4..18af6f48781a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(
(bh->b_data + (dev_bytenr & 4095));
if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
- strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
- sizeof(super_tmp->magic)) ||
+ super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
btrfs_super_nodesize(super_tmp) != state->metablock_size ||
btrfs_super_leafsize(super_tmp) != state->metablock_size ||
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e3..15b94089abc4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
if (bio->bi_size)
- ret = io_tree->ops->merge_bio_hook(page, 0,
+ ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
PAGE_CACHE_SIZE,
bio, 0);
else
@@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page->index = em_start >> PAGE_CACHE_SHIFT;
if (comp_bio->bi_size)
- ret = tree->ops->merge_bio_hook(page, 0,
+ ret = tree->ops->merge_bio_hook(READ, page, 0,
PAGE_CACHE_SIZE,
comp_bio, 0);
else
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c7b67cf24bba..ecd25a1b4e51 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,13 +1138,14 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
switch (tm->op) {
case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
BUG_ON(tm->slot < n);
- case MOD_LOG_KEY_REMOVE:
- n++;
+ /* Fallthrough */
case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case MOD_LOG_KEY_REMOVE:
btrfs_set_node_key(eb, &tm->key, tm->slot);
btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
+ n++;
break;
case MOD_LOG_KEY_REPLACE:
BUG_ON(tm->slot >= n);
@@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
__tree_mod_log_rewind(eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
- BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
return eb_rewin;
}
@@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
*/
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, int cache_only, u64 *last_ret,
+ int start_slot, u64 *last_ret,
struct btrfs_key *progress)
{
struct extent_buffer *cur;
@@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
parent_level = btrfs_header_level(parent);
- if (cache_only && parent_level != 1)
- return 0;
WARN_ON(trans->transaction != root->fs_info->running_transaction);
WARN_ON(trans->transid != root->fs_info->generation);
@@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
else
uptodate = 0;
if (!cur || !uptodate) {
- if (cache_only) {
- free_extent_buffer(cur);
- continue;
- }
if (!cur) {
cur = read_tree_block(root, blocknr,
blocksize, gen);
@@ -4611,12 +4606,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 nritems;
int ret;
- if (level) {
- ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
- MOD_LOG_KEY_REMOVE);
- BUG_ON(ret < 0);
- }
-
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
if (level)
@@ -4627,6 +4616,10 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
+ } else if (level) {
+ ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+ MOD_LOG_KEY_REMOVE);
+ BUG_ON(ret < 0);
}
nritems--;
@@ -4827,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
/*
* A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id. This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
*
* This does not cow, but it does stuff the starting key it finds back
* into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4849,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
*/
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_key *max_key,
- struct btrfs_path *path, int cache_only,
+ struct btrfs_path *path,
u64 min_trans)
{
struct extent_buffer *cur;
@@ -4889,15 +4882,12 @@ again:
if (sret && slot > 0)
slot--;
/*
- * check this node pointer against the cache_only and
- * min_trans parameters. If it isn't in cache or is too
- * old, skip to the next one.
+ * check this node pointer against the min_trans parameter.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 blockptr;
u64 gen;
- struct extent_buffer *tmp;
- struct btrfs_disk_key disk_key;
blockptr = btrfs_node_blockptr(cur, slot);
gen = btrfs_node_ptr_generation(cur, slot);
@@ -4905,27 +4895,7 @@ again:
slot++;
continue;
}
- if (!cache_only)
- break;
-
- if (max_key) {
- btrfs_node_key(cur, &disk_key, slot);
- if (comp_keys(&disk_key, max_key) >= 0) {
- ret = 1;
- goto out;
- }
- }
-
- tmp = btrfs_find_tree_block(root, blockptr,
- btrfs_level_size(root, level - 1));
-
- if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
- free_extent_buffer(tmp);
- break;
- }
- if (tmp)
- free_extent_buffer(tmp);
- slot++;
+ break;
}
find_next_key:
/*
@@ -4936,7 +4906,7 @@ find_next_key:
path->slots[level] = slot;
btrfs_set_path_blocking(path);
sret = btrfs_find_next_key(root, path, min_key, level,
- cache_only, min_trans);
+ min_trans);
if (sret == 0) {
btrfs_release_path(path);
goto again;
@@ -5401,8 +5371,7 @@ out:
/*
* this is similar to btrfs_next_leaf, but does not try to preserve
* and fixup the path. It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameter.
*
* 0 is returned if another key is found, < 0 if there are any errors
* and 1 is returned if there are no higher keys in the tree
@@ -5411,8 +5380,7 @@ out:
* calling this function.
*/
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *key, int level,
- int cache_only, u64 min_trans)
+ struct btrfs_key *key, int level, u64 min_trans)
{
int slot;
struct extent_buffer *c;
@@ -5463,22 +5431,8 @@ next:
if (level == 0)
btrfs_item_key_to_cpu(c, key, slot);
else {
- u64 blockptr = btrfs_node_blockptr(c, slot);
u64 gen = btrfs_node_ptr_generation(c, slot);
- if (cache_only) {
- struct extent_buffer *cur;
- cur = btrfs_find_tree_block(root, blockptr,
- btrfs_level_size(root, level - 1));
- if (!cur ||
- btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
- slot++;
- if (cur)
- free_extent_buffer(cur);
- goto next;
- }
- free_extent_buffer(cur);
- }
if (gen < min_trans) {
slot++;
goto next;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b05727f..0d82922179db 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -31,10 +31,10 @@
#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
#include <linux/pagemap.h>
+#include <linux/btrfs.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
-#include "ioctl.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
struct btrfs_ordered_sum;
-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
#define BTRFS_MAX_MIRRORS 3
@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+
/*
* The key defines the order in the tree, and so it also defines (optimal)
* block layout.
@@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
/*
* File system states
*/
+#define BTRFS_FS_STATE_ERROR 0
+#define BTRFS_FS_STATE_REMOUNTING 1
+/* Super block flags */
/* Errors detected */
#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
@@ -502,6 +507,7 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -511,6 +517,7 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
/*
@@ -952,8 +959,20 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8)
#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES 5
+
+enum btrfs_raid_types {
+ BTRFS_RAID_RAID10,
+ BTRFS_RAID_RAID1,
+ BTRFS_RAID_DUP,
+ BTRFS_RAID_RAID0,
+ BTRFS_RAID_SINGLE,
+ BTRFS_RAID_RAID5,
+ BTRFS_RAID_RAID6,
+ BTRFS_NR_RAID_TYPES
+};
#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
BTRFS_BLOCK_GROUP_SYSTEM | \
@@ -961,6 +980,8 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID1 | \
+ BTRFS_BLOCK_GROUP_RAID5 | \
+ BTRFS_BLOCK_GROUP_RAID6 | \
BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID10)
/*
@@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache {
u64 flags;
u64 sectorsize;
u64 cache_generation;
+
+ /* for raid56, this is a full stripe, without parity */
+ unsigned long full_stripe_len;
+
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
@@ -1225,6 +1250,28 @@ struct seq_list {
u64 seq;
};
+enum btrfs_orphan_cleanup_state {
+ ORPHAN_CLEANUP_STARTED = 1,
+ ORPHAN_CLEANUP_DONE = 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+ struct list_head hash_list;
+ wait_queue_head_t wait;
+ spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+ struct list_head stripe_cache;
+ spinlock_t cache_lock;
+ int cache_size;
+ struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
/* fs_info */
struct reloc_control;
struct btrfs_device;
@@ -1250,6 +1297,7 @@ struct btrfs_fs_info {
/* block group cache stuff */
spinlock_t block_group_cache_lock;
+ u64 first_logical_byte;
struct rb_root block_group_cache_tree;
/* keep track of unallocated space */
@@ -1288,7 +1336,23 @@ struct btrfs_fs_info {
u64 last_trans_log_full_commit;
unsigned long mount_opt;
unsigned long compress_type:4;
+ /*
+ * This is only an advisory number. The read side is safe even if it
+ * gets a wrong number, because we will then write the data out into a
+ * regular extent. The write side (mount/remount) is under the
+ * ->s_umount lock, so it is also safe.
+ */
u64 max_inline;
+ /*
+ * Protected by ->chunk_mutex and sb->s_umount.
+ *
+ * Two locks are used to protect it because only the remount and mount
+ * operations can change it, and both of them run under sb->s_umount,
+ * but the read side (chunk allocation) cannot acquire sb->s_umount or
+ * a deadlock would happen. So we use two locks to protect it: the
+ * write side must acquire both locks, and the read side only needs to
+ * acquire one of them.
+ */
u64 alloc_start;
struct btrfs_transaction *running_transaction;
wait_queue_head_t transaction_throttle;
@@ -1307,6 +1371,13 @@ struct btrfs_fs_info {
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
struct mutex volume_mutex;
+
+ /* this is used during read/modify/write to make sure
+ * no two ios are trying to mod the same stripe at the same
+ * time
+ */
+ struct btrfs_stripe_hash_table *stripe_hash_table;
+
/*
* this protects the ordered operations list only while we are
* processing all of the entries on it. This way we make
@@ -1365,6 +1436,7 @@ struct btrfs_fs_info {
*/
struct list_head ordered_extents;
+ spinlock_t delalloc_lock;
/*
* all of the inodes that have delalloc bytes. It is possible for
* this list to be empty even when there is still dirty data=ordered
@@ -1373,13 +1445,6 @@ struct btrfs_fs_info {
struct list_head delalloc_inodes;
/*
- * special rename and truncate targets that must be on disk before
- * we're allowed to commit. This is basically the ext3 style
- * data=ordered list.
- */
- struct list_head ordered_operations;
-
- /*
* there is a pool of worker threads for checksumming during writes
* and a pool for checksumming after reads. This is because readers
* can run with FS locks held, and the writers may be waiting for
@@ -1395,6 +1460,8 @@ struct btrfs_fs_info {
struct btrfs_workers flush_workers;
struct btrfs_workers endio_workers;
struct btrfs_workers endio_meta_workers;
+ struct btrfs_workers endio_raid56_workers;
+ struct btrfs_workers rmw_workers;
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers endio_freespace_worker;
@@ -1423,10 +1490,12 @@ struct btrfs_fs_info {
u64 total_pinned;
- /* protected by the delalloc lock, used to keep from writing
- * metadata until there is a nice batch
- */
- u64 dirty_metadata_bytes;
+ /* used to keep from writing metadata until there is a nice batch */
+ struct percpu_counter dirty_metadata_bytes;
+ struct percpu_counter delalloc_bytes;
+ s32 dirty_metadata_batch;
+ s32 delalloc_batch;
+
struct list_head dirty_cowonly_roots;
struct btrfs_fs_devices *fs_devices;
@@ -1442,9 +1511,6 @@ struct btrfs_fs_info {
struct reloc_control *reloc_ctl;
- spinlock_t delalloc_lock;
- u64 delalloc_bytes;
-
/* data_alloc_cluster is only used in ssd mode */
struct btrfs_free_cluster data_alloc_cluster;
@@ -1456,6 +1522,8 @@ struct btrfs_fs_info {
struct rb_root defrag_inodes;
atomic_t defrag_running;
+ /* Used to protect avail_{data, metadata, system}_alloc_bits */
+ seqlock_t profiles_lock;
/*
* these three are in extended format (availability of single
* chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1520,7 +1588,7 @@ struct btrfs_fs_info {
u64 qgroup_seq;
/* filesystem state */
- u64 fs_state;
+ unsigned long fs_state;
struct btrfs_delayed_root *delayed_root;
@@ -1623,6 +1691,9 @@ struct btrfs_root {
struct list_head root_list;
+ spinlock_t log_extents_lock[2];
+ struct list_head logged_list[2];
+
spinlock_t orphan_lock;
atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
@@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args {
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
BTRFS_MOUNT_##opt)
/*
@@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
u64 bytenr, u64 num_bytes);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode);
void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int nitems,
+ u64 *qgroup_reserved);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 qgroup_reserved);
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
@@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
- int cache_only, u64 min_trans);
+ u64 min_trans);
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_key *max_key,
- struct btrfs_path *path, int cache_only,
+ struct btrfs_path *path,
u64 min_trans);
enum btrfs_compare_tree_result {
BTRFS_COMPARE_TREE_NEW,
@@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
int find_higher, int return_any);
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, int cache_only, u64 *last_ret,
+ int start_slot, u64 *last_ret,
struct btrfs_key *progress);
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
@@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
- size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+ size_t size, struct bio *bio,
+ unsigned long bio_flags);
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
@@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int cache_only);
+ struct btrfs_root *root);
/* sysfs.c */
int btrfs_init_sysfs(void);
@@ -3620,11 +3696,14 @@ __printf(5, 6)
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
#define btrfs_panic(fs_info, errno, fmt, args...) \
do { \
- struct btrfs_fs_info *_i = (fs_info); \
- __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \
- BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \
+ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ BUG(); \
} while (0)
/* acl.c */
@@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid)
return 1;
return 0;
}
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+ return signal_pending(current);
+}
+
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 34836036f01b..0b278b117cbe 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_delayed_item *delayed_item)
{
struct extent_buffer *leaf;
- struct btrfs_item *item;
char *ptr;
int ret;
@@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
- item = btrfs_item_nr(leaf, path->slots[0]);
ptr = btrfs_item_ptr(leaf, path->slots[0], char);
write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
}
}
-static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_delayed_node *node)
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
{
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
int ret;
- mutex_lock(&node->mutex);
- if (!node->inode_dirty) {
- mutex_unlock(&node->mutex);
- return 0;
- }
-
key.objectid = node->inode_id;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
+
ret = btrfs_lookup_inode(trans, root, path, &key, 1);
if (ret > 0) {
btrfs_release_path(path);
- mutex_unlock(&node->mutex);
return -ENOENT;
} else if (ret < 0) {
- mutex_unlock(&node->mutex);
return ret;
}
@@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_delayed_inode_release_metadata(root, node);
btrfs_release_delayed_inode(node);
- mutex_unlock(&node->mutex);
return 0;
}
+/*
+ * Locked wrapper around __btrfs_update_delayed_inode(): takes node->mutex,
+ * does nothing (and returns 0) when the inode is not dirty, otherwise
+ * returns whatever the unlocked helper returns.
+ */
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ int err = 0;
+
+ mutex_lock(&node->mutex);
+ if (node->inode_dirty)
+ err = __btrfs_update_delayed_inode(trans, root, path, node);
+ mutex_unlock(&node->mutex);
+
+ return err;
+}
+
+/*
+ * Run the three delayed-item steps for one node in order: insertions,
+ * deletions, then the inode update. The first step that fails stops the
+ * sequence and its error code is returned; 0 means all three succeeded.
+ */
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ int err;
+
+ err = btrfs_insert_delayed_items(trans, path, node->root, node);
+ if (!err)
+ err = btrfs_delete_delayed_items(trans, path, node->root, node);
+ if (!err)
+ err = btrfs_update_delayed_inode(trans, node->root, path, node);
+
+ return err;
+}
+
/*
* Called when committing the transaction.
* Returns 0 on success.
@@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int nr)
{
- struct btrfs_root *curr_root = root;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
struct btrfs_path *path;
@@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
curr_node = btrfs_first_delayed_node(delayed_root);
while (curr_node && (!count || (count && nr--))) {
- curr_root = curr_node->root;
- ret = btrfs_insert_delayed_items(trans, path, curr_root,
- curr_node);
- if (!ret)
- ret = btrfs_delete_delayed_items(trans, path,
- curr_root, curr_node);
- if (!ret)
- ret = btrfs_update_delayed_inode(trans, curr_root,
- path, curr_node);
+ ret = __btrfs_commit_inode_delayed_items(trans, path,
+ curr_node);
if (ret) {
btrfs_release_delayed_node(curr_node);
curr_node = NULL;
@@ -1183,51 +1202,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
return __btrfs_run_delayed_items(trans, root, nr);
}
-static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_node *node)
+/*
+ * Commit the delayed items of @inode inside the transaction @trans.
+ *
+ * Returns 0 when the inode has no delayed node or the node has no pending
+ * items, -ENOMEM when no path can be allocated, and otherwise the result of
+ * __btrfs_commit_inode_delayed_items(). trans->block_rsv is switched to the
+ * delayed_block_rsv for the duration of the commit and restored afterwards.
+ */
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct inode *inode)
{
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret;
+ if (!delayed_node)
+ return 0;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!delayed_node->count) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+ }
+ mutex_unlock(&delayed_node->mutex);
+
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ /* release the node here too, like every other exit below the lookup */
+ btrfs_release_delayed_node(delayed_node);
return -ENOMEM;
+ }
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
- trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
+ trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
- ret = btrfs_insert_delayed_items(trans, path, node->root, node);
- if (!ret)
- ret = btrfs_delete_delayed_items(trans, path, node->root, node);
- if (!ret)
- ret = btrfs_update_delayed_inode(trans, node->root, path, node);
- btrfs_free_path(path);
+ ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+ btrfs_release_delayed_node(delayed_node);
+ btrfs_free_path(path);
trans->block_rsv = block_rsv;
+
return ret;
}
-int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
- struct inode *inode)
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
int ret;
if (!delayed_node)
return 0;
mutex_lock(&delayed_node->mutex);
- if (!delayed_node->count) {
+ if (!delayed_node->inode_dirty) {
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
return 0;
}
mutex_unlock(&delayed_node->mutex);
- ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+ trans = btrfs_join_transaction(delayed_node->root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto trans_out;
+ }
+ path->leave_spinning = 1;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+ mutex_lock(&delayed_node->mutex);
+ if (delayed_node->inode_dirty)
+ ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+ path, delayed_node);
+ else
+ ret = 0;
+ mutex_unlock(&delayed_node->mutex);
+
+ btrfs_free_path(path);
+ trans->block_rsv = block_rsv;
+trans_out:
+ btrfs_end_transaction(trans, delayed_node->root);
+ btrfs_btree_balance_dirty(delayed_node->root);
+out:
btrfs_release_delayed_node(delayed_node);
+
return ret;
}
@@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
struct btrfs_root *root;
struct btrfs_block_rsv *block_rsv;
int need_requeue = 0;
- int ret;
async_node = container_of(work, struct btrfs_async_delayed_node, work);
@@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
block_rsv = trans->block_rsv;
trans->block_rsv = &root->fs_info->delayed_block_rsv;
- ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
- if (!ret)
- ret = btrfs_delete_delayed_items(trans, path, root,
- delayed_node);
-
- if (!ret)
- btrfs_update_delayed_inode(trans, root, path, delayed_node);
-
+ __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
/*
* Maybe new delayed items have been inserted, so we need requeue
* the work. Besides that, we must dequeue the empty delayed nodes
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 4f808e1baeed..78b6ad0fc669 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
/* Used for evicting the inode. */
void btrfs_remove_delayed_node(struct inode *inode);
void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ae9411773397..b7a0641ead77 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -23,6 +23,10 @@
#include "delayed-ref.h"
#include "transaction.h"
+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
* delayed back reference update tracking. For subvolume trees
* we queue up extent allocations and backref maintenance for
@@ -422,6 +426,14 @@ again:
return 1;
}
+/*
+ * Empty @cluster: unlink every entry from the list and re-initialise the
+ * entry's own list head, leaving @cluster as an empty list.
+ */
+void btrfs_release_ref_cluster(struct list_head *cluster)
+{
+ while (!list_empty(cluster))
+ list_del_init(cluster->next);
+}
+
/*
* helper function to update an extent delayed ref in the
* rbtree. existing and update must both have the same
@@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
ref->extent_op->flags_to_set;
existing_ref->extent_op->update_flags = 1;
}
- kfree(ref->extent_op);
+ btrfs_free_delayed_extent_op(ref->extent_op);
}
}
/*
@@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(head_ref);
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
} else {
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
@@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(full_ref);
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
} else {
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
@@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(full_ref);
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
} else {
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
@@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs;
BUG_ON(extent_op && extent_op->is_data);
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
- kfree(ref);
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
return -ENOMEM;
}
@@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs;
BUG_ON(extent_op && !extent_op->is_data);
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
- kfree(ref);
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
return -ENOMEM;
}
@@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
return -ENOMEM;
@@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
return btrfs_delayed_node_to_head(ref);
return NULL;
}
+
+/*
+ * Destroy each delayed-ref slab cache that has been created. Cache pointers
+ * that are still NULL are skipped, which is what lets
+ * btrfs_delayed_ref_init() call this to unwind a partial setup.
+ */
+void btrfs_delayed_ref_exit(void)
+{
+ struct kmem_cache *caches[] = {
+ btrfs_delayed_ref_head_cachep,
+ btrfs_delayed_tree_ref_cachep,
+ btrfs_delayed_data_ref_cachep,
+ btrfs_delayed_extent_op_cachep,
+ };
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(caches); i++) {
+ if (caches[i])
+ kmem_cache_destroy(caches[i]);
+ }
+}
+
+/* Create one delayed-ref slab cache with the flags shared by all of them. */
+static struct kmem_cache *delayed_ref_cache_create(const char *name,
+ size_t size)
+{
+ return kmem_cache_create(name, size, 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+}
+
+/*
+ * Create the slab caches for delayed ref heads, tree refs, data refs and
+ * extent ops, in that order. Stops at the first cache that cannot be
+ * created, destroys the ones already set up through
+ * btrfs_delayed_ref_exit() and returns -ENOMEM. Returns 0 on success.
+ */
+int btrfs_delayed_ref_init(void)
+{
+ btrfs_delayed_ref_head_cachep = delayed_ref_cache_create(
+ "btrfs_delayed_ref_head",
+ sizeof(struct btrfs_delayed_ref_head));
+ if (!btrfs_delayed_ref_head_cachep)
+ goto out_destroy;
+
+ btrfs_delayed_tree_ref_cachep = delayed_ref_cache_create(
+ "btrfs_delayed_tree_ref",
+ sizeof(struct btrfs_delayed_tree_ref));
+ if (!btrfs_delayed_tree_ref_cachep)
+ goto out_destroy;
+
+ btrfs_delayed_data_ref_cachep = delayed_ref_cache_create(
+ "btrfs_delayed_data_ref",
+ sizeof(struct btrfs_delayed_data_ref));
+ if (!btrfs_delayed_data_ref_cachep)
+ goto out_destroy;
+
+ btrfs_delayed_extent_op_cachep = delayed_ref_cache_create(
+ "btrfs_delayed_extent_op",
+ sizeof(struct btrfs_delayed_extent_op));
+ if (!btrfs_delayed_extent_op_cachep)
+ goto out_destroy;
+
+ return 0;
+
+out_destroy:
+ btrfs_delayed_ref_exit();
+ return -ENOMEM;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c9d703693df0..f75fcaf79aeb 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root {
unsigned long num_heads_ready;
/*
+ * bumped when someone is making progress on the delayed
+ * refs, so that other procs know they are just adding to
+ * contention instead of helping
+ */
+ atomic_t procs_running_refs;
+ atomic_t ref_seq;
+ wait_queue_head_t wait;
+
+ /*
* set when the tree is flushing before a transaction commit,
* used by the throttling code to decide if new updates need
* to be run right away
@@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root {
u64 run_delayed_start;
};
+extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
+
+int btrfs_delayed_ref_init(void);
+void btrfs_delayed_ref_exit(void);
+
+static inline struct btrfs_delayed_extent_op *
+btrfs_alloc_delayed_extent_op(void)
+{
+ return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
+}
+
+static inline void
+btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
+{
+ if (op)
+ kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
+}
+
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
WARN_ON(atomic_read(&ref->refs) == 0);
if (atomic_dec_and_test(&ref->refs)) {
WARN_ON(ref->in_tree);
- kfree(ref);
+ switch (ref->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ case BTRFS_SHARED_DATA_REF_KEY:
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ break;
+ case 0:
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
+ break;
+ default:
+ BUG();
+ }
}
}
@@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head);
+static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
+{
+ mutex_unlock(&head->mutex);
+}
+
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 search_start);
+void btrfs_release_ref_cluster(struct list_head *cluster);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 66dbc8dbddf7..7ba7b3900cb8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- btrfs_start_delalloc_inodes(root, 0);
+ ret = btrfs_start_delalloc_inodes(root, 0);
+ if (ret) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return ret;
+ }
btrfs_wait_ordered_extents(root, 0);
trans = btrfs_start_transaction(root, 0);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a8f652dc940b..02369a3c162e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
+#include "raid56.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -56,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+ struct btrfs_root *root);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
@@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
{
struct extent_io_tree *tree;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 found_start;
struct extent_buffer *eb;
@@ -639,8 +641,15 @@ err:
btree_readahead_hook(root, eb, eb->start, ret);
}
- if (ret)
+ if (ret) {
+ /*
+ * our io error hook is going to dec the io pages
+ * again, we have to make sure it has something
+ * to decrement
+ */
+ atomic_inc(&eb->io_pages);
clear_extent_buffer_uptodate(eb);
+ }
free_extent_buffer(eb);
out:
return ret;
@@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
eb = (struct extent_buffer *)page->private;
set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
eb->read_mirror = failed_mirror;
+ atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
btree_readahead_hook(root, eb, eb->start, -EIO);
return -EIO; /* we fixed nothing */
@@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
end_io_wq->work.flags = 0;
if (bio->bi_rw & REQ_WRITE) {
- if (end_io_wq->metadata == 1)
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
btrfs_queue_worker(&fs_info->endio_meta_write_workers,
&end_io_wq->work);
- else if (end_io_wq->metadata == 2)
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
btrfs_queue_worker(&fs_info->endio_freespace_worker,
&end_io_wq->work);
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+ btrfs_queue_worker(&fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else
btrfs_queue_worker(&fs_info->endio_write_workers,
&end_io_wq->work);
} else {
- if (end_io_wq->metadata)
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+ btrfs_queue_worker(&fs_info->endio_raid56_workers,
+ &end_io_wq->work);
+ else if (end_io_wq->metadata)
btrfs_queue_worker(&fs_info->endio_meta_workers,
&end_io_wq->work);
else
@@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
* 0 - if data
* 1 - if normal metadta
* 2 - if writing to the free space cache area
+ * 3 - raid parity work
*/
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata)
@@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct extent_io_tree *tree;
+ struct btrfs_fs_info *fs_info;
+ int ret;
+
tree = &BTRFS_I(mapping->host)->io_tree;
if (wbc->sync_mode == WB_SYNC_NONE) {
- struct btrfs_root *root = BTRFS_I(mapping->host)->root;
- u64 num_dirty;
- unsigned long thresh = 32 * 1024 * 1024;
if (wbc->for_kupdate)
return 0;
+ fs_info = BTRFS_I(mapping->host)->root->fs_info;
/* this is a bit racy, but that's ok */
- num_dirty = root->fs_info->dirty_metadata_bytes;
- if (num_dirty < thresh)
+ ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH);
+ if (ret < 0)
return 0;
}
return btree_write_cache_pages(mapping, wbc);
@@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
if (btrfs_header_generation(buf) ==
- root->fs_info->running_transaction->transid) {
+ fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (root->fs_info->dirty_metadata_bytes >= buf->len)
- root->fs_info->dirty_metadata_bytes -= buf->len;
- else {
- spin_unlock(&root->fs_info->delalloc_lock);
- btrfs_panic(root->fs_info, -EOVERFLOW,
- "Can't clear %lu bytes from "
- " dirty_mdatadata_bytes (%llu)",
- buf->len,
- root->fs_info->dirty_metadata_bytes);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
-
+ __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+ -buf->len,
+ fs_info->dirty_metadata_batch);
/* ugh, clear_extent_buffer_dirty needs to lock the page */
btrfs_set_lock_blocking(buf);
clear_extent_buffer_dirty(buf);
@@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD(&root->logged_list[0]);
+ INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->accounting_lock);
+ spin_lock_init(&root->log_extents_lock[0]);
+ spin_lock_init(&root->log_extents_lock[1]);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
init_waitqueue_head(&root->log_writer_wait);
@@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb,
goto fail_srcu;
}
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+ if (ret) {
+ err = ret;
+ goto fail_bdi;
+ }
+ fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+ (1 + ilog2(nr_cpu_ids));
+
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+ if (ret) {
+ err = ret;
+ goto fail_dirty_metadata_bytes;
+ }
+
fs_info->btree_inode = new_inode(sb);
if (!fs_info->btree_inode) {
err = -ENOMEM;
- goto fail_bdi;
+ goto fail_delalloc_bytes;
}
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
- INIT_LIST_HEAD(&fs_info->ordered_operations);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->trans_lock);
@@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->tree_mod_seq_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
+ seqlock_init(&fs_info->profiles_lock);
init_completion(&fs_info->kobj_unregister);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
+ fs_info->first_logical_byte = (u64)-1;
extent_io_tree_init(&fs_info->freed_extents[0],
fs_info->btree_inode->i_mapping);
@@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
+ ret = btrfs_alloc_stripe_hash_table(fs_info);
+ if (ret) {
+ err = ret;
+ goto fail_alloc;
+ }
+
__setup_root(4096, 4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
@@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
/* check FS state, whether FS is broken. */
- fs_info->fs_state |= btrfs_super_flags(disk_super);
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
if (ret) {
@@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb,
leafsize = btrfs_super_leafsize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
stripesize = btrfs_super_stripesize(disk_super);
+ fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
/*
* mixed block groups end up with duplicate but slightly offset
@@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->endio_meta_write_workers,
"endio-meta-write", fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_raid56_workers,
+ "endio-raid56", fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->rmw_workers,
+ "rmw", fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
fs_info->thread_pool_size,
&fs_info->generic_worker);
@@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb,
*/
fs_info->endio_workers.idle_thresh = 4;
fs_info->endio_meta_workers.idle_thresh = 4;
+ fs_info->endio_raid56_workers.idle_thresh = 4;
+ fs_info->rmw_workers.idle_thresh = 2;
fs_info->endio_write_workers.idle_thresh = 2;
fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb,
ret |= btrfs_start_workers(&fs_info->fixup_workers);
ret |= btrfs_start_workers(&fs_info->endio_workers);
ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+ ret |= btrfs_start_workers(&fs_info->rmw_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
ret |= btrfs_start_workers(&fs_info->endio_write_workers);
ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
- if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
- sizeof(disk_super->magic))) {
+ if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) {
printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
goto fail_sb_buffer;
}
@@ -2694,13 +2742,13 @@ fail_cleaner:
* kthreads
*/
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
- invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_block_groups:
btrfs_free_block_groups(fs_info);
fail_tree_roots:
free_root_pointers(fs_info, 1);
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_sb_buffer:
btrfs_stop_workers(&fs_info->generic_worker);
@@ -2710,6 +2758,8 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_meta_workers);
+ btrfs_stop_workers(&fs_info->endio_raid56_workers);
+ btrfs_stop_workers(&fs_info->rmw_workers);
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2721,13 +2771,17 @@ fail_alloc:
fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
- invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
+fail_delalloc_bytes:
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
+fail_dirty_metadata_bytes:
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
fail_bdi:
bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
+ btrfs_free_stripe_hash_table(fs_info);
btrfs_close_devices(fs_info->fs_devices);
return err;
@@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
super = (struct btrfs_super_block *)bh->b_data;
if (btrfs_super_bytenr(super) != bytenr ||
- strncmp((char *)(&super->magic), BTRFS_MAGIC,
- sizeof(super->magic))) {
+ super->magic != cpu_to_le64(BTRFS_MAGIC)) {
brelse(bh);
continue;
}
@@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
== 0)))
num_tolerated_disk_barrier_failures = 0;
- else if (num_tolerated_disk_barrier_failures > 1
- &&
- (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10)))
- num_tolerated_disk_barrier_failures = 1;
+ else if (num_tolerated_disk_barrier_failures > 1) {
+ if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ num_tolerated_disk_barrier_failures = 1;
+ } else if (flags &
+ BTRFS_BLOCK_GROUP_RAID5) {
+ num_tolerated_disk_barrier_failures = 2;
+ }
+ }
}
}
up_read(&sinfo->groups_sem);
@@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
if (btrfs_root_refs(&root->root_item) == 0)
synchronize_srcu(&fs_info->subvol_srcu);
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ btrfs_free_log(NULL, root);
+ btrfs_free_log_root_tree(NULL, fs_info);
+ }
+
__btrfs_remove_free_space_cache(root->free_ino_pinned);
__btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
@@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
btrfs_error_commit_super(root);
btrfs_put_block_group_cache(fs_info);
@@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root)
btrfs_free_qgroup_config(root->fs_info);
- if (fs_info->delalloc_bytes) {
- printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
- (unsigned long long)fs_info->delalloc_bytes);
+ if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+ printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
+ percpu_counter_sum(&fs_info->delalloc_bytes));
}
free_extent_buffer(fs_info->extent_root->node);
@@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_meta_workers);
+ btrfs_stop_workers(&fs_info->endio_raid56_workers);
+ btrfs_stop_workers(&fs_info->rmw_workers);
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root)
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
+ btrfs_free_stripe_hash_table(fs_info);
+
return 0;
}
@@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
(unsigned long long)transid,
(unsigned long long)root->fs_info->generation);
was_dirty = set_extent_buffer_dirty(buf);
- if (!was_dirty) {
- spin_lock(&root->fs_info->delalloc_lock);
- root->fs_info->dirty_metadata_bytes += buf->len;
- spin_unlock(&root->fs_info->delalloc_lock);
- }
+ if (!was_dirty)
+ __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+ buf->len,
+ root->fs_info->dirty_metadata_batch);
}
static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
* looks as though older kernels can get into trouble with
* this code, they end up stuck in balance_dirty_pages forever
*/
- u64 num_dirty;
- unsigned long thresh = 32 * 1024 * 1024;
+ int ret;
if (current->flags & PF_MEMALLOC)
return;
@@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
if (flush_delayed)
btrfs_balance_delayed_items(root);
- num_dirty = root->fs_info->dirty_metadata_bytes;
-
- if (num_dirty > thresh) {
+ ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH);
+ if (ret > 0) {
balance_dirty_pages_ratelimited(
root->fs_info->btree_inode->i_mapping);
}
@@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
btrfs_cleanup_transaction(root);
}
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+ struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
@@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
mutex_lock(&root->fs_info->ordered_operations_mutex);
spin_lock(&root->fs_info->ordered_extent_lock);
- list_splice_init(&root->fs_info->ordered_operations, &splice);
+ list_splice_init(&t->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
@@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
- struct list_head splice;
struct btrfs_ordered_extent *ordered;
- struct inode *inode;
-
- INIT_LIST_HEAD(&splice);
spin_lock(&root->fs_info->ordered_extent_lock);
-
- list_splice_init(&root->fs_info->ordered_extents, &splice);
- while (!list_empty(&splice)) {
- ordered = list_entry(splice.next, struct btrfs_ordered_extent,
- root_extent_list);
-
- list_del_init(&ordered->root_extent_list);
- atomic_inc(&ordered->refs);
-
- /* the inode may be getting freed (in sys_unlink path). */
- inode = igrab(ordered->inode);
-
- spin_unlock(&root->fs_info->ordered_extent_lock);
- if (inode)
- iput(inode);
-
- atomic_set(&ordered->refs, 1);
- btrfs_put_ordered_extent(ordered);
-
- spin_lock(&root->fs_info->ordered_extent_lock);
- }
-
+ /*
+ * This will just short circuit the ordered completion stuff which will
+ * make sure the ordered extent gets properly cleaned up.
+ */
+ list_for_each_entry(ordered, &root->fs_info->ordered_extents,
+ root_extent_list)
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
spin_unlock(&root->fs_info->ordered_extent_lock);
}
@@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
}
while ((node = rb_first(&delayed_refs->root)) != NULL) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ struct btrfs_delayed_ref_head *head = NULL;
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
atomic_set(&ref->refs, 1);
if (btrfs_delayed_ref_is_head(ref)) {
- struct btrfs_delayed_ref_head *head;
head = btrfs_delayed_node_to_head(ref);
if (!mutex_trylock(&head->mutex)) {
@@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
- kfree(head->extent_op);
+ btrfs_free_delayed_extent_op(head->extent_op);
delayed_refs->num_heads--;
if (list_empty(&head->cluster))
delayed_refs->num_heads_ready--;
list_del_init(&head->cluster);
}
+
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
-
+ if (head)
+ mutex_unlock(&head->mutex);
spin_unlock(&delayed_refs->lock);
btrfs_put_delayed_ref(ref);
@@ -3671,6 +3722,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
delalloc_inodes);
list_del_init(&btrfs_inode->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &btrfs_inode->runtime_flags);
btrfs_invalidate_inodes(btrfs_inode->root);
}
@@ -3823,10 +3876,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
while (!list_empty(&list)) {
t = list_entry(list.next, struct btrfs_transaction, list);
- if (!t)
- break;
- btrfs_destroy_ordered_operations(root);
+ btrfs_destroy_ordered_operations(t, root);
btrfs_destroy_ordered_extents(root);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e3..034d7dc552b2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
+enum {
+ BTRFS_WQ_ENDIO_DATA = 0,
+ BTRFS_WQ_ENDIO_METADATA = 1,
+ BTRFS_WQ_ENDIO_FREE_SPACE = 2,
+ BTRFS_WQ_ENDIO_RAID56 = 3,
+};
+
static inline u64 btrfs_sb_offset(int mirror)
{
u64 start = 16 * 1024;
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 614f34a899c2..81ee29eeb7ca 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -22,10 +22,10 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
*max_len = BTRFS_FID_SIZE_CONNECTABLE;
- return 255;
+ return FILEID_INVALID;
} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
- return 255;
+ return FILEID_INVALID;
}
len = BTRFS_FID_SIZE_NON_CONNECTABLE;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 521e9d4424f6..3e074dab2d57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
+#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"
@@ -72,8 +73,7 @@ enum {
RESERVE_ALLOC_NO_ACCOUNT = 2,
};
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
rb_link_node(&block_group->cache_node, parent, p);
rb_insert_color(&block_group->cache_node,
&info->block_group_cache_tree);
+
+ if (info->first_logical_byte > block_group->key.objectid)
+ info->first_logical_byte = block_group->key.objectid;
+
spin_unlock(&info->block_group_cache_lock);
return 0;
@@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
break;
}
}
- if (ret)
+ if (ret) {
btrfs_get_block_group(ret);
+ if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+ info->first_logical_byte = ret->key.objectid;
+ }
spin_unlock(&info->block_group_cache_lock);
return ret;
@@ -468,8 +477,6 @@ out:
}
static int cache_block_group(struct btrfs_block_group_cache *cache,
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
int load_cache_only)
{
DEFINE_WAIT(wait);
@@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->cached = BTRFS_CACHE_FAST;
spin_unlock(&cache->lock);
- /*
- * We can't do the read from on-disk cache during a commit since we need
- * to have the normal tree locking. Also if we are currently trying to
- * allocate blocks for the tree root we can't do the fast caching since
- * we likely hold important locks.
- */
if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
ret = load_free_space_cache(fs_info, cache);
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
*actual_bytes = discarded_bytes;
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
return ret;
}
@@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
node->num_bytes);
}
}
- mutex_unlock(&head->mutex);
return ret;
}
@@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
* process of being added. Don't run this ref yet.
*/
list_del_init(&locked_ref->cluster);
- mutex_unlock(&locked_ref->mutex);
+ btrfs_delayed_ref_unlock(locked_ref);
locked_ref = NULL;
delayed_refs->num_heads_ready++;
spin_unlock(&delayed_refs->lock);
@@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ref = &locked_ref->node;
if (extent_op && must_insert_reserved) {
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
extent_op = NULL;
}
@@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ret = run_delayed_extent_op(trans, root,
ref, extent_op);
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- list_del_init(&locked_ref->cluster);
- mutex_unlock(&locked_ref->mutex);
-
- printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+ printk(KERN_DEBUG
+ "btrfs: run_delayed_extent_op "
+ "returned %d\n", ret);
spin_lock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(locked_ref);
return ret;
}
goto next;
}
-
- list_del_init(&locked_ref->cluster);
- locked_ref = NULL;
}
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
- if (locked_ref) {
+ if (!btrfs_delayed_ref_is_head(ref)) {
/*
* when we play the delayed ref, also correct the
* ref_mod on head
@@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ret = run_one_delayed_ref(trans, root, ref, extent_op,
must_insert_reserved);
- btrfs_put_delayed_ref(ref);
- kfree(extent_op);
- count++;
-
+ btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- if (locked_ref) {
- list_del_init(&locked_ref->cluster);
- mutex_unlock(&locked_ref->mutex);
- }
- printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+ btrfs_delayed_ref_unlock(locked_ref);
+ btrfs_put_delayed_ref(ref);
+ printk(KERN_DEBUG
+ "btrfs: run_one_delayed_ref returned %d\n", ret);
spin_lock(&delayed_refs->lock);
return ret;
}
+ /*
+ * If this node is a head, that means all the refs in this head
+ * have been dealt with, and we will pick the next head to deal
+ * with, so we must unlock the head and drop it from the cluster
+ * list before we release it.
+ */
+ if (btrfs_delayed_ref_is_head(ref)) {
+ list_del_init(&locked_ref->cluster);
+ btrfs_delayed_ref_unlock(locked_ref);
+ locked_ref = NULL;
+ }
+ btrfs_put_delayed_ref(ref);
+ count++;
next:
cond_resched();
spin_lock(&delayed_refs->lock);
@@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
return ret;
}
+static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
+ int count)
+{
+ int val = atomic_read(&delayed_refs->ref_seq);
+
+ if (val < seq || val >= seq + count)
+ return 1;
+ return 0;
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
INIT_LIST_HEAD(&cluster);
+ if (count == 0) {
+ count = delayed_refs->num_entries * 2;
+ run_most = 1;
+ }
+
+ if (!run_all && !run_most) {
+ int old;
+ int seq = atomic_read(&delayed_refs->ref_seq);
+
+progress:
+ old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+ if (old) {
+ DEFINE_WAIT(__wait);
+ if (delayed_refs->num_entries < 16348)
+ return 0;
+
+ prepare_to_wait(&delayed_refs->wait, &__wait,
+ TASK_UNINTERRUPTIBLE);
+
+ old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+ if (old) {
+ schedule();
+ finish_wait(&delayed_refs->wait, &__wait);
+
+ if (!refs_newer(delayed_refs, seq, 256))
+ goto progress;
+ else
+ return 0;
+ } else {
+ finish_wait(&delayed_refs->wait, &__wait);
+ goto again;
+ }
+ }
+
+ } else {
+ atomic_inc(&delayed_refs->procs_running_refs);
+ }
+
again:
loops = 0;
spin_lock(&delayed_refs->lock);
@@ -2477,10 +2533,6 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- if (count == 0) {
- count = delayed_refs->num_entries * 2;
- run_most = 1;
- }
while (1) {
if (!(run_all || run_most) &&
delayed_refs->num_heads_ready < 64)
@@ -2500,11 +2552,15 @@ again:
ret = run_clustered_refs(trans, root, &cluster);
if (ret < 0) {
+ btrfs_release_ref_cluster(&cluster);
spin_unlock(&delayed_refs->lock);
btrfs_abort_transaction(trans, root, ret);
+ atomic_dec(&delayed_refs->procs_running_refs);
return ret;
}
+ atomic_add(ret, &delayed_refs->ref_seq);
+
count -= min_t(unsigned long, ret, count);
if (count == 0)
@@ -2573,6 +2629,11 @@ again:
goto again;
}
out:
+ atomic_dec(&delayed_refs->procs_running_refs);
+ smp_mb();
+ if (waitqueue_active(&delayed_refs->wait))
+ wake_up(&delayed_refs->wait);
+
spin_unlock(&delayed_refs->lock);
assert_qgroups_uptodate(trans);
return 0;
@@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op;
int ret;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ extent_op = btrfs_alloc_delayed_extent_op();
if (!extent_op)
return -ENOMEM;
@@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
num_bytes, extent_op);
if (ret)
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
return ret;
}
@@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
u64 extra_flags = chunk_to_extended(flags) &
BTRFS_EXTENDED_PROFILE_MASK;
+ write_seqlock(&fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits |= extra_flags;
if (flags & BTRFS_BLOCK_GROUP_METADATA)
fs_info->avail_metadata_alloc_bits |= extra_flags;
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
fs_info->avail_system_alloc_bits |= extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
}
/*
@@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
u64 num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
u64 target;
+ u64 tmp;
/*
* see if restripe for this chunk_type is in progress, if so
@@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
}
spin_unlock(&root->fs_info->balance_lock);
+ /* First, mask out the RAID levels which aren't possible */
if (num_devices == 1)
- flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+ flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5);
+ if (num_devices < 3)
+ flags &= ~BTRFS_BLOCK_GROUP_RAID6;
if (num_devices < 4)
flags &= ~BTRFS_BLOCK_GROUP_RAID10;
- if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
- (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))) {
- flags &= ~BTRFS_BLOCK_GROUP_DUP;
- }
-
- if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
- (flags & BTRFS_BLOCK_GROUP_RAID10)) {
- flags &= ~BTRFS_BLOCK_GROUP_RAID1;
- }
+ tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+ flags &= ~tmp;
- if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
- ((flags & BTRFS_BLOCK_GROUP_RAID1) |
- (flags & BTRFS_BLOCK_GROUP_RAID10) |
- (flags & BTRFS_BLOCK_GROUP_DUP))) {
- flags &= ~BTRFS_BLOCK_GROUP_RAID0;
- }
+ if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+ tmp = BTRFS_BLOCK_GROUP_RAID6;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+ tmp = BTRFS_BLOCK_GROUP_RAID5;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+ tmp = BTRFS_BLOCK_GROUP_RAID10;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+ tmp = BTRFS_BLOCK_GROUP_RAID1;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+ tmp = BTRFS_BLOCK_GROUP_RAID0;
- return extended_to_chunk(flags);
+ return extended_to_chunk(flags | tmp);
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= root->fs_info->avail_data_alloc_bits;
- else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= root->fs_info->avail_system_alloc_bits;
- else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= root->fs_info->avail_metadata_alloc_bits;
+ unsigned seq;
+
+ do {
+ seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ flags |= root->fs_info->avail_data_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ flags |= root->fs_info->avail_system_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ flags |= root->fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&root->fs_info->profiles_lock, seq));
return btrfs_reduce_alloc_profile(root, flags);
}
@@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
u64 flags;
+ u64 ret;
if (data)
flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
else
flags = BTRFS_BLOCK_GROUP_METADATA;
- return get_alloc_profile(root, flags);
+ ret = get_alloc_profile(root, flags);
+ return ret;
}
/*
@@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
int ret = 0, committed = 0, alloc_chunk = 1;
/* make sure bytes are sectorsize aligned */
- bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ bytes = ALIGN(bytes, root->sectorsize);
if (root == root->fs_info->tree_root ||
BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
@@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
struct btrfs_space_info *data_sinfo;
/* make sure bytes are sectorsize aligned */
- bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ bytes = ALIGN(bytes, root->sectorsize);
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
@@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
{
u64 num_dev;
- if (type & BTRFS_BLOCK_GROUP_RAID10 ||
- type & BTRFS_BLOCK_GROUP_RAID0)
+ if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6))
num_dev = root->fs_info->fs_devices->rw_devices;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_dev = 2;
@@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
int wait_for_alloc = 0;
int ret = 0;
+ /* Don't re-enter if we're already allocating a chunk */
+ if (trans->allocating_chunk)
+ return -ENOSPC;
+
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
ret = update_space_info(extent_root->fs_info, flags,
@@ -3606,6 +3686,8 @@ again:
goto again;
}
+ trans->allocating_chunk = true;
+
/*
* If we have mixed data/metadata chunks we want to make sure we keep
* allocating mixed chunks instead of individual chunks.
@@ -3632,19 +3714,20 @@ again:
check_system_chunk(trans, extent_root, flags);
ret = btrfs_alloc_chunk(trans, extent_root, flags);
- if (ret < 0 && ret != -ENOSPC)
- goto out;
+ trans->allocating_chunk = false;
spin_lock(&space_info->lock);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out;
if (ret)
space_info->full = 1;
else
ret = 1;
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+out:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
-out:
mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
@@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
u64 profile = btrfs_get_alloc_profile(root, 0);
+ u64 rsv_size = 0;
u64 avail;
u64 used;
+ u64 to_add;
used = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
+ space_info->bytes_pinned + space_info->bytes_readonly;
+
+ spin_lock(&global_rsv->lock);
+ rsv_size = global_rsv->size;
+ spin_unlock(&global_rsv->lock);
+
+ /*
+ * We only want to allow over committing if we have lots of actual space
+ * free, but if we don't have enough space to handle the global reserve
+ * space then we could end up having a real enospc problem when trying
+ * to allocate a chunk or some other such important allocation.
+ */
+ rsv_size <<= 1;
+ if (used + rsv_size >= space_info->total_bytes)
+ return 0;
+
+ used += space_info->bytes_may_use;
spin_lock(&root->fs_info->free_chunk_lock);
avail = root->fs_info->free_chunk_space;
@@ -3667,40 +3768,58 @@ static int can_overcommit(struct btrfs_root *root,
/*
* If we have dup, raid1 or raid10 then only half of the free
- * space is actually useable.
+ * space is actually useable. For raid56, the space info used
+ * doesn't include the parity drive, so we don't have to
+ * change the math
*/
if (profile & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
avail >>= 1;
+ to_add = space_info->total_bytes;
+
/*
* If we aren't flushing all things, let us overcommit up to
* 1/2th of the space. If we can flush, don't let us overcommit
* too much, let it overcommit up to 1/8 of the space.
*/
if (flush == BTRFS_RESERVE_FLUSH_ALL)
- avail >>= 3;
+ to_add >>= 3;
else
- avail >>= 1;
+ to_add >>= 1;
+
+ /*
+ * Limit the overcommit to the amount of free space we could possibly
+ * allocate for chunks.
+ */
+ to_add = min(avail, to_add);
- if (used + bytes < space_info->total_bytes + avail)
+ if (used + bytes < space_info->total_bytes + to_add)
return 1;
return 0;
}
-static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
- unsigned long nr_pages,
- enum wb_reason reason)
+void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+ unsigned long nr_pages)
{
- if (!writeback_in_progress(sb->s_bdi) &&
- down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb_nr(sb, nr_pages, reason);
- up_read(&sb->s_umount);
- return 1;
- }
+ struct super_block *sb = root->fs_info->sb;
+ int started;
- return 0;
+ /* If we cannot start writeback, just sync all the delalloc files. */
+ started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
+ WB_REASON_FS_FREE_SPACE);
+ if (!started) {
+ /*
+ * We needn't worry about the filesystem going from r/w to r/o even
+ * though we don't acquire the ->s_umount mutex, because the filesystem
+ * should guarantee that the delalloc inodes list is empty once
+ * the filesystem is read-only (all dirty pages have been written
+ * to the disk).
+ */
+ btrfs_start_delalloc_inodes(root, 0);
+ btrfs_wait_ordered_extents(root, 0);
+ }
}
/*
@@ -3724,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
space_info = block_rsv->space_info;
smp_mb();
- delalloc_bytes = root->fs_info->delalloc_bytes;
+ delalloc_bytes = percpu_counter_sum_positive(
+ &root->fs_info->delalloc_bytes);
if (delalloc_bytes == 0) {
if (trans)
return;
@@ -3735,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
- writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
- nr_pages,
- WB_REASON_FS_FREE_SPACE);
-
+ btrfs_writeback_inodes_sb_nr(root, nr_pages);
/*
* We need to wait for the async pages to actually start before
* we do anything.
@@ -3766,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
break;
}
smp_mb();
- delalloc_bytes = root->fs_info->delalloc_bytes;
+ delalloc_bytes = percpu_counter_sum_positive(
+ &root->fs_info->delalloc_bytes);
}
}
@@ -3997,7 +4115,7 @@ again:
* We make the other tasks wait for the flush only when we can flush
* all things.
*/
- if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
+ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
flushing = true;
space_info->flush = 1;
}
@@ -4030,6 +4148,15 @@ again:
goto again;
out:
+ if (ret == -ENOSPC &&
+ unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+ struct btrfs_block_rsv *global_rsv =
+ &root->fs_info->global_block_rsv;
+
+ if (block_rsv != global_rsv &&
+ !block_rsv_use_bytes(global_rsv, orig_bytes))
+ ret = 0;
+ }
if (flushing) {
spin_lock(&space_info->lock);
space_info->flush = 0;
@@ -4416,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode)
btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending)
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need to reserve space for
+ * qgroup_reserved: used to return the reserved size in qgroup
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations differ from the common
+ * file/directory operations: they change two fs/file trees and the
+ * root tree, and the number of items that the qgroup reserves differs
+ * from the free space reservation. So we can not use the space
+ * reservation mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int items,
+ u64 *qgroup_reserved)
{
- struct btrfs_root *root = pending->root;
- struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
- struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
- /*
- * two for root back/forward refs, two for directory entries,
- * one for root of the snapshot and one for parent inode.
- */
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
- dst_rsv->space_info = src_rsv->space_info;
- return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+ u64 num_bytes;
+ int ret;
+
+ if (root->fs_info->quota_enabled) {
+ /* One for parent inode, two for dir entries */
+ num_bytes = 3 * root->leafsize;
+ ret = btrfs_qgroup_reserve(root, num_bytes);
+ if (ret)
+ return ret;
+ } else {
+ num_bytes = 0;
+ }
+
+ *qgroup_reserved = num_bytes;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, items);
+ rsv->space_info = __find_space_info(root->fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+ ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret) {
+ if (*qgroup_reserved)
+ btrfs_qgroup_free(root, *qgroup_reserved);
+ }
+
+ return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 qgroup_reserved)
+{
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ if (qgroup_reserved)
+ btrfs_qgroup_free(root, qgroup_reserved);
}
/**
@@ -4534,8 +4702,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
unsigned nr_extents = 0;
int extra_reserve = 0;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
- int ret;
+ int ret = 0;
bool delalloc_lock = true;
+ u64 to_free = 0;
+ unsigned dropped;
/* If we are a free space inode we need to not flush since we will be in
* the middle of a transaction commit. We also don't need the delalloc
@@ -4582,53 +4752,16 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (root->fs_info->quota_enabled) {
ret = btrfs_qgroup_reserve(root, num_bytes +
nr_extents * root->leafsize);
- if (ret) {
- spin_lock(&BTRFS_I(inode)->lock);
- calc_csum_metadata_size(inode, num_bytes, 0);
- spin_unlock(&BTRFS_I(inode)->lock);
- if (delalloc_lock)
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
- return ret;
- }
+ if (ret)
+ goto out_fail;
}
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
- if (ret) {
- u64 to_free = 0;
- unsigned dropped;
-
- spin_lock(&BTRFS_I(inode)->lock);
- dropped = drop_outstanding_extent(inode);
- /*
- * If the inodes csum_bytes is the same as the original
- * csum_bytes then we know we haven't raced with any free()ers
- * so we can just reduce our inodes csum bytes and carry on.
- * Otherwise we have to do the normal free thing to account for
- * the case that the free side didn't free up its reserve
- * because of this outstanding reservation.
- */
- if (BTRFS_I(inode)->csum_bytes == csum_bytes)
- calc_csum_metadata_size(inode, num_bytes, 0);
- else
- to_free = calc_csum_metadata_size(inode, num_bytes, 0);
- spin_unlock(&BTRFS_I(inode)->lock);
- if (dropped)
- to_free += btrfs_calc_trans_metadata_size(root, dropped);
-
- if (to_free) {
- btrfs_block_rsv_release(root, block_rsv, to_free);
- trace_btrfs_space_reservation(root->fs_info,
- "delalloc",
- btrfs_ino(inode),
- to_free, 0);
- }
- if (root->fs_info->quota_enabled) {
+ if (unlikely(ret)) {
+ if (root->fs_info->quota_enabled)
btrfs_qgroup_free(root, num_bytes +
nr_extents * root->leafsize);
- }
- if (delalloc_lock)
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
- return ret;
+ goto out_fail;
}
spin_lock(&BTRFS_I(inode)->lock);
@@ -4649,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
block_rsv_add_bytes(block_rsv, to_reserve, 1);
return 0;
+
+out_fail:
+ spin_lock(&BTRFS_I(inode)->lock);
+ dropped = drop_outstanding_extent(inode);
+ /*
+ * If the inodes csum_bytes is the same as the original
+ * csum_bytes then we know we haven't raced with any free()ers
+ * so we can just reduce our inodes csum bytes and carry on.
+ * Otherwise we have to do the normal free thing to account for
+ * the case that the free side didn't free up its reserve
+ * because of this outstanding reservation.
+ */
+ if (BTRFS_I(inode)->csum_bytes == csum_bytes)
+ calc_csum_metadata_size(inode, num_bytes, 0);
+ else
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (dropped)
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+ if (to_free) {
+ btrfs_block_rsv_release(root, block_rsv, to_free);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_free, 0);
+ }
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ return ret;
}
/**
@@ -4670,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
spin_lock(&BTRFS_I(inode)->lock);
dropped = drop_outstanding_extent(inode);
- to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ if (num_bytes)
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
spin_unlock(&BTRFS_I(inode)->lock);
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4737,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
btrfs_free_reserved_data_space(inode, num_bytes);
}
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache = NULL;
@@ -4775,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && cache->cached == BTRFS_CACHE_NO)
- cache_block_group(cache, trans, NULL, 1);
+ cache_block_group(cache, 1);
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -4825,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
struct btrfs_block_group_cache *cache;
u64 bytenr;
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ bytenr = root->fs_info->first_logical_byte;
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+
+ if (bytenr < (u64)-1)
+ return bytenr;
+
cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
if (!cache)
return 0;
@@ -4875,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
/*
* this function must be called within transaction
*/
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group_cache *cache;
@@ -4890,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
* to one because the slow code to read in the free extents does check
* the pinned extents.
*/
- cache_block_group(cache, trans, root, 1);
+ cache_block_group(cache, 1);
pin_down_extent(root, cache, bytenr, num_bytes, 0);
@@ -5287,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = update_block_group(trans, root, bytenr, num_bytes, 0);
+ ret = update_block_group(root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5332,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->extent_op) {
if (!head->must_insert_reserved)
goto out;
- kfree(head->extent_op);
+ btrfs_free_delayed_extent_op(head->extent_op);
head->extent_op = NULL;
}
@@ -5455,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ret;
}
-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache,
+ u64 val, u64 num_bytes)
{
- u64 mask = ((u64)root->stripesize - 1);
- u64 ret = (val + mask) & ~mask;
+ u64 ret = ALIGN(val, root->stripesize);
return ret;
}
@@ -5478,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
- DEFINE_WAIT(wait);
caching_ctl = get_caching_control(cache);
if (!caching_ctl)
@@ -5495,7 +5662,6 @@ static noinline int
wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
{
struct btrfs_caching_control *caching_ctl;
- DEFINE_WAIT(wait);
caching_ctl = get_caching_control(cache);
if (!caching_ctl)
@@ -5509,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
int __get_raid_index(u64 flags)
{
- int index;
-
if (flags & BTRFS_BLOCK_GROUP_RAID10)
- index = 0;
+ return BTRFS_RAID_RAID10;
else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- index = 1;
+ return BTRFS_RAID_RAID1;
else if (flags & BTRFS_BLOCK_GROUP_DUP)
- index = 2;
+ return BTRFS_RAID_DUP;
else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- index = 3;
- else
- index = 4;
+ return BTRFS_RAID_RAID0;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ return BTRFS_RAID_RAID5;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return BTRFS_RAID_RAID6;
- return index;
+ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5560,7 +5726,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
int empty_cluster = 2 * 1024 * 1024;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = 0;
+ int index = __get_raid_index(data);
int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool found_uncached_bg = false;
@@ -5665,6 +5831,8 @@ search:
if (!block_group_bits(block_group, data)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID10;
/*
@@ -5680,8 +5848,7 @@ have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
found_uncached_bg = true;
- ret = cache_block_group(block_group, trans,
- orig_root, 0);
+ ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
}
@@ -5694,6 +5861,7 @@ have_block_group:
* lets look there
*/
if (last_ptr) {
+ unsigned long aligned_cluster;
/*
* the refill lock keeps out other
* people trying to start a new cluster
@@ -5760,11 +5928,15 @@ refill_cluster:
goto unclustered_alloc;
}
+ aligned_cluster = max_t(unsigned long,
+ empty_cluster + empty_size,
+ block_group->full_stripe_len);
+
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
search_start, num_bytes,
- empty_cluster + empty_size);
+ aligned_cluster);
if (ret == 0) {
/*
* now pull our allocation out of this
@@ -5835,7 +6007,8 @@ unclustered_alloc:
goto loop;
}
checks:
- search_start = stripe_align(root, offset);
+ search_start = stripe_align(root, used_block_group,
+ offset, num_bytes);
/* move on to the next group */
if (search_start + num_bytes >
@@ -5986,7 +6159,7 @@ again:
if (ret == -ENOSPC) {
if (!final_tried) {
num_bytes = num_bytes >> 1;
- num_bytes = num_bytes & ~(root->sectorsize - 1);
+ num_bytes = round_down(num_bytes, root->sectorsize);
num_bytes = max(num_bytes, min_alloc_size);
if (num_bytes == min_alloc_size)
final_tried = true;
@@ -6110,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -6174,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -6217,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
u64 num_bytes = ins->offset;
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- cache_block_group(block_group, trans, NULL, 0);
+ cache_block_group(block_group, 0);
caching_ctl = get_caching_control(block_group);
if (!caching_ctl) {
@@ -6331,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,
if (!ret)
return block_rsv;
if (ret && !block_rsv->failfast) {
- static DEFINE_RATELIMIT_STATE(_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- /*DEFAULT_RATELIMIT_BURST*/ 2);
- if (__ratelimit(&_rs))
- WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
- ret);
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL * 10,
+ /*DEFAULT_RATELIMIT_BURST*/ 1);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG
+ "btrfs: block rsv returned %d\n", ret);
+ }
ret = reserve_metadata_bytes(root, block_rsv, blocksize,
BTRFS_RESERVE_NO_FLUSH);
if (!ret) {
@@ -6402,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ extent_op = btrfs_alloc_delayed_extent_op();
BUG_ON(!extent_op); /* -ENOMEM */
if (key)
memcpy(&extent_op->key, key, sizeof(extent_op->key));
@@ -6524,7 +6699,7 @@ reada:
}
/*
- * hepler to process tree block while walking down the tree.
+ * helper to process tree block while walking down the tree.
*
* when wc->stage == UPDATE_BACKREF, this function updates
* back refs for pointers in the block.
@@ -6599,7 +6774,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
}
/*
- * hepler to process tree block pointer.
+ * helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
* reference count of the block pointed to. if the block
@@ -6737,7 +6912,7 @@ skip:
}
/*
- * hepler to process tree block while walking up the tree.
+ * helper to process tree block while walking up the tree.
*
* when wc->stage == DROP_REFERENCE, this function drops
* reference count on the block.
@@ -6788,11 +6963,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
&wc->flags[level]);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return ret;
}
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return 1;
}
}
@@ -7203,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
root->fs_info->fs_devices->missing_devices;
stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
if (num_devices == 1) {
@@ -7481,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
index = get_block_group_index(block_group);
}
- if (index == 0) {
+ if (index == BTRFS_RAID_RAID10) {
dev_min = 4;
/* Divide by 2 */
min_free >>= 1;
- } else if (index == 1) {
+ } else if (index == BTRFS_RAID_RAID1) {
dev_min = 2;
- } else if (index == 2) {
+ } else if (index == BTRFS_RAID_DUP) {
/* Multiply by 2 */
min_free <<= 1;
- } else if (index == 3) {
+ } else if (index == BTRFS_RAID_RAID0) {
dev_min = fs_devices->rw_devices;
do_div(min_free, dev_min);
}
@@ -7651,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
- if (space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0 ||
- space_info->bytes_may_use > 0) {
- WARN_ON(1);
- dump_space_info(space_info, 0, 0);
+ if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
+ if (space_info->bytes_pinned > 0 ||
+ space_info->bytes_reserved > 0 ||
+ space_info->bytes_may_use > 0) {
+ WARN_ON(1);
+ dump_space_info(space_info, 0, 0);
+ }
}
list_del(&space_info->list);
kfree(space_info);
@@ -7754,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
btrfs_release_path(path);
cache->flags = btrfs_block_group_flags(&cache->item);
cache->sectorsize = root->sectorsize;
-
+ cache->full_stripe_len = btrfs_full_stripe_len(root,
+ &root->fs_info->mapping_tree,
+ found_key.objectid);
btrfs_init_free_space_ctl(cache);
/*
@@ -7808,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
if (!(get_alloc_profile(root, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_DUP)))
continue;
/*
@@ -7883,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
cache->sectorsize = root->sectorsize;
cache->fs_info = root->fs_info;
+ cache->full_stripe_len = btrfs_full_stripe_len(root,
+ &root->fs_info->mapping_tree,
+ chunk_offset);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
@@ -7932,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
u64 extra_flags = chunk_to_extended(flags) &
BTRFS_EXTENDED_PROFILE_MASK;
+ write_seqlock(&fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits &= ~extra_flags;
if (flags & BTRFS_BLOCK_GROUP_METADATA)
fs_info->avail_metadata_alloc_bits &= ~extra_flags;
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
fs_info->avail_system_alloc_bits &= ~extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -8036,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&root->fs_info->block_group_cache_tree);
+
+ if (root->fs_info->first_logical_byte == block_group->key.objectid)
+ root->fs_info->first_logical_byte = (u64)-1;
spin_unlock(&root->fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
@@ -8158,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!block_group_cache_done(cache)) {
- ret = cache_block_group(cache, NULL, root, 0);
+ ret = cache_block_group(cache, 0);
if (!ret)
wait_block_group_cache_done(cache);
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1b319df29eee..f173c5af6461 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4,7 +4,6 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
-#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
@@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
*/
static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
@@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
*/
static void check_page_locked(struct extent_io_tree *tree, struct page *page)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
unlock_page(page);
@@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
if (ret)
err = ret;
- if (did_repair) {
- ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
- rec->start + rec->len - 1,
- EXTENT_DAMAGED, GFP_NOFS);
- if (ret && !err)
- err = ret;
- }
+ ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+ rec->start + rec->len - 1,
+ EXTENT_DAMAGED, GFP_NOFS);
+ if (ret && !err)
+ err = ret;
kfree(rec);
return err;
@@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
u64 map_length = 0;
u64 sector;
struct btrfs_bio *bbio = NULL;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int ret;
BUG_ON(!mirror_num);
+ /* we can't repair anything in raid56 yet */
+ if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+ return 0;
+
bio = bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
@@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
return -EIO;
}
bio->bi_bdev = dev->bdev;
- bio_add_page(bio, page, length, start-page_offset(page));
+ bio_add_page(bio, page, length, start - page_offset(page));
btrfsic_submit_bio(WRITE_SYNC, bio);
wait_for_completion(&compl);
@@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page)
failrec->failed_mirror);
did_repair = !ret;
}
+ ret = 0;
}
out:
@@ -2293,8 +2296,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
struct page *page = bvec->bv_page;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- start = ((u64)page->index << PAGE_CACHE_SHIFT) +
- bvec->bv_offset;
+ start = page_offset(page) + bvec->bv_offset;
end = start + bvec->bv_len - 1;
if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
(long int)bio->bi_bdev);
tree = &BTRFS_I(page->mapping->host)->io_tree;
- start = ((u64)page->index << PAGE_CACHE_SHIFT) +
- bvec->bv_offset;
+ start = page_offset(page) + bvec->bv_offset;
end = start + bvec->bv_len - 1;
if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
struct extent_io_tree *tree = bio->bi_private;
u64 start;
- start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+ start = page_offset(page) + bvec->bv_offset;
bio->bi_private = NULL;
@@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
return ret;
}
-static int merge_bio(struct extent_io_tree *tree, struct page *page,
+static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
unsigned long offset, size_t size, struct bio *bio,
unsigned long bio_flags)
{
int ret = 0;
if (tree->ops && tree->ops->merge_bio_hook)
- ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+ ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
bio_flags);
BUG_ON(ret < 0);
return ret;
@@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
sector;
if (prev_bio_flags != bio_flags || !contig ||
- merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
+ merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
prev_bio_flags);
@@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
unsigned long *bio_flags)
{
struct inode *inode = page->mapping->host;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
u64 cur = start;
@@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
}
}
while (cur <= end) {
+ unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+
if (cur >= last_byte) {
char *userpage;
struct extent_state *cached = NULL;
@@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
- iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+ iosize = ALIGN(iosize, blocksize);
if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
disk_io_size = em->block_len;
sector = em->block_start >> 9;
@@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
continue;
}
- ret = 0;
- if (tree->ops && tree->ops->readpage_io_hook) {
- ret = tree->ops->readpage_io_hook(page, cur,
- cur + iosize - 1);
- }
- if (!ret) {
- unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
- pnr -= page->index;
- ret = submit_extent_page(READ, tree, page,
+ pnr -= page->index;
+ ret = submit_extent_page(READ, tree, page,
sector, disk_io_size, pg_offset,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag);
- if (!ret) {
- nr++;
- *bio_flags = this_bio_flag;
- }
- }
- if (ret) {
+ if (!ret) {
+ nr++;
+ *bio_flags = this_bio_flag;
+ } else {
SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1);
}
@@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct inode *inode = page->mapping->host;
struct extent_page_data *epd = data;
struct extent_io_tree *tree = epd->tree;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 delalloc_start;
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
@@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
iosize = min(extent_map_end(em) - cur, end - cur + 1);
- iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+ iosize = ALIGN(iosize, blocksize);
sector = (em->block_start + extent_offset) >> 9;
bdev = em->bdev;
block_start = em->block_start;
@@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
spin_unlock(&eb->refs_lock);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
- spin_lock(&fs_info->delalloc_lock);
- if (fs_info->dirty_metadata_bytes >= eb->len)
- fs_info->dirty_metadata_bytes -= eb->len;
- else
- WARN_ON(1);
- spin_unlock(&fs_info->delalloc_lock);
+ __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+ -eb->len,
+ fs_info->dirty_metadata_batch);
ret = 1;
} else {
spin_unlock(&eb->refs_lock);
@@ -3446,15 +3437,9 @@ retry:
* swizzled back from swapper_space to tmpfs file
* mapping
*/
- if (tree->ops &&
- tree->ops->write_cache_pages_lock_hook) {
- tree->ops->write_cache_pages_lock_hook(page,
- data, flush_fn);
- } else {
- if (!trylock_page(page)) {
- flush_fn(data);
- lock_page(page);
- }
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
}
if (unlikely(page->mapping != mapping)) {
@@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset)
{
struct extent_state *cached_state = NULL;
- u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
- start += (offset + blocksize - 1) & ~(blocksize - 1);
+ start += ALIGN(offset, blocksize);
if (start > end)
return 0;
@@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
int ret = 1;
@@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
gfp_t mask)
{
struct extent_map *em;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
if ((mask & __GFP_WAIT) &&
@@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
len = last - offset;
if (len == 0)
break;
- len = (len + sectorsize - 1) & ~(sectorsize - 1);
+ len = ALIGN(len, sectorsize);
em = get_extent(inode, NULL, 0, offset, len, 0);
if (IS_ERR_OR_NULL(em))
return em;
@@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb)
list_del(&eb->leak_list);
spin_unlock_irqrestore(&leak_lock, flags);
#endif
- if (eb->pages && eb->pages != eb->inline_pages)
- kfree(eb->pages);
kmem_cache_free(extent_buffer_cache, eb);
}
@@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
atomic_set(&eb->refs, 1);
atomic_set(&eb->io_pages, 0);
- if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
- struct page **pages;
- int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- pages = kzalloc(num_pages, mask);
- if (!pages) {
- __free_extent_buffer(eb);
- return NULL;
- }
- eb->pages = pages;
- } else {
- eb->pages = eb->inline_pages;
- }
+ /*
+ * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
+ */
+ BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
+ > MAX_INLINE_EXTENT_BUFFER_SIZE);
+ BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
return eb;
}
@@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
+ int refs;
/* the ref bit is tricky. We have to make sure it is set
* if we have the buffer dirty. Otherwise the
* code to free a buffer can end up dropping a dirty
@@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
* So bump the ref count first, then set the bit. If someone
* beat us to it, drop the ref we added.
*/
+ refs = atomic_read(&eb->refs);
+ if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ return;
+
spin_lock(&eb->refs_lock);
if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
atomic_inc(&eb->refs);
@@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
void free_extent_buffer(struct extent_buffer *eb)
{
+ int refs;
+ int old;
if (!eb)
return;
+ while (1) {
+ refs = atomic_read(&eb->refs);
+ if (refs <= 3)
+ break;
+ old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
+ if (old == refs)
+ return;
+ }
+
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2eacfabd3263..6068a1985560 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -72,10 +72,9 @@ struct extent_io_ops {
int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
extent_submit_bio_hook_t *submit_bio_hook;
- int (*merge_bio_hook)(struct page *page, unsigned long offset,
+ int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags);
- int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int mirror);
@@ -90,8 +89,6 @@ struct extent_io_ops {
struct extent_state *other);
void (*split_extent_hook)(struct inode *inode,
struct extent_state *orig, u64 split);
- int (*write_cache_pages_lock_hook)(struct page *page, void *data,
- void (*flush_fn)(void *));
};
struct extent_io_tree {
@@ -161,8 +158,7 @@ struct extent_buffer {
*/
wait_queue_head_t read_lock_wq;
wait_queue_head_t lock_wq;
- struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES];
- struct page **pages;
+ struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
};
static inline void extent_set_compress_type(unsigned long *bio_flags,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f169d6b11d7f..2834ca5768ea 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,6 +1,5 @@
#include <linux/err.h>
#include <linux/slab.h>
-#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include "ctree.h"
@@ -171,6 +170,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
return 0;
+ if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+ test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+ return 0;
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -255,7 +258,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
if (!em)
goto out;
- list_move(&em->list, &tree->modified_extents);
+ if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ list_move(&em->list, &tree->modified_extents);
em->generation = gen;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
em->mod_start = em->start;
@@ -280,6 +284,13 @@ out:
}
+/*
+ * Clear EXTENT_FLAG_LOGGING on @em.  When em->in_tree is set, the map is
+ * then handed to try_merge_map() for @tree.
+ */
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+	if (!em->in_tree)
+		return;
+
+	try_merge_map(tree, em);
+}
+
/**
* add_extent_mapping - add new extent map to the extent tree
* @tree: tree to insert new map in
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 922943ce29e8..c6598c89cff8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index bd38cef42358..ec160202be3e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
if (!contig)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
- if (!contig && (offset >= ordered->file_offset + ordered->len ||
- offset < ordered->file_offset)) {
+ if (offset >= ordered->file_offset + ordered->len ||
+ offset < ordered->file_offset) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
this_sum_bytes = 0;
@@ -684,6 +684,24 @@ out:
return ret;
}
+/*
+ * Count how many bytes, in units of @sectorsize, are covered by @sector_sum
+ * and the entries directly following it whose bytenr values are consecutive.
+ *
+ * The count starts at one sector and stops growing as soon as @total_bytes
+ * plus the running count reaches sums->len, or the next entry's bytenr is
+ * not exactly one sector after the current one.
+ */
+static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
+				 struct btrfs_sector_sum *sector_sum,
+				 u64 total_bytes, u64 sectorsize)
+{
+	struct btrfs_sector_sum *cur = sector_sum;
+	u64 contig = sectorsize;
+
+	for (; contig + total_bytes < sums->len; cur++) {
+		/* cur[1] is only read while the length check above holds */
+		if (cur->bytenr + sectorsize != cur[1].bytenr)
+			break;
+		contig += sectorsize;
+	}
+
+	return contig;
+}
+
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
@@ -789,20 +807,32 @@ again:
goto insert;
}
- if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
+ if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
csum_size) {
- u32 diff = (csum_offset + 1) * csum_size;
+ int extend_nr;
+ u64 tmp;
+ u32 diff;
+ u32 free_space;
- /*
- * is the item big enough already? we dropped our lock
- * before and need to recheck
- */
- if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
- goto csum;
+ if (btrfs_leaf_free_space(root, leaf) <
+ sizeof(struct btrfs_item) + csum_size * 2)
+ goto insert;
+
+ free_space = btrfs_leaf_free_space(root, leaf) -
+ sizeof(struct btrfs_item) - csum_size;
+ tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
+ root->sectorsize);
+ tmp >>= root->fs_info->sb->s_blocksize_bits;
+ WARN_ON(tmp < 1);
+
+ extend_nr = max_t(int, 1, (int)tmp);
+ diff = (csum_offset + extend_nr) * csum_size;
+ diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
- if (diff != csum_size)
- goto insert;
+ diff = min(free_space, diff);
+ diff /= csum_size;
+ diff *= csum_size;
btrfs_extend_item(trans, root, path, diff);
goto csum;
@@ -812,19 +842,14 @@ insert:
btrfs_release_path(path);
csum_offset = 0;
if (found_next) {
- u64 tmp = total_bytes + root->sectorsize;
- u64 next_sector = sector_sum->bytenr;
- struct btrfs_sector_sum *next = sector_sum + 1;
+ u64 tmp;
- while (tmp < sums->len) {
- if (next_sector + root->sectorsize != next->bytenr)
- break;
- tmp += root->sectorsize;
- next_sector = next->bytenr;
- next++;
- }
- tmp = min(tmp, next_offset - file_key.offset);
+ tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
+ root->sectorsize);
tmp >>= root->fs_info->sb->s_blocksize_bits;
+ tmp = min(tmp, (next_offset - file_key.offset) >>
+ root->fs_info->sb->s_blocksize_bits);
+
tmp = max((u64)1, tmp);
tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
ins_size = csum_size * tmp;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 77061bf43edb..af1d0605a5c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -30,11 +30,11 @@
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
+#include <linux/btrfs.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
@@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
int num_defrag;
+ int index;
+ int ret;
/* get the inode */
key.objectid = defrag->root;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(inode_root)) {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- return PTR_ERR(inode_root);
+ ret = PTR_ERR(inode_root);
+ goto cleanup;
+ }
+ if (btrfs_root_refs(&inode_root->root_item) == 0) {
+ ret = -ENOENT;
+ goto cleanup;
}
key.objectid = defrag->ino;
@@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
if (IS_ERR(inode)) {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- return PTR_ERR(inode);
+ ret = PTR_ERR(inode);
+ goto cleanup;
}
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
@@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
iput(inode);
return 0;
+cleanup:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return ret;
}
/*
@@ -360,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
atomic_inc(&fs_info->defrag_running);
while(1) {
+ /* Pause the auto defragger. */
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING,
+ &fs_info->fs_state))
+ break;
+
if (!__need_auto_defrag(fs_info->tree_root))
break;
@@ -491,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
loff_t isize = i_size_read(inode);
start_pos = pos & ~((u64)root->sectorsize - 1);
- num_bytes = (write_bytes + pos - start_pos +
- root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -1211,7 +1229,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
struct extent_state *cached_state = NULL;
int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT;
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0;
int faili = 0;
@@ -1298,7 +1316,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct iov_iter *i,
loff_t pos)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
unsigned long first_index;
@@ -1486,7 +1504,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
loff_t *ppos = &iocb->ki_pos;
u64 start_pos;
@@ -1530,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
* although we have opened a file as writable, we have
* to stop this write operation to ensure FS consistency.
*/
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
mutex_unlock(&inode->i_mutex);
err = -EROFS;
goto out;
@@ -1594,9 +1612,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
if (err < 0 && num_written > 0)
num_written = err;
}
-out:
+
if (sync)
atomic_dec(&BTRFS_I(inode)->sync_writers);
+out:
sb_end_write(inode->i_sb);
current->backing_dev_info = NULL;
return num_written ? num_written : err;
@@ -1612,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
*/
if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags)) {
- btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /*
+ * We need to block on a committing transaction to keep us from
+ * throwing an ordered operation onto the list and causing
+ * something like sync to deadlock trying to flush out this
+ * inode.
+ */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
+ btrfs_end_transaction(trans, root);
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(inode->i_mapping);
}
@@ -1639,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
struct btrfs_trans_handle *trans;
+ bool full_sync = 0;
trace_btrfs_sync_file(file, datasync);
/*
* We write the dirty pages in the range and wait until they complete
* out of the ->i_mutex. If so, we can flush the dirty pages by
- * multi-task, and make the performance up.
+ * multi-task, and make the performance up. See
+ * btrfs_wait_ordered_range for an explanation of the ASYNC check.
*/
atomic_inc(&BTRFS_I(inode)->sync_writers);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
atomic_dec(&BTRFS_I(inode)->sync_writers);
if (ret)
return ret;
@@ -1660,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* range being left.
*/
atomic_inc(&root->log_batch);
- btrfs_wait_ordered_range(inode, start, end - start + 1);
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ if (full_sync)
+ btrfs_wait_ordered_range(inode, start, end - start + 1);
atomic_inc(&root->log_batch);
/*
@@ -1727,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret != BTRFS_NO_LOG_SYNC) {
if (ret > 0) {
+ /*
+ * If we didn't already wait for ordered extents we need
+ * to do that now.
+ */
+ if (!full_sync)
+ btrfs_wait_ordered_range(inode, start,
+ end - start + 1);
ret = btrfs_commit_transaction(trans, root);
} else {
ret = btrfs_sync_log(trans, root);
- if (ret == 0)
+ if (ret == 0) {
ret = btrfs_end_transaction(trans, root);
- else
+ } else {
+ if (!full_sync)
+ btrfs_wait_ordered_range(inode, start,
+ end -
+ start + 1);
ret = btrfs_commit_transaction(trans, root);
+ }
}
} else {
ret = btrfs_end_transaction(trans, root);
@@ -2087,7 +2139,7 @@ out:
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
u64 cur_offset;
u64 last_byte;
@@ -2241,6 +2293,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
if (lockend <= lockstart)
lockend = lockstart + root->sectorsize;
+ lockend--;
len = lockend - lockstart + 1;
len = max_t(u64, len, root->sectorsize);
@@ -2307,9 +2360,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
}
}
- *offset = start;
- free_extent_map(em);
- break;
+ if (!test_bit(EXTENT_FLAG_PREALLOC,
+ &em->flags)) {
+ *offset = start;
+ free_extent_map(em);
+ break;
+ }
}
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 59ea2e4349c9..1f84fc09c1a8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
+ max_bitmaps = max(max_bitmaps, 1);
+
BUG_ON(ctl->total_bitmaps > max_bitmaps);
/*
@@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
}
static struct btrfs_free_space *
-find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+ unsigned long align)
{
struct btrfs_free_space *entry;
struct rb_node *node;
+ u64 ctl_off;
+ u64 tmp;
+ u64 align_off;
int ret;
if (!ctl->free_space_offset.rb_node)
@@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
if (entry->bytes < *bytes)
continue;
+ /* make sure the space returned is big enough
+ * to match our requested alignment
+ */
+ if (*bytes >= align) {
+ ctl_off = entry->offset - ctl->start;
+ tmp = ctl_off + align - 1;;
+ do_div(tmp, align);
+ tmp = tmp * align + ctl->start;
+ align_off = tmp - entry->offset;
+ } else {
+ align_off = 0;
+ tmp = entry->offset;
+ }
+
+ if (entry->bytes < *bytes + align_off)
+ continue;
+
if (entry->bitmap) {
- ret = search_bitmap(ctl, entry, offset, bytes);
- if (!ret)
+ ret = search_bitmap(ctl, entry, &tmp, bytes);
+ if (!ret) {
+ *offset = tmp;
return entry;
+ }
continue;
}
- *offset = entry->offset;
- *bytes = entry->bytes;
+ *offset = tmp;
+ *bytes = entry->bytes - align_off;
return entry;
}
@@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
}
/*
- * some block groups are so tiny they can't be enveloped by a bitmap, so
- * don't even bother to create a bitmap for this
+ * The original block groups from mkfs can be really small, like 8
+ * megabytes, so don't bother with a bitmap for those entries. However
+ * some block groups can be smaller than what a bitmap would cover but
+ * are still large enough that they could overflow the 32k memory limit,
+ * so still allow those block groups to have a bitmap entry.
*/
- if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
+ if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
return false;
return true;
@@ -1862,11 +1891,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
- int ret = 0;
+ int ret;
+ bool re_search = false;
spin_lock(&ctl->tree_lock);
again:
+ ret = 0;
if (!bytes)
goto out_lock;
@@ -1879,17 +1910,17 @@ again:
info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!info) {
- /* the tree logging code might be calling us before we
- * have fully loaded the free space rbtree for this
- * block group. So it is possible the entry won't
- * be in the rbtree yet at all. The caching code
- * will make sure not to put it in the rbtree if
- * the logging code has pinned it.
+ /*
+ * If we found a partial bit of our free space in a
+ * bitmap but then couldn't find the other part this may
+ * be a problem, so WARN about it.
*/
+ WARN_ON(re_search);
goto out_lock;
}
}
+ re_search = false;
if (!info->bitmap) {
unlink_free_space(ctl, info);
if (offset == info->offset) {
@@ -1935,8 +1966,10 @@ again:
}
ret = remove_from_bitmap(ctl, info, &offset, &bytes);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
+ re_search = true;
goto again;
+ }
BUG_ON(ret); /* logic error */
out_lock:
spin_unlock(&ctl->tree_lock);
@@ -2091,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *entry = NULL;
u64 bytes_search = bytes + empty_size;
u64 ret = 0;
+ u64 align_gap = 0;
+ u64 align_gap_len = 0;
spin_lock(&ctl->tree_lock);
- entry = find_free_space(ctl, &offset, &bytes_search);
+ entry = find_free_space(ctl, &offset, &bytes_search,
+ block_group->full_stripe_len);
if (!entry)
goto out;
@@ -2103,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
if (!entry->bytes)
free_bitmap(ctl, entry);
} else {
+
unlink_free_space(ctl, entry);
- entry->offset += bytes;
- entry->bytes -= bytes;
+ align_gap_len = offset - entry->offset;
+ align_gap = entry->offset;
+
+ entry->offset = offset + bytes;
+ WARN_ON(entry->bytes < bytes + align_gap_len);
+
+ entry->bytes -= bytes + align_gap_len;
if (!entry->bytes)
kmem_cache_free(btrfs_free_space_cachep, entry);
else
@@ -2115,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
out:
spin_unlock(&ctl->tree_lock);
+ if (align_gap_len)
+ __btrfs_add_free_space(ctl, align_gap, align_gap_len);
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67ed24ae86bb..c226daefd65d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,12 +39,13 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
+#include <linux/btrfs.h>
+#include <linux/blkdev.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
@@ -54,6 +55,7 @@
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
+#include "backref.h"
struct btrfs_iget_args {
u64 ino;
@@ -88,7 +90,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
};
-static int btrfs_setsize(struct inode *inode, loff_t newsize);
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
@@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
u64 isize = i_size_read(inode);
u64 actual_end = min(end + 1, isize);
u64 inline_len = actual_end - start;
- u64 aligned_end = (end + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
+ u64 aligned_end = ALIGN(end, root->sectorsize);
u64 data_len = inline_len;
int ret;
@@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
return 1;
}
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
@@ -389,7 +391,7 @@ again:
* a compressed extent to 128k.
*/
total_compressed = min(total_compressed, max_uncompressed);
- num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+ num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
total_in = 0;
ret = 0;
@@ -488,15 +490,13 @@ cont:
* up to a block size boundary so the allocator does sane
* things
*/
- total_compressed = (total_compressed + blocksize - 1) &
- ~(blocksize - 1);
+ total_compressed = ALIGN(total_compressed, blocksize);
/*
* one last check to make sure the compression is really a
* win, compare the page count read with the blocks on disk
*/
- total_in = (total_in + PAGE_CACHE_SIZE - 1) &
- ~(PAGE_CACHE_SIZE - 1);
+ total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
if (total_compressed >= total_in) {
will_compress = 0;
} else {
@@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
if (list_empty(&async_cow->extents))
return 0;
-
+again:
while (!list_empty(&async_cow->extents)) {
async_extent = list_entry(async_cow->extents.next,
struct async_extent, list);
@@ -648,6 +648,8 @@ retry:
async_extent->ram_size - 1,
btrfs_get_extent,
WB_SYNC_ALL);
+ else if (ret)
+ unlock_page(async_cow->locked_page);
kfree(async_extent);
cond_resched();
continue;
@@ -672,6 +674,7 @@ retry:
if (ret) {
int i;
+
for (i = 0; i < async_extent->nr_pages; i++) {
WARN_ON(async_extent->pages[i]->mapping);
page_cache_release(async_extent->pages[i]);
@@ -679,12 +682,10 @@ retry:
kfree(async_extent->pages);
async_extent->nr_pages = 0;
async_extent->pages = NULL;
- unlock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+
if (ret == -ENOSPC)
goto retry;
- goto out_free; /* JDM: Requeue? */
+ goto out_free;
}
/*
@@ -696,10 +697,13 @@ retry:
async_extent->ram_size - 1, 0);
em = alloc_extent_map();
- BUG_ON(!em); /* -ENOMEM */
+ if (!em)
+ goto out_free_reserve;
em->start = async_extent->start;
em->len = async_extent->ram_size;
em->orig_start = em->start;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
em->block_start = ins.objectid;
em->block_len = ins.offset;
@@ -726,6 +730,9 @@ retry:
async_extent->ram_size - 1, 0);
}
+ if (ret)
+ goto out_free_reserve;
+
ret = btrfs_add_ordered_extent_compress(inode,
async_extent->start,
ins.objectid,
@@ -733,7 +740,8 @@ retry:
ins.offset,
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ goto out_free_reserve;
/*
* clear dirty, set writeback and unlock the pages.
@@ -754,18 +762,30 @@ retry:
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages);
-
- BUG_ON(ret); /* -ENOMEM */
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
+ if (ret)
+ goto out;
cond_resched();
}
ret = 0;
out:
return ret;
+out_free_reserve:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
out_free:
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
kfree(async_extent);
- goto out;
+ goto again;
}
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
BUG_ON(btrfs_is_free_space_inode(inode));
- num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+ num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
disk_num_bytes = num_bytes;
@@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
em->orig_start = em->start;
ram_size = ins.offset;
em->len = ins.offset;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
em->block_start = ins.objectid;
em->block_len = ins.offset;
@@ -1338,6 +1360,8 @@ out_check:
em->block_start = disk_bytenr;
em->orig_block_len = disk_num_bytes;
em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
set_bit(EXTENT_FLAG_FILLING, &em->flags);
em->generation = -1;
@@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode,
spin_unlock(&BTRFS_I(inode)->lock);
}
- spin_lock(&root->fs_info->delalloc_lock);
+ __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
+ root->fs_info->delalloc_batch);
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
- root->fs_info->delalloc_bytes += len;
- if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
+ if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
+ set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode,
&& do_list)
btrfs_free_reserved_data_space(inode, len);
- spin_lock(&root->fs_info->delalloc_lock);
- root->fs_info->delalloc_bytes -= len;
+ __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
+ root->fs_info->delalloc_batch);
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes -= len;
-
if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
- !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+ test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
* extent_io.c merge_bio_hook, this must check the chunk tree to make sure
* we don't create bios that span stripes or chunks
*/
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags)
{
@@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
length = bio->bi_size;
map_length = length;
- ret = btrfs_map_block(root->fs_info, READ, logical,
+ ret = btrfs_map_block(root->fs_info, rw, logical,
&map_length, NULL, 0);
/* Will always return 0 with map_multi == NULL */
BUG_ON(ret < 0);
@@ -1892,6 +1931,640 @@ out:
return ret;
}
+/*
+ * snapshot-aware defrag
+ *
+ * One recorded back reference: a file extent item, found in some root and
+ * inode, that points at old->bytenr.  Entries are filled in by
+ * record_one_backref() and linked into new_sa_defrag_extent::root, ordered
+ * by backref_comp() on (root_id, inum, file_pos).
+ */
+struct sa_defrag_extent_backref {
+ struct rb_node node; /* linkage in new_sa_defrag_extent::root */
+ struct old_sa_defrag_extent *old; /* old extent this backref was found for */
+ u64 root_id; /* objectid used to look up the referencing root */
+ u64 inum; /* inode number holding the file extent item */
+ u64 file_pos; /* stored as (iterator offset + extent_offset) */
+ u64 extent_offset; /* btrfs_file_extent_offset() of the item */
+ u64 num_bytes; /* btrfs_file_extent_num_bytes() of the item */
+ u64 generation; /* btrfs_file_extent_generation() of the item */
+};
+
+struct old_sa_defrag_extent { /* presumably one pre-defrag extent -- confirm against the code that allocates it */
+ struct list_head list; /* entry on new_sa_defrag_extent::head */
+ struct new_sa_defrag_extent *new; /* descriptor this entry belongs to */
+
+ u64 extent_offset; /* extent_offset + offset is the start of the range matched in record_one_backref() */
+ u64 bytenr; /* compared with btrfs_file_extent_disk_bytenr() when matching items */
+ u64 offset; /* see extent_offset */
+ u64 len; /* length of the range matched in record_one_backref() */
+ int count; /* incremented once per backref recorded; entry is freed when it stays 0 */
+};
+
+struct new_sa_defrag_extent {
+ struct rb_root root; /* rb-tree of sa_defrag_extent_backref, filled by backref_insert() */
+ struct list_head head; /* list of old_sa_defrag_extent, linked through ->list */
+ struct btrfs_path *path; /* set by record_extent_backrefs(), used by record_one_backref() */
+ struct inode *inode; /* references from this inode itself are skipped in record_one_backref() */
+ u64 file_pos; /* NOTE(review): this and the fields below are not read in the visible code; presumably they describe the new extent -- confirm */
+ u64 len;
+ u64 bytenr;
+ u64 disk_len;
+ u8 compress_type;
+};
+
+/*
+ * Three-way comparison of two backrefs, keyed on root_id, then inum, then
+ * file_pos.  Returns -1 when @b1 sorts first, 1 when @b2 sorts first and 0
+ * when all three fields are equal.
+ */
+static int backref_comp(struct sa_defrag_extent_backref *b1,
+			struct sa_defrag_extent_backref *b2)
+{
+	if (b1->root_id != b2->root_id)
+		return b1->root_id < b2->root_id ? -1 : 1;
+
+	if (b1->inum != b2->inum)
+		return b1->inum < b2->inum ? -1 : 1;
+
+	if (b1->file_pos != b2->file_pos)
+		return b1->file_pos < b2->file_pos ? -1 : 1;
+
+	/*
+	 * Equal keys are legitimate.  One range of disk space may be split
+	 * into two file extents in tree A while tree B covers it with one:
+	 *
+	 *   [------------------------------]  disk range
+	 *   |<--->|                 |<---->|  file extents in tree A
+	 *   |<---------------------------->|  file extent in tree B
+	 *
+	 * The disk offset is then processed once per extent in A, each time
+	 * landing on the same extent in B, so two identical backrefs (both
+	 * referring to the extent in B) get inserted.
+	 */
+	return 0;
+}
+
+/*
+ * Link @backref into the rb-tree @root, ordered by backref_comp().  An entry
+ * that compares equal to an existing one is not rejected; it is placed in
+ * the right subtree of that entry.
+ */
+static void backref_insert(struct rb_root *root,
+			   struct sa_defrag_extent_backref *backref)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct sa_defrag_extent_backref *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct sa_defrag_extent_backref, node);
+
+		if (backref_comp(backref, cur) < 0)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&backref->node, parent, link);
+	rb_insert_color(&backref->node, root);
+}
+
+/*
+ * Callback handed to iterate_inodes_from_logical() by
+ * record_extent_backrefs(); @ctx is the old_sa_defrag_extent being processed.
+ *
+ * Looks up root @root_id and scans inode @inum for an EXTENT_DATA item that
+ * still points at old->bytenr, satisfies key.offset - extent_offset == @offset
+ * and overlaps the old range.  A match is recorded as a
+ * sa_defrag_extent_backref in new->root and old->count is incremented.
+ *
+ * Note the backref might have changed, and in this case we just return 0.
+ * A reference from the inode being defragged itself, or a root that no longer
+ * exists (-ENOENT), also returns 0.  Other failures return a negative errno.
+ */
+static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
+				       void *ctx)
+{
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_fs_info *fs_info;
+	struct old_sa_defrag_extent *old = ctx;
+	struct new_sa_defrag_extent *new = old->new;
+	struct btrfs_path *path = new->path;
+	struct btrfs_key key;
+	struct btrfs_root *root;
+	struct sa_defrag_extent_backref *backref;
+	struct extent_buffer *leaf;
+	struct inode *inode = new->inode;
+	int slot;
+	int ret;
+	u64 extent_offset;
+	u64 num_bytes;
+
+	/* References held by the inode being defragged are not recorded. */
+	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
+	    inum == btrfs_ino(inode))
+		return 0;
+
+	key.objectid = root_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	fs_info = BTRFS_I(inode)->root->fs_info;
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		if (PTR_ERR(root) == -ENOENT)
+			return 0;
+		WARN_ON(1);
+		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
+			 inum, offset, root_id);
+		return PTR_ERR(root);
+	}
+
+	key.objectid = inum;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	/*
+	 * NOTE(review): presumably a huge @offset is a wrapped negative value;
+	 * the search then starts at the inode's first extent -- confirm.
+	 */
+	if (offset > (u64)-1 << 32)
+		key.offset = 0;
+	else
+		key.offset = offset;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		WARN_ON(1);
+		return ret;
+	}
+	/*
+	 * A positive return only means the exact key was not found.  Reset it
+	 * so it cannot leak out through the exits below, where it would trip
+	 * WARN_ON(ret) and hand a non-zero value back to the iterator.
+	 */
+	ret = 0;
+
+	while (1) {
+		cond_resched();
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				/* no more leaves: nothing matched */
+				ret = 0;
+				goto out;
+			}
+			continue;
+		}
+
+		/* advance first so every 'continue' moves to the next item */
+		path->slots[0]++;
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.objectid > inum)
+			goto out;
+
+		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
+			continue;
+
+		extent_offset = btrfs_file_extent_offset(leaf, extent);
+		if (key.offset - extent_offset != offset)
+			continue;
+
+		/* skip items that do not overlap the old extent's range */
+		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+		if (extent_offset >= old->extent_offset + old->offset +
+		    old->len || extent_offset + num_bytes <=
+		    old->extent_offset + old->offset)
+			continue;
+
+		break;
+	}
+
+	backref = kmalloc(sizeof(*backref), GFP_NOFS);
+	if (!backref) {
+		/*
+		 * NOTE(review): reported as -ENOENT rather than -ENOMEM.  The
+		 * caller BUG()s on any negative value other than -ENOENT, so
+		 * this is left unchanged here -- confirm intent.
+		 */
+		ret = -ENOENT;
+		goto out;
+	}
+
+	backref->root_id = root_id;
+	backref->inum = inum;
+	backref->file_pos = offset + extent_offset;
+	backref->num_bytes = num_bytes;
+	backref->extent_offset = extent_offset;
+	backref->generation = btrfs_file_extent_generation(leaf, extent);
+	backref->old = old;
+	backref_insert(&new->root, backref);
+	old->count++;
+out:
+	btrfs_release_path(path);
+	WARN_ON(ret);
+	return ret;
+}
+
+/*
+ * Collect back references for every old extent queued on new->head.
+ *
+ * @path is stored in @new so that record_one_backref() can use it.  Old
+ * extents for which no backref was recorded are unlinked and freed.  Returns
+ * true when at least one old extent remains on the list, false otherwise.
+ */
+static noinline bool record_extent_backrefs(struct btrfs_path *path,
+					    struct new_sa_defrag_extent *new)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
+	struct old_sa_defrag_extent *cur, *next;
+
+	new->path = path;
+
+	list_for_each_entry_safe(cur, next, &new->head, list) {
+		int err;
+
+		err = iterate_inodes_from_logical(cur->bytenr, fs_info,
+						  path, record_one_backref,
+						  cur);
+		BUG_ON(err < 0 && err != -ENOENT);
+
+		if (cur->count)
+			continue;
+
+		/* no backref to be processed for this extent */
+		list_del(&cur->list);
+		kfree(cur);
+	}
+
+	return !list_empty(&new->head);
+}
+
+/*
+ * Tell whether the file extent item @fi qualifies for merging: it must point
+ * at @disk_bytenr, be a regular extent, and carry no compression, encryption
+ * or other encoding.  Returns 1 if it qualifies, 0 otherwise.
+ */
+static int relink_is_mergable(struct extent_buffer *leaf,
+			      struct btrfs_file_extent_item *fi,
+			      u64 disk_bytenr)
+{
+	int encoded;
+
+	if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr ||
+	    btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+		return 0;
+
+	encoded = btrfs_file_extent_compression(leaf, fi) ||
+		  btrfs_file_extent_encryption(leaf, fi) ||
+		  btrfs_file_extent_other_encoding(leaf, fi);
+
+	return !encoded;
+}
+
+/*
+ * Point one recorded backref at the new extent (backref->old->new).
+ *
+ * @path:    caller-provided path; it is released again on every exit that
+ *           goes through out_free_path.
+ * @prev:    backref handled before this one, or NULL.  When it has the same
+ *           root_id and inum and ends exactly where @backref starts, the
+ *           function first tries to grow the preceding file extent item
+ *           instead of inserting a new one.
+ * @backref: the backref to relink.
+ *
+ * Returns 1 when a file extent item was inserted or an existing one was
+ * extended, 0 when nothing was changed, and a negative errno on failure.
+ *
+ * Note the backref might have changed, and in this case we just return 0.
+ * The cases that yield 0 are: the root lookup fails with -ENOENT, the root
+ * has no references left, the inode cannot be obtained, an ordered extent is
+ * found by the lookup at lock_end, no item exists at
+ * (inum, EXTENT_DATA, file_pos), or that item's generation differs from the
+ * recorded one.
+ */
+static noinline int relink_extent_backref(struct btrfs_path *path,
+ struct sa_defrag_extent_backref *prev,
+ struct sa_defrag_extent_backref *backref)
+{
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_file_extent_item *item;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct old_sa_defrag_extent *old = backref->old;
+ struct new_sa_defrag_extent *new = old->new;
+ struct inode *src_inode = new->inode;
+ struct inode *inode;
+ struct extent_state *cached = NULL;
+ int ret = 0;
+ u64 start;
+ u64 len;
+ u64 lock_start;
+ u64 lock_end;
+ bool merge = false;
+ int index;
+
+ if (prev && prev->root_id == backref->root_id &&
+ prev->inum == backref->inum &&
+ prev->file_pos + prev->num_bytes == backref->file_pos)
+ merge = true;
+
+ /* step 1: look up the root named by backref->root_id, under subvol_srcu */
+ key.objectid = backref->root_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ fs_info = BTRFS_I(src_inode)->root->fs_info;
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ if (PTR_ERR(root) == -ENOENT)
+ return 0;
+ return PTR_ERR(root);
+ }
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ /* no references left on the root: handle like -ENOENT and return 0 */
+ return 0;
+ }
+
+ /* step 2: get the inode backref->inum in that root; any failure returns 0 */
+ key.objectid = backref->inum;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ if (IS_ERR(inode)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return 0;
+ }
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ /* step 3: relink backref, with the backref's file range locked in io_tree */
+ lock_start = backref->file_pos;
+ lock_end = backref->file_pos + backref->num_bytes - 1;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+ 0, &cached);
+
+ ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ goto out_unlock;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ key.objectid = backref->inum;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = backref->file_pos;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out_free_path;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out_free_path;
+ }
+
+ extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_generation(path->nodes[0], extent) !=
+ backref->generation)
+ goto out_free_path;
+
+ btrfs_release_path(path);
+
+ start = backref->file_pos;
+ if (backref->extent_offset < old->extent_offset + old->offset)
+ start += old->extent_offset + old->offset -
+ backref->extent_offset;
+
+ len = min(backref->extent_offset + backref->num_bytes,
+ old->extent_offset + old->offset + old->len);
+ len -= max(backref->extent_offset, old->extent_offset + old->offset);
+
+ ret = btrfs_drop_extents(trans, root, inode, start,
+ start + len, 1);
+ if (ret)
+ goto out_free_path;
+again:
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ if (merge) {
+ struct btrfs_file_extent_item *fi;
+ u64 extent_len;
+ struct btrfs_key found_key;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+ if (ret < 0)
+ goto out_free_path;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+
+ if (relink_is_mergable(leaf, fi, new->bytenr) &&
+ extent_len + found_key.offset == start) {
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_len + len);
+ btrfs_mark_buffer_dirty(leaf);
+ inode_add_bytes(inode, len);
+
+ ret = 1;
+ goto out_free_path;
+ } else {
+ merge = false;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*extent));
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_free_path;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
+ btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
+ btrfs_set_file_extent_num_bytes(leaf, item, len);
+ btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
+ btrfs_set_file_extent_generation(leaf, item, trans->transid);
+ btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_compression(leaf, item, new->compress_type);
+ btrfs_set_file_extent_encryption(leaf, item, 0);
+ btrfs_set_file_extent_other_encoding(leaf, item, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+ inode_add_bytes(inode, len);
+
+ ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
+ new->disk_len, 0,
+ backref->root_id, backref->inum,
+ new->file_pos, 0); /* start - extent_offset */
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_free_path;
+ }
+
+ ret = 1;
+out_free_path:
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans, root);
+out_unlock:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+ &cached, GFP_NOFS);
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Relink every recorded backref of the old extents to the new extent, then
+ * release @new.
+ *
+ * @new is consumed on every path: the old extents still queued on new->head
+ * and @new itself are freed, fs_info->defrag_running is decremented and
+ * fs_info->transaction_wait is woken.
+ */
+static void relink_file_extents(struct new_sa_defrag_extent *new)
+{
+	struct btrfs_path *path;
+	struct old_sa_defrag_extent *old, *tmp;
+	struct sa_defrag_extent_backref *backref;
+	struct sa_defrag_extent_backref *prev = NULL;
+	struct inode *inode;
+	struct btrfs_root *root;
+	struct rb_node *node;
+	int ret;
+
+	inode = new->inode;
+	root = BTRFS_I(inode)->root;
+
+	path = btrfs_alloc_path();
+	/*
+	 * Do not return directly on failure: record_old_file_extents() bumped
+	 * defrag_running when it built @new, so the common exit below must
+	 * still run.  No backref has been recorded at this point, so only the
+	 * old-extent list and @new need freeing.
+	 */
+	if (!path)
+		goto out;
+
+	if (!record_extent_backrefs(path, new)) {
+		btrfs_free_path(path);
+		goto out;
+	}
+	btrfs_release_path(path);
+
+	while (1) {
+		node = rb_first(&new->root);
+		if (!node)
+			break;
+		rb_erase(node, &new->root);
+
+		backref = rb_entry(node, struct sa_defrag_extent_backref, node);
+
+		ret = relink_extent_backref(path, prev, backref);
+		WARN_ON(ret < 0);
+
+		kfree(prev);
+
+		if (ret == 1) {
+			/* keep it: the next backref may be merged onto it */
+			prev = backref;
+		} else {
+			/*
+			 * Already erased from the tree and not kept as @prev,
+			 * so nothing else would free it.
+			 */
+			kfree(backref);
+			prev = NULL;
+		}
+		cond_resched();
+	}
+	kfree(prev);
+
+	btrfs_free_path(path);
+out:
+	/* drop whatever old extents are still queued (possibly none) */
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		list_del(&old->list);
+		kfree(old);
+	}
+
+	atomic_dec(&root->fs_info->defrag_running);
+	wake_up(&root->fs_info->transaction_wait);
+
+	kfree(new);
+}
+
+/*
+ * Build a new_sa_defrag_extent describing @ordered and queue on its head list
+ * one old_sa_defrag_extent for every file extent item of @inode that overlaps
+ * the ordered extent's file range and has a non-zero disk bytenr.
+ *
+ * On success fs_info->defrag_running is incremented and the new descriptor is
+ * returned.  On any failure everything allocated here is freed and NULL is
+ * returned.
+ */
+static struct new_sa_defrag_extent *
+record_old_file_extents(struct inode *inode,
+			struct btrfs_ordered_extent *ordered)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct old_sa_defrag_extent *old, *tmp;
+	struct new_sa_defrag_extent *new;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	u64 range_end;
+	int ret;
+
+	new = kmalloc(sizeof(*new), GFP_NOFS);
+	if (!new)
+		return NULL;
+
+	new->inode = inode;
+	new->file_pos = ordered->file_offset;
+	new->len = ordered->len;
+	new->bytenr = ordered->start;
+	new->disk_len = ordered->disk_len;
+	new->compress_type = ordered->compress_type;
+	new->root = RB_ROOT;
+	INIT_LIST_HEAD(&new->head);
+	range_end = new->file_pos + new->len;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto out_kfree;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = new->file_pos;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out_free_path;
+	/* no exact match: start from the preceding item when there is one */
+	if (ret > 0 && path->slots[0] > 0)
+		path->slots[0]--;
+
+	/* find out all the old extents for the file range */
+	for (;;) {
+		struct extent_buffer *leaf = path->nodes[0];
+		int slot = path->slots[0];
+		struct btrfs_file_extent_item *fi;
+		u64 num_bytes;
+		u64 disk_bytenr;
+		u64 first;
+		u64 last;
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out_free_list;
+			if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		/* stop at the first item that is not an extent in the range */
+		if (key.objectid != btrfs_ino(inode) ||
+		    key.type != BTRFS_EXTENT_DATA_KEY ||
+		    key.offset >= range_end)
+			break;
+
+		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+
+		/*
+		 * NOTE(review): an item ending exactly at new->file_pos passes
+		 * this test and is recorded with len == 0 - confirm whether
+		 * that is intended.
+		 */
+		disk_bytenr = 0;
+		if (key.offset + num_bytes >= new->file_pos)
+			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+		if (disk_bytenr) {
+			old = kmalloc(sizeof(*old), GFP_NOFS);
+			if (!old)
+				goto out_free_list;
+
+			/* clamp the item to the ordered extent's file range */
+			first = max(new->file_pos, key.offset);
+			last = min(range_end, key.offset + num_bytes);
+
+			old->bytenr = disk_bytenr;
+			old->extent_offset = btrfs_file_extent_offset(leaf, fi);
+			old->offset = first - key.offset;
+			old->len = last - first;
+			old->new = new;
+			old->count = 0;
+			list_add_tail(&old->list, &new->head);
+		}
+
+		path->slots[0]++;
+		cond_resched();
+	}
+
+	btrfs_free_path(path);
+	atomic_inc(&root->fs_info->defrag_running);
+
+	return new;
+
+out_free_list:
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		list_del(&old->list);
+		kfree(old);
+	}
+out_free_path:
+	btrfs_free_path(path);
+out_kfree:
+	kfree(new);
+	return NULL;
+}
+
/*
* helper function for btrfs_finish_ordered_io, this
* just reads in some of the csum leaves to prime them into ram
@@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
struct btrfs_trans_handle *trans = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
+ struct new_sa_defrag_extent *new = NULL;
int compress_type = 0;
int ret;
bool nolock;
@@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset + ordered_extent->len - 1,
0, &cached_state);
+ ret = test_range_bit(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ EXTENT_DEFRAG, 1, cached_state);
+ if (ret) {
+ u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+ if (last_snapshot >= BTRFS_I(inode)->generation)
+ /* the inode is shared */
+ new = record_old_file_extents(inode, ordered_extent);
+
+ clear_extent_bit(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
+ }
+
if (nolock)
trans = btrfs_join_transaction_nolock(root);
else
@@ -2001,17 +2689,33 @@ out:
if (trans)
btrfs_end_transaction(trans, root);
- if (ret)
+ if (ret) {
clear_extent_uptodate(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset +
ordered_extent->len - 1, NULL, GFP_NOFS);
+ /*
+ * If the ordered extent had an IOERR or something else went
+ * wrong we need to return the space for this ordered extent
+ * back to the allocator.
+ */
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+ btrfs_free_reserved_extent(root, ordered_extent->start,
+ ordered_extent->disk_len);
+ }
+
+
/*
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
btrfs_remove_ordered_extent(inode, ordered_extent);
+ /* for snapshot-aware defrag */
+ if (new)
+ relink_file_extents(new);
+
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
@@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int mirror)
{
- size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
+ size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
char *kaddr;
@@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
}
}
-enum btrfs_orphan_cleanup_state {
- ORPHAN_CLEANUP_STARTED = 1,
- ORPHAN_CLEANUP_DONE = 2,
-};
-
/*
* This is called in transaction commit time. If there are no orphan
* files in the subvolume, it removes orphan item and frees block_rsv
@@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
*/
set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags);
+ atomic_inc(&root->orphan_inodes);
/* if we have links, this was a truncate, lets do that */
if (inode->i_nlink) {
@@ -2478,7 +3178,21 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
continue;
}
nr_truncate++;
+
+ /* 1 for the orphan item deletion. */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+
ret = btrfs_truncate(inode);
+ if (ret)
+ btrfs_orphan_del(NULL, inode);
} else {
nr_unlink++;
}
@@ -2697,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode)
{
- btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
- btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
- btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_inode_mode(leaf, item, inode->i_mode);
- btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+ struct btrfs_map_token token;
- btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
- inode->i_atime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
- inode->i_atime.tv_nsec);
+ btrfs_init_map_token(&token);
- btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
- inode->i_mtime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
- inode->i_mtime.tv_nsec);
+ btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+ btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+ btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
+ &token);
+ btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+ btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
- inode->i_ctime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
- inode->i_ctime.tv_nsec);
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_nsec, &token);
- btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
- btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
- btrfs_set_inode_sequence(leaf, item, inode->i_version);
- btrfs_set_inode_transid(leaf, item, trans->transid);
- btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
- btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
- btrfs_set_inode_block_group(leaf, item, 0);
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_nsec, &token);
+
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_nsec, &token);
+
+ btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+ &token);
+ btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
+ &token);
+ btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+ btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+ btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+ btrfs_set_token_inode_block_group(leaf, item, 0, &token);
}
/*
@@ -3292,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 extent_num_bytes = 0;
u64 extent_offset = 0;
u64 item_end = 0;
- u64 mask = root->sectorsize - 1;
u32 found_type = (u8)-1;
int found_extent;
int del_item;
@@ -3316,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* extent just the way it is.
*/
if (root->ref_cows || root == root->fs_info->tree_root)
- btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
+ btrfs_drop_extent_cache(inode, ALIGN(new_size,
+ root->sectorsize), (u64)-1, 0);
/*
* This function is also used to drop the items in the log tree before
@@ -3395,10 +4116,9 @@ search_again:
if (!del_item) {
u64 orig_num_bytes =
btrfs_file_extent_num_bytes(leaf, fi);
- extent_num_bytes = new_size -
- found_key.offset + root->sectorsize - 1;
- extent_num_bytes = extent_num_bytes &
- ~((u64)root->sectorsize - 1);
+ extent_num_bytes = ALIGN(new_size -
+ found_key.offset,
+ root->sectorsize);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_num_bytes);
num_dec = (orig_num_bytes -
@@ -3634,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- u64 mask = root->sectorsize - 1;
- u64 hole_start = (oldsize + mask) & ~mask;
- u64 block_end = (size + mask) & ~mask;
+ u64 hole_start = ALIGN(oldsize, root->sectorsize);
+ u64 block_end = ALIGN(size, root->sectorsize);
u64 last_byte;
u64 cur_offset;
u64 hole_size;
@@ -3665,10 +4384,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
block_end - cur_offset, 0);
if (IS_ERR(em)) {
err = PTR_ERR(em);
+ em = NULL;
break;
}
last_byte = min(extent_map_end(em), block_end);
- last_byte = (last_byte + mask) & ~mask;
+ last_byte = ALIGN(last_byte , root->sectorsize);
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
hole_size = last_byte - cur_offset;
@@ -3748,16 +4468,27 @@ next:
return err;
}
-static int btrfs_setsize(struct inode *inode, loff_t newsize)
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
loff_t oldsize = i_size_read(inode);
+ loff_t newsize = attr->ia_size;
+ int mask = attr->ia_valid;
int ret;
if (newsize == oldsize)
return 0;
+ /*
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+ * special case where we need to update the times despite not having
+ * these flags set. For all other operations the VFS sets these flags
+ * explicitly if it wants a timestamp update.
+ */
+ if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
+ inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+
if (newsize > oldsize) {
truncate_pagecache(inode, oldsize, newsize);
ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3783,9 +4514,40 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * 1 for the orphan item we're going to add
+ * 1 for the orphan item deletion.
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * We need to do this in case we fail at _any_ point during the
+ * actual truncate. Once we do the truncate_setsize we could
+ * invalidate pages which forces any outstanding ordered io to
+ * be instantly completed which will give us extents that need
+ * to be truncated. If we fail to get an orphan inode down we
+ * could have left over extents that were never meant to live,
+ * so we need to guarantee from this point on that everything
+ * will be consistent.
+ */
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ return ret;
+
/* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
+
+ /* Disable nonlocked read DIO to avoid the endless truncate */
+ btrfs_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ btrfs_inode_resume_unlocked_dio(inode);
+
ret = btrfs_truncate(inode);
+ if (ret && inode->i_nlink)
+ btrfs_orphan_del(NULL, inode);
}
return ret;
@@ -3805,7 +4567,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- err = btrfs_setsize(inode, attr->ia_size);
+ err = btrfs_setsize(inode, attr);
if (err)
return err;
}
@@ -3855,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
+ ret = btrfs_commit_inode_delayed_inode(inode);
+ if (ret) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+
rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
if (!rsv) {
btrfs_orphan_del(NULL, inode);
@@ -3892,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
- trans = btrfs_start_transaction_lflush(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
btrfs_orphan_del(NULL, inode);
btrfs_free_block_rsv(root, rsv);
@@ -3906,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode)
break;
trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
-
btrfs_end_transaction(trans, root);
trans = NULL;
btrfs_btree_balance_dirty(root);
@@ -4262,16 +5027,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- if (unlikely(d_need_lookup(dentry))) {
- memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
- kfree(dentry->d_fsdata);
- dentry->d_fsdata = NULL;
- /* This thing is hashed, drop it for now */
- d_drop(dentry);
- } else {
- ret = btrfs_inode_by_name(dir, dentry, &location);
- }
-
+ ret = btrfs_inode_by_name(dir, dentry, &location);
if (ret < 0)
return ERR_PTR(ret);
@@ -4341,11 +5097,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *ret;
ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
- if (unlikely(d_need_lookup(dentry))) {
- spin_lock(&dentry->d_lock);
- dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
- spin_unlock(&dentry->d_lock);
- }
return ret;
}
@@ -4356,7 +5107,7 @@ unsigned char btrfs_filetype_table[] = {
static int btrfs_real_readdir(struct file *filp, void *dirent,
filldir_t filldir)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_item *item;
struct btrfs_dir_item *di;
@@ -4819,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (btrfs_test_opt(root, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
if (btrfs_test_opt(root, NODATACOW))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
}
insert_inode_hash(inode);
@@ -4971,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- err = btrfs_update_inode(trans, root, inode);
- if (err) {
- drop_inode = 1;
- goto out_unlock;
- }
-
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
@@ -5361,8 +6107,7 @@ again:
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_inline_len(leaf, item);
- extent_end = (extent_start + size + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
+ extent_end = ALIGN(extent_start + size, root->sectorsize);
}
if (start >= extent_end) {
@@ -5434,8 +6179,7 @@ again:
copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
size - extent_offset);
em->start = extent_start + extent_offset;
- em->len = (copy_size + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
+ em->len = ALIGN(copy_size, root->sectorsize);
em->orig_block_len = em->len;
em->orig_start = em->start;
if (compress_type) {
@@ -5586,10 +6330,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
return em;
if (em) {
/*
- * if our em maps to a hole, there might
- * actually be delalloc bytes behind it
+ * if our em maps to
+ * - a hole or
+ * - a pre-alloc extent,
+ * there might actually be delalloc bytes behind it.
*/
- if (em->block_start != EXTENT_MAP_HOLE)
+ if (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
return em;
else
hole_em = em;
@@ -5671,6 +6418,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
em->start = range_start;
em->len = found;
@@ -5909,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
em->start = start;
em->orig_start = orig_start;
+ em->mod_start = start;
+ em->mod_len = len;
em->len = len;
em->block_len = block_len;
em->block_start = block_start;
@@ -5950,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
u64 len = bh_result->b_size;
struct btrfs_trans_handle *trans;
int unlock_bits = EXTENT_LOCKED;
- int ret;
+ int ret = 0;
- if (create) {
- ret = btrfs_delalloc_reserve_space(inode, len);
- if (ret)
- return ret;
+ if (create)
unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
- } else {
+ else
len = min_t(u64, len, root->sectorsize);
- }
lockstart = start;
lockend = start + len - 1;
@@ -5971,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
return -ENOTBLK;
- if (create) {
- ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_DELALLOC, NULL,
- &cached_state, GFP_NOFS);
- if (ret)
- goto unlock_err;
- }
-
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -6010,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (!create && (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
free_extent_map(em);
- ret = 0;
goto unlock_err;
}
@@ -6108,6 +6846,15 @@ unlock:
*/
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockstart + len - 1, EXTENT_DELALLOC, NULL,
+ &cached_state, GFP_NOFS);
+ BUG_ON(ret);
}
/*
@@ -6116,24 +6863,9 @@ unlock:
* aren't using if there is any left over space.
*/
if (lockstart < lockend) {
- if (create && len < lockend - lockstart) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockstart + len - 1,
- unlock_bits | EXTENT_DEFRAG, 1, 0,
- &cached_state, GFP_NOFS);
- /*
- * Beside unlock, we also need to cleanup reserved space
- * for the left range by attaching EXTENT_DO_ACCOUNTING.
- */
- clear_extent_bit(&BTRFS_I(inode)->io_tree,
- lockstart + len, lockend,
- unlock_bits | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
- } else {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, unlock_bits, 1, 0,
- &cached_state, GFP_NOFS);
- }
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, unlock_bits, 1, 0,
+ &cached_state, GFP_NOFS);
} else {
free_extent_state(cached_state);
}
@@ -6143,9 +6875,6 @@ unlock:
return 0;
unlock_err:
- if (create)
- unlock_bits |= EXTENT_DO_ACCOUNTING;
-
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
return ret;
@@ -6386,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
int async_submit = 0;
map_length = orig_bio->bi_size;
- ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
+ ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
&map_length, NULL, 0);
if (ret) {
bio_put(orig_bio);
return -EIO;
}
-
if (map_length >= orig_bio->bi_size) {
bio = orig_bio;
goto submit;
}
- async_submit = 1;
+ /* async crcs make it difficult to collect full stripe writes. */
+ if (btrfs_get_alloc_profile(root, 1) &
+ (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+ async_submit = 0;
+ else
+ async_submit = 1;
+
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
if (!bio)
return -ENOMEM;
@@ -6440,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio->bi_end_io = btrfs_end_dio_bio;
map_length = orig_bio->bi_size;
- ret = btrfs_map_block(root->fs_info, READ,
+ ret = btrfs_map_block(root->fs_info, rw,
start_sector << 9,
&map_length, NULL, 0);
if (ret) {
@@ -6583,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ size_t count = 0;
+ int flags = 0;
+ bool wakeup = true;
+ bool relock = false;
+ ssize_t ret;
if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
offset, nr_segs))
return 0;
- return __blockdev_direct_IO(rw, iocb, inode,
- BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, 0);
+ atomic_inc(&inode->i_dio_count);
+ smp_mb__after_atomic_inc();
+
+ if (rw & WRITE) {
+ count = iov_length(iov, nr_segs);
+ /*
+ * If the write DIO is beyond the EOF, we need update
+ * the isize, but it is protected by i_mutex. So we can
+ * not unlock the i_mutex at this case.
+ */
+ if (offset + count <= inode->i_size) {
+ mutex_unlock(&inode->i_mutex);
+ relock = true;
+ }
+ ret = btrfs_delalloc_reserve_space(inode, count);
+ if (ret)
+ goto out;
+ } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags))) {
+ inode_dio_done(inode);
+ flags = DIO_LOCKING | DIO_SKIP_HOLES;
+ wakeup = false;
+ }
+
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, flags);
+ if (rw & WRITE) {
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ btrfs_delalloc_release_space(inode, count);
+ else if (ret >= 0 && (size_t)ret < count)
+ btrfs_delalloc_release_space(inode,
+ count - (size_t)ret);
+ else
+ btrfs_delalloc_release_metadata(inode, 0);
+ }
+out:
+ if (wakeup)
+ inode_dio_done(inode);
+ if (relock)
+ mutex_lock(&inode->i_mutex);
+
+ return ret;
}
#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
@@ -6695,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
return;
}
lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
- ordered = btrfs_lookup_ordered_extent(inode,
- page_offset(page));
+ ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
if (ordered) {
/*
* IO on this page will never be started, so we need
@@ -6751,7 +7529,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = fdentry(vma->vm_file)->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
@@ -6929,11 +7707,9 @@ static int btrfs_truncate(struct inode *inode)
/*
* 1 for the truncate slack space
- * 1 for the orphan item we're going to add
- * 1 for the orphan item deletion
* 1 for updating the inode.
*/
- trans = btrfs_start_transaction(root, 4);
+ trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out;
@@ -6944,12 +7720,6 @@ static int btrfs_truncate(struct inode *inode)
min_size);
BUG_ON(ret);
- ret = btrfs_orphan_add(trans, inode);
- if (ret) {
- btrfs_end_transaction(trans, root);
- goto out;
- }
-
/*
* setattr is responsible for setting the ordered_data_close flag,
* but that is only tested during the last file release. That
@@ -7018,12 +7788,6 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_orphan_del(trans, inode);
if (ret)
err = ret;
- } else if (ret && inode->i_nlink > 0) {
- /*
- * Failed to do the truncate, remove us from the in memory
- * orphan list.
- */
- ret = btrfs_orphan_del(NULL, inode);
}
if (trans) {
@@ -7190,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ /* the snap/subvol tree is being deleted */
if (btrfs_root_refs(&root->root_item) == 0 &&
- !btrfs_is_free_space_inode(inode))
+ root != root->fs_info->tree_root)
return 1;
else
return generic_drop_inode(inode);
@@ -7273,40 +8038,22 @@ fail:
static int btrfs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
+ u64 delalloc_bytes;
struct inode *inode = dentry->d_inode;
u32 blocksize = inode->i_sb->s_blocksize;
generic_fillattr(inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
stat->blksize = PAGE_CACHE_SIZE;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+ spin_unlock(&BTRFS_I(inode)->lock);
stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
- ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
+ ALIGN(delalloc_bytes, blocksize)) >> 9;
return 0;
}
-/*
- * If a file is moved, it will inherit the cow and compression flags of the new
- * directory.
- */
-static void fixup_inode_flags(struct inode *dir, struct inode *inode)
-{
- struct btrfs_inode *b_dir = BTRFS_I(dir);
- struct btrfs_inode *b_inode = BTRFS_I(inode);
-
- if (b_dir->flags & BTRFS_INODE_NODATACOW)
- b_inode->flags |= BTRFS_INODE_NODATACOW;
- else
- b_inode->flags &= ~BTRFS_INODE_NODATACOW;
-
- if (b_dir->flags & BTRFS_INODE_COMPRESS) {
- b_inode->flags |= BTRFS_INODE_COMPRESS;
- b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
- } else {
- b_inode->flags &= ~(BTRFS_INODE_COMPRESS |
- BTRFS_INODE_NOCOMPRESS);
- }
-}
-
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
@@ -7472,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
- fixup_inode_flags(new_dir, old_inode);
-
ret = btrfs_add_link(trans, new_dir, old_inode,
new_dentry->d_name.name,
new_dentry->d_name.len, 0, index);
@@ -7545,41 +8290,57 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
*/
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
- struct list_head *head = &root->fs_info->delalloc_inodes;
struct btrfs_inode *binode;
struct inode *inode;
struct btrfs_delalloc_work *work, *next;
struct list_head works;
+ struct list_head splice;
int ret = 0;
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
INIT_LIST_HEAD(&works);
+ INIT_LIST_HEAD(&splice);
spin_lock(&root->fs_info->delalloc_lock);
- while (!list_empty(head)) {
- binode = list_entry(head->next, struct btrfs_inode,
+ list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+ while (!list_empty(&splice)) {
+ binode = list_entry(splice.next, struct btrfs_inode,
delalloc_inodes);
+
+ list_del_init(&binode->delalloc_inodes);
+
inode = igrab(&binode->vfs_inode);
- if (!inode)
- list_del_init(&binode->delalloc_inodes);
+ if (!inode) {
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &binode->runtime_flags);
+ continue;
+ }
+
+ list_add_tail(&binode->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
spin_unlock(&root->fs_info->delalloc_lock);
- if (inode) {
- work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
- if (!work) {
- ret = -ENOMEM;
- goto out;
- }
- list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
+
+ work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ if (unlikely(!work)) {
+ ret = -ENOMEM;
+ goto out;
}
+ list_add_tail(&work->list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &work->work);
+
cond_resched();
spin_lock(&root->fs_info->delalloc_lock);
}
spin_unlock(&root->fs_info->delalloc_lock);
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+
/* the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
@@ -7592,11 +8353,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&root->fs_info->async_submit_draining);
+ return 0;
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
}
+
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
return ret;
}
@@ -7748,8 +8516,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
}
- ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
- 0, *alloc_hint, &ins, 1);
+ ret = btrfs_reserve_extent(trans, root,
+ min(num_bytes, 256ULL * 1024 * 1024),
+ min_size, 0, *alloc_hint, &ins, 1);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4b4516770f05..c83086fdda05 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -42,12 +42,12 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
+#include <linux/btrfs.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
@@ -152,7 +152,7 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
{
- struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
+ struct btrfs_inode *ip = BTRFS_I(file_inode(file));
unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
if (copy_to_user(arg, &flags, sizeof(flags)))
@@ -177,7 +177,7 @@ static int check_flags(unsigned int flags)
static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_inode *ip = BTRFS_I(inode);
struct btrfs_root *root = ip->root;
struct btrfs_trans_handle *trans;
@@ -310,7 +310,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
return put_user(inode->i_generation, arg);
}
@@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return 0;
}
-static noinline int create_subvol(struct btrfs_root *root,
+static noinline int create_subvol(struct inode *dir,
struct dentry *dentry,
char *name, int namelen,
u64 *async_transid,
- struct btrfs_qgroup_inherit **inherit)
+ struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_root_item root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *new_root;
- struct dentry *parent = dentry->d_parent;
- struct inode *dir;
+ struct btrfs_block_rsv block_rsv;
struct timespec cur_time = CURRENT_TIME;
int ret;
int err;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
+ u64 qgroup_reserved;
uuid_le new_uuid;
ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
if (ret)
return ret;
- dir = parent->d_inode;
-
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
- * 1 - inode item
- * 2 - refs
- * 1 - root item
- * 2 - dir items
+ * The same as the snapshot creation, please see the comment
+ * of create_snapshot().
*/
- trans = btrfs_start_transaction(root, 6);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ 7, &qgroup_reserved);
+ if (ret)
+ return ret;
- ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
- inherit ? *inherit : NULL);
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
+
+ ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
if (ret)
goto fail;
@@ -515,8 +521,9 @@ static noinline int create_subvol(struct btrfs_root *root,
BUG_ON(ret);
- d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
if (async_transid) {
*async_transid = trans->transid;
err = btrfs_commit_transaction_async(trans, root, 1);
@@ -525,12 +532,18 @@ fail:
}
if (err && !ret)
ret = err;
+
+ if (!ret)
+ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
+out:
+ btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
return ret;
}
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
- char *name, int namelen, u64 *async_transid,
- bool readonly, struct btrfs_qgroup_inherit **inherit)
+static int create_snapshot(struct btrfs_root *root, struct inode *dir,
+ struct dentry *dentry, char *name, int namelen,
+ u64 *async_transid, bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
{
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
@@ -546,23 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * 1 - parent dir inode
+ * 2 - dir entries
+ * 1 - root item
+ * 2 - root ref/backref
+ * 1 - root of snapshot
+ */
+ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
+ &pending_snapshot->block_rsv, 7,
+ &pending_snapshot->qgroup_reserved);
+ if (ret)
+ goto out;
+
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
- if (inherit) {
- pending_snapshot->inherit = *inherit;
- *inherit = NULL; /* take responsibility to free it */
- }
+ pending_snapshot->dir = dir;
+ pending_snapshot->inherit = inherit;
- trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
+ trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto fail;
}
- ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
- BUG_ON(ret);
-
spin_lock(&root->fs_info->trans_lock);
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
@@ -599,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
d_instantiate(dentry, inode);
ret = 0;
fail:
+ btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
+ &pending_snapshot->block_rsv,
+ pending_snapshot->qgroup_reserved);
+out:
kfree(pending_snapshot);
return ret;
}
@@ -692,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
char *name, int namelen,
struct btrfs_root *snap_src,
u64 *async_transid, bool readonly,
- struct btrfs_qgroup_inherit **inherit)
+ struct btrfs_qgroup_inherit *inherit)
{
struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
@@ -729,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
goto out_up_read;
if (snap_src) {
- error = create_snapshot(snap_src, dentry, name, namelen,
+ error = create_snapshot(snap_src, dir, dentry, name, namelen,
async_transid, readonly, inherit);
} else {
- error = create_subvol(BTRFS_I(dir)->root, dentry,
- name, namelen, async_transid, inherit);
+ error = create_subvol(dir, dentry, name, namelen,
+ async_transid, inherit);
}
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -815,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root,
while(1) {
ret = btrfs_search_forward(root, &min_key, &max_key,
- path, 0, newer_than);
+ path, newer_than);
if (ret != 0)
goto none;
if (min_key.objectid != ino)
@@ -1203,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (!(inode->i_sb->s_flags & MS_ACTIVE))
break;
+ if (btrfs_defrag_cancelled(root->fs_info)) {
+ printk(KERN_DEBUG "btrfs: defrag_file cancelled\n");
+ ret = -EAGAIN;
+ break;
+ }
+
if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
extent_thresh, &last_len, &skip,
&defrag_end, range->flags &
@@ -1317,7 +1348,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
u64 new_size;
u64 old_size;
u64 devid = 1;
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
@@ -1326,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
int ret = 0;
int mod = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1339,7 +1367,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- return -EINPROGRESS;
+ mnt_drop_write_file(file);
+ return -EINVAL;
}
mutex_lock(&root->fs_info->volume_mutex);
@@ -1359,21 +1388,27 @@ static noinline int btrfs_ioctl_resize(struct file *file,
*devstr = '\0';
devstr = vol_args->name;
devid = simple_strtoull(devstr, &end, 10);
+ if (!devid) {
+ ret = -EINVAL;
+ goto out_free;
+ }
printk(KERN_INFO "btrfs: resizing devid %llu\n",
(unsigned long long)devid);
}
+
device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
if (!device) {
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
- ret = -EINVAL;
+ ret = -ENODEV;
goto out_free;
}
- if (device->fs_devices && device->fs_devices->seeding) {
+
+ if (!device->writeable) {
printk(KERN_INFO "btrfs: resizer unable to apply on "
- "seeding device %llu\n",
+ "readonly device %llu\n",
(unsigned long long)devid);
- ret = -EINVAL;
+ ret = -EPERM;
goto out_free;
}
@@ -1395,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
if (device->is_tgtdev_for_dev_replace) {
- ret = -EINVAL;
+ ret = -EPERM;
goto out_free;
}
@@ -1443,15 +1478,15 @@ out_free:
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
- mnt_drop_write_file(file);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
char *name, unsigned long fd, int subvol,
u64 *transid, bool readonly,
- struct btrfs_qgroup_inherit **inherit)
+ struct btrfs_qgroup_inherit *inherit)
{
int namelen;
int ret = 0;
@@ -1483,8 +1518,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
goto out_drop_write;
}
- src_inode = src.file->f_path.dentry->d_inode;
- if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
+ src_inode = file_inode(src.file);
+ if (src_inode->i_sb != file_inode(file)->i_sb) {
printk(KERN_INFO "btrfs: Snapshot src from "
"another FS\n");
ret = -EINVAL;
@@ -1560,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
vol_args->fd, subvol, ptr,
- readonly, &inherit);
+ readonly, inherit);
if (ret == 0 && ptr &&
copy_to_user(arg +
@@ -1576,7 +1611,7 @@ out:
static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
void __user *arg)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
u64 flags = 0;
@@ -1598,7 +1633,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
void __user *arg)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
u64 root_flags;
@@ -1857,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode,
path->keep_locks = 1;
while(1) {
- ret = btrfs_search_forward(root, &key, &max_key, path, 0,
+ ret = btrfs_search_forward(root, &key, &max_key, path,
sk->min_transid);
if (ret != 0) {
if (ret > 0)
@@ -1892,7 +1927,7 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
if (IS_ERR(args))
return PTR_ERR(args);
- inode = fdentry(file)->d_inode;
+ inode = file_inode(file);
ret = search_ioctl(inode, args);
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
@@ -2002,7 +2037,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
if (IS_ERR(args))
return PTR_ERR(args);
- inode = fdentry(file)->d_inode;
+ inode = file_inode(file);
if (args->treeid == 0)
args->treeid = BTRFS_I(inode)->root->root_key.objectid;
@@ -2029,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_root *dest = NULL;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
+ struct btrfs_block_rsv block_rsv;
+ u64 qgroup_reserved;
int namelen;
int ret;
int err = 0;
@@ -2095,13 +2132,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
if (err)
goto out_dput;
-
- /* check if subvolume may be deleted by a non-root user */
- err = btrfs_may_delete(dir, dentry, 1);
- if (err)
- goto out_dput;
}
+ /* check if subvolume may be deleted by a user */
+ err = btrfs_may_delete(dir, dentry, 1);
+ if (err)
+ goto out_dput;
+
if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out_dput;
@@ -2118,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out_up_write;
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * One for dir inode, two for dir entries, two for root
+ * ref/backref.
+ */
+ err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ 5, &qgroup_reserved);
+ if (err)
+ goto out_up_write;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- goto out_up_write;
+ goto out_release;
}
- trans->block_rsv = &root->fs_info->global_block_rsv;
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
ret = btrfs_unlink_subvol(trans, root, dir,
dest->root_key.objectid,
@@ -2153,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
}
}
out_end_trans:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
ret = btrfs_end_transaction(trans, root);
if (ret && !err)
err = ret;
inode->i_flags |= S_DEAD;
+out_release:
+ btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
out_up_write:
up_write(&root->fs_info->subvol_sem);
out_unlock:
@@ -2165,6 +2217,12 @@ out_unlock:
shrink_dcache_sb(root->fs_info->sb);
btrfs_invalidate_inodes(dest);
d_delete(dentry);
+
+ /* the last ref */
+ if (dest->cache_inode) {
+ iput(dest->cache_inode);
+ dest->cache_inode = NULL;
+ }
}
out_dput:
dput(dentry);
@@ -2178,24 +2236,25 @@ out:
static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_defrag_range_args *range;
int ret;
- if (btrfs_root_readonly(root))
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- return -EINPROGRESS;
+ mnt_drop_write_file(file);
+ return -EINVAL;
}
- ret = mnt_want_write_file(file);
- if (ret) {
- atomic_set(&root->fs_info->mutually_exclusive_operation_running,
- 0);
- return ret;
+
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
}
switch (inode->i_mode & S_IFMT) {
@@ -2204,10 +2263,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EPERM;
goto out;
}
- ret = btrfs_defrag_root(root, 0);
+ ret = btrfs_defrag_root(root);
if (ret)
goto out;
- ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
+ ret = btrfs_defrag_root(root->fs_info->extent_root);
break;
case S_IFREG:
if (!(file->f_mode & FMODE_WRITE)) {
@@ -2237,7 +2296,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range->len = (u64)-1;
}
- ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
+ ret = btrfs_defrag_file(file_inode(file), file,
range, 0, 0);
if (ret > 0)
ret = 0;
@@ -2247,8 +2306,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EINVAL;
}
out:
- mnt_drop_write_file(file);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
@@ -2263,7 +2322,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- return -EINPROGRESS;
+ return -EINVAL;
}
mutex_lock(&root->fs_info->volume_mutex);
@@ -2285,7 +2344,7 @@ out:
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_ioctl_vol_args *vol_args;
int ret;
@@ -2300,7 +2359,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
mnt_drop_write_file(file);
- return -EINPROGRESS;
+ return -EINVAL;
}
mutex_lock(&root->fs_info->volume_mutex);
@@ -2316,8 +2375,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
- mnt_drop_write_file(file);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
@@ -2408,7 +2467,7 @@ out:
static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct fd src_file;
struct inode *src;
@@ -2454,7 +2513,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (src_file.file->f_path.mnt != file->f_path.mnt)
goto out_fput;
- src = src_file.file->f_dentry->d_inode;
+ src = file_inode(src_file.file);
ret = -EINVAL;
if (src == inode)
@@ -2816,7 +2875,7 @@ static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
*/
static long btrfs_ioctl_trans_start(struct file *file)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
int ret;
@@ -2856,7 +2915,7 @@ out:
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
@@ -3080,7 +3139,7 @@ out:
*/
long btrfs_ioctl_trans_end(struct file *file)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -3104,7 +3163,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
u64 transid;
int ret;
- trans = btrfs_attach_transaction(root);
+ trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT)
return PTR_ERR(trans);
@@ -3142,7 +3201,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_ioctl_scrub_args *sa;
int ret;
@@ -3282,7 +3341,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
struct inode_fs_paths *ipath = NULL;
struct btrfs_path *path;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_DAC_READ_SEARCH))
return -EPERM;
path = btrfs_alloc_path();
@@ -3433,12 +3492,12 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
+ bool need_unlock; /* for mut. excl. ops lock */
int ret;
- int need_to_clear_lock = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3447,14 +3506,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
if (ret)
return ret;
- mutex_lock(&fs_info->volume_mutex);
+again:
+ if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+ need_unlock = true;
+ goto locked;
+ }
+
+ /*
+ * mut. excl. ops lock is locked. Three possibilities:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* this is either (2) or (3) */
+ if (!atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ if (!mutex_trylock(&fs_info->volume_mutex))
+ goto again;
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !atomic_read(&fs_info->balance_running)) {
+ /* this is (3) */
+ need_unlock = false;
+ goto locked;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ goto again;
+ } else {
+ /* this is (2) */
+ mutex_unlock(&fs_info->balance_mutex);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+ } else {
+ /* this is (1) */
+ mutex_unlock(&fs_info->balance_mutex);
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+locked:
+ BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
if (IS_ERR(bargs)) {
ret = PTR_ERR(bargs);
- goto out;
+ goto out_unlock;
}
if (bargs->flags & BTRFS_BALANCE_RESUME) {
@@ -3474,13 +3580,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
bargs = NULL;
}
- if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
- 1)) {
- pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ if (fs_info->balance_ctl) {
ret = -EINPROGRESS;
goto out_bargs;
}
- need_to_clear_lock = 1;
bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
if (!bctl) {
@@ -3501,11 +3604,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
}
do_balance:
- ret = btrfs_balance(bctl, bargs);
/*
- * bctl is freed in __cancel_balance or in free_fs_info if
- * restriper was paused all the way until unmount
+ * Ownership of bctl and mutually_exclusive_operation_running
+ * goes to btrfs_balance. bctl is freed in __cancel_balance,
+ * or, if restriper was paused all the way until unmount, in
+ * free_fs_info. mutually_exclusive_operation_running is
+ * cleared in __cancel_balance.
*/
+ need_unlock = false;
+
+ ret = btrfs_balance(bctl, bargs);
+
if (arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
@@ -3513,12 +3622,12 @@ do_balance:
out_bargs:
kfree(bargs);
-out:
- if (need_to_clear_lock)
- atomic_set(&root->fs_info->mutually_exclusive_operation_running,
- 0);
+out_unlock:
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
+ if (need_unlock)
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+out:
mnt_drop_write_file(file);
return ret;
}
@@ -3573,7 +3682,7 @@ out:
static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_ioctl_quota_ctl_args *sa;
struct btrfs_trans_handle *trans = NULL;
int ret;
@@ -3632,7 +3741,7 @@ drop_write:
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_ioctl_qgroup_assign_args *sa;
struct btrfs_trans_handle *trans;
int ret;
@@ -3679,7 +3788,7 @@ drop_write:
static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_ioctl_qgroup_create_args *sa;
struct btrfs_trans_handle *trans;
int ret;
@@ -3698,6 +3807,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto drop_write;
}
+ if (!sa->qgroupid) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3725,7 +3839,7 @@ drop_write:
static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_ioctl_qgroup_limit_args *sa;
struct btrfs_trans_handle *trans;
int ret;
@@ -3775,7 +3889,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
void __user *arg)
{
struct btrfs_ioctl_received_subvol_args *sa = NULL;
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
struct btrfs_trans_handle *trans;
@@ -3852,10 +3966,69 @@ out:
return ret;
}
+/*
+ * BTRFS_IOC_GET_FSLABEL: copy the label held in the in-memory super block
+ * copy to the user buffer at @arg.
+ *
+ * At most BTRFS_LABEL_SIZE - 1 bytes are copied; this function does not
+ * write a terminating NUL itself.  Returns 0 on success or -EFAULT if the
+ * copy to user space fails.
+ */
+static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	const char *label = root->fs_info->super_copy->label;
+	size_t len;
+	int ret;
+
+	/*
+	 * btrfs_ioctl_set_fslabel() rewrites the label under volume_mutex, so
+	 * measure and copy it under the same lock to see a consistent value.
+	 */
+	mutex_lock(&root->fs_info->volume_mutex);
+	len = strnlen(label, BTRFS_LABEL_SIZE);
+	if (len == BTRFS_LABEL_SIZE) {
+		/* no NUL inside the field: hand back all but the last byte */
+		len--;
+		pr_warn("btrfs: label is too long, return the first %zu bytes\n",
+			len);
+	}
+	ret = copy_to_user(arg, label, len);
+	mutex_unlock(&root->fs_info->volume_mutex);
+
+	return ret ? -EFAULT : 0;
+}
+
+/*
+ * BTRFS_IOC_SET_FSLABEL: replace the label in the in-memory super block copy
+ * with the BTRFS_LABEL_SIZE-byte buffer supplied at @arg.
+ *
+ * The caller needs CAP_SYS_ADMIN (-EPERM otherwise).  Returns -EFAULT if the
+ * user buffer cannot be read and -EINVAL if it contains no NUL terminator;
+ * otherwise the result of mnt_want_write_file(), btrfs_start_transaction()
+ * or btrfs_end_transaction() is returned.
+ */
+static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_super_block *super_block = root->fs_info->super_copy;
+	struct btrfs_trans_handle *trans;
+	char label[BTRFS_LABEL_SIZE];
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(label, arg, sizeof(label)))
+		return -EFAULT;
+
+	/* reject input that has no NUL within the BTRFS_LABEL_SIZE bytes */
+	if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
+		pr_err("btrfs: unable to set label with more than %d bytes\n",
+		       BTRFS_LABEL_SIZE - 1);
+		return -EINVAL;
+	}
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	mutex_lock(&root->fs_info->volume_mutex);
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_unlock;
+	}
+
+	/*
+	 * label was checked above to be NUL-terminated, so this copies at most
+	 * BTRFS_LABEL_SIZE bytes.
+	 * NOTE(review): assumes super_block->label is at least
+	 * BTRFS_LABEL_SIZE bytes -- confirm against the struct definition.
+	 */
+	strcpy(super_block->label, label);
+	ret = btrfs_end_transaction(trans, root);
+
+out_unlock:
+	mutex_unlock(&root->fs_info->volume_mutex);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
void __user *argp = (void __user *)arg;
switch (cmd) {
@@ -3952,6 +4125,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_qgroup_limit(file, argp);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(root, argp);
+ case BTRFS_IOC_GET_FSLABEL:
+ return btrfs_ioctl_get_fslabel(file, argp);
+ case BTRFS_IOC_SET_FSLABEL:
+ return btrfs_ioctl_set_fslabel(file, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
deleted file mode 100644
index dabca9cc8c2e..000000000000
--- a/fs/btrfs/ioctl.h
+++ /dev/null
@@ -1,502 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __IOCTL_
-#define __IOCTL_
-#include <linux/ioctl.h>
-
-#define BTRFS_IOCTL_MAGIC 0x94
-#define BTRFS_VOL_NAME_MAX 255
-
-/* this should be 4k */
-#define BTRFS_PATH_NAME_MAX 4087
-struct btrfs_ioctl_vol_args {
- __s64 fd;
- char name[BTRFS_PATH_NAME_MAX + 1];
-};
-
-#define BTRFS_DEVICE_PATH_NAME_MAX 1024
-
-#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
-#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
-#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
-#define BTRFS_FSID_SIZE 16
-#define BTRFS_UUID_SIZE 16
-
-#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)
-
-struct btrfs_qgroup_limit {
- __u64 flags;
- __u64 max_rfer;
- __u64 max_excl;
- __u64 rsv_rfer;
- __u64 rsv_excl;
-};
-
-struct btrfs_qgroup_inherit {
- __u64 flags;
- __u64 num_qgroups;
- __u64 num_ref_copies;
- __u64 num_excl_copies;
- struct btrfs_qgroup_limit lim;
- __u64 qgroups[0];
-};
-
-struct btrfs_ioctl_qgroup_limit_args {
- __u64 qgroupid;
- struct btrfs_qgroup_limit lim;
-};
-
-#define BTRFS_SUBVOL_NAME_MAX 4039
-struct btrfs_ioctl_vol_args_v2 {
- __s64 fd;
- __u64 transid;
- __u64 flags;
- union {
- struct {
- __u64 size;
- struct btrfs_qgroup_inherit __user *qgroup_inherit;
- };
- __u64 unused[4];
- };
- char name[BTRFS_SUBVOL_NAME_MAX + 1];
-};
-
-/*
- * structure to report errors and progress to userspace, either as a
- * result of a finished scrub, a canceled scrub or a progress inquiry
- */
-struct btrfs_scrub_progress {
- __u64 data_extents_scrubbed; /* # of data extents scrubbed */
- __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */
- __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */
- __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */
- __u64 read_errors; /* # of read errors encountered (EIO) */
- __u64 csum_errors; /* # of failed csum checks */
- __u64 verify_errors; /* # of occurences, where the metadata
- * of a tree block did not match the
- * expected values, like generation or
- * logical */
- __u64 no_csum; /* # of 4k data block for which no csum
- * is present, probably the result of
- * data written with nodatasum */
- __u64 csum_discards; /* # of csum for which no data was found
- * in the extent tree. */
- __u64 super_errors; /* # of bad super blocks encountered */
- __u64 malloc_errors; /* # of internal kmalloc errors. These
- * will likely cause an incomplete
- * scrub */
- __u64 uncorrectable_errors; /* # of errors where either no intact
- * copy was found or the writeback
- * failed */
- __u64 corrected_errors; /* # of errors corrected */
- __u64 last_physical; /* last physical address scrubbed. In
- * case a scrub was aborted, this can
- * be used to restart the scrub */
- __u64 unverified_errors; /* # of occurences where a read for a
- * full (64k) bio failed, but the re-
- * check succeeded for each 4k piece.
- * Intermittent error. */
-};
-
-#define BTRFS_SCRUB_READONLY 1
-struct btrfs_ioctl_scrub_args {
- __u64 devid; /* in */
- __u64 start; /* in */
- __u64 end; /* in */
- __u64 flags; /* in */
- struct btrfs_scrub_progress progress; /* out */
- /* pad to 1k */
- __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
-};
-
-#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
-#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
-struct btrfs_ioctl_dev_replace_start_params {
- __u64 srcdevid; /* in, if 0, use srcdev_name instead */
- __u64 cont_reading_from_srcdev_mode; /* in, see #define
- * above */
- __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
- __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
-};
-
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
-struct btrfs_ioctl_dev_replace_status_params {
- __u64 replace_state; /* out, see #define above */
- __u64 progress_1000; /* out, 0 <= x <= 1000 */
- __u64 time_started; /* out, seconds since 1-Jan-1970 */
- __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
- __u64 num_write_errors; /* out */
- __u64 num_uncorrectable_read_errors; /* out */
-};
-
-#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
-#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
-#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
-#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
-#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
-#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
-struct btrfs_ioctl_dev_replace_args {
- __u64 cmd; /* in */
- __u64 result; /* out */
-
- union {
- struct btrfs_ioctl_dev_replace_start_params start;
- struct btrfs_ioctl_dev_replace_status_params status;
- }; /* in/out */
-
- __u64 spare[64];
-};
-
-struct btrfs_ioctl_dev_info_args {
- __u64 devid; /* in/out */
- __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
- __u64 bytes_used; /* out */
- __u64 total_bytes; /* out */
- __u64 unused[379]; /* pad to 4k */
- __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */
-};
-
-struct btrfs_ioctl_fs_info_args {
- __u64 max_id; /* out */
- __u64 num_devices; /* out */
- __u8 fsid[BTRFS_FSID_SIZE]; /* out */
- __u64 reserved[124]; /* pad to 1k */
-};
-
-/* balance control ioctl modes */
-#define BTRFS_BALANCE_CTL_PAUSE 1
-#define BTRFS_BALANCE_CTL_CANCEL 2
-
-/*
- * this is packed, because it should be exactly the same as its disk
- * byte order counterpart (struct btrfs_disk_balance_args)
- */
-struct btrfs_balance_args {
- __u64 profiles;
- __u64 usage;
- __u64 devid;
- __u64 pstart;
- __u64 pend;
- __u64 vstart;
- __u64 vend;
-
- __u64 target;
-
- __u64 flags;
-
- __u64 unused[8];
-} __attribute__ ((__packed__));
-
-/* report balance progress to userspace */
-struct btrfs_balance_progress {
- __u64 expected; /* estimated # of chunks that will be
- * relocated to fulfill the request */
- __u64 considered; /* # of chunks we have considered so far */
- __u64 completed; /* # of chunks relocated so far */
-};
-
-#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
-#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
-#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
-
-struct btrfs_ioctl_balance_args {
- __u64 flags; /* in/out */
- __u64 state; /* out */
-
- struct btrfs_balance_args data; /* in/out */
- struct btrfs_balance_args meta; /* in/out */
- struct btrfs_balance_args sys; /* in/out */
-
- struct btrfs_balance_progress stat; /* out */
-
- __u64 unused[72]; /* pad to 1k */
-};
-
-#define BTRFS_INO_LOOKUP_PATH_MAX 4080
-struct btrfs_ioctl_ino_lookup_args {
- __u64 treeid;
- __u64 objectid;
- char name[BTRFS_INO_LOOKUP_PATH_MAX];
-};
-
-struct btrfs_ioctl_search_key {
- /* which root are we searching. 0 is the tree of tree roots */
- __u64 tree_id;
-
- /* keys returned will be >= min and <= max */
- __u64 min_objectid;
- __u64 max_objectid;
-
- /* keys returned will be >= min and <= max */
- __u64 min_offset;
- __u64 max_offset;
-
- /* max and min transids to search for */
- __u64 min_transid;
- __u64 max_transid;
-
- /* keys returned will be >= min and <= max */
- __u32 min_type;
- __u32 max_type;
-
- /*
- * how many items did userland ask for, and how many are we
- * returning
- */
- __u32 nr_items;
-
- /* align to 64 bits */
- __u32 unused;
-
- /* some extra for later */
- __u64 unused1;
- __u64 unused2;
- __u64 unused3;
- __u64 unused4;
-};
-
-struct btrfs_ioctl_search_header {
- __u64 transid;
- __u64 objectid;
- __u64 offset;
- __u32 type;
- __u32 len;
-};
-
-#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
-/*
- * the buf is an array of search headers where
- * each header is followed by the actual item
- * the type field is expanded to 32 bits for alignment
- */
-struct btrfs_ioctl_search_args {
- struct btrfs_ioctl_search_key key;
- char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
-};
-
-struct btrfs_ioctl_clone_range_args {
- __s64 src_fd;
- __u64 src_offset, src_length;
- __u64 dest_offset;
-};
-
-/* flags for the defrag range ioctl */
-#define BTRFS_DEFRAG_RANGE_COMPRESS 1
-#define BTRFS_DEFRAG_RANGE_START_IO 2
-
-struct btrfs_ioctl_space_info {
- __u64 flags;
- __u64 total_bytes;
- __u64 used_bytes;
-};
-
-struct btrfs_ioctl_space_args {
- __u64 space_slots;
- __u64 total_spaces;
- struct btrfs_ioctl_space_info spaces[0];
-};
-
-struct btrfs_data_container {
- __u32 bytes_left; /* out -- bytes not needed to deliver output */
- __u32 bytes_missing; /* out -- additional bytes needed for result */
- __u32 elem_cnt; /* out */
- __u32 elem_missed; /* out */
- __u64 val[0]; /* out */
-};
-
-struct btrfs_ioctl_ino_path_args {
- __u64 inum; /* in */
- __u64 size; /* in */
- __u64 reserved[4];
- /* struct btrfs_data_container *fspath; out */
- __u64 fspath; /* out */
-};
-
-struct btrfs_ioctl_logical_ino_args {
- __u64 logical; /* in */
- __u64 size; /* in */
- __u64 reserved[4];
- /* struct btrfs_data_container *inodes; out */
- __u64 inodes;
-};
-
-enum btrfs_dev_stat_values {
- /* disk I/O failure stats */
- BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
- BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
- BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
-
- /* stats for indirect indications for I/O failures */
- BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
- * contents is illegal: this is an
- * indication that the block was damaged
- * during read or write, or written to
- * wrong location or read from wrong
- * location */
- BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
- * been written */
-
- BTRFS_DEV_STAT_VALUES_MAX
-};
-
-/* Reset statistics after reading; needs SYS_ADMIN capability */
-#define BTRFS_DEV_STATS_RESET (1ULL << 0)
-
-struct btrfs_ioctl_get_dev_stats {
- __u64 devid; /* in */
- __u64 nr_items; /* in/out */
- __u64 flags; /* in/out */
-
- /* out values: */
- __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
-
- __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
-};
-
-#define BTRFS_QUOTA_CTL_ENABLE 1
-#define BTRFS_QUOTA_CTL_DISABLE 2
-#define BTRFS_QUOTA_CTL_RESCAN 3
-struct btrfs_ioctl_quota_ctl_args {
- __u64 cmd;
- __u64 status;
-};
-
-struct btrfs_ioctl_qgroup_assign_args {
- __u64 assign;
- __u64 src;
- __u64 dst;
-};
-
-struct btrfs_ioctl_qgroup_create_args {
- __u64 create;
- __u64 qgroupid;
-};
-struct btrfs_ioctl_timespec {
- __u64 sec;
- __u32 nsec;
-};
-
-struct btrfs_ioctl_received_subvol_args {
- char uuid[BTRFS_UUID_SIZE]; /* in */
- __u64 stransid; /* in */
- __u64 rtransid; /* out */
- struct btrfs_ioctl_timespec stime; /* in */
- struct btrfs_ioctl_timespec rtime; /* out */
- __u64 flags; /* in */
- __u64 reserved[16]; /* in */
-};
-
-struct btrfs_ioctl_send_args {
- __s64 send_fd; /* in */
- __u64 clone_sources_count; /* in */
- __u64 __user *clone_sources; /* in */
- __u64 parent_root; /* in */
- __u64 flags; /* in */
- __u64 reserved[4]; /* in */
-};
-
-#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
- struct btrfs_ioctl_vol_args)
-/* trans start and trans end are dangerous, and only for
- * use by applications that know how to avoid the
- * resulting deadlocks
- */
-#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
-#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
-#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
-
-#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
-#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
- struct btrfs_ioctl_vol_args)
-
-#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
- struct btrfs_ioctl_clone_range_args)
-
-#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
- struct btrfs_ioctl_defrag_range_args)
-#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
- struct btrfs_ioctl_search_args)
-#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
- struct btrfs_ioctl_ino_lookup_args)
-#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
-#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
- struct btrfs_ioctl_space_args)
-#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
-#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
-#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
- struct btrfs_ioctl_vol_args_v2)
-#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
- struct btrfs_ioctl_vol_args_v2)
-#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
-#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
-#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
- struct btrfs_ioctl_scrub_args)
-#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
-#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \
- struct btrfs_ioctl_scrub_args)
-#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \
- struct btrfs_ioctl_dev_info_args)
-#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
- struct btrfs_ioctl_fs_info_args)
-#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
- struct btrfs_ioctl_balance_args)
-#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
-#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
- struct btrfs_ioctl_balance_args)
-#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
- struct btrfs_ioctl_ino_path_args)
-#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
- struct btrfs_ioctl_ino_path_args)
-#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
- struct btrfs_ioctl_received_subvol_args)
-#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
-#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
- struct btrfs_ioctl_quota_ctl_args)
-#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
- struct btrfs_ioctl_qgroup_assign_args)
-#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
- struct btrfs_ioctl_qgroup_create_args)
-#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
- struct btrfs_ioctl_qgroup_limit_args)
-#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
- struct btrfs_ioctl_get_dev_stats)
-#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
- struct btrfs_ioctl_dev_replace_args)
-
-#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 2a1762c66041..e95df435d897 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -113,11 +113,10 @@ again:
read_unlock(&eb->lock);
return;
}
- read_unlock(&eb->lock);
- wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
- read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers)) {
read_unlock(&eb->lock);
+ wait_event(eb->write_lock_wq,
+ atomic_read(&eb->blocking_writers) == 0);
goto again;
}
atomic_inc(&eb->read_locks);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f10731297040..dc08d77b717e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
+ !(type == BTRFS_ORDERED_NOCOW))
+ entry->csum_bytes_left = disk_len;
entry->disk_len = disk_len;
entry->bytes_left = len;
entry->inode = igrab(inode);
@@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
+ INIT_LIST_HEAD(&entry->log_list);
trace_btrfs_ordered_extent_add(inode, entry);
@@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode,
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
+ WARN_ON(entry->csum_bytes_left < sum->len);
+ entry->csum_bytes_left -= sum->len;
+ if (entry->csum_bytes_left == 0)
+ wake_up(&entry->wait);
spin_unlock_irq(&tree->lock);
}
@@ -405,6 +413,66 @@ out:
return ret == 0;
}
+/*
+ * Queue every ordered extent of @inode on @log's logged-extent list.
+ *
+ * Walks the inode's ordered tree and, for each ordered extent whose
+ * log_list node is not already linked, appends it to
+ * log->logged_list[log->log_transid % 2] and takes a reference on it.
+ *
+ * Needs to either be called under a log transaction or the log_mutex.
+ */
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_extent *ordered;
+ struct rb_node *n;
+ int index = log->log_transid % 2;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ /* tree->lock is held, with irqs off, for the whole walk */
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ /* nested inside tree->lock, taken once per extent */
+ spin_lock(&log->log_extents_lock[index]);
+ if (list_empty(&ordered->log_list)) {
+ list_add_tail(&ordered->log_list, &log->logged_list[index]);
+ /* the list entry holds its own reference on the extent */
+ atomic_inc(&ordered->refs);
+ }
+ spin_unlock(&log->log_extents_lock[index]);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+/*
+ * Drain log->logged_list[transid % 2], waiting for each ordered extent's
+ * BTRFS_ORDERED_IO_DONE flag before dropping a reference on it.
+ *
+ * NOTE(review): the reference dropped here is presumably the one taken by
+ * btrfs_get_logged_extents() when the extent was queued -- confirm.
+ */
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+{
+ struct btrfs_ordered_extent *ordered;
+ int index = transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ while (!list_empty(&log->logged_list[index])) {
+ ordered = list_first_entry(&log->logged_list[index],
+ struct btrfs_ordered_extent,
+ log_list);
+ /* unlink first so the entry is ours once the lock is released */
+ list_del_init(&ordered->log_list);
+ /* wait_event() may sleep, so the spinlock is dropped around it */
+ spin_unlock_irq(&log->log_extents_lock[index]);
+ wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
+ &ordered->flags));
+ btrfs_put_ordered_extent(ordered);
+ /* retake the lock before re-checking the list head */
+ spin_lock_irq(&log->log_extents_lock[index]);
+ }
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+/*
+ * Drain log->logged_list[transid % 2] without waiting for any I/O: each
+ * ordered extent is unlinked from the list and a reference on it is dropped.
+ *
+ * NOTE(review): the reference dropped here is presumably the one taken by
+ * btrfs_get_logged_extents() when the extent was queued -- confirm.
+ */
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
+{
+ struct btrfs_ordered_extent *ordered;
+ int index = transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ while (!list_empty(&log->logged_list[index])) {
+ ordered = list_first_entry(&log->logged_list[index],
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ /* the put is done with the list lock released */
+ spin_unlock_irq(&log->log_extents_lock[index]);
+ btrfs_put_ordered_extent(ordered);
+ spin_lock_irq(&log->log_extents_lock[index]);
+ }
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
/*
* used to drop a reference on an ordered extent. This will free
* the extent if the last reference is dropped
@@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
* extra check to make sure the ordered operation list really is empty
* before we return
*/
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int wait)
{
struct btrfs_inode *btrfs_inode;
struct inode *inode;
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct list_head splice;
struct list_head works;
struct btrfs_delalloc_work *work, *next;
@@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
mutex_lock(&root->fs_info->ordered_operations_mutex);
spin_lock(&root->fs_info->ordered_extent_lock);
-again:
- list_splice_init(&root->fs_info->ordered_operations, &splice);
-
+ list_splice_init(&cur_trans->ordered_operations, &splice);
while (!list_empty(&splice)) {
-
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
-
inode = &btrfs_inode->vfs_inode;
list_del_init(&btrfs_inode->ordered_operations);
@@ -574,24 +640,22 @@ again:
* the inode may be getting freed (in sys_unlink path).
*/
inode = igrab(inode);
-
- if (!wait && inode) {
- list_add_tail(&BTRFS_I(inode)->ordered_operations,
- &root->fs_info->ordered_operations);
- }
-
if (!inode)
continue;
+
+ if (!wait)
+ list_add_tail(&BTRFS_I(inode)->ordered_operations,
+ &cur_trans->ordered_operations);
spin_unlock(&root->fs_info->ordered_extent_lock);
work = btrfs_alloc_delalloc_work(inode, wait, 1);
if (!work) {
+ spin_lock(&root->fs_info->ordered_extent_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations))
list_add_tail(&btrfs_inode->ordered_operations,
&splice);
- spin_lock(&root->fs_info->ordered_extent_lock);
list_splice_tail(&splice,
- &root->fs_info->ordered_operations);
+ &cur_trans->ordered_operations);
spin_unlock(&root->fs_info->ordered_extent_lock);
ret = -ENOMEM;
goto out;
@@ -603,9 +667,6 @@ again:
cond_resched();
spin_lock(&root->fs_info->ordered_extent_lock);
}
- if (wait && !list_empty(&root->fs_info->ordered_operations))
- goto again;
-
spin_unlock(&root->fs_info->ordered_extent_lock);
out:
list_for_each_entry_safe(work, next, &works, list) {
@@ -836,9 +897,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
* if the disk i_size is already at the inode->i_size, or
* this ordered extent is inside the disk i_size, we're done
*/
- if (disk_i_size == i_size || offset <= disk_i_size) {
+ if (disk_i_size == i_size)
+ goto out;
+
+ /*
+ * We still need to update disk_i_size if outstanding_isize is greater
+ * than disk_i_size.
+ */
+ if (offset <= disk_i_size &&
+ (!ordered || ordered->outstanding_isize <= disk_i_size))
goto out;
- }
/*
* walk backward from this ordered extent to disk_i_size.
@@ -870,7 +938,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
break;
if (test->file_offset >= i_size)
break;
- if (test->file_offset >= disk_i_size) {
+ if (entry_end(test) > disk_i_size) {
/*
* we don't update disk_i_size now, so record this
* undealt i_size. Or we will not know the real
@@ -967,6 +1035,7 @@ out:
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode)
{
+ struct btrfs_transaction *cur_trans = trans->transaction;
u64 last_mod;
last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
@@ -981,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->ordered_extent_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
list_add_tail(&BTRFS_I(inode)->ordered_operations,
- &root->fs_info->ordered_operations);
+ &cur_trans->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f29d4bf5fbe7..8eadfe406cdd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -79,6 +79,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
* has done its due diligence in updating
* the isize. */
+#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered
+ extent */
struct btrfs_ordered_extent {
/* logical offset in the file */
@@ -96,6 +98,9 @@ struct btrfs_ordered_extent {
/* number of bytes that still need writing */
u64 bytes_left;
+ /* number of bytes that still need csumming */
+ u64 csum_bytes_left;
+
/*
* the end of the ordered extent which is behind it but
* didn't update disk_i_size. Please see the comment of
@@ -118,6 +123,9 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
+ /* If we need to wait on this to be done */
+ struct list_head log_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int wait);
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
void ordered_data_exit(void);
#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 50d95fd190a5..920957ecb27e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
btrfs_dev_extent_chunk_offset(l, dev_extent),
(unsigned long long)
btrfs_dev_extent_length(l, dev_extent));
+ break;
case BTRFS_DEV_STATS_KEY:
printk(KERN_INFO "\t\tdevice stats\n");
break;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fe9d02c45f8e..aee4b1cc3d98 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -23,13 +23,13 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include <linux/btrfs.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
-#include "ioctl.h"
#include "backref.h"
/* TODO XXX FIXME
@@ -379,6 +379,13 @@ next1:
ret = add_relation_rb(fs_info, found_key.objectid,
found_key.offset);
+ if (ret == -ENOENT) {
+ printk(KERN_WARNING
+ "btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
+ (unsigned long long)found_key.objectid,
+ (unsigned long long)found_key.offset);
+ ret = 0; /* ignore the error */
+ }
if (ret)
goto out;
next2:
@@ -613,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
key.offset = qgroupid;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
@@ -654,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
key.offset = qgroup->qgroupid;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
@@ -695,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
key.offset = 0;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
@@ -725,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
{
struct btrfs_path *path;
struct btrfs_key key;
+ struct extent_buffer *leaf = NULL;
int ret;
-
- if (!root)
- return -EINVAL;
+ int nr = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- while (1) {
- key.objectid = 0;
- key.offset = 0;
- key.type = 0;
+ path->leave_spinning = 1;
- path->leave_spinning = 1;
+ key.objectid = 0;
+ key.offset = 0;
+ key.type = 0;
+
+ while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret > 0) {
- if (path->slots[0] == 0)
- break;
- path->slots[0]--;
- } else if (ret < 0) {
+ if (ret < 0)
+ goto out;
+ leaf = path->nodes[0];
+ nr = btrfs_header_nritems(leaf);
+ if (!nr)
break;
- }
-
- ret = btrfs_del_item(trans, root, path);
+ /*
+ * delete the leaf one by one
+ * since the whole tree is going
+ * to be deleted.
+ */
+ path->slots[0] = 0;
+ ret = btrfs_del_items(trans, root, path, 0, nr);
if (ret)
goto out;
+
btrfs_release_path(path);
}
ret = 0;
@@ -840,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
int ret = 0;
spin_lock(&fs_info->qgroup_lock);
+ if (!fs_info->quota_root) {
+ spin_unlock(&fs_info->qgroup_lock);
+ return 0;
+ }
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
quota_root = fs_info->quota_root;
@@ -956,17 +978,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 qgroupid)
{
struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
int ret = 0;
quota_root = fs_info->quota_root;
if (!quota_root)
return -EINVAL;
+ /* check if there are no relations to this qgroup */
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (qgroup) {
+ if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
+ spin_unlock(&fs_info->qgroup_lock);
+ return -EBUSY;
+ }
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+
ret = del_qgroup_item(trans, quota_root, qgroupid);
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(quota_root->fs_info, qgroupid);
-
spin_unlock(&fs_info->qgroup_lock);
return ret;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000000..9a79fb790adb
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,2100 @@
+/*
+ * Copyright (C) 2012 Fusion-io All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/hash.h>
+#include <linux/list_sort.h>
+#include <linux/raid/xor.h>
+#include <linux/vmalloc.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+/* set when additional merges to this rbio are not allowed */
+#define RBIO_RMW_LOCKED_BIT 1
+
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT 2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT 3
+
+
+#define RBIO_CACHE_SIZE 1024
+
+struct btrfs_raid_bio {
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_bio *bbio;
+
+ /*
+ * logical block numbers for the start of each stripe
+ * The last one or two are p/q. These are sorted,
+ * so raid_map[0] is the start of our full stripe
+ */
+ u64 *raid_map;
+
+ /* while we're doing rmw on a stripe
+ * we put it into a hash table so we can
+ * lock the stripe and merge more rbios
+ * into it.
+ */
+ struct list_head hash_list;
+
+ /*
+ * LRU list for the stripe cache
+ */
+ struct list_head stripe_cache;
+
+ /*
+ * for scheduling work in the helper threads
+ */
+ struct btrfs_work work;
+
+ /*
+ * bio list and bio_list_lock are used
+ * to add more bios into the stripe
+ * in hopes of avoiding the full rmw
+ */
+ struct bio_list bio_list;
+ spinlock_t bio_list_lock;
+
+ /* also protected by the bio_list_lock, the
+ * plug list is used by the plugging code
+ * to collect partial bios while plugged. The
+ * stripe locking code also uses it to hand off
+ * the stripe lock to the next pending IO
+ */
+ struct list_head plug_list;
+
+ /*
+ * flags that tell us if it is safe to
+ * merge with this bio
+ */
+ unsigned long flags;
+
+ /* size of each individual stripe on disk */
+ int stripe_len;
+
+ /* number of data stripes (no p/q) */
+ int nr_data;
+
+ /*
+ * set if we're doing a parity rebuild
+ * for a read from higher up, which is handled
+ * differently from a parity rebuild as part of
+ * rmw
+ */
+ int read_rebuild;
+
+ /* first bad stripe */
+ int faila;
+
+ /* second bad stripe (for raid6 use) */
+ int failb;
+
+ /*
+ * number of pages needed to represent the full
+ * stripe
+ */
+ int nr_pages;
+
+ /*
+ * size of all the bios in the bio_list. This
+ * helps us decide if the rbio maps to a full
+ * stripe or not
+ */
+ int bio_list_bytes;
+
+ atomic_t refs;
+
+ /*
+ * these are two arrays of pointers. We allocate the
+ * rbio big enough to hold them both and setup their
+ * locations when the rbio is allocated
+ */
+
+ /* pointers to pages that we allocated for
+ * reading/writing stripes directly from the disk (including P/Q)
+ */
+ struct page **stripe_pages;
+
+ /*
+ * pointers to the pages in the bio_list. Stored
+ * here for faster lookup
+ */
+ struct page **bio_pages;
+};
+
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
+static void rmw_work(struct btrfs_work *work);
+static void read_rebuild_work(struct btrfs_work *work);
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
+static void async_read_rebuild(struct btrfs_raid_bio *rbio);
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
+static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void index_rbio_pages(struct btrfs_raid_bio *rbio);
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
+
+/*
+ * Set up info->stripe_hash_table, used for stripe locking and for collecting
+ * bios toward a full stripe.  Returns 0 (also if already set) or -ENOMEM.
+ */
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash_table *x;
+	struct btrfs_stripe_hash *cur;
+	struct btrfs_stripe_hash *h;
+	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
+	int i;
+	int table_size;
+
+	if (info->stripe_hash_table)
+		return 0;
+
+	/*
+	 * The table is large, starting with order 4 and can go as high as
+	 * order 7 in case lock debugging is turned on.
+	 *
+	 * Try harder to allocate and fallback to vmalloc to lower the chance
+	 * of a failing mount.
+	 */
+	table_size = sizeof(*table) + sizeof(*h) * num_entries;
+	table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	if (!table) {
+		table = vzalloc(table_size);
+		if (!table)
+			return -ENOMEM;
+	}
+
+	spin_lock_init(&table->cache_lock);
+	INIT_LIST_HEAD(&table->stripe_cache);
+
+	h = table->table;
+	for (i = 0; i < num_entries; i++) {
+		cur = h + i;
+		INIT_LIST_HEAD(&cur->hash_list);
+		spin_lock_init(&cur->lock);
+		init_waitqueue_head(&cur->wait);
+	}
+
+	/* x is the old value: non-NULL means we lost a race, free our copy */
+	x = cmpxchg(&info->stripe_hash_table, NULL, table);
+	if (x) {
+		if (is_vmalloc_addr(table))
+			vfree(table);
+		else
+			kfree(table);
+	}
+	return 0;
+}
+
+/*
+ * caching an rbio means to copy anything from the
+ * bio_pages array into the stripe_pages array. We
+ * use the page uptodate bit in the stripe cache array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ char *s;
+ char *d;
+ int ret;
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ return;
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (!rbio->bio_pages[i])
+ continue;
+
+ s = kmap(rbio->bio_pages[i]);
+ d = kmap(rbio->stripe_pages[i]);
+
+ memcpy(d, s, PAGE_CACHE_SIZE);
+
+ kunmap(rbio->bio_pages[i]);
+ kunmap(rbio->stripe_pages[i]);
+ SetPageUptodate(rbio->stripe_pages[i]);
+ }
+ set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
+/*
+ * we hash on the first logical address of the stripe
+ */
+static int rbio_bucket(struct btrfs_raid_bio *rbio)
+{
+ u64 num = rbio->raid_map[0];
+
+ /*
+ * we shift down quite a bit. We're using byte
+ * addressing, and most of the lower bits are zeros.
+ * This tends to upset hash_64, and it consistently
+ * returns just one or two different values.
+ *
+ * shifting off the lower bits fixes things.
+ */
+ return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
+}
+
+/*
+ * stealing an rbio means taking all the uptodate pages from the stripe
+ * array in the source rbio and putting them into the destination rbio
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+ int i;
+ struct page *s;
+ struct page *d;
+
+ if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+ return;
+
+ for (i = 0; i < dest->nr_pages; i++) {
+ s = src->stripe_pages[i];
+ if (!s || !PageUptodate(s)) {
+ continue;
+ }
+
+ d = dest->stripe_pages[i];
+ if (d)
+ __free_page(d);
+
+ dest->stripe_pages[i] = s;
+ src->stripe_pages[i] = NULL;
+ }
+}
+
+/*
+ * merging means we take the bio_list from the victim and
+ * splice it into the destination. The victim should
+ * be discarded afterwards.
+ *
+ * must be called with dest->bio_list_lock held
+ */
+static void merge_rbio(struct btrfs_raid_bio *dest,
+ struct btrfs_raid_bio *victim)
+{
+ bio_list_merge(&dest->bio_list, &victim->bio_list);
+ dest->bio_list_bytes += victim->bio_list_bytes;
+ bio_list_init(&victim->bio_list);
+}
+
+/*
+ * used to prune items that are in the cache. The caller
+ * must hold the hash table's cache_lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+ int bucket = rbio_bucket(rbio);
+ struct btrfs_stripe_hash_table *table;
+ struct btrfs_stripe_hash *h;
+ int freeit = 0;
+
+ /*
+ * check the bit again under the hash table lock.
+ */
+ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+ h = table->table + bucket;
+
+ /* hold the lock for the bucket because we may be
+ * removing it from the hash table
+ */
+ spin_lock(&h->lock);
+
+ /*
+ * hold the lock for the bio list because we need
+ * to make sure the bio list is empty
+ */
+ spin_lock(&rbio->bio_list_lock);
+
+ if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+ list_del_init(&rbio->stripe_cache);
+ table->cache_size -= 1;
+ freeit = 1;
+
+ /* if the bio list isn't empty, this rbio is
+ * still involved in an IO. We take it out
+ * of the cache list, and drop the ref that
+ * was held for the list.
+ *
+ * If the bio_list was empty, we also remove
+ * the rbio from the hash_table, and drop
+ * the corresponding ref
+ */
+ if (bio_list_empty(&rbio->bio_list)) {
+ if (!list_empty(&rbio->hash_list)) {
+ list_del_init(&rbio->hash_list);
+ atomic_dec(&rbio->refs);
+ BUG_ON(!list_empty(&rbio->plug_list));
+ }
+ }
+ }
+
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock(&h->lock);
+
+ if (freeit)
+ __free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+
+ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ __remove_rbio_from_cache(rbio);
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+ struct btrfs_raid_bio *rbio;
+
+ table = info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ while (!list_empty(&table->stripe_cache)) {
+ rbio = list_entry(table->stripe_cache.next,
+ struct btrfs_raid_bio,
+ stripe_cache);
+ __remove_rbio_from_cache(rbio);
+ }
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
+ */
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
+{
+ if (!info->stripe_hash_table)
+ return;
+ btrfs_clear_rbio_cache(info);
+ if (is_vmalloc_addr(info->stripe_hash_table))
+ vfree(info->stripe_hash_table);
+ else
+ kfree(info->stripe_hash_table);
+ info->stripe_hash_table = NULL;
+}
+
+/*
+ * insert an rbio into the stripe cache. It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+
+ if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ spin_lock(&rbio->bio_list_lock);
+
+ /* bump our ref if we were not in the list before */
+ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+ atomic_inc(&rbio->refs);
+
+ if (!list_empty(&rbio->stripe_cache)){
+ list_move(&rbio->stripe_cache, &table->stripe_cache);
+ } else {
+ list_add(&rbio->stripe_cache, &table->stripe_cache);
+ table->cache_size += 1;
+ }
+
+ spin_unlock(&rbio->bio_list_lock);
+
+ if (table->cache_size > RBIO_CACHE_SIZE) {
+ struct btrfs_raid_bio *found;
+
+ found = list_entry(table->stripe_cache.prev,
+ struct btrfs_raid_bio,
+ stripe_cache);
+
+ if (found != rbio)
+ __remove_rbio_from_cache(found);
+ }
+
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+ return;
+}
+
+/*
+ * helper function to run the xor_blocks api. It is only
+ * able to do MAX_XOR_BLOCKS at a time, so we need to
+ * loop through.
+ */
+static void run_xor(void **pages, int src_cnt, ssize_t len)
+{
+ int src_off = 0;
+ int xor_src_cnt = 0;
+ void *dest = pages[src_cnt];
+
+ while(src_cnt > 0) {
+ xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+ xor_blocks(xor_src_cnt, len, dest, pages + src_off);
+
+ src_cnt -= xor_src_cnt;
+ src_off += xor_src_cnt;
+ }
+}
+
+/*
+ * returns true if the bio list inside this rbio
+ * covers an entire stripe (no rmw required).
+ * Must be called with the bio list lock held, or
+ * at a time when you know it is impossible to add
+ * new bios into the list
+ */
+static int __rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+ unsigned long size = rbio->bio_list_bytes;
+ int ret = 1;
+
+ if (size != rbio->nr_data * rbio->stripe_len)
+ ret = 0;
+
+ BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+ return ret;
+}
+
+static int rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&rbio->bio_list_lock, flags);
+ ret = __rbio_is_full(rbio);
+ spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+ return ret;
+}
+
+/*
+ * returns 1 if it is safe to merge two rbios together.
+ * The merging is safe if the two rbios correspond to
+ * the same stripe and if they are both going in the same
+ * direction (read vs write), and if neither one is
+ * locked for final IO
+ *
+ * The caller is responsible for locking such that
+ * rmw_locked is safe to test
+ */
+static int rbio_can_merge(struct btrfs_raid_bio *last,
+ struct btrfs_raid_bio *cur)
+{
+ if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
+ test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
+ return 0;
+
+ /*
+ * we can't merge with cached rbios, since the
+ * idea is that when we merge the destination
+ * rbio is going to run our IO for us. We can
+ * steal from cached rbio's though, other functions
+ * handle that.
+ */
+ if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+ test_bit(RBIO_CACHE_BIT, &cur->flags))
+ return 0;
+
+ if (last->raid_map[0] !=
+ cur->raid_map[0])
+ return 0;
+
+ /* reads can't merge with writes */
+ if (last->read_rebuild !=
+ cur->read_rebuild) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * helper to index into the pstripe
+ */
+static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+ index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * helper to index into the qstripe, returns null
+ * if there is no qstripe
+ */
+static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+ if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+ return NULL;
+
+ index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
+ PAGE_CACHE_SHIFT;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * The first stripe in the table for a logical address
+ * has the lock. rbios are added in one of three ways:
+ *
+ * 1) Nobody has the stripe locked yet. The rbio is given
+ * the lock and 0 is returned. The caller must start the IO
+ * themselves.
+ *
+ * 2) Someone has the stripe locked, but we're able to merge
+ * with the lock owner. The rbio is freed and the IO will
+ * start automatically along with the existing rbio. 1 is returned.
+ *
+ * 3) Someone has the stripe locked, but we're not able to merge.
+ * The rbio is added to the lock owner's plug list, or merged into
+ * an rbio already on the plug list. When the lock owner unlocks,
+ * the next rbio on the list is run and the IO is started automatically.
+ * 1 is returned
+ *
+ * If we return 0, the caller still owns the rbio and must continue with
+ * IO submission. If we return 1, the caller must assume the rbio has
+ * already been freed.
+ */
+static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
+{
+ int bucket = rbio_bucket(rbio);
+ struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+ struct btrfs_raid_bio *cur;
+ struct btrfs_raid_bio *pending;
+ unsigned long flags;
+ DEFINE_WAIT(wait);
+ struct btrfs_raid_bio *freeit = NULL;
+ struct btrfs_raid_bio *cache_drop = NULL;
+ int ret = 0;
+ int walk = 0;
+
+ spin_lock_irqsave(&h->lock, flags);
+ list_for_each_entry(cur, &h->hash_list, hash_list) {
+ walk++;
+ if (cur->raid_map[0] == rbio->raid_map[0]) {
+ spin_lock(&cur->bio_list_lock);
+
+ /* can we steal this cached rbio's pages? */
+ if (bio_list_empty(&cur->bio_list) &&
+ list_empty(&cur->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+ !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+ list_del_init(&cur->hash_list);
+ atomic_dec(&cur->refs);
+
+ steal_rbio(cur, rbio);
+ cache_drop = cur;
+ spin_unlock(&cur->bio_list_lock);
+
+ goto lockit;
+ }
+
+ /* can we merge into the lock owner? */
+ if (rbio_can_merge(cur, rbio)) {
+ merge_rbio(cur, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+
+
+ /*
+ * we couldn't merge with the running
+ * rbio, see if we can merge with the
+ * pending ones. We don't have to
+ * check for rmw_locked because there
+ * is no way they are inside finish_rmw
+ * right now
+ */
+ list_for_each_entry(pending, &cur->plug_list,
+ plug_list) {
+ if (rbio_can_merge(pending, rbio)) {
+ merge_rbio(pending, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+ }
+
+ /* no merging, put us on the tail of the plug list,
+ * our rbio will be started when the currently
+ * running rbio unlocks
+ */
+ list_add_tail(&rbio->plug_list, &cur->plug_list);
+ spin_unlock(&cur->bio_list_lock);
+ ret = 1;
+ goto out;
+ }
+ }
+lockit:
+ atomic_inc(&rbio->refs);
+ list_add(&rbio->hash_list, &h->hash_list);
+out:
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (cache_drop)
+ remove_rbio_from_cache(cache_drop);
+ if (freeit)
+ __free_raid_bio(freeit);
+ return ret;
+}
+
+/*
+ * called as rmw or parity rebuild is completed. If the plug list has more
+ * rbios waiting for this stripe, the next one on the list will be started
+ */
+static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bucket;
+ struct btrfs_stripe_hash *h;
+ unsigned long flags;
+ int keep_cache = 0;
+
+ bucket = rbio_bucket(rbio);
+ h = rbio->fs_info->stripe_hash_table->table + bucket;
+
+ if (list_empty(&rbio->plug_list))
+ cache_rbio(rbio);
+
+ spin_lock_irqsave(&h->lock, flags);
+ spin_lock(&rbio->bio_list_lock);
+
+ if (!list_empty(&rbio->hash_list)) {
+ /*
+ * if we're still cached and there is no other IO
+ * to perform, just leave this rbio here for others
+ * to steal from later
+ */
+ if (list_empty(&rbio->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+ keep_cache = 1;
+ clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ BUG_ON(!bio_list_empty(&rbio->bio_list));
+ goto done;
+ }
+
+ list_del_init(&rbio->hash_list);
+ atomic_dec(&rbio->refs);
+
+ /*
+ * we use the plug list to hold all the rbios
+ * waiting for the chance to lock this stripe.
+ * hand the lock over to one of them.
+ */
+ if (!list_empty(&rbio->plug_list)) {
+ struct btrfs_raid_bio *next;
+ struct list_head *head = rbio->plug_list.next;
+
+ next = list_entry(head, struct btrfs_raid_bio,
+ plug_list);
+
+ list_del_init(&rbio->plug_list);
+
+ list_add(&next->hash_list, &h->hash_list);
+ atomic_inc(&next->refs);
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (next->read_rebuild)
+ async_read_rebuild(next);
+ else {
+ steal_rbio(rbio, next);
+ async_rmw_stripe(next);
+ }
+
+ goto done_nolock;
+ } else if (waitqueue_active(&h->wait)) {
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+ wake_up(&h->wait);
+ goto done_nolock;
+ }
+ }
+done:
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+done_nolock:
+ if (!keep_cache)
+ remove_rbio_from_cache(rbio);
+}
+
+static void __free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ WARN_ON(atomic_read(&rbio->refs) < 0);
+ if (!atomic_dec_and_test(&rbio->refs))
+ return;
+
+ WARN_ON(!list_empty(&rbio->stripe_cache));
+ WARN_ON(!list_empty(&rbio->hash_list));
+ WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i]) {
+ __free_page(rbio->stripe_pages[i]);
+ rbio->stripe_pages[i] = NULL;
+ }
+ }
+ kfree(rbio->raid_map);
+ kfree(rbio->bbio);
+ kfree(rbio);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ unlock_stripe(rbio);
+ __free_raid_bio(rbio);
+}
+
+/*
+ * this frees the rbio and runs through all the bios in the
+ * bio_list and calls end_io on them
+ */
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+{
+ struct bio *cur = bio_list_get(&rbio->bio_list);
+ struct bio *next;
+ free_raid_bio(rbio);
+
+ while (cur) {
+ next = cur->bi_next;
+ cur->bi_next = NULL;
+ if (uptodate)
+ set_bit(BIO_UPTODATE, &cur->bi_flags);
+ bio_endio(cur, err);
+ cur = next;
+ }
+}
+
+/*
+ * end io function used by finish_rmw. When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+ return;
+
+ err = 0;
+
+ /* OK, we have written all the stripes we need to. */
+ if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+ err = -EIO;
+
+ rbio_orig_end_io(rbio, err, 0);
+ return;
+}
+
+/*
+ * the read/modify/write code wants to use the original bio for
+ * any pages it included, and then use the rbio for everything
+ * else. This function decides if a given index (stripe number)
+ * and page number in that stripe fall inside the original bio
+ * or the rbio.
+ *
+ * if you set bio_list_only, you'll get a NULL back for any ranges
+ * that are outside the bio_list
+ *
+ * This doesn't take any refs on anything, you get a bare page pointer
+ * and the caller must bump refs as required.
+ *
+ * You must call index_rbio_pages once before you can trust
+ * the answers from this function.
+ */
+static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
+ int index, int pagenr, int bio_list_only)
+{
+ int chunk_page;
+ struct page *p = NULL;
+
+ chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+
+ spin_lock_irq(&rbio->bio_list_lock);
+ p = rbio->bio_pages[chunk_page];
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ if (p || bio_list_only)
+ return p;
+
+ return rbio->stripe_pages[chunk_page];
+}
+
+/*
+ * number of pages we need for the entire stripe across all the
+ * drives
+ */
+static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
+{
+ unsigned long nr = stripe_len * nr_stripes;
+ return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+/*
+ * allocation and initial setup for the btrfs_raid_bio. Note that
+ * this does not allocate any pages for rbio->stripe_pages.
+ */
+static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len)
+{
+ struct btrfs_raid_bio *rbio;
+ int nr_data = 0;
+ int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
+ void *p;
+
+ rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
+ GFP_NOFS);
+ if (!rbio) {
+ kfree(raid_map);
+ kfree(bbio);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ bio_list_init(&rbio->bio_list);
+ INIT_LIST_HEAD(&rbio->plug_list);
+ spin_lock_init(&rbio->bio_list_lock);
+ INIT_LIST_HEAD(&rbio->stripe_cache);
+ INIT_LIST_HEAD(&rbio->hash_list);
+ rbio->bbio = bbio;
+ rbio->raid_map = raid_map;
+ rbio->fs_info = root->fs_info;
+ rbio->stripe_len = stripe_len;
+ rbio->nr_pages = num_pages;
+ rbio->faila = -1;
+ rbio->failb = -1;
+ atomic_set(&rbio->refs, 1);
+
+ /*
+ * the stripe_pages and bio_pages array point to the extra
+ * memory we allocated past the end of the rbio
+ */
+ p = rbio + 1;
+ rbio->stripe_pages = p;
+ rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+
+ if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+ nr_data = bbio->num_stripes - 2;
+ else
+ nr_data = bbio->num_stripes - 1;
+
+ rbio->nr_data = nr_data;
+ return rbio;
+}
+
+/* allocate pages for all the stripes in the bio, including parity */
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i])
+ continue;
+ page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[i] = page;
+ ClearPageUptodate(page);
+ }
+ return 0;
+}
+
+/* allocate pages for just the p/q stripes */
+static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ struct page *page;
+
+ i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+
+ for (; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i])
+ continue;
+ page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[i] = page;
+ }
+ return 0;
+}
+
+/*
+ * add a single page from a specific stripe into our list of bios for IO
+ * this will try to merge into existing bios if possible, and returns
+ * zero if all went well.
+ */
+int rbio_add_io_page(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct page *page,
+ int stripe_nr,
+ unsigned long page_index,
+ unsigned long bio_max_len)
+{
+ struct bio *last = bio_list->tail;
+ u64 last_end = 0;
+ int ret;
+ struct bio *bio;
+ struct btrfs_bio_stripe *stripe;
+ u64 disk_start;
+
+ stripe = &rbio->bbio->stripes[stripe_nr];
+ disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
+
+ /* if the device is missing, just fail this stripe */
+ if (!stripe->dev->bdev)
+ return fail_rbio_index(rbio, stripe_nr);
+
+ /* see if we can add this page onto our existing bio */
+ if (last) {
+ last_end = (u64)last->bi_sector << 9;
+ last_end += last->bi_size;
+
+ /*
+ * we can't merge these if they are from different
+ * devices or if they are not contiguous
+ */
+ if (last_end == disk_start && stripe->dev->bdev &&
+ test_bit(BIO_UPTODATE, &last->bi_flags) &&
+ last->bi_bdev == stripe->dev->bdev) {
+ ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
+ if (ret == PAGE_CACHE_SIZE)
+ return 0;
+ }
+ }
+
+ /* put a new bio on the list */
+ bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_size = 0;
+ bio->bi_bdev = stripe->dev->bdev;
+ bio->bi_sector = disk_start >> 9;
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ bio_list_add(bio_list, bio);
+ return 0;
+}
+
+/*
+ * while we're doing the read/modify/write cycle, we could
+ * have errors in reading pages off the disk. This checks
+ * for errors and if we're not able to read the page it'll
+ * trigger parity reconstruction. The rmw will be finished
+ * after we've reconstructed the failed stripes
+ */
+static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
+{
+ if (rbio->faila >= 0 || rbio->failb >= 0) {
+ BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
+ __raid56_parity_recover(rbio);
+ } else {
+ finish_rmw(rbio);
+ }
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
+{
+ int index;
+ index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
+ index += page;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * helper function to walk our bio list and populate the bio_pages array with
+ * the result. This seems expensive, but it is faster than constantly
+ * searching through the bio list as we setup the IO in finish_rmw or stripe
+ * reconstruction.
+ *
+ * This must be called before you trust the answers from page_in_rbio
+ */
+static void index_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ struct bio *bio;
+ u64 start;
+ unsigned long stripe_offset;
+ unsigned long page_index;
+ struct page *p;
+ int i;
+
+ spin_lock_irq(&rbio->bio_list_lock);
+ bio_list_for_each(bio, &rbio->bio_list) {
+ start = (u64)bio->bi_sector << 9;
+ stripe_offset = start - rbio->raid_map[0];
+ page_index = stripe_offset >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ p = bio->bi_io_vec[i].bv_page;
+ rbio->bio_pages[page_index + i] = p;
+ }
+ }
+ spin_unlock_irq(&rbio->bio_list_lock);
+}
+
+/*
+ * this is called from one of two situations.  We either
+ * have a full stripe from the higher layers, or we've read all
+ * the missing bits off disk.
+ *
+ * This will calculate the parity and then send down any changed blocks.
+ * If the write bios can't be built, the rbio is ended with -EIO.
+ */
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_bio *bbio = rbio->bbio;
+	void *pointers[bbio->num_stripes];
+	int stripe_len = rbio->stripe_len;
+	int nr_data = rbio->nr_data;
+	int stripe;
+	int pagenr;
+	int p_stripe = -1;
+	int q_stripe = -1;
+	struct bio_list bio_list;
+	struct bio *bio;
+	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
+	int ret;
+
+	bio_list_init(&bio_list);
+
+	if (bbio->num_stripes - rbio->nr_data == 1) {
+		p_stripe = bbio->num_stripes - 1;
+	} else if (bbio->num_stripes - rbio->nr_data == 2) {
+		p_stripe = bbio->num_stripes - 2;
+		q_stripe = bbio->num_stripes - 1;
+	} else {
+		BUG();
+	}
+
+	/* at this point we either have a full stripe,
+	 * or we've read the full stripe from the drive.
+	 * recalculate the parity and write the new results.
+	 *
+	 * We're not allowed to add any new bios to the
+	 * bio list here, anyone else that wants to
+	 * change this stripe needs to do their own rmw.
+	 */
+	spin_lock_irq(&rbio->bio_list_lock);
+	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+	spin_unlock_irq(&rbio->bio_list_lock);
+	atomic_set(&rbio->bbio->error, 0);
+
+	/*
+	 * now that we've set rmw_locked, run through the
+	 * bio list one last time and map the page pointers
+	 *
+	 * We don't cache full rbios because we're assuming
+	 * the higher layers are unlikely to use this area of
+	 * the disk again soon.  If they do use it again,
+	 * hopefully they will send another full bio.
+	 */
+	index_rbio_pages(rbio);
+	if (!rbio_is_full(rbio))
+		cache_rbio_pages(rbio);
+	else
+		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+		struct page *p;
+		/* first collect one page from each data stripe */
+		for (stripe = 0; stripe < nr_data; stripe++) {
+			p = page_in_rbio(rbio, stripe, pagenr, 0);
+			pointers[stripe] = kmap(p);
+		}
+
+		/* then add the parity stripe */
+		p = rbio_pstripe_page(rbio, pagenr);
+		SetPageUptodate(p);
+		pointers[stripe++] = kmap(p);
+
+		if (q_stripe != -1) {
+			/*
+			 * raid6, add the qstripe and call the
+			 * library function to fill in our p/q
+			 */
+			p = rbio_qstripe_page(rbio, pagenr);
+			SetPageUptodate(p);
+			pointers[stripe++] = kmap(p);
+
+			raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
+						pointers);
+		} else {
+			/* raid5 */
+			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+		}
+
+		for (stripe = 0; stripe < bbio->num_stripes; stripe++)
+			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+	}
+
+	/*
+	 * time to start writing.  Make bios for everything from the
+	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
+	 * everything else.
+	 */
+	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+			struct page *page;
+			if (stripe < rbio->nr_data) {
+				page = page_in_rbio(rbio, stripe, pagenr, 1);
+				if (!page)
+					continue;
+			} else {
+				page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+
+			ret = rbio_add_io_page(rbio, &bio_list,
+				    page, stripe, pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
+	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
+
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_write_end_io;
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(WRITE, bio);
+	}
+	return;
+
+cleanup:
+	/* none of these bios were submitted, drop the refs from bio_alloc */
+	while ((bio = bio_list_pop(&bio_list)))
+		bio_put(bio);
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * Map a bio that was sent to a physical disk back to the index of the stripe
+ * it targets, so that a failed IO can be attributed to a stripe.  The bio's
+ * start sector is converted to a byte offset (shift by 9) and tested against
+ * [physical, physical + stripe_len) of every stripe in the bbio.
+ *
+ * NOTE(review): only the byte offset is compared here, not the device the
+ * bio was issued to.
+ *
+ * Returns the stripe index, or -1 when no stripe covers the offset.
+ */
+static int find_bio_stripe(struct btrfs_raid_bio *rbio,
+			   struct bio *bio)
+{
+	struct btrfs_bio *bbio = rbio->bbio;
+	u64 byte_off = bio->bi_sector;
+	int nr;
+
+	byte_off <<= 9;
+
+	for (nr = 0; nr < bbio->num_stripes; nr++) {
+		u64 start = bbio->stripes[nr].physical;
+
+		if (byte_off < start)
+			continue;
+		if (byte_off < start + rbio->stripe_len)
+			return nr;
+	}
+	return -1;
+}
+
+/*
+ * Same idea as the physical lookup, but for a bio that has not been mapped
+ * yet: its start sector is turned into a byte offset and matched against the
+ * logical start of each data stripe recorded in rbio->raid_map.
+ *
+ * Returns the data stripe index, or -1 when nothing matches.
+ */
+static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
+				   struct bio *bio)
+{
+	u64 byte_off = bio->bi_sector;
+	int nr = 0;
+
+	byte_off <<= 9;
+
+	while (nr < rbio->nr_data) {
+		u64 start = rbio->raid_map[nr];
+
+		if (byte_off >= start &&
+		    byte_off < start + rbio->stripe_len)
+			return nr;
+		nr++;
+	}
+	return -1;
+}
+
+/*
+ * Record stripe 'failed' as bad in the rbio.  Up to two distinct failures
+ * fit (faila, then failb); each newly recorded one bumps the bbio error
+ * count.  Reporting a stripe that is already recorded is a no-op.
+ *
+ * Returns 0 on success, -EIO when both slots are already taken by other
+ * stripes.
+ */
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+{
+	unsigned long irq_flags;
+	bool known;
+	int ret = 0;
+
+	spin_lock_irqsave(&rbio->bio_list_lock, irq_flags);
+
+	known = rbio->faila == failed || rbio->failb == failed;
+	if (!known) {
+		if (rbio->faila == -1)
+			rbio->faila = failed;	/* first failure */
+		else if (rbio->failb == -1)
+			rbio->failb = failed;	/* second failure */
+		else
+			ret = -EIO;		/* no slot left */
+
+		if (!ret)
+			atomic_inc(&rbio->bbio->error);
+	}
+
+	spin_unlock_irqrestore(&rbio->bio_list_lock, irq_flags);
+	return ret;
+}
+
+/*
+ * Mark the stripe that a physical-disk bio belongs to as failed.
+ * Returns -EIO if the bio cannot be matched to a stripe, otherwise whatever
+ * fail_rbio_index() reports.
+ */
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
+			   struct bio *bio)
+{
+	int idx = find_bio_stripe(rbio, bio);
+
+	return idx < 0 ? -EIO : fail_rbio_index(rbio, idx);
+}
+
+/*
+ * Flag every page attached to the bio as uptodate.  Only meant for pages
+ * private to the rbio, never for pages handed down by the higher layers.
+ */
+static void set_bio_pages_uptodate(struct bio *bio)
+{
+	int nr = 0;
+
+	while (nr < bio->bi_vcnt) {
+		SetPageUptodate(bio->bi_io_vec[nr].bv_page);
+		nr++;
+	}
+}
+
+/*
+ * Completion handler for the reads issued during the read phase of an rmw
+ * cycle.  Every bio seen here is a physical stripe read that was needed to
+ * recompute parity.
+ *
+ * A failed bio marks its stripe as failed; a good one marks its pages
+ * uptodate.  Once the last outstanding read has completed, the rbio is
+ * either failed with -EIO (more errors than the bbio tolerates) or handed to
+ * validate_rbio_for_rmw(), which normally ends up in finish_rmw but
+ * reconstructs from parity first when stripes have failed.
+ */
+static void raid_rmw_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (!err)
+		set_bio_pages_uptodate(bio);
+	else
+		fail_bio_stripe(rbio, bio);
+
+	bio_put(bio);
+
+	/* only the completion of the last pending read carries on */
+	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+		return;
+
+	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) {
+		rbio_orig_end_io(rbio, -EIO, 0);
+		return;
+	}
+
+	validate_rbio_for_rmw(rbio);
+}
+
+/* queue the rbio on the rmw worker pool; rmw_work() runs it from there */
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_work *work = &rbio->work;
+
+	work->func = rmw_work;
+	work->flags = 0;
+
+	btrfs_queue_worker(&rbio->fs_info->rmw_workers, work);
+}
+
+/* queue the rbio on the rmw worker pool; read_rebuild_work() runs it there */
+static void async_read_rebuild(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_work *work = &rbio->work;
+
+	work->func = read_rebuild_work;
+	work->flags = 0;
+
+	btrfs_queue_worker(&rbio->fs_info->rmw_workers, work);
+}
+
+/*
+ * Start the read phase of a read/modify/write cycle.
+ *
+ * The stripe must be locked by the caller.  It will unlock after all the
+ * writes are done.
+ *
+ * Every data-stripe page that is neither supplied by a bio on the rbio's
+ * bio list nor already uptodate is read from disk.  raid_rmw_end_io() picks
+ * up when the reads finish.  If nothing needs reading we go straight to
+ * validate_rbio_for_rmw().
+ *
+ * Returns 0 when reads were submitted or nothing had to be read.  On failure
+ * the original IO is ended with -EIO and -EIO is returned.
+ */
+static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct btrfs_bio *bbio = rbio->bbio;
+	struct bio_list bio_list;
+	int ret;
+	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int pagenr;
+	int stripe;
+	struct bio *bio;
+
+	/* must precede the first 'goto cleanup', which drains this list */
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	index_rbio_pages(rbio);
+
+	atomic_set(&rbio->bbio->error, 0);
+	/*
+	 * build a list of bios to read all the missing parts of this
+	 * stripe
+	 */
+	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+			struct page *page;
+			/*
+			 * we want to find all the pages missing from
+			 * the rbio and read them from the disk.  If
+			 * page_in_rbio finds a page in the bio list
+			 * we don't need to read it off the stripe.
+			 */
+			page = page_in_rbio(rbio, stripe, pagenr, 1);
+			if (page)
+				continue;
+
+			page = rbio_stripe_page(rbio, stripe, pagenr);
+			/*
+			 * the bio cache may have handed us an uptodate
+			 * page.  If so, be happy and use it
+			 */
+			if (PageUptodate(page))
+				continue;
+
+			ret = rbio_add_io_page(rbio, &bio_list, page,
+				       stripe, pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+	bios_to_read = bio_list_size(&bio_list);
+	if (!bios_to_read) {
+		/*
+		 * this can happen if others have merged with
+		 * us, it means there is nothing left to read.
+		 * But if there are missing devices it may not be
+		 * safe to do the full stripe write yet.
+		 */
+		goto finish;
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio.  Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&bbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_rmw_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+	/* the actual write will happen once the reads are done */
+	return 0;
+
+cleanup:
+	/*
+	 * Read bios that were built but never submitted are owned by the
+	 * local list only; drop them here or they are leaked.
+	 */
+	while ((bio = bio_list_pop(&bio_list)))
+		bio_put(bio);
+
+	rbio_orig_end_io(rbio, -EIO, 0);
+	return -EIO;
+
+finish:
+	validate_rbio_for_rmw(rbio);
+	return 0;
+}
+
+/*
+ * if the upper layers pass in a full stripe, we thank them by only allocating
+ * enough pages to hold the parity, and sending it all down quickly.
+ *
+ * Returns 0 once the rbio has been handed to lock_stripe_add() (whether it
+ * was started right away or left to lock_stripe_add), or the error from the
+ * parity page allocation.  On that error the rbio is released here, because
+ * no later stage will ever see it.
+ */
+static int full_stripe_write(struct btrfs_raid_bio *rbio)
+{
+	int ret;
+
+	ret = alloc_rbio_parity_pages(rbio);
+	if (ret) {
+		/* nobody else holds this rbio yet; don't leak it */
+		__free_raid_bio(rbio);
+		return ret;
+	}
+
+	/* a non-zero return means we do not own the stripe lock right now */
+	ret = lock_stripe_add(rbio);
+	if (ret == 0)
+		finish_rmw(rbio);
+	return 0;
+}
+
+/*
+ * A partial stripe write is pushed to the async helpers rather than run
+ * inline, in the hope that more writes get merged into this rbio before the
+ * new parity is calculated.  Always returns 0.
+ */
+static int partial_stripe_write(struct btrfs_raid_bio *rbio)
+{
+	if (lock_stripe_add(rbio) == 0)
+		async_rmw_stripe(rbio);
+
+	return 0;
+}
+
+/*
+ * sometimes while we were reading from the drive to
+ * recalculate parity, enough new bios come in to create
+ * a full stripe.  So we check here whether we can go
+ * directly to the full stripe path instead of rmw.
+ */
+static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
+{
+	if (rbio_is_full(rbio))
+		return full_stripe_write(rbio);
+
+	/* not a full stripe: head off into rmw land */
+	return partial_stripe_write(rbio);
+}
+
+/*
+ * We use plugging call backs to collect full stripes.
+ * Any time we get a partial stripe write while plugged
+ * we collect it into a list. When the unplug comes down,
+ * we sort the list by logical block number and merge
+ * everything we can into the same rbios
+ */
+struct btrfs_plug_cb {
+ /*
+  * handle registered with the block layer plug; the callbacks recover
+  * this structure from it with container_of().
+  * NOTE(review): presumably has to stay the first member because the
+  * block layer allocates the whole structure -- confirm.
+  */
+ struct blk_plug_cb cb;
+ /* NULL until the first rbio is queued; used to reach rmw_workers */
+ struct btrfs_fs_info *info;
+ /* rbios collected while plugged, linked through rbio->plug_list */
+ struct list_head rbio_list;
+ /* defers run_plug() to a worker when unplugged from schedule */
+ struct btrfs_work work;
+};
+
+/*
+ * list_sort() comparison for the plug list: rbios are ordered by the start
+ * sector of the first bio on their bio list, which makes neighbours easy to
+ * merge.
+ */
+static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct btrfs_raid_bio *left = container_of(a, struct btrfs_raid_bio,
+						   plug_list);
+	struct btrfs_raid_bio *right = container_of(b, struct btrfs_raid_bio,
+						    plug_list);
+	u64 left_sector = left->bio_list.head->bi_sector;
+	u64 right_sector = right->bio_list.head->bi_sector;
+
+	if (left_sector == right_sector)
+		return 0;
+	return left_sector < right_sector ? -1 : 1;
+}
+
+/*
+ * Flush everything collected on a plug.  The list is sorted first, then
+ * walked front to back: full rbios are written immediately, partial ones are
+ * merged into the previous partial rbio whenever rbio_can_merge() allows it,
+ * and whatever cannot be merged any further is sent down.  The plug itself
+ * is freed at the end.
+ */
+static void run_plug(struct btrfs_plug_cb *plug)
+{
+	struct list_head *queue = &plug->rbio_list;
+	struct btrfs_raid_bio *pending = NULL;
+	struct btrfs_raid_bio *rbio;
+
+	list_sort(NULL, queue, plug_cmp);
+
+	while (!list_empty(queue)) {
+		rbio = list_entry(queue->next,
+				  struct btrfs_raid_bio, plug_list);
+		list_del_init(&rbio->plug_list);
+
+		/* a full stripe needs no merging, send it down */
+		if (rbio_is_full(rbio)) {
+			full_stripe_write(rbio);
+			continue;
+		}
+
+		/* absorbed into the previous rbio: the husk can go */
+		if (pending && rbio_can_merge(pending, rbio)) {
+			merge_rbio(pending, rbio);
+			__free_raid_bio(rbio);
+			continue;
+		}
+
+		/* no merge possible, so the previous one is complete */
+		if (pending)
+			__raid56_parity_write(pending);
+		pending = rbio;
+	}
+
+	if (pending)
+		__raid56_parity_write(pending);
+
+	kfree(plug);
+}
+
+/*
+ * Worker entry point used when the unplug came from schedule: recover the
+ * plug from its embedded work item and run it.
+ */
+static void unplug_work(struct btrfs_work *work)
+{
+	run_plug(container_of(work, struct btrfs_plug_cb, work));
+}
+
+/*
+ * Unplug callback registered through blk_check_plugged().  The plug is run
+ * inline unless the unplug comes from schedule, in which case it is pushed
+ * to the rmw worker pool.
+ */
+static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
+
+	if (!from_schedule) {
+		run_plug(plug);
+		return;
+	}
+
+	plug->work.func = unplug_work;
+	plug->work.flags = 0;
+	btrfs_queue_worker(&plug->info->rmw_workers, &plug->work);
+}
+
+/*
+ * Main entry point for raid5/6 writes coming from the rest of the FS.
+ *
+ * An rbio is built around the bio.  A full stripe is written right away.  A
+ * partial one is parked on the current task's plug when there is one, so it
+ * can be merged at unplug time, and is otherwise started immediately.
+ *
+ * If the rbio cannot be allocated, raid_map and bbio are freed here and the
+ * error is returned.
+ */
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+			struct btrfs_bio *bbio, u64 *raid_map,
+			u64 stripe_len)
+{
+	struct btrfs_plug_cb *plug = NULL;
+	struct btrfs_raid_bio *rbio;
+	struct blk_plug_cb *cb;
+
+	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+	if (IS_ERR(rbio)) {
+		kfree(raid_map);
+		kfree(bbio);
+		return PTR_ERR(rbio);
+	}
+
+	bio_list_add(&rbio->bio_list, bio);
+	rbio->bio_list_bytes = bio->bi_size;
+
+	/* full rbios are never plugged, get them out the door quickly */
+	if (rbio_is_full(rbio))
+		return full_stripe_write(rbio);
+
+	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
+			       sizeof(*plug));
+	if (!cb)
+		return __raid56_parity_write(rbio);
+
+	plug = container_of(cb, struct btrfs_plug_cb, cb);
+	if (!plug->info) {
+		/* first rbio on this plug: finish setting it up */
+		plug->info = root->fs_info;
+		INIT_LIST_HEAD(&plug->rbio_list);
+	}
+	list_add_tail(&rbio->plug_list, &plug->rbio_list);
+
+	return 0;
+}
+
+/*
+ * all parity reconstruction happens here. We've read in everything
+ * we can find from the drives and this does the heavy lifting of
+ * sorting the good from the bad.
+ *
+ * The failed stripes are taken from rbio->faila / rbio->failb. Each page
+ * of the stripe is rebuilt in turn. When done:
+ *  - read rebuild: the pages are cached on success (the cache-ready bit is
+ *    cleared on failure) and the original IO is ended with the result;
+ *  - rmw: on success the failure markers are reset and finish_rmw() is
+ *    called, on failure the original IO is ended with the error.
+ */
+static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+{
+ int pagenr, stripe;
+ void **pointers;
+ int faila = -1, failb = -1;
+ int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ int err;
+ int i;
+
+ /* one mapped address per stripe, reused for every page index */
+ pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
+ GFP_NOFS);
+ if (!pointers) {
+ err = -ENOMEM;
+ goto cleanup_io;
+ }
+
+ faila = rbio->faila;
+ failb = rbio->failb;
+
+ if (rbio->read_rebuild) {
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+ }
+
+ index_rbio_pages(rbio);
+
+ for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ /* setup our array of pointers with pages
+ * from each stripe
+ */
+ for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+ /*
+ * if we're rebuilding a read, we have to use
+ * pages from the bio list
+ */
+ if (rbio->read_rebuild &&
+ (stripe == faila || stripe == failb)) {
+ page = page_in_rbio(rbio, stripe, pagenr, 0);
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+ pointers[stripe] = kmap(page);
+ }
+
+ /* all raid6 handling here */
+ if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
+ RAID6_Q_STRIPE) {
+
+ /*
+ * single failure, rebuild from parity raid5
+ * style
+ */
+ if (failb < 0) {
+ if (faila == rbio->nr_data) {
+ /*
+ * Just the P stripe has failed, without
+ * a bad data or Q stripe.
+ * TODO, we should redo the xor here.
+ */
+ /*
+ * NOTE(review): this exit skips the kunmap loop
+ * at the bottom of the page loop, so the pages
+ * mapped above stay mapped.
+ */
+ err = -EIO;
+ goto cleanup;
+ }
+ /*
+ * a single failure in raid6 is rebuilt
+ * in the pstripe code below
+ */
+ goto pstripe;
+ }
+
+ /* make sure our ps and qs are in order */
+ if (faila > failb) {
+ int tmp = failb;
+ failb = faila;
+ faila = tmp;
+ }
+
+ /* if the q stripe is failed, do a pstripe reconstruction
+ * from the xors.
+ * If both the q stripe and the P stripe are failed, we're
+ * here due to a crc mismatch and we can't give them the
+ * data they want
+ */
+ if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
+ /* NOTE(review): same unbalanced kmap as above */
+ err = -EIO;
+ goto cleanup;
+ }
+ /*
+ * otherwise we have one bad data stripe and
+ * a good P stripe. raid5!
+ */
+ goto pstripe;
+ }
+
+ /* two failures, Q intact: data+P or data+data */
+ if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
+ raid6_datap_recov(rbio->bbio->num_stripes,
+ PAGE_SIZE, faila, pointers);
+ } else {
+ raid6_2data_recov(rbio->bbio->num_stripes,
+ PAGE_SIZE, faila, failb,
+ pointers);
+ }
+ } else {
+ void *p;
+
+ /* rebuild from P stripe here (raid5 or raid6) */
+ BUG_ON(failb != -1);
+pstripe:
+ /* Copy parity block into failed block to start with */
+ memcpy(pointers[faila],
+ pointers[rbio->nr_data],
+ PAGE_CACHE_SIZE);
+
+ /*
+ * rearrange the pointer array: the failed stripe's buffer is
+ * moved to slot nr_data - 1 and the data stripes after it
+ * shift down by one
+ */
+ p = pointers[faila];
+ for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
+ pointers[stripe] = pointers[stripe + 1];
+ pointers[rbio->nr_data - 1] = p;
+
+ /*
+ * xor in the rest
+ * (presumably run_xor folds the first nr_data - 1 buffers into
+ * the one that follows them -- confirm against run_xor)
+ */
+ run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+ }
+ /* if we're doing this rebuild as part of an rmw, go through
+ * and set all of our private rbio pages in the
+ * failed stripes as uptodate. This way finish_rmw will
+ * know they can be trusted. If this was a read reconstruction,
+ * other endio functions will fiddle the uptodate bits
+ */
+ /*
+ * NOTE(review): this walks all nr_pages pages on every iteration
+ * of the outer pagenr loop.
+ */
+ if (!rbio->read_rebuild) {
+ for (i = 0; i < nr_pages; i++) {
+ if (faila != -1) {
+ page = rbio_stripe_page(rbio, faila, i);
+ SetPageUptodate(page);
+ }
+ if (failb != -1) {
+ page = rbio_stripe_page(rbio, failb, i);
+ SetPageUptodate(page);
+ }
+ }
+ }
+ /* unmap with the same page selection that was used for kmap */
+ for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+ /*
+ * if we're rebuilding a read, we have to use
+ * pages from the bio list
+ */
+ if (rbio->read_rebuild &&
+ (stripe == faila || stripe == failb)) {
+ page = page_in_rbio(rbio, stripe, pagenr, 0);
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+ kunmap(page);
+ }
+ }
+
+ err = 0;
+cleanup:
+ kfree(pointers);
+
+cleanup_io:
+
+ if (rbio->read_rebuild) {
+ if (err == 0)
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ rbio_orig_end_io(rbio, err, err == 0);
+ } else if (err == 0) {
+ /* rmw: the stripe is whole again, carry on with the write */
+ rbio->faila = -1;
+ rbio->failb = -1;
+ finish_rmw(rbio);
+ } else {
+ rbio_orig_end_io(rbio, err, 0);
+ }
+}
+
+/*
+ * Completion handler for the stripe reads issued to reconstruct from
+ * parity.  Only private stripe pages are read this way, so a successful
+ * bio can simply mark its pages uptodate; a failed one marks its stripe as
+ * failed.  When the last read is in, the rbio is either failed with -EIO
+ * (too many errors) or handed to __raid_recover_end_io().
+ */
+static void raid_recover_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (!err)
+		set_bio_pages_uptodate(bio);
+	else
+		fail_bio_stripe(rbio, bio);
+	bio_put(bio);
+
+	/* wait for the remaining reads */
+	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+		return;
+
+	if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors)
+		__raid_recover_end_io(rbio);
+	else
+		rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * reads everything we need off the disk to reconstruct
+ * the parity. endio handlers trigger final reconstruction
+ * when the IO is done.
+ *
+ * This is used both for reads from the higher layers and for
+ * parity construction required to finish a rmw cycle.
+ *
+ * Returns 0 when the reads were submitted, or when nothing had to be read
+ * and reconstruction was run inline.  Returns -EIO on failure; for a read
+ * rebuild the original IO has then been ended with -EIO as well.
+ */
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct btrfs_bio *bbio = rbio->bbio;
+	struct bio_list bio_list;
+	int ret;
+	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int pagenr;
+	int stripe;
+	struct bio *bio;
+
+	/* must precede the first 'goto cleanup', which drains this list */
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	atomic_set(&rbio->bbio->error, 0);
+
+	/*
+	 * read everything that hasn't failed.  Thanks to the
+	 * stripe cache, it is possible that some or all of these
+	 * pages are going to be uptodate.
+	 */
+	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+		if (rbio->faila == stripe ||
+		    rbio->failb == stripe)
+			continue;
+
+		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+			struct page *p;
+
+			/*
+			 * the rmw code may have already read this
+			 * page in
+			 */
+			p = rbio_stripe_page(rbio, stripe, pagenr);
+			if (PageUptodate(p))
+				continue;
+
+			ret = rbio_add_io_page(rbio, &bio_list, p,
+				       stripe, pagenr, rbio->stripe_len);
+			if (ret < 0)
+				goto cleanup;
+		}
+	}
+
+	bios_to_read = bio_list_size(&bio_list);
+	if (!bios_to_read) {
+		/*
+		 * we might have no bios to read just because the pages
+		 * were up to date, or we might have no bios to read because
+		 * the devices were gone.
+		 */
+		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
+			__raid_recover_end_io(rbio);
+			goto out;
+		} else {
+			goto cleanup;
+		}
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio.  Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&bbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_recover_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+out:
+	return 0;
+
+cleanup:
+	/*
+	 * Read bios that were built but never submitted are owned by the
+	 * local list only; drop them here or they are leaked.
+	 */
+	while ((bio = bio_list_pop(&bio_list)))
+		bio_put(bio);
+
+	if (rbio->read_rebuild)
+		rbio_orig_end_io(rbio, -EIO, 0);
+	return -EIO;
+}
+
+/*
+ * the main entry point for reads from the higher layers.  This
+ * is really only called when the normal read path had a failure,
+ * so we assume the bio they send down corresponds to a failed part
+ * of the drive.
+ *
+ * mirror_num == 3 additionally marks stripe num_stripes - 2 as failed so
+ * that the rebuild goes through the q stripe.
+ *
+ * Returns 0 once the rbio has been queued or started; errors hit later are
+ * reported by ending the bio.  If the rbio cannot be allocated, raid_map
+ * and bbio are freed and the allocation error is returned.
+ */
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+			  struct btrfs_bio *bbio, u64 *raid_map,
+			  u64 stripe_len, int mirror_num)
+{
+	struct btrfs_raid_bio *rbio;
+	int ret;
+
+	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+	if (IS_ERR(rbio)) {
+		/*
+		 * same rule as raid56_parity_write(): the mapping was
+		 * handed to us, so it must not leak when we bail out
+		 */
+		kfree(raid_map);
+		kfree(bbio);
+		return PTR_ERR(rbio);
+	}
+
+	rbio->read_rebuild = 1;
+	bio_list_add(&rbio->bio_list, bio);
+	rbio->bio_list_bytes = bio->bi_size;
+
+	rbio->faila = find_logical_bio_stripe(rbio, bio);
+	if (rbio->faila == -1) {
+		/* the bio must fall inside one of our data stripes */
+		BUG();
+		kfree(rbio);
+		return -EIO;
+	}
+
+	/*
+	 * reconstruct from the q stripe if they are
+	 * asking for mirror 3
+	 */
+	if (mirror_num == 3)
+		rbio->failb = bbio->num_stripes - 2;
+
+	ret = lock_stripe_add(rbio);
+
+	/*
+	 * __raid56_parity_recover will end the bio with
+	 * any errors it hits.  We don't want to return
+	 * its error value up the stack because our caller
+	 * will end up calling bio_endio with any nonzero
+	 * return
+	 */
+	if (ret == 0)
+		__raid56_parity_recover(rbio);
+	/*
+	 * otherwise our rbio has been added to the list of
+	 * rbios that will be handled after the
+	 * current lock owner is done
+	 */
+	return 0;
+
+}
+
+/* worker callback: start the rmw cycle of the rbio embedding this work item */
+static void rmw_work(struct btrfs_work *work)
+{
+	raid56_rmw_stripe(container_of(work, struct btrfs_raid_bio, work));
+}
+
+/* worker callback: start the rebuild of the rbio embedding this work item */
+static void read_rebuild_work(struct btrfs_work *work)
+{
+	__raid56_parity_recover(container_of(work, struct btrfs_raid_bio, work));
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 000000000000..ea5d73bfdfbe
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2012 Fusion-io All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_RAID56__
+#define __BTRFS_RAID56__
+/* number of parity stripes for a chunk mapping: 1 for raid5, 2 for raid6 */
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		return 1;
+	if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		return 2;
+
+	/* any other profile carries no parity */
+	return 0;
+}
+
+/* stripes of the mapping that hold data, i.e. all of them minus parity */
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+	int parity = nr_parity_stripes(map);
+
+	return map->num_stripes - parity;
+}
+/*
+ * Sentinel values that raid_map[] entries are compared against to tell the
+ * P and Q parity stripes apart from data stripes (whose entries are used as
+ * logical start offsets).
+ */
+#define RAID5_P_STRIPE ((u64)-2)
+#define RAID6_Q_STRIPE ((u64)-1)
+
+/* true for either sentinel; note that x is evaluated twice */
+#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
+ ((x) == RAID6_Q_STRIPE))
+
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len, int mirror_num);
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len);
+
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
+#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 300e09ac3659..50695dc5e2ab 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
}
}
- page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
@@ -3472,7 +3472,7 @@ out:
}
/*
- * hepler to find all tree blocks that reference a given data extent
+ * helper to find all tree blocks that reference a given data extent
*/
static noinline_for_stack
int add_data_references(struct reloc_control *rc,
@@ -3566,7 +3566,7 @@ int add_data_references(struct reloc_control *rc,
}
/*
- * hepler to find next unprocessed extent
+ * helper to find next unprocessed extent
*/
static noinline_for_stack
int find_next_extent(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index bdbb94f245c9..53c3501fa4ca 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -28,6 +28,7 @@
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
+#include "raid56.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -580,20 +581,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
int corrected = 0;
struct btrfs_key key;
struct inode *inode = NULL;
+ struct btrfs_fs_info *fs_info;
u64 end = offset + PAGE_SIZE - 1;
struct btrfs_root *local_root;
+ int srcu_index;
key.objectid = root;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
- if (IS_ERR(local_root))
+
+ fs_info = fixup->root->fs_info;
+ srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(local_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
return PTR_ERR(local_root);
+ }
key.type = BTRFS_INODE_ITEM_KEY;
key.objectid = inum;
key.offset = 0;
- inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -606,7 +616,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
}
if (PageUptodate(page)) {
- struct btrfs_fs_info *fs_info;
if (PageDirty(page)) {
/*
* we need to write the data to the defect sector. the
@@ -2246,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_device *extent_dev;
int extent_mirror_num;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ if (num >= nr_data_stripes(map)) {
+ return 0;
+ }
+ }
+
nstripes = length;
offset = 0;
do_div(nstripes, map->stripe_len);
@@ -2700,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
int ret;
struct btrfs_root *root = sctx->dev_root;
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return -EIO;
gen = root->fs_info->last_trans_committed;
@@ -3180,18 +3196,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
u64 physical_for_dev_replace;
u64 len;
struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+ int srcu_index;
key.objectid = root;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
+
+ srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
local_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(local_root))
+ if (IS_ERR(local_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
return PTR_ERR(local_root);
+ }
key.type = BTRFS_INODE_ITEM_KEY;
key.objectid = inum;
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
if (IS_ERR(inode))
return PTR_ERR(inode);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 54454542ad40..f7a8b861058b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -85,6 +85,7 @@ struct send_ctx {
u32 send_max_size;
u64 total_send_size;
u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+ u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
struct vfsmount *mnt;
@@ -1814,8 +1815,10 @@ static int name_cache_insert(struct send_ctx *sctx,
(unsigned long)nce->ino);
if (!nce_head) {
nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
- if (!nce_head)
+ if (!nce_head) {
+ kfree(nce);
return -ENOMEM;
+ }
INIT_LIST_HEAD(nce_head);
ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
@@ -3707,6 +3710,39 @@ out:
return ret;
}
+/*
+ * Send an update extent command to user space.
+ *
+ * The command carries the path of the inode currently being processed
+ * (sctx->cur_ino / sctx->cur_inode_gen), the file offset and the length of
+ * the extent. send_write_or_clone() uses it in place of the write commands
+ * when BTRFS_SEND_FLAG_NO_FILE_DATA is set.
+ *
+ * Returns -ENOMEM if the path buffer cannot be allocated, otherwise the
+ * result of the failing step or of send_cmd().
+ */
+static int send_update_extent(struct send_ctx *sctx,
+ u64 offset, u32 len)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+ p = fs_path_alloc(sctx);
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ /* len is a u32 but goes out as a 64 bit attribute */
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+ ret = send_cmd(sctx);
+
+ /*
+ * presumably the TLV_PUT_* macros jump to tlv_put_failure with ret set
+ * when an attribute cannot be added -- confirm against their definition
+ */
+tlv_put_failure:
+out:
+ fs_path_free(sctx, p);
+ return ret;
+}
+
static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_path *path,
struct btrfs_key *key,
@@ -3742,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx,
goto out;
}
- if (!clone_root) {
+ if (clone_root) {
+ ret = send_clone(sctx, offset, len, clone_root);
+ } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
+ ret = send_update_extent(sctx, offset, len);
+ } else {
while (pos < len) {
l = len - pos;
if (l > BTRFS_SEND_READ_SIZE)
@@ -3755,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
pos += ret;
}
ret = 0;
- } else {
- ret = send_clone(sctx, offset, len, clone_root);
}
-
out:
return ret;
}
@@ -4534,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
struct btrfs_fs_info *fs_info;
struct btrfs_ioctl_send_args *arg = NULL;
struct btrfs_key key;
- struct file *filp = NULL;
struct send_ctx *sctx = NULL;
u32 i;
u64 *clone_sources_tmp = NULL;
@@ -4542,7 +4578,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root;
+ send_root = BTRFS_I(file_inode(mnt_file))->root;
fs_info = send_root->fs_info;
arg = memdup_user(arg_, sizeof(*arg));
@@ -4559,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
}
+ if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) {
+ ret = -EINVAL;
+ goto out;
+ }
+
sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
if (!sctx) {
ret = -ENOMEM;
@@ -4570,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
INIT_LIST_HEAD(&sctx->name_cache_list);
+ sctx->flags = arg->flags;
+
sctx->send_filp = fget(arg->send_fd);
if (IS_ERR(sctx->send_filp)) {
ret = PTR_ERR(sctx->send_filp);
@@ -4671,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
out:
- if (filp)
- fput(filp);
kfree(arg);
vfree(clone_sources_tmp);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 1bf4f32fd4ef..8bb18f7ccaa6 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -86,6 +86,7 @@ enum btrfs_send_cmd {
BTRFS_SEND_C_UTIMES,
BTRFS_SEND_C_END,
+ BTRFS_SEND_C_UPDATE_EXTENT,
__BTRFS_SEND_C_MAX,
};
#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 99545df1b86c..68a29a1ea068 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,13 +41,13 @@
#include <linux/slab.h>
#include <linux/cleancache.h>
#include <linux/ratelimit.h>
+#include <linux/btrfs.h>
#include "compat.h"
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "xattr.h"
#include "volumes.h"
@@ -63,8 +63,7 @@
static const struct super_operations btrfs_super_ops;
static struct file_system_type btrfs_fs_type;
-static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
- char nbuf[16])
+static const char *btrfs_decode_error(int errno, char nbuf[16])
{
char *errstr = NULL;
@@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
* today we only save the error info into ram. Long term we'll
* also send it down to the disk
*/
- fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
}
static void save_error_info(struct btrfs_fs_info *fs_info)
@@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
if (sb->s_flags & MS_RDONLY)
return;
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
sb->s_flags |= MS_RDONLY;
printk(KERN_INFO "btrfs is forced readonly\n");
/*
@@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
struct super_block *sb = fs_info->sb;
char nbuf[16];
const char *errstr;
- va_list args;
- va_start(args, fmt);
/*
* Special case: if the error is EROFS, and we're already
@@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
return;
- errstr = btrfs_decode_error(fs_info, errno, nbuf);
+ errstr = btrfs_decode_error(errno, nbuf);
if (fmt) {
- struct va_format vaf = {
- .fmt = fmt,
- .va = &args,
- };
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
sb->s_id, function, line, errstr, &vaf);
+ va_end(args);
} else {
printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
sb->s_id, function, line, errstr);
@@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
save_error_info(fs_info);
btrfs_handle_error(fs_info);
}
- va_end(args);
}
static const char * const logtypes[] = {
@@ -261,13 +260,13 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
char nbuf[16];
const char *errstr;
- errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
+ errstr = btrfs_decode_error(errno, nbuf);
btrfs_printk(root->fs_info,
"%s:%d: Aborting unused transaction(%s).\n",
function, line, errstr);
return;
}
- trans->transaction->aborted = errno;
+ ACCESS_ONCE(trans->transaction->aborted) = errno;
__btrfs_std_error(root->fs_info, function, line, errno, NULL);
}
/*
@@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
va_start(args, fmt);
vaf.va = &args;
- errstr = btrfs_decode_error(fs_info, errno, nbuf);
- if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)
+ errstr = btrfs_decode_error(errno, nbuf);
+ if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
s_id, function, line, &vaf, errstr);
@@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_compress_force:
case Opt_compress_force_type:
compress_force = true;
+ /* Fallthrough */
case Opt_compress:
case Opt_compress_type:
if (token == Opt_compress ||
@@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_alloc_start:
num = match_strdup(&args[0]);
if (num) {
+ mutex_lock(&info->chunk_mutex);
info->alloc_start = memparse(num, NULL);
+ mutex_unlock(&info->chunk_mutex);
kfree(num);
printk(KERN_INFO
"btrfs: allocations start at %llu\n",
@@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
btrfs_wait_ordered_extents(root, 0);
- trans = btrfs_attach_transaction(root);
+ trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
if (PTR_ERR(trans) == -ENOENT)
@@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
new_pool_size);
}
+static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts, int flags)
+{
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+ (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+ (flags & MS_RDONLY))) {
+ /* wait for any defraggers to finish */
+ wait_event(fs_info->transaction_wait,
+ (atomic_read(&fs_info->defrag_running) == 0));
+ if (flags & MS_RDONLY)
+ sync_filesystem(fs_info->sb);
+ }
+}
+
+static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts)
+{
+ /*
+ * We need to clean up all defraggable inodes if autodefrag has been
+ * turned off or the fs is R/O.
+ */
+ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+ (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+ (fs_info->sb->s_flags & MS_RDONLY))) {
+ btrfs_cleanup_defrag_inodes(fs_info);
+ }
+
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
unsigned int old_metadata_ratio = fs_info->metadata_ratio;
int ret;
+ btrfs_remount_prepare(fs_info, old_opts, *flags);
+
ret = btrfs_parse_options(root, data);
if (ret) {
ret = -EINVAL;
@@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
fs_info->thread_pool_size, old_thread_pool_size);
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
- return 0;
+ goto out;
if (*flags & MS_RDONLY) {
/*
@@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
sb->s_flags &= ~MS_RDONLY;
}
-
+out:
+ btrfs_remount_cleanup(fs_info, old_opts);
return 0;
restore:
@@ -1289,10 +1326,13 @@ restore:
fs_info->mount_opt = old_opts;
fs_info->compress_type = old_compress_type;
fs_info->max_inline = old_max_inline;
+ mutex_lock(&fs_info->chunk_mutex);
fs_info->alloc_start = old_alloc_start;
+ mutex_unlock(&fs_info->chunk_mutex);
btrfs_resize_thread_pool(fs_info,
old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
+ btrfs_remount_cleanup(fs_info, old_opts);
return ret;
}
@@ -1559,7 +1599,7 @@ static int btrfs_freeze(struct super_block *sb)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = btrfs_sb(sb)->tree_root;
- trans = btrfs_attach_transaction(root);
+ trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
if (PTR_ERR(trans) == -ENOENT)
@@ -1684,10 +1724,14 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_delayed_inode;
- err = btrfs_interface_init();
+ err = btrfs_delayed_ref_init();
if (err)
goto free_auto_defrag;
+ err = btrfs_interface_init();
+ if (err)
+ goto free_delayed_ref;
+
err = register_filesystem(&btrfs_fs_type);
if (err)
goto unregister_ioctl;
@@ -1699,6 +1743,8 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
+free_delayed_ref:
+ btrfs_delayed_ref_exit();
free_auto_defrag:
btrfs_auto_defrag_exit();
free_delayed_inode:
@@ -1720,6 +1766,7 @@ free_compress:
static void __exit exit_btrfs_fs(void)
{
btrfs_destroy_cachep();
+ btrfs_delayed_ref_exit();
btrfs_auto_defrag_exit();
btrfs_delayed_inode_exit();
ordered_data_exit();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index daac9ae6d731..5b326cd60a4a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -21,7 +21,6 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
-#include <linux/module.h>
#include <linux/kobject.h>
#include "ctree.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 87fac9a21ea5..e52da6fb1165 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction)
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(transaction->delayed_refs.root.rb_node);
- memset(transaction, 0, sizeof(*transaction));
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
@@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root)
root->commit_root = btrfs_root_node(root);
}
+static inline int can_join_transaction(struct btrfs_transaction *trans,
+ int type)
+{
+ return !(trans->in_commit &&
+ type != TRANS_JOIN &&
+ type != TRANS_JOIN_NOLOCK);
+}
+
/*
* either allocate a new transaction or hop into the existing one
*/
@@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type)
spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
@@ -86,6 +93,10 @@ loop:
spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
+ if (!can_join_transaction(cur_trans, type)) {
+ spin_unlock(&fs_info->trans_lock);
+ return -EBUSY;
+ }
atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
cur_trans->num_joined++;
@@ -112,9 +123,8 @@ loop:
* to redo the trans_no_join checks above
*/
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
- cur_trans = fs_info->running_transaction;
goto loop;
- } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
spin_unlock(&fs_info->trans_lock);
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
return -EROFS;
@@ -156,8 +166,12 @@ loop:
spin_lock_init(&cur_trans->commit_lock);
spin_lock_init(&cur_trans->delayed_refs.lock);
+ atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
+ atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
+ init_waitqueue_head(&cur_trans->delayed_refs.wait);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+ INIT_LIST_HEAD(&cur_trans->ordered_operations);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -302,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
int ret;
u64 qgroup_reserved = 0;
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
if (current->journal_info) {
@@ -333,12 +347,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
&root->fs_info->trans_block_rsv,
num_bytes, flush);
if (ret)
- return ERR_PTR(ret);
+ goto reserve_fail;
}
again:
h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
- if (!h)
- return ERR_PTR(-ENOMEM);
+ if (!h) {
+ ret = -ENOMEM;
+ goto alloc_fail;
+ }
/*
* If we are JOIN_NOLOCK we're already committing a transaction and
@@ -358,18 +374,17 @@ again:
do {
ret = join_transaction(root, type);
- if (ret == -EBUSY)
+ if (ret == -EBUSY) {
wait_current_trans(root);
+ if (unlikely(type == TRANS_ATTACH))
+ ret = -ENOENT;
+ }
} while (ret == -EBUSY);
if (ret < 0) {
/* We must get the transaction if we are JOIN_NOLOCK. */
BUG_ON(type == TRANS_JOIN_NOLOCK);
-
- if (type < TRANS_JOIN_NOLOCK)
- sb_end_intwrite(root->fs_info->sb);
- kmem_cache_free(btrfs_trans_handle_cachep, h);
- return ERR_PTR(ret);
+ goto join_fail;
}
cur_trans = root->fs_info->running_transaction;
@@ -385,9 +400,10 @@ again:
h->block_rsv = NULL;
h->orig_rsv = NULL;
h->aborted = 0;
- h->qgroup_reserved = qgroup_reserved;
+ h->qgroup_reserved = 0;
h->delayed_ref_elem.seq = 0;
h->type = type;
+ h->allocating_chunk = false;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
@@ -403,6 +419,7 @@ again:
h->block_rsv = &root->fs_info->trans_block_rsv;
h->bytes_reserved = num_bytes;
}
+ h->qgroup_reserved = qgroup_reserved;
got_it:
btrfs_record_root_in_trans(h, root);
@@ -410,6 +427,19 @@ got_it:
if (!current->journal_info && type != TRANS_USERSPACE)
current->journal_info = h;
return h;
+
+join_fail:
+ if (type < TRANS_JOIN_NOLOCK)
+ sb_end_intwrite(root->fs_info->sb);
+ kmem_cache_free(btrfs_trans_handle_cachep, h);
+alloc_fail:
+ if (num_bytes)
+ btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+ num_bytes);
+reserve_fail:
+ if (qgroup_reserved)
+ btrfs_qgroup_free(root, qgroup_reserved);
+ return ERR_PTR(ret);
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
@@ -441,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
return start_transaction(root, 0, TRANS_USERSPACE, 0);
}
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is used when we want to commit the current transaction, but
+ * don't want to start a new one.
+ *
+ * Note: If this function returns -ENOENT, it just means there is no
+ * running transaction. But it is possible that an inactive transaction
+ * is still in memory, not fully on disk. If you need there to be no
+ * inactive transaction in the fs when -ENOENT is returned, you should
+ * invoke
+ * btrfs_attach_transaction_barrier()
+ */
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_ATTACH, 0);
}
+/*
+ * btrfs_attach_transaction_barrier() - catch the running transaction
+ *
+ * It is similar to btrfs_attach_transaction(); the difference is that
+ * this one will wait for all the inactive transactions until they fully
+ * complete.
+ */
+struct btrfs_trans_handle *
+btrfs_attach_transaction_barrier(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+
+ trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+ if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
+ btrfs_wait_for_commit(root, 0);
+
+ return trans;
+}
+
/* wait for a transaction commit to be fully complete */
static noinline void wait_for_commit(struct btrfs_root *root,
struct btrfs_transaction *commit)
@@ -577,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- while (count < 2) {
+ while (count < 1) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (cur &&
@@ -589,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
count++;
}
+
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -634,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_run_delayed_iputs(root);
if (trans->aborted ||
- root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
err = -EIO;
- }
assert_qgroups_uptodate(trans);
- memset(trans, 0, sizeof(*trans));
kmem_cache_free(btrfs_trans_handle_cachep, trans);
return err;
}
@@ -686,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
+ struct blk_plug plug;
+ blk_start_plug(&plug);
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -700,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
}
if (err)
werr = err;
+ blk_finish_plug(&plug);
return werr;
}
@@ -950,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
}
/*
- * defrag a given btree. If cacheonly == 1, this won't read from the disk,
- * otherwise every leaf in the btree is read and defragged.
+ * defrag a given btree.
+ * Every leaf in the btree is read and defragged.
*/
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
+int btrfs_defrag_root(struct btrfs_root *root)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_trans_handle *trans;
@@ -967,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_defrag_leaves(trans, root, cacheonly);
+ ret = btrfs_defrag_leaves(trans, root);
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(info->tree_root);
@@ -975,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
break;
+
+ if (btrfs_defrag_cancelled(root->fs_info)) {
+ printk(KERN_DEBUG "btrfs: defrag_root cancelled\n");
+ ret = -EAGAIN;
+ break;
+ }
}
root->defrag_running = 0;
return ret;
@@ -997,7 +1067,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct inode *parent_inode;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
- struct dentry *parent;
struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
@@ -1012,7 +1081,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = pending->error = -ENOMEM;
- goto path_alloc_fail;
+ return ret;
}
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
@@ -1052,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
+ trans->bytes_reserved = trans->block_rsv->reserved;
dentry = pending->dentry;
- parent = dget_parent(dentry);
- parent_inode = parent->d_inode;
+ parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
record_root_in_trans(trans, parent_root);
@@ -1203,14 +1272,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret)
btrfs_abort_transaction(trans, root, ret);
fail:
- dput(parent);
trans->block_rsv = rsv;
+ trans->bytes_reserved = 0;
no_free_objectid:
kfree(new_root_item);
root_item_alloc_fail:
btrfs_free_path(path);
-path_alloc_fail:
- btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
return ret;
}
@@ -1296,13 +1363,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
struct btrfs_async_commit {
struct btrfs_trans_handle *newtrans;
struct btrfs_root *root;
- struct delayed_work work;
+ struct work_struct work;
};
static void do_async_commit(struct work_struct *work)
{
struct btrfs_async_commit *ac =
- container_of(work, struct btrfs_async_commit, work.work);
+ container_of(work, struct btrfs_async_commit, work);
/*
* We've got freeze protection passed with the transaction.
@@ -1330,7 +1397,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
if (!ac)
return -ENOMEM;
- INIT_DELAYED_WORK(&ac->work, do_async_commit);
+ INIT_WORK(&ac->work, do_async_commit);
ac->root = root;
ac->newtrans = btrfs_join_transaction(root);
if (IS_ERR(ac->newtrans)) {
@@ -1354,7 +1421,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1, _THIS_IP_);
- schedule_delayed_work(&ac->work, 0);
+ schedule_work(&ac->work);
/* wait for transaction to start and unblock */
if (wait_for_unblock)
@@ -1374,6 +1441,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int err)
{
struct btrfs_transaction *cur_trans = trans->transaction;
+ DEFINE_WAIT(wait);
WARN_ON(trans->use_count > 1);
@@ -1382,8 +1450,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->trans_lock);
list_del_init(&cur_trans->list);
if (cur_trans == root->fs_info->running_transaction) {
+ root->fs_info->trans_no_join = 1;
+ spin_unlock(&root->fs_info->trans_lock);
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+
+ spin_lock(&root->fs_info->trans_lock);
root->fs_info->running_transaction = NULL;
- root->fs_info->trans_no_join = 0;
}
spin_unlock(&root->fs_info->trans_lock);
@@ -1417,7 +1490,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
}
if (flush_on_commit || snap_pending) {
- btrfs_start_delalloc_inodes(root, 1);
+ ret = btrfs_start_delalloc_inodes(root, 1);
+ if (ret)
+ return ret;
btrfs_wait_ordered_extents(root, 1);
}
@@ -1439,9 +1514,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
* it here and know for sure that nothing new will be added
* to the list
*/
- btrfs_run_ordered_operations(root, 1);
+ ret = btrfs_run_ordered_operations(trans, root, 1);
- return 0;
+ return ret;
}
/*
@@ -1462,26 +1537,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
int should_grow = 0;
unsigned long now = get_seconds();
- ret = btrfs_run_ordered_operations(root, 0);
+ ret = btrfs_run_ordered_operations(trans, root, 0);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
- goto cleanup_transaction;
+ btrfs_end_transaction(trans, root);
+ return ret;
}
- if (cur_trans->aborted) {
+ /* Stop the commit early if ->aborted is set */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
- goto cleanup_transaction;
+ btrfs_end_transaction(trans, root);
+ return ret;
}
/* make a pass through all the delayed refs we have so far
* any running procs may add more while we are here
*/
ret = btrfs_run_delayed_refs(trans, root, 0);
- if (ret)
- goto cleanup_transaction;
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
+ if (trans->qgroup_reserved) {
+ btrfs_qgroup_free(root, trans->qgroup_reserved);
+ trans->qgroup_reserved = 0;
+ }
cur_trans = trans->transaction;
@@ -1495,8 +1579,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_create_pending_block_groups(trans, root);
ret = btrfs_run_delayed_refs(trans, root, 0);
- if (ret)
- goto cleanup_transaction;
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
spin_lock(&cur_trans->commit_lock);
if (cur_trans->in_commit) {
@@ -1574,6 +1660,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
+ /* ->aborted might be set after the previous check, so check it */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ goto cleanup_transaction;
+ }
/*
* the reloc mutex makes sure that we stop
* the balancing code from coming in and moving
@@ -1657,6 +1748,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
goto cleanup_transaction;
}
+ /*
+ * The tasks which save the space cache and inode cache may also
+ * update ->aborted, check it.
+ */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto cleanup_transaction;
+ }
+
btrfs_prepare_extent_commit(trans, root);
cur_trans = root->fs_info->running_transaction;
@@ -1744,6 +1846,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
+ if (trans->qgroup_reserved) {
+ btrfs_qgroup_free(root, trans->qgroup_reserved);
+ trans->qgroup_reserved = 0;
+ }
btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
// WARN_ON(1);
if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0e8aa1e6c287..3c8e0d25c8e4 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -43,6 +43,7 @@ struct btrfs_transaction {
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
+ struct list_head ordered_operations;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
@@ -68,6 +69,7 @@ struct btrfs_trans_handle {
struct btrfs_block_rsv *orig_rsv;
short aborted;
short adding_csums;
+ bool allocating_chunk;
enum btrfs_trans_type type;
/*
* this root is only needed to validate that the root passed to
@@ -82,11 +84,13 @@ struct btrfs_trans_handle {
struct btrfs_pending_snapshot {
struct dentry *dentry;
+ struct inode *dir;
struct btrfs_root *root;
struct btrfs_root *snap;
struct btrfs_qgroup_inherit *inherit;
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
+ u64 qgroup_reserved;
/* extra metadata reservation for relocation */
int error;
bool readonly;
@@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
+ struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
+int btrfs_defrag_root(struct btrfs_root *root);
int btrfs_clean_old_snapshots(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3b580ee8ab1d..94e05c1f118a 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,13 +23,14 @@
#include "transaction.h"
#include "locking.h"
-/* defrag all the leaves in a given btree. If cache_only == 1, don't read
- * things from disk, otherwise read all the leaves and try to get key order to
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
* better reflect disk order
*/
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int cache_only)
+ struct btrfs_root *root)
{
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u64 last_ret = 0;
u64 min_trans = 0;
- if (cache_only)
- goto out;
-
if (root->fs_info->extent_root == root) {
/*
* there's recursion here right now in the tree locking,
@@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
}
path->keep_locks = 1;
- if (cache_only)
- min_trans = root->defrag_trans_start;
- ret = btrfs_search_forward(root, &key, NULL, path,
- cache_only, min_trans);
+ ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
if (ret < 0)
goto out;
if (ret > 0) {
@@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
path->slots[1] = btrfs_header_nritems(path->nodes[1]);
- next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
min_trans);
ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0,
- cache_only, &last_ret,
+ &last_ret,
&root->defrag_progress);
if (ret) {
WARN_ON(ret == -EAGAIN);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 83186c7e45d4..c7ef569eb22a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log,
struct walk_control *wc, u64 gen)
{
if (wc->pin)
- btrfs_pin_extent_for_log_replay(wc->trans,
- log->fs_info->extent_root,
+ btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
eb->start, eb->len);
if (btrfs_buffer_uptodate(eb, gen, 0)) {
@@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *key)
{
int found_type;
- u64 mask = root->sectorsize - 1;
u64 extent_end;
u64 start = key->offset;
u64 saved_nbytes;
@@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
extent_end = start + btrfs_file_extent_num_bytes(eb, item);
else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size = btrfs_file_extent_inline_len(eb, item);
- extent_end = (start + size + mask) & ~mask;
+ extent_end = ALIGN(start + size, root->sectorsize);
} else {
ret = 0;
goto out;
@@ -2281,6 +2279,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
unsigned long log_transid = 0;
mutex_lock(&root->log_mutex);
+ log_transid = root->log_transid;
index1 = root->log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
wait_log_commit(trans, root, root->log_transid);
@@ -2308,11 +2307,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* bail out if we need to do a full commit */
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
ret = -EAGAIN;
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
goto out;
}
- log_transid = root->log_transid;
if (log_transid % 2 == 0)
mark = EXTENT_DIRTY;
else
@@ -2324,6 +2323,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -2363,6 +2363,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
root->fs_info->last_trans_log_full_commit = trans->transid;
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out;
@@ -2373,6 +2374,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = 0;
goto out;
@@ -2392,6 +2394,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
@@ -2402,10 +2405,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_wait_logged_extents(log, log_transid);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
log_root_tree->node->start);
@@ -2461,8 +2466,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
.process_func = process_one_buffer
};
- ret = walk_log_tree(trans, log, &wc);
- BUG_ON(ret);
+ if (trans) {
+ ret = walk_log_tree(trans, log, &wc);
+ BUG_ON(ret);
+ }
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -2475,6 +2482,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
}
+ /*
+ * We may have short-circuited the log tree with the full commit logic
+ * and left ordered extents on our list, so clear these out to keep us
+ * from leaking inodes and memory.
+ */
+ btrfs_free_logged_extents(log, 0);
+ btrfs_free_logged_extents(log, 1);
+
free_extent_buffer(log->node);
kfree(log);
}
@@ -2724,7 +2739,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
path->keep_locks = 1;
ret = btrfs_search_forward(root, &min_key, &max_key,
- path, 0, trans->transid);
+ path, trans->transid);
/*
* we didn't find anything from this transaction, see if there
@@ -3271,16 +3286,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
+ struct btrfs_ordered_extent *ordered;
struct list_head ordered_sums;
struct btrfs_map_token token;
struct btrfs_key key;
- u64 csum_offset = em->mod_start - em->start;
- u64 csum_len = em->mod_len;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
+ u64 csum_offset;
+ u64 csum_len;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
+ int index = log->log_transid % 2;
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+insert:
INIT_LIST_HEAD(&ordered_sums);
btrfs_init_map_token(&token);
key.objectid = btrfs_ino(inode);
@@ -3296,6 +3316,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+
+ /*
+ * If we are overwriting an inline extent with a real one then we need
+ * to just delete the inline extent as it may not be large enough to
+ * have the entire file_extent_item.
+ */
+ if (ret && btrfs_token_file_extent_type(leaf, fi, &token) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ ret = btrfs_del_item(trans, log, path);
+ btrfs_release_path(path);
+ if (ret) {
+ path->really_keep_locks = 0;
+ return ret;
+ }
+ goto insert;
+ }
+
btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
&token);
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3357,6 +3394,97 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (skip_csum)
return 0;
+ if (em->compress_type) {
+ csum_offset = 0;
+ csum_len = block_len;
+ }
+
+ /*
+ * First check and see if our csums are on our outstanding ordered
+ * extents.
+ */
+again:
+ spin_lock_irq(&log->log_extents_lock[index]);
+ list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+ struct btrfs_ordered_sum *sum;
+
+ if (!mod_len)
+ break;
+
+ if (ordered->inode != inode)
+ continue;
+
+ if (ordered->file_offset + ordered->len <= mod_start ||
+ mod_start + mod_len <= ordered->file_offset)
+ continue;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this
+ * ordered extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered->file_offset + ordered->len >=
+ mod_start + mod_len)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered->file_offset + ordered->len <
+ mod_start + mod_len) {
+ mod_len = (mod_start + mod_len) -
+ (ordered->file_offset + ordered->len);
+ mod_start = ordered->file_offset +
+ ordered->len;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
+ &ordered->flags))
+ continue;
+ atomic_inc(&ordered->refs);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+ /*
+ * we've dropped the lock, we must either break or
+ * start over after this.
+ */
+
+ wait_event(ordered->wait, ordered->csum_bytes_left == 0);
+
+ list_for_each_entry(sum, &ordered->list, list) {
+ ret = btrfs_csum_file_blocks(trans, log, sum);
+ if (ret) {
+ btrfs_put_ordered_extent(ordered);
+ goto unlocked;
+ }
+ }
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+
+ }
+ spin_unlock_irq(&log->log_extents_lock[index]);
+unlocked:
+
+ if (!mod_len || ret)
+ return ret;
+
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
+
/* block start is already adjusted for the file extent offset. */
ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
em->block_start + csum_offset,
@@ -3388,6 +3516,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
u64 test_gen;
int ret = 0;
+ int num = 0;
INIT_LIST_HEAD(&extents);
@@ -3396,27 +3525,42 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
list_del_init(&em->list);
+
+ /*
+ * Just an arbitrary number, this can be really CPU intensive
+ * once we start getting a lot of extents, and really once we
+ * have a bunch of extents we just want to commit since it will
+ * be faster.
+ */
+ if (++num > 32768) {
+ list_del_init(&tree->modified_extents);
+ ret = -EFBIG;
+ goto process;
+ }
+
if (em->generation <= test_gen)
continue;
/* Need a ref to keep it from getting evicted from cache */
atomic_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
list_add_tail(&em->list, &extents);
+ num++;
}
list_sort(NULL, &extents, extent_cmp);
+process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
list_del_init(&em->list);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
/*
* If we had an error we just need to delete everybody from our
* private list.
*/
if (ret) {
+ clear_em_logging(tree, em);
free_extent_map(em);
continue;
}
@@ -3424,8 +3568,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
write_unlock(&tree->lock);
ret = log_one_extent(trans, inode, root, em, path);
- free_extent_map(em);
write_lock(&tree->lock);
+ clear_em_logging(tree, em);
+ free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
@@ -3507,6 +3652,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
+ btrfs_get_logged_extents(log, inode);
+
/*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
@@ -3552,7 +3699,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
while (1) {
ins_nr = 0;
ret = btrfs_search_forward(root, &min_key, &max_key,
- path, 0, trans->transid);
+ path, trans->transid);
if (ret != 0)
break;
again:
@@ -3650,6 +3797,8 @@ log_extents:
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
+ if (err)
+ btrfs_free_logged_extents(log, log->log_transid);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
@@ -3816,7 +3965,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
end_trans:
dput(old_parent);
if (ret < 0) {
- WARN_ON(ret != -ENOSPC);
root->fs_info->last_trans_log_full_commit = trans->transid;
ret = 1;
}
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 99be4c138db6..ddc61cad0080 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,7 +5,7 @@
*/
#include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include "ulist.h"
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cce6aa74012..35bb2d4ed29f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
@@ -32,6 +34,7 @@
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
+#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
@@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
new_device->writeable = 0;
new_device->in_fs_metadata = 0;
new_device->can_discard = 0;
+ spin_lock_init(&new_device->io_lock);
list_replace_rcu(&device->dev_list, &new_device->dev_list);
call_rcu(&device->rcu, free_device);
@@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
return ret;
}
+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_super_block *disk_super;
struct block_device *bdev;
- struct buffer_head *bh;
- int ret;
+ struct page *page;
+ void *p;
+ int ret = -EINVAL;
u64 devid;
u64 transid;
u64 total_devices;
+ u64 bytenr;
+ pgoff_t index;
+ /*
+ * we would like to check all the supers, but that would make
+ * a btrfs mount succeed after a mkfs from a different FS.
+ * So, we need to add a special mount option to scan for
+ * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+ */
+ bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
mutex_lock(&uuid_mutex);
- ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
- if (ret)
+
+ bdev = blkdev_get_by_path(path, flags, holder);
+
+ if (IS_ERR(bdev)) {
+ ret = PTR_ERR(bdev);
goto error;
- disk_super = (struct btrfs_super_block *)bh->b_data;
+ }
+
+ /* make sure our super fits in the device */
+ if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+ goto error_bdev_put;
+
+ /* make sure our super fits in the page */
+ if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+ goto error_bdev_put;
+
+ /* make sure our super doesn't straddle pages on disk */
+ index = bytenr >> PAGE_CACHE_SHIFT;
+ if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+ goto error_bdev_put;
+
+ /* pull in the page with our super */
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ index, GFP_NOFS);
+
+ if (IS_ERR_OR_NULL(page))
+ goto error_bdev_put;
+
+ p = kmap(page);
+
+ /* align our pointer to the offset of the super block */
+ disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
+ disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
+ goto error_unmap;
+
devid = btrfs_stack_device_id(&disk_super->dev_item);
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
+
if (disk_super->label[0]) {
if (disk_super->label[BTRFS_LABEL_SIZE - 1])
disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
@@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
} else {
printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
}
+
printk(KERN_CONT "devid %llu transid %llu %s\n",
(unsigned long long)devid, (unsigned long long)transid, path);
+
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
- brelse(bh);
+
+error_unmap:
+ kunmap(page);
+ page_cache_release(page);
+
+error_bdev_put:
blkdev_put(bdev, flags);
error:
mutex_unlock(&uuid_mutex);
@@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
u64 devid;
u64 num_devices;
u8 *dev_uuid;
+ unsigned seq;
int ret = 0;
bool clear_super = false;
mutex_lock(&uuid_mutex);
- all_avail = root->fs_info->avail_data_alloc_bits |
- root->fs_info->avail_system_alloc_bits |
- root->fs_info->avail_metadata_alloc_bits;
+ do {
+ seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+ all_avail = root->fs_info->avail_data_alloc_bits |
+ root->fs_info->avail_system_alloc_bits |
+ root->fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&root->fs_info->profiles_lock, seq));
num_devices = root->fs_info->fs_devices->num_devices;
btrfs_dev_replace_lock(&root->fs_info->dev_replace);
@@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
goto out;
}
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+ root->fs_info->fs_devices->rw_devices <= 2) {
+ printk(KERN_ERR "btrfs: unable to go below two "
+ "devices on raid5\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+ root->fs_info->fs_devices->rw_devices <= 3) {
+ printk(KERN_ERR "btrfs: unable to go below three "
+ "devices on raid6\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
@@ -1431,7 +1511,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
} else {
ret = btrfs_get_bdev_and_sb(device_path,
- FMODE_READ | FMODE_EXCL,
+ FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder, 0,
&bdev, &bh);
if (ret)
@@ -1556,7 +1636,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
ret = 0;
/* Notify udev that device has changed */
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+ if (bdev)
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
error_brelse:
brelse(bh);
@@ -2614,7 +2695,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
- user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+ if (bargs->usage == 0)
+ user_thresh = 1;
+ else if (bargs->usage > 100)
+ user_thresh = cache->key.offset;
+ else
+ user_thresh = div_factor_fine(cache->key.offset,
+ bargs->usage);
+
if (chunk_used < user_thresh)
ret = 0;
@@ -2656,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
return 0;
if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
- factor = 2;
- else
- factor = 1;
- factor = num_stripes / factor;
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+ factor = num_stripes / 2;
+ } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+ factor = num_stripes - 1;
+ } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+ factor = num_stripes - 2;
+ } else {
+ factor = num_stripes;
+ }
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
@@ -2959,6 +3051,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
BUG_ON(ret);
+
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@ -2975,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
int mixed = 0;
int ret;
u64 num_devices;
+ unsigned seq;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
@@ -3017,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
else
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10);
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6);
if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3057,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
/* allow to reduce meta or sys integrity only if force set */
allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10;
- if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (fs_info->avail_system_alloc_bits & allowed) &&
- !(bctl->sys.target & allowed)) ||
- ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (fs_info->avail_metadata_alloc_bits & allowed) &&
- !(bctl->meta.target & allowed))) {
- if (bctl->flags & BTRFS_BALANCE_FORCE) {
- printk(KERN_INFO "btrfs: force reducing metadata "
- "integrity\n");
- } else {
- printk(KERN_ERR "btrfs: balance will reduce metadata "
- "integrity, use force if you want this\n");
- ret = -EINVAL;
- goto out;
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6;
+ do {
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_system_alloc_bits & allowed) &&
+ !(bctl->sys.target & allowed)) ||
+ ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_metadata_alloc_bits & allowed) &&
+ !(bctl->meta.target & allowed))) {
+ if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ printk(KERN_INFO "btrfs: force reducing metadata "
+ "integrity\n");
+ } else {
+ printk(KERN_ERR "btrfs: balance will reduce metadata "
+ "integrity, use force if you want this\n");
+ ret = -EINVAL;
+ goto out;
+ }
}
- }
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
int num_tolerated_disk_barrier_failures;
@@ -3117,29 +3220,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
mutex_lock(&fs_info->balance_mutex);
atomic_dec(&fs_info->balance_running);
- if (bargs) {
- memset(bargs, 0, sizeof(*bargs));
- update_ioctl_balance_args(fs_info, 0, bargs);
- }
-
- if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
- balance_need_close(fs_info)) {
- __cancel_balance(fs_info);
- }
-
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
}
+ if (bargs) {
+ memset(bargs, 0, sizeof(*bargs));
+ update_ioctl_balance_args(fs_info, 0, bargs);
+ }
+
wake_up(&fs_info->balance_wait_q);
return ret;
out:
if (bctl->flags & BTRFS_BALANCE_RESUME)
__cancel_balance(fs_info);
- else
+ else {
kfree(bctl);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ }
return ret;
}
@@ -3156,7 +3256,6 @@ static int balance_kthread(void *data)
ret = btrfs_balance(fs_info->balance_ctl, NULL);
}
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
@@ -3179,7 +3278,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
return 0;
}
- WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
if (IS_ERR(tsk))
return PTR_ERR(tsk);
@@ -3233,6 +3331,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+ WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
@@ -3492,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
}
struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
- { 2, 1, 0, 4, 2, 2 /* raid10 */ },
- { 1, 1, 2, 2, 2, 2 /* raid1 */ },
- { 1, 2, 1, 1, 1, 2 /* dup */ },
- { 1, 1, 0, 2, 1, 1 /* raid0 */ },
- { 1, 1, 0, 1, 1, 1 /* single */ },
+ [BTRFS_RAID_RAID10] = {
+ .sub_stripes = 2,
+ .dev_stripes = 1,
+ .devs_max = 0, /* 0 == as many as possible */
+ .devs_min = 4,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID1] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 2,
+ .devs_min = 2,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_DUP] = {
+ .sub_stripes = 1,
+ .dev_stripes = 2,
+ .devs_max = 1,
+ .devs_min = 1,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID0] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_SINGLE] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 1,
+ .devs_min = 1,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_RAID5] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID6] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 3,
+ .devs_increment = 1,
+ .ncopies = 3,
+ },
};
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+ /* TODO allow them to set a preferred stripe size */
+ return 64 * 1024;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+ u64 features;
+
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+ return;
+
+ features = btrfs_super_incompat_flags(info->super_copy);
+ if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
+ return;
+
+ features |= BTRFS_FEATURE_INCOMPAT_RAID56;
+ btrfs_set_super_incompat_flags(info->super_copy, features);
+ printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
+}
+
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
struct map_lookup **map_ret,
@@ -3514,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_device_info *devices_info = NULL;
u64 total_avail;
int num_stripes; /* total number of stripes to allocate */
+ int data_stripes; /* number of stripes that count for
+ block group size */
int sub_stripes; /* sub_stripes info for map */
int dev_stripes; /* stripes per dev */
int devs_max; /* max devs to use */
@@ -3525,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
u64 max_chunk_size;
u64 stripe_size;
u64 num_bytes;
+ u64 raid_stripe_len = BTRFS_STRIPE_LEN;
int ndevs;
int i;
int j;
@@ -3619,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
continue;
+ if (ndevs == fs_devices->rw_devices) {
+ WARN(1, "%s: found more than %llu devices\n",
+ __func__, fs_devices->rw_devices);
+ break;
+ }
devices_info[ndevs].dev_offset = dev_offset;
devices_info[ndevs].max_avail = max_avail;
devices_info[ndevs].total_avail = total_avail;
devices_info[ndevs].dev = device;
++ndevs;
- WARN_ON(ndevs > fs_devices->rw_devices);
}
/*
@@ -3650,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
stripe_size = devices_info[ndevs-1].max_avail;
num_stripes = ndevs * dev_stripes;
- if (stripe_size * ndevs > max_chunk_size * ncopies) {
- stripe_size = max_chunk_size * ncopies;
- do_div(stripe_size, ndevs);
+ /*
+ * this will have to be fixed for RAID1 and RAID10 over
+ * more drives
+ */
+ data_stripes = num_stripes / ncopies;
+
+ if (type & BTRFS_BLOCK_GROUP_RAID5) {
+ raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+ btrfs_super_stripesize(info->super_copy));
+ data_stripes = num_stripes - 1;
+ }
+ if (type & BTRFS_BLOCK_GROUP_RAID6) {
+ raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+ btrfs_super_stripesize(info->super_copy));
+ data_stripes = num_stripes - 2;
+ }
+
+ /*
+ * Use the number of data stripes to figure out how big this chunk
+ * is really going to be in terms of logical address space,
+ * and compare that answer with the max chunk size
+ */
+ if (stripe_size * data_stripes > max_chunk_size) {
+ u64 mask = (1ULL << 24) - 1;
+ stripe_size = max_chunk_size;
+ do_div(stripe_size, data_stripes);
+
+ /* bump the answer up to a 16MB boundary */
+ stripe_size = (stripe_size + mask) & ~mask;
+
+ /* but don't go higher than the limits we found
+ * while searching for free extents
+ */
+ if (stripe_size > devices_info[ndevs-1].max_avail)
+ stripe_size = devices_info[ndevs-1].max_avail;
}
do_div(stripe_size, dev_stripes);
/* align to BTRFS_STRIPE_LEN */
- do_div(stripe_size, BTRFS_STRIPE_LEN);
- stripe_size *= BTRFS_STRIPE_LEN;
+ do_div(stripe_size, raid_stripe_len);
+ stripe_size *= raid_stripe_len;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
@@ -3677,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
}
}
map->sector_size = extent_root->sectorsize;
- map->stripe_len = BTRFS_STRIPE_LEN;
- map->io_align = BTRFS_STRIPE_LEN;
- map->io_width = BTRFS_STRIPE_LEN;
+ map->stripe_len = raid_stripe_len;
+ map->io_align = raid_stripe_len;
+ map->io_width = raid_stripe_len;
map->type = type;
map->sub_stripes = sub_stripes;
*map_ret = map;
- num_bytes = stripe_size * (num_stripes / ncopies);
+ num_bytes = stripe_size * data_stripes;
*stripe_size_out = stripe_size;
*num_bytes_out = num_bytes;
@@ -3706,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- free_extent_map(em);
- if (ret)
- goto error;
-
- ret = btrfs_make_block_group(trans, extent_root, 0, type,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- start, num_bytes);
- if (ret)
+ if (ret) {
+ free_extent_map(em);
goto error;
+ }
for (i = 0; i < map->num_stripes; ++i) {
struct btrfs_device *device;
@@ -3727,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
info->chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, dev_offset, stripe_size);
- if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
- goto error;
- }
+ if (ret)
+ goto error_dev_extent;
+ }
+
+ ret = btrfs_make_block_group(trans, extent_root, 0, type,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ start, num_bytes);
+ if (ret) {
+ i = map->num_stripes - 1;
+ goto error_dev_extent;
}
+ free_extent_map(em);
+ check_raid56_incompat_flag(extent_root->fs_info, type);
+
kfree(devices_info);
return 0;
+error_dev_extent:
+ for (; i >= 0; i--) {
+ struct btrfs_device *device;
+ int err;
+
+ device = map->stripes[i].dev;
+ err = btrfs_free_dev_extent(trans, device, start);
+ if (err) {
+ btrfs_abort_transaction(trans, extent_root, err);
+ break;
+ }
+ }
+ write_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ /* One for our allocation */
+ free_extent_map(em);
+ /* One for the tree reference */
+ free_extent_map(em);
error:
kfree(map);
kfree(devices_info);
@@ -3875,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
- fs_info->avail_metadata_alloc_bits;
- alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+ alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
&stripe_size, chunk_offset, alloc_profile);
if (ret)
@@ -3886,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
sys_chunk_offset = chunk_offset + chunk_size;
- alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
- fs_info->avail_system_alloc_bits;
- alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+ alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
&sys_chunk_size, &sys_stripe_size,
sys_chunk_offset, alloc_profile);
@@ -4002,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
ret = map->sub_stripes;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ ret = 2;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ ret = 3;
else
ret = 1;
free_extent_map(em);
@@ -4014,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return ret;
}
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+ struct btrfs_mapping_tree *map_tree,
+ u64 logical)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ unsigned long len = root->sectorsize;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ read_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ len = map->stripe_len * nr_data_stripes(map);
+ }
+ free_extent_map(em);
+ return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+ u64 logical, u64 len, int mirror_num)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ int ret = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ read_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6))
+ ret = 1;
+ free_extent_map(em);
+ return ret;
+}
+
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first, int num,
int optimal, int dev_replace_is_ongoing)
@@ -4051,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
return optimal;
}
+static inline int parity_smaller(u64 a, u64 b)
+{
+ return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+{
+ struct btrfs_bio_stripe s;
+ int i;
+ u64 l;
+ int again = 1;
+
+ while (again) {
+ again = 0;
+ for (i = 0; i < bbio->num_stripes - 1; i++) {
+ if (parity_smaller(raid_map[i], raid_map[i+1])) {
+ s = bbio->stripes[i];
+ l = raid_map[i];
+ bbio->stripes[i] = bbio->stripes[i+1];
+ raid_map[i] = raid_map[i+1];
+ bbio->stripes[i+1] = s;
+ raid_map[i+1] = l;
+ again = 1;
+ }
+ }
+ }
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
- int mirror_num)
+ int mirror_num, u64 **raid_map_ret)
{
struct extent_map *em;
struct map_lookup *map;
@@ -4066,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 stripe_nr;
u64 stripe_nr_orig;
u64 stripe_nr_end;
+ u64 stripe_len;
+ u64 *raid_map = NULL;
int stripe_index;
int i;
int ret = 0;
@@ -4077,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
int num_alloc_stripes;
int patch_the_first_stripe_for_dev_replace = 0;
u64 physical_to_patch_in_first_stripe = 0;
+ u64 raid56_full_stripe_start = (u64)-1;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4093,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
map = (struct map_lookup *)em->bdev;
offset = logical - em->start;
+ if (mirror_num > map->num_stripes)
+ mirror_num = 0;
+
+ stripe_len = map->stripe_len;
stripe_nr = offset;
/*
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
- do_div(stripe_nr, map->stripe_len);
+ do_div(stripe_nr, stripe_len);
- stripe_offset = stripe_nr * map->stripe_len;
+ stripe_offset = stripe_nr * stripe_len;
BUG_ON(offset < stripe_offset);
/* stripe_offset is the offset of this block in its stripe*/
stripe_offset = offset - stripe_offset;
- if (rw & REQ_DISCARD)
+ /* if we're here for raid56, we need to know the stripe aligned start */
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+ raid56_full_stripe_start = offset;
+
+ /* allow a write of a full stripe, but make sure we don't
+ * allow straddling of stripes
+ */
+ do_div(raid56_full_stripe_start, full_stripe_len);
+ raid56_full_stripe_start *= full_stripe_len;
+ }
+
+ if (rw & REQ_DISCARD) {
+ /* we don't discard raid56 yet */
+ if (map->type &
+ (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
*length = min_t(u64, em->len - offset, *length);
- else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
- /* we limit the length of each bio to what fits in a stripe */
- *length = min_t(u64, em->len - offset,
- map->stripe_len - stripe_offset);
+ } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ u64 max_len;
+ /* For writes to RAID[56], allow a full stripeset across all disks.
+ For other RAID types and for RAID[56] reads, just allow a single
+ stripe (on a single disk). */
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+ (rw & REQ_WRITE)) {
+ max_len = stripe_len * nr_data_stripes(map) -
+ (offset - raid56_full_stripe_start);
+ } else {
+ /* we limit the length of each bio to what fits in a stripe */
+ max_len = stripe_len - stripe_offset;
+ }
+ *length = min_t(u64, em->len - offset, max_len);
} else {
*length = em->len - offset;
}
+ /* This is for when we're called from btrfs_merge_bio_hook() and all
+ it cares about is the length */
if (!bbio_ret)
goto out;
@@ -4148,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 physical_of_found = 0;
ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
- logical, &tmp_length, &tmp_bbio, 0);
+ logical, &tmp_length, &tmp_bbio, 0, NULL);
if (ret) {
WARN_ON(tmp_bbio != NULL);
goto out;
@@ -4209,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
num_stripes = 1;
stripe_index = 0;
stripe_nr_orig = stripe_nr;
- stripe_nr_end = (offset + *length + map->stripe_len - 1) &
- (~(map->stripe_len - 1));
+ stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
do_div(stripe_nr_end, map->stripe_len);
stripe_end_offset = stripe_nr_end * map->stripe_len -
(offset + *length);
+
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
if (rw & REQ_DISCARD)
num_stripes = min_t(u64, map->num_stripes,
@@ -4264,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
+
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ u64 tmp;
+
+ if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+ && raid_map_ret) {
+ int i, rot;
+
+ /* push stripe_nr back to the start of the full stripe */
+ stripe_nr = raid56_full_stripe_start;
+ do_div(stripe_nr, stripe_len);
+
+ stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+ /* RAID[56] write or recovery. Return all stripes */
+ num_stripes = map->num_stripes;
+ max_errors = nr_parity_stripes(map);
+
+ raid_map = kmalloc(sizeof(u64) * num_stripes,
+ GFP_NOFS);
+ if (!raid_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Work out the disk rotation on this stripe-set */
+ tmp = stripe_nr;
+ rot = do_div(tmp, num_stripes);
+
+ /* Fill in the logical address of each stripe */
+ tmp = stripe_nr * nr_data_stripes(map);
+ for (i = 0; i < nr_data_stripes(map); i++)
+ raid_map[(i+rot) % num_stripes] =
+ em->start + (tmp + i) * map->stripe_len;
+
+ raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ raid_map[(i+rot+1) % num_stripes] =
+ RAID6_Q_STRIPE;
+
+ *length = map->stripe_len;
+ stripe_index = 0;
+ stripe_offset = 0;
+ } else {
+ /*
+ * Mirror #0 or #1 means the original data block.
+ * Mirror #2 is RAID5 parity block.
+ * Mirror #3 is RAID6 Q block.
+ */
+ stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+ if (mirror_num > 1)
+ stripe_index = nr_data_stripes(map) +
+ mirror_num - 2;
+
+ /* We distribute the parity blocks across stripes */
+ tmp = stripe_nr + stripe_index;
+ stripe_index = do_div(tmp, map->num_stripes);
+ }
} else {
/*
* after this do_div call, stripe_nr is the number of stripes
@@ -4372,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_DUP)) {
max_errors = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+ max_errors = 2;
}
}
@@ -4474,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
bbio->mirror_num = map->num_stripes + 1;
}
+ if (raid_map) {
+ sort_parity_stripes(bbio, raid_map);
+ *raid_map_ret = raid_map;
+ }
out:
if (dev_replace_is_ongoing)
btrfs_dev_replace_unlock(dev_replace);
@@ -4486,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
struct btrfs_bio **bbio_ret, int mirror_num)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
- mirror_num);
+ mirror_num, NULL);
}
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4500,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 bytenr;
u64 length;
u64 stripe_nr;
+ u64 rmap_len;
int i, j, nr = 0;
read_lock(&em_tree->lock);
@@ -4510,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
map = (struct map_lookup *)em->bdev;
length = em->len;
+ rmap_len = map->stripe_len;
+
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
do_div(length, map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
do_div(length, map->num_stripes);
+ else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ do_div(length, nr_data_stripes(map));
+ rmap_len = map->stripe_len * nr_data_stripes(map);
+ }
buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
BUG_ON(!buf); /* -ENOMEM */
@@ -4533,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
do_div(stripe_nr, map->sub_stripes);
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
- }
- bytenr = chunk_start + stripe_nr * map->stripe_len;
+ } /* else if RAID[56], multiply by nr_data_stripes().
+ * Alternatively, just use rmap_len below instead of
+ * map->stripe_len */
+
+ bytenr = chunk_start + stripe_nr * rmap_len;
WARN_ON(nr >= map->num_stripes);
for (j = 0; j < nr; j++) {
if (buf[j] == bytenr)
@@ -4548,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
*logical = buf;
*naddrs = nr;
- *stripe_len = map->stripe_len;
+ *stripe_len = rmap_len;
free_extent_map(em);
return 0;
@@ -4622,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
bio->bi_bdev = (struct block_device *)
(unsigned long)bbio->mirror_num;
/* only send an error to the higher layers if it is
- * beyond the tolerance of the multi-bio
+ * beyond the tolerance of the btrfs bio
*/
if (atomic_read(&bbio->error) > bbio->max_errors) {
err = -EIO;
@@ -4656,13 +5079,18 @@ struct async_sched {
* This will add one bio to the pending list for a device and make sure
* the work struct is scheduled.
*/
-static noinline void schedule_bio(struct btrfs_root *root,
+noinline void btrfs_schedule_bio(struct btrfs_root *root,
struct btrfs_device *device,
int rw, struct bio *bio)
{
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
+ if (device->missing || !device->bdev) {
+ bio_endio(bio, -EIO);
+ return;
+ }
+
/* don't bother with additional async steps for reads, right now */
if (!(rw & REQ_WRITE)) {
bio_get(bio);
@@ -4760,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
#endif
bio->bi_bdev = dev->bdev;
if (async)
- schedule_bio(root, dev, rw, bio);
+ btrfs_schedule_bio(root, dev, rw, bio);
else
btrfsic_submit_bio(rw, bio);
}
@@ -4819,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
u64 logical = (u64)bio->bi_sector << 9;
u64 length = 0;
u64 map_length;
+ u64 *raid_map = NULL;
int ret;
int dev_nr = 0;
int total_devs = 1;
@@ -4827,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
length = bio->bi_size;
map_length = length;
- ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
- mirror_num);
- if (ret)
+ ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+ mirror_num, &raid_map);
+ if (ret) /* -ENOMEM */
return ret;
total_devs = bbio->num_stripes;
+ bbio->orig_bio = first_bio;
+ bbio->private = first_bio->bi_private;
+ bbio->end_io = first_bio->bi_end_io;
+ atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+
+ if (raid_map) {
+ /* In this case, map_length has been set to the length of
+ a single stripe; not the whole write */
+ if (rw & WRITE) {
+ return raid56_parity_write(root, bio, bbio,
+ raid_map, map_length);
+ } else {
+ return raid56_parity_recover(root, bio, bbio,
+ raid_map, map_length,
+ mirror_num);
+ }
+ }
+
if (map_length < length) {
printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
"len %llu\n", (unsigned long long)logical,
@@ -4841,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
BUG();
}
- bbio->orig_bio = first_bio;
- bbio->private = first_bio->bi_private;
- bbio->end_io = first_bio->bi_end_io;
- atomic_set(&bbio->stripes_pending, bbio->num_stripes);
-
while (dev_nr < total_devs) {
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d3c3939ac751..062d8604d35b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,8 +21,8 @@
#include <linux/bio.h>
#include <linux/sort.h>
+#include <linux/btrfs.h>
#include "async-thread.h"
-#include "ioctl.h"
#define BTRFS_STRIPE_LEN (64 * 1024)
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
int btrfs_scratch_superblock(struct btrfs_device *device);
-
+void btrfs_schedule_bio(struct btrfs_root *root,
+ struct btrfs_device *device,
+ int rw, struct bio *bio);
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+ u64 logical, u64 len, int mirror_num);
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+ struct btrfs_mapping_tree *map_tree,
+ u64 logical);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
diff --git a/fs/buffer.c b/fs/buffer.c
index c017a2dfb909..b4dcb34c9635 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -53,6 +54,13 @@ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
}
EXPORT_SYMBOL(init_buffer);
+inline void touch_buffer(struct buffer_head *bh)
+{
+ trace_block_touch_buffer(bh);
+ mark_page_accessed(bh->b_page);
+}
+EXPORT_SYMBOL(touch_buffer);
+
static int sleep_on_buffer(void *word)
{
io_schedule();
@@ -1113,6 +1121,8 @@ void mark_buffer_dirty(struct buffer_head *bh)
{
WARN_ON_ONCE(!buffer_uptodate(bh));
+ trace_block_dirty_buffer(bh);
+
/*
* Very *carefully* optimize the it-is-already-dirty case.
*
@@ -2332,7 +2342,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
{
struct page *page = vmf->page;
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
unsigned long end;
loff_t size;
int ret;
@@ -2359,7 +2369,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
if (unlikely(ret < 0))
goto out_unlock;
set_page_dirty(page);
- wait_on_page_writeback(page);
+ wait_for_stable_page(page);
return 0;
out_unlock:
unlock_page(page);
@@ -2371,7 +2381,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
{
int ret;
- struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
sb_start_pagefault(sb);
@@ -2935,6 +2945,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
void *kaddr = kmap_atomic(bh->b_page);
memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
kunmap_atomic(kaddr);
+ flush_dcache_page(bh->b_page);
}
}
@@ -3226,7 +3237,7 @@ static struct kmem_cache *bh_cachep __read_mostly;
* Once the number of bh's in the machine exceeds this level, we start
* stripping them in writeback.
*/
-static int max_buffer_heads;
+static unsigned long max_buffer_heads;
int buffer_heads_over_limit;
@@ -3342,7 +3353,7 @@ EXPORT_SYMBOL(bh_submit_read);
void __init buffer_init(void)
{
- int nrpages;
+ unsigned long nrpages;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 67bef6d01484..746ce532e130 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -41,12 +41,12 @@ static struct fscache_object *cachefiles_alloc_object(
_enter("{%s},%p,", cache->cache.identifier, cookie);
- lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
+ lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp);
if (!lookup_data)
goto nomem_lookup_data;
/* create a new object record and a temporary leaf image */
- object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+ object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp);
if (!object)
goto nomem_object;
@@ -63,7 +63,7 @@ static struct fscache_object *cachefiles_alloc_object(
* - stick the length on the front and leave space on the back for the
* encoder
*/
- buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
+ buffer = kmalloc((2 + 512) + 3, cachefiles_gfp);
if (!buffer)
goto nomem_buffer;
@@ -219,7 +219,7 @@ static void cachefiles_update_object(struct fscache_object *_object)
return;
}
- auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
+ auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
if (!auxdata) {
_leave(" [nomem]");
return;
@@ -441,6 +441,54 @@ truncate_failed:
}
/*
+ * Invalidate an object
+ */
+static void cachefiles_invalidate_object(struct fscache_operation *op)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ const struct cred *saved_cred;
+ struct path path;
+ uint64_t ni_size;
+ int ret;
+
+ object = container_of(op->object, struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ op->object->cookie->def->get_attr(op->object->cookie->netfs_data,
+ &ni_size);
+
+ _enter("{OBJ%x},[%llu]",
+ op->object->debug_id, (unsigned long long)ni_size);
+
+ if (object->backer) {
+ ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+ fscache_set_store_limit(&object->fscache, ni_size);
+
+ path.dentry = object->backer;
+ path.mnt = cache->mnt;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = vfs_truncate(&path, 0);
+ if (ret == 0)
+ ret = vfs_truncate(&path, ni_size);
+ cachefiles_end_secure(cache, saved_cred);
+
+ if (ret != 0) {
+ fscache_set_store_limit(&object->fscache, 0);
+ if (ret == -EIO)
+ cachefiles_io_error_obj(object,
+ "Invalidate failed");
+ }
+ }
+
+ fscache_op_complete(op, true);
+ _leave("");
+}
+
+/*
* dissociate a cache from all the pages it was backing
*/
static void cachefiles_dissociate_pages(struct fscache_cache *cache)
@@ -455,6 +503,7 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
.lookup_complete = cachefiles_lookup_complete,
.grab_object = cachefiles_grab_object,
.update_object = cachefiles_update_object,
+ .invalidate_object = cachefiles_invalidate_object,
.drop_object = cachefiles_drop_object,
.put_object = cachefiles_put_object,
.sync_cache = cachefiles_sync_cache,
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index bd6bc1bde2d7..49382519907a 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -23,6 +23,8 @@ extern unsigned cachefiles_debug;
#define CACHEFILES_DEBUG_KLEAVE 2
#define CACHEFILES_DEBUG_KDEBUG 4
+#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+
/*
* node records
*/
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index 81b8b2b3a674..33b58c60f2d1 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -78,7 +78,7 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
_debug("max: %d", max);
- key = kmalloc(max, GFP_KERNEL);
+ key = kmalloc(max, cachefiles_gfp);
if (!key)
return NULL;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index b0b5f7cdfffa..8c01c5fcdf75 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -40,8 +40,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
prefix, fscache_object_states[object->fscache.state],
object->fscache.flags, work_busy(&object->fscache.work),
- object->fscache.events,
- object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
+ object->fscache.events, object->fscache.event_mask);
printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
prefix, object->fscache.n_ops, object->fscache.n_in_progress,
object->fscache.n_exclusive);
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index c994691d9445..480992259707 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -77,25 +77,25 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
struct page *backpage = monitor->back_page, *backpage2;
int ret;
- kenter("{ino=%lx},{%lx,%lx}",
+ _enter("{ino=%lx},{%lx,%lx}",
object->backer->d_inode->i_ino,
backpage->index, backpage->flags);
/* skip if the page was truncated away completely */
if (backpage->mapping != bmapping) {
- kleave(" = -ENODATA [mapping]");
+ _leave(" = -ENODATA [mapping]");
return -ENODATA;
}
backpage2 = find_get_page(bmapping, backpage->index);
if (!backpage2) {
- kleave(" = -ENODATA [gone]");
+ _leave(" = -ENODATA [gone]");
return -ENODATA;
}
if (backpage != backpage2) {
put_page(backpage2);
- kleave(" = -ENODATA [different]");
+ _leave(" = -ENODATA [different]");
return -ENODATA;
}
@@ -114,7 +114,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
if (PageUptodate(backpage))
goto unlock_discard;
- kdebug("reissue read");
+ _debug("reissue read");
ret = bmapping->a_ops->readpage(NULL, backpage);
if (ret < 0)
goto unlock_discard;
@@ -129,7 +129,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
}
/* it'll reappear on the todo list */
- kleave(" = -EINPROGRESS");
+ _leave(" = -EINPROGRESS");
return -EINPROGRESS;
unlock_discard:
@@ -137,7 +137,7 @@ unlock_discard:
spin_lock_irq(&object->work_lock);
list_del(&monitor->op_link);
spin_unlock_irq(&object->work_lock);
- kleave(" = %d", ret);
+ _leave(" = %d", ret);
return ret;
}
@@ -174,11 +174,13 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
_debug("- copy {%lu}", monitor->back_page->index);
recheck:
- if (PageUptodate(monitor->back_page)) {
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING,
+ &object->fscache.cookie->flags)) {
+ error = -ESTALE;
+ } else if (PageUptodate(monitor->back_page)) {
copy_highpage(monitor->netfs_page, monitor->back_page);
-
- pagevec_add(&pagevec, monitor->netfs_page);
- fscache_mark_pages_cached(monitor->op, &pagevec);
+ fscache_mark_page_cached(monitor->op,
+ monitor->netfs_page);
error = 0;
} else if (!PageError(monitor->back_page)) {
/* the page has probably been truncated */
@@ -198,6 +200,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
fscache_end_io(op, monitor->netfs_page, error);
page_cache_release(monitor->netfs_page);
+ fscache_retrieval_complete(op, 1);
fscache_put_retrieval(op);
kfree(monitor);
@@ -239,7 +242,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
_debug("read back %p{%lu,%d}",
netpage, netpage->index, page_count(netpage));
- monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+ monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
if (!monitor)
goto nomem;
@@ -258,13 +261,14 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
goto backing_page_already_present;
if (!newpage) {
- newpage = page_cache_alloc_cold(bmapping);
+ newpage = __page_cache_alloc(cachefiles_gfp |
+ __GFP_COLD);
if (!newpage)
goto nomem_monitor;
}
ret = add_to_page_cache(newpage, bmapping,
- netpage->index, GFP_KERNEL);
+ netpage->index, cachefiles_gfp);
if (ret == 0)
goto installed_new_backing_page;
if (ret != -EEXIST)
@@ -335,11 +339,11 @@ backing_page_already_present:
backing_page_already_uptodate:
_debug("- uptodate");
- pagevec_add(pagevec, netpage);
- fscache_mark_pages_cached(op, pagevec);
+ fscache_mark_page_cached(op, netpage);
copy_highpage(netpage, backpage);
fscache_end_io(op, netpage, 0);
+ fscache_retrieval_complete(op, 1);
success:
_debug("success");
@@ -357,10 +361,13 @@ out:
read_error:
_debug("read error %d", ret);
- if (ret == -ENOMEM)
+ if (ret == -ENOMEM) {
+ fscache_retrieval_complete(op, 1);
goto out;
+ }
io_error:
cachefiles_io_error_obj(object, "Page read error on backing file");
+ fscache_retrieval_complete(op, 1);
ret = -ENOBUFS;
goto out;
@@ -370,6 +377,7 @@ nomem_monitor:
fscache_put_retrieval(monitor->op);
kfree(monitor);
nomem:
+ fscache_retrieval_complete(op, 1);
_leave(" = -ENOMEM");
return -ENOMEM;
}
@@ -408,7 +416,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
_enter("{%p},{%lx},,,", object, page->index);
if (!object->backer)
- return -ENOBUFS;
+ goto enobufs;
inode = object->backer->d_inode;
ASSERT(S_ISREG(inode->i_mode));
@@ -417,7 +425,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
/* calculate the shift required to use bmap */
if (inode->i_sb->s_blocksize > PAGE_SIZE)
- return -ENOBUFS;
+ goto enobufs;
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
@@ -448,15 +456,20 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
&pagevec);
} else if (cachefiles_has_space(cache, 0, 1) == 0) {
/* there's space in the cache we can use */
- pagevec_add(&pagevec, page);
- fscache_mark_pages_cached(op, &pagevec);
+ fscache_mark_page_cached(op, page);
+ fscache_retrieval_complete(op, 1);
ret = -ENODATA;
} else {
- ret = -ENOBUFS;
+ goto enobufs;
}
_leave(" = %d", ret);
return ret;
+
+enobufs:
+ fscache_retrieval_complete(op, 1);
+ _leave(" = -ENOBUFS");
+ return -ENOBUFS;
}
/*
@@ -465,8 +478,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
*/
static int cachefiles_read_backing_file(struct cachefiles_object *object,
struct fscache_retrieval *op,
- struct list_head *list,
- struct pagevec *mark_pvec)
+ struct list_head *list)
{
struct cachefiles_one_read *monitor = NULL;
struct address_space *bmapping = object->backer->d_inode->i_mapping;
@@ -485,7 +497,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage, netpage->index, page_count(netpage));
if (!monitor) {
- monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+ monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
if (!monitor)
goto nomem;
@@ -500,13 +512,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
goto backing_page_already_present;
if (!newpage) {
- newpage = page_cache_alloc_cold(bmapping);
+ newpage = __page_cache_alloc(cachefiles_gfp |
+ __GFP_COLD);
if (!newpage)
goto nomem;
}
ret = add_to_page_cache(newpage, bmapping,
- netpage->index, GFP_KERNEL);
+ netpage->index, cachefiles_gfp);
if (ret == 0)
goto installed_new_backing_page;
if (ret != -EEXIST)
@@ -536,10 +549,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
_debug("- monitor add");
ret = add_to_page_cache(netpage, op->mapping, netpage->index,
- GFP_KERNEL);
+ cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
page_cache_release(netpage);
+ fscache_retrieval_complete(op, 1);
continue;
}
goto nomem;
@@ -612,10 +626,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
_debug("- uptodate");
ret = add_to_page_cache(netpage, op->mapping, netpage->index,
- GFP_KERNEL);
+ cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
page_cache_release(netpage);
+ fscache_retrieval_complete(op, 1);
continue;
}
goto nomem;
@@ -626,16 +641,17 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
page_cache_release(backpage);
backpage = NULL;
- if (!pagevec_add(mark_pvec, netpage))
- fscache_mark_pages_cached(op, mark_pvec);
+ fscache_mark_page_cached(op, netpage);
page_cache_get(netpage);
if (!pagevec_add(&lru_pvec, netpage))
__pagevec_lru_add_file(&lru_pvec);
+ /* the netpage is unlocked and marked up to date here */
fscache_end_io(op, netpage, 0);
page_cache_release(netpage);
netpage = NULL;
+ fscache_retrieval_complete(op, 1);
continue;
}
@@ -661,6 +677,7 @@ out:
list_for_each_entry_safe(netpage, _n, list, lru) {
list_del(&netpage->lru);
page_cache_release(netpage);
+ fscache_retrieval_complete(op, 1);
}
_leave(" = %d", ret);
@@ -669,15 +686,17 @@ out:
nomem:
_debug("nomem");
ret = -ENOMEM;
- goto out;
+ goto record_page_complete;
read_error:
_debug("read error %d", ret);
if (ret == -ENOMEM)
- goto out;
+ goto record_page_complete;
io_error:
cachefiles_io_error_obj(object, "Page read error on backing file");
ret = -ENOBUFS;
+record_page_complete:
+ fscache_retrieval_complete(op, 1);
goto out;
}
@@ -709,7 +728,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
*nr_pages);
if (!object->backer)
- return -ENOBUFS;
+ goto all_enobufs;
space = 1;
if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
@@ -722,7 +741,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
/* calculate the shift required to use bmap */
if (inode->i_sb->s_blocksize > PAGE_SIZE)
- return -ENOBUFS;
+ goto all_enobufs;
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
@@ -762,7 +781,10 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
nrbackpages++;
} else if (space && pagevec_add(&pagevec, page) == 0) {
fscache_mark_pages_cached(op, &pagevec);
+ fscache_retrieval_complete(op, 1);
ret = -ENODATA;
+ } else {
+ fscache_retrieval_complete(op, 1);
}
}
@@ -775,18 +797,18 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
/* submit the apparently valid pages to the backing fs to be read from
* disk */
if (nrbackpages > 0) {
- ret2 = cachefiles_read_backing_file(object, op, &backpages,
- &pagevec);
+ ret2 = cachefiles_read_backing_file(object, op, &backpages);
if (ret2 == -ENOMEM || ret2 == -EINTR)
ret = ret2;
}
- if (pagevec_count(&pagevec) > 0)
- fscache_mark_pages_cached(op, &pagevec);
-
_leave(" = %d [nr=%u%s]",
ret, *nr_pages, list_empty(pages) ? " empty" : "");
return ret;
+
+all_enobufs:
+ fscache_retrieval_complete(op, *nr_pages);
+ return -ENOBUFS;
}
/*
@@ -806,7 +828,6 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
- struct pagevec pagevec;
int ret;
object = container_of(op->op.object,
@@ -817,14 +838,12 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
_enter("%p,{%lx},", object, page->index);
ret = cachefiles_has_space(cache, 0, 1);
- if (ret == 0) {
- pagevec_init(&pagevec, 0);
- pagevec_add(&pagevec, page);
- fscache_mark_pages_cached(op, &pagevec);
- } else {
+ if (ret == 0)
+ fscache_mark_page_cached(op, page);
+ else
ret = -ENOBUFS;
- }
+ fscache_retrieval_complete(op, 1);
_leave(" = %d", ret);
return ret;
}
@@ -874,6 +893,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op,
ret = -ENOBUFS;
}
+ fscache_retrieval_complete(op, *nr_pages);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index e18b183b47e1..73b46288b54b 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -174,7 +174,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
ASSERT(dentry);
ASSERT(dentry->d_inode);
- auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
+ auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
if (!auxbuf) {
_leave(" = -ENOMEM");
return -ENOMEM;
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 9eb134ea6eb2..49bc78243db9 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,6 +1,6 @@
config CEPH_FS
- tristate "Ceph distributed file system (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
+ tristate "Ceph distributed file system"
+ depends on INET
select CEPH_LIB
select LIBCRC32C
select CRYPTO_AES
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6690269f5dde..a60ea977af6f 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -195,7 +195,7 @@ static int ceph_releasepage(struct page *page, gfp_t g)
*/
static int readpage_nounlock(struct file *filp, struct page *page)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
@@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page)
static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{
struct inode *inode = req->r_inode;
- struct ceph_osd_reply_head *replyhead;
- int rc, bytes;
+ int rc = req->r_result;
+ int bytes = le32_to_cpu(msg->hdr.data_len);
int i;
- /* parse reply */
- replyhead = msg->front.iov_base;
- WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
- rc = le32_to_cpu(replyhead->result);
- bytes = le32_to_cpu(msg->hdr.data_len);
-
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
/* unlock all pages, zeroing any data we didn't read */
@@ -267,6 +261,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
kfree(req->r_pages);
}
+static void ceph_unlock_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ unlock_page(pages[i]);
+}
+
/*
* start an async read(ahead) operation. return nr_pages we submitted
* a read for on success, or negative error code.
@@ -307,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
NULL, 0,
ci->i_truncate_seq, ci->i_truncate_size,
- NULL, false, 1, 0);
+ NULL, false, 0);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -347,6 +349,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
return nr_pages;
out_pages:
+ ceph_unlock_page_vector(pages, nr_pages);
ceph_release_page_vector(pages, nr_pages);
out:
ceph_osdc_put_request(req);
@@ -361,7 +364,7 @@ out:
static int ceph_readpages(struct file *file, struct address_space *mapping,
struct list_head *page_list, unsigned nr_pages)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
int rc = 0;
int max = 0;
@@ -483,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
&ci->i_layout, snapc,
page_off, len,
ci->i_truncate_seq, ci->i_truncate_size,
- &inode->i_mtime,
- &page, 1, 0, 0, true);
+ &inode->i_mtime, &page, 1);
if (err < 0) {
dout("writepage setting page/mapping error %d %p\n", err, page);
SetPageError(page);
@@ -545,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req,
struct ceph_msg *msg)
{
struct inode *inode = req->r_inode;
- struct ceph_osd_reply_head *replyhead;
- struct ceph_osd_op *op;
struct ceph_inode_info *ci = ceph_inode(inode);
unsigned wrote;
struct page *page;
int i;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
- __s32 rc = -EIO;
- u64 bytes = 0;
+ int rc = req->r_result;
+ u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
long writeback_stat;
unsigned issued = ceph_caps_issued(ci);
- /* parse reply */
- replyhead = msg->front.iov_base;
- WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
- op = (void *)(replyhead + 1);
- rc = le32_to_cpu(replyhead->result);
- bytes = le64_to_cpu(op->extent.length);
-
if (rc >= 0) {
/*
* Assume we wrote the pages we originally sent. The
@@ -732,8 +725,6 @@ retry:
struct page *page;
int want;
u64 offset, len;
- struct ceph_osd_request_head *reqhead;
- struct ceph_osd_op *op;
long writeback_stat;
next = 0;
@@ -829,7 +820,7 @@ get_more_pages:
snapc, do_sync,
ci->i_truncate_seq,
ci->i_truncate_size,
- &inode->i_mtime, true, 1, 0);
+ &inode->i_mtime, true, 0);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
@@ -897,10 +888,8 @@ get_more_pages:
/* revise final length, page count */
req->r_num_pages = locked_pages;
- reqhead = req->r_request->front.iov_base;
- op = (void *)(reqhead + 1);
- op->extent.length = cpu_to_le64(len);
- op->payload_len = cpu_to_le32(len);
+ req->r_request_ops[0].extent.length = cpu_to_le64(len);
+ req->r_request_ops[0].payload_len = cpu_to_le32(len);
req->r_request->hdr.data_len = cpu_to_le32(len);
rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
@@ -968,7 +957,7 @@ static int ceph_update_writeable_page(struct file *file,
loff_t pos, unsigned len,
struct page *page)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
loff_t page_off = pos & PAGE_CACHE_MASK;
@@ -1077,24 +1066,52 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = file->private_data;
struct page *page;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- int r;
+ int r, want, got = 0;
+
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+
+ dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, len, inode->i_size);
+ r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
+ if (r < 0)
+ return r;
+ dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
+ if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
+ ceph_put_cap_refs(ci, got);
+ return -EAGAIN;
+ }
do {
/* get a page */
page = grab_cache_page_write_begin(mapping, index, 0);
- if (!page)
- return -ENOMEM;
- *pagep = page;
+ if (!page) {
+ r = -ENOMEM;
+ break;
+ }
dout("write_begin file %p inode %p page %p %d~%d\n", file,
inode, page, (int)pos, (int)len);
r = ceph_update_writeable_page(file, pos, len, page);
+ if (r)
+ page_cache_release(page);
} while (r == -EAGAIN);
+ if (r) {
+ ceph_put_cap_refs(ci, got);
+ } else {
+ *pagep = page;
+ *(int *)fsdata = got;
+ }
return r;
}
@@ -1107,11 +1124,13 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
int check_cap = 0;
+ int got = (unsigned long)fsdata;
dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
inode, page, (int)pos, (int)copied, (int)len);
@@ -1134,6 +1153,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
up_read(&mdsc->snap_rwsem);
page_cache_release(page);
+ if (copied > 0) {
+ int dirty;
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
+ ceph_put_cap_refs(ci, got);
+
if (check_cap)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
@@ -1176,7 +1208,7 @@ const struct address_space_operations ceph_aops = {
*/
static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
struct page *page = vmf->page;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
loff_t off = page_offset(page);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3251e9cc6401..78e2f575247d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -236,8 +236,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
if (!ctx) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
if (cap) {
+ spin_lock(&mdsc->caps_list_lock);
mdsc->caps_use_count++;
mdsc->caps_total_count++;
+ spin_unlock(&mdsc->caps_list_lock);
}
return cap;
}
@@ -609,8 +611,16 @@ retry:
if (flags & CEPH_CAP_FLAG_AUTH)
ci->i_auth_cap = cap;
- else if (ci->i_auth_cap == cap)
+ else if (ci->i_auth_cap == cap) {
ci->i_auth_cap = NULL;
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&ci->i_dirty_item)) {
+ dout(" moving %p to cap_dirty_migrating\n", inode);
+ list_move(&ci->i_dirty_item,
+ &mdsc->cap_dirty_migrating);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
@@ -928,7 +938,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
u64 time_warp_seq,
- uid_t uid, gid_t gid, umode_t mode,
+ kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
u64 follows)
@@ -972,8 +982,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
ceph_encode_timespec(&fc->atime, atime);
fc->time_warp_seq = cpu_to_le32(time_warp_seq);
- fc->uid = cpu_to_le32(uid);
- fc->gid = cpu_to_le32(gid);
+ fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
+ fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));
fc->mode = cpu_to_le32(mode);
fc->xattr_version = cpu_to_le64(xattr_version);
@@ -1079,8 +1089,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct timespec mtime, atime;
int wake = 0;
umode_t mode;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
struct ceph_mds_session *session;
u64 xattr_version = 0;
struct ceph_buffer *xattr_blob = NULL;
@@ -1349,11 +1359,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
if (!ci->i_head_snapc)
ci->i_head_snapc = ceph_get_snap_context(
ci->i_snap_realm->cached_context);
- dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
- ci->i_head_snapc);
+ dout(" inode %p now dirty snapc %p auth cap %p\n",
+ &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ if (ci->i_auth_cap)
+ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ else
+ list_add(&ci->i_dirty_item,
+ &mdsc->cap_dirty_migrating);
spin_unlock(&mdsc->cap_dirty_lock);
if (ci->i_flushing_caps == 0) {
ihold(inode);
@@ -1454,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
- int file_wanted, used;
+ int file_wanted, used, cap_used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list
@@ -1557,9 +1571,14 @@ retry_locked:
/* NOTE: no side-effects allowed, until we take s_mutex */
+ cap_used = used;
+ if (ci->i_auth_cap && cap != ci->i_auth_cap)
+ cap_used &= ~ci->i_auth_cap->issued;
+
revoking = cap->implemented & ~cap->issued;
- dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+ dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
cap->mds, cap, ceph_cap_string(cap->issued),
+ ceph_cap_string(cap_used),
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));
@@ -1587,7 +1606,7 @@ retry_locked:
}
/* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & used) == 0) {
+ if (revoking && (revoking & cap_used) == 0) {
dout("completed revocation of %s\n",
ceph_cap_string(cap->implemented & ~cap->issued));
goto ack;
@@ -1664,8 +1683,8 @@ ack:
sent++;
/* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
- retain, flushing, NULL);
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
+ want, retain, flushing, NULL);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@@ -2353,10 +2372,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(grant->mode);
- inode->i_uid = le32_to_cpu(grant->uid);
- inode->i_gid = le32_to_cpu(grant->gid);
+ inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
+ inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- inode->i_uid, inode->i_gid);
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
}
if ((issued & CEPH_CAP_LINK_EXCL) == 0)
@@ -2388,7 +2408,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
&atime);
/* max size increase? */
- if (max_size != ci->i_max_size) {
+ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
ci->i_max_size = max_size;
if (max_size >= ci->i_wanted_max_size) {
@@ -2410,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
dout("mds wanted %s -> %s\n",
ceph_cap_string(le32_to_cpu(grant->wanted)),
ceph_cap_string(wanted));
- grant->wanted = cpu_to_le32(wanted);
+ /* imported cap may not have correct mds_wanted */
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
+ check_caps = 1;
}
cap->seq = seq;
@@ -2745,6 +2767,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
/* make sure we re-request max_size, if necessary */
spin_lock(&ci->i_ceph_lock);
+ ci->i_wanted_max_size = 0; /* reset */
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
}
@@ -2813,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);
+ if (op == CEPH_CAP_OP_IMPORT)
+ ceph_add_cap_releases(mdsc, session);
+
/* lookup ino */
inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode);
@@ -2840,8 +2866,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_IMPORT:
handle_cap_import(mdsc, inode, h, session,
snaptrace, snaptrace_len);
- ceph_check_caps(ceph_inode(inode), 0, session);
- goto done_unlocked;
}
/* the rest require a cap */
@@ -2858,6 +2882,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
switch (op) {
case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT:
+ case CEPH_CAP_OP_IMPORT:
handle_cap_grant(inode, h, session, cap, msg->middle);
goto done_unlocked;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 8c1aabe93b67..6d797f46d772 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -238,7 +238,7 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
struct ceph_file_info *fi = filp->private_data;
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1138,7 +1138,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
loff_t *ppos)
{
struct ceph_file_info *cf = file->private_data;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
int left;
const int bufsize = 1024;
@@ -1188,7 +1188,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct list_head *head = &ci->i_unsafe_dirops;
struct ceph_mds_request *req;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index ca3ab3f9ca70..16796be53ca5 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -81,7 +81,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
if (parent_inode) {
/* nfsd wants connectable */
*max_len = connected_handle_length;
- type = 255;
+ type = FILEID_INVALID;
} else {
dout("encode_fh %p\n", dentry);
fh->ino = ceph_ino(inode);
@@ -90,7 +90,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
}
} else {
*max_len = handle_length;
- type = 255;
+ type = FILEID_INVALID;
}
if (dentry)
dput(dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d4dfdcf76d7f..bf338d9b67e3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req);
+ if (err)
+ goto out_err;
+
err = ceph_handle_snapdir(req, dentry, err);
if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
@@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = finish_no_open(file, dn);
} else {
dout("atomic_open finish_open on dn %p\n", dn);
+ if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+ *opened |= FILE_CREATED;
+ }
err = finish_open(file, dentry, ceph_open, opened);
}
@@ -393,7 +399,7 @@ more:
static ssize_t ceph_sync_read(struct file *file, char __user *data,
unsigned len, loff_t *poff, int *checkeof)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct page **pages;
u64 off = *poff;
int num_pages, ret;
@@ -466,7 +472,7 @@ static void sync_write_commit(struct ceph_osd_request *req,
static ssize_t ceph_sync_write(struct file *file, const char __user *data,
size_t left, loff_t *offset)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
@@ -483,7 +489,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
int ret;
struct timespec mtime = CURRENT_TIME;
- if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
+ if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
dout("sync_write on file %p %lld~%u %s\n", file, *offset,
@@ -535,7 +541,7 @@ more:
ci->i_snap_realm->cached_context,
do_sync,
ci->i_truncate_seq, ci->i_truncate_size,
- &mtime, false, 2, page_align);
+ &mtime, false, page_align);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -637,7 +643,7 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
struct ceph_file_info *fi = filp->private_data;
loff_t *ppos = &iocb->ki_pos;
size_t len = iov->iov_len;
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
void __user *base = iov->iov_base;
ssize_t ret;
@@ -707,68 +713,58 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
{
struct file *file = iocb->ki_filp;
struct ceph_file_info *fi = file->private_data;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
loff_t endoff = pos + iov->iov_len;
- int want, got = 0;
- int ret, err;
+ int got = 0;
+ int ret, err, written;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
retry_snap:
+ written = 0;
if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
return -ENOSPC;
__ceph_do_pending_vmtruncate(inode);
- dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- inode->i_size);
- if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_BUFFER;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
- if (ret < 0)
- goto out_put;
-
- dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- ceph_cap_string(got));
-
- if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
- (fi->flags & CEPH_F_SYNC)) {
- ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
- &iocb->ki_pos);
- } else {
- /*
- * buffered write; drop Fw early to avoid slow
- * revocation if we get stuck on balance_dirty_pages
- */
- int dirty;
-
- spin_lock(&ci->i_ceph_lock);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
- spin_unlock(&ci->i_ceph_lock);
- ceph_put_cap_refs(ci, got);
+ /*
+ * try to do a buffered write. if we don't have sufficient
+ * caps, we'll get -EAGAIN from generic_file_aio_write, or a
+ * short write if we only get caps for some pages.
+ */
+ if (!(iocb->ki_filp->f_flags & O_DIRECT) &&
+ !(inode->i_sb->s_flags & MS_SYNCHRONOUS) &&
+ !(fi->flags & CEPH_F_SYNC)) {
ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ if (ret >= 0)
+ written = ret;
+
if ((ret >= 0 || ret == -EIOCBQUEUED) &&
((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
|| ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
- err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
+ err = vfs_fsync_range(file, pos, pos + written - 1, 1);
if (err < 0)
ret = err;
}
+ if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff)
+ goto out;
+ }
- if (dirty)
- __mark_inode_dirty(inode, dirty);
+ dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos + written,
+ (unsigned)iov->iov_len - written, inode->i_size);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff);
+ if (ret < 0)
goto out;
- }
+ dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos + written,
+ (unsigned)iov->iov_len - written, ceph_cap_string(got));
+ ret = ceph_sync_write(file, iov->iov_base + written,
+ iov->iov_len - written, &iocb->ki_pos);
if (ret >= 0) {
int dirty;
spin_lock(&ci->i_ceph_lock);
@@ -777,13 +773,10 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
}
-
-out_put:
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- ceph_cap_string(got));
+ inode, ceph_vinop(inode), pos + written,
+ (unsigned)iov->iov_len - written, ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
-
out:
if (ret == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ba95eea201bf..851814d951cd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -612,10 +612,11 @@ static int fill_inode(struct inode *inode,
if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(info->mode);
- inode->i_uid = le32_to_cpu(info->uid);
- inode->i_gid = le32_to_cpu(info->gid);
+ inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
+ inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- inode->i_uid, inode->i_gid);
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
}
if ((issued & CEPH_CAP_LINK_EXCL) == 0)
@@ -1130,8 +1131,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
req->r_request_started);
dout(" final dn %p\n", dn);
i++;
- } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
- req->r_op == CEPH_MDS_OP_MKSNAP) {
+ } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+ req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) {
struct dentry *dn = req->r_dentry;
/* fill out a snapdir LOOKUPSNAP dentry */
@@ -1195,6 +1196,39 @@ done:
/*
* Prepopulate our cache with readdir results, leases, etc.
*/
+static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ int i, err = 0;
+
+ for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_vino vino;
+ struct inode *in;
+ int rc;
+
+ vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
+ vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+
+ in = ceph_get_inode(req->r_dentry->d_sb, vino);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ dout("new_inode badness got %d\n", err);
+ continue;
+ }
+ rc = fill_inode(in, &rinfo->dir_in[i], NULL, session,
+ req->r_request_started, -1,
+ &req->r_caps_reservation);
+ if (rc < 0) {
+ pr_err("fill_inode badness on %p got %d\n", in, rc);
+ err = rc;
+ continue;
+ }
+ }
+
+ return err;
+}
+
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
@@ -1209,6 +1243,9 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
u64 frag = le32_to_cpu(rhead->args.readdir.frag);
struct ceph_dentry_info *di;
+ if (req->r_aborted)
+ return readdir_prepopulate_inodes_only(req, session);
+
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
snapdir = ceph_get_snapdir(parent->d_inode);
parent = d_find_alias(snapdir);
@@ -1466,7 +1503,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
u64 to;
- int wrbuffer_refs, wake = 0;
+ int wrbuffer_refs, finish = 0;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1498,15 +1535,18 @@ retry:
truncate_inode_pages(inode->i_mapping, to);
spin_lock(&ci->i_ceph_lock);
- ci->i_truncate_pending--;
- if (ci->i_truncate_pending == 0)
- wake = 1;
+ if (to == ci->i_truncate_size) {
+ ci->i_truncate_pending = 0;
+ finish = 1;
+ }
spin_unlock(&ci->i_ceph_lock);
+ if (!finish)
+ goto retry;
if (wrbuffer_refs == 0)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- if (wake)
- wake_up_all(&ci->i_cap_wq);
+
+ wake_up_all(&ci->i_cap_wq);
}
@@ -1562,26 +1602,30 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (ia_valid & ATTR_UID) {
dout("setattr %p uid %d -> %d\n", inode,
- inode->i_uid, attr->ia_uid);
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kuid(&init_user_ns, attr->ia_uid));
if (issued & CEPH_CAP_AUTH_EXCL) {
inode->i_uid = attr->ia_uid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- attr->ia_uid != inode->i_uid) {
- req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
+ !uid_eq(attr->ia_uid, inode->i_uid)) {
+ req->r_args.setattr.uid = cpu_to_le32(
+ from_kuid(&init_user_ns, attr->ia_uid));
mask |= CEPH_SETATTR_UID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_GID) {
dout("setattr %p gid %d -> %d\n", inode,
- inode->i_gid, attr->ia_gid);
+ from_kgid(&init_user_ns, inode->i_gid),
+ from_kgid(&init_user_ns, attr->ia_gid));
if (issued & CEPH_CAP_AUTH_EXCL) {
inode->i_gid = attr->ia_gid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- attr->ia_gid != inode->i_gid) {
- req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
+ !gid_eq(attr->ia_gid, inode->i_gid)) {
+ req->r_args.setattr.gid = cpu_to_le32(
+ from_kgid(&init_user_ns, attr->ia_gid));
mask |= CEPH_SETATTR_GID;
release |= CEPH_CAP_AUTH_SHARED;
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 36549a46e311..4a989345b37b 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -16,11 +16,11 @@
*/
static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
{
- struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+ struct ceph_inode_info *ci = ceph_inode(file_inode(file));
struct ceph_ioctl_layout l;
int err;
- err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);
if (!err) {
l.stripe_unit = ceph_file_layout_su(ci->i_layout);
l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
@@ -63,12 +63,12 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct inode *parent_inode;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
- struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+ struct ceph_inode_info *ci = ceph_inode(file_inode(file));
struct ceph_ioctl_layout nl;
int err;
@@ -76,7 +76,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
return -EFAULT;
/* validate changed params against current layout */
- err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);
if (err)
return err;
@@ -136,7 +136,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
*/
static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
int err;
@@ -179,13 +179,12 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
{
struct ceph_ioctl_dataloc dl;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
u64 len = 1, olen;
u64 tmp;
- struct ceph_object_layout ol;
struct ceph_pg pgid;
int r;
@@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
return -EFAULT;
down_read(&osdc->map_sem);
- r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+ r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
&dl.object_no, &dl.object_offset,
&olen);
if (r < 0)
@@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
+ ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout,
osdc->osdmap);
- pgid = ol.ol_pgid;
dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
if (dl.osd >= 0) {
struct ceph_entity_addr *a =
@@ -234,7 +232,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
static long ceph_ioctl_lazyio(struct file *file)
{
struct ceph_file_info *fi = file->private_data;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 80576d05d687..202dd3d68be0 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -13,7 +13,7 @@
static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
int cmd, u8 wait, struct file_lock *fl)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_mds_client *mdsc =
ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1bcf712655d9..442880d099c9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -233,6 +233,30 @@ bad:
}
/*
+ * parse create results
+ */
+static int parse_reply_info_create(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
+{
+ if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+ if (*p == end) {
+ info->has_create_ino = false;
+ } else {
+ info->has_create_ino = true;
+ info->ino = ceph_decode_64(p);
+ }
+ }
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ return -EIO;
+}
+
+/*
* parse extra results
*/
static int parse_reply_info_extra(void **p, void *end,
@@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end,
{
if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
- else
+ else if (info->head->op == CEPH_MDS_OP_READDIR)
return parse_reply_info_dir(p, end, info, features);
+ else if (info->head->op == CEPH_MDS_OP_CREATE)
+ return parse_reply_info_create(p, end, info, features);
+ else
+ return -EIO;
}
/*
@@ -1590,7 +1618,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
} else if (rpath || rino) {
*ino = rino;
*ppath = rpath;
- *pathlen = strlen(rpath);
+ *pathlen = rpath ? strlen(rpath) : 0;
dout(" path %.*s\n", *pathlen, rpath);
}
@@ -1658,8 +1686,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
head->op = cpu_to_le32(req->r_op);
- head->caller_uid = cpu_to_le32(req->r_uid);
- head->caller_gid = cpu_to_le32(req->r_gid);
+ head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
+ head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
head->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
@@ -1876,9 +1904,14 @@ finish:
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head)
{
- struct ceph_mds_request *req, *nreq;
+ struct ceph_mds_request *req;
+ LIST_HEAD(tmp_list);
+
+ list_splice_init(head, &tmp_list);
- list_for_each_entry_safe(req, nreq, head, r_wait) {
+ while (!list_empty(&tmp_list)) {
+ req = list_entry(tmp_list.next,
+ struct ceph_mds_request, r_wait);
list_del_init(&req->r_wait);
__do_request(mdsc, req);
}
@@ -2165,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&req->r_fill_mutex);
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
- if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+ if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
+ req->r_op == CEPH_MDS_OP_LSSNAP) &&
rinfo->dir_nr)
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index dd26846dd71d..c2a19fbbe517 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_info_in *dir_in;
u8 dir_complete, dir_end;
};
+
+ /* for create results */
+ struct {
+ bool has_create_ino;
+ u64 ino;
+ };
};
/* encoded blob describing snapshot contexts for certain
@@ -184,8 +190,8 @@ struct ceph_mds_request {
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
- uid_t r_uid;
- gid_t r_gid;
+ kuid_t r_uid;
+ kgid_t r_gid;
/* for choosing which mds to send this request to */
int r_direct_mode;
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 73b7d44e8a35..0d3c9240c61b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
return ERR_PTR(-ENOMEM);
ceph_decode_16_safe(p, end, version, bad);
+ if (version > 3) {
+ pr_warning("got mdsmap version %d > 3, failing", version);
+ goto bad;
+ }
ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
m->m_epoch = ceph_decode_32(p);
@@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
m->m_num_data_pg_pools = n;
- m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+ m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
if (!m->m_data_pg_pools)
goto badmem;
- ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+ ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
for (i = 0; i < n; i++)
- m->m_data_pg_pools[i] = ceph_decode_32(p);
- m->m_cas_pg_pool = ceph_decode_32(p);
+ m->m_data_pg_pools[i] = ceph_decode_64(p);
+ m->m_cas_pg_pool = ceph_decode_64(p);
/* ok, we don't care about the rest. */
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index cd5097d7c804..89fa4a940a0f 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)
case CEPH_MDS_STATE_BOOT: return "up:boot";
case CEPH_MDS_STATE_STANDBY: return "up:standby";
case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
+ case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
case CEPH_MDS_STATE_CREATING: return "up:creating";
case CEPH_MDS_STATE_STARTING: return "up:starting";
/* up and in */
@@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUP: return "lookup";
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
+ case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_GETATTR: return "getattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+ case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
+ case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
case CEPH_MDS_OP_READDIR: return "readdir";
case CEPH_MDS_OP_MKNOD: return "mknod";
case CEPH_MDS_OP_LINK: return "link";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2eb43f211325..9fe17c6c2876 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
/*
* express utilization in terms of large blocks to avoid
* overflow on 32-bit machines.
+ *
+ * NOTE: for the time being, we make bsize == frsize to humor
+ * not-yet-ancient versions of glibc that are broken.
+ * Someday, we will probably want to report a real block
+ * size... whatever that may mean for a network file system!
*/
buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+ buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
@@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
buf->f_namelen = NAME_MAX;
- buf->f_frsize = PAGE_CACHE_SIZE;
/* leave fsid little-endian, regardless of host endianness */
fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
@@ -403,8 +408,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
- if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
- seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, ",osdkeepalivetimeout=%d",
opt->osd_keepalive_timeout);
@@ -849,7 +852,7 @@ static int ceph_register_bdi(struct super_block *sb,
fsc->backing_dev_info.ra_pages =
default_backing_dev_info.ra_pages;
- err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
+ err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
atomic_long_inc_return(&bdi_seq));
if (!err)
sb->s_bdi = &fsc->backing_dev_info;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 66ebe720e40d..c7b309723dcc 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -21,7 +21,7 @@
/* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */
-#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
+#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
@@ -138,8 +138,8 @@ struct ceph_cap_snap {
struct ceph_snap_context *context;
umode_t mode;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
struct ceph_buffer *xattr_blob;
u64 xattr_version;
@@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
/* file.c */
extern const struct file_operations ceph_file_fops;
extern const struct address_space_operations ceph_aops;
-extern int ceph_copy_to_page_vector(struct page **pages,
- const char *data,
- loff_t off, size_t len);
-extern int ceph_copy_from_page_vector(struct page **pages,
- char *data,
- loff_t off, size_t len);
-extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2c2ae5be9902..9b6b2b6dd164 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -29,9 +29,94 @@ struct ceph_vxattr {
size_t name_size; /* strlen(name) + 1 (for '\0') */
size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
size_t size);
- bool readonly;
+ bool readonly, hidden;
+ bool (*exists_cb)(struct ceph_inode_info *ci);
};
+/* layouts */
+
+static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
+{
+ size_t s;
+ char *p = (char *)&ci->i_layout;
+
+ for (s = 0; s < sizeof(ci->i_layout); s++, p++)
+ if (*p)
+ return true;
+ return false;
+}
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ int ret;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ const char *pool_name;
+
+ dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+ down_read(&osdc->map_sem);
+ pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+ if (pool_name)
+ ret = snprintf(val, size,
+ "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout),
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+ pool_name);
+ else
+ ret = snprintf(val, size,
+ "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout),
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+ (unsigned long long)pool);
+
+ up_read(&osdc->map_sem);
+ return ret;
+}
+
+static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%lld",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%lld",
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%lld",
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ const char *pool_name;
+
+ down_read(&osdc->map_sem);
+ pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+ if (pool_name)
+ ret = snprintf(val, size, "%s", pool_name);
+ else
+ ret = snprintf(val, size, "%lld", (unsigned long long)pool);
+ up_read(&osdc->map_sem);
+ return ret;
+}
+
/* directories */
static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -83,17 +168,43 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
(long)ci->i_rctime.tv_nsec);
}
-#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
-#define XATTR_NAME_CEPH(_type, _name) \
- { \
- .name = CEPH_XATTR_NAME(_type, _name), \
- .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
- .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
- .readonly = true, \
- }
+#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
+#define CEPH_XATTR_NAME2(_type, _name, _name2) \
+ XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
+
+#define XATTR_NAME_CEPH(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .readonly = true, \
+ .hidden = false, \
+ .exists_cb = NULL, \
+ }
+#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
+ { \
+ .name = CEPH_XATTR_NAME2(_type, _name, _field), \
+ .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
+ .readonly = false, \
+ .hidden = true, \
+ .exists_cb = ceph_vxattrcb_layout_exists, \
+ }
static struct ceph_vxattr ceph_dir_vxattrs[] = {
+ {
+ .name = "ceph.dir.layout",
+ .name_size = sizeof("ceph.dir.layout"),
+ .getxattr_cb = ceph_vxattrcb_layout,
+ .readonly = false,
+ .hidden = false,
+ .exists_cb = ceph_vxattrcb_layout_exists,
+ },
+ XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
+ XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
+ XATTR_LAYOUT_FIELD(dir, layout, object_size),
+ XATTR_LAYOUT_FIELD(dir, layout, pool),
XATTR_NAME_CEPH(dir, entries),
XATTR_NAME_CEPH(dir, files),
XATTR_NAME_CEPH(dir, subdirs),
@@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_NAME_CEPH(dir, rsubdirs),
XATTR_NAME_CEPH(dir, rbytes),
XATTR_NAME_CEPH(dir, rctime),
- { 0 } /* Required table terminator */
+ { .name = NULL, 0 } /* Required table terminator */
};
static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
/* files */
-static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- int ret;
-
- ret = snprintf(val, size,
- "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
- return ret;
-}
-
static struct ceph_vxattr ceph_file_vxattrs[] = {
- XATTR_NAME_CEPH(file, layout),
- /* The following extended attribute name is deprecated */
{
- .name = XATTR_CEPH_PREFIX "layout",
- .name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
- .getxattr_cb = ceph_vxattrcb_file_layout,
- .readonly = true,
+ .name = "ceph.file.layout",
+ .name_size = sizeof("ceph.file.layout"),
+ .getxattr_cb = ceph_vxattrcb_layout,
+ .readonly = false,
+ .hidden = false,
+ .exists_cb = ceph_vxattrcb_layout_exists,
},
- { 0 } /* Required table terminator */
+ XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
+ XATTR_LAYOUT_FIELD(file, layout, stripe_count),
+ XATTR_LAYOUT_FIELD(file, layout, object_size),
+ XATTR_LAYOUT_FIELD(file, layout, pool),
+ { .name = NULL, 0 } /* Required table terminator */
};
static size_t ceph_file_vxattrs_name_size; /* total size of all names */
@@ -164,7 +266,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
size_t size = 0;
for (vxattr = vxattrs; vxattr->name; vxattr++)
- size += vxattr->name_size;
+ if (!vxattr->hidden)
+ size += vxattr->name_size;
return size;
}
@@ -572,13 +675,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!ceph_is_valid_xattr(name))
return -ENODATA;
- /* let's see if a virtual xattr was requested */
- vxattr = ceph_match_vxattr(inode, name);
-
spin_lock(&ci->i_ceph_lock);
dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version);
+ /* let's see if a virtual xattr was requested */
+ vxattr = ceph_match_vxattr(inode, name);
+ if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
+ err = vxattr->getxattr_cb(ci, value, size);
+ goto out;
+ }
+
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto get_xattr;
@@ -592,11 +699,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
spin_lock(&ci->i_ceph_lock);
- if (vxattr && vxattr->readonly) {
- err = vxattr->getxattr_cb(ci, value, size);
- goto out;
- }
-
err = __build_xattrs(inode);
if (err < 0)
goto out;
@@ -604,11 +706,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
get_xattr:
err = -ENODATA; /* == ENOATTR */
xattr = __get_xattr(ci, name);
- if (!xattr) {
- if (vxattr)
- err = vxattr->getxattr_cb(ci, value, size);
+ if (!xattr)
goto out;
- }
err = -ERANGE;
if (size && size < xattr->val_len)
@@ -664,23 +763,30 @@ list_xattr:
vir_namelen = ceph_vxattrs_name_size(vxattrs);
/* adding 1 byte per each variable due to the null termination */
- namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
+ namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
err = -ERANGE;
- if (size && namelen > size)
+ if (size && vir_namelen + namelen > size)
goto out;
- err = namelen;
+ err = namelen + vir_namelen;
if (size == 0)
goto out;
names = __copy_xattr_names(ci, names);
/* virtual xattr names, too */
- if (vxattrs)
+ err = namelen;
+ if (vxattrs) {
for (i = 0; vxattrs[i].name; i++) {
- len = sprintf(names, "%s", vxattrs[i].name);
- names += len + 1;
+ if (!vxattrs[i].hidden &&
+ !(vxattrs[i].exists_cb &&
+ !vxattrs[i].exists_cb(ci))) {
+ len = sprintf(names, "%s", vxattrs[i].name);
+ names += len + 1;
+ err += len + 1;
+ }
}
+ }
out:
spin_unlock(&ci->i_ceph_lock);
@@ -782,6 +888,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
if (vxattr && vxattr->readonly)
return -EOPNOTSUPP;
+ /* pass any unhandled ceph.* xattrs through to the MDS */
+ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto do_sync_unlocked;
+
/* preallocate memory for xattr name, value, index node */
err = -ENOMEM;
newname = kmemdup(name, name_len + 1, GFP_NOFS);
@@ -838,6 +948,7 @@ retry:
do_sync:
spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
err = ceph_sync_setxattr(dentry, name, value, size, flags);
out:
kfree(newname);
@@ -892,6 +1003,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
if (vxattr && vxattr->readonly)
return -EOPNOTSUPP;
+ /* pass any unhandled ceph.* xattrs through to the MDS */
+ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto do_sync_unlocked;
+
err = -ENOMEM;
spin_lock(&ci->i_ceph_lock);
retry:
@@ -931,6 +1046,7 @@ retry:
return err;
do_sync:
spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
err = ceph_send_removexattr(dentry, name);
out:
return err;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 21ff76c22a17..2906ee276408 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -155,14 +155,14 @@ config CIFS_DFS_UPCALL
points. If unsure, say N.
config CIFS_NFSD_EXPORT
- bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)"
- depends on CIFS && EXPERIMENTAL && BROKEN
+ bool "Allow nfsd to export CIFS file system"
+ depends on CIFS && BROKEN
help
Allows NFS server to export a CIFS mounted share (nfsd over cifs)
config CIFS_SMB2
- bool "SMB2 network file system support (EXPERIMENTAL)"
- depends on CIFS && EXPERIMENTAL && INET
+ bool "SMB2 network file system support"
+ depends on CIFS && INET
select NLS
select KEYS
select FSCACHE
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 86e92ef2abc1..69ae3d3c3b31 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -37,7 +37,6 @@ void dump_smb(void *, int);
#define CIFS_TIMER 0x04
extern int cifsFYI;
-extern int cifsERROR;
/*
* debug ON
@@ -64,10 +63,7 @@ do { \
/* error event message: e.g., i/o error */
#define cifserror(fmt, ...) \
-do { \
- if (cifsERROR) \
- printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
-} while (0)
+ printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
#define cERROR(set, fmt, ...) \
do { \
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ce5cbd717bfc..210fce2df308 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -226,6 +226,8 @@ compose_mount_options_out:
compose_mount_options_err:
kfree(mountdata);
mountdata = ERR_PTR(rc);
+ kfree(*devname);
+ *devname = NULL;
goto compose_mount_options_out;
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index c865bfdfe819..37e4a72a7d1c 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -55,10 +55,10 @@ struct cifs_sb_info {
unsigned int wsize;
unsigned long actimeo; /* attribute cache timeout (jiffies) */
atomic_t active;
- uid_t mnt_uid;
- gid_t mnt_gid;
- uid_t mnt_backupuid;
- gid_t mnt_backupgid;
+ kuid_t mnt_uid;
+ kgid_t mnt_gid;
+ kuid_t mnt_backupuid;
+ kgid_t mnt_backupgid;
umode_t mnt_file_mode;
umode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 086f381d6489..10e774761299 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -149,10 +149,12 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
goto out;
dp = description + strlen(description);
- sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
+ sprintf(dp, ";uid=0x%x",
+ from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
dp = description + strlen(description);
- sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
+ sprintf(dp, ";creduid=0x%x",
+ from_kuid_munged(&init_user_ns, sesInfo->cred_uid));
if (sesInfo->user_name) {
dp = description + strlen(description);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 5cbd00e74067..f1e3f25fe004 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -266,8 +266,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
struct key *sidkey;
char *sidstr;
const struct cred *saved_cred;
- uid_t fuid = cifs_sb->mnt_uid;
- gid_t fgid = cifs_sb->mnt_gid;
+ kuid_t fuid = cifs_sb->mnt_uid;
+ kgid_t fgid = cifs_sb->mnt_gid;
/*
* If we have too many subauthorities, then something is really wrong.
@@ -297,6 +297,7 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
* probably a safe assumption but might be better to check based on
* sidtype.
*/
+ BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
if (sidkey->datalen != sizeof(uid_t)) {
rc = -EIO;
cFYI(1, "%s: Downcall contained malformed key "
@@ -305,10 +306,21 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
goto out_key_put;
}
- if (sidtype == SIDOWNER)
- memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t));
- else
- memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t));
+ if (sidtype == SIDOWNER) {
+ kuid_t uid;
+ uid_t id;
+ memcpy(&id, &sidkey->payload.value, sizeof(uid_t));
+ uid = make_kuid(&init_user_ns, id);
+ if (uid_valid(uid))
+ fuid = uid;
+ } else {
+ kgid_t gid;
+ gid_t id;
+ memcpy(&id, &sidkey->payload.value, sizeof(gid_t));
+ gid = make_kgid(&init_user_ns, id);
+ if (gid_valid(gid))
+ fgid = gid;
+ }
out_key_put:
key_put(sidkey);
@@ -346,7 +358,8 @@ init_cifs_idmap(void)
if (!cred)
return -ENOMEM;
- keyring = keyring_alloc(".cifs_idmap", 0, 0, cred,
+ keyring = keyring_alloc(".cifs_idmap",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
(KEY_POS_ALL & ~KEY_POS_SETATTR) |
KEY_USR_VIEW | KEY_USR_READ,
KEY_ALLOC_NOT_IN_QUOTA, NULL);
@@ -774,7 +787,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
/* Convert permission bits from mode to equivalent CIFS ACL */
static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
- __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag)
+ __u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag)
{
int rc = 0;
__u32 dacloffset;
@@ -806,17 +819,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
*aclflag = CIFS_ACL_DACL;
} else {
memcpy(pnntsd, pntsd, secdesclen);
- if (uid != NO_CHANGE_32) { /* chown */
+ if (uid_valid(uid)) { /* chown */
+ uid_t id;
owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
le32_to_cpu(pnntsd->osidoffset));
nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
if (!nowner_sid_ptr)
return -ENOMEM;
- rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr);
+ id = from_kuid(&init_user_ns, uid);
+ rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr);
if (rc) {
cFYI(1, "%s: Mapping error %d for owner id %d",
- __func__, rc, uid);
+ __func__, rc, id);
kfree(nowner_sid_ptr);
return rc;
}
@@ -824,17 +839,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
kfree(nowner_sid_ptr);
*aclflag = CIFS_ACL_OWNER;
}
- if (gid != NO_CHANGE_32) { /* chgrp */
+ if (gid_valid(gid)) { /* chgrp */
+ gid_t id;
group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
le32_to_cpu(pnntsd->gsidoffset));
ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
if (!ngroup_sid_ptr)
return -ENOMEM;
- rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr);
+ id = from_kgid(&init_user_ns, gid);
+ rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr);
if (rc) {
cFYI(1, "%s: Mapping error %d for group id %d",
- __func__, rc, gid);
+ __func__, rc, id);
kfree(ngroup_sid_ptr);
return rc;
}
@@ -1002,7 +1019,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
/* Convert mode bits to an ACL so we can update the ACL on the server */
int
id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
- uid_t uid, gid_t gid)
+ kuid_t uid, kgid_t gid)
{
int rc = 0;
int aclflag = CIFS_ACL_DACL; /* default flag to set */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ce9f3c5421bf..1a052c0eee8e 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -54,7 +54,6 @@
#endif
int cifsFYI = 0;
-int cifsERROR = 1;
int traceSMB = 0;
bool enable_oplocks = true;
unsigned int linuxExtEnabled = 1;
@@ -229,7 +228,6 @@ cifs_alloc_inode(struct super_block *sb)
cifs_set_oplock_level(cifs_inode, 0);
cifs_inode->delete_pending = false;
cifs_inode->invalid_mapping = false;
- cifs_inode->leave_pages_clean = false;
cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->server_eof = 0;
cifs_inode->uniqueid = 0;
@@ -377,13 +375,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
(int)(srcaddr->sa_family));
}
- seq_printf(s, ",uid=%u", cifs_sb->mnt_uid);
+ seq_printf(s, ",uid=%u",
+ from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
seq_printf(s, ",forceuid");
else
seq_printf(s, ",noforceuid");
- seq_printf(s, ",gid=%u", cifs_sb->mnt_gid);
+ seq_printf(s, ",gid=%u",
+ from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
seq_printf(s, ",forcegid");
else
@@ -438,9 +438,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
seq_printf(s, ",noperm");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
- seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid);
+ seq_printf(s, ",backupuid=%u",
+ from_kuid_munged(&init_user_ns,
+ cifs_sb->mnt_backupuid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
- seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid);
+ seq_printf(s, ",backupgid=%u",
+ from_kgid_munged(&init_user_ns,
+ cifs_sb->mnt_backupgid));
seq_printf(s, ",rsize=%u", cifs_sb->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->wsize);
@@ -560,6 +564,11 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
dentry = ERR_PTR(-ENOENT);
break;
}
+ if (!S_ISDIR(dir->i_mode)) {
+ dput(dentry);
+ dentry = ERR_PTR(-ENOTDIR);
+ break;
+ }
/* skip separators */
while (*s == sep)
@@ -679,7 +688,7 @@ out_nls:
static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
ssize_t written;
int rc;
@@ -703,7 +712,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
*/
if (whence != SEEK_SET && whence != SEEK_CUR) {
int rc;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
/*
* We need to be sure that all dirty pages are written and the
@@ -735,7 +744,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
{
/* note that this is called by vfs setlease with lock_flocks held
to protect *lease from going away */
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
if (!(S_ISREG(inode->i_mode)))
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index aea1eec64911..4f07f6fbe494 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -386,6 +386,7 @@ struct smb_version_values {
unsigned int cap_unix;
unsigned int cap_nt_find;
unsigned int cap_large_files;
+ unsigned int oplock_read;
};
#define HEADER_SIZE(server) (server->vals->header_size)
@@ -399,11 +400,11 @@ struct smb_vol {
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
- uid_t cred_uid;
- uid_t linux_uid;
- gid_t linux_gid;
- uid_t backupuid;
- gid_t backupgid;
+ kuid_t cred_uid;
+ kuid_t linux_uid;
+ kgid_t linux_gid;
+ kuid_t backupuid;
+ kgid_t backupgid;
umode_t file_mode;
umode_t dir_mode;
unsigned secFlg;
@@ -702,8 +703,8 @@ struct cifs_ses {
char *serverNOS; /* name of network operating system of server */
char *serverDomain; /* security realm of server */
__u64 Suid; /* remote smb uid */
- uid_t linux_uid; /* overriding owner of files on the mount */
- uid_t cred_uid; /* owner of credentials */
+ kuid_t linux_uid; /* overriding owner of files on the mount */
+ kuid_t cred_uid; /* owner of credentials */
unsigned int capabilities;
char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
TCP names - will ipv6 and sctp addresses fit? */
@@ -837,7 +838,7 @@ struct cifs_tcon {
*/
struct tcon_link {
struct rb_node tl_rbnode;
- uid_t tl_uid;
+ kuid_t tl_uid;
unsigned long tl_flags;
#define TCON_LINK_MASTER 0
#define TCON_LINK_PENDING 1
@@ -930,7 +931,7 @@ struct cifsFileInfo {
struct list_head tlist; /* pointer to next fid owned by tcon */
struct list_head flist; /* next fid (file instance) for this inode */
struct cifs_fid_locks *llist; /* brlocks held by this fid */
- unsigned int uid; /* allows finding which FileInfo structure */
+ kuid_t uid; /* allows finding which FileInfo structure */
__u32 pid; /* process id who opened file */
struct cifs_fid fid; /* file id from remote */
/* BB add lock scope info here if needed */ ;
@@ -1030,7 +1031,6 @@ struct cifsInodeInfo {
bool clientCanCacheAll; /* read and writebehind oplock */
bool delete_pending; /* DELETE_ON_CLOSE is set */
bool invalid_mapping; /* pagecache is invalid */
- bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */
unsigned long time; /* jiffies of last update of inode */
u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
@@ -1245,8 +1245,8 @@ struct cifs_fattr {
u64 cf_eof;
u64 cf_bytes;
u64 cf_createtime;
- uid_t cf_uid;
- gid_t cf_gid;
+ kuid_t cf_uid;
+ kgid_t cf_gid;
umode_t cf_mode;
dev_t cf_rdev;
unsigned int cf_nlink;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b9d59a948a2c..e996ff6b26d1 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -277,7 +277,6 @@
#define CIFS_NO_HANDLE 0xFFFF
#define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL
-#define NO_CHANGE_32 0xFFFFFFFFUL
/* IPC$ in ASCII */
#define CIFS_IPC_RESOURCE "\x49\x50\x43\x24"
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1988c1baa224..f450f0683ddd 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -46,7 +46,8 @@ extern void _free_xid(unsigned int);
({ \
unsigned int __xid = _get_xid(); \
cFYI(1, "CIFS VFS: in %s as Xid: %u with uid: %d", \
- __func__, __xid, current_fsuid()); \
+ __func__, __xid, \
+ from_kuid(&init_user_ns, current_fsuid())); \
__xid; \
})
@@ -161,7 +162,7 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr, struct inode *inode,
const char *path, const __u16 *pfid);
extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
- uid_t, gid_t);
+ kuid_t, kgid_t);
extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
const char *, u32 *);
extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
@@ -304,8 +305,8 @@ struct cifs_unix_set_info_args {
__u64 atime;
__u64 mtime;
__u64 mode;
- __u64 uid;
- __u64 gid;
+ kuid_t uid;
+ kgid_t gid;
dev_t device;
};
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 76d0d2998850..7353bc5d73d7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1909,8 +1909,11 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
} while (rc == -EAGAIN);
for (i = 0; i < wdata->nr_pages; i++) {
- if (rc != 0)
+ if (rc != 0) {
SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ page_cache_release(wdata->pages[i]);
+ }
unlock_page(wdata->pages[i]);
}
@@ -5819,8 +5822,14 @@ static void
cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
const struct cifs_unix_set_info_args *args)
{
+ u64 uid = NO_CHANGE_64, gid = NO_CHANGE_64;
u64 mode = args->mode;
+ if (uid_valid(args->uid))
+ uid = from_kuid(&init_user_ns, args->uid);
+ if (gid_valid(args->gid))
+ gid = from_kgid(&init_user_ns, args->gid);
+
/*
* Samba server ignores set of file size to zero due to bugs in some
* older clients, but we should be precise - we use SetFileSize to
@@ -5833,8 +5842,8 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
data_offset->LastStatusChange = cpu_to_le64(args->ctime);
data_offset->LastAccessTime = cpu_to_le64(args->atime);
data_offset->LastModificationTime = cpu_to_le64(args->mtime);
- data_offset->Uid = cpu_to_le64(args->uid);
- data_offset->Gid = cpu_to_le64(args->gid);
+ data_offset->Uid = cpu_to_le64(uid);
+ data_offset->Gid = cpu_to_le64(gid);
/* better to leave device as zero when it is */
data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7635b5db26a7..54125e04fd0c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -987,6 +987,41 @@ static int get_option_ul(substring_t args[], unsigned long *option)
return rc;
}
+static int get_option_uid(substring_t args[], kuid_t *result)
+{
+ unsigned long value;
+ kuid_t uid;
+ int rc;
+
+ rc = get_option_ul(args, &value);
+ if (rc)
+ return rc;
+
+ uid = make_kuid(current_user_ns(), value);
+ if (!uid_valid(uid))
+ return -EINVAL;
+
+ *result = uid;
+ return 0;
+}
+
+static int get_option_gid(substring_t args[], kgid_t *result)
+{
+ unsigned long value;
+ kgid_t gid;
+ int rc;
+
+ rc = get_option_ul(args, &value);
+ if (rc)
+ return rc;
+
+ gid = make_kgid(current_user_ns(), value);
+ if (!gid_valid(gid))
+ return -EINVAL;
+
+ *result = gid;
+ return 0;
+}
static int cifs_parse_security_flavors(char *value,
struct smb_vol *vol)
@@ -996,7 +1031,7 @@ static int cifs_parse_security_flavors(char *value,
switch (match_token(value, cifs_secflavor_tokens, args)) {
case Opt_sec_krb5:
- vol->secFlg |= CIFSSEC_MAY_KRB5;
+ vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN;
break;
case Opt_sec_krb5i:
vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN;
@@ -1424,47 +1459,42 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
/* Numeric Values */
case Opt_backupuid:
- if (get_option_ul(args, &option)) {
+ if (get_option_uid(args, &vol->backupuid)) {
cERROR(1, "%s: Invalid backupuid value",
__func__);
goto cifs_parse_mount_err;
}
- vol->backupuid = option;
vol->backupuid_specified = true;
break;
case Opt_backupgid:
- if (get_option_ul(args, &option)) {
+ if (get_option_gid(args, &vol->backupgid)) {
cERROR(1, "%s: Invalid backupgid value",
__func__);
goto cifs_parse_mount_err;
}
- vol->backupgid = option;
vol->backupgid_specified = true;
break;
case Opt_uid:
- if (get_option_ul(args, &option)) {
+ if (get_option_uid(args, &vol->linux_uid)) {
cERROR(1, "%s: Invalid uid value",
__func__);
goto cifs_parse_mount_err;
}
- vol->linux_uid = option;
uid_specified = true;
break;
case Opt_cruid:
- if (get_option_ul(args, &option)) {
+ if (get_option_uid(args, &vol->cred_uid)) {
cERROR(1, "%s: Invalid cruid value",
__func__);
goto cifs_parse_mount_err;
}
- vol->cred_uid = option;
break;
case Opt_gid:
- if (get_option_ul(args, &option)) {
+ if (get_option_gid(args, &vol->linux_gid)) {
cERROR(1, "%s: Invalid gid value",
__func__);
goto cifs_parse_mount_err;
}
- vol->linux_gid = option;
gid_specified = true;
break;
case Opt_file_mode:
@@ -1624,14 +1654,11 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_unc:
string = vol->UNC;
vol->UNC = match_strdup(args);
- if (vol->UNC == NULL) {
- kfree(string);
+ if (vol->UNC == NULL)
goto out_nomem;
- }
convert_delimiter(vol->UNC, '\\');
if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') {
- kfree(string);
printk(KERN_ERR "CIFS: UNC Path does not "
"begin with // or \\\\\n");
goto cifs_parse_mount_err;
@@ -1687,10 +1714,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
string = vol->prepath;
vol->prepath = match_strdup(args);
- if (vol->prepath == NULL) {
- kfree(string);
+ if (vol->prepath == NULL)
goto out_nomem;
- }
/* Compare old prefixpath= option to new one */
if (!string || strcmp(string, vol->prepath))
printk(KERN_WARNING "CIFS: the value of the "
@@ -1922,7 +1947,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
}
case AF_INET6: {
struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
- struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs;
+ struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
}
default:
@@ -2246,7 +2271,7 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
{
switch (ses->server->secType) {
case Kerberos:
- if (vol->cred_uid != ses->cred_uid)
+ if (!uid_eq(vol->cred_uid, ses->cred_uid))
return 0;
break;
default:
@@ -2718,7 +2743,7 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
if (new->rsize && new->rsize < old->rsize)
return 0;
- if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid)
+ if (!uid_eq(old->mnt_uid, new->mnt_uid) || !gid_eq(old->mnt_gid, new->mnt_gid))
return 0;
if (old->mnt_file_mode != new->mnt_file_mode ||
@@ -3924,7 +3949,7 @@ cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
}
static struct cifs_tcon *
-cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
{
int rc;
struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
@@ -3994,7 +4019,7 @@ cifs_sb_tcon_pending_wait(void *unused)
/* find and return a tlink with given uid */
static struct tcon_link *
-tlink_rb_search(struct rb_root *root, uid_t uid)
+tlink_rb_search(struct rb_root *root, kuid_t uid)
{
struct rb_node *node = root->rb_node;
struct tcon_link *tlink;
@@ -4002,9 +4027,9 @@ tlink_rb_search(struct rb_root *root, uid_t uid)
while (node) {
tlink = rb_entry(node, struct tcon_link, tl_rbnode);
- if (tlink->tl_uid > uid)
+ if (uid_gt(tlink->tl_uid, uid))
node = node->rb_left;
- else if (tlink->tl_uid < uid)
+ else if (uid_lt(tlink->tl_uid, uid))
node = node->rb_right;
else
return tlink;
@@ -4023,7 +4048,7 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
parent = *new;
- if (tlink->tl_uid > new_tlink->tl_uid)
+ if (uid_gt(tlink->tl_uid, new_tlink->tl_uid))
new = &((*new)->rb_left);
else
new = &((*new)->rb_right);
@@ -4053,7 +4078,7 @@ struct tcon_link *
cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
{
int ret;
- uid_t fsuid = current_fsuid();
+ kuid_t fsuid = current_fsuid();
struct tcon_link *tlink, *newtlink;
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 8719bbe0dcc3..1cd016217448 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -342,14 +342,14 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
*created |= FILE_CREATED;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = (__u64) current_fsuid();
+ args.uid = current_fsuid();
if (inode->i_mode & S_ISGID)
- args.gid = (__u64) inode->i_gid;
+ args.gid = inode->i_gid;
else
- args.gid = (__u64) current_fsgid();
+ args.gid = current_fsgid();
} else {
- args.uid = NO_CHANGE_64;
- args.gid = NO_CHANGE_64;
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
}
CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid,
current->tgid);
@@ -588,11 +588,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
.device = device_number,
};
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = (__u64) current_fsuid();
- args.gid = (__u64) current_fsgid();
+ args.uid = current_fsuid();
+ args.gid = current_fsgid();
} else {
- args.uid = NO_CHANGE_64;
- args.gid = NO_CHANGE_64;
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
}
rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
cifs_sb->local_nls,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0a6677ba212b..8c0d85577314 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -43,6 +43,7 @@
#include "cifs_fs_sb.h"
#include "fscache.h"
+
static inline int cifs_convert_flags(unsigned int flags)
{
if ((flags & O_ACCMODE) == O_RDONLY)
@@ -72,10 +73,15 @@ static u32 cifs_posix_convert_flags(unsigned int flags)
else if ((flags & O_ACCMODE) == O_RDWR)
posix_flags = SMB_O_RDWR;
- if (flags & O_CREAT)
+ if (flags & O_CREAT) {
posix_flags |= SMB_O_CREAT;
- if (flags & O_EXCL)
- posix_flags |= SMB_O_EXCL;
+ if (flags & O_EXCL)
+ posix_flags |= SMB_O_EXCL;
+ } else if (flags & O_EXCL)
+ cFYI(1, "Application %s pid %d has incorrectly set O_EXCL flag"
+ "but not O_CREAT on file open. Ignoring O_EXCL",
+ current->comm, current->tgid);
+
if (flags & O_TRUNC)
posix_flags |= SMB_O_TRUNC;
/* be safe and imply O_SYNC for O_DSYNC */
@@ -238,6 +244,23 @@ out:
return rc;
}
+static bool
+cifs_has_mand_locks(struct cifsInodeInfo *cinode)
+{
+ struct cifs_fid_locks *cur;
+ bool has_locks = false;
+
+ down_read(&cinode->lock_sem);
+ list_for_each_entry(cur, &cinode->llist, llist) {
+ if (!list_empty(&cur->locks)) {
+ has_locks = true;
+ break;
+ }
+ }
+ up_read(&cinode->lock_sem);
+ return has_locks;
+}
+
struct cifsFileInfo *
cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
@@ -248,6 +271,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct cifsFileInfo *cfile;
struct cifs_fid_locks *fdlocks;
struct cifs_tcon *tcon = tlink_tcon(tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
if (cfile == NULL)
@@ -276,12 +300,22 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
mutex_init(&cfile->fh_mutex);
+ /*
+ * If the server returned a read oplock and we have mandatory brlocks,
+ * set oplock level to None.
+ */
+ if (oplock == server->vals->oplock_read &&
+ cifs_has_mand_locks(cinode)) {
+ cFYI(1, "Reset oplock val from read to None due to mand locks");
+ oplock = 0;
+ }
+
spin_lock(&cifs_file_list_lock);
- if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE)
+ if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock)
oplock = fid->pending_open->oplock;
list_del(&fid->pending_open->olist);
- tlink_tcon(tlink)->ses->server->ops->set_fid(cfile, fid, oplock);
+ server->ops->set_fid(cfile, fid, oplock);
list_add(&cfile->tlist, &tcon->openFileList);
/* if readable file instance put first in list*/
@@ -487,8 +521,8 @@ int cifs_open(struct inode *inode, struct file *file)
*/
struct cifs_unix_set_info_args args = {
.mode = inode->i_mode,
- .uid = NO_CHANGE_64,
- .gid = NO_CHANGE_64,
+ .uid = INVALID_UID, /* no change */
+ .gid = INVALID_GID, /* no change */
.ctime = NO_CHANGE_64,
.atime = NO_CHANGE_64,
.mtime = NO_CHANGE_64,
@@ -919,7 +953,7 @@ static int
cifs_posix_lock_test(struct file *file, struct file_lock *flock)
{
int rc = 0;
- struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
unsigned char saved_type = flock->fl_type;
if ((flock->fl_flags & FL_POSIX) == 0)
@@ -946,7 +980,7 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
static int
cifs_posix_lock_set(struct file *file, struct file_lock *flock)
{
- struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
int rc = 1;
if ((flock->fl_flags & FL_POSIX) == 0)
@@ -1422,6 +1456,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
+ struct inode *inode = cfile->dentry->d_inode;
if (posix_lck) {
int posix_lock_type;
@@ -1459,6 +1494,21 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
if (!rc)
goto out;
+ /*
+ * Windows 7 server can delay breaking lease from read to None
+ * if we set a byte-range lock on a file - break it explicitly
+ * before sending the lock to the server to be sure the next
+ * read won't conflict with non-overlapping locks due to
+ * page reading.
+ */
+ if (!CIFS_I(inode)->clientCanCacheAll &&
+ CIFS_I(inode)->clientCanCacheRead) {
+ cifs_invalidate_mapping(inode);
+ cFYI(1, "Set no oplock for inode=%p due to mand locks",
+ inode);
+ CIFS_I(inode)->clientCanCacheRead = false;
+ }
+
rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
type, 1, 0, wait_flag);
if (rc) {
@@ -1504,7 +1554,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
netfid = cfile->fid.netfid;
- cinode = CIFS_I(file->f_path.dentry->d_inode);
+ cinode = CIFS_I(file_inode(file));
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
@@ -1649,7 +1699,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
are always at the end of the list but since the first entry might
have a close pending, we go through the whole list */
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
- if (fsuid_only && open_file->uid != current_fsuid())
+ if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
if (!open_file->invalidHandle) {
@@ -1702,7 +1752,7 @@ refind_writable:
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (!any_available && open_file->pid != current->tgid)
continue;
- if (fsuid_only && open_file->uid != current_fsuid())
+ if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
@@ -2103,15 +2153,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
} else {
rc = copied;
pos += copied;
- /*
- * When we use strict cache mode and cifs_strict_writev was run
- * with level II oplock (indicated by leave_pages_clean field of
- * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev
- * sent the data to the server itself.
- */
- if (!CIFS_I(inode)->leave_pages_clean ||
- !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO))
- set_page_dirty(page);
+ set_page_dirty(page);
}
if (rc > 0) {
@@ -2135,7 +2177,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
struct cifsFileInfo *smbfile = file->private_data;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
@@ -2210,7 +2252,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
int cifs_flush(struct file *file, fl_owner_t id)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
int rc = 0;
if (file->f_mode & FMODE_WRITE)
@@ -2444,7 +2486,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
ssize_t written;
struct inode *inode;
- inode = iocb->ki_filp->f_path.dentry->d_inode;
+ inode = file_inode(iocb->ki_filp);
/*
* BB - optimize the way when signing is disabled. We can drop this
@@ -2462,8 +2504,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
}
static ssize_t
-cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, bool cache_ex)
+cifs_writev(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
@@ -2485,12 +2527,8 @@ cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov,
server->vals->exclusive_lock_type, NULL,
CIFS_WRITE_OP)) {
mutex_lock(&inode->i_mutex);
- if (!cache_ex)
- cinode->leave_pages_clean = true;
rc = __generic_file_aio_write(iocb, iov, nr_segs,
- &iocb->ki_pos);
- if (!cache_ex)
- cinode->leave_pages_clean = false;
+ &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
}
@@ -2511,66 +2549,38 @@ ssize_t
cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = (struct cifsFileInfo *)
iocb->ki_filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
- ssize_t written, written2;
- /*
- * We need to store clientCanCacheAll here to prevent race
- * conditions - this value can be changed during an execution
- * of generic_file_aio_write. For CIFS it can be changed from
- * true to false only, but for SMB2 it can be changed both from
- * true to false and vice versa. So, we can end up with a data
- * stored in the cache, not marked dirty and not sent to the
- * server if this value changes its state from false to true
- * after cifs_write_end.
- */
- bool cache_ex = cinode->clientCanCacheAll;
- bool cache_read = cinode->clientCanCacheRead;
- int rc;
- loff_t saved_pos;
+ ssize_t written;
- if (cache_ex) {
+ if (cinode->clientCanCacheAll) {
if (cap_unix(tcon->ses) &&
- ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) &&
- (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(
- tcon->fsUnixInfo.Capability)))
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
+ && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
return generic_file_aio_write(iocb, iov, nr_segs, pos);
- return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex);
+ return cifs_writev(iocb, iov, nr_segs, pos);
}
-
/*
- * For files without exclusive oplock in strict cache mode we need to
- * write the data to the server exactly from the pos to pos+len-1 rather
- * than flush all affected pages because it may cause a error with
- * mandatory locks on these pages but not on the region from pos to
- * ppos+len-1.
+ * For non-oplocked files in strict cache mode we need to write the data
+ * to the server exactly from the pos to pos+len-1 rather than flush all
+ * affected pages because it may cause an error with mandatory locks on
+ * these pages but not on the region from pos to pos+len-1.
*/
written = cifs_user_writev(iocb, iov, nr_segs, pos);
- if (!cache_read || written <= 0)
- return written;
-
- saved_pos = iocb->ki_pos;
- iocb->ki_pos = pos;
- /* we have a read oplock - need to store a data in the page cache */
- if (cap_unix(tcon->ses) &&
- ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) &&
- (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(
- tcon->fsUnixInfo.Capability)))
- written2 = generic_file_aio_write(iocb, iov, nr_segs, pos);
- else
- written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos,
- cache_ex);
- /* errors occured during writing - invalidate the page cache */
- if (written2 < 0) {
- rc = cifs_invalidate_mapping(inode);
- if (rc)
- written = (ssize_t)rc;
- else
- iocb->ki_pos = saved_pos;
+ if (written > 0 && cinode->clientCanCacheRead) {
+ /*
+ * Windows 7 server can delay breaking level2 oplock if a write
+ * request comes - break it on the client to prevent reading
+ * stale data.
+ */
+ cifs_invalidate_mapping(inode);
+ cFYI(1, "Set no oplock for inode=%p after a write operation",
+ inode);
+ cinode->clientCanCacheRead = false;
}
return written;
}
@@ -2911,7 +2921,7 @@ ssize_t
cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = (struct cifsFileInfo *)
@@ -3059,7 +3069,7 @@ static struct vm_operations_struct cifs_file_vm_ops = {
int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
{
int rc, xid;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
xid = get_xid();
@@ -3352,7 +3362,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
int rc;
/* Is the page cached? */
- rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
+ rc = cifs_readpage_from_fscache(file_inode(file), page);
if (rc == 0)
goto read_complete;
@@ -3367,8 +3377,8 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
else
cFYI(1, "Bytes read %d", rc);
- file->f_path.dentry->d_inode->i_atime =
- current_fs_time(file->f_path.dentry->d_inode->i_sb);
+ file_inode(file)->i_atime =
+ current_fs_time(file_inode(file)->i_sb);
if (PAGE_CACHE_SIZE > rc)
memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc);
@@ -3377,7 +3387,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
SetPageUptodate(page);
/* send this page to the cache */
- cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
+ cifs_readpage_to_fscache(file_inode(file), page);
rc = 0;
@@ -3577,6 +3587,13 @@ void cifs_oplock_break(struct work_struct *work)
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
+ if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead &&
+ cifs_has_mand_locks(cinode)) {
+ cFYI(1, "Reset oplock to None for inode=%p due to mand locks",
+ inode);
+ cinode->clientCanCacheRead = false;
+ }
+
if (inode && S_ISREG(inode->i_mode)) {
if (cinode->clientCanCacheRead)
break_lease(inode, O_RDONLY);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index ed6208ff85a7..83f2606c76d0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -244,15 +244,25 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
break;
}
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
- fattr->cf_uid = cifs_sb->mnt_uid;
- else
- fattr->cf_uid = le64_to_cpu(info->Uid);
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
- fattr->cf_gid = cifs_sb->mnt_gid;
- else
- fattr->cf_gid = le64_to_cpu(info->Gid);
+ fattr->cf_uid = cifs_sb->mnt_uid;
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) {
+ u64 id = le64_to_cpu(info->Uid);
+ if (id < ((uid_t)-1)) {
+ kuid_t uid = make_kuid(&init_user_ns, id);
+ if (uid_valid(uid))
+ fattr->cf_uid = uid;
+ }
+ }
+
+ fattr->cf_gid = cifs_sb->mnt_gid;
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) {
+ u64 id = le64_to_cpu(info->Gid);
+ if (id < ((gid_t)-1)) {
+ kgid_t gid = make_kgid(&init_user_ns, id);
+ if (gid_valid(gid))
+ fattr->cf_gid = gid;
+ }
+ }
fattr->cf_nlink = le64_to_cpu(info->Nlinks);
}
@@ -289,7 +299,7 @@ cifs_get_file_info_unix(struct file *filp)
unsigned int xid;
FILE_UNIX_BASIC_INFO find_data;
struct cifs_fattr fattr;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
@@ -558,7 +568,7 @@ cifs_get_file_info(struct file *filp)
unsigned int xid;
FILE_ALL_INFO find_data;
struct cifs_fattr fattr;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
@@ -806,10 +816,9 @@ static bool
inode_has_hashed_dentries(struct inode *inode)
{
struct dentry *dentry;
- struct hlist_node *p;
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
spin_unlock(&inode->i_lock);
return true;
@@ -1245,14 +1254,14 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
.device = 0,
};
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = (__u64)current_fsuid();
+ args.uid = current_fsuid();
if (parent->i_mode & S_ISGID)
- args.gid = (__u64)parent->i_gid;
+ args.gid = parent->i_gid;
else
- args.gid = (__u64)current_fsgid();
+ args.gid = current_fsgid();
} else {
- args.uid = NO_CHANGE_64;
- args.gid = NO_CHANGE_64;
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
}
CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
cifs_sb->local_nls,
@@ -1678,7 +1687,7 @@ cifs_invalidate_mapping(struct inode *inode)
int cifs_revalidate_file_attr(struct file *filp)
{
int rc = 0;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
if (!cifs_inode_needs_reval(inode))
@@ -1735,7 +1744,7 @@ out:
int cifs_revalidate_file(struct file *filp)
{
int rc;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
rc = cifs_revalidate_file_attr(filp);
if (rc)
@@ -2013,12 +2022,12 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
if (attrs->ia_valid & ATTR_UID)
args->uid = attrs->ia_uid;
else
- args->uid = NO_CHANGE_64;
+ args->uid = INVALID_UID; /* no change */
if (attrs->ia_valid & ATTR_GID)
args->gid = attrs->ia_gid;
else
- args->gid = NO_CHANGE_64;
+ args->gid = INVALID_GID; /* no change */
if (attrs->ia_valid & ATTR_ATIME)
args->atime = cifs_UnixTimeToNT(attrs->ia_atime);
@@ -2086,8 +2095,8 @@ static int
cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
{
unsigned int xid;
- uid_t uid = NO_CHANGE_32;
- gid_t gid = NO_CHANGE_32;
+ kuid_t uid = INVALID_UID;
+ kgid_t gid = INVALID_GID;
struct inode *inode = direntry->d_inode;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
@@ -2146,7 +2155,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
#ifdef CONFIG_CIFS_ACL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
- if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) {
+ if (uid_valid(uid) || gid_valid(gid)) {
rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
uid, gid);
if (rc) {
@@ -2170,7 +2179,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
#ifdef CONFIG_CIFS_ACL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
rc = id_mode_to_cifs_acl(inode, full_path, mode,
- NO_CHANGE_32, NO_CHANGE_32);
+ INVALID_UID, INVALID_GID);
if (rc) {
cFYI(1, "%s: Setting ACL failed with error: %d",
__func__, rc);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index fd5009d56f9f..6c9f1214cf0b 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -30,7 +30,7 @@
long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
{
- struct inode *inode = filep->f_dentry->d_inode;
+ struct inode *inode = file_inode(filep);
int rc = -ENOTTY; /* strange error - but the precedent */
unsigned int xid;
struct cifs_sb_info *cifs_sb;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 51dc2fb6e854..9f6c4c45d21e 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -76,7 +76,7 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
}
rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len);
if (rc) {
- cERROR(1, "%s: Could not update iwth link_str", __func__);
+ cERROR(1, "%s: Could not update with link_str", __func__);
goto symlink_hash_err;
}
rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3a00c0d0cead..1b15bf839f37 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -569,7 +569,7 @@ bool
backup_cred(struct cifs_sb_info *cifs_sb)
{
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) {
- if (cifs_sb->mnt_backupuid == current_fsuid())
+ if (uid_eq(cifs_sb->mnt_backupuid, current_fsuid()))
return true;
}
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) {
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 6002fdc920ae..df40cc5fd13a 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -78,23 +78,32 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
struct dentry *dentry, *alias;
struct inode *inode;
struct super_block *sb = parent->d_inode->i_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
cFYI(1, "%s: for %s", __func__, name->name);
- if (parent->d_op && parent->d_op->d_hash)
- parent->d_op->d_hash(parent, parent->d_inode, name);
- else
- name->hash = full_name_hash(name->name, name->len);
+ dentry = d_hash_and_lookup(parent, name);
+ if (unlikely(IS_ERR(dentry)))
+ return;
- dentry = d_lookup(parent, name);
if (dentry) {
int err;
inode = dentry->d_inode;
- /* update inode in place if i_ino didn't change */
- if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
- cifs_fattr_to_inode(inode, fattr);
- goto out;
+ if (inode) {
+ /*
+ * If we're generating inode numbers, then we don't
+ * want to clobber the existing one with the one that
+ * the readdir code created.
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
+ fattr->cf_uniqueid = CIFS_I(inode)->uniqueid;
+
+ /* update inode in place if i_ino didn't change */
+ if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
+ cifs_fattr_to_inode(inode, fattr);
+ goto out;
+ }
}
err = d_invalidate(dentry);
dput(dentry);
@@ -494,7 +503,7 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)
whether we can use the cached search results from the previous search */
static int is_dir_changed(struct file *file)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
if (cifsInfo->time == 0)
@@ -767,7 +776,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
switch ((int) file->f_pos) {
case 0:
if (filldir(direntry, ".", 1, file->f_pos,
- file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
+ file_inode(file)->i_ino, DT_DIR) < 0) {
cERROR(1, "Filldir for current dir failed");
rc = -ENOMEM;
break;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index a5d234c8d5d9..47bc5a87f94e 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -53,6 +53,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf,
mutex_unlock(&server->srv_mutex);
return rc;
}
+
+ /*
+ * The response to this call was already factored into the sequence
+ * number when the call went out, so we must adjust it back downward
+ * after signing here.
+ */
+ --server->sequence_number;
rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
mutex_unlock(&server->srv_mutex);
@@ -952,4 +959,5 @@ struct smb_version_values smb1_values = {
.cap_unix = CAP_UNIX,
.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
.cap_large_files = CAP_LARGE_FILES,
+ .oplock_read = OPLOCK_READ,
};
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index d79de7bc4435..c9c7aa7ed966 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -708,6 +708,7 @@ struct smb_version_values smb20_values = {
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
.cap_large_files = SMB2_LARGE_FILES,
+ .oplock_read = SMB2_OPLOCK_LEVEL_II,
};
struct smb_version_values smb21_values = {
@@ -725,6 +726,7 @@ struct smb_version_values smb21_values = {
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
.cap_large_files = SMB2_LARGE_FILES,
+ .oplock_read = SMB2_OPLOCK_LEVEL_II,
};
struct smb_version_values smb30_values = {
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 76d974c952fe..1a528680ec5a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -144,9 +144,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
*sent = 0;
- if (ssocket == NULL)
- return -ENOTSOCK; /* BB eventually add reconnect code here */
-
smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
smb_msg.msg_namelen = sizeof(struct sockaddr);
smb_msg.msg_control = NULL;
@@ -291,6 +288,9 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
struct socket *ssocket = server->ssocket;
int val = 1;
+ if (ssocket == NULL)
+ return -ENOTSOCK;
+
cFYI(1, "Sending smb: smb_len=%u", smb_buf_length);
dump_smb(iov[0].iov_base, iov[0].iov_len);
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 958ae0e0ff8c..1da168c61d35 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -33,7 +33,7 @@ void coda_cache_enter(struct inode *inode, int mask)
spin_lock(&cii->c_lock);
cii->c_cached_epoch = atomic_read(&permission_epoch);
- if (cii->c_uid != current_fsuid()) {
+ if (!uid_eq(cii->c_uid, current_fsuid())) {
cii->c_uid = current_fsuid();
cii->c_cached_perm = mask;
} else
@@ -65,7 +65,7 @@ int coda_cache_check(struct inode *inode, int mask)
spin_lock(&cii->c_lock);
hit = (mask & cii->c_cached_perm) == mask &&
- cii->c_uid == current_fsuid() &&
+ uid_eq(cii->c_uid, current_fsuid()) &&
cii->c_cached_epoch == atomic_read(&permission_epoch);
spin_unlock(&cii->c_lock);
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index b24fdfd8a3f0..c64075213218 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -25,7 +25,7 @@ struct coda_inode_info {
u_short c_flags; /* flags (see below) */
unsigned int c_mapcount; /* nr of times this inode is mapped */
unsigned int c_cached_epoch; /* epoch for cached permissions */
- vuid_t c_uid; /* fsuid for cached permissions */
+ kuid_t c_uid; /* fsuid for cached permissions */
unsigned int c_cached_perm; /* cached access permissions */
spinlock_t c_lock;
struct inode vfs_inode;
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 854ace712685..2849f41e72a2 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -100,9 +100,9 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
if (attr->va_mode != (u_short) -1)
inode->i_mode = attr->va_mode | inode_type;
if (attr->va_uid != -1)
- inode->i_uid = (uid_t) attr->va_uid;
+ inode->i_uid = make_kuid(&init_user_ns, (uid_t) attr->va_uid);
if (attr->va_gid != -1)
- inode->i_gid = (gid_t) attr->va_gid;
+ inode->i_gid = make_kgid(&init_user_ns, (gid_t) attr->va_gid);
if (attr->va_nlink != -1)
set_nlink(inode, attr->va_nlink);
if (attr->va_size != -1)
@@ -171,10 +171,10 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr)
vattr->va_mode = iattr->ia_mode;
}
if ( valid & ATTR_UID ) {
- vattr->va_uid = (vuid_t) iattr->ia_uid;
+ vattr->va_uid = (vuid_t) from_kuid(&init_user_ns, iattr->ia_uid);
}
if ( valid & ATTR_GID ) {
- vattr->va_gid = (vgid_t) iattr->ia_gid;
+ vattr->va_gid = (vgid_t) from_kgid(&init_user_ns, iattr->ia_gid);
}
if ( valid & ATTR_SIZE ) {
vattr->va_size = iattr->ia_size;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 49fe52d25600..b7d3a05c062c 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -397,7 +397,7 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
* We can't use vfs_readdir because we have to keep the file
* position in sync between the coda_file and the host_file.
* and as such we need grab the inode mutex. */
- struct inode *host_inode = host_file->f_path.dentry->d_inode;
+ struct inode *host_inode = file_inode(host_file);
mutex_lock(&host_inode->i_mutex);
host_file->f_pos = coda_file->f_pos;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 8edd404e6419..fa4c100bdc7d 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -66,7 +66,7 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
static ssize_t
coda_file_write(struct file *coda_file, const char __user *buf, size_t count, loff_t *ppos)
{
- struct inode *host_inode, *coda_inode = coda_file->f_path.dentry->d_inode;
+ struct inode *host_inode, *coda_inode = file_inode(coda_file);
struct coda_file_info *cfi;
struct file *host_file;
ssize_t ret;
@@ -78,7 +78,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
if (!host_file->f_op || !host_file->f_op->write)
return -EINVAL;
- host_inode = host_file->f_path.dentry->d_inode;
+ host_inode = file_inode(host_file);
mutex_lock(&coda_inode->i_mutex);
ret = host_file->f_op->write(host_file, buf, count, ppos);
@@ -106,8 +106,8 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
if (!host_file->f_op || !host_file->f_op->mmap)
return -ENODEV;
- coda_inode = coda_file->f_path.dentry->d_inode;
- host_inode = host_file->f_path.dentry->d_inode;
+ coda_inode = file_inode(coda_file);
+ host_inode = file_inode(host_file);
cii = ITOC(coda_inode);
spin_lock(&cii->c_lock);
@@ -178,7 +178,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
coda_flags, coda_file->f_cred->fsuid);
- host_inode = cfi->cfi_container->f_path.dentry->d_inode;
+ host_inode = file_inode(cfi->cfi_container);
cii = ITOC(coda_inode);
/* did we mmap this file? */
@@ -202,7 +202,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
{
struct file *host_file;
- struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
+ struct inode *coda_inode = file_inode(coda_file);
struct coda_file_info *cfi;
int err;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index be2aa4909487..dada9d0abede 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -20,6 +20,7 @@
#include <linux/file.h>
#include <linux/vfs.h>
#include <linux/slab.h>
+#include <linux/pid_namespace.h>
#include <asm/uaccess.h>
@@ -48,7 +49,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
return NULL;
memset(&ei->c_fid, 0, sizeof(struct CodaFid));
ei->c_flags = 0;
- ei->c_uid = 0;
+ ei->c_uid = GLOBAL_ROOT_UID;
ei->c_cached_perm = 0;
spin_lock_init(&ei->c_lock);
return &ei->vfs_inode;
@@ -129,7 +130,7 @@ static int get_device_index(struct coda_mount_data *data)
f = fdget(data->fd);
if (!f.file)
goto Ebadf;
- inode = f.file->f_path.dentry->d_inode;
+ inode = file_inode(f.file);
if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
fdput(f);
goto Ebadf;
@@ -157,6 +158,9 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
int error;
int idx;
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
+
idx = get_device_index((struct coda_mount_data *) data);
/* Ignore errors in data, for backward compatibility */
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index ee0981f1375b..3f5de96bbb58 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -52,7 +52,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
struct path path;
int error;
struct PioctlData data;
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct inode *target_inode = NULL;
struct coda_inode_info *cnp;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 761d5b31b18d..ebc2bae6c289 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -37,6 +37,7 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/device.h>
+#include <linux/pid_namespace.h>
#include <asm/io.h>
#include <asm/poll.h>
#include <asm/uaccess.h>
@@ -266,6 +267,12 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
struct venus_comm *vcp;
int idx, err;
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
+
+ if (current_user_ns() != &init_user_ns)
+ return -EINVAL;
+
idx = iminor(inode);
if (idx < 0 || idx >= MAX_CODADEVS)
return -ENODEV;
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 0c68fd31fbf2..3a731976dc5e 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -50,9 +50,9 @@ static void *alloc_upcall(int opcode, int size)
return ERR_PTR(-ENOMEM);
inp->ih.opcode = opcode;
- inp->ih.pid = current->pid;
- inp->ih.pgid = task_pgrp_nr(current);
- inp->ih.uid = current_fsuid();
+ inp->ih.pid = task_pid_nr_ns(current, &init_pid_ns);
+ inp->ih.pgid = task_pgrp_nr_ns(current, &init_pid_ns);
+ inp->ih.uid = from_kuid(&init_user_ns, current_fsuid());
return (void*)inp;
}
@@ -157,7 +157,7 @@ int venus_lookup(struct super_block *sb, struct CodaFid *fid,
}
int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
- vuid_t uid)
+ kuid_t uid)
{
union inputArgs *inp;
union outputArgs *outp;
@@ -166,7 +166,7 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
insize = SIZE(release);
UPARG(CODA_CLOSE);
- inp->ih.uid = uid;
+ inp->ih.uid = from_kuid(&init_user_ns, uid);
inp->coda_close.VFid = *fid;
inp->coda_close.flags = flags;
diff --git a/fs/compat.c b/fs/compat.c
index 015e1e1f87c6..fe40fde29111 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1278,8 +1278,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
* Exactly like fs/open.c:sys_open(), except that it doesn't set the
* O_LARGEFILE flag.
*/
-asmlinkage long
-compat_sys_open(const char __user *filename, int flags, umode_t mode)
+COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
@@ -1288,8 +1287,7 @@ compat_sys_open(const char __user *filename, int flags, umode_t mode)
* Exactly like fs/open.c:sys_openat(), except that it doesn't set the
* O_LARGEFILE flag.
*/
-asmlinkage long
-compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, umode_t mode)
+COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
return do_sys_open(dfd, filename, flags, mode);
}
@@ -1739,55 +1737,13 @@ asmlinkage long compat_sys_signalfd(int ufd,
}
#endif /* CONFIG_SIGNALFD */
-#ifdef CONFIG_TIMERFD
-
-asmlinkage long compat_sys_timerfd_settime(int ufd, int flags,
- const struct compat_itimerspec __user *utmr,
- struct compat_itimerspec __user *otmr)
-{
- int error;
- struct itimerspec t;
- struct itimerspec __user *ut;
-
- if (get_compat_itimerspec(&t, utmr))
- return -EFAULT;
- ut = compat_alloc_user_space(2 * sizeof(struct itimerspec));
- if (copy_to_user(&ut[0], &t, sizeof(t)))
- return -EFAULT;
- error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]);
- if (!error && otmr)
- error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) ||
- put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0;
-
- return error;
-}
-
-asmlinkage long compat_sys_timerfd_gettime(int ufd,
- struct compat_itimerspec __user *otmr)
-{
- int error;
- struct itimerspec t;
- struct itimerspec __user *ut;
-
- ut = compat_alloc_user_space(sizeof(struct itimerspec));
- error = sys_timerfd_gettime(ufd, ut);
- if (!error)
- error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) ||
- put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0;
-
- return error;
-}
-
-#endif /* CONFIG_TIMERFD */
-
#ifdef CONFIG_FHANDLE
/*
* Exactly like fs/open.c:sys_open_by_handle_at(), except that it
* doesn't set the O_LARGEFILE flag.
*/
-asmlinkage long
-compat_sys_open_by_handle_at(int mountdirfd,
- struct file_handle __user *handle, int flags)
+COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+ struct file_handle __user *, handle, int, flags)
{
return do_handle_open(mountdirfd, handle, flags);
}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e2f57a007029..3ced75f765ca 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1582,7 +1582,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
case FIBMAP:
case FIGETBSZ:
case FIONREAD:
- if (S_ISREG(f.file->f_path.dentry->d_inode->i_mode))
+ if (S_ISREG(file_inode(f.file)->i_mode))
break;
/*FALL THROUGH*/
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 712b10f64c70..7aabc6ad4e9b 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1037,10 +1037,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
static int configfs_depend_prep(struct dentry *origin,
struct config_item *target)
{
- struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
+ struct configfs_dirent *child_sd, *sd;
int ret = 0;
- BUG_ON(!origin || !sd);
+ BUG_ON(!origin || !origin->d_fsdata);
+ sd = origin->d_fsdata;
if (sd->s_element == target) /* Boo-yah */
goto out;
@@ -1625,7 +1626,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&file_inode(file)->i_mutex);
return -EINVAL;
}
if (offset != file->f_pos) {
diff --git a/fs/coredump.c b/fs/coredump.c
index 177493272a61..c6479658d487 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -411,7 +411,7 @@ static void wait_for_dump_helpers(struct file *file)
{
struct pipe_inode_info *pipe;
- pipe = file->f_path.dentry->d_inode->i_pipe;
+ pipe = file_inode(file)->i_pipe;
pipe_lock(pipe);
pipe->readers++;
@@ -501,7 +501,7 @@ void do_coredump(siginfo_t *siginfo)
* so we dump it as root in mode 2, and only into a controlled
* environment (pipe handler or fully qualified path).
*/
- if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
+ if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
/* Setuid core dump mode */
flag = O_EXCL; /* Stop rewrite attacks */
cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
@@ -600,7 +600,7 @@ void do_coredump(siginfo_t *siginfo)
if (IS_ERR(cprm.file))
goto fail_unlock;
- inode = cprm.file->f_path.dentry->d_inode;
+ inode = file_inode(cprm.file);
if (inode->i_nlink > 1)
goto close_fail;
if (d_unhashed(cprm.file->f_path.dentry))
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index c6c3f91ecf06..3ceb9ec976e1 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -351,7 +351,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
*/
static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
char *buf;
unsigned int offset;
diff --git a/fs/dcache.c b/fs/dcache.c
index 3a463d0c4fe8..fbfae008ba44 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -455,24 +455,6 @@ void d_drop(struct dentry *dentry)
EXPORT_SYMBOL(d_drop);
/*
- * d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag
- * @dentry: dentry to drop
- *
- * This is called when we do a lookup on a placeholder dentry that needed to be
- * looked up. The dentry should have been hashed in order for it to be found by
- * the lookup code, but now needs to be unhashed while we do the actual lookup
- * and clear the DCACHE_NEED_LOOKUP flag.
- */
-void d_clear_need_lookup(struct dentry *dentry)
-{
- spin_lock(&dentry->d_lock);
- __d_drop(dentry);
- dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
- spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL(d_clear_need_lookup);
-
-/*
* Finish off a dentry we've decided to kill.
* dentry->d_lock must be held, returns with it unlocked.
* If ref is non-zero, then decrement the refcount too.
@@ -565,13 +547,7 @@ repeat:
if (d_unhashed(dentry))
goto kill_it;
- /*
- * If this dentry needs lookup, don't set the referenced flag so that it
- * is more likely to be cleaned up by the dcache shrinker in case of
- * memory pressure.
- */
- if (!d_need_lookup(dentry))
- dentry->d_flags |= DCACHE_REFERENCED;
+ dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
dentry->d_count--;
@@ -699,11 +675,10 @@ EXPORT_SYMBOL(dget_parent);
static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
{
struct dentry *alias, *discon_alias;
- struct hlist_node *p;
again:
discon_alias = NULL;
- hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {
spin_lock(&alias->d_lock);
if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
if (IS_ROOT(alias) &&
@@ -754,10 +729,9 @@ EXPORT_SYMBOL(d_find_alias);
void d_prune_aliases(struct inode *inode)
{
struct dentry *dentry;
- struct hlist_node *p;
restart:
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
spin_lock(&dentry->d_lock);
if (!dentry->d_count) {
__dget_dlock(dentry);
@@ -1382,6 +1356,7 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
DCACHE_OP_COMPARE |
DCACHE_OP_REVALIDATE |
+ DCACHE_OP_WEAK_REVALIDATE |
DCACHE_OP_DELETE ));
dentry->d_op = op;
if (!op)
@@ -1392,6 +1367,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
dentry->d_flags |= DCACHE_OP_COMPARE;
if (op->d_revalidate)
dentry->d_flags |= DCACHE_OP_REVALIDATE;
+ if (op->d_weak_revalidate)
+ dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
if (op->d_delete)
dentry->d_flags |= DCACHE_OP_DELETE;
if (op->d_prune)
@@ -1464,14 +1441,13 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
int len = entry->d_name.len;
const char *name = entry->d_name.name;
unsigned int hash = entry->d_name.hash;
- struct hlist_node *p;
if (!inode) {
__d_instantiate(entry, NULL);
return NULL;
}
- hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {
/*
* Don't need alias->d_lock here, because aliases with
* d_parent == entry->d_parent are not subject to name or
@@ -1583,7 +1559,7 @@ EXPORT_SYMBOL(d_find_any_alias);
*/
struct dentry *d_obtain_alias(struct inode *inode)
{
- static const struct qstr anonstring = { .name = "" };
+ static const struct qstr anonstring = QSTR_INIT("/", 1);
struct dentry *tmp;
struct dentry *res;
@@ -1696,7 +1672,6 @@ EXPORT_SYMBOL(d_splice_alias);
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
struct qstr *name)
{
- int error;
struct dentry *found;
struct dentry *new;
@@ -1705,10 +1680,12 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
* if not go ahead and create it now.
*/
found = d_hash_and_lookup(dentry->d_parent, name);
+ if (unlikely(IS_ERR(found)))
+ goto err_out;
if (!found) {
new = d_alloc(dentry->d_parent, name);
if (!new) {
- error = -ENOMEM;
+ found = ERR_PTR(-ENOMEM);
goto err_out;
}
@@ -1737,13 +1714,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
}
/*
- * We are going to instantiate this dentry, unhash it and clear the
- * lookup flag so we can do that.
- */
- if (unlikely(d_need_lookup(found)))
- d_clear_need_lookup(found);
-
- /*
* Negative dentry: instantiate it unless the inode is a directory and
* already has a dentry.
*/
@@ -1756,7 +1726,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
err_out:
iput(inode);
- return ERR_PTR(error);
+ return found;
}
EXPORT_SYMBOL(d_add_ci);
@@ -1920,7 +1890,7 @@ seqretry:
* dentry is returned. The caller must use dput to free the entry when it has
* finished using it. %NULL is returned if the dentry does not exist.
*/
-struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
+struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
{
struct dentry *dentry;
unsigned seq;
@@ -1950,7 +1920,7 @@ EXPORT_SYMBOL(d_lookup);
*
* __d_lookup callers must be commented.
*/
-struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
+struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
unsigned int len = name->len;
unsigned int hash = name->hash;
@@ -2028,12 +1998,10 @@ next:
* @dir: Directory to search in
* @name: qstr of name we wish to find
*
- * On hash failure or on lookup failure NULL is returned.
+ * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
*/
struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
{
- struct dentry *dentry = NULL;
-
/*
* Check for a fs-specific hash function. Note that we must
* calculate the standard hash first, as the d_op->d_hash()
@@ -2041,13 +2009,13 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
*/
name->hash = full_name_hash(name->name, name->len);
if (dir->d_flags & DCACHE_OP_HASH) {
- if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
- goto out;
+ int err = dir->d_op->d_hash(dir, dir->d_inode, name);
+ if (unlikely(err < 0))
+ return ERR_PTR(err);
}
- dentry = d_lookup(dir, name);
-out:
- return dentry;
+ return d_lookup(dir, name);
}
+EXPORT_SYMBOL(d_hash_and_lookup);
/**
* d_validate - verify dentry provided from insecure source (deprecated)
@@ -2425,7 +2393,7 @@ out_err:
*/
static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
{
- struct dentry *dparent, *aparent;
+ struct dentry *dparent;
dentry_lock_for_move(anon, dentry);
@@ -2433,24 +2401,15 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
write_seqcount_begin(&anon->d_seq);
dparent = dentry->d_parent;
- aparent = anon->d_parent;
switch_names(dentry, anon);
swap(dentry->d_name.hash, anon->d_name.hash);
- dentry->d_parent = (aparent == anon) ? dentry : aparent;
- list_del(&dentry->d_u.d_child);
- if (!IS_ROOT(dentry))
- list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
- else
- INIT_LIST_HEAD(&dentry->d_u.d_child);
-
- anon->d_parent = (dparent == dentry) ? anon : dparent;
+ dentry->d_parent = dentry;
+ list_del_init(&dentry->d_u.d_child);
+ anon->d_parent = dparent;
list_del(&anon->d_u.d_child);
- if (!IS_ROOT(anon))
- list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs);
- else
- INIT_LIST_HEAD(&anon->d_u.d_child);
+ list_add(&anon->d_u.d_child, &dparent->d_subdirs);
write_seqcount_end(&dentry->d_seq);
write_seqcount_end(&anon->d_seq);
@@ -2753,37 +2712,6 @@ char *d_path(const struct path *path, char *buf, int buflen)
}
EXPORT_SYMBOL(d_path);
-/**
- * d_path_with_unreachable - return the path of a dentry
- * @path: path to report
- * @buf: buffer to return value in
- * @buflen: buffer length
- *
- * The difference from d_path() is that this prepends "(unreachable)"
- * to paths which are unreachable from the current process' root.
- */
-char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
-{
- char *res = buf + buflen;
- struct path root;
- int error;
-
- if (path->dentry->d_op && path->dentry->d_op->d_dname)
- return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
-
- get_fs_root(current->fs, &root);
- write_seqlock(&rename_lock);
- error = path_with_deleted(path, &root, &res, &buflen);
- if (error > 0)
- error = prepend_unreachable(&res, &buflen);
- write_sequnlock(&rename_lock);
- path_put(&root);
- if (error)
- res = ERR_PTR(error);
-
- return res;
-}
-
/*
* Helper function for dentry_operations.d_dname() members
*/
@@ -3066,7 +2994,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name)
ino_t ino = 0;
dentry = d_hash_and_lookup(dir, name);
- if (dentry) {
+ if (!IS_ERR_OR_NULL(dentry)) {
if (dentry->d_inode)
ino = dentry->d_inode->i_ino;
dput(dentry);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 153bb1e42e63..0c4f80b447fb 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -176,7 +176,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
opts->uid = uid;
break;
case Opt_gid:
- if (match_octal(&args[0], &option))
+ if (match_int(&args[0], &option))
return -EINVAL;
gid = make_kgid(current_user_ns(), option);
if (!gid_valid(gid))
@@ -322,7 +322,6 @@ static struct dentry *__create_file(const char *name, umode_t mode,
if (!parent)
parent = debugfs_mount->mnt_root;
- dentry = NULL;
mutex_lock(&parent->d_inode->i_mutex);
dentry = lookup_one_len(name, parent, strlen(name));
if (!IS_ERR(dentry)) {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 472e6befc54d..073d30b9d1ac 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -243,6 +243,13 @@ static int mknod_ptmx(struct super_block *sb)
struct dentry *root = sb->s_root;
struct pts_fs_info *fsi = DEVPTS_SB(sb);
struct pts_mount_opts *opts = &fsi->mount_opts;
+ kuid_t root_uid;
+ kgid_t root_gid;
+
+ root_uid = make_kuid(current_user_ns(), 0);
+ root_gid = make_kgid(current_user_ns(), 0);
+ if (!uid_valid(root_uid) || !gid_valid(root_gid))
+ return -EINVAL;
mutex_lock(&root->d_inode->i_mutex);
@@ -273,6 +280,8 @@ static int mknod_ptmx(struct super_block *sb)
mode = S_IFCHR|opts->ptmxmode;
init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
+ inode->i_uid = root_uid;
+ inode->i_gid = root_gid;
d_add(dentry, inode);
@@ -438,6 +447,12 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
if (error)
return ERR_PTR(error);
+ /* Require newinstance for all user namespace mounts to ensure
+ * the mount options are not changed.
+ */
+ if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
+ return ERR_PTR(-EINVAL);
+
if (opts.newinstance)
s = sget(fs_type, NULL, set_anon_super, flags, NULL);
else
@@ -491,6 +506,9 @@ static struct file_system_type devpts_fs_type = {
.name = "devpts",
.mount = devpts_mount,
.kill_sb = devpts_kill_sb,
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+ .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
+#endif
};
/*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index cf5b44b10c67..f853263cf74f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -261,9 +261,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
dio->end_io(dio->iocb, offset, transferred,
dio->private, ret, is_async);
} else {
+ inode_dio_done(dio->inode);
if (is_async)
aio_complete(dio->iocb, ret, 0);
- inode_dio_done(dio->inode);
}
return ret;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index a0387dd8b1f0..7d58d5b112b5 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -158,7 +158,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
unsigned int x;
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
x = simple_strtoul(buf, NULL, 0);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 77c0f70f8fe8..e7665c31f7b1 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -96,10 +96,13 @@ do { \
}
+#define DLM_RTF_SHRINK 0x00000001
+
struct dlm_rsbtable {
struct rb_root keep;
struct rb_root toss;
spinlock_t lock;
+ uint32_t flags;
};
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index a579f30f237d..1b1146670c4b 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1132,6 +1132,7 @@ static void toss_rsb(struct kref *kref)
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
r->res_toss_time = jiffies;
+ ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK;
if (r->res_lvbptr) {
dlm_free_lvb(r->res_lvbptr);
r->res_lvbptr = NULL;
@@ -1182,7 +1183,7 @@ static void detach_lkb(struct dlm_lkb *lkb)
static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
{
struct dlm_lkb *lkb;
- int rv, id;
+ int rv;
lkb = dlm_allocate_lkb(ls);
if (!lkb)
@@ -1198,19 +1199,13 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
mutex_init(&lkb->lkb_cb_mutex);
INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
- retry:
- rv = idr_pre_get(&ls->ls_lkbidr, GFP_NOFS);
- if (!rv)
- return -ENOMEM;
-
+ idr_preload(GFP_NOFS);
spin_lock(&ls->ls_lkbidr_spin);
- rv = idr_get_new_above(&ls->ls_lkbidr, lkb, 1, &id);
- if (!rv)
- lkb->lkb_id = id;
+ rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT);
+ if (rv >= 0)
+ lkb->lkb_id = rv;
spin_unlock(&ls->ls_lkbidr_spin);
-
- if (rv == -EAGAIN)
- goto retry;
+ idr_preload_end();
if (rv < 0) {
log_error(ls, "create_lkb idr error %d", rv);
@@ -1659,11 +1654,18 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
char *name;
int our_nodeid = dlm_our_nodeid();
int remote_count = 0;
+ int need_shrink = 0;
int i, len, rv;
memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);
spin_lock(&ls->ls_rsbtbl[b].lock);
+
+ if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) {
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ return;
+ }
+
for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {
next = rb_next(n);
r = rb_entry(n, struct dlm_rsb, res_hashnode);
@@ -1679,6 +1681,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
continue;
}
+ need_shrink = 1;
+
if (!time_after_eq(jiffies, r->res_toss_time +
dlm_config.ci_toss_secs * HZ)) {
continue;
@@ -1710,6 +1714,11 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
dlm_free_rsb(r);
}
+
+ if (need_shrink)
+ ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK;
+ else
+ ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK;
spin_unlock(&ls->ls_rsbtbl[b].lock);
/*
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 2e99fb0c9737..3ca79d3253b9 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -796,7 +796,6 @@ static int release_lockspace(struct dlm_ls *ls, int force)
*/
idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls);
- idr_remove_all(&ls->ls_lkbidr);
idr_destroy(&ls->ls_lkbidr);
/*
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index dd87a31bcc21..4f5ad246582f 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -177,12 +177,11 @@ static inline int nodeid_hash(int nodeid)
static struct connection *__find_con(int nodeid)
{
int r;
- struct hlist_node *h;
struct connection *con;
r = nodeid_hash(nodeid);
- hlist_for_each_entry(con, h, &connection_hash[r], list) {
+ hlist_for_each_entry(con, &connection_hash[r], list) {
if (con->nodeid == nodeid)
return con;
}
@@ -232,13 +231,12 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
static void foreach_conn(void (*conn_func)(struct connection *c))
{
int i;
- struct hlist_node *h, *n;
+ struct hlist_node *n;
struct connection *con;
for (i = 0; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){
+ hlist_for_each_entry_safe(con, n, &connection_hash[i], list)
conn_func(con);
- }
}
}
@@ -257,13 +255,12 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
static struct connection *assoc2con(int assoc_id)
{
int i;
- struct hlist_node *h;
struct connection *con;
mutex_lock(&connections_lock);
for (i = 0 ; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry(con, h, &connection_hash[i], list) {
+ hlist_for_each_entry(con, &connection_hash[i], list) {
if (con->sctp_assoc == assoc_id) {
mutex_unlock(&connections_lock);
return con;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index aedea28a86a1..a6bc63f6e31b 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -305,27 +305,26 @@ static int recover_idr_empty(struct dlm_ls *ls)
static int recover_idr_add(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
- int rv, id;
-
- rv = idr_pre_get(&ls->ls_recover_idr, GFP_NOFS);
- if (!rv)
- return -ENOMEM;
+ int rv;
+ idr_preload(GFP_NOFS);
spin_lock(&ls->ls_recover_idr_lock);
if (r->res_id) {
- spin_unlock(&ls->ls_recover_idr_lock);
- return -1;
- }
- rv = idr_get_new_above(&ls->ls_recover_idr, r, 1, &id);
- if (rv) {
- spin_unlock(&ls->ls_recover_idr_lock);
- return rv;
+ rv = -1;
+ goto out_unlock;
}
- r->res_id = id;
+ rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT);
+ if (rv < 0)
+ goto out_unlock;
+
+ r->res_id = rv;
ls->ls_recover_list_count++;
dlm_hold_rsb(r);
+ rv = 0;
+out_unlock:
spin_unlock(&ls->ls_recover_idr_lock);
- return 0;
+ idr_preload_end();
+ return rv;
}
static void recover_idr_del(struct dlm_rsb *r)
@@ -351,24 +350,21 @@ static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id)
return r;
}
-static int recover_idr_clear_rsb(int id, void *p, void *data)
+static void recover_idr_clear(struct dlm_ls *ls)
{
- struct dlm_ls *ls = data;
- struct dlm_rsb *r = p;
+ struct dlm_rsb *r;
+ int id;
- r->res_id = 0;
- r->res_recover_locks_count = 0;
- ls->ls_recover_list_count--;
+ spin_lock(&ls->ls_recover_idr_lock);
- dlm_put_rsb(r);
- return 0;
-}
+ idr_for_each_entry(&ls->ls_recover_idr, r, id) {
+ idr_remove(&ls->ls_recover_idr, id);
+ r->res_id = 0;
+ r->res_recover_locks_count = 0;
+ ls->ls_recover_list_count--;
-static void recover_idr_clear(struct dlm_ls *ls)
-{
- spin_lock(&ls->ls_recover_idr_lock);
- idr_for_each(&ls->ls_recover_idr, recover_idr_clear_rsb, ls);
- idr_remove_all(&ls->ls_recover_idr);
+ dlm_put_rsb(r);
+ }
if (ls->ls_recover_list_count != 0) {
log_error(ls, "warning: recover_list_count %d",
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 7ff49852b0cb..911649a47dd5 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf,
#endif
return -EINVAL;
-#ifdef CONFIG_COMPAT
- if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN)
-#else
+ /*
+ * can't compare against COMPAT/dlm_write_request32 because
+ * we don't yet know if is64bit is zero
+ */
if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
-#endif
return -EINVAL;
kbuf = kzalloc(count + 1, GFP_NOFS);
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index cc16562654de..e15ef38c24fa 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,6 @@
config ECRYPT_FS
- tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
+ tristate "eCrypt filesystem layer support"
+ depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
select CRYPTO_ECB
select CRYPTO_CBC
select CRYPTO_MD5
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index ea9931281557..a7b0c2dfb3db 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1935,7 +1935,7 @@ static const unsigned char filename_rev_map[256] = {
* @src: Source location for the filename to encode
* @src_size: Size of the source in bytes
*/
-void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
+static void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
unsigned char *src, size_t src_size)
{
size_t num_blocks;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index cfb4b9fed520..7e2c6f5d7985 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -509,6 +509,12 @@ ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)
return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt;
}
+static inline struct path *
+ecryptfs_dentry_to_lower_path(struct dentry *dentry)
+{
+ return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
+}
+
static inline void
ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
{
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index d45ba4568128..53acc9d0c138 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -118,7 +118,7 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
lower_file = ecryptfs_file_to_lower(file);
lower_file->f_pos = file->f_pos;
- inode = file->f_path.dentry->d_inode;
+ inode = file_inode(file);
memset(&buf, 0, sizeof(buf));
buf.dirent = dirent;
buf.dentry = file->f_path.dentry;
@@ -133,7 +133,7 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
goto out;
if (rc >= 0)
fsstack_copy_attr_atime(inode,
- lower_file->f_path.dentry->d_inode);
+ file_inode(lower_file));
out:
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index cc7709e7c508..e0f07fb6d56b 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1027,8 +1027,7 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat lower_stat;
int rc;
- rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
- ecryptfs_dentry_to_lower(dentry), &lower_stat);
+ rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat);
if (!rc) {
fsstack_copy_attr_all(dentry->d_inode,
ecryptfs_inode_to_lower(dentry->d_inode));
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index 809e67d05ca3..f1ea610362c6 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -102,12 +102,12 @@ int __init ecryptfs_init_kthread(void)
void ecryptfs_destroy_kthread(void)
{
- struct ecryptfs_open_req *req;
+ struct ecryptfs_open_req *req, *tmp;
mutex_lock(&ecryptfs_kthread_ctl.mux);
ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE;
- list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list,
- kthread_ctl_list) {
+ list_for_each_entry_safe(req, tmp, &ecryptfs_kthread_ctl.req_list,
+ kthread_ctl_list) {
list_del(&req->kthread_ctl_list);
*req->lower_file = ERR_PTR(-EIO);
complete(&req->done);
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 5fa2471796c2..8d7a577ae497 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -115,10 +115,9 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx)
*/
int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon)
{
- struct hlist_node *elem;
int rc;
- hlist_for_each_entry(*daemon, elem,
+ hlist_for_each_entry(*daemon,
&ecryptfs_daemon_hash[ecryptfs_current_euid_hash()],
euid_chain) {
if (uid_eq((*daemon)->file->f_cred->euid, current_euid())) {
@@ -445,7 +444,6 @@ void ecryptfs_release_messaging(void)
mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
}
if (ecryptfs_daemon_hash) {
- struct hlist_node *elem;
struct ecryptfs_daemon *daemon;
int i;
@@ -453,7 +451,7 @@ void ecryptfs_release_messaging(void)
for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
int rc;
- hlist_for_each_entry(daemon, elem,
+ hlist_for_each_entry(daemon,
&ecryptfs_daemon_hash[i],
euid_chain) {
rc = ecryptfs_exorcise_daemon(daemon);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index bd1d57f98f74..564a1fa34b99 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -338,7 +338,8 @@ static int ecryptfs_write_begin(struct file *file,
if (prev_page_end_size
>= i_size_read(page->mapping->host)) {
zero_user(page, 0, PAGE_CACHE_SIZE);
- } else {
+ SetPageUptodate(page);
+ } else if (len < PAGE_CACHE_SIZE) {
rc = ecryptfs_decrypt_page(page);
if (rc) {
printk(KERN_ERR "%s: Error decrypting "
@@ -348,8 +349,8 @@ static int ecryptfs_write_begin(struct file *file,
ClearPageUptodate(page);
goto out;
}
+ SetPageUptodate(page);
}
- SetPageUptodate(page);
}
}
/* If creating a page or more of holes, zero them out via truncate.
@@ -499,6 +500,13 @@ static int ecryptfs_write_end(struct file *file,
}
goto out;
}
+ if (!PageUptodate(page)) {
+ if (copied < PAGE_CACHE_SIZE) {
+ rc = 0;
+ goto out;
+ }
+ SetPageUptodate(page);
+ }
/* Fills in zeros if 'to' goes beyond inode size */
rc = fill_zeros_to_end_of_page(page, to);
if (rc) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index b2a34a192f4f..6a160539cd23 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -40,16 +40,12 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
loff_t offset, size_t size)
{
struct file *lower_file;
- mm_segment_t fs_save;
ssize_t rc;
lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
if (!lower_file)
return -EIO;
- fs_save = get_fs();
- set_fs(get_ds());
- rc = vfs_write(lower_file, data, size, &offset);
- set_fs(fs_save);
+ rc = kernel_write(lower_file, data, size, offset);
mark_inode_dirty_sync(ecryptfs_inode);
return rc;
}
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
index 6ebfc1c207a8..d020e3c30fea 100644
--- a/fs/efs/Kconfig
+++ b/fs/efs/Kconfig
@@ -1,6 +1,6 @@
config EFS_FS
- tristate "EFS file system support (read only) (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "EFS file system support (read only)"
+ depends on BLOCK
help
EFS is an older file system used for non-ISO9660 CD-ROMs and hard
disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 7ee6f7e3a608..055a9e9ca747 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -20,7 +20,7 @@ const struct inode_operations efs_dir_inode_operations = {
};
static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct buffer_head *bh;
struct efs_dir *dirblock;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index be56b21435f8..9fec1836057a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1313,7 +1313,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* otherwise we might miss an event that happens between the
* f_op->poll() call and the new event set registering.
*/
- epi->event.events = event->events;
+ epi->event.events = event->events; /* need barrier below */
pt._key = event->events;
epi->event.data = event->data; /* protected by mtx */
if (epi->event.events & EPOLLWAKEUP) {
@@ -1324,6 +1324,26 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
}
/*
+ * The following barrier has two effects:
+ *
+ * 1) Flush epi changes above to other CPUs. This ensures
+ * we do not miss events from ep_poll_callback if an
+ * event occurs immediately after we call f_op->poll().
+ * We need this because we did not take ep->lock while
+ * changing epi above (but ep_poll_callback does take
+ * ep->lock).
+ *
+ * 2) We also need to ensure we do not miss _past_ events
+ * when calling f_op->poll(). This barrier also
+ * pairs with the barrier in wq_has_sleeper (see
+ * comments for wq_has_sleeper).
+ *
+ * This barrier will now guarantee ep_poll_callback or f_op->poll
+ * (or both) will notice the readiness of an item.
+ */
+ smp_mb();
+
+ /*
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
*/
diff --git a/fs/exec.c b/fs/exec.c
index d8e1191cb112..a96a4885bbbf 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -123,7 +123,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
goto out;
error = -EINVAL;
- if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
+ if (!S_ISREG(file_inode(file)->i_mode))
goto exit;
error = -EACCES;
@@ -355,7 +355,7 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
* flags, permissions, and offset, so we use temporary values. We'll update
* them later in setup_arg_pages().
*/
-int bprm_mm_init(struct linux_binprm *bprm)
+static int bprm_mm_init(struct linux_binprm *bprm)
{
int err;
struct mm_struct *mm = NULL;
@@ -434,8 +434,9 @@ static int count(struct user_arg_ptr argv, int max)
if (IS_ERR(p))
return -EFAULT;
- if (i++ >= max)
+ if (i >= max)
return -E2BIG;
+ ++i;
if (fatal_signal_pending(current))
return -ERESTARTNOHAND;
@@ -763,7 +764,7 @@ struct file *open_exec(const char *name)
goto out;
err = -EACCES;
- if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
+ if (!S_ISREG(file_inode(file)->i_mode))
goto exit;
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
@@ -1097,7 +1098,7 @@ EXPORT_SYMBOL(flush_old_exec);
void would_dump(struct linux_binprm *bprm, struct file *file)
{
- if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
+ if (inode_permission(file_inode(file), MAY_READ) < 0)
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
}
EXPORT_SYMBOL(would_dump);
@@ -1110,7 +1111,7 @@ void setup_new_exec(struct linux_binprm * bprm)
current->sas_ss_sp = current->sas_ss_size = 0;
if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
- set_dumpable(current->mm, SUID_DUMPABLE_ENABLED);
+ set_dumpable(current->mm, SUID_DUMP_USER);
else
set_dumpable(current->mm, suid_dumpable);
@@ -1175,9 +1176,24 @@ void free_bprm(struct linux_binprm *bprm)
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
+ /* If a binfmt changed the interp, free it. */
+ if (bprm->interp != bprm->filename)
+ kfree(bprm->interp);
kfree(bprm);
}
+int bprm_change_interp(char *interp, struct linux_binprm *bprm)
+{
+ /* If a binfmt changed the interp, free it first. */
+ if (bprm->interp != bprm->filename)
+ kfree(bprm->interp);
+ bprm->interp = kstrdup(interp, GFP_KERNEL);
+ if (!bprm->interp)
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL(bprm_change_interp);
+
/*
* install the new credentials for this executable
*/
@@ -1254,7 +1270,7 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
int prepare_binprm(struct linux_binprm *bprm)
{
umode_t mode;
- struct inode * inode = bprm->file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(bprm->file);
int retval;
mode = inode->i_mode;
@@ -1623,17 +1639,17 @@ EXPORT_SYMBOL(set_binfmt);
void set_dumpable(struct mm_struct *mm, int value)
{
switch (value) {
- case SUID_DUMPABLE_DISABLED:
+ case SUID_DUMP_DISABLE:
clear_bit(MMF_DUMPABLE, &mm->flags);
smp_wmb();
clear_bit(MMF_DUMP_SECURELY, &mm->flags);
break;
- case SUID_DUMPABLE_ENABLED:
+ case SUID_DUMP_USER:
set_bit(MMF_DUMPABLE, &mm->flags);
smp_wmb();
clear_bit(MMF_DUMP_SECURELY, &mm->flags);
break;
- case SUID_DUMPABLE_SAFE:
+ case SUID_DUMP_ROOT:
set_bit(MMF_DUMP_SECURELY, &mm->flags);
smp_wmb();
set_bit(MMF_DUMPABLE, &mm->flags);
@@ -1646,7 +1662,7 @@ int __get_dumpable(unsigned long mm_flags)
int ret;
ret = mm_flags & MMF_DUMPABLE_MASK;
- return (ret > SUID_DUMPABLE_ENABLED) ? SUID_DUMPABLE_SAFE : ret;
+ return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret;
}
int get_dumpable(struct mm_struct *mm)
@@ -1654,7 +1670,6 @@ int get_dumpable(struct mm_struct *mm)
return __get_dumpable(mm->flags);
}
-#ifdef __ARCH_WANT_SYS_EXECVE
SYSCALL_DEFINE3(execve,
const char __user *, filename,
const char __user *const __user *, argv,
@@ -1682,23 +1697,3 @@ asmlinkage long compat_sys_execve(const char __user * filename,
return error;
}
#endif
-#endif
-
-#ifdef __ARCH_WANT_KERNEL_EXECVE
-int kernel_execve(const char *filename,
- const char *const argv[],
- const char *const envp[])
-{
- int ret = do_execve(filename,
- (const char __user *const __user *)argv,
- (const char __user *const __user *)envp);
- if (ret < 0)
- return ret;
-
- /*
- * We were successful. We won't be returning to our caller, but
- * instead to user space by manipulating the kernel stack.
- */
- ret_from_kernel_execve(current_pt_regs());
-}
-#endif
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index c61e62ac231c..46375896cfc0 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -242,7 +242,7 @@ static int
exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
loff_t pos = filp->f_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 606bb074c501..262fc9940982 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -44,14 +44,13 @@ find_acceptable_alias(struct dentry *result,
{
struct dentry *dentry, *toput = NULL;
struct inode *inode;
- struct hlist_node *p;
if (acceptable(context, result))
return result;
inode = result->d_inode;
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
dget(dentry);
spin_unlock(&inode->i_lock);
if (toput)
@@ -322,10 +321,10 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,
if (parent && (len < 4)) {
*max_len = 4;
- return 255;
+ return FILEID_INVALID;
} else if (len < 2) {
*max_len = 2;
- return 255;
+ return FILEID_INVALID;
}
len = 2;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 2616d0ea5c5c..9f9992b37924 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -159,15 +159,6 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
return bh;
}
-static void release_blocks(struct super_block *sb, int count)
-{
- if (count) {
- struct ext2_sb_info *sbi = EXT2_SB(sb);
-
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
- }
-}
-
static void group_adjust_blocks(struct super_block *sb, int group_no,
struct ext2_group_desc *desc, struct buffer_head *bh, int count)
{
@@ -568,8 +559,11 @@ do_more:
}
error_return:
brelse(bitmap_bh);
- release_blocks(sb, freed);
- dquot_free_block_nodirty(inode, freed);
+ if (freed) {
+ percpu_counter_add(&sbi->s_freeblocks_counter, freed);
+ dquot_free_block_nodirty(inode, freed);
+ mark_inode_dirty(inode);
+ }
}
/**
@@ -1239,10 +1233,6 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
*errp = -ENOSPC;
sb = inode->i_sb;
- if (!sb) {
- printk("ext2_new_blocks: nonexistent device");
- return 0;
- }
/*
* Check quota for allocation of this block.
@@ -1416,9 +1406,11 @@ allocated:
*errp = 0;
brelse(bitmap_bh);
- dquot_free_block_nodirty(inode, *count-num);
- mark_inode_dirty(inode);
- *count = num;
+ if (num < *count) {
+ dquot_free_block_nodirty(inode, *count-num);
+ mark_inode_dirty(inode);
+ *count = num;
+ }
return ret_block;
io_error:
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 0f4f5c929257..4237722bfd27 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -290,7 +290,7 @@ static int
ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
{
loff_t pos = filp->f_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6363ac66fafa..c3881e56662e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -495,6 +495,10 @@ static int ext2_alloc_branch(struct inode *inode,
* parent to disk.
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
branch[n].bh = bh;
lock_buffer(bh);
memset(bh->b_data, 0, blocksize);
@@ -523,6 +527,14 @@ static int ext2_alloc_branch(struct inode *inode,
}
*blks = num;
return err;
+
+failed:
+ for (i = 1; i < n; i++)
+ bforget(branch[i].bh);
+ for (i = 0; i < indirect_blks; i++)
+ ext2_free_blocks(inode, new_blocks[i], 1);
+ ext2_free_blocks(inode, new_blocks[i], num);
+ return err;
}
/**
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 2de655f5d625..5d46c09863f0 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -19,7 +19,7 @@
long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ext2_inode_info *ei = EXT2_I(inode);
unsigned int flags;
unsigned short rsv_window_size;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index fa04d023177e..7f68c8114026 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1500,7 +1500,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
bh = sb_bread(sb, tmp_bh.b_blocknr);
else
bh = sb_getblk(sb, tmp_bh.b_blocknr);
- if (!bh) {
+ if (unlikely(!bh)) {
err = -EIO;
goto out;
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index b6754dbbce3c..2d7557db3ae8 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -662,10 +662,10 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
ext2_free_blocks(inode, block, 1);
mark_inode_dirty(inode);
- error = -EIO;
+ error = -ENOMEM;
goto cleanup;
}
lock_buffer(new_bh);
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index dd91264ba94f..87eccbbca255 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -99,7 +99,7 @@ static int ext3_readdir(struct file * filp,
int i, stored;
struct ext3_dir_entry_2 *de;
int err;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
int ret = 0;
int dir_has_error = 0;
@@ -114,7 +114,7 @@ static int ext3_readdir(struct file * filp,
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
+ EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL;
}
stored = 0;
offset = filp->f_pos & (sb->s_blocksize - 1);
@@ -457,7 +457,7 @@ static int call_filldir(struct file * filp, void * dirent,
{
struct dir_private_info *info = filp->private_data;
loff_t curr_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block * sb;
int error;
@@ -487,7 +487,7 @@ static int ext3_dx_readdir(struct file * filp,
void * dirent, filldir_t filldir)
{
struct dir_private_info *info = filp->private_data;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct fname *fname;
int ret;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b176d4253544..d512c4bc4ad7 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -676,6 +676,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
* parent to disk.
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
branch[n].bh = bh;
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
@@ -717,7 +721,7 @@ failed:
BUFFER_TRACE(branch[i].bh, "call journal_forget");
ext3_journal_forget(handle, branch[i].bh);
}
- for (i = 0; i <indirect_blks; i++)
+ for (i = 0; i < indirect_blks; i++)
ext3_free_blocks(handle, inode, new_blocks[i], 1);
ext3_free_blocks(handle, inode, new_blocks[i], num);
@@ -1078,8 +1082,8 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
if (!err && buffer_mapped(&dummy)) {
struct buffer_head *bh;
bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
goto err;
}
if (buffer_new(&dummy)) {
@@ -2729,12 +2733,12 @@ static int __ext3_get_inode_loc(struct inode *inode,
return -EIO;
bh = sb_getblk(inode->i_sb, block);
- if (!bh) {
+ if (unlikely(!bh)) {
ext3_error (inode->i_sb, "ext3_get_inode_loc",
"unable to read inode block - "
"inode=%lu, block="E3FSBLK,
inode->i_ino, block);
- return -EIO;
+ return -ENOMEM;
}
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -2783,7 +2787,7 @@ static int __ext3_get_inode_loc(struct inode *inode,
bitmap_bh = sb_getblk(inode->i_sb,
le32_to_cpu(desc->bg_inode_bitmap));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 677a5c27dc69..4d96e9a64532 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -14,7 +14,7 @@
long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ext3_inode_info *ei = EXT3_I(inode);
unsigned int flags;
unsigned short rsv_window_size;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 890b8947c546..692de13e3596 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -36,7 +36,6 @@
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static struct buffer_head *ext3_append(handle_t *handle,
struct inode *inode,
@@ -624,7 +623,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
start_minor_hash));
- dir = dir_file->f_path.dentry->d_inode;
+ dir = file_inode(dir_file);
if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
@@ -638,7 +637,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
hinfo.hash = start_hash;
hinfo.minor_hash = 0;
- frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
+ frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err);
if (!frame)
return err;
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 0f814f3450de..27105655502c 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -116,8 +116,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
int err;
bh = sb_getblk(sb, blk);
- if (!bh)
- return ERR_PTR(-EIO);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
if ((err = ext3_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -234,8 +234,8 @@ static int setup_new_group_blocks(struct super_block *sb,
goto exit_bh;
gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
+ if (unlikely(!gdb)) {
+ err = -ENOMEM;
goto exit_bh;
}
if ((err = ext3_journal_get_write_access(handle, gdb))) {
@@ -722,8 +722,8 @@ static void update_backups(struct super_block *sb,
break;
bh = sb_getblk(sb, group * bpg + blk_off);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
break;
}
ext3_debug("update metadata backup %#04lx\n",
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6e50223b3299..5546ca225ffe 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -916,21 +916,24 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"Not enough memory for storing quotafile name");
return 0;
}
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext3_msg(sb, KERN_ERR,
- "%s quota file already specified", QTYPE2NAME(qtype));
+ if (sbi->s_qf_names[qtype]) {
+ int same = !strcmp(sbi->s_qf_names[qtype], qname);
+
kfree(qname);
- return 0;
+ if (!same) {
+ ext3_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ }
+ return same;
}
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
+ if (strchr(qname, '/')) {
ext3_msg(sb, KERN_ERR,
"quotafile must be on filesystem root");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
+ kfree(qname);
return 0;
}
+ sbi->s_qf_names[qtype] = qname;
set_opt(sbi->s_mount_opt, QUOTA);
return 1;
}
@@ -945,11 +948,10 @@ static int clear_qf_name(struct super_block *sb, int qtype) {
" when quota turned on");
return 0;
}
- /*
- * The space will be released later when all options are confirmed
- * to be correct
- */
- sbi->s_qf_names[qtype] = NULL;
+ if (sbi->s_qf_names[qtype]) {
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ }
return 1;
}
#endif
@@ -2065,6 +2067,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
+ sb->s_flags |= MS_SNAP_STABLE;
return 0;
@@ -2605,7 +2608,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
- old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+ if (sbi->s_qf_names[i]) {
+ old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!old_opts.s_qf_names[i]) {
+ int j;
+
+ for (j = 0; j < i; j++)
+ kfree(old_opts.s_qf_names[j]);
+ return -ENOMEM;
+ }
+ } else
+ old_opts.s_qf_names[i] = NULL;
#endif
/*
@@ -2698,9 +2712,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- if (old_opts.s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(old_opts.s_qf_names[i]);
+ kfree(old_opts.s_qf_names[i]);
#endif
if (enable_quota)
dquot_resume(sb, -1);
@@ -2714,9 +2726,7 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(sbi->s_qf_names[i]);
+ kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index d22ebb7a4f55..b1fc96383e08 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -813,10 +813,10 @@ inserted:
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
getblk_failed:
ext3_free_blocks(handle, inode, block, 1);
- error = -EIO;
+ error = -ENOMEM;
goto cleanup;
}
lock_buffer(new_bh);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 0a475c881852..987358740cb9 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -41,6 +41,7 @@ config EXT4_USE_FOR_EXT23
config EXT4_FS_POSIX_ACL
bool "Ext4 POSIX Access Control Lists"
+ depends on EXT4_FS
select FS_POSIX_ACL
help
POSIX Access Control Lists (ACLs) support permissions for users and
@@ -53,6 +54,7 @@ config EXT4_FS_POSIX_ACL
config EXT4_FS_SECURITY
bool "Ext4 Security Labels"
+ depends on EXT4_FS
help
Security labels support alternative access control models
implemented by security modules like SELinux. This option
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index e6e0d988439b..39a54a0e9fe4 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -324,8 +324,8 @@ ext4_acl_chmod(struct inode *inode)
if (error)
return error;
retry:
- handle = ext4_journal_start(inode,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR,
+ ext4_jbd2_credits_xattr(inode));
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
ext4_std_error(inode->i_sb, error);
@@ -422,7 +422,8 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
acl = NULL;
retry:
- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR,
+ ext4_jbd2_credits_xattr(inode));
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto release_and_out;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cf1821784a16..92e68b33fffd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -358,7 +358,7 @@ void ext4_validate_block_bitmap(struct super_block *sb,
}
/**
- * ext4_read_block_bitmap()
+ * ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
*
@@ -457,6 +457,8 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh;
bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ if (!bh)
+ return NULL;
if (ext4_wait_block_bitmap(sb, block_group, bh)) {
put_bh(bh);
return NULL;
@@ -482,11 +484,16 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
free_clusters = percpu_counter_read_positive(fcc);
dirty_clusters = percpu_counter_read_positive(dcc);
- root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es));
+
+ /*
+ * r_blocks_count should always be multiple of the cluster ratio so
+ * we are safe to do a plane bit shift only.
+ */
+ root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
EXT4_FREECLUSTERS_WATERMARK) {
- free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc));
+ free_clusters = percpu_counter_sum_positive(fcc);
dirty_clusters = percpu_counter_sum_positive(dcc);
}
/* Check whether we have space after accounting for current
@@ -628,7 +635,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
brelse(bitmap_bh);
printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
", computed = %llu, %llu\n",
- EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
+ EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
desc_count, bitmap_count);
return bitmap_count;
#else
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 80a28b297279..d8cd1f0f4661 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -110,7 +110,7 @@ static int ext4_readdir(struct file *filp,
int i, stored;
struct ext4_dir_entry_2 *de;
int err;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
int ret = 0;
int dir_has_error = 0;
@@ -133,7 +133,7 @@ static int ext4_readdir(struct file *filp,
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
+ ext4_clear_inode_flag(file_inode(filp),
EXT4_INODE_INDEX);
}
stored = 0;
@@ -185,6 +185,7 @@ static int ext4_readdir(struct file *filp,
"at offset %llu",
(unsigned long long)filp->f_pos);
filp->f_pos += sb->s_blocksize - offset;
+ brelse(bh);
continue;
}
set_buffer_verified(bh);
@@ -333,7 +334,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
*
* For non-htree, ext4_llseek already chooses the proper max offset.
*/
-loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
+static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int dx_dir = is_dx_dir(inode);
@@ -494,7 +495,7 @@ static int call_filldir(struct file *filp, void *dirent,
{
struct dir_private_info *info = filp->private_data;
loff_t curr_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb;
int error;
@@ -526,7 +527,7 @@ static int ext4_dx_readdir(struct file *filp,
void *dirent, filldir_t filldir)
{
struct dir_private_info *info = filp->private_data;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct fname *fname;
int ret;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8462eb3c33aa..4a01ba315262 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -194,8 +194,7 @@ struct mpage_da_data {
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
#define EXT4_IO_END_ERROR 0x0002
-#define EXT4_IO_END_QUEUED 0x0004
-#define EXT4_IO_END_DIRECT 0x0008
+#define EXT4_IO_END_DIRECT 0x0004
struct ext4_io_page {
struct page *p_page;
@@ -215,10 +214,8 @@ typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
struct inode *inode; /* file being written to */
unsigned int flag; /* unwritten or not */
- struct page *page; /* for writepage() path */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- struct work_struct work; /* data work queue */
struct kiocb *iocb; /* iocb struct for AIO */
int result; /* error value for AIO */
int num_io_pages; /* for writepages() */
@@ -582,6 +579,8 @@ enum {
#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
/* Do not take i_data_sem locking in ext4_map_blocks */
#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
+ /* Do not put hole in extent cache */
+#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
/*
* Flags used by ext4_free_blocks
@@ -810,17 +809,6 @@ do { \
#endif /* defined(__KERNEL__) || defined(__linux__) */
-/*
- * storage for cached extent
- * If ec_len == 0, then the cache is invalid.
- * If ec_start == 0, then the cache represents a gap (null mapping)
- */
-struct ext4_ext_cache {
- ext4_fsblk_t ec_start;
- ext4_lblk_t ec_block;
- __u32 ec_len; /* must be 32bit to return holes */
-};
-
#include "extents_status.h"
/*
@@ -887,7 +875,6 @@ struct ext4_inode_info {
struct inode vfs_inode;
struct jbd2_inode *jinode;
- struct ext4_ext_cache i_cached_extent;
/*
* File creation time. Its function is same as that of
* struct timespec i_{a,c,m}time in the generic inode.
@@ -901,6 +888,8 @@ struct ext4_inode_info {
/* extents status tree */
struct ext4_es_tree i_es_tree;
rwlock_t i_es_lock;
+ struct list_head i_es_lru;
+ unsigned int i_es_lru_nr; /* protected by i_es_lock */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -930,6 +919,7 @@ struct ext4_inode_info {
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
+ struct work_struct i_unwritten_work; /* deferred extent conversion */
spinlock_t i_block_reservation_lock;
@@ -985,7 +975,6 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1316,6 +1305,12 @@ struct ext4_sb_info {
/* Precomputed FS UUID checksum for seeding other checksums */
__u32 s_csum_seed;
+
+ /* Reclaim extents from extent status tree */
+ struct shrinker s_es_shrinker;
+ struct list_head s_es_lru;
+ struct percpu_counter s_extent_cache_cnt;
+ spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -2007,9 +2002,20 @@ extern int ext4fs_dirhash(const char *name, int len, struct
dx_hash_info *hinfo);
/* ialloc.c */
-extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t,
- const struct qstr *qstr, __u32 goal,
- uid_t *owner);
+extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
+ const struct qstr *qstr, __u32 goal,
+ uid_t *owner, int handle_type,
+ unsigned int line_no, int nblocks);
+
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \
+ __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
+ 0, 0, 0)
+#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
+ type, nblocks) \
+ __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
+ (type), __LINE__, (nblocks))
+
+
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -2103,6 +2109,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
extern void ext4_ind_truncate(struct inode *inode);
+extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2151,6 +2158,8 @@ extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern void ext4_kvfree(void *ptr);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
+extern const char *ext4_decode_error(struct super_block *sb, int errno,
+ char nbuf[16]);
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
const char *, ...);
@@ -2227,6 +2236,8 @@ extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
struct ext4_group_desc *gdp);
extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
struct ext4_group_desc *gdp);
+extern int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed);
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
@@ -2454,6 +2465,75 @@ extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
extern void ext4_unwritten_wait(struct inode *inode);
+/* inline.c */
+extern int ext4_has_inline_data(struct inode *inode);
+extern int ext4_get_inline_size(struct inode *inode);
+extern int ext4_get_max_inline_size(struct inode *inode);
+extern int ext4_find_inline_data_nolock(struct inode *inode);
+extern void ext4_write_inline_data(struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *buffer, loff_t pos,
+ unsigned int len);
+extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len);
+extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len);
+extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+
+extern int ext4_readpage_inline(struct inode *inode, struct page *page);
+extern int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep);
+extern int ext4_write_inline_data_end(struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned copied,
+ struct page *page);
+extern struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+ unsigned len,
+ struct page *page);
+extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata);
+extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+ unsigned len, unsigned copied,
+ struct page *page);
+extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+extern int ext4_try_create_inline_dir(handle_t *handle,
+ struct inode *parent,
+ struct inode *inode);
+extern int ext4_read_inline_dir(struct file *filp,
+ void *dirent, filldir_t filldir,
+ int *has_inline_data);
+extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *has_inline_data);
+extern int ext4_delete_inline_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ int *has_inline_data);
+extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
+extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+ struct ext4_dir_entry_2 **parent_de,
+ int *retval);
+extern int ext4_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ int *has_inline);
+extern int ext4_try_to_evict_inline_data(handle_t *handle,
+ struct inode *inode,
+ int needed);
+extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+
+extern int ext4_convert_inline_data(struct inode *inode);
+
/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
@@ -2520,6 +2600,9 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
struct ext4_ext_path *);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end);
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
@@ -2537,6 +2620,7 @@ extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern void ext4_end_io_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 487fda12bc00..8643ff5bbeb7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -193,12 +193,6 @@ static inline unsigned short ext_depth(struct inode *inode)
return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
}
-static inline void
-ext4_ext_invalidate_cache(struct inode *inode)
-{
- EXT4_I(inode)->i_cached_extent.ec_len = 0;
-}
-
static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
{
/* We can not have an uninitialized extent of zero length! */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b4323ba846b5..7058975e3a55 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,6 +6,108 @@
#include <trace/events/ext4.h>
+/*
+ * Take a reference on the "no journal" pseudo-handle.
+ *
+ * current->journal_info is treated as an integer reference count rather
+ * than a real handle pointer: just increment that non-pointer value,
+ * store it back in current->journal_info and return it cast to
+ * handle_t *.  BUG()s if the count has already reached
+ * EXT4_NOJOURNAL_MAX_REF_COUNT.
+ */
+static handle_t *ext4_get_nojournal(void)
+{
+ handle_t *handle = current->journal_info;
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+ ref_cnt++;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+ return handle;
+}
+
+
+/*
+ * Drop a reference on the "no journal" pseudo-handle.
+ *
+ * @handle is not a real pointer but a reference count cast to
+ * handle_t *: decrement that non-pointer value and store the result in
+ * current->journal_info.  BUG()s if the count is already zero.
+ */
+static void ext4_put_nojournal(handle_t *handle)
+{
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt == 0);
+
+ ref_cnt--;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+}
+
+/*
+ * Wrappers for jbd2_journal_start/end.
+ *
+ * __ext4_journal_start_sb() starts a handle on the journal of @sb.
+ * @nblocks, @type and @line are handed unchanged to
+ * jbd2__journal_start(), together with GFP_NOFS.
+ *
+ * Returns:
+ *   ERR_PTR(-EROFS) if @sb has MS_RDONLY set, or if its journal is found
+ *     to be aborted (in which case ext4_abort() is called first);
+ *   the reference-counted pseudo-handle from ext4_get_nojournal() if
+ *     @sb has no journal;
+ *   otherwise whatever jbd2__journal_start() returns.
+ *
+ * A WARN_ON() fires if the superblock's freeze state is
+ * SB_FREEZE_COMPLETE when a handle is requested.
+ */
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int nblocks)
+{
+ journal_t *journal;
+
+ trace_ext4_journal_start(sb, nblocks, _RET_IP_);
+ if (sb->s_flags & MS_RDONLY)
+ return ERR_PTR(-EROFS);
+
+ WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
+ journal = EXT4_SB(sb)->s_journal;
+ if (!journal)
+ return ext4_get_nojournal();
+ /*
+ * Special case here: if the journal has aborted behind our
+ * backs (eg. EIO in the commit thread), then we still need to
+ * take the FS itself readonly cleanly.
+ */
+ if (is_journal_aborted(journal)) {
+ ext4_abort(sb, "Detected aborted journal");
+ return ERR_PTR(-EROFS);
+ }
+ return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+}
+/*
+ * Stop @handle.
+ *
+ * A handle for which ext4_handle_valid() is false is released with
+ * ext4_put_nojournal() and 0 is returned.  Otherwise the handle is
+ * stopped with jbd2_journal_stop(); the result is the error already
+ * recorded in handle->h_err or, if there is none, the return value of
+ * jbd2_journal_stop().  A non-zero result is also reported through
+ * __ext4_std_error() using @where and @line.
+ *
+ * NOTE(review): sb and h_err are read before jbd2_journal_stop(),
+ * presumably because @handle must not be dereferenced afterwards --
+ * confirm against jbd2.
+ */
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
+{
+ struct super_block *sb;
+ int err;
+ int rc;
+
+ if (!ext4_handle_valid(handle)) {
+ ext4_put_nojournal(handle);
+ return 0;
+ }
+ sb = handle->h_transaction->t_journal->j_private;
+ err = handle->h_err;
+ rc = jbd2_journal_stop(handle);
+
+ if (!err)
+ err = rc;
+ if (err)
+ __ext4_std_error(sb, where, line, err);
+ return err;
+}
+/*
+ * Record @err on @handle and abort the handle.
+ *
+ * @handle must be a valid handle (BUG otherwise).  @err is stored in
+ * handle->h_err only if no error has been recorded there yet.  If the
+ * handle is already aborted nothing more is done; otherwise a KERN_ERR
+ * message naming @caller, @line, the decoded error string and @err_fn
+ * is printed and jbd2_journal_abort_handle() is called.  @bh, when
+ * non-NULL, is only used for BUFFER_TRACE().
+ */
+void ext4_journal_abort_handle(const char *caller, unsigned int line,
+ const char *err_fn, struct buffer_head *bh,
+ handle_t *handle, int err)
+{
+ char nbuf[16];
+ const char *errstr = ext4_decode_error(NULL, err, nbuf);
+
+ BUG_ON(!ext4_handle_valid(handle));
+
+ if (bh)
+ BUFFER_TRACE(bh, "abort");
+
+ if (!handle->h_err)
+ handle->h_err = err;
+
+ if (is_handle_aborted(handle))
+ return;
+
+ printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
+ caller, line, errstr, err_fn);
+
+ jbd2_journal_abort_handle(handle);
+}
+
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 7177f9b21cb2..4c216b1bf20c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -59,12 +59,6 @@
#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
-/* Delete operations potentially hit one directory's namespace plus an
- * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
- * generous. We can grow the delete transaction later if necessary. */
-
-#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64)
-
/* Define an arbitrary limit for the amount of data we will anticipate
* writing to any given transaction. For unbounded transactions such as
* write(2) and truncate(2) we can write more than this, but we always
@@ -110,6 +104,36 @@
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+static inline int ext4_jbd2_credits_xattr(struct inode *inode)
+{
+ int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
+ /*
+ * The base cost is EXT4_DATA_TRANS_BLOCKS() for the inode's
+ * superblock.
+ *
+ * In case of inline data, we may push out the data to a block,
+ * so we need to reserve credits for this eventuality: the
+ * value of ext4_writepage_trans_blocks() for this inode, plus
+ * one.
+ */
+ if (ext4_has_inline_data(inode))
+ credits += ext4_writepage_trans_blocks(inode) + 1;
+ return credits;
+}
+
+
+/*
+ * Ext4 handle operation types -- for logging purposes
+ *
+ * One of these values is passed as the "type" argument of
+ * ext4_journal_start() / ext4_journal_start_sb().  EXT4_HT_MAX is one
+ * more than the highest defined type.
+ */
+#define EXT4_HT_MISC 0
+#define EXT4_HT_INODE 1
+#define EXT4_HT_WRITE_PAGE 2
+#define EXT4_HT_MAP_BLOCKS 3
+#define EXT4_HT_DIR 4
+#define EXT4_HT_TRUNCATE 5
+#define EXT4_HT_QUOTA 6
+#define EXT4_HT_RESIZE 7
+#define EXT4_HT_MIGRATE 8
+#define EXT4_HT_MOVE_EXTENTS 9
+#define EXT4_HT_XATTR 10
+#define EXT4_HT_MAX 11
+
/**
* struct ext4_journal_cb_entry - Base structure for callback information.
*
@@ -234,7 +258,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
#define ext4_handle_dirty_super(handle, sb) \
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int nblocks);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -268,9 +293,17 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
return 1;
}
-static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
+#define ext4_journal_start_sb(sb, type, nblocks) \
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+
+#define ext4_journal_start(inode, type, nblocks) \
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks))
+
+static inline handle_t *__ext4_journal_start(struct inode *inode,
+ unsigned int line, int type,
+ int nblocks)
{
- return ext4_journal_start_sb(inode->i_sb, nblocks);
+ return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
}
#define ext4_journal_stop(handle) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 26af22832a84..28dd8eeea6a9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -112,7 +112,7 @@ static int ext4_split_extent_at(handle_t *handle,
int flags);
static int ext4_find_delayed_extent(struct inode *inode,
- struct ext4_ext_cache *newex);
+ struct extent_status *newes);
static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode,
@@ -714,7 +714,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
- ext4_ext_invalidate_cache(inode);
return 0;
}
@@ -725,6 +724,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
struct ext4_extent_header *eh;
struct buffer_head *bh;
short int depth, i, ppos = 0, alloc = 0;
+ int ret;
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
@@ -752,12 +752,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_ext = NULL;
bh = sb_getblk(inode->i_sb, path[ppos].p_block);
- if (unlikely(!bh))
+ if (unlikely(!bh)) {
+ ret = -ENOMEM;
goto err;
+ }
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, block,
path[ppos].p_block);
- if (bh_submit_read(bh) < 0) {
+ ret = bh_submit_read(bh);
+ if (ret < 0) {
put_bh(bh);
goto err;
}
@@ -768,13 +771,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
put_bh(bh);
EXT4_ERROR_INODE(inode,
"ppos %d > depth %d", ppos, depth);
+ ret = -EIO;
goto err;
}
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
i--;
- if (ext4_ext_check_block(inode, eh, i, bh))
+ ret = ext4_ext_check_block(inode, eh, i, bh);
+ if (ret < 0)
goto err;
}
@@ -796,7 +801,7 @@ err:
ext4_ext_drop_refs(path);
if (alloc)
kfree(path);
- return ERR_PTR(-EIO);
+ return ERR_PTR(ret);
}
/*
@@ -950,8 +955,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
goto cleanup;
}
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
@@ -1023,8 +1028,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
oldblock = newblock;
newblock = ablocks[--a];
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
@@ -1136,11 +1141,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
return err;
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
- ext4_std_error(inode->i_sb, err);
- return err;
- }
+ if (unlikely(!bh))
+ return -ENOMEM;
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
@@ -1960,7 +1962,6 @@ cleanup:
ext4_ext_drop_refs(npath);
kfree(npath);
}
- ext4_ext_invalidate_cache(inode);
return err;
}
@@ -1969,8 +1970,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
struct fiemap_extent_info *fieinfo)
{
struct ext4_ext_path *path = NULL;
- struct ext4_ext_cache newex;
struct ext4_extent *ex;
+ struct extent_status es;
ext4_lblk_t next, next_del, start = 0, end = 0;
ext4_lblk_t last = block + num;
int exists, depth = 0, err = 0;
@@ -2044,37 +2045,47 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
BUG_ON(end <= start);
if (!exists) {
- newex.ec_block = start;
- newex.ec_len = end - start;
- newex.ec_start = 0;
+ es.es_lblk = start;
+ es.es_len = end - start;
+ es.es_pblk = 0;
} else {
- newex.ec_block = le32_to_cpu(ex->ee_block);
- newex.ec_len = ext4_ext_get_actual_len(ex);
- newex.ec_start = ext4_ext_pblock(ex);
+ es.es_lblk = le32_to_cpu(ex->ee_block);
+ es.es_len = ext4_ext_get_actual_len(ex);
+ es.es_pblk = ext4_ext_pblock(ex);
if (ext4_ext_is_uninitialized(ex))
flags |= FIEMAP_EXTENT_UNWRITTEN;
}
/*
- * Find delayed extent and update newex accordingly. We call
- * it even in !exists case to find out whether newex is the
+ * Find delayed extent and update es accordingly. We call
+ * it even in !exists case to find out whether es is the
* last existing extent or not.
*/
- next_del = ext4_find_delayed_extent(inode, &newex);
+ next_del = ext4_find_delayed_extent(inode, &es);
if (!exists && next_del) {
exists = 1;
flags |= FIEMAP_EXTENT_DELALLOC;
}
up_read(&EXT4_I(inode)->i_data_sem);
- if (unlikely(newex.ec_len == 0)) {
- EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
+ if (unlikely(es.es_len == 0)) {
+ EXT4_ERROR_INODE(inode, "es.es_len == 0");
err = -EIO;
break;
}
- /* This is possible iff next == next_del == EXT_MAX_BLOCKS */
- if (next == next_del) {
+ /*
+ * This is possible iff next == next_del == EXT_MAX_BLOCKS.
+ * We need to check next == EXT_MAX_BLOCKS explicitly because
+ * an extent can be both unwritten and delayed: when a range
+ * is delayed-allocated and then allocated by fallocate, the
+ * status tree tracks both states in a single extent.
+ *
+ * So we could return an unwritten and delayed extent whose
+ * block is equal to 'next'.
+ */
+ if (next == next_del && next == EXT_MAX_BLOCKS) {
flags |= FIEMAP_EXTENT_LAST;
if (unlikely(next_del != EXT_MAX_BLOCKS ||
next != EXT_MAX_BLOCKS)) {
@@ -2089,9 +2100,9 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
if (exists) {
err = fiemap_fill_next_extent(fieinfo,
- (__u64)newex.ec_block << blksize_bits,
- (__u64)newex.ec_start << blksize_bits,
- (__u64)newex.ec_len << blksize_bits,
+ (__u64)es.es_lblk << blksize_bits,
+ (__u64)es.es_pblk << blksize_bits,
+ (__u64)es.es_len << blksize_bits,
flags);
if (err < 0)
break;
@@ -2101,7 +2112,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
}
}
- block = newex.ec_block + newex.ec_len;
+ block = es.es_lblk + es.es_len;
}
if (path) {
@@ -2112,21 +2123,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
return err;
}
-static void
-ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
- __u32 len, ext4_fsblk_t start)
-{
- struct ext4_ext_cache *cex;
- BUG_ON(len == 0);
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- trace_ext4_ext_put_in_cache(inode, block, len, start);
- cex = &EXT4_I(inode)->i_cached_extent;
- cex->ec_block = block;
- cex->ec_len = len;
- cex->ec_start = start;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-}
-
/*
* ext4_ext_put_gap_in_cache:
* calculate boundaries of the gap that the requested block fits into
@@ -2143,9 +2139,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
ex = path[depth].p_ext;
if (ex == NULL) {
- /* there is no extent yet, so gap is [0;-] */
- lblock = 0;
- len = EXT_MAX_BLOCKS;
+ /*
+ * there is no extent yet, so gap is [0;-] and we
+ * don't cache it
+ */
ext_debug("cache gap(whole file):");
} else if (block < le32_to_cpu(ex->ee_block)) {
lblock = block;
@@ -2154,6 +2151,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block,
le32_to_cpu(ex->ee_block),
ext4_ext_get_actual_len(ex));
+ if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
+ ext4_es_insert_extent(inode, lblock, len, ~0,
+ EXTENT_STATUS_HOLE);
} else if (block >= le32_to_cpu(ex->ee_block)
+ ext4_ext_get_actual_len(ex)) {
ext4_lblk_t next;
@@ -2167,58 +2167,15 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block);
BUG_ON(next == lblock);
len = next - lblock;
+ if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
+ ext4_es_insert_extent(inode, lblock, len, ~0,
+ EXTENT_STATUS_HOLE);
} else {
lblock = len = 0;
BUG();
}
ext_debug(" -> %u:%lu\n", lblock, len);
- ext4_ext_put_in_cache(inode, lblock, len, 0);
-}
-
-/*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * cache extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex: Pointer where the cached extent will be stored
- * if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_extent *ex)
-{
- struct ext4_ext_cache *cex;
- int ret = 0;
-
- /*
- * We borrow i_block_reservation_lock to protect i_cached_extent
- */
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- cex = &EXT4_I(inode)->i_cached_extent;
-
- /* has cache valid data? */
- if (cex->ec_len == 0)
- goto errout;
-
- if (in_range(block, cex->ec_block, cex->ec_len)) {
- ex->ee_block = cpu_to_le32(cex->ec_block);
- ext4_ext_store_pblock(ex, cex->ec_start);
- ex->ee_len = cpu_to_le16(cex->ec_len);
- ext_debug("%u cached by %u:%u:%llu\n",
- block,
- cex->ec_block, cex->ec_len, cex->ec_start);
- ret = 1;
- }
-errout:
- trace_ext4_ext_in_cache(inode, block, ret);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return ret;
}
/*
@@ -2226,13 +2183,14 @@ errout:
* removes index from the index block.
*/
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path, int depth)
{
int err;
ext4_fsblk_t leaf;
/* free index block */
- path--;
+ depth--;
+ path = path + depth;
leaf = ext4_idx_pblock(path->p_idx);
if (unlikely(path->p_hdr->eh_entries == 0)) {
EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
@@ -2257,6 +2215,19 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+
+ while (--depth >= 0) {
+ if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
+ break;
+ path--;
+ err = ext4_ext_get_access(handle, inode, path);
+ if (err)
+ break;
+ path->p_idx->ei_block = (path+1)->p_idx->ei_block;
+ err = ext4_ext_dirty(handle, inode, path);
+ if (err)
+ break;
+ }
return err;
}
@@ -2599,7 +2570,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
/* if this leaf is free, then we should
* remove it from index block above */
if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
- err = ext4_ext_rm_idx(handle, inode, path + depth);
+ err = ext4_ext_rm_idx(handle, inode, path, depth);
out:
return err;
@@ -2639,13 +2610,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start(inode, depth + 1);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
again:
- ext4_ext_invalidate_cache(inode);
-
trace_ext4_ext_remove_space(inode, start, depth);
/*
@@ -2802,7 +2771,7 @@ again:
/* index is empty, remove it;
* handle must be already prepared by the
* truncatei_leaf() */
- err = ext4_ext_rm_idx(handle, inode, path + i);
+ err = ext4_ext_rm_idx(handle, inode, path, i);
}
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
@@ -3505,19 +3474,19 @@ out:
*
* Return 1 if there is a delalloc block in the range, otherwise 0.
*/
-static int ext4_find_delalloc_range(struct inode *inode,
- ext4_lblk_t lblk_start,
- ext4_lblk_t lblk_end)
+int ext4_find_delalloc_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end)
{
struct extent_status es;
- es.start = lblk_start;
- ext4_es_find_extent(inode, &es);
- if (es.len == 0)
+ ext4_es_find_delayed_extent(inode, lblk_start, &es);
+ if (es.es_len == 0)
return 0; /* there is no delay extent in this tree */
- else if (es.start <= lblk_start && lblk_start < es.start + es.len)
+ else if (es.es_lblk <= lblk_start &&
+ lblk_start < es.es_lblk + es.es_len)
return 1;
- else if (lblk_start <= es.start && es.start <= lblk_end)
+ else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
return 1;
else
return 0;
@@ -3642,6 +3611,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ext4_set_io_unwritten_flag(inode, io);
else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
if (ext4_should_dioread_nolock(inode))
map->m_flags |= EXT4_MAP_UNINIT;
goto out;
@@ -3663,8 +3633,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
* repeat fallocate creation request
* we already have an unwritten extent
*/
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
goto map_out;
+ }
/* buffered READ or buffered write_begin() lookup */
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3884,35 +3856,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
map->m_lblk, map->m_len, inode->i_ino);
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
- /* check in cache */
- if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
- if (!newex.ee_start_lo && !newex.ee_start_hi) {
- if ((sbi->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk))
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
-
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- /*
- * block isn't allocated yet and
- * user doesn't want to allocate it
- */
- goto out2;
- }
- /* we should allocate requested block */
- } else {
- /* block is already allocated */
- if (sbi->s_cluster_ratio > 1)
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
- newblock = map->m_lblk
- - le32_to_cpu(newex.ee_block)
- + ext4_ext_pblock(&newex);
- /* number of remaining blocks in the extent */
- allocated = ext4_ext_get_actual_len(&newex) -
- (map->m_lblk - le32_to_cpu(newex.ee_block));
- goto out;
- }
- }
-
/* find extent for this block */
path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
if (IS_ERR(path)) {
@@ -3959,15 +3902,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock);
- /*
- * Do not put uninitialized extent
- * in the cache
- */
- if (!ext4_ext_is_uninitialized(ex)) {
- ext4_ext_put_in_cache(inode, ee_block,
- ee_len, ee_start);
+ if (!ext4_ext_is_uninitialized(ex))
goto out;
- }
+
allocated = ext4_ext_handle_uninitialized_extents(
handle, inode, map, path, flags,
allocated, newblock);
@@ -3988,7 +3925,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* put just found gap into cache to speed up
* subsequent requests
*/
- ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+ if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
+ ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
goto out2;
}
@@ -4094,6 +4032,7 @@ got_allocated_blocks:
/* Mark uninitialized */
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
/*
* io_end structure was created for every IO write to an
* uninitialized extent. To avoid unnecessary conversion,
@@ -4227,10 +4166,9 @@ got_allocated_blocks:
* Cache the extent and update transaction to commit on fdatasync only
* when it is _not_ an uninitialized extent.
*/
- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
- ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
+ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
- } else
+ else
ext4_update_inode_fsync_trans(handle, inode, 0);
out:
if (allocated > map->m_len)
@@ -4270,7 +4208,7 @@ void ext4_ext_truncate(struct inode *inode)
* probably first extent we're gonna free will be last in block
*/
err = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, err);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err);
if (IS_ERR(handle))
return;
@@ -4289,7 +4227,6 @@ void ext4_ext_truncate(struct inode *inode)
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
@@ -4372,7 +4309,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
*/
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
handle_t *handle;
loff_t new_size;
unsigned int max_blocks;
@@ -4383,13 +4320,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
- /*
- * currently supporting (pre)allocate mode for extent-based
- * files _only_
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return -EOPNOTSUPP;
-
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
@@ -4401,6 +4331,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
return ret;
+ /*
+ * currently supporting (pre)allocate mode for extent-based
+ * files _only_
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
+
trace_ext4_fallocate_enter(inode, offset, len, mode);
map.m_lblk = offset >> blkbits;
/*
@@ -4437,7 +4374,8 @@ retry:
while (ret >= 0 && ret < max_blocks) {
map.m_lblk = map.m_lblk + ret;
map.m_len = max_blocks = max_blocks - ret;
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
@@ -4445,11 +4383,11 @@ retry:
ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
- WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_map_blocks "
- "returned error inode#%lu, block=%u, "
- "max_blocks=%u", __func__,
- inode->i_ino, map.m_lblk, max_blocks);
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
#endif
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
@@ -4515,21 +4453,19 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
while (ret >= 0 && ret < max_blocks) {
map.m_lblk += ret;
map.m_len = (max_blocks -= ret);
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
}
ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
- if (ret <= 0) {
- WARN_ON(ret <= 0);
- ext4_msg(inode->i_sb, KERN_ERR,
- "%s:%d: inode #%lu: block %u: len %u: "
- "ext4_ext_map_blocks returned %d",
- __func__, __LINE__, inode->i_ino, map.m_lblk,
- map.m_len, ret);
- }
+ if (ret <= 0)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
if (ret <= 0 || ret2 )
@@ -4539,42 +4475,48 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
}
/*
- * If newex is not existing extent (newex->ec_start equals zero) find
- * delayed extent at start of newex and update newex accordingly and
+ * If newes is not an existing extent (newes->es_pblk equals zero) find
+ * the delayed extent at the start of newes, update newes accordingly and
* return start of the next delayed extent.
*
- * If newex is existing extent (newex->ec_start is not equal zero)
+ * If newes is an existing extent (newes->es_pblk is not equal to zero)
* return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
- * extent found. Leave newex unmodified.
+ * extent found. Leave newes unmodified.
*/
static int ext4_find_delayed_extent(struct inode *inode,
- struct ext4_ext_cache *newex)
+ struct extent_status *newes)
{
struct extent_status es;
- ext4_lblk_t next_del;
+ ext4_lblk_t block, next_del;
- es.start = newex->ec_block;
- next_del = ext4_es_find_extent(inode, &es);
+ ext4_es_find_delayed_extent(inode, newes->es_lblk, &es);
- if (newex->ec_start == 0) {
+ if (newes->es_pblk == 0) {
/*
- * No extent in extent-tree contains block @newex->ec_start,
+ * No extent in extent-tree contains block @newes->es_lblk,
* then the block may stay in 1)a hole or 2)delayed-extent.
*/
- if (es.len == 0)
+ if (es.es_len == 0)
/* A hole found. */
return 0;
- if (es.start > newex->ec_block) {
+ if (es.es_lblk > newes->es_lblk) {
/* A hole found. */
- newex->ec_len = min(es.start - newex->ec_block,
- newex->ec_len);
+ newes->es_len = min(es.es_lblk - newes->es_lblk,
+ newes->es_len);
return 0;
}
- newex->ec_len = es.start + es.len - newex->ec_block;
+ newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
}
+ block = newes->es_lblk + newes->es_len;
+ ext4_es_find_delayed_extent(inode, block, &es);
+ if (es.es_len == 0)
+ next_del = EXT_MAX_BLOCKS;
+ else
+ next_del = es.es_lblk;
+
return next_del;
}
/* fiemap flags we can handle specified here */
@@ -4629,7 +4571,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
*/
int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
@@ -4695,7 +4637,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
inode_dio_wait(inode);
credits = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto out_dio;
@@ -4772,14 +4714,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
goto out;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
err = ext4_es_remove_extent(inode, first_block,
stop_block - first_block);
err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
- ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
if (IS_SYNC(inode))
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 564d981a2fcc..95796a1b7522 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -23,40 +23,53 @@
* (e.g. Reservation space warning), and provide extent-level locking.
* Delay extent tree is the first step to achieve this goal. It is
* original built by Yongqiang Yang. At that time it is called delay
- * extent tree, whose goal is only track delay extent in memory to
+ * extent tree, whose only goal is to track delayed extents in memory to
* simplify the implementation of fiemap and bigalloc, and introduce
* lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called
- * delay extent tree at the following comment. But for better
- * understand what it does, it has been rename to extent status tree.
+ * delay extent tree at the first commit. But to better describe
+ * what it does, it has been renamed to extent status tree.
*
- * Currently the first step has been done. All delay extents are
- * tracked in the tree. It maintains the delay extent when a delay
- * allocation is issued, and the delay extent is written out or
+ * Step1:
+ * Currently the first step has been done. All delayed extents are
+ * tracked in the tree. It maintains the delayed extent when a delayed
+ * allocation is issued, and the delayed extent is written out or
* invalidated. Therefore the implementation of fiemap and bigalloc
* are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
*
* The following comment describes the implemenmtation of extent
* status tree and future works.
+ *
+ * Step2:
+ * In this step all extent status are tracked by extent status tree.
+ * Thus, we can first try to lookup a block mapping in this tree before
+ * finding it in extent tree. Hence, single extent cache can be removed
+ * because extent status tree can do a better job. Extents in status
+ * tree are loaded on-demand. Therefore, the extent status tree may not
+ * contain all of the extents in a file. Meanwhile we define a shrinker
+ * to reclaim memory from extent status tree because fragmented extent
+ * tree will make status tree cost too much memory. Written/unwritten/
+ * hole extents in the tree will be reclaimed by this shrinker when we
+ * are under high memory pressure. Delayed extents will not be
+ * reclaimed because fiemap, bigalloc, and seek_data/hole need them.
*/
/*
- * extents status tree implementation for ext4.
+ * Extent status tree implementation for ext4.
*
*
* ==========================================================================
- * Extents status encompass delayed extents and extent locks
+ * Extent status tree tracks all extent status.
*
- * 1. Why delayed extent implementation ?
+ * 1. Why do we need to implement extent status tree?
*
- * Without delayed extent, ext4 identifies a delayed extent by looking
+ * Without extent status tree, ext4 identifies a delayed extent by looking
* up page cache, this has several deficiencies - complicated, buggy,
* and inefficient code.
*
- * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
- * to know if a block or a range of blocks are belonged to a delayed
- * extent.
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
+ * block or a range of blocks belongs to a delayed extent.
*
- * Let us have a look at how they do without delayed extents implementation.
+ * Let us have a look at how they work without extent status tree.
* -- FIEMAP
* FIEMAP looks up page cache to identify delayed allocations from holes.
*
@@ -68,47 +81,48 @@
* already under delayed allocation or not to determine whether
* quota reserving is needed for the cluster.
*
- * -- punch hole
- * punch hole looks up page cache to identify a delayed extent.
- *
* -- writeout
* Writeout looks up whole page cache to see if a buffer is
* mapped, If there are not very many delayed buffers, then it is
* time comsuming.
*
- * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
+ * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
* bigalloc and writeout can figure out if a block or a range of
* blocks is under delayed allocation(belonged to a delayed extent) or
- * not by searching the delayed extent tree.
+ * not by searching the extent status tree.
*
*
* ==========================================================================
- * 2. ext4 delayed extents impelmentation
+ * 2. Ext4 extent status tree implementation
+ *
+ * -- extent
+ * An extent is a range of blocks which are contiguous logically and
+ * physically. Unlike an extent in the extent tree, this extent in ext4
+ * is an in-memory struct; there is no corresponding on-disk data. There
+ * is no limit on the length of an extent, so an extent can contain as
+ * many blocks as are contiguous logically and physically.
*
- * -- delayed extent
- * A delayed extent is a range of blocks which are contiguous
- * logically and under delayed allocation. Unlike extent in
- * ext4, delayed extent in ext4 is a in-memory struct, there is
- * no corresponding on-disk data. There is no limit on length of
- * delayed extent, so a delayed extent can contain as many blocks
- * as they are contiguous logically.
+ * -- extent status tree
+ * Every inode has an extent status tree and all allocation blocks
+ * are added to the tree with different status. The extents in the
+ * tree are ordered by logical block no.
*
- * -- delayed extent tree
- * Every inode has a delayed extent tree and all under delayed
- * allocation blocks are added to the tree as delayed extents.
- * Delayed extents in the tree are ordered by logical block no.
+ * -- operations on an extent status tree
+ * There are three important operations on an extent status tree: find
+ * next extent, adding an extent (a range of blocks) and removing an extent.
*
- * -- operations on a delayed extent tree
- * There are three operations on a delayed extent tree: find next
- * delayed extent, adding a space(a range of blocks) and removing
- * a space.
+ * -- race on an extent status tree
+ * Extent status tree is protected by inode->i_es_lock.
*
- * -- race on a delayed extent tree
- * Delayed extent tree is protected inode->i_es_lock.
+ * -- memory consumption
+ * Fragmented extent tree will make extent status tree cost too much
+ * memory. Hence, we will reclaim written/unwritten/hole extents from
+ * the tree under heavy memory pressure.
*
*
* ==========================================================================
- * 3. performance analysis
+ * 3. Performance analysis
+ *
* -- overhead
* 1. There is a cache extent for write access, so if writes are
* not very random, adding space operaions are in O(1) time.
@@ -120,18 +134,25 @@
*
* ==========================================================================
* 4. TODO list
- * -- Track all extent status
*
- * -- Improve get block process
+ * -- Refactor delayed space reservation
*
* -- Extent-level locking
*/
static struct kmem_cache *ext4_es_cachep;
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end);
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan);
+
int __init ext4_init_es(void)
{
- ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
+ ext4_es_cachep = kmem_cache_create("ext4_extent_status",
+ sizeof(struct extent_status),
+ 0, (SLAB_RECLAIM_ACCOUNT), NULL);
if (ext4_es_cachep == NULL)
return -ENOMEM;
return 0;
@@ -161,7 +182,9 @@ static void ext4_es_print_tree(struct inode *inode)
while (node) {
struct extent_status *es;
es = rb_entry(node, struct extent_status, rb_node);
- printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
+ printk(KERN_DEBUG " [%u/%u) %llu %llx",
+ es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
node = rb_next(node);
}
printk(KERN_DEBUG "\n");
@@ -170,10 +193,10 @@ static void ext4_es_print_tree(struct inode *inode)
#define ext4_es_print_tree(inode)
#endif
-static inline ext4_lblk_t extent_status_end(struct extent_status *es)
+static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
- BUG_ON(es->start + es->len < es->start);
- return es->start + es->len - 1;
+ BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
+ return es->es_lblk + es->es_len - 1;
}
/*
@@ -181,25 +204,25 @@ static inline ext4_lblk_t extent_status_end(struct extent_status *es)
* it can't be found, try to find next extent.
*/
static struct extent_status *__es_tree_search(struct rb_root *root,
- ext4_lblk_t offset)
+ ext4_lblk_t lblk)
{
struct rb_node *node = root->rb_node;
struct extent_status *es = NULL;
while (node) {
es = rb_entry(node, struct extent_status, rb_node);
- if (offset < es->start)
+ if (lblk < es->es_lblk)
node = node->rb_left;
- else if (offset > extent_status_end(es))
+ else if (lblk > ext4_es_end(es))
node = node->rb_right;
else
return es;
}
- if (es && offset < es->start)
+ if (es && lblk < es->es_lblk)
return es;
- if (es && offset > extent_status_end(es)) {
+ if (es && lblk > ext4_es_end(es)) {
node = rb_next(&es->rb_node);
return node ? rb_entry(node, struct extent_status, rb_node) :
NULL;
@@ -209,79 +232,124 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
}
/*
- * ext4_es_find_extent: find the 1st delayed extent covering @es->start
- * if it exists, otherwise, the next extent after @es->start.
+ * ext4_es_find_delayed_extent: find the 1st delayed extent covering @lblk
+ * if it exists, otherwise, the next delayed extent after @lblk.
*
* @inode: the inode which owns delayed extents
+ * @lblk: the offset where we start to search
* @es: delayed extent that we found
- *
- * Returns the first block of the next extent after es, otherwise
- * EXT_MAX_BLOCKS if no delay extent is found.
- * Delayed extent is returned via @es.
*/
-ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
+void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es)
{
struct ext4_es_tree *tree = NULL;
struct extent_status *es1 = NULL;
struct rb_node *node;
- ext4_lblk_t ret = EXT_MAX_BLOCKS;
- trace_ext4_es_find_extent_enter(inode, es->start);
+ BUG_ON(es == NULL);
+ trace_ext4_es_find_delayed_extent_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
- /* find delay extent in cache firstly */
+ /* find extent in cache firstly */
+ es->es_lblk = es->es_len = es->es_pblk = 0;
if (tree->cache_es) {
es1 = tree->cache_es;
- if (in_range(es->start, es1->start, es1->len)) {
- es_debug("%u cached by [%u/%u)\n",
- es->start, es1->start, es1->len);
+ if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u) %llu %llx\n",
+ lblk, es1->es_lblk, es1->es_len,
+ ext4_es_pblock(es1), ext4_es_status(es1));
goto out;
}
}
- es->len = 0;
- es1 = __es_tree_search(&tree->root, es->start);
+ es1 = __es_tree_search(&tree->root, lblk);
out:
- if (es1) {
- tree->cache_es = es1;
- es->start = es1->start;
- es->len = es1->len;
- node = rb_next(&es1->rb_node);
- if (node) {
+ if (es1 && !ext4_es_is_delayed(es1)) {
+ while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node);
- ret = es1->start;
+ if (ext4_es_is_delayed(es1))
+ break;
}
}
+ if (es1 && ext4_es_is_delayed(es1)) {
+ tree->cache_es = es1;
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
+
read_unlock(&EXT4_I(inode)->i_es_lock);
- trace_ext4_es_find_extent_exit(inode, es, ret);
- return ret;
+ ext4_es_lru_add(inode);
+ trace_ext4_es_find_delayed_extent_exit(inode, es);
}
static struct extent_status *
-ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ ext4_fsblk_t pblk)
{
struct extent_status *es;
es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
if (es == NULL)
return NULL;
- es->start = start;
- es->len = len;
+ es->es_lblk = lblk;
+ es->es_len = len;
+ es->es_pblk = pblk;
+
+ /*
+ * We don't count delayed extents because we never try to reclaim them
+ */
+ if (!ext4_es_is_delayed(es)) {
+ EXT4_I(inode)->i_es_lru_nr++;
+ percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ }
+
return es;
}
-static void ext4_es_free_extent(struct extent_status *es)
+static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
+ /* Decrease the lru counter when this es is not delayed */
+ if (!ext4_es_is_delayed(es)) {
+ BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+ EXT4_I(inode)->i_es_lru_nr--;
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ }
+
kmem_cache_free(ext4_es_cachep, es);
}
+/*
+ * Check whether or not two extents can be merged
+ * Condition:
+ * - logical block number is contiguous
+ * - physical block number is contiguous (written/unwritten extents only)
+ * - status is equal
+ */
+static int ext4_es_can_be_merged(struct extent_status *es1,
+ struct extent_status *es2)
+{
+ if (es1->es_lblk + es1->es_len != es2->es_lblk)
+ return 0;
+
+ if (ext4_es_status(es1) != ext4_es_status(es2))
+ return 0;
+
+ if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
+ (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2)))
+ return 0;
+
+ return 1;
+}
+
static struct extent_status *
-ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct extent_status *es1;
struct rb_node *node;
@@ -290,10 +358,10 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
return es;
es1 = rb_entry(node, struct extent_status, rb_node);
- if (es->start == extent_status_end(es1) + 1) {
- es1->len += es->len;
+ if (ext4_es_can_be_merged(es1, es)) {
+ es1->es_len += es->es_len;
rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(es);
+ ext4_es_free_extent(inode, es);
es = es1;
}
@@ -301,8 +369,9 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
}
static struct extent_status *
-ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct extent_status *es1;
struct rb_node *node;
@@ -311,69 +380,57 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
return es;
es1 = rb_entry(node, struct extent_status, rb_node);
- if (es1->start == extent_status_end(es) + 1) {
- es->len += es1->len;
+ if (ext4_es_can_be_merged(es, es1)) {
+ es->es_len += es1->es_len;
rb_erase(node, &tree->root);
- ext4_es_free_extent(es1);
+ ext4_es_free_extent(inode, es1);
}
return es;
}
-static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
- ext4_lblk_t len)
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node **p = &tree->root.rb_node;
struct rb_node *parent = NULL;
struct extent_status *es;
- ext4_lblk_t end = offset + len - 1;
-
- BUG_ON(end < offset);
- es = tree->cache_es;
- if (es && offset == (extent_status_end(es) + 1)) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- es->len += len;
- es = ext4_es_try_to_merge_right(tree, es);
- goto out;
- } else if (es && es->start == end + 1) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- es->start = offset;
- es->len += len;
- es = ext4_es_try_to_merge_left(tree, es);
- goto out;
- } else if (es && es->start <= offset &&
- end <= extent_status_end(es)) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- goto out;
- }
while (*p) {
parent = *p;
es = rb_entry(parent, struct extent_status, rb_node);
- if (offset < es->start) {
- if (es->start == end + 1) {
- es->start = offset;
- es->len += len;
- es = ext4_es_try_to_merge_left(tree, es);
+ if (newes->es_lblk < es->es_lblk) {
+ if (ext4_es_can_be_merged(newes, es)) {
+ /*
+ * Here we can modify es_lblk directly
+ * because it isn't overlapped.
+ */
+ es->es_lblk = newes->es_lblk;
+ es->es_len += newes->es_len;
+ if (ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es))
+ ext4_es_store_pblock(es,
+ newes->es_pblk);
+ es = ext4_es_try_to_merge_left(inode, es);
goto out;
}
p = &(*p)->rb_left;
- } else if (offset > extent_status_end(es)) {
- if (offset == extent_status_end(es) + 1) {
- es->len += len;
- es = ext4_es_try_to_merge_right(tree, es);
+ } else if (newes->es_lblk > ext4_es_end(es)) {
+ if (ext4_es_can_be_merged(es, newes)) {
+ es->es_len += newes->es_len;
+ es = ext4_es_try_to_merge_right(inode, es);
goto out;
}
p = &(*p)->rb_right;
} else {
- if (extent_status_end(es) <= end)
- es->len = offset - es->start + len;
- goto out;
+ BUG_ON(1);
+ return -EINVAL;
}
}
- es = ext4_es_alloc_extent(offset, len);
+ es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
+ newes->es_pblk);
if (!es)
return -ENOMEM;
rb_link_node(&es->rb_node, parent, p);
@@ -385,85 +442,166 @@ out:
}
/*
- * ext4_es_insert_extent() adds a space to a delayed extent tree.
- * Caller holds inode->i_es_lock.
+ * ext4_es_insert_extent() adds a space to an extent status tree.
*
* ext4_es_insert_extent is called by ext4_da_write_begin and
* ext4_es_remove_extent.
*
* Return 0 on success, error code on failure.
*/
-int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
- ext4_lblk_t len)
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned long long status)
{
- struct ext4_es_tree *tree;
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
int err = 0;
- trace_ext4_es_insert_extent(inode, offset, len);
- es_debug("add [%u/%u) to extent status tree of inode %lu\n",
- offset, len, inode->i_ino);
+ es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, inode->i_ino);
+
+ if (!len)
+ return 0;
+
+ BUG_ON(end < lblk);
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ ext4_es_store_pblock(&newes, pblk);
+ ext4_es_store_status(&newes, status);
+ trace_ext4_es_insert_extent(inode, &newes);
write_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- err = __es_insert_extent(tree, offset, len);
+ err = __es_remove_extent(inode, lblk, end);
+ if (err != 0)
+ goto error;
+ err = __es_insert_extent(inode, &newes);
+
+error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ ext4_es_lru_add(inode);
ext4_es_print_tree(inode);
return err;
}
/*
- * ext4_es_remove_extent() removes a space from a delayed extent tree.
- * Caller holds inode->i_es_lock.
+ * ext4_es_lookup_extent() looks up an extent in extent status tree.
*
- * Return 0 on success, error code on failure.
+ * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
+ *
+ * Return: 1 if found, 0 if not
*/
-int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
- ext4_lblk_t len)
+int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es)
{
- struct rb_node *node;
struct ext4_es_tree *tree;
+ struct extent_status *es1 = NULL;
+ struct rb_node *node;
+ int found = 0;
+
+ trace_ext4_es_lookup_extent_enter(inode, lblk);
+ es_debug("lookup extent in block %u\n", lblk);
+
+ tree = &EXT4_I(inode)->i_es_tree;
+ read_lock(&EXT4_I(inode)->i_es_lock);
+
+ /* find extent in cache firstly */
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+ if (tree->cache_es) {
+ es1 = tree->cache_es;
+ if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ lblk, es1->es_lblk, es1->es_len);
+ found = 1;
+ goto out;
+ }
+ }
+
+ node = tree->root.rb_node;
+ while (node) {
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (lblk < es1->es_lblk)
+ node = node->rb_left;
+ else if (lblk > ext4_es_end(es1))
+ node = node->rb_right;
+ else {
+ found = 1;
+ break;
+ }
+ }
+
+out:
+ if (found) {
+ BUG_ON(!es1);
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_es_lru_add(inode);
+ trace_ext4_es_lookup_extent_exit(inode, es, found);
+ return found;
+}
+
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct rb_node *node;
struct extent_status *es;
struct extent_status orig_es;
- ext4_lblk_t len1, len2, end;
+ ext4_lblk_t len1, len2;
+ ext4_fsblk_t block;
int err = 0;
- trace_ext4_es_remove_extent(inode, offset, len);
- es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
- offset, len, inode->i_ino);
-
- end = offset + len - 1;
- BUG_ON(end < offset);
- write_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- es = __es_tree_search(&tree->root, offset);
+ es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
- if (es->start > end)
+ if (es->es_lblk > end)
goto out;
/* Simply invalidate cache_es. */
tree->cache_es = NULL;
- orig_es.start = es->start;
- orig_es.len = es->len;
- len1 = offset > es->start ? offset - es->start : 0;
- len2 = extent_status_end(es) > end ?
- extent_status_end(es) - end : 0;
+ orig_es.es_lblk = es->es_lblk;
+ orig_es.es_len = es->es_len;
+ orig_es.es_pblk = es->es_pblk;
+
+ len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+ len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
if (len1 > 0)
- es->len = len1;
+ es->es_len = len1;
if (len2 > 0) {
if (len1 > 0) {
- err = __es_insert_extent(tree, end + 1, len2);
+ struct extent_status newes;
+
+ newes.es_lblk = end + 1;
+ newes.es_len = len2;
+ if (ext4_es_is_written(&orig_es) ||
+ ext4_es_is_unwritten(&orig_es)) {
+ block = ext4_es_pblock(&orig_es) +
+ orig_es.es_len - len2;
+ ext4_es_store_pblock(&newes, block);
+ }
+ ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+ err = __es_insert_extent(inode, &newes);
if (err) {
- es->start = orig_es.start;
- es->len = orig_es.len;
+ es->es_lblk = orig_es.es_lblk;
+ es->es_len = orig_es.es_len;
goto out;
}
} else {
- es->start = end + 1;
- es->len = len2;
+ es->es_lblk = end + 1;
+ es->es_len = len2;
+ if (ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es)) {
+ block = orig_es.es_pblk + orig_es.es_len - len2;
+ ext4_es_store_pblock(es, block);
+ }
}
goto out;
}
@@ -476,10 +614,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
es = NULL;
}
- while (es && extent_status_end(es) <= end) {
+ while (es && ext4_es_end(es) <= end) {
node = rb_next(&es->rb_node);
rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(es);
+ ext4_es_free_extent(inode, es);
if (!node) {
es = NULL;
break;
@@ -487,14 +625,166 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
es = rb_entry(node, struct extent_status, rb_node);
}
- if (es && es->start < end + 1) {
- len1 = extent_status_end(es) - end;
- es->start = end + 1;
- es->len = len1;
+ if (es && es->es_lblk < end + 1) {
+ ext4_lblk_t orig_len = es->es_len;
+
+ len1 = ext4_es_end(es) - end;
+ es->es_lblk = end + 1;
+ es->es_len = len1;
+ if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
+ block = es->es_pblk + orig_len - len1;
+ ext4_es_store_pblock(es, block);
+ }
}
out:
+ return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from an extent status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ ext4_lblk_t end;
+ int err = 0;
+
+ trace_ext4_es_remove_extent(inode, lblk, len);
+ es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+
+ if (!len)
+ return err;
+
+ end = lblk + len - 1;
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ err = __es_remove_extent(inode, lblk, end);
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_print_tree(inode);
return err;
}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct ext4_sb_info *sbi = container_of(shrink,
+ struct ext4_sb_info, s_es_shrinker);
+ struct ext4_inode_info *ei;
+ struct list_head *cur, *tmp, scanned;
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk = 0;
+
+ ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+ if (!nr_to_scan)
+ return ret;
+
+ INIT_LIST_HEAD(&scanned);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+ list_move_tail(cur, &scanned);
+
+ ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+
+ read_lock(&ei->i_es_lock);
+ if (ei->i_es_lru_nr == 0) {
+ read_unlock(&ei->i_es_lock);
+ continue;
+ }
+ read_unlock(&ei->i_es_lock);
+
+ write_lock(&ei->i_es_lock);
+ ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ write_unlock(&ei->i_es_lock);
+
+ nr_shrunk += ret;
+ nr_to_scan -= ret;
+ if (nr_to_scan == 0)
+ break;
+ }
+ list_splice_tail(&scanned, &sbi->s_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+
+ ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+ return ret;
+}
+
+void ext4_es_register_shrinker(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+ INIT_LIST_HEAD(&sbi->s_es_lru);
+ spin_lock_init(&sbi->s_es_lru_lock);
+ sbi->s_es_shrinker.shrink = ext4_es_shrink;
+ sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_unregister_shrinker(struct super_block *sb)
+{
+ unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+}
+
+void ext4_es_lru_add(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (list_empty(&ei->i_es_lru))
+ list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
+ else
+ list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+void ext4_es_lru_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (!list_empty(&ei->i_es_lru))
+ list_del_init(&ei->i_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan)
+{
+ struct inode *inode = &ei->vfs_inode;
+ struct ext4_es_tree *tree = &ei->i_es_tree;
+ struct rb_node *node;
+ struct extent_status *es;
+ int nr_shrunk = 0;
+
+ if (ei->i_es_lru_nr == 0)
+ return 0;
+
+ node = rb_first(&tree->root);
+ while (node != NULL) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ node = rb_next(&es->rb_node);
+ /*
+ * We can't reclaim delayed extent from status tree because
+ * fiemap, bigalloc, and seek_data/hole need to use it.
+ */
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ nr_shrunk++;
+ if (--nr_to_scan == 0)
+ break;
+ }
+ }
+ tree->cache_es = NULL;
+ return nr_shrunk;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 077f82db092a..f190dfe969da 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -20,10 +20,24 @@
#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
+/*
+ * These flags live in the high bits of extent_status.es_pblk
+ */
+#define EXTENT_STATUS_WRITTEN (1ULL << 63)
+#define EXTENT_STATUS_UNWRITTEN (1ULL << 62)
+#define EXTENT_STATUS_DELAYED (1ULL << 61)
+#define EXTENT_STATUS_HOLE (1ULL << 60)
+
+#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
+ EXTENT_STATUS_UNWRITTEN | \
+ EXTENT_STATUS_DELAYED | \
+ EXTENT_STATUS_HOLE)
+
struct extent_status {
struct rb_node rb_node;
- ext4_lblk_t start; /* first block extent covers */
- ext4_lblk_t len; /* length of extent in block */
+ ext4_lblk_t es_lblk; /* first logical block extent covers */
+ ext4_lblk_t es_len; /* length of extent in block */
+ ext4_fsblk_t es_pblk; /* first physical block */
};
struct ext4_es_tree {
@@ -35,11 +49,69 @@ extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
-extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start,
+extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned long long status);
+extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
-extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t len);
-extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
- struct extent_status *es);
+extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es);
+extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es);
+
+static inline int ext4_es_is_written(struct extent_status *es)
+{
+ return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0;
+}
+
+static inline int ext4_es_is_unwritten(struct extent_status *es)
+{
+ return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0;
+}
+
+static inline int ext4_es_is_delayed(struct extent_status *es)
+{
+ return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0;
+}
+
+static inline int ext4_es_is_hole(struct extent_status *es)
+{
+ return (es->es_pblk & EXTENT_STATUS_HOLE) != 0;
+}
+
+static inline ext4_fsblk_t ext4_es_status(struct extent_status *es)
+{
+ return (es->es_pblk & EXTENT_STATUS_FLAGS);
+}
+
+static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
+{
+ return (es->es_pblk & ~EXTENT_STATUS_FLAGS);
+}
+
+static inline void ext4_es_store_pblock(struct extent_status *es,
+ ext4_fsblk_t pb)
+{
+ ext4_fsblk_t block;
+
+ block = (pb & ~EXTENT_STATUS_FLAGS) |
+ (es->es_pblk & EXTENT_STATUS_FLAGS);
+ es->es_pblk = block;
+}
+
+static inline void ext4_es_store_status(struct extent_status *es,
+ unsigned long long status)
+{
+ ext4_fsblk_t block;
+
+ block = (status & EXTENT_STATUS_FLAGS) |
+ (es->es_pblk & ~EXTENT_STATUS_FLAGS);
+ es->es_pblk = block;
+}
+
+extern void ext4_es_register_shrinker(struct super_block *sb);
+extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_lru_add(struct inode *inode);
+extern void ext4_es_lru_del(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d07c27ca594a..64848b595b24 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -108,14 +108,6 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
/* Unaligned direct AIO must be serialized; see comment above */
if (unaligned_aio) {
- static unsigned long unaligned_warn_time;
-
- /* Warn about this once per day */
- if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
- ext4_msg(inode->i_sb, KERN_WARNING,
- "Unaligned AIO/DIO on inode %ld by %s; "
- "performance will be poor.",
- inode->i_ino, current->comm);
mutex_lock(ext4_aio_mutex(inode));
ext4_unwritten_wait(inode);
}
@@ -175,7 +167,7 @@ static ssize_t
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
/*
@@ -248,7 +240,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
handle_t *handle;
int err;
- handle = ext4_journal_start_sb(sb, 1);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
@@ -472,10 +464,8 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset,
* it will be as a data.
*/
- es.start = last;
- (void)ext4_es_find_extent(inode, &es);
- if (last >= es.start &&
- last < es.start + es.len) {
+ ext4_es_find_delayed_extent(inode, last, &es);
+ if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
if (last != start)
dataoff = last << blkbits;
break;
@@ -557,11 +547,9 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset,
* we will skip this extent.
*/
- es.start = last;
- (void)ext4_es_find_extent(inode, &es);
- if (last >= es.start &&
- last < es.start + es.len) {
- last = es.start + es.len;
+ ext4_es_find_delayed_extent(inode, last, &es);
+ if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+ last = es.es_lblk + es.es_len;
holeoff = last << blkbits;
continue;
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index dfbc1fe96674..3278e64e57b6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -109,8 +109,6 @@ static int __sync_inode(struct inode *inode, int datasync)
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
- *
- * i_mutex lock is held when entering and exiting this function
*/
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index fa8e4911d354..3d586f02883e 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -155,11 +155,11 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
/* Check to see if the seed is all zero's */
if (hinfo->seed) {
for (i = 0; i < 4; i++) {
- if (hinfo->seed[i])
+ if (hinfo->seed[i]) {
+ memcpy(buf, hinfo->seed, sizeof(buf));
break;
+ }
}
- if (i < 4)
- memcpy(buf, hinfo->seed, sizeof(buf));
}
switch (hinfo->hash_version) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3f32c8012447..32fd2b9075dd 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -634,8 +634,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
- const struct qstr *qstr, __u32 goal, uid_t *owner)
+struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+ umode_t mode, const struct qstr *qstr,
+ __u32 goal, uid_t *owner, int handle_type,
+ unsigned int line_no, int nblocks)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
@@ -725,6 +727,15 @@ repeat_in_this_group:
"inode=%lu", ino + 1);
continue;
}
+ if (!handle) {
+ BUG_ON(nblocks <= 0);
+ handle = __ext4_journal_start_sb(dir->i_sb, line_no,
+ handle_type, nblocks);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto fail;
+ }
+ }
BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
if (err)
@@ -1017,17 +1028,17 @@ iget_failed:
inode = NULL;
bad_orphan:
ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
- printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+ printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
bit, (unsigned long long)bitmap_bh->b_blocknr,
ext4_test_bit(bit, bitmap_bh->b_data));
- printk(KERN_NOTICE "inode=%p\n", inode);
+ printk(KERN_WARNING "inode=%p\n", inode);
if (inode) {
- printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+ printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
is_bad_inode(inode));
- printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+ printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
- printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
- printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
+ printk(KERN_WARNING "max_ino=%lu\n", max_ino);
+ printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
@@ -1137,7 +1148,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
goto out;
- handle = ext4_journal_start_sb(sb, 1);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 20862f96e8ae..b505a145a593 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -146,6 +146,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
struct super_block *sb = inode->i_sb;
Indirect *p = chain;
struct buffer_head *bh;
+ int ret = -EIO;
*err = 0;
/* i_data is not going away, no lock needed */
@@ -154,8 +155,10 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
goto no_block;
while (--depth) {
bh = sb_getblk(sb, le32_to_cpu(p->key));
- if (unlikely(!bh))
+ if (unlikely(!bh)) {
+ ret = -ENOMEM;
goto failure;
+ }
if (!bh_uptodate_or_lock(bh)) {
if (bh_submit_read(bh) < 0) {
@@ -177,7 +180,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
return NULL;
failure:
- *err = -EIO;
+ *err = ret;
no_block:
return p;
}
@@ -355,9 +358,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
* for the first direct block
*/
new_blocks[index] = current_block;
- printk(KERN_INFO "%s returned more blocks than "
+ WARN(1, KERN_INFO "%s returned more blocks than "
"requested\n", __func__);
- WARN_ON(1);
break;
}
}
@@ -471,7 +473,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
if (unlikely(!bh)) {
- err = -EIO;
+ err = -ENOMEM;
goto failed;
}
@@ -789,7 +791,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
if (final_size > inode->i_size) {
/* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -849,7 +851,7 @@ locked:
int err;
/* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
/* This is really bad luck. We've written the data
* but cannot extend i_size. Bail out and pretend
@@ -948,7 +950,8 @@ static handle_t *start_transaction(struct inode *inode)
{
handle_t *result;
- result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode));
+ result = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+ ext4_blocks_for_truncate(inode));
if (!IS_ERR(result))
return result;
@@ -1515,3 +1518,243 @@ out_stop:
trace_ext4_truncate_exit(inode);
}
+/*
+ * free_hole_blocks - free the mapped blocks of one pointer array that fall
+ * inside the range being punched.
+ * @handle:	journal handle handed on to ext4_free_data()
+ * @inode:	inode whose blocks are being released
+ * @parent_bh:	buffer that holds @i_data, or NULL when @i_data is the
+ *		pointer array embedded in the inode
+ * @i_data:	array of little-endian block pointers to walk
+ * @level:	0 if the entries point at data blocks, otherwise the number of
+ *		indirection levels below each entry
+ * @first:	first logical block to free, relative to the start of @i_data
+ * @count:	number of logical blocks to free starting at @first
+ * @max:	number of entries in @i_data
+ *
+ * Indirect blocks are descended recursively; an indirect block that ends up
+ * containing only zero pointers is freed as well.
+ *
+ * Returns 0 on success, -EIO if an indirect block cannot be read, or the
+ * error returned by a nested call.
+ */
+static int free_hole_blocks(handle_t *handle, struct inode *inode,
+			    struct buffer_head *parent_bh, __le32 *i_data,
+			    int level, ext4_lblk_t first,
+			    ext4_lblk_t count, int max)
+{
+	struct buffer_head *bh = NULL;
+	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	int ret = 0;
+	int i, inc;
+	ext4_lblk_t offset;
+	__le32 blk;
+
+	/* Number of logical blocks covered by a single entry at this level */
+	inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level);
+	for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) {
+		/* Entry starts at or after the end of the hole: done */
+		if (offset >= count + first)
+			break;
+		/* Unmapped entry, or entry ends before the hole starts */
+		if (*i_data == 0 || (offset + inc) <= first)
+			continue;
+		blk = *i_data;
+		if (level > 0) {
+			ext4_lblk_t first2, count2;
+
+			/*
+			 * The pointer is stored little-endian; convert it
+			 * before using it as a block number.
+			 */
+			bh = sb_bread(inode->i_sb, le32_to_cpu(blk));
+			if (!bh) {
+				EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),
+						       "Read failure");
+				return -EIO;
+			}
+			/*
+			 * Translate [first, first + count) into the child's
+			 * coordinates, which start at @offset.  When the hole
+			 * starts inside this child the length is unchanged;
+			 * otherwise only the part past @offset remains.
+			 * (Passing "count - offset" here would wrap around
+			 * whenever first > offset and free too much.)
+			 */
+			if (first > offset) {
+				first2 = first - offset;
+				count2 = count;
+			} else {
+				first2 = 0;
+				count2 = first + count - offset;
+			}
+			ret = free_hole_blocks(handle, inode, bh,
+					       (__le32 *)bh->b_data, level - 1,
+					       first2, count2,
+					       inode->i_sb->s_blocksize >> 2);
+			if (ret) {
+				brelse(bh);
+				goto err;
+			}
+		}
+		/*
+		 * Free a data block, or an indirect block that no longer
+		 * references anything.
+		 */
+		if (level == 0 ||
+		    (bh && all_zeroes((__le32 *)bh->b_data,
+				      (__le32 *)bh->b_data + addr_per_block))) {
+			ext4_free_data(handle, inode, parent_bh, &blk, &blk+1);
+			*i_data = 0;
+		}
+		brelse(bh);
+		bh = NULL;
+	}
+
+err:
+	return ret;
+}
+
+/*
+ * Free the blocks of @inode in the logical range [@first, @stop) by walking
+ * the inode's i_data array: the first EXT4_NDIR_BLOCKS entries are handled
+ * at level 0, then one further entry is handled at each of levels 1 to 3.
+ *
+ * Returns 0 on success or the first error reported by free_hole_blocks().
+ */
+static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+				 ext4_lblk_t first, ext4_lblk_t stop)
+{
+	int ptrs_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	__le32 *slot = EXT4_I(inode)->i_data;
+	ext4_lblk_t remaining = stop - first;
+	/* Logical blocks covered by the slots examined at this depth */
+	ext4_lblk_t span = EXT4_NDIR_BLOCKS;
+	/* Number of i_data slots examined at this depth */
+	int nslots = EXT4_NDIR_BLOCKS;
+	int rc = 0;
+	int depth;
+
+	for (depth = 0; depth < 4; depth++, span *= ptrs_per_block) {
+		if (first >= span) {
+			/* Hole starts beyond this depth: rebase and move on */
+			first -= span;
+		} else {
+			rc = free_hole_blocks(handle, inode, NULL, slot,
+					      depth, first, remaining, nslots);
+			if (rc)
+				break;
+			/* Hole ends within this depth: nothing left to do */
+			if (remaining <= span - first)
+				break;
+			remaining -= span - first;
+			first = 0;
+		}
+		slot += nslots;
+		if (depth == 0) {
+			/*
+			 * After the direct slots each depth uses one slot;
+			 * span restarts at 1 so the loop step scales it.
+			 */
+			nslots = 1;
+			span = 1;
+		}
+	}
+
+	return rc;
+}
+
+/*
+ * ext4_ind_punch_hole - punch a hole in a file
+ * @file:	file to punch the hole in
+ * @offset:	byte offset at which the hole starts
+ * @length:	length of the hole in bytes
+ *
+ * Writes back and drops the page cache over the range, zeroes the partial
+ * pages at both ends, and frees the fully covered blocks through
+ * ext4_free_hole_blocks().  inode->i_mutex is taken for the duration of the
+ * operation.
+ *
+ * Returns 0 on success (including when @offset is at or beyond i_size) or a
+ * negative errno on failure.
+ */
+int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	ext4_lblk_t first_block, stop_block;
+	struct address_space *mapping = inode->i_mapping;
+	handle_t *handle = NULL;
+	loff_t first_page, last_page, page_len;
+	loff_t first_page_offset, last_page_offset;
+	int err = 0;
+
+	/*
+	 * Write out all dirty pages to avoid race conditions,
+	 * then release them.
+	 */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		err = filemap_write_and_wait_range(mapping,
+			offset, offset + length - 1);
+		if (err)
+			return err;
+	}
+
+	mutex_lock(&inode->i_mutex);
+	/* Punching a hole is not allowed on append-only or immutable files */
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+		err = -EPERM;
+		goto out_mutex;
+	}
+	if (IS_SWAPFILE(inode)) {
+		err = -ETXTBSY;
+		goto out_mutex;
+	}
+
+	/* No need to punch a hole beyond i_size */
+	if (offset >= inode->i_size)
+		goto out_mutex;
+
+	/*
+	 * If the hole extends beyond i_size, set the hole
+	 * to end after the page that contains i_size.
+	 */
+	if (offset + length > inode->i_size) {
+		length = inode->i_size +
+		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+		   offset;
+	}
+
+	/* first_page is rounded up, last_page is rounded down */
+	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+	first_page_offset = first_page << PAGE_CACHE_SHIFT;
+	last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+	/* Now release the pages that lie entirely inside the hole */
+	if (last_page_offset > first_page_offset) {
+		truncate_pagecache_range(inode, first_page_offset,
+					 last_page_offset - 1);
+	}
+
+	/* Wait for all existing dio work; newcomers will block on i_mutex */
+	inode_dio_wait(inode);
+
+	handle = start_transaction(inode);
+	if (IS_ERR(handle)) {
+		/* Report the failure instead of returning success */
+		err = PTR_ERR(handle);
+		goto out_mutex;
+	}
+
+	/*
+	 * Now we need to zero out the non-page-aligned data in the
+	 * pages at the start and tail of the hole, and unmap the buffer
+	 * heads for the block-aligned regions of the page that were
+	 * completely zeroed.
+	 */
+	if (first_page > last_page) {
+		/*
+		 * If the file space being truncated is contained within a page
+		 * just zero out and unmap the middle of that page
+		 */
+		err = ext4_discard_partial_page_buffers(handle,
+			mapping, offset, length, 0);
+		if (err)
+			goto out;
+	} else {
+		/*
+		 * Zero out and unmap the partial page that contains
+		 * the start of the hole
+		 */
+		page_len = first_page_offset - offset;
+		if (page_len > 0) {
+			err = ext4_discard_partial_page_buffers(handle, mapping,
+							offset, page_len, 0);
+			if (err)
+				goto out;
+		}
+
+		/*
+		 * Zero out and unmap the partial page that contains
+		 * the end of the hole
+		 */
+		page_len = offset + length - last_page_offset;
+		if (page_len > 0) {
+			err = ext4_discard_partial_page_buffers(handle, mapping,
+						last_page_offset, page_len, 0);
+			if (err)
+				goto out;
+		}
+	}
+
+	/*
+	 * If i_size is contained in the last page, we need to
+	 * unmap and zero the partial page after i_size
+	 */
+	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
+	    inode->i_size % PAGE_CACHE_SIZE != 0) {
+		page_len = PAGE_CACHE_SIZE -
+			(inode->i_size & (PAGE_CACHE_SIZE - 1));
+		if (page_len > 0) {
+			err = ext4_discard_partial_page_buffers(handle,
+				mapping, inode->i_size, page_len, 0);
+			if (err)
+				goto out;
+		}
+	}
+
+	/* Only blocks that lie completely inside the hole are freed */
+	first_block = (offset + sb->s_blocksize - 1) >>
+		EXT4_BLOCK_SIZE_BITS(sb);
+	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+	if (first_block >= stop_block)
+		goto out;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_discard_preallocations(inode);
+
+	err = ext4_es_remove_extent(inode, first_block,
+				    stop_block - first_block);
+	/* Do not lose an extent status tree error by overwriting it */
+	if (!err)
+		err = ext4_free_hole_blocks(handle, inode, first_block,
+					    stop_block);
+
+	ext4_discard_preallocations(inode);
+
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+
+	up_write(&EXT4_I(inode)->i_data_sem);
+
+out:
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_journal_stop(handle);
+
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+
+	return err;
+}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 387c47c6cda9..c0fd1a123f7d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -545,7 +545,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
return ret;
retry:
- handle = ext4_journal_start(inode, needed_blocks);
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
@@ -657,7 +657,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
* The possible write could happen in the inode,
* so try to reserve the space in inode first.
*/
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
@@ -853,7 +853,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
if (ret)
return ret;
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
@@ -1188,7 +1188,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
data_bh = sb_getblk(inode->i_sb, map.m_pblk);
if (!data_bh) {
- error = -EIO;
+ error = -ENOMEM;
goto out_restore;
}
@@ -1298,7 +1298,7 @@ int ext4_read_inline_dir(struct file *filp,
int i, stored;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int ret, inline_size = 0;
struct ext4_iloc iloc;
void *dir_buf = NULL;
@@ -1770,7 +1770,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
needed_blocks = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, needed_blocks);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
if (IS_ERR(handle))
return;
@@ -1862,7 +1862,7 @@ int ext4_convert_inline_data(struct inode *inode)
if (error)
return error;
- handle = ext4_journal_start(inode, needed_blocks);
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto out_free;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cb1c1ab2720b..9ea0cde3fa9e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,10 +132,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
}
static void ext4_invalidatepage(struct page *page, unsigned long offset);
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
@@ -238,7 +234,8 @@ void ext4_evict_inode(struct inode *inode)
* protection against it
*/
sb_start_intwrite(inode->i_sb);
- handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+ ext4_blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -346,7 +343,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
spin_lock(&ei->i_block_reservation_lock);
trace_ext4_da_update_reserve_space(inode, used, quota_claim);
if (unlikely(used > ei->i_reserved_data_blocks)) {
- ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+ ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
"with only %d reserved data blocks",
__func__, inode->i_ino, used,
ei->i_reserved_data_blocks);
@@ -355,10 +352,12 @@ void ext4_da_update_reserve_space(struct inode *inode,
}
if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
- ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
- "with only %d reserved metadata blocks\n", __func__,
- inode->i_ino, ei->i_allocated_meta_blocks,
- ei->i_reserved_meta_blocks);
+ ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+ "with only %d reserved metadata blocks "
+ "(releasing %d blocks with reserved %d data blocks)",
+ inode->i_ino, ei->i_allocated_meta_blocks,
+ ei->i_reserved_meta_blocks, used,
+ ei->i_reserved_data_blocks);
WARN_ON(1);
ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
}
@@ -508,12 +507,33 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
+ struct extent_status es;
int retval;
map->m_flags = 0;
ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
+
+ /* Look up the extent status tree first */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = ext4_es_pblock(&es) +
+ map->m_lblk - es.es_lblk;
+ map->m_flags |= ext4_es_is_written(&es) ?
+ EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+ retval = 0;
+ } else {
+ BUG_ON(1);
+ }
+ goto found;
+ }
+
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -527,20 +547,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
retval = ext4_ind_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
}
+ if (retval > 0) {
+ int ret;
+ unsigned long long status;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len, map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
up_read((&EXT4_I(inode)->i_data_sem));
+found:
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret;
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- /* delayed alloc may be allocated by fallocate and
- * coverted to initialized by directIO.
- * we need to handle delayed extent here.
- */
- down_write((&EXT4_I(inode)->i_data_sem));
- goto delayed_mapped;
- }
- ret = check_block_validity(inode, map);
+ int ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -560,16 +587,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
return retval;
/*
- * When we call get_blocks without the create flag, the
- * BH_Unwritten flag could have gotten set if the blocks
- * requested were part of a uninitialized extent. We need to
- * clear this flag now that we are committed to convert all or
- * part of the uninitialized extent to be an initialized
- * extent. This is because we need to avoid the combination
- * of BH_Unwritten and BH_Mapped flags being simultaneously
- * set on the buffer_head.
+ * Here we clear m_flags because they will be set again after
+ * allocating a new extent.
*/
- map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+ map->m_flags &= ~EXT4_MAP_FLAGS;
/*
* New blocks allocate and/or writing to uninitialized extent
@@ -615,18 +636,23 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
ext4_da_update_reserve_space(inode, retval, 1);
}
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
- if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret;
-delayed_mapped:
- /* delayed allocation blocks has been allocated */
- ret = ext4_es_remove_extent(inode, map->m_lblk,
- map->m_len);
- if (ret < 0)
- retval = ret;
- }
+ if (retval > 0) {
+ int ret;
+ unsigned long long status;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
}
up_write((&EXT4_I(inode)->i_data_sem));
@@ -660,7 +686,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
- handle = ext4_journal_start(inode, dio_credits);
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ dio_credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
return ret;
@@ -707,14 +734,16 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
/* ensure we send some value back into *errp */
*errp = 0;
+ if (create && err == 0)
+ err = -ENOSPC; /* should never happen */
if (err < 0)
*errp = err;
if (err <= 0)
return NULL;
bh = sb_getblk(inode->i_sb, map.m_pblk);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
return NULL;
}
if (map.m_flags & EXT4_MAP_NEW) {
@@ -808,11 +837,10 @@ int ext4_walk_page_buffers(handle_t *handle,
* and the commit_write(). So doing the jbd2_journal_start at the start of
* prepare_write() is the right place.
*
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page. So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage(). In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page. So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
*
* By accident, ext4 can be reentered when a transaction is open via
* quota file writes. If we were to commit the transaction while thus
@@ -878,32 +906,40 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
flags, pagep);
if (ret < 0)
- goto out;
- if (ret == 1) {
- ret = 0;
- goto out;
- }
+ return ret;
+ if (ret == 1)
+ return 0;
}
-retry:
- handle = ext4_journal_start(inode, needed_blocks);
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ unlock_page(page);
+
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
+ page_cache_release(page);
+ return PTR_ERR(handle);
}
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
-
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
ext4_journal_stop(handle);
- ret = -ENOMEM;
- goto out;
+ goto retry_grab;
}
-
- *pagep = page;
+ wait_on_page_writeback(page);
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -918,7 +954,6 @@ retry:
if (ret) {
unlock_page(page);
- page_cache_release(page);
/*
* __block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
@@ -942,11 +977,14 @@ retry:
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
}
- }
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ page_cache_release(page);
+ return ret;
+ }
+ *pagep = page;
return ret;
}
@@ -1256,7 +1294,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
* function is called from invalidate page, it's
* harmless to return without any action.
*/
- ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+ ext4_warning(inode->i_sb, "ext4_da_release_space: "
"ino %lu, to_free %d with only %d reserved "
"data blocks", inode->i_ino, to_free,
ei->i_reserved_data_blocks);
@@ -1357,7 +1395,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
loff_t size = i_size_read(inode);
unsigned int len, block_start;
struct buffer_head *bh, *page_bufs = NULL;
- int journal_data = ext4_should_journal_data(inode);
sector_t pblock = 0, cur_logical = 0;
struct ext4_io_submit io_submit;
@@ -1378,7 +1415,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
- int commit_write = 0, skip_page = 0;
+ int skip_page = 0;
struct page *page = pvec.pages[i];
index = page->index;
@@ -1400,27 +1437,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- /*
- * If the page does not have buffers (for
- * whatever reason), try to create them using
- * __block_write_begin. If this fails,
- * skip the page and move on.
- */
- if (!page_has_buffers(page)) {
- if (__block_write_begin(page, 0, len,
- noalloc_get_block_write)) {
- skip_page:
- unlock_page(page);
- continue;
- }
- commit_write = 1;
- }
-
bh = page_bufs = page_buffers(page);
block_start = 0;
do {
- if (!bh)
- goto skip_page;
if (map && (cur_logical >= map->m_lblk) &&
(cur_logical <= (map->m_lblk +
(map->m_len - 1)))) {
@@ -1448,33 +1467,14 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
pblock++;
} while (bh != page_bufs);
- if (skip_page)
- goto skip_page;
-
- if (commit_write)
- /* mark the buffer_heads as dirty & uptodate */
- block_commit_write(page, 0, len);
+ if (skip_page) {
+ unlock_page(page);
+ continue;
+ }
clear_page_dirty_for_io(page);
- /*
- * Delalloc doesn't support data journalling,
- * but eventually maybe we'll lift this
- * restriction.
- */
- if (unlikely(journal_data && PageChecked(page)))
- err = __ext4_journalled_writepage(page, len);
- else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
- err = ext4_bio_write_page(&io_submit, page,
- len, mpd->wbc);
- else if (buffer_uninit(page_bufs)) {
- ext4_set_bh_endio(page_bufs, inode);
- err = block_write_full_page_endio(page,
- noalloc_get_block_write,
- mpd->wbc, ext4_end_io_buffer_write);
- } else
- err = block_write_full_page(page,
- noalloc_get_block_write, mpd->wbc);
-
+ err = ext4_bio_write_page(&io_submit, page, len,
+ mpd->wbc);
if (!err)
mpd->pages_written++;
/*
@@ -1640,7 +1640,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
(unsigned long long) next,
mpd->b_size >> mpd->inode->i_blkbits, err);
ext4_msg(sb, KERN_CRIT,
- "This should not happen!! Data will be lost\n");
+ "This should not happen!! Data will be lost");
if (err == -ENOSPC)
ext4_print_free_blocks(mpd->inode);
}
@@ -1690,16 +1690,16 @@ submit_io:
*
* @mpd->lbh - extent of blocks
* @logical - logical number of the block in the file
- * @bh - bh of the block (used to access block's state)
+ * @b_state - b_state of the buffer head added
*
* the function is used to collect contig. blocks in same state
*/
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
- sector_t logical, size_t b_size,
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
unsigned long b_state)
{
sector_t next;
- int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+ int blkbits = mpd->inode->i_blkbits;
+ int nrblocks = mpd->b_size >> blkbits;
/*
* XXX Don't go larger than mballoc is willing to allocate
@@ -1707,11 +1707,11 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* mpage_da_submit_io() into this function and then call
* ext4_map_blocks() multiple times in a loop
*/
- if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+ if (nrblocks >= (8*1024*1024 >> blkbits))
goto flush_it;
- /* check if thereserved journal credits might overflow */
- if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
+ /* check if the reserved journal credits might overflow */
+ if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
if (nrblocks >= EXT4_MAX_TRANS_DATA) {
/*
* With non-extent format we are limited by the journal
@@ -1720,16 +1720,6 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* nrblocks. So limit nrblocks.
*/
goto flush_it;
- } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
- EXT4_MAX_TRANS_DATA) {
- /*
- * Adding the new buffer_head would make it cross the
- * allowed limit for which we have journal credit
- * reserved. So limit the new bh->b_size
- */
- b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
- mpd->inode->i_blkbits;
- /* we will do mpage_da_submit_io in the next loop */
}
}
/*
@@ -1737,7 +1727,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
*/
if (mpd->b_size == 0) {
mpd->b_blocknr = logical;
- mpd->b_size = b_size;
+ mpd->b_size = 1 << blkbits;
mpd->b_state = b_state & BH_FLAGS;
return;
}
@@ -1747,7 +1737,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* Can we merge the block to our big extent?
*/
if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
- mpd->b_size += b_size;
+ mpd->b_size += 1 << blkbits;
return;
}
@@ -1775,6 +1765,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
struct ext4_map_blocks *map,
struct buffer_head *bh)
{
+ struct extent_status es;
int retval;
sector_t invalid_block = ~((sector_t) 0xffff);
@@ -1785,6 +1776,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
"logical block %lu\n", inode->i_ino, map->m_len,
(unsigned long) map->m_lblk);
+
+ /* Look up the extent status tree first */
+ if (ext4_es_lookup_extent(inode, iblock, &es)) {
+
+ if (ext4_es_is_hole(&es)) {
+ retval = 0;
+ down_read((&EXT4_I(inode)->i_data_sem));
+ goto add_delayed;
+ }
+
+ /*
+ * Delayed extent could be allocated by fallocate.
+ * So we need to check it.
+ */
+ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
+ }
+
+ map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+ retval = es.es_len - (iblock - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ if (ext4_es_is_written(&es))
+ map->m_flags |= EXT4_MAP_MAPPED;
+ else if (ext4_es_is_unwritten(&es))
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ else
+ BUG_ON(1);
+
+ return retval;
+ }
+
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -1803,11 +1830,15 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
map->m_flags |= EXT4_MAP_FROM_CLUSTER;
retval = 0;
} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+ retval = ext4_ext_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
else
- retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+ retval = ext4_ind_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
+add_delayed:
if (retval == 0) {
+ int ret;
/*
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
@@ -1815,15 +1846,20 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
/* If the block was allocated from previously allocated cluster,
* then we dont need to reserve it again. */
if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
- retval = ext4_da_reserve_space(inode, iblock);
- if (retval)
+ ret = ext4_da_reserve_space(inode, iblock);
+ if (ret) {
/* not enough space to reserve */
+ retval = ret;
goto out_unlock;
+ }
}
- retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
- if (retval)
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ ~0, EXTENT_STATUS_DELAYED);
+ if (ret) {
+ retval = ret;
goto out_unlock;
+ }
/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
* and it should not appear on the bh->b_state.
@@ -1833,6 +1869,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, invalid_block);
set_buffer_new(bh);
set_buffer_delay(bh);
+ } else if (retval > 0) {
+ int ret;
+ unsigned long long status;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret != 0)
+ retval = ret;
}
out_unlock:
@@ -1890,27 +1936,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return 0;
}
-/*
- * This function is used as a standard get_block_t calback function
- * when there is no desire to allocate any blocks. It is used as a
- * callback function for block_write_begin() and block_write_full_page().
- * These functions should only try to map a single block at a time.
- *
- * Since this function doesn't do block allocations even if the caller
- * requests it by passing in create=1, it is critically important that
- * any caller checks to make sure that any buffer heads are returned
- * by this function are either all already mapped or marked for
- * delayed allocation before calling block_write_full_page(). Otherwise,
- * b_blocknr could be left unitialized, and the page write functions will
- * be taken by surprise.
- */
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
- return _ext4_get_block(inode, iblock, bh_result, 0);
-}
-
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
get_bh(bh);
@@ -1955,7 +1980,8 @@ static int __ext4_journalled_writepage(struct page *page,
* references to buffers so we are safe */
unlock_page(page);
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -2035,11 +2061,12 @@ out:
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
{
- int ret = 0, commit_write = 0;
+ int ret = 0;
loff_t size;
unsigned int len;
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
+ struct ext4_io_submit io_submit;
trace_ext4_writepage(page);
size = i_size_read(inode);
@@ -2048,39 +2075,29 @@ static int ext4_writepage(struct page *page,
else
len = PAGE_CACHE_SIZE;
+ page_bufs = page_buffers(page);
/*
- * If the page does not have buffers (for whatever reason),
- * try to create them using __block_write_begin. If this
- * fails, redirty the page and move on.
+ * We cannot do block allocation or other extent handling in this
+ * function. If there are buffers needing that, we have to redirty
+ * the page. But we may reach here when we do a journal commit via
+ * journal_submit_inode_data_buffers() and in that case we must write
+ * allocated buffers to achieve data=ordered mode guarantees.
*/
- if (!page_has_buffers(page)) {
- if (__block_write_begin(page, 0, len,
- noalloc_get_block_write)) {
- redirty_page:
- redirty_page_for_writepage(wbc, page);
+ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
+ redirty_page_for_writepage(wbc, page);
+ if (current->flags & PF_MEMALLOC) {
+ /*
+ * For memory cleaning there's no point in writing only
+ * some buffers. So just bail out. Warn if we came here
+ * from direct reclaim.
+ */
+ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+ == PF_MEMALLOC);
unlock_page(page);
return 0;
}
- commit_write = 1;
}
- page_bufs = page_buffers(page);
- if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
- /*
- * We don't want to do block allocation, so redirty
- * the page and return. We may reach here when we do
- * a journal commit via journal_submit_inode_data_buffers.
- * We can also reach here via shrink_page_list but it
- * should never be for direct reclaim so warn if that
- * happens
- */
- WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
- PF_MEMALLOC);
- goto redirty_page;
- }
- if (commit_write)
- /* now mark the buffer_heads as dirty and uptodate */
- block_commit_write(page, 0, len);
if (PageChecked(page) && ext4_should_journal_data(inode))
/*
@@ -2089,14 +2106,9 @@ static int ext4_writepage(struct page *page,
*/
return __ext4_journalled_writepage(page, len);
- if (buffer_uninit(page_bufs)) {
- ext4_set_bh_endio(page_bufs, inode);
- ret = block_write_full_page_endio(page, noalloc_get_block_write,
- wbc, ext4_end_io_buffer_write);
- } else
- ret = block_write_full_page(page, noalloc_get_block_write,
- wbc);
-
+ memset(&io_submit, 0, sizeof(io_submit));
+ ret = ext4_bio_write_page(&io_submit, page, len, wbc);
+ ext4_io_submit(&io_submit);
return ret;
}
@@ -2228,51 +2240,38 @@ static int write_cache_pages_da(handle_t *handle,
logical = (sector_t) page->index <<
(PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!page_has_buffers(page)) {
- mpage_add_bh_to_extent(mpd, logical,
- PAGE_CACHE_SIZE,
- (1 << BH_Dirty) | (1 << BH_Uptodate));
- if (mpd->io_done)
- goto ret_extent_tail;
- } else {
+ /* Add all dirty buffers to mpd */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
/*
- * Page with regular buffer heads,
- * just add all dirty ones
+ * We need to try to allocate unmapped blocks
+ * in the same page. Otherwise we won't make
+ * progress with the page in ext4_writepage
*/
- head = page_buffers(page);
- bh = head;
- do {
- BUG_ON(buffer_locked(bh));
+ if (ext4_bh_delay_or_unwritten(NULL, bh)) {
+ mpage_add_bh_to_extent(mpd, logical,
+ bh->b_state);
+ if (mpd->io_done)
+ goto ret_extent_tail;
+ } else if (buffer_dirty(bh) &&
+ buffer_mapped(bh)) {
/*
- * We need to try to allocate
- * unmapped blocks in the same page.
- * Otherwise we won't make progress
- * with the page in ext4_writepage
+ * mapped dirty buffer. We need to
+ * update the b_state because we look
+ * at b_state in mpage_da_map_blocks.
+ * We don't update b_size because if we
+ * find an unmapped buffer_head later
+ * we need to use the b_state flag of
+ * that buffer_head.
*/
- if (ext4_bh_delay_or_unwritten(NULL, bh)) {
- mpage_add_bh_to_extent(mpd, logical,
- bh->b_size,
- bh->b_state);
- if (mpd->io_done)
- goto ret_extent_tail;
- } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
- /*
- * mapped dirty buffer. We need
- * to update the b_state
- * because we look at b_state
- * in mpage_da_map_blocks. We
- * don't update b_size because
- * if we find an unmapped
- * buffer_head later we need to
- * use the b_state flag of that
- * buffer_head.
- */
- if (mpd->b_size == 0)
- mpd->b_state = bh->b_state & BH_FLAGS;
- }
- logical++;
- } while ((bh = bh->b_this_page) != head);
- }
+ if (mpd->b_size == 0)
+ mpd->b_state =
+ bh->b_state & BH_FLAGS;
+ }
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
if (nr_to_write > 0) {
nr_to_write--;
@@ -2413,7 +2412,8 @@ retry:
needed_blocks = ext4_da_writepages_trans_blocks(inode);
/* start a new transaction*/
- handle = ext4_journal_start(inode, needed_blocks);
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
@@ -2512,12 +2512,8 @@ static int ext4_nonda_switch(struct super_block *sb)
/*
* Start pushing delalloc when 1/2 of free blocks are dirty.
*/
- if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
- !writeback_in_progress(sb->s_bdi) &&
- down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
- up_read(&sb->s_umount);
- }
+ if (dirty_blocks && (free_blocks < 2 * dirty_blocks))
+ try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
if (2 * free_blocks < 3 * dirty_blocks ||
free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
@@ -2555,42 +2551,52 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
pos, len, flags,
pagep, fsdata);
if (ret < 0)
- goto out;
- if (ret == 1) {
- ret = 0;
- goto out;
- }
+ return ret;
+ if (ret == 1)
+ return 0;
}
-retry:
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ unlock_page(page);
+
/*
* With delayed allocation, we don't log the i_disksize update
* if there is delayed block allocation. But we still need
* to journalling the i_disksize update if writes to the end
* of file which has an already mapped buffer.
*/
- handle = ext4_journal_start(inode, 1);
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
+ page_cache_release(page);
+ return PTR_ERR(handle);
}
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
ext4_journal_stop(handle);
- ret = -ENOMEM;
- goto out;
+ goto retry_grab;
}
- *pagep = page;
+ /* In case writeback began while the page was unlocked */
+ wait_on_page_writeback(page);
ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
if (ret < 0) {
unlock_page(page);
ext4_journal_stop(handle);
- page_cache_release(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
@@ -2598,11 +2604,16 @@ retry:
*/
if (pos + len > inode->i_size)
ext4_truncate_failed_write(inode);
+
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+
+ page_cache_release(page);
+ return ret;
}
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
+ *pagep = page;
return ret;
}
@@ -2858,47 +2869,37 @@ ext4_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned long offset)
{
- struct buffer_head *head, *bh;
- unsigned int curr_off = 0;
+ trace_ext4_invalidatepage(page, offset);
- if (!page_has_buffers(page))
- return;
- head = bh = page_buffers(page);
- do {
- if (offset <= curr_off && test_clear_buffer_uninit(bh)
- && bh->b_private) {
- ext4_free_io_end(bh->b_private);
- bh->b_private = NULL;
- bh->b_end_io = NULL;
- }
- curr_off = curr_off + bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
+ /* No journalling happens on data buffers when this function is used */
+ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+
+ block_invalidatepage(page, offset);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static int __ext4_journalled_invalidatepage(struct page *page,
+ unsigned long offset)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- trace_ext4_invalidatepage(page, offset);
+ trace_ext4_journalled_invalidatepage(page, offset);
/*
- * free any io_end structure allocated for buffers to be discarded
- */
- if (ext4_should_dioread_nolock(page->mapping->host))
- ext4_invalidatepage_free_endio(page, offset);
- /*
* If it's a full truncate we just forget about the pending dirtying
*/
if (offset == 0)
ClearPageChecked(page);
- if (journal)
- jbd2_journal_invalidatepage(journal, page, offset);
- else
- block_invalidatepage(page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset);
+}
+
+/* Wrapper for aops... */
+static void ext4_journalled_invalidatepage(struct page *page,
+ unsigned long offset)
+{
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2943,7 +2944,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private, int ret,
bool is_async)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private;
/* if not async direct IO or dio with 0 bytes write, just return */
@@ -2961,9 +2962,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
ext4_free_io_end(io_end);
out:
+ inode_dio_done(inode);
if (is_async)
aio_complete(iocb, ret, 0);
- inode_dio_done(inode);
return;
}
@@ -2977,65 +2978,6 @@ out:
ext4_add_complete_io(io_end);
}
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
-{
- ext4_io_end_t *io_end = bh->b_private;
- struct inode *inode;
-
- if (!test_clear_buffer_uninit(bh) || !io_end)
- goto out;
-
- if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
- ext4_msg(io_end->inode->i_sb, KERN_INFO,
- "sb umounted, discard end_io request for inode %lu",
- io_end->inode->i_ino);
- ext4_free_io_end(io_end);
- goto out;
- }
-
- /*
- * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
- * but being more careful is always safe for the future change.
- */
- inode = io_end->inode;
- ext4_set_io_unwritten_flag(inode, io_end);
- ext4_add_complete_io(io_end);
-out:
- bh->b_private = NULL;
- bh->b_end_io = NULL;
- clear_buffer_uninit(bh);
- end_buffer_async_write(bh, uptodate);
-}
-
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
-{
- ext4_io_end_t *io_end;
- struct page *page = bh->b_page;
- loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
- size_t size = bh->b_size;
-
-retry:
- io_end = ext4_init_io_end(inode, GFP_ATOMIC);
- if (!io_end) {
- pr_warn_ratelimited("%s: allocation fail\n", __func__);
- schedule();
- goto retry;
- }
- io_end->offset = offset;
- io_end->size = size;
- /*
- * We need to hold a reference to the page to make sure it
- * doesn't get evicted before ext4_end_io_work() has a chance
- * to convert the extent from written to unwritten.
- */
- io_end->page = page;
- get_page(io_end->page);
-
- bh->b_private = io_end;
- bh->b_end_io = ext4_end_io_buffer_write;
- return 0;
-}
-
/*
* For ext4 extent files, ext4 will do direct-io write to holes,
* preallocated extents, and those write extend the file, no need to
@@ -3264,7 +3206,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.write_end = ext4_journalled_write_end,
.set_page_dirty = ext4_journalled_set_page_dirty,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidatepage = ext4_journalled_invalidatepage,
.releasepage = ext4_releasepage,
.direct_IO = ext4_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -3537,20 +3479,20 @@ int ext4_can_truncate(struct inode *inode)
int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- /* TODO: Add support for non extent hole punching */
- return -EOPNOTSUPP;
- }
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return ext4_ind_punch_hole(file, offset, length);
if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
/* TODO: Add support for bigalloc file systems */
return -EOPNOTSUPP;
}
+ trace_ext4_punch_hole(inode, offset, length);
+
return ext4_ext_punch_hole(file, offset, length);
}
@@ -3644,11 +3586,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
bh = sb_getblk(sb, block);
- if (!bh) {
- EXT4_ERROR_INODE_BLOCK(inode, block,
- "unable to read itable block");
- return -EIO;
- }
+ if (unlikely(!bh))
+ return -ENOMEM;
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -3680,7 +3619,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
/* Is the inode bitmap in cache? */
bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
@@ -4305,6 +4244,47 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
+ * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
+ * buffers that are attached to a page straddling i_size and are undergoing
+ * commit. In that case we have to wait for commit to finish and try again.
+ */
+static void ext4_wait_for_tail_page_commit(struct inode *inode)
+{
+ struct page *page;
+ unsigned offset;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = 0;
+ int ret;
+
+ offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ /*
+ * All buffers in the last page remain valid? Then there's nothing to
+ * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+ * blocksize case
+ */
+ if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+ return;
+ while (1) {
+ page = find_lock_page(inode->i_mapping,
+ inode->i_size >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return;
+ ret = __ext4_journalled_invalidatepage(page, offset);
+ unlock_page(page);
+ page_cache_release(page);
+ if (ret != -EBUSY)
+ return;
+ commit_tid = 0;
+ read_lock(&journal->j_state_lock);
+ if (journal->j_committing_transaction)
+ commit_tid = journal->j_committing_transaction->t_tid;
+ read_unlock(&journal->j_state_lock);
+ if (commit_tid)
+ jbd2_log_wait_commit(journal, commit_tid);
+ }
+}
+
+/*
* ext4_setattr()
*
* Called from notify_change.
@@ -4347,8 +4327,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
- EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+ (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -4383,7 +4364,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
(attr->ia_size < inode->i_size)) {
handle_t *handle;
- handle = ext4_journal_start(inode, 3);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -4403,7 +4384,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_size);
if (error) {
/* Do as much error cleanup as possible */
- handle = ext4_journal_start(inode, 3);
+ handle = ext4_journal_start(inode,
+ EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
ext4_orphan_del(NULL, inode);
goto err_out;
@@ -4417,16 +4399,28 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(inode)) {
- truncate_setsize(inode, attr->ia_size);
- /* Inode size will be reduced, wait for dio in flight.
- * Temporarily disable dioread_nolock to prevent
- * livelock. */
+ if (attr->ia_size != inode->i_size) {
+ loff_t oldsize = inode->i_size;
+
+ i_size_write(inode, attr->ia_size);
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight. Temporarily disable
+ * dioread_nolock to prevent livelock.
+ */
if (orphan) {
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
- ext4_inode_resume_unlocked_dio(inode);
+ if (!ext4_should_journal_data(inode)) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ } else
+ ext4_wait_for_tail_page_commit(inode);
}
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+ */
+ truncate_pagecache(inode, oldsize, inode->i_size);
}
ext4_truncate(inode);
}
@@ -4732,7 +4726,7 @@ void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
goto out;
@@ -4833,7 +4827,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
/* Finally we can mark the inode as dirty. */
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4857,7 +4851,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unsigned long len;
int ret;
struct file *file = vma->vm_file;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
handle_t *handle;
get_block_t *get_block;
@@ -4899,7 +4893,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
0, len, NULL,
ext4_bh_unmapped)) {
/* Wait so that we don't change page under IO */
- wait_on_page_writeback(page);
+ wait_for_stable_page(page);
ret = VM_FAULT_LOCKED;
goto out;
}
@@ -4911,7 +4905,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
else
get_block = ext4_get_block;
retry_alloc:
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = VM_FAULT_SIGBUS;
goto out;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5747f52f7c72..721f4d33e148 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -22,7 +22,7 @@
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int flags;
@@ -104,7 +104,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
} else if (oldflags & EXT4_EOFBLOCKS_FL)
ext4_truncate(inode);
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto flags_out;
@@ -173,7 +173,7 @@ flags_out:
}
mutex_lock(&inode->i_mutex);
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto unlock_out;
@@ -313,6 +313,9 @@ mext_out:
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
+ if (!err && ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, input.group);
group_add_out:
ext4_resize_end(sb);
return err;
@@ -358,6 +361,7 @@ group_add_out:
ext4_fsblk_t n_blocks_count;
struct super_block *sb = inode->i_sb;
int err = 0, err2 = 0;
+ ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
@@ -388,6 +392,11 @@ group_add_out:
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
+ if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
+ ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, o_group);
+
resizefs_out:
ext4_resize_end(sb);
return err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1bf6fe785c4f..7bb713a46fe4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,11 +23,18 @@
#include "ext4_jbd2.h"
#include "mballoc.h"
-#include <linux/debugfs.h>
#include <linux/log2.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>
+#ifdef CONFIG_EXT4_DEBUG
+ushort ext4_mballoc_debug __read_mostly;
+
+module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
+MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
+#endif
+
/*
* MUSTDO:
* - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -1884,15 +1891,19 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
case 0:
BUG_ON(ac->ac_2order == 0);
- if (grp->bb_largest_free_order < ac->ac_2order)
- return 0;
-
/* Avoid using the first bg of a flexgroup for data files */
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
return 0;
+ if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+ (free / fragments) >= ac->ac_g_ex.fe_len)
+ return 1;
+
+ if (grp->bb_largest_free_order < ac->ac_2order)
+ return 0;
+
return 1;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
@@ -2007,7 +2018,7 @@ repeat:
}
ac->ac_groups_scanned++;
- if (cr == 0)
+ if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 && sbi->s_stripe &&
!(ac->ac_g_ex.fe_len % sbi->s_stripe))
@@ -2656,40 +2667,6 @@ static void ext4_free_data_callback(struct super_block *sb,
mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
}
-#ifdef CONFIG_EXT4_DEBUG
-u8 mb_enable_debug __read_mostly;
-
-static struct dentry *debugfs_dir;
-static struct dentry *debugfs_debug;
-
-static void __init ext4_create_debugfs_entry(void)
-{
- debugfs_dir = debugfs_create_dir("ext4", NULL);
- if (debugfs_dir)
- debugfs_debug = debugfs_create_u8("mballoc-debug",
- S_IRUGO | S_IWUSR,
- debugfs_dir,
- &mb_enable_debug);
-}
-
-static void ext4_remove_debugfs_entry(void)
-{
- debugfs_remove(debugfs_debug);
- debugfs_remove(debugfs_dir);
-}
-
-#else
-
-static void __init ext4_create_debugfs_entry(void)
-{
-}
-
-static void ext4_remove_debugfs_entry(void)
-{
-}
-
-#endif
-
int __init ext4_init_mballoc(void)
{
ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
@@ -2711,7 +2688,6 @@ int __init ext4_init_mballoc(void)
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
}
- ext4_create_debugfs_entry();
return 0;
}
@@ -2726,7 +2702,6 @@ void ext4_exit_mballoc(void)
kmem_cache_destroy(ext4_ac_cachep);
kmem_cache_destroy(ext4_free_data_cachep);
ext4_groupinfo_destroy_slabs();
- ext4_remove_debugfs_entry();
}
@@ -3444,7 +3419,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
win = offs;
ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
- EXT4_B2C(sbi, win);
+ EXT4_NUM_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
}
@@ -3872,7 +3847,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
struct super_block *sb = ac->ac_sb;
ext4_group_t ngroups, i;
- if (!mb_enable_debug ||
+ if (!ext4_mballoc_debug ||
(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
return;
@@ -4005,8 +3980,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
len = ar->len;
/* just a dirty hack to filter too big requests */
- if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10)
- len = EXT4_CLUSTERS_PER_GROUP(sb) - 10;
+ if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
+ len = EXT4_CLUSTERS_PER_GROUP(sb);
/* start searching from the goal */
goal = ar->goal;
@@ -4136,7 +4111,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;
/* Add the prealloc space to lg */
- rcu_read_lock();
+ spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&tmp_pa->pa_lock);
@@ -4160,12 +4135,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
if (!added)
list_add_tail_rcu(&pa->pa_inode_list,
&lg->lg_prealloc_list[order]);
- rcu_read_unlock();
+ spin_unlock(&lg->lg_prealloc_lock);
/* Now trim the list to be not more than 8 elements */
if (lg_prealloc_count > 8) {
ext4_mb_discard_lg_preallocations(sb, lg,
- order, lg_prealloc_count);
+ order, lg_prealloc_count);
return;
}
return ;
@@ -4590,7 +4565,7 @@ do_more:
EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
- count_clusters = EXT4_B2C(sbi, count);
+ count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh) {
err = -EIO;
@@ -4832,11 +4807,11 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
- EXT4_B2C(sbi, blocks_freed));
+ EXT4_NUM_B2C(sbi, blocks_freed));
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(EXT4_B2C(sbi, blocks_freed),
+ atomic_add(EXT4_NUM_B2C(sbi, blocks_freed),
&sbi->s_flex_groups[flex_group].free_clusters);
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 3ccd889ba953..08481ee84cd5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,11 @@
/*
*/
#ifdef CONFIG_EXT4_DEBUG
-extern u8 mb_enable_debug;
+extern ushort ext4_mballoc_debug;
#define mb_debug(n, fmt, a...) \
do { \
- if ((n) <= mb_enable_debug) { \
+ if ((n) <= ext4_mballoc_debug) { \
printk(KERN_DEBUG "(%s, %d): %s: ", \
__FILE__, __LINE__, __func__); \
printk(fmt, ## a); \
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index db8226d595fa..480acf4a085f 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -456,11 +456,14 @@ int ext4_ext_migrate(struct inode *inode)
*/
return retval;
- handle = ext4_journal_start(inode,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
- + 1);
+ /*
+ * Worst case we can touch the allocation bitmaps, a bgd
+ * block, and a block to link in the orphan list. We do need
+ * to worry about credits for modifying the quota inode.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
+ 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
return retval;
@@ -507,7 +510,7 @@ int ext4_ext_migrate(struct inode *inode)
ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
up_read((&EXT4_I(inode)->i_data_sem));
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
/*
* It is impossible to update on-disk structures without
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index fe7c63f4717e..f9b551561d2c 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -80,6 +80,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
* is not blocked in the elevator. */
if (!*bh)
*bh = sb_getblk(sb, mmp_block);
+ if (!*bh)
+ return -ENOMEM;
if (*bh) {
get_bh(*bh);
lock_buffer(*bh);
@@ -91,7 +93,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
*bh = NULL;
}
}
- if (!*bh) {
+ if (unlikely(!*bh)) {
ext4_warning(sb, "Error while reading MMP block %llu",
mmp_block);
return -EIO;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d9cc5ee42f53..4e81d47aa8cb 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -681,6 +681,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
depth = ext_depth(donor_inode);
dext = donor_path[depth].p_ext;
+ if (unlikely(!dext))
+ goto missing_donor_extent;
tmp_dext = *dext;
*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
@@ -691,7 +693,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
/* Loop for the donor extents */
while (1) {
/* The extent for donor must be found. */
- if (!dext) {
+ if (unlikely(!dext)) {
+ missing_donor_extent:
EXT4_ERROR_INODE(donor_inode,
"The extent for donor must be found");
*err = -EIO;
@@ -761,9 +764,6 @@ out:
kfree(donor_path);
}
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
-
return replaced_count;
}
@@ -900,7 +900,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
int block_len_in_page, int uninit, int *err)
{
- struct inode *orig_inode = o_filp->f_dentry->d_inode;
+ struct inode *orig_inode = file_inode(o_filp);
struct page *pagep[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset;
@@ -920,7 +920,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
again:
*err = 0;
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
- handle = ext4_journal_start(orig_inode, jblocks);
+ handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
if (IS_ERR(handle)) {
*err = PTR_ERR(handle);
return 0;
@@ -1279,8 +1279,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 orig_start, __u64 donor_start, __u64 len,
__u64 *moved_len)
{
- struct inode *orig_inode = o_filp->f_dentry->d_inode;
- struct inode *donor_inode = d_filp->f_dentry->d_inode;
+ struct inode *orig_inode = file_inode(o_filp);
+ struct inode *donor_inode = file_inode(d_filp);
struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
ext4_lblk_t block_start = orig_start;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index cac448282331..3825d6aa8336 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -47,38 +47,111 @@
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static struct buffer_head *ext4_append(handle_t *handle,
struct inode *inode,
- ext4_lblk_t *block, int *err)
+ ext4_lblk_t *block)
{
struct buffer_head *bh;
+ int err = 0;
if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
((inode->i_size >> 10) >=
- EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
- *err = -ENOSPC;
- return NULL;
- }
+ EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
+ return ERR_PTR(-ENOSPC);
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
- bh = ext4_bread(handle, inode, *block, 1, err);
- if (bh) {
- inode->i_size += inode->i_sb->s_blocksize;
- EXT4_I(inode)->i_disksize = inode->i_size;
- *err = ext4_journal_get_write_access(handle, bh);
- if (*err) {
+ bh = ext4_bread(handle, inode, *block, 1, &err);
+ if (!bh)
+ return ERR_PTR(err);
+ inode->i_size += inode->i_sb->s_blocksize;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+ return ERR_PTR(err);
+ }
+ return bh;
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent);
+
+typedef enum {
+ EITHER, INDEX, DIRENT
+} dirblock_type_t;
+
+#define ext4_read_dirblock(inode, block, type) \
+ __ext4_read_dirblock((inode), (block), (type), __LINE__)
+
+static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
+ ext4_lblk_t block,
+ dirblock_type_t type,
+ unsigned int line)
+{
+ struct buffer_head *bh;
+ struct ext4_dir_entry *dirent;
+ int err = 0, is_dx_block = 0;
+
+ bh = ext4_bread(NULL, inode, block, 0, &err);
+ if (!bh) {
+ if (err == 0) {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory hole found");
+ return ERR_PTR(-EIO);
+ }
+ __ext4_warning(inode->i_sb, __func__, line,
+ "error reading directory block "
+ "(ino %lu, block %lu)", inode->i_ino,
+ (unsigned long) block);
+ return ERR_PTR(err);
+ }
+ dirent = (struct ext4_dir_entry *) bh->b_data;
+ /* Determine whether or not we have an index block */
+ if (is_dx(inode)) {
+ if (block == 0)
+ is_dx_block = 1;
+ else if (ext4_rec_len_from_disk(dirent->rec_len,
+ inode->i_sb->s_blocksize) ==
+ inode->i_sb->s_blocksize)
+ is_dx_block = 1;
+ }
+ if (!is_dx_block && type == INDEX) {
+ ext4_error_inode(inode, __func__, line, block,
+ "directory leaf block found instead of index block");
+ return ERR_PTR(-EIO);
+ }
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) ||
+ buffer_verified(bh))
+ return bh;
+
+ /*
+ * An empty leaf block can get mistaken for an index block; for
+ * this reason, we can only check the index checksum when the
+ * caller is sure it should be an index block.
+ */
+ if (is_dx_block && type == INDEX) {
+ if (ext4_dx_csum_verify(inode, dirent))
+ set_buffer_verified(bh);
+ else {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory index failed checksum");
brelse(bh);
- bh = NULL;
+ return ERR_PTR(-EIO);
}
}
- if (!bh && !(*err)) {
- *err = -EIO;
- ext4_error(inode->i_sb,
- "Directory hole detected on inode %lu\n",
- inode->i_ino);
+ if (!is_dx_block) {
+ if (ext4_dirent_csum_verify(inode, dirent))
+ set_buffer_verified(bh);
+ else {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory block failed checksum");
+ brelse(bh);
+ return ERR_PTR(-EIO);
+ }
}
return bh;
}
@@ -604,9 +677,9 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
u32 hash;
frame->bh = NULL;
- if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) {
- if (*err == 0)
- *err = ERR_BAD_DX_DIR;
+ bh = ext4_read_dirblock(dir, 0, INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto fail;
}
root = (struct dx_root *) bh->b_data;
@@ -643,15 +716,6 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
goto fail;
}
- if (!buffer_verified(bh) &&
- !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
- ext4_warning(dir->i_sb, "Root failed checksum");
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
- set_buffer_verified(bh);
-
entries = (struct dx_entry *) (((char *)&root->info) +
root->info.info_length);
@@ -709,22 +773,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
frame->entries = entries;
frame->at = at;
if (!indirect--) return frame;
- if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) {
- if (!(*err))
- *err = ERR_BAD_DX_DIR;
+ bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto fail2;
}
- at = entries = ((struct dx_node *) bh->b_data)->entries;
-
- if (!buffer_verified(bh) &&
- !ext4_dx_csum_verify(dir,
- (struct ext4_dir_entry *)bh->b_data)) {
- ext4_warning(dir->i_sb, "Node failed checksum");
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
- set_buffer_verified(bh);
+ entries = ((struct dx_node *) bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit (dir)) {
ext4_warning(dir->i_sb,
@@ -783,7 +837,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
{
struct dx_frame *p;
struct buffer_head *bh;
- int err, num_frames = 0;
+ int num_frames = 0;
__u32 bhash;
p = frame;
@@ -822,25 +876,9 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
* block so no check is necessary
*/
while (num_frames--) {
- if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
- 0, &err))) {
- if (!err) {
- ext4_error(dir->i_sb,
- "Directory hole detected on inode %lu\n",
- dir->i_ino);
- return -EIO;
- }
- return err; /* Failure */
- }
-
- if (!buffer_verified(bh) &&
- !ext4_dx_csum_verify(dir,
- (struct ext4_dir_entry *)bh->b_data)) {
- ext4_warning(dir->i_sb, "Node failed checksum");
- return -EIO;
- }
- set_buffer_verified(bh);
-
+ bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
p++;
brelse(p->bh);
p->bh = bh;
@@ -866,20 +904,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
- if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) {
- if (!err) {
- err = -EIO;
- ext4_error(dir->i_sb,
- "Directory hole detected on inode %lu\n",
- dir->i_ino);
- }
- return err;
- }
-
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
- return -EIO;
- set_buffer_verified(bh);
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de +
@@ -937,7 +964,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
start_hash, start_minor_hash));
- dir = dir_file->f_path.dentry->d_inode;
+ dir = file_inode(dir_file);
if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
@@ -1333,26 +1360,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
return NULL;
do {
block = dx_get_block(frame->at);
- if (!(bh = ext4_bread(NULL, dir, block, 0, err))) {
- if (!(*err)) {
- *err = -EIO;
- ext4_error(dir->i_sb,
- "Directory hole detected on inode %lu\n",
- dir->i_ino);
- }
- goto errout;
- }
-
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(dir,
- (struct ext4_dir_entry *)bh->b_data)) {
- EXT4_ERROR_INODE(dir, "checksumming directory "
- "block %lu", (unsigned long)block);
- brelse(bh);
- *err = -EIO;
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto errout;
}
- set_buffer_verified(bh);
retval = search_dirblock(bh, dir, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb),
res_dir);
@@ -1536,11 +1548,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
csum_size = sizeof(struct ext4_dir_entry_tail);
- bh2 = ext4_append (handle, dir, &newblock, &err);
- if (!(bh2)) {
+ bh2 = ext4_append(handle, dir, &newblock);
+ if (IS_ERR(bh2)) {
brelse(*bh);
*bh = NULL;
- goto errout;
+ *error = PTR_ERR(bh2);
+ return NULL;
}
BUFFER_TRACE(*bh, "get_write_access");
@@ -1621,7 +1634,6 @@ journal_error:
brelse(bh2);
*bh = NULL;
ext4_std_error(dir->i_sb, err);
-errout:
*error = err;
return NULL;
}
@@ -1699,7 +1711,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned int blocksize = dir->i_sb->s_blocksize;
- unsigned short reclen;
int csum_size = 0;
int err;
@@ -1707,7 +1718,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
csum_size = sizeof(struct ext4_dir_entry_tail);
- reclen = EXT4_DIR_REC_LEN(namelen);
if (!de) {
err = ext4_find_dest_de(dir, inode,
bh, bh->b_data, blocksize - csum_size,
@@ -1798,10 +1808,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
/* Allocate new block for the 0th block's dirents */
- bh2 = ext4_append(handle, dir, &block, &retval);
- if (!(bh2)) {
+ bh2 = ext4_append(handle, dir, &block);
+ if (IS_ERR(bh2)) {
brelse(bh);
- return retval;
+ return PTR_ERR(bh2);
}
ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
data1 = bh2->b_data;
@@ -1918,20 +1928,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
- if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) {
- if (!retval) {
- retval = -EIO;
- ext4_error(inode->i_sb,
- "Directory hole detected on inode %lu\n",
- inode->i_ino);
- }
- return retval;
- }
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(dir,
- (struct ext4_dir_entry *)bh->b_data))
- return -EIO;
- set_buffer_verified(bh);
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
if (retval != -ENOSPC) {
brelse(bh);
@@ -1943,9 +1943,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return make_indexed_dir(handle, dentry, inode, bh);
brelse(bh);
}
- bh = ext4_append(handle, dir, &block, &retval);
- if (!bh)
- return retval;
+ bh = ext4_append(handle, dir, &block);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
@@ -1982,22 +1982,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
return err;
entries = frame->entries;
at = frame->at;
-
- if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) {
- if (!err) {
- err = -EIO;
- ext4_error(dir->i_sb,
- "Directory hole detected on inode %lu\n",
- dir->i_ino);
- }
+ bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ bh = NULL;
goto cleanup;
}
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
- goto journal_error;
- set_buffer_verified(bh);
-
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
@@ -2025,9 +2016,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
err = -ENOSPC;
goto cleanup;
}
- bh2 = ext4_append (handle, dir, &newblock, &err);
- if (!(bh2))
+ bh2 = ext4_append(handle, dir, &newblock);
+ if (IS_ERR(bh2)) {
+ err = PTR_ERR(bh2);
goto cleanup;
+ }
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
memset(&node2->fake, 0, sizeof(struct fake_dirent));
@@ -2106,8 +2099,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
journal_error:
ext4_std_error(dir->i_sb, err);
cleanup:
- if (bh)
- brelse(bh);
+ brelse(bh);
dx_release(frames);
return err;
}
@@ -2254,29 +2246,28 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
{
handle_t *handle;
struct inode *inode;
- int err, retries = 0;
+ int err, credits, retries = 0;
dquot_initialize(dir);
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+ inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+ NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
}
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2287,31 +2278,30 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
{
handle_t *handle;
struct inode *inode;
- int err, retries = 0;
+ int err, credits, retries = 0;
if (!new_valid_dev(rdev))
return -EINVAL;
dquot_initialize(dir);
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+ inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+ NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
}
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2351,6 +2341,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct buffer_head *dir_block = NULL;
struct ext4_dir_entry_2 *de;
struct ext4_dir_entry_tail *t;
+ ext4_lblk_t block = 0;
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
int err;
@@ -2367,17 +2358,10 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
goto out;
}
- inode->i_size = EXT4_I(inode)->i_disksize = blocksize;
- dir_block = ext4_bread(handle, inode, 0, 1, &err);
- if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
- if (!err) {
- err = -EIO;
- ext4_error(inode->i_sb,
- "Directory hole detected on inode %lu\n",
- inode->i_ino);
- }
- goto out;
- }
+ inode->i_size = 0;
+ dir_block = ext4_append(handle, inode, &block);
+ if (IS_ERR(dir_block))
+ return PTR_ERR(dir_block);
BUFFER_TRACE(dir_block, "get_write_access");
err = ext4_journal_get_write_access(handle, dir_block);
if (err)
@@ -2404,25 +2388,21 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
handle_t *handle;
struct inode *inode;
- int err, retries = 0;
+ int err, credits, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
return -EMLINK;
dquot_initialize(dir);
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
- &dentry->d_name, 0, NULL);
+ inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
+ &dentry->d_name,
+ 0, NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -2450,8 +2430,12 @@ out_clear_inode:
goto out_clear_inode;
unlock_new_inode(inode);
d_instantiate(dentry, inode);
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
out_stop:
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2477,25 +2461,14 @@ static int empty_dir(struct inode *inode)
}
sb = inode->i_sb;
- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
- !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
- if (err)
- EXT4_ERROR_INODE(inode,
- "error %d reading directory lblock 0", err);
- else
- ext4_warning(inode->i_sb,
- "bad directory (dir #%lu) - no data block",
- inode->i_ino);
+ if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
+ EXT4_ERROR_INODE(inode, "invalid size");
return 1;
}
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(inode,
- (struct ext4_dir_entry *)bh->b_data)) {
- EXT4_ERROR_INODE(inode, "checksum error reading directory "
- "lblock 0");
- return -EIO;
- }
- set_buffer_verified(bh);
+ bh = ext4_read_dirblock(inode, 0, EITHER);
+ if (IS_ERR(bh))
+ return 1;
+
de = (struct ext4_dir_entry_2 *) bh->b_data;
de1 = ext4_next_entry(de, sb->s_blocksize);
if (le32_to_cpu(de->inode) != inode->i_ino ||
@@ -2518,28 +2491,9 @@ static int empty_dir(struct inode *inode)
err = 0;
brelse(bh);
lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- bh = ext4_bread(NULL, inode, lblock, 0, &err);
- if (!bh) {
- if (err)
- EXT4_ERROR_INODE(inode,
- "error %d reading directory "
- "lblock %u", err, lblock);
- else
- ext4_warning(inode->i_sb,
- "bad directory (dir #%lu) - no data block",
- inode->i_ino);
-
- offset += sb->s_blocksize;
- continue;
- }
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(inode,
- (struct ext4_dir_entry *)bh->b_data)) {
- EXT4_ERROR_INODE(inode, "checksum error "
- "reading directory lblock 0");
- return -EIO;
- }
- set_buffer_verified(bh);
+ bh = ext4_read_dirblock(inode, lblock, EITHER);
+ if (IS_ERR(bh))
+ return 1;
de = (struct ext4_dir_entry_2 *) bh->b_data;
}
if (ext4_check_dir_entry(inode, NULL, de, bh,
@@ -2648,7 +2602,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0;
- if (!EXT4_SB(inode->i_sb)->s_journal)
+ if ((!EXT4_SB(inode->i_sb)->s_journal) &&
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
return 0;
mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2717,25 +2672,18 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
struct inode *inode;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- handle_t *handle;
+ handle_t *handle = NULL;
/* Initialize quotas before so that eventual writes go in
* separate transaction */
dquot_initialize(dir);
dquot_initialize(dentry->d_inode);
- handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
goto end_rmdir;
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
inode = dentry->d_inode;
retval = -EIO;
@@ -2746,6 +2694,17 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
if (!empty_dir(inode))
goto end_rmdir;
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ handle = NULL;
+ goto end_rmdir;
+ }
+
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_rmdir;
@@ -2767,8 +2726,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_mark_inode_dirty(handle, dir);
end_rmdir:
- ext4_journal_stop(handle);
brelse(bh);
+ if (handle)
+ ext4_journal_stop(handle);
return retval;
}
@@ -2778,7 +2738,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- handle_t *handle;
+ handle_t *handle = NULL;
trace_ext4_unlink_enter(dir, dentry);
/* Initialize quotas before so that eventual writes go
@@ -2786,13 +2746,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
dquot_initialize(dir);
dquot_initialize(dentry->d_inode);
- handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
@@ -2804,6 +2757,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_unlink;
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ handle = NULL;
+ goto end_unlink;
+ }
+
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
if (!inode->i_nlink) {
ext4_warning(inode->i_sb,
"Deleting nonexistent file (%lu), %d",
@@ -2824,8 +2788,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = 0;
end_unlink:
- ext4_journal_stop(handle);
brelse(bh);
+ if (handle)
+ ext4_journal_stop(handle);
trace_ext4_unlink_exit(dentry, retval);
return retval;
}
@@ -2865,15 +2830,10 @@ static int ext4_symlink(struct inode *dir,
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
}
retry:
- handle = ext4_journal_start(dir, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
- &dentry->d_name, 0, NULL);
+ inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
+ &dentry->d_name, 0, NULL,
+ EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -2903,7 +2863,7 @@ retry:
* Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
* + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
*/
- handle = ext4_journal_start(dir,
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
if (IS_ERR(handle)) {
@@ -2926,8 +2886,12 @@ retry:
}
EXT4_I(inode)->i_disksize = inode->i_size;
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
out_stop:
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2950,8 +2914,9 @@ static int ext4_link(struct dentry *old_dentry,
dquot_initialize(dir);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS);
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2991,13 +2956,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
struct buffer_head *bh;
if (!ext4_has_inline_data(inode)) {
- if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) {
- if (!*retval) {
- *retval = -EIO;
- ext4_error(inode->i_sb,
- "Directory hole detected on inode %lu\n",
- inode->i_ino);
- }
+ bh = ext4_read_dirblock(inode, 0, EITHER);
+ if (IS_ERR(bh)) {
+ *retval = PTR_ERR(bh);
return NULL;
}
*parent_de = ext4_next_entry(
@@ -3034,9 +2995,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* in separate transaction */
if (new_dentry->d_inode)
dquot_initialize(new_dentry->d_inode);
- handle = ext4_journal_start(old_dir, 2 *
- EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+ handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -3076,11 +3037,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
&inlined);
if (!dir_bh)
goto end_rename;
- if (!inlined && !buffer_verified(dir_bh) &&
- !ext4_dirent_csum_verify(old_inode,
- (struct ext4_dir_entry *)dir_bh->b_data))
- goto end_rename;
- set_buffer_verified(dir_bh);
if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
goto end_rename;
retval = -EMLINK;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0016fbca2a40..809b31003ecc 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -23,6 +23,7 @@
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -73,8 +74,6 @@ void ext4_free_io_end(ext4_io_end_t *io)
BUG_ON(!list_empty(&io->list));
BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
- if (io->page)
- put_page(io->page);
for (i = 0; i < io->num_io_pages; i++)
put_io_page(io->pages[i]);
io->num_io_pages = 0;
@@ -103,14 +102,13 @@ static int ext4_end_io(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
- if (io->iocb)
- aio_complete(io->iocb, io->result, 0);
-
- if (io->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(inode);
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(inode));
+ if (io->flag & EXT4_IO_END_DIRECT)
+ inode_dio_done(inode);
+ if (io->iocb)
+ aio_complete(io->iocb, io->result, 0);
return ret;
}
@@ -119,7 +117,6 @@ static void dump_completed_IO(struct inode *inode)
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
- unsigned long flags;
if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
ext4_debug("inode %lu completed_io list is empty\n",
@@ -152,26 +149,20 @@ void ext4_add_complete_io(ext4_io_end_t *io_end)
wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (list_empty(&ei->i_completed_io_list)) {
- io_end->flag |= EXT4_IO_END_QUEUED;
- queue_work(wq, &io_end->work);
- }
+ if (list_empty(&ei->i_completed_io_list))
+ queue_work(wq, &ei->i_unwritten_work);
list_add_tail(&io_end->list, &ei->i_completed_io_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
-static int ext4_do_flush_completed_IO(struct inode *inode,
- ext4_io_end_t *work_io)
+static int ext4_do_flush_completed_IO(struct inode *inode)
{
ext4_io_end_t *io;
- struct list_head unwritten, complete, to_free;
+ struct list_head unwritten;
unsigned long flags;
struct ext4_inode_info *ei = EXT4_I(inode);
int err, ret = 0;
- INIT_LIST_HEAD(&complete);
- INIT_LIST_HEAD(&to_free);
-
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
dump_completed_IO(inode);
list_replace_init(&ei->i_completed_io_list, &unwritten);
@@ -185,32 +176,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
err = ext4_end_io(io);
if (unlikely(!ret && err))
ret = err;
-
- list_add_tail(&io->list, &complete);
- }
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- while (!list_empty(&complete)) {
- io = list_entry(complete.next, ext4_io_end_t, list);
io->flag &= ~EXT4_IO_END_UNWRITTEN;
- /* end_io context can not be destroyed now because it still
- * used by queued worker. Worker thread will destroy it later */
- if (io->flag & EXT4_IO_END_QUEUED)
- list_del_init(&io->list);
- else
- list_move(&io->list, &to_free);
- }
- /* If we are called from worker context, it is time to clear queued
- * flag, and destroy it's end_io if it was converted already */
- if (work_io) {
- work_io->flag &= ~EXT4_IO_END_QUEUED;
- if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
- list_add_tail(&work_io->list, &to_free);
- }
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
- while (!list_empty(&to_free)) {
- io = list_entry(to_free.next, ext4_io_end_t, list);
- list_del_init(&io->list);
ext4_free_io_end(io);
}
return ret;
@@ -219,10 +185,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
/*
* work on completed aio dio IO, to convert unwritten extents to extents
*/
-static void ext4_end_io_work(struct work_struct *work)
+void ext4_end_io_work(struct work_struct *work)
{
- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- ext4_do_flush_completed_IO(io->inode, io);
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_unwritten_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode);
}
int ext4_flush_unwritten_io(struct inode *inode)
@@ -230,7 +197,7 @@ int ext4_flush_unwritten_io(struct inode *inode)
int ret;
WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
!(inode->i_state & I_FREEING));
- ret = ext4_do_flush_completed_IO(inode, NULL);
+ ret = ext4_do_flush_completed_IO(inode);
ext4_unwritten_wait(inode);
return ret;
}
@@ -241,7 +208,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
if (io) {
atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
- INIT_WORK(&io->work, ext4_end_io_work);
INIT_LIST_HEAD(&io->list);
}
return io;
@@ -382,14 +348,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
- if (!buffer_mapped(bh) || buffer_delay(bh)) {
- if (!buffer_mapped(bh))
- clear_buffer_dirty(bh);
- if (io->io_bio)
- ext4_io_submit(io);
- return 0;
- }
-
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
ext4_io_submit(io);
@@ -436,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
if (!io_page) {
- set_page_dirty(page);
+ redirty_page_for_writepage(wbc, page);
unlock_page(page);
return -ENOMEM;
}
@@ -468,7 +426,15 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
set_buffer_uptodate(bh);
continue;
}
- clear_buffer_dirty(bh);
+ if (!buffer_dirty(bh) || buffer_delay(bh) ||
+ !buffer_mapped(bh) || buffer_unwritten(bh)) {
+ /* A hole? We can safely clear the dirty bit */
+ if (!buffer_mapped(bh))
+ clear_buffer_dirty(bh);
+ if (io->io_bio)
+ ext4_io_submit(io);
+ continue;
+ }
ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
if (ret) {
/*
@@ -476,9 +442,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* we can do but mark the page as dirty, and
* better luck next time.
*/
- set_page_dirty(page);
+ redirty_page_for_writepage(wbc, page);
break;
}
+ clear_buffer_dirty(bh);
}
unlock_page(page);
/*
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index d99387b89edd..b2c8ee56eb98 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -333,8 +333,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
int err;
bh = sb_getblk(sb, blk);
- if (!bh)
- return ERR_PTR(-EIO);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
if ((err = ext4_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -410,8 +410,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
return err;
bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
- if (!bh)
- return -EIO;
+ if (unlikely(!bh))
+ return -ENOMEM;
err = ext4_journal_get_write_access(handle, bh);
if (err)
@@ -466,7 +466,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
/* This transaction may be extended/restarted along the way */
- handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -500,8 +500,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
goto out;
gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
+ if (unlikely(!gdb)) {
+ err = -ENOMEM;
goto out;
}
@@ -1031,7 +1031,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
handle_t *handle;
int err = 0, err2;
- handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
if (IS_ERR(handle)) {
group = 1;
err = PTR_ERR(handle);
@@ -1064,8 +1064,8 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
ext4_bg_has_super(sb, group));
bh = sb_getblk(sb, backup_block);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
break;
}
ext4_debug("update metadata backup %llu(+%llu)\n",
@@ -1168,7 +1168,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
{
struct buffer_head *bh = sb_getblk(sb, block);
- if (!bh)
+ if (unlikely(!bh))
return NULL;
if (!bh_uptodate_or_lock(bh)) {
if (bh_submit_read(bh) < 0) {
@@ -1247,7 +1247,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
ext4_inode_table_set(sb, gdp, group_data->inode_table);
ext4_free_group_clusters_set(sb, gdp,
- EXT4_B2C(sbi, group_data->free_blocks_count));
+ EXT4_NUM_B2C(sbi, group_data->free_blocks_count));
ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
if (ext4_has_group_desc_csum(sb))
ext4_itable_unused_set(sb, gdp,
@@ -1349,7 +1349,7 @@ static void ext4_update_super(struct super_block *sb,
/* Update the free space counts */
percpu_counter_add(&sbi->s_freeclusters_counter,
- EXT4_B2C(sbi, free_blocks));
+ EXT4_NUM_B2C(sbi, free_blocks));
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
@@ -1360,7 +1360,7 @@ static void ext4_update_super(struct super_block *sb,
sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, group_data[0].group);
- atomic_add(EXT4_B2C(sbi, free_blocks),
+ atomic_add(EXT4_NUM_B2C(sbi, free_blocks),
&sbi->s_flex_groups[flex_group].free_clusters);
atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
&sbi->s_flex_groups[flex_group].free_inodes);
@@ -1412,7 +1412,7 @@ static int ext4_flex_group_add(struct super_block *sb,
* modify each of the reserved GDT dindirect blocks.
*/
credit = flex_gd->count * 4 + reserved_gdb;
- handle = ext4_journal_start_sb(sb, credit);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto exit;
@@ -1506,10 +1506,12 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
group_data[i].blocks_count = blocks_per_group;
overhead = ext4_group_overhead_blocks(sb, group + i);
group_data[i].free_blocks_count = blocks_per_group - overhead;
- if (ext4_has_group_desc_csum(sb))
+ if (ext4_has_group_desc_csum(sb)) {
flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
EXT4_BG_INODE_UNINIT;
- else
+ if (!test_opt(sb, INIT_INODE_TABLE))
+ flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED;
+ } else
flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
}
@@ -1594,7 +1596,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
err = ext4_alloc_flex_bg_array(sb, input->group + 1);
if (err)
- return err;
+ goto out;
err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
if (err)
@@ -1622,7 +1624,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,
/* We will update the superblock, one block bitmap, and
* one group descriptor via ext4_group_add_blocks().
*/
- handle = ext4_journal_start_sb(sb, 3);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_warning(sb, "error %d on journal start", err);
@@ -1786,7 +1788,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
credits += 3; /* block bitmap, bg descriptor, resize inode */
}
- handle = ext4_journal_start_sb(sb, credits);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3cdb0a2fc648..5e6c87836193 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,8 +69,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
static void ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
-static const char *ext4_decode_error(struct super_block *sb, int errno,
- char nbuf[16]);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
@@ -296,107 +294,6 @@ void ext4_itable_unused_set(struct super_block *sb,
}
-/* Just increment the non-pointer handle value */
-static handle_t *ext4_get_nojournal(void)
-{
- handle_t *handle = current->journal_info;
- unsigned long ref_cnt = (unsigned long)handle;
-
- BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
-
- ref_cnt++;
- handle = (handle_t *)ref_cnt;
-
- current->journal_info = handle;
- return handle;
-}
-
-
-/* Decrement the non-pointer handle value */
-static void ext4_put_nojournal(handle_t *handle)
-{
- unsigned long ref_cnt = (unsigned long)handle;
-
- BUG_ON(ref_cnt == 0);
-
- ref_cnt--;
- handle = (handle_t *)ref_cnt;
-
- current->journal_info = handle;
-}
-
-/*
- * Wrappers for jbd2_journal_start/end.
- */
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
-{
- journal_t *journal;
-
- trace_ext4_journal_start(sb, nblocks, _RET_IP_);
- if (sb->s_flags & MS_RDONLY)
- return ERR_PTR(-EROFS);
-
- WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
- journal = EXT4_SB(sb)->s_journal;
- if (!journal)
- return ext4_get_nojournal();
- /*
- * Special case here: if the journal has aborted behind our
- * backs (eg. EIO in the commit thread), then we still need to
- * take the FS itself readonly cleanly.
- */
- if (is_journal_aborted(journal)) {
- ext4_abort(sb, "Detected aborted journal");
- return ERR_PTR(-EROFS);
- }
- return jbd2_journal_start(journal, nblocks);
-}
-
-int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
-{
- struct super_block *sb;
- int err;
- int rc;
-
- if (!ext4_handle_valid(handle)) {
- ext4_put_nojournal(handle);
- return 0;
- }
- sb = handle->h_transaction->t_journal->j_private;
- err = handle->h_err;
- rc = jbd2_journal_stop(handle);
-
- if (!err)
- err = rc;
- if (err)
- __ext4_std_error(sb, where, line, err);
- return err;
-}
-
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
- const char *err_fn, struct buffer_head *bh,
- handle_t *handle, int err)
-{
- char nbuf[16];
- const char *errstr = ext4_decode_error(NULL, err, nbuf);
-
- BUG_ON(!ext4_handle_valid(handle));
-
- if (bh)
- BUFFER_TRACE(bh, "abort");
-
- if (!handle->h_err)
- handle->h_err = err;
-
- if (is_handle_aborted(handle))
- return;
-
- printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
- caller, line, errstr, err_fn);
-
- jbd2_journal_abort_handle(handle);
-}
-
static void __save_error_info(struct super_block *sb, const char *func,
unsigned int line)
{
@@ -553,7 +450,7 @@ void ext4_error_file(struct file *file, const char *function,
va_list args;
struct va_format vaf;
struct ext4_super_block *es;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
char pathname[80], *path;
es = EXT4_SB(inode->i_sb)->s_es;
@@ -582,8 +479,8 @@ void ext4_error_file(struct file *file, const char *function,
ext4_handle_error(inode->i_sb);
}
-static const char *ext4_decode_error(struct super_block *sb, int errno,
- char nbuf[16])
+const char *ext4_decode_error(struct super_block *sb, int errno,
+ char nbuf[16])
{
char *errstr = NULL;
@@ -858,6 +755,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
+ ext4_es_unregister_shrinker(sb);
del_timer(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
@@ -885,6 +783,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_extent_cache_cnt);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -939,11 +838,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
ei->vfs_inode.i_version = 1;
- memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
+ INIT_LIST_HEAD(&ei->i_es_lru);
+ ei->i_es_lru_nr = 0;
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
@@ -960,6 +860,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
+ INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work);
return &ei->vfs_inode;
}
@@ -1031,6 +932,7 @@ void ext4_clear_inode(struct inode *inode)
dquot_drop(inode);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ ext4_es_lru_del(inode);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
EXT4_I(inode)->jinode);
@@ -1280,8 +1182,8 @@ static const match_table_t tokens = {
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
{Opt_nodelalloc, "nodelalloc"},
- {Opt_mblk_io_submit, "mblk_io_submit"},
- {Opt_nomblk_io_submit, "nomblk_io_submit"},
+ {Opt_removed, "mblk_io_submit"},
+ {Opt_removed, "nomblk_io_submit"},
{Opt_block_validity, "block_validity"},
{Opt_noblock_validity, "noblock_validity"},
{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1337,6 +1239,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
char *qname;
+ int ret = -1;
if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
@@ -1345,29 +1248,37 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"quota options when quota turned on");
return -1;
}
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
+ "when QUOTA feature is enabled");
+ return -1;
+ }
qname = match_strdup(args);
if (!qname) {
ext4_msg(sb, KERN_ERR,
"Not enough memory for storing quotafile name");
return -1;
}
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext4_msg(sb, KERN_ERR,
- "%s quota file already specified", QTYPE2NAME(qtype));
- kfree(qname);
- return -1;
+ if (sbi->s_qf_names[qtype]) {
+ if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
+ ret = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ goto errout;
}
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
+ if (strchr(qname, '/')) {
ext4_msg(sb, KERN_ERR,
"quotafile must be on filesystem root");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
- return -1;
+ goto errout;
}
+ sbi->s_qf_names[qtype] = qname;
set_opt(sb, QUOTA);
return 1;
+errout:
+ kfree(qname);
+ return ret;
}
static int clear_qf_name(struct super_block *sb, int qtype)
@@ -1381,10 +1292,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
" when quota turned on");
return -1;
}
- /*
- * The space will be released later when all options are confirmed
- * to be correct
- */
+ kfree(sbi->s_qf_names[qtype]);
sbi->s_qf_names[qtype] = NULL;
return 1;
}
@@ -1404,6 +1312,9 @@ static int clear_qf_name(struct super_block *sb, int qtype)
#define MOPT_QFMT MOPT_NOSUPPORT
#endif
#define MOPT_DATAJ 0x0080
+#define MOPT_NO_EXT2 0x0100
+#define MOPT_NO_EXT3 0x0200
+#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
static const struct mount_opts {
int token;
@@ -1414,25 +1325,31 @@ static const struct mount_opts {
{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
- {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
- {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
- {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
- {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR},
+ {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
+ MOPT_EXT4_ONLY | MOPT_SET},
+ {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
+ MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
- {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT},
- {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT},
- {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET},
+ {Opt_delalloc, EXT4_MOUNT_DELALLOC,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
+ {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
+ MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT},
+ {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
+ MOPT_EXT4_ONLY | MOPT_SET},
{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
- EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET},
- {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET},
+ EXT4_MOUNT_JOURNAL_CHECKSUM),
+ MOPT_EXT4_ONLY | MOPT_SET},
+ {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET},
- {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR},
+ {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
+ MOPT_NO_EXT2 | MOPT_SET},
+ {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
+ MOPT_NO_EXT2 | MOPT_CLEAR},
{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
@@ -1444,9 +1361,14 @@ static const struct mount_opts {
{Opt_inode_readahead_blks, 0, MOPT_GTE0},
{Opt_init_itable, 0, MOPT_GTE0},
{Opt_stripe, 0, MOPT_GTE0},
- {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
- {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
- {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
+ {Opt_resuid, 0, MOPT_GTE0},
+ {Opt_resgid, 0, MOPT_GTE0},
+ {Opt_journal_dev, 0, MOPT_GTE0},
+ {Opt_journal_ioprio, 0, MOPT_GTE0},
+ {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
+ MOPT_NO_EXT2 | MOPT_DATAJ},
{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1496,8 +1418,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
else if (token == Opt_offgrpjquota)
return clear_qf_name(sb, GRPQUOTA);
#endif
- if (args->from && match_int(args, &arg))
- return -1;
switch (token) {
case Opt_noacl:
case Opt_nouser_xattr:
@@ -1506,138 +1426,156 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
case Opt_sb:
return 1; /* handled by get_sb_block() */
case Opt_removed:
- ext4_msg(sb, KERN_WARNING,
- "Ignoring removed %s option", opt);
+ ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
return 1;
- case Opt_resuid:
+ case Opt_abort:
+ sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ return 1;
+ case Opt_i_version:
+ sb->s_flags |= MS_I_VERSION;
+ return 1;
+ }
+
+ for (m = ext4_mount_opts; m->token != Opt_err; m++)
+ if (token == m->token)
+ break;
+
+ if (m->token == Opt_err) {
+ ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
+ "or missing value", opt);
+ return -1;
+ }
+
+ if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Mount option \"%s\" incompatible with ext2", opt);
+ return -1;
+ }
+ if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Mount option \"%s\" incompatible with ext3", opt);
+ return -1;
+ }
+
+ if (args->from && match_int(args, &arg))
+ return -1;
+ if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
+ return -1;
+ if (m->flags & MOPT_EXPLICIT)
+ set_opt2(sb, EXPLICIT_DELALLOC);
+ if (m->flags & MOPT_CLEAR_ERR)
+ clear_opt(sb, ERRORS_MASK);
+ if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
+ ext4_msg(sb, KERN_ERR, "Cannot change quota "
+ "options when quota turned on");
+ return -1;
+ }
+
+ if (m->flags & MOPT_NOSUPPORT) {
+ ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
+ } else if (token == Opt_commit) {
+ if (arg == 0)
+ arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ sbi->s_commit_interval = HZ * arg;
+ } else if (token == Opt_max_batch_time) {
+ if (arg == 0)
+ arg = EXT4_DEF_MAX_BATCH_TIME;
+ sbi->s_max_batch_time = arg;
+ } else if (token == Opt_min_batch_time) {
+ sbi->s_min_batch_time = arg;
+ } else if (token == Opt_inode_readahead_blks) {
+ if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
+ ext4_msg(sb, KERN_ERR,
+ "EXT4-fs: inode_readahead_blks must be "
+ "0 or a power of 2 smaller than 2^31");
+ return -1;
+ }
+ sbi->s_inode_readahead_blks = arg;
+ } else if (token == Opt_init_itable) {
+ set_opt(sb, INIT_INODE_TABLE);
+ if (!args->from)
+ arg = EXT4_DEF_LI_WAIT_MULT;
+ sbi->s_li_wait_mult = arg;
+ } else if (token == Opt_max_dir_size_kb) {
+ sbi->s_max_dir_size_kb = arg;
+ } else if (token == Opt_stripe) {
+ sbi->s_stripe = arg;
+ } else if (token == Opt_resuid) {
uid = make_kuid(current_user_ns(), arg);
if (!uid_valid(uid)) {
ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
return -1;
}
sbi->s_resuid = uid;
- return 1;
- case Opt_resgid:
+ } else if (token == Opt_resgid) {
gid = make_kgid(current_user_ns(), arg);
if (!gid_valid(gid)) {
ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
return -1;
}
sbi->s_resgid = gid;
- return 1;
- case Opt_abort:
- sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
- return 1;
- case Opt_i_version:
- sb->s_flags |= MS_I_VERSION;
- return 1;
- case Opt_journal_dev:
+ } else if (token == Opt_journal_dev) {
if (is_remount) {
ext4_msg(sb, KERN_ERR,
"Cannot specify journal on remount");
return -1;
}
*journal_devnum = arg;
- return 1;
- case Opt_journal_ioprio:
- if (arg < 0 || arg > 7)
- return -1;
- *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
- return 1;
- }
-
- for (m = ext4_mount_opts; m->token != Opt_err; m++) {
- if (token != m->token)
- continue;
- if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
- return -1;
- if (m->flags & MOPT_EXPLICIT)
- set_opt2(sb, EXPLICIT_DELALLOC);
- if (m->flags & MOPT_CLEAR_ERR)
- clear_opt(sb, ERRORS_MASK);
- if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
- ext4_msg(sb, KERN_ERR, "Cannot change quota "
- "options when quota turned on");
+ } else if (token == Opt_journal_ioprio) {
+ if (arg > 7) {
+ ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
+ " (must be 0-7)");
return -1;
}
-
- if (m->flags & MOPT_NOSUPPORT) {
- ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
- } else if (token == Opt_commit) {
- if (arg == 0)
- arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
- sbi->s_commit_interval = HZ * arg;
- } else if (token == Opt_max_batch_time) {
- if (arg == 0)
- arg = EXT4_DEF_MAX_BATCH_TIME;
- sbi->s_max_batch_time = arg;
- } else if (token == Opt_min_batch_time) {
- sbi->s_min_batch_time = arg;
- } else if (token == Opt_inode_readahead_blks) {
- if (arg > (1 << 30))
- return -1;
- if (arg && !is_power_of_2(arg)) {
+ *journal_ioprio =
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+ } else if (m->flags & MOPT_DATAJ) {
+ if (is_remount) {
+ if (!sbi->s_journal)
+ ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+ else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
ext4_msg(sb, KERN_ERR,
- "EXT4-fs: inode_readahead_blks"
- " must be a power of 2");
- return -1;
- }
- sbi->s_inode_readahead_blks = arg;
- } else if (token == Opt_init_itable) {
- set_opt(sb, INIT_INODE_TABLE);
- if (!args->from)
- arg = EXT4_DEF_LI_WAIT_MULT;
- sbi->s_li_wait_mult = arg;
- } else if (token == Opt_max_dir_size_kb) {
- sbi->s_max_dir_size_kb = arg;
- } else if (token == Opt_stripe) {
- sbi->s_stripe = arg;
- } else if (m->flags & MOPT_DATAJ) {
- if (is_remount) {
- if (!sbi->s_journal)
- ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
- else if (test_opt(sb, DATA_FLAGS) !=
- m->mount_opt) {
- ext4_msg(sb, KERN_ERR,
"Cannot change data mode on remount");
- return -1;
- }
- } else {
- clear_opt(sb, DATA_FLAGS);
- sbi->s_mount_opt |= m->mount_opt;
- }
-#ifdef CONFIG_QUOTA
- } else if (m->flags & MOPT_QFMT) {
- if (sb_any_quota_loaded(sb) &&
- sbi->s_jquota_fmt != m->mount_opt) {
- ext4_msg(sb, KERN_ERR, "Cannot "
- "change journaled quota options "
- "when quota turned on");
return -1;
}
- sbi->s_jquota_fmt = m->mount_opt;
-#endif
} else {
- if (!args->from)
- arg = 1;
- if (m->flags & MOPT_CLEAR)
- arg = !arg;
- else if (unlikely(!(m->flags & MOPT_SET))) {
- ext4_msg(sb, KERN_WARNING,
- "buggy handling of option %s", opt);
- WARN_ON(1);
- return -1;
- }
- if (arg != 0)
- sbi->s_mount_opt |= m->mount_opt;
- else
- sbi->s_mount_opt &= ~m->mount_opt;
+ clear_opt(sb, DATA_FLAGS);
+ sbi->s_mount_opt |= m->mount_opt;
}
- return 1;
+#ifdef CONFIG_QUOTA
+ } else if (m->flags & MOPT_QFMT) {
+ if (sb_any_quota_loaded(sb) &&
+ sbi->s_jquota_fmt != m->mount_opt) {
+ ext4_msg(sb, KERN_ERR, "Cannot change journaled "
+ "quota options when quota turned on");
+ return -1;
+ }
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot set journaled quota options "
+ "when QUOTA feature is enabled");
+ return -1;
+ }
+ sbi->s_jquota_fmt = m->mount_opt;
+#endif
+ } else {
+ if (!args->from)
+ arg = 1;
+ if (m->flags & MOPT_CLEAR)
+ arg = !arg;
+ else if (unlikely(!(m->flags & MOPT_SET))) {
+ ext4_msg(sb, KERN_WARNING,
+ "buggy handling of option %s", opt);
+ WARN_ON(1);
+ return -1;
+ }
+ if (arg != 0)
+ sbi->s_mount_opt |= m->mount_opt;
+ else
+ sbi->s_mount_opt &= ~m->mount_opt;
}
- ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
- "or missing value", opt);
- return -1;
+ return 1;
}
static int parse_options(char *options, struct super_block *sb,
@@ -1645,9 +1583,7 @@ static int parse_options(char *options, struct super_block *sb,
unsigned int *journal_ioprio,
int is_remount)
{
-#ifdef CONFIG_QUOTA
struct ext4_sb_info *sbi = EXT4_SB(sb);
-#endif
char *p;
substring_t args[MAX_OPT_ARGS];
int token;
@@ -1669,6 +1605,12 @@ static int parse_options(char *options, struct super_block *sb,
return 0;
}
#ifdef CONFIG_QUOTA
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
+ ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
+ "feature is enabled");
+ return 0;
+ }
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sb, USRQUOTA);
@@ -1696,6 +1638,16 @@ static int parse_options(char *options, struct super_block *sb,
}
}
#endif
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ int blocksize =
+ BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
+
+ if (blocksize < PAGE_CACHE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "dioread_nolock if block size != PAGE_SIZE");
+ return 0;
+ }
+ }
return 1;
}
@@ -2212,7 +2164,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
__func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
+ mutex_lock(&inode->i_mutex);
ext4_truncate(inode);
+ mutex_unlock(&inode->i_mutex);
nr_truncates++;
} else {
ext4_msg(sb, KERN_DEBUG,
@@ -2766,7 +2720,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
break;
}
- if (group == ngroups)
+ if (group >= ngroups)
ret = 1;
if (!ret) {
@@ -3006,33 +2960,34 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
return elr;
}
-static int ext4_register_li_request(struct super_block *sb,
- ext4_group_t first_not_zeroed)
+int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_li_request *elr;
+ struct ext4_li_request *elr = NULL;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
int ret = 0;
+ mutex_lock(&ext4_li_mtx);
if (sbi->s_li_request != NULL) {
/*
* Reset timeout so it can be computed again, because
* s_li_wait_mult might have changed.
*/
sbi->s_li_request->lr_timeout = 0;
- return 0;
+ goto out;
}
if (first_not_zeroed == ngroups ||
(sb->s_flags & MS_RDONLY) ||
!test_opt(sb, INIT_INODE_TABLE))
- return 0;
+ goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
- if (!elr)
- return -ENOMEM;
-
- mutex_lock(&ext4_li_mtx);
+ if (!elr) {
+ ret = -ENOMEM;
+ goto out;
+ }
if (NULL == ext4_li_info) {
ret = ext4_li_info_new();
@@ -3223,6 +3178,10 @@ int ext4_calculate_overhead(struct super_block *sb)
memset(buf, 0, PAGE_SIZE);
cond_resched();
}
+ /* Add the journal blocks as well */
+ if (sbi->s_journal)
+ overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
+
sbi->s_overhead = overhead;
smp_wmb();
free_page((unsigned long) buf);
@@ -3365,7 +3324,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
- set_opt(sb, MBLK_IO_SUBMIT);
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
set_opt(sb, JOURNAL_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3436,15 +3394,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
clear_opt(sb, DELALLOC);
}
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- if (blocksize < PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "dioread_nolock if block size != PAGE_SIZE");
- goto failed_mount;
- }
- }
-
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -3486,6 +3435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
blocksize > EXT4_MAX_BLOCK_SIZE) {
ext4_msg(sb, KERN_ERR,
@@ -3757,6 +3707,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!err) {
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
}
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0);
+ }
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
goto failed_mount3;
@@ -3766,6 +3719,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
+ /* Register extent status tree shrinker */
+ ext4_es_register_shrinker(sb);
+
/*
* set up enough so that it can read an inode
*/
@@ -3777,13 +3733,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_QUOTA
- sb->s_qcop = &ext4_qctl_operations;
sb->dq_op = &ext4_quota_operations;
-
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
- /* Use qctl operations for hidden quota files. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
sb->s_qcop = &ext4_qctl_sysfile_operations;
- }
+ else
+ sb->s_qcop = &ext4_qctl_operations;
#endif
memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
@@ -3979,6 +3933,16 @@ no_journal:
if (err)
goto failed_mount7;
+#ifdef CONFIG_QUOTA
+ /* Enable quota usage during mount. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ !(sb->s_flags & MS_RDONLY)) {
+ err = ext4_enable_quotas(sb);
+ if (err)
+ goto failed_mount8;
+ }
+#endif /* CONFIG_QUOTA */
+
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
@@ -3996,16 +3960,6 @@ no_journal:
} else
descr = "out journal";
-#ifdef CONFIG_QUOTA
- /* Enable quota usage during mount. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
- !(sb->s_flags & MS_RDONLY)) {
- err = ext4_enable_quotas(sb);
- if (err)
- goto failed_mount7;
- }
-#endif /* CONFIG_QUOTA */
-
if (test_opt(sb, DISCARD)) {
struct request_queue *q = bdev_get_queue(sb->s_bdev);
if (!blk_queue_discard(q))
@@ -4029,6 +3983,10 @@ cantfind_ext4:
ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
+#ifdef CONFIG_QUOTA
+failed_mount8:
+ kobject_del(&sbi->s_kobj);
+#endif
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
@@ -4055,6 +4013,7 @@ failed_mount3:
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ percpu_counter_destroy(&sbi->s_extent_cache_cnt);
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
@@ -4470,16 +4429,12 @@ static void ext4_clear_journal_err(struct super_block *sb,
int ext4_force_commit(struct super_block *sb)
{
journal_t *journal;
- int ret = 0;
if (sb->s_flags & MS_RDONLY)
return 0;
journal = EXT4_SB(sb)->s_journal;
- if (journal)
- ret = ext4_journal_force_commit(journal);
-
- return ret;
+ return ext4_journal_force_commit(journal);
}
static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -4582,7 +4537,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
int err = 0;
#ifdef CONFIG_QUOTA
- int i;
+ int i, j;
#endif
char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -4598,7 +4553,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
- old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+ if (sbi->s_qf_names[i]) {
+ old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!old_opts.s_qf_names[i]) {
+ for (j = 0; j < i; j++)
+ kfree(old_opts.s_qf_names[j]);
+ kfree(orig_data);
+ return -ENOMEM;
+ }
+ } else
+ old_opts.s_qf_names[i] = NULL;
#endif
if (sbi->s_journal && sbi->s_journal->j_task->io_context)
journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
@@ -4725,15 +4690,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
ext4_setup_system_zone(sb);
- if (sbi->s_journal == NULL)
+ if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
ext4_commit_super(sb, 1);
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- if (old_opts.s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(old_opts.s_qf_names[i]);
+ kfree(old_opts.s_qf_names[i]);
if (enable_quota) {
if (sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
@@ -4762,9 +4725,7 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(sbi->s_qf_names[i]);
+ kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
@@ -4829,7 +4790,7 @@ static int ext4_write_dquot(struct dquot *dquot)
struct inode *inode;
inode = dquot_to_inode(dquot);
- handle = ext4_journal_start(inode,
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4845,7 +4806,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
int ret, err;
handle_t *handle;
- handle = ext4_journal_start(dquot_to_inode(dquot),
+ handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4861,7 +4822,7 @@ static int ext4_release_dquot(struct dquot *dquot)
int ret, err;
handle_t *handle;
- handle = ext4_journal_start(dquot_to_inode(dquot),
+ handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
@@ -4877,9 +4838,12 @@ static int ext4_release_dquot(struct dquot *dquot)
static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
+ struct super_block *sb = dquot->dq_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
/* Are we journaling quotas? */
- if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
- EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
+ sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
} else {
@@ -4893,7 +4857,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(sb->s_root->d_inode, 2);
+ handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -4999,9 +4963,9 @@ static int ext4_enable_quotas(struct super_block *sb)
DQUOT_USAGE_ENABLED);
if (err) {
ext4_warning(sb,
- "Failed to enable quota (type=%d) "
- "tracking. Please run e2fsck to fix.",
- type);
+ "Failed to enable quota tracking "
+ "(type=%d, err=%d). Please run "
+ "e2fsck to fix.", type, err);
return err;
}
}
@@ -5039,7 +5003,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
/* Update modification times of quota files when userspace can
* start looking at them */
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
if (IS_ERR(handle))
goto out;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a91ebc2b66f..3a120b277240 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -549,7 +549,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
error = ext4_handle_dirty_xattr_block(handle, inode, bh);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- dquot_free_block(inode, 1);
+ dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
ea_bdebug(bh, "refcount now=%d; releasing",
le32_to_cpu(BHDR(bh)->h_refcount));
}
@@ -832,7 +832,8 @@ inserted:
else {
/* The old block is released after updating
the inode. */
- error = dquot_alloc_block(inode, 1);
+ error = dquot_alloc_block(inode,
+ EXT4_C2B(EXT4_SB(sb), 1));
if (error)
goto cleanup;
error = ext4_journal_get_write_access(handle,
@@ -886,17 +887,18 @@ inserted:
(unsigned long long)block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
+ error = -ENOMEM;
getblk_failed:
ext4_free_blocks(handle, inode, NULL, block, 1,
EXT4_FREE_BLOCKS_METADATA);
- error = -EIO;
goto cleanup;
}
lock_buffer(new_bh);
error = ext4_journal_get_create_access(handle, new_bh);
if (error) {
unlock_buffer(new_bh);
+ error = -EIO;
goto getblk_failed;
}
memcpy(new_bh->b_data, s->base, new_bh->b_size);
@@ -928,7 +930,7 @@ cleanup:
return error;
cleanup_dquot:
- dquot_free_block(inode, 1);
+ dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1));
goto cleanup;
bad_block:
@@ -1164,17 +1166,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
{
handle_t *handle;
int error, retries = 0;
- int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+ int credits = ext4_jbd2_credits_xattr(inode);
retry:
- /*
- * In case of inline data, we may push out the data to a block,
- * So reserve the journal space first.
- */
- if (ext4_has_inline_data(inode))
- credits += ext4_writepage_trans_blocks(inode) + 1;
-
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
} else {
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 69eda787a96a..aa25deb5c6cd 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -125,74 +125,6 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
-extern int ext4_has_inline_data(struct inode *inode);
-extern int ext4_get_inline_size(struct inode *inode);
-extern int ext4_get_max_inline_size(struct inode *inode);
-extern int ext4_find_inline_data_nolock(struct inode *inode);
-extern void ext4_write_inline_data(struct inode *inode,
- struct ext4_iloc *iloc,
- void *buffer, loff_t pos,
- unsigned int len);
-extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len);
-extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len);
-extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
-
-extern int ext4_readpage_inline(struct inode *inode, struct page *page);
-extern int ext4_try_to_write_inline_data(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- unsigned flags,
- struct page **pagep);
-extern int ext4_write_inline_data_end(struct inode *inode,
- loff_t pos, unsigned len,
- unsigned copied,
- struct page *page);
-extern struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
- unsigned len,
- struct page *page);
-extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- unsigned flags,
- struct page **pagep,
- void **fsdata);
-extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
- unsigned len, unsigned copied,
- struct page *page);
-extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode);
-extern int ext4_try_create_inline_dir(handle_t *handle,
- struct inode *parent,
- struct inode *inode);
-extern int ext4_read_inline_dir(struct file *filp,
- void *dirent, filldir_t filldir,
- int *has_inline_data);
-extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
- const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir,
- int *has_inline_data);
-extern int ext4_delete_inline_entry(handle_t *handle,
- struct inode *dir,
- struct ext4_dir_entry_2 *de_del,
- struct buffer_head *bh,
- int *has_inline_data);
-extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
-extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
- struct ext4_dir_entry_2 **parent_de,
- int *retval);
-extern int ext4_inline_data_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- int *has_inline);
-extern int ext4_try_to_evict_inline_data(handle_t *handle,
- struct inode *inode,
- int needed);
-extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
-
-extern int ext4_convert_inline_data(struct inode *inode);
-
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr);
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
new file mode 100644
index 000000000000..fd27e7e6326e
--- /dev/null
+++ b/fs/f2fs/Kconfig
@@ -0,0 +1,53 @@
+config F2FS_FS
+ tristate "F2FS filesystem support (EXPERIMENTAL)"
+ depends on BLOCK
+ help
+ F2FS is based on Log-structured File System (LFS), which supports
+ versatile "flash-friendly" features. The design has been focused on
+ addressing the fundamental issues in LFS, which are snowball effect
+ of wandering tree and high cleaning overhead.
+
+ Since flash-based storages show different characteristics according to
+ the internal geometry or flash memory management schemes aka FTL, F2FS
+ and tools support various parameters not only for configuring on-disk
+ layout, but also for selecting allocation and cleaning algorithms.
+
+ If unsure, say N.
+
+config F2FS_STAT_FS
+ bool "F2FS Status Information"
+ depends on F2FS_FS && DEBUG_FS
+ default y
+ help
+ /sys/kernel/debug/f2fs/ contains information about all the partitions
+ mounted as f2fs. Each file shows the whole f2fs information.
+
+ /sys/kernel/debug/f2fs/status includes:
+ - major file system information managed by f2fs currently
+ - average SIT information about whole segments
+ - current memory footprint consumed by f2fs.
+
+config F2FS_FS_XATTR
+ bool "F2FS extended attributes"
+ depends on F2FS_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+config F2FS_FS_POSIX_ACL
+ bool "F2FS Access Control Lists"
+ depends on F2FS_FS_XATTR
+ select FS_POSIX_ACL
+ default y
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
new file mode 100644
index 000000000000..27a0820340b9
--- /dev/null
+++ b/fs/f2fs/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_F2FS_FS) += f2fs.o
+
+f2fs-y := dir.o file.o inode.o namei.o hash.o super.o
+f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
+f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
+f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
+f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
new file mode 100644
index 000000000000..137af4255da6
--- /dev/null
+++ b/fs/f2fs/acl.c
@@ -0,0 +1,412 @@
+/*
+ * fs/f2fs/acl.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+#define get_inode_mode(i) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
+ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
+
+static inline size_t f2fs_acl_size(int count)
+{
+ if (count <= 4) {
+ return sizeof(struct f2fs_acl_header) +
+ count * sizeof(struct f2fs_acl_entry_short);
+ } else {
+ return sizeof(struct f2fs_acl_header) +
+ 4 * sizeof(struct f2fs_acl_entry_short) +
+ (count - 4) * sizeof(struct f2fs_acl_entry);
+ }
+}
+
+static inline int f2fs_acl_count(size_t size)
+{
+ ssize_t s;
+ size -= sizeof(struct f2fs_acl_header);
+ s = size - 4 * sizeof(struct f2fs_acl_entry_short);
+ if (s < 0) {
+ if (size % sizeof(struct f2fs_acl_entry_short))
+ return -1;
+ return size / sizeof(struct f2fs_acl_entry_short);
+ } else {
+ if (s % sizeof(struct f2fs_acl_entry))
+ return -1;
+ return s / sizeof(struct f2fs_acl_entry) + 4;
+ }
+}
+
+static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
+{
+ int i, count;
+ struct posix_acl *acl;
+ struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value;
+ struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1);
+ const char *end = value + size;
+
+ if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION))
+ return ERR_PTR(-EINVAL);
+
+ count = f2fs_acl_count(size);
+ if (count < 0)
+ return ERR_PTR(-EINVAL);
+ if (count == 0)
+ return NULL;
+
+ acl = posix_acl_alloc(count, GFP_KERNEL);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < count; i++) {
+
+ if ((char *)entry > end)
+ goto fail;
+
+ acl->a_entries[i].e_tag = le16_to_cpu(entry->e_tag);
+ acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm);
+
+ switch (acl->a_entries[i].e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry_short));
+ break;
+
+ case ACL_USER:
+ acl->a_entries[i].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry));
+ break;
+ case ACL_GROUP:
+ acl->a_entries[i].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry));
+ break;
+ default:
+ goto fail;
+ }
+ }
+ if ((char *)entry != end)
+ goto fail;
+ return acl;
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
+}
+
+static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+ struct f2fs_acl_header *f2fs_acl;
+ struct f2fs_acl_entry *entry;
+ int i;
+
+ f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
+ sizeof(struct f2fs_acl_entry), GFP_KERNEL);
+ if (!f2fs_acl)
+ return ERR_PTR(-ENOMEM);
+
+ f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION);
+ entry = (struct f2fs_acl_entry *)(f2fs_acl + 1);
+
+ for (i = 0; i < acl->a_count; i++) {
+
+ entry->e_tag = cpu_to_le16(acl->a_entries[i].e_tag);
+ entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm);
+
+ switch (acl->a_entries[i].e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns,
+ acl->a_entries[i].e_uid));
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry));
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns,
+ acl->a_entries[i].e_gid));
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry));
+ break;
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry_short));
+ break;
+ default:
+ goto fail;
+ }
+ }
+ *size = f2fs_acl_size(acl->a_count);
+ return (void *)f2fs_acl;
+
+fail:
+ kfree(f2fs_acl);
+ return ERR_PTR(-EINVAL);
+}
+
+struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ void *value = NULL;
+ struct posix_acl *acl;
+ int retval;
+
+ if (!test_opt(sbi, POSIX_ACL))
+ return NULL;
+
+ acl = get_cached_acl(inode, type);
+ if (acl != ACL_NOT_CACHED)
+ return acl;
+
+ if (type == ACL_TYPE_ACCESS)
+ name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
+
+ retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
+ if (retval > 0) {
+ value = kmalloc(retval, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ retval = f2fs_getxattr(inode, name_index, "", value, retval);
+ }
+
+ if (retval > 0)
+ acl = f2fs_acl_from_disk(value, retval);
+ else if (retval == -ENODATA)
+ acl = NULL;
+ else
+ acl = ERR_PTR(retval);
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ int name_index;
+ void *value = NULL;
+ size_t size = 0;
+ int error;
+
+ if (!test_opt(sbi, POSIX_ACL))
+ return 0;
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ error = posix_acl_equiv_mode(acl, &inode->i_mode);
+ if (error < 0)
+ return error;
+ set_acl_inode(fi, inode->i_mode);
+ if (error == 0)
+ acl = NULL;
+ }
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ value = f2fs_acl_to_disk(acl, &size);
+ if (IS_ERR(value)) {
+ cond_clear_inode_flag(fi, FI_ACL_MODE);
+ return (int)PTR_ERR(value);
+ }
+ }
+
+ error = f2fs_setxattr(inode, name_index, "", value, size);
+
+ kfree(value);
+ if (!error)
+ set_cached_acl(inode, type, acl);
+
+ cond_clear_inode_flag(fi, FI_ACL_MODE);
+ return error;
+}
+
+int f2fs_init_acl(struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *acl = NULL;
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ int error = 0;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (test_opt(sbi, POSIX_ACL)) {
+ acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ if (!acl)
+ inode->i_mode &= ~current_umask();
+ }
+
+ if (test_opt(sbi, POSIX_ACL) && acl) {
+
+ if (S_ISDIR(inode->i_mode)) {
+ error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+ if (error)
+ goto cleanup;
+ }
+ error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
+ if (error < 0)
+ return error;
+ if (error > 0)
+ error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+ }
+cleanup:
+ posix_acl_release(acl);
+ return error;
+}
+
+int f2fs_acl_chmod(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct posix_acl *acl;
+ int error;
+ mode_t mode = get_inode_mode(inode);
+
+ if (!test_opt(sbi, POSIX_ACL))
+ return 0;
+ if (S_ISLNK(mode))
+ return -EOPNOTSUPP;
+
+ acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+
+ error = posix_acl_chmod(&acl, GFP_KERNEL, mode);
+ if (error)
+ return error;
+ error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+ posix_acl_release(acl);
+ return error;
+}
+
+static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ const char *xname = POSIX_ACL_XATTR_DEFAULT;
+ size_t size;
+
+ if (!test_opt(sbi, POSIX_ACL))
+ return 0;
+
+ if (type == ACL_TYPE_ACCESS)
+ xname = POSIX_ACL_XATTR_ACCESS;
+
+ size = strlen(xname) + 1;
+ if (list && size <= list_size)
+ memcpy(list, xname, size);
+ return size;
+}
+
+static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ struct posix_acl *acl;
+ int error;
+
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ if (!test_opt(sbi, POSIX_ACL))
+ return -EOPNOTSUPP;
+
+ acl = f2fs_get_acl(dentry->d_inode, type);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (!acl)
+ return -ENODATA;
+ error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+ posix_acl_release(acl);
+
+ return error;
+}
+
+static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ struct inode *inode = dentry->d_inode;
+ struct posix_acl *acl = NULL;
+ int error;
+
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ if (!test_opt(sbi, POSIX_ACL))
+ return -EOPNOTSUPP;
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
+ if (value) {
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ error = posix_acl_valid(acl);
+ if (error)
+ goto release_and_out;
+ }
+ } else {
+ acl = NULL;
+ }
+
+ error = f2fs_set_acl(inode, type, acl);
+
+release_and_out:
+ posix_acl_release(acl);
+ return error;
+}
+
+const struct xattr_handler f2fs_xattr_acl_default_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
+ .list = f2fs_xattr_list_acl,
+ .get = f2fs_xattr_get_acl,
+ .set = f2fs_xattr_set_acl,
+};
+
+const struct xattr_handler f2fs_xattr_acl_access_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .list = f2fs_xattr_list_acl,
+ .get = f2fs_xattr_get_acl,
+ .set = f2fs_xattr_set_acl,
+};
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
new file mode 100644
index 000000000000..80f430674417
--- /dev/null
+++ b/fs/f2fs/acl.h
@@ -0,0 +1,57 @@
+/*
+ * fs/f2fs/acl.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/acl.h
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_ACL_H__
+#define __F2FS_ACL_H__
+
+#include <linux/posix_acl_xattr.h>
+
+#define F2FS_ACL_VERSION 0x0001
+
+struct f2fs_acl_entry {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+};
+
+struct f2fs_acl_entry_short {
+ __le16 e_tag;
+ __le16 e_perm;
+};
+
+struct f2fs_acl_header {
+ __le32 a_version;
+};
+
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+
+extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type);
+extern int f2fs_acl_chmod(struct inode *inode);
+extern int f2fs_init_acl(struct inode *inode, struct inode *dir);
+#else
+#define f2fs_check_acl NULL
+#define f2fs_get_acl NULL
+#define f2fs_set_acl NULL
+
+static inline int f2fs_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+
+static inline int f2fs_init_acl(struct inode *inode, struct inode *dir)
+{
+ return 0;
+}
+#endif
+#endif /* __F2FS_ACL_H__ */
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
new file mode 100644
index 000000000000..2b6fc131e2ce
--- /dev/null
+++ b/fs/f2fs/checkpoint.c
@@ -0,0 +1,784 @@
+/*
+ * fs/f2fs/checkpoint.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/f2fs_fs.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+static struct kmem_cache *orphan_entry_slab;
+static struct kmem_cache *inode_entry_slab;
+
+/*
+ * We guarantee no failure on the returned page.
+ */
+struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ struct address_space *mapping = sbi->meta_inode->i_mapping;
+ struct page *page = NULL;
+repeat:
+ page = grab_cache_page(mapping, index);
+ if (!page) {
+ cond_resched();
+ goto repeat;
+ }
+
+ /* We wait writeback only inside grab_meta_page() */
+ wait_on_page_writeback(page);
+ SetPageUptodate(page);
+ return page;
+}
+
+/*
+ * We guarantee no failure on the returned page.
+ */
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ struct address_space *mapping = sbi->meta_inode->i_mapping;
+ struct page *page;
+repeat:
+ page = grab_cache_page(mapping, index);
+ if (!page) {
+ cond_resched();
+ goto repeat;
+ }
+ if (f2fs_readpage(sbi, page, index, READ_SYNC)) {
+ f2fs_put_page(page, 1);
+ goto repeat;
+ }
+ mark_page_accessed(page);
+
+ /* We do not allow returning an erroneous page */
+ return page;
+}
+
+static int f2fs_write_meta_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+
+ /* Should not write any meta pages if any IO error has occurred */
+ if (wbc->for_reclaim ||
+ is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) {
+ dec_page_count(sbi, F2FS_DIRTY_META);
+ wbc->pages_skipped++;
+ set_page_dirty(page);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
+
+ wait_on_page_writeback(page);
+
+ write_meta_page(sbi, page);
+ dec_page_count(sbi, F2FS_DIRTY_META);
+ unlock_page(page);
+ return 0;
+}
+
+static int f2fs_write_meta_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ struct block_device *bdev = sbi->sb->s_bdev;
+ long written;
+
+ if (wbc->for_kupdate)
+ return 0;
+
+ if (get_pages(sbi, F2FS_DIRTY_META) == 0)
+ return 0;
+
+ /* if mounting has failed, skip writing node pages */
+ mutex_lock(&sbi->cp_mutex);
+ written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev));
+ mutex_unlock(&sbi->cp_mutex);
+ wbc->nr_to_write -= written;
+ return 0;
+}
+
+long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
+ long nr_to_write)
+{
+ struct address_space *mapping = sbi->meta_inode->i_mapping;
+ pgoff_t index = 0, end = LONG_MAX;
+ struct pagevec pvec;
+ long nwritten = 0;
+ struct writeback_control wbc = {
+ .for_reclaim = 0,
+ };
+
+ pagevec_init(&pvec, 0);
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ lock_page(page);
+ BUG_ON(page->mapping != mapping);
+ BUG_ON(!PageDirty(page));
+ clear_page_dirty_for_io(page);
+ if (f2fs_write_meta_page(page, &wbc)) {
+ unlock_page(page);
+ break;
+ }
+ if (nwritten++ >= nr_to_write)
+ break;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+
+ if (nwritten)
+ f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX);
+
+ return nwritten;
+}
+
+static int f2fs_set_meta_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+
+ SetPageUptodate(page);
+ if (!PageDirty(page)) {
+ __set_page_dirty_nobuffers(page);
+ inc_page_count(sbi, F2FS_DIRTY_META);
+ return 1;
+ }
+ return 0;
+}
+
+const struct address_space_operations f2fs_meta_aops = {
+ .writepage = f2fs_write_meta_page,
+ .writepages = f2fs_write_meta_pages,
+ .set_page_dirty = f2fs_set_meta_page_dirty,
+};
+
+int check_orphan_space(struct f2fs_sb_info *sbi)
+{
+ unsigned int max_orphans;
+ int err = 0;
+
+ /*
+ * Considering 512 blocks in a segment, 5 blocks are needed for cp
+ * and log segment summaries. The remaining blocks are used to keep
+ * orphan entries. With the limitation of one reserved segment
+ * for the cp pack, we can have at most 1020 * 507 orphan entries.
+ */
+ max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
+ mutex_lock(&sbi->orphan_inode_mutex);
+ if (sbi->n_orphans >= max_orphans)
+ err = -ENOSPC;
+ mutex_unlock(&sbi->orphan_inode_mutex);
+ return err;
+}
+
+void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct list_head *head, *this;
+ struct orphan_inode_entry *new = NULL, *orphan = NULL;
+
+ mutex_lock(&sbi->orphan_inode_mutex);
+ head = &sbi->orphan_inode_list;
+ list_for_each(this, head) {
+ orphan = list_entry(this, struct orphan_inode_entry, list);
+ if (orphan->ino == ino)
+ goto out;
+ if (orphan->ino > ino)
+ break;
+ orphan = NULL;
+ }
+retry:
+ new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
+ if (!new) {
+ cond_resched();
+ goto retry;
+ }
+ new->ino = ino;
+
+ /* add the new entry into the list, which is sorted by inode number */
+ if (orphan)
+ list_add(&new->list, this->prev);
+ else
+ list_add_tail(&new->list, head);
+
+ sbi->n_orphans++;
+out:
+ mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct list_head *this, *next, *head;
+ struct orphan_inode_entry *orphan;
+
+ mutex_lock(&sbi->orphan_inode_mutex);
+ head = &sbi->orphan_inode_list;
+ list_for_each_safe(this, next, head) {
+ orphan = list_entry(this, struct orphan_inode_entry, list);
+ if (orphan->ino == ino) {
+ list_del(&orphan->list);
+ kmem_cache_free(orphan_entry_slab, orphan);
+ sbi->n_orphans--;
+ break;
+ }
+ }
+ mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct inode *inode = f2fs_iget(sbi->sb, ino);
+ BUG_ON(IS_ERR(inode));
+ clear_nlink(inode);
+
+ /* truncate all the data during iput */
+ iput(inode);
+}
+
+int recover_orphan_inodes(struct f2fs_sb_info *sbi)
+{
+ block_t start_blk, orphan_blkaddr, i, j;
+
+ if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
+ return 0;
+
+ sbi->por_doing = 1;
+ start_blk = __start_cp_addr(sbi) + 1;
+ orphan_blkaddr = __start_sum_addr(sbi) - 1;
+
+ for (i = 0; i < orphan_blkaddr; i++) {
+ struct page *page = get_meta_page(sbi, start_blk + i);
+ struct f2fs_orphan_block *orphan_blk;
+
+ orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+ for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
+ nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
+ recover_orphan_inode(sbi, ino);
+ }
+ f2fs_put_page(page, 1);
+ }
+ /* clear Orphan Flag */
+ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
+ sbi->por_doing = 0;
+ return 0;
+}
+
+static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+ struct list_head *head, *this, *next;
+ struct f2fs_orphan_block *orphan_blk = NULL;
+ struct page *page = NULL;
+ unsigned int nentries = 0;
+ unsigned short index = 1;
+ unsigned short orphan_blocks;
+
+ orphan_blocks = (unsigned short)((sbi->n_orphans +
+ (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+
+ mutex_lock(&sbi->orphan_inode_mutex);
+ head = &sbi->orphan_inode_list;
+
+ /* loop over each orphan inode entry and write it into the Journal block */
+ list_for_each_safe(this, next, head) {
+ struct orphan_inode_entry *orphan;
+
+ orphan = list_entry(this, struct orphan_inode_entry, list);
+
+ if (nentries == F2FS_ORPHANS_PER_BLOCK) {
+ /*
+ * When an orphan block is full (1020 entries),
+ * we need to flush the current orphan block
+ * and bring another one into memory.
+ */
+ orphan_blk->blk_addr = cpu_to_le16(index);
+ orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+ orphan_blk->entry_count = cpu_to_le32(nentries);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+ index++;
+ start_blk++;
+ nentries = 0;
+ page = NULL;
+ }
+ if (page)
+ goto page_exist;
+
+ page = grab_meta_page(sbi, start_blk);
+ orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+ memset(orphan_blk, 0, sizeof(*orphan_blk));
+page_exist:
+ orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
+ }
+ if (!page)
+ goto end;
+
+ orphan_blk->blk_addr = cpu_to_le16(index);
+ orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+ orphan_blk->entry_count = cpu_to_le32(nentries);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+end:
+ mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+/*
+ * Validate the checkpoint pack that starts at @cp_addr.
+ *
+ * The first and the last block of the pack are read. Each must carry a
+ * checksum offset whose 32-bit CRC fits inside the block, the CRC must
+ * match, and both blocks must record the same checkpoint version.
+ *
+ * Returns the page of the first block on success and stores the pack's
+ * version in *@version; the caller releases that page with
+ * f2fs_put_page(page, 1). Returns NULL if the pack is not valid, in which
+ * case *@version is left untouched.
+ */
+static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
+ block_t cp_addr, unsigned long long *version)
+{
+ struct page *cp_page_1, *cp_page_2 = NULL;
+ unsigned long blk_size = sbi->blocksize;
+ struct f2fs_checkpoint *cp_block;
+ unsigned long long cur_version = 0, pre_version = 0;
+ unsigned int crc = 0;
+ unsigned int cp_blocks;
+ size_t crc_offset;
+
+ /* Read the 1st cp block in this CP pack */
+ cp_page_1 = get_meta_page(sbi, cp_addr);
+
+ /* get the version number */
+ cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
+ crc_offset = le32_to_cpu(cp_block->checksum_offset);
+ /* the whole 32-bit CRC has to lie inside the block */
+ if (crc_offset > blk_size - sizeof(__le32))
+ goto invalid_cp1;
+
+ /* the CRC is stored little-endian, see do_checkpoint() */
+ crc = le32_to_cpu(*(__le32 *)((unsigned char *)cp_block + crc_offset));
+ if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+ goto invalid_cp1;
+
+ pre_version = le64_to_cpu(cp_block->checkpoint_ver);
+
+ /*
+ * do_checkpoint() never writes a pack shorter than two blocks.
+ * A smaller on-disk count is corrupt and would aim the second read
+ * at the first block again or at a block before the pack.
+ */
+ cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
+ if (cp_blocks < 2)
+ goto invalid_cp1;
+
+ /* Read the 2nd cp block in this CP pack */
+ cp_addr += cp_blocks - 1;
+ cp_page_2 = get_meta_page(sbi, cp_addr);
+
+ cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
+ crc_offset = le32_to_cpu(cp_block->checksum_offset);
+ if (crc_offset > blk_size - sizeof(__le32))
+ goto invalid_cp2;
+
+ crc = le32_to_cpu(*(__le32 *)((unsigned char *)cp_block + crc_offset));
+ if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+ goto invalid_cp2;
+
+ cur_version = le64_to_cpu(cp_block->checkpoint_ver);
+
+ /* both ends agree: the pack was written completely */
+ if (cur_version == pre_version) {
+ *version = cur_version;
+ f2fs_put_page(cp_page_2, 1);
+ return cp_page_1;
+ }
+invalid_cp2:
+ f2fs_put_page(cp_page_2, 1);
+invalid_cp1:
+ f2fs_put_page(cp_page_1, 1);
+ return NULL;
+}
+
+/*
+ * Locate a valid checkpoint and copy it into sbi->ckpt.
+ *
+ * Both checkpoint packs are validated. The first pack starts at the
+ * superblock's cp_blkaddr and the second one a segment later. When both
+ * are valid, pack 2 is used only if ver_after() reports its version as
+ * later than pack 1's; otherwise pack 1 is used.
+ *
+ * Returns 0 on success, -ENOMEM if the in-memory copy cannot be allocated,
+ * or -EINVAL if neither pack is valid. On failure sbi->ckpt is NULL.
+ */
+int get_valid_checkpoint(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_checkpoint *cp_block;
+ struct f2fs_super_block *fsb = sbi->raw_super;
+ struct page *cp1, *cp2, *cur_page;
+ unsigned long blk_size = sbi->blocksize;
+ unsigned long long cp1_version = 0, cp2_version = 0;
+ unsigned long long cp_start_blk_no;
+
+ sbi->ckpt = kzalloc(blk_size, GFP_KERNEL);
+ if (!sbi->ckpt)
+ return -ENOMEM;
+ /*
+ * Finding out valid cp block involves read both
+ * sets( cp pack1 and cp pack 2)
+ */
+ cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
+ cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
+
+ /* The second checkpoint pack should start at the next segment */
+ cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+ cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
+
+ if (cp1 && cp2) {
+ if (ver_after(cp2_version, cp1_version))
+ cur_page = cp2;
+ else
+ cur_page = cp1;
+ } else if (cp1) {
+ cur_page = cp1;
+ } else if (cp2) {
+ cur_page = cp2;
+ } else {
+ goto fail_no_cp;
+ }
+
+ cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
+ memcpy(sbi->ckpt, cp_block, blk_size);
+
+ /* one of these may be NULL when only a single pack was valid */
+ f2fs_put_page(cp1, 1);
+ f2fs_put_page(cp2, 1);
+ return 0;
+
+fail_no_cp:
+ kfree(sbi->ckpt);
+ /* do not leave a dangling pointer behind for later teardown code */
+ sbi->ckpt = NULL;
+ return -EINVAL;
+}
+
+/*
+ * Account a newly dirtied dentry page of a directory.
+ *
+ * Does nothing for non-directory inodes. Otherwise the inode is put on
+ * sbi->dir_inode_list unless it is already there, the global
+ * F2FS_DIRTY_DENTS counter and the inode's dirty-dentry counter are
+ * incremented, and @page is marked PagePrivate.
+ */
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct list_head *head = &sbi->dir_inode_list;
+ struct dir_inode_entry *new;
+ struct list_head *this;
+
+ if (!S_ISDIR(inode->i_mode))
+ return;
+ /*
+ * The list entry is allocated before the spinlock is taken, and the
+ * allocation is retried until it succeeds.
+ */
+retry:
+ new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+ if (!new) {
+ cond_resched();
+ goto retry;
+ }
+ new->inode = inode;
+ INIT_LIST_HEAD(&new->list);
+
+ spin_lock(&sbi->dir_inode_lock);
+ /* already listed: drop the spare entry and only bump the counters */
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode == inode) {
+ kmem_cache_free(inode_entry_slab, new);
+ goto out;
+ }
+ }
+ list_add_tail(&new->list, head);
+ sbi->n_dirty_dirs++;
+
+ BUG_ON(!S_ISDIR(inode->i_mode));
+out:
+ /* the per-page accounting happens on both paths, under the lock */
+ inc_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_inc_dirty_dents(inode);
+ SetPagePrivate(page);
+
+ spin_unlock(&sbi->dir_inode_lock);
+}
+
+/*
+ * Take a directory inode off sbi->dir_inode_list once it has no dirty
+ * dentry pages left. Non-directories and directories that still have
+ * dirty dentries are left alone.
+ */
+void remove_dirty_dir_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct list_head *head = &sbi->dir_inode_list;
+ struct list_head *pos;
+
+ if (!S_ISDIR(inode->i_mode))
+ return;
+
+ spin_lock(&sbi->dir_inode_lock);
+ if (!atomic_read(&F2FS_I(inode)->dirty_dents)) {
+ list_for_each(pos, head) {
+ struct dir_inode_entry *cur;
+
+ cur = list_entry(pos, struct dir_inode_entry, list);
+ if (cur->inode != inode)
+ continue;
+
+ /* found it: unlink, free and stop walking */
+ list_del(&cur->list);
+ kmem_cache_free(inode_entry_slab, cur);
+ sbi->n_dirty_dirs--;
+ break;
+ }
+ }
+ spin_unlock(&sbi->dir_inode_lock);
+}
+
+/*
+ * Flush the dirty directory inodes until sbi->dir_inode_list is empty.
+ * Each round looks at the first list entry only; entries leave the list
+ * elsewhere (see remove_dirty_dir_inode()).
+ */
+void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+{
+ struct list_head *head = &sbi->dir_inode_list;
+
+ for (;;) {
+ struct dir_inode_entry *first;
+ struct inode *dir;
+
+ spin_lock(&sbi->dir_inode_lock);
+ if (list_empty(head)) {
+ spin_unlock(&sbi->dir_inode_lock);
+ return;
+ }
+ first = list_entry(head->next, struct dir_inode_entry, list);
+ dir = igrab(first->inode);
+ spin_unlock(&sbi->dir_inode_lock);
+
+ if (!dir) {
+ /*
+ * The inode is being freed and may still have dentry
+ * pages under writeback, so submit the pending bio.
+ */
+ f2fs_submit_bio(sbi, DATA, true);
+ continue;
+ }
+ filemap_flush(dir->i_mapping);
+ iput(dir);
+ }
+}
+
+/*
+ * Freeze all the FS-operations for checkpoint.
+ *
+ * Takes the operation locks in this order: RENAME, DENTRY_OPS, DATA_WRITE,
+ * DATA_NEW..NODE_TRUNC, NODE_WRITE. On return all of them are held and no
+ * dirty dentry or node pages were counted at the time DATA_WRITE and
+ * NODE_WRITE were taken. unblock_operations() releases them.
+ */
+static void block_operations(struct f2fs_sb_info *sbi)
+{
+ int t;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .for_reclaim = 0,
+ };
+
+ /* Stop renaming operation */
+ mutex_lock_op(sbi, RENAME);
+ mutex_lock_op(sbi, DENTRY_OPS);
+
+retry_dents:
+ /* write all the dirty dentry pages */
+ sync_dirty_dir_inodes(sbi);
+
+ /*
+ * Dentry pages may have been dirtied again before DATA_WRITE was
+ * taken; if so, drop the lock and flush once more.
+ */
+ mutex_lock_op(sbi, DATA_WRITE);
+ if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
+ mutex_unlock_op(sbi, DATA_WRITE);
+ goto retry_dents;
+ }
+
+ /* block all the operations */
+ for (t = DATA_NEW; t <= NODE_TRUNC; t++)
+ mutex_lock_op(sbi, t);
+
+ /* write_inode is held only while the node pages are drained below */
+ mutex_lock(&sbi->write_inode);
+
+ /*
+ * POR: we should ensure that there is no dirty node pages
+ * until finishing nat/sit flush.
+ */
+retry:
+ sync_node_pages(sbi, 0, &wbc);
+
+ mutex_lock_op(sbi, NODE_WRITE);
+
+ /* same re-check pattern as for the dentry pages above */
+ if (get_pages(sbi, F2FS_DIRTY_NODES)) {
+ mutex_unlock_op(sbi, NODE_WRITE);
+ goto retry;
+ }
+ mutex_unlock(&sbi->write_inode);
+}
+
+/*
+ * Release every operation lock, walking down from NODE_WRITE to RENAME.
+ */
+static void unblock_operations(struct f2fs_sb_info *sbi)
+{
+ int type = NODE_WRITE;
+
+ while (type >= RENAME) {
+ mutex_unlock_op(sbi, type);
+ type--;
+ }
+}
+
+/*
+ * Fill in the in-memory checkpoint block and write one checkpoint pack.
+ *
+ * The pack is written starting at __start_cp_addr(sbi) in this order:
+ * checkpoint block, orphan inode blocks (if any), data summaries, node
+ * summaries (only when @is_umount), and a second copy of the checkpoint
+ * block.
+ *
+ * @is_umount: set CP_UMOUNT_FLAG and include the node summaries.
+ */
+static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ nid_t last_nid = 0;
+ block_t start_blk;
+ struct page *cp_page;
+ unsigned int data_sum_blocks, orphan_blocks;
+ unsigned int crc32 = 0;
+ void *kaddr;
+ int i;
+
+ /* Flush all the NAT/SIT pages */
+ while (get_pages(sbi, F2FS_DIRTY_META))
+ sync_meta_pages(sbi, META, LONG_MAX);
+
+ next_free_nid(sbi, &last_nid);
+
+ /*
+ * modify checkpoint
+ * version number is already updated
+ */
+ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
+ ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+ ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
+ /* record the three current node segments, starting at CURSEG_HOT_NODE */
+ for (i = 0; i < 3; i++) {
+ ckpt->cur_node_segno[i] =
+ cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
+ ckpt->cur_node_blkoff[i] =
+ cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
+ ckpt->alloc_type[i + CURSEG_HOT_NODE] =
+ curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
+ }
+ /* record the three current data segments, starting at CURSEG_HOT_DATA */
+ for (i = 0; i < 3; i++) {
+ ckpt->cur_data_segno[i] =
+ cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
+ ckpt->cur_data_blkoff[i] =
+ cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
+ ckpt->alloc_type[i + CURSEG_HOT_DATA] =
+ curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
+ }
+
+ ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
+ ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
+ ckpt->next_free_nid = cpu_to_le32(last_nid);
+
+ /* 2 cp + n data seg summary + orphan inode blocks */
+ data_sum_blocks = npages_for_summary_flush(sbi);
+ /* CP_COMPACT_SUM_FLAG is set when the summaries need fewer than 3 blocks */
+ if (data_sum_blocks < 3)
+ set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+ else
+ clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+
+ /* number of orphan blocks, rounded up */
+ orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
+ / F2FS_ORPHANS_PER_BLOCK;
+ /* the summaries follow the first cp block and the orphan blocks */
+ ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks);
+
+ if (is_umount) {
+ set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+ data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE);
+ } else {
+ clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+ data_sum_blocks + orphan_blocks);
+ }
+
+ if (sbi->n_orphans)
+ set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+ else
+ clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+
+ /* update SIT/NAT bitmap */
+ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
+ get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
+
+ /*
+ * The CRC covers the bytes before checksum_offset and is stored
+ * little-endian at that offset.
+ */
+ crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
+ *(__le32 *)((unsigned char *)ckpt +
+ le32_to_cpu(ckpt->checksum_offset))
+ = cpu_to_le32(crc32);
+
+ start_blk = __start_cp_addr(sbi);
+
+ /* write out checkpoint buffer at block 0 */
+ cp_page = grab_meta_page(sbi, start_blk++);
+ kaddr = page_address(cp_page);
+ memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+ set_page_dirty(cp_page);
+ f2fs_put_page(cp_page, 1);
+
+ if (sbi->n_orphans) {
+ write_orphan_inodes(sbi, start_blk);
+ start_blk += orphan_blocks;
+ }
+
+ write_data_summaries(sbi, start_blk);
+ start_blk += data_sum_blocks;
+ if (is_umount) {
+ write_node_summaries(sbi, start_blk);
+ start_blk += NR_CURSEG_NODE_TYPE;
+ }
+
+ /* writeout checkpoint block */
+ cp_page = grab_meta_page(sbi, start_blk);
+ kaddr = page_address(cp_page);
+ memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+ set_page_dirty(cp_page);
+ f2fs_put_page(cp_page, 1);
+
+ /* wait for previous submitted node/meta pages writeback */
+ while (get_pages(sbi, F2FS_WRITEBACK))
+ congestion_wait(BLK_RW_ASYNC, HZ / 50);
+
+ filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
+ filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
+
+ /* update user_block_counts */
+ sbi->last_valid_block_count = sbi->total_valid_block_count;
+ sbi->alloc_valid_block_count = 0;
+
+ /* Here, we only have one bio having CP pack */
+ sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+
+ /* prefree segments and the dirty-sb state are kept if CP_ERROR_FLAG is set */
+ if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+ clear_prefree_segments(sbi);
+ F2FS_RESET_SB_DIRT(sbi);
+ }
+}
+
+/*
+ * We guarantee that this checkpoint procedure should not fail.
+ *
+ * Runs under sbi->cp_mutex: blocks all operations, submits the pending
+ * DATA/NODE/META bios, bumps the checkpoint version, flushes the cached
+ * NAT/SIT entries, writes the checkpoint pack and unblocks the operations.
+ *
+ * @is_umount: passed through to do_checkpoint().
+ */
+void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ unsigned long long ckpt_ver;
+
+ mutex_lock(&sbi->cp_mutex);
+ block_operations(sbi);
+
+ f2fs_submit_bio(sbi, DATA, true);
+ f2fs_submit_bio(sbi, NODE, true);
+ f2fs_submit_bio(sbi, META, true);
+
+ /*
+ * update checkpoint pack index
+ * Increase the version number so that
+ * SIT entries and seg summaries are written at correct place
+ */
+ ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver);
+ ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
+
+ /* write cached NAT/SIT entries to NAT/SIT area */
+ flush_nat_entries(sbi);
+ flush_sit_entries(sbi);
+
+ reset_victim_segmap(sbi);
+
+ /*
+ * write the checkpoint pack; the operation locks taken by
+ * block_operations() are released by unblock_operations() below
+ */
+ do_checkpoint(sbi, is_umount);
+
+ unblock_operations(sbi);
+ mutex_unlock(&sbi->cp_mutex);
+}
+
+/*
+ * Set up the orphan inode bookkeeping in @sbi: zero count, empty list,
+ * initialized mutex.
+ */
+void init_orphan_info(struct f2fs_sb_info *sbi)
+{
+ sbi->n_orphans = 0;
+ INIT_LIST_HEAD(&sbi->orphan_inode_list);
+ mutex_init(&sbi->orphan_inode_mutex);
+}
+
+/*
+ * Create the slab caches for orphan inode entries and dirty directory
+ * entries. Returns 0 on success or -ENOMEM; on failure no cache is left
+ * allocated.
+ */
+int __init create_checkpoint_caches(void)
+{
+ orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
+ sizeof(struct orphan_inode_entry), NULL);
+ if (unlikely(!orphan_entry_slab))
+ goto fail;
+
+ inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
+ sizeof(struct dir_inode_entry), NULL);
+ if (unlikely(!inode_entry_slab))
+ goto destroy_orphan_cache;
+
+ return 0;
+
+destroy_orphan_cache:
+ kmem_cache_destroy(orphan_entry_slab);
+fail:
+ return -ENOMEM;
+}
+
+/*
+ * Destroy both slab caches set up by create_checkpoint_caches().
+ */
+void destroy_checkpoint_caches(void)
+{
+ kmem_cache_destroy(orphan_entry_slab);
+ kmem_cache_destroy(inode_entry_slab);
+}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
new file mode 100644
index 000000000000..7bd22a201125
--- /dev/null
+++ b/fs/f2fs/data.c
@@ -0,0 +1,718 @@
+/*
+ * fs/f2fs/data.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/prefetch.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+/*
+ * Store @new_addr in the node page slot described by @dn.
+ *
+ * Lock ordering for the change of data block address:
+ * ->data_page
+ * ->node_page
+ * update block addresses in the node page
+ */
+static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
+{
+ struct page *npage = dn->node_page;
+ unsigned int slot = dn->ofs_in_node;
+ struct f2fs_node *raw_node;
+ __le32 *blkaddrs;
+
+ /* do not modify the node page while it is under writeback */
+ wait_on_page_writeback(npage);
+
+ raw_node = (struct f2fs_node *)page_address(npage);
+
+ /* the address array of this node, stored little-endian */
+ blkaddrs = blkaddr_in_node(raw_node);
+ blkaddrs[slot] = cpu_to_le32(new_addr);
+ set_page_dirty(npage);
+}
+
+/*
+ * Reserve a block for the slot described by @dn by marking it NEW_ADDR.
+ *
+ * Returns 0 on success, -EPERM if the inode has FI_NO_ALLOC set, or
+ * -ENOSPC if the valid block count cannot be increased.
+ */
+int reserve_new_block(struct dnode_of_data *dn)
+{
+ struct inode *inode = dn->inode;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+
+ if (is_inode_flag_set(F2FS_I(inode), FI_NO_ALLOC))
+ return -EPERM;
+
+ if (!inc_valid_block_count(sbi, inode, 1))
+ return -ENOSPC;
+
+ /* record the reservation in the node page, then in @dn itself */
+ __set_data_blkaddr(dn, NEW_ADDR);
+ dn->data_blkaddr = NEW_ADDR;
+ sync_inode_page(dn);
+
+ return 0;
+}
+
+/*
+ * Try to map page offset @pgofs from the inode's cached extent.
+ *
+ * Returns 1 and maps @bh_result when @pgofs lies inside the cached extent;
+ * b_size then covers the rest of the extent, capped at UINT_MAX. Returns 0
+ * when there is no cached extent or @pgofs is outside of it.
+ */
+static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct buffer_head *bh_result)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ pgoff_t first_fofs, last_fofs;
+ block_t first_blkaddr;
+ unsigned int blkbits;
+ size_t nblocks;
+ int hit = 0;
+
+ read_lock(&fi->ext.ext_lock);
+ if (fi->ext.len == 0)
+ goto unlock;
+
+ /* counts every lookup that found a non-empty extent */
+ sbi->total_hit_ext++;
+ first_fofs = fi->ext.fofs;
+ last_fofs = fi->ext.fofs + fi->ext.len - 1;
+ first_blkaddr = fi->ext.blk_addr;
+
+ if (pgofs < first_fofs || pgofs > last_fofs)
+ goto unlock;
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+
+ clear_buffer_new(bh_result);
+ map_bh(bh_result, inode->i_sb,
+ first_blkaddr + pgofs - first_fofs);
+
+ /* blocks from @pgofs to the end of the extent */
+ nblocks = last_fofs - pgofs + 1;
+ if (nblocks < (UINT_MAX >> blkbits))
+ bh_result->b_size = (nblocks << blkbits);
+ else
+ bh_result->b_size = UINT_MAX;
+
+ sbi->read_hit_ext++;
+ hit = 1;
+unlock:
+ read_unlock(&fi->ext.ext_lock);
+ return hit;
+}
+
+/*
+ * Write @blk_addr into the node page slot of @dn and update the inode's
+ * single cached extent accordingly.
+ *
+ * The extent is created, grown at the front or back, or shrunk to one
+ * side of the updated offset. sync_inode_page() is called only when the
+ * extent was changed or (re)initialized. @blk_addr must not be NEW_ADDR.
+ */
+void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
+{
+ struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+ pgoff_t fofs, start_fofs, end_fofs;
+ block_t start_blkaddr, end_blkaddr;
+
+ BUG_ON(blk_addr == NEW_ADDR);
+ /* file offset of the block this node slot describes */
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node;
+
+ /* Update the page address in the parent node */
+ __set_data_blkaddr(dn, blk_addr);
+
+ write_lock(&fi->ext.ext_lock);
+
+ start_fofs = fi->ext.fofs;
+ end_fofs = fi->ext.fofs + fi->ext.len - 1;
+ start_blkaddr = fi->ext.blk_addr;
+ end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
+
+ /* Drop and initialize the matched extent */
+ if (fi->ext.len == 1 && fofs == start_fofs)
+ fi->ext.len = 0;
+
+ /* Initial extent */
+ if (fi->ext.len == 0) {
+ if (blk_addr != NULL_ADDR) {
+ fi->ext.fofs = fofs;
+ fi->ext.blk_addr = blk_addr;
+ fi->ext.len = 1;
+ }
+ goto end_update;
+ }
+
+ /* Front merge */
+ if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
+ fi->ext.fofs--;
+ fi->ext.blk_addr--;
+ fi->ext.len++;
+ goto end_update;
+ }
+
+ /* Back merge */
+ if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
+ fi->ext.len++;
+ goto end_update;
+ }
+
+ /* Split the existing extent */
+ if (fi->ext.len > 1 &&
+ fofs >= start_fofs && fofs <= end_fofs) {
+ /*
+ * Keep the part before fofs when the tail is shorter than
+ * half the extent, otherwise keep the part after fofs.
+ */
+ if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
+ fi->ext.len = fofs - start_fofs;
+ } else {
+ fi->ext.fofs = fofs + 1;
+ fi->ext.blk_addr = start_blkaddr +
+ fofs - start_fofs + 1;
+ fi->ext.len -= fofs - start_fofs + 1;
+ }
+ goto end_update;
+ }
+ /* the update does not touch the cached extent: nothing to sync */
+ write_unlock(&fi->ext.ext_lock);
+ return;
+
+end_update:
+ write_unlock(&fi->ext.ext_lock);
+ sync_inode_page(dn);
+ return;
+}
+
+/*
+ * Return the data page at @index of @inode, reading it from disk if it is
+ * not cached up to date. The page is unlocked here before it is returned
+ * on the read path.
+ *
+ * Returns the page, or an ERR_PTR: the error from the node lookup or the
+ * read, -ENOENT for a hole (NULL_ADDR), -EINVAL for a block that is only
+ * reserved (NEW_ADDR), or -ENOMEM if no page could be grabbed.
+ */
+struct page *find_data_page(struct inode *inode, pgoff_t index)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ struct dnode_of_data dn;
+ struct page *page;
+ int err;
+
+ /* fast path: an up-to-date page is already in the page cache */
+ page = find_get_page(mapping, index);
+ if (page && PageUptodate(page))
+ return page;
+ f2fs_put_page(page, 0);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+ if (err)
+ return ERR_PTR(err);
+ /* only dn.data_blkaddr is needed from here on */
+ f2fs_put_dnode(&dn);
+
+ if (dn.data_blkaddr == NULL_ADDR)
+ return ERR_PTR(-ENOENT);
+
+ /* By fallocate(), there is no cached page, but with NEW_ADDR */
+ if (dn.data_blkaddr == NEW_ADDR)
+ return ERR_PTR(-EINVAL);
+
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+ }
+ unlock_page(page);
+ return page;
+}
+
+/*
+ * If it tries to access a hole, return an error.
+ * Because, the callers, functions in dir.c and GC, should be able to know
+ * whether this page exists or not.
+ *
+ * Unlike find_data_page(), the page is not unlocked here before it is
+ * returned.
+ *
+ * Returns the page, or an ERR_PTR: the error from the node lookup or the
+ * read, -ENOENT for a hole, or -ENOMEM if no page could be grabbed.
+ */
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ struct dnode_of_data dn;
+ struct page *page;
+ int err;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+ if (err)
+ return ERR_PTR(err);
+ f2fs_put_dnode(&dn);
+
+ if (dn.data_blkaddr == NULL_ADDR)
+ return ERR_PTR(-ENOENT);
+
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ if (PageUptodate(page))
+ return page;
+
+ /*
+ * A page that is not up to date must have a real block behind it.
+ * The NULL_ADDR case was already returned above, so the second
+ * check cannot trigger.
+ */
+ BUG_ON(dn.data_blkaddr == NEW_ADDR);
+ BUG_ON(dn.data_blkaddr == NULL_ADDR);
+
+ err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+ }
+ return page;
+}
+
+/*
+ * Caller ensures that this data page is never allocated.
+ * A new zero-filled data page is allocated in the page cache.
+ *
+ * A block is reserved when the slot is still NULL_ADDR. The page is
+ * zero-filled when its block is only reserved (NEW_ADDR) and read from
+ * disk otherwise. The page is not unlocked here before it is returned.
+ *
+ * @new_i_size: when true, grow i_size so that it covers the whole page.
+ *
+ * Returns the page, or an ERR_PTR: the error from the node lookup or the
+ * read, -ENOSPC if no block could be reserved, or -ENOMEM if no page
+ * could be grabbed.
+ */
+struct page *get_new_data_page(struct inode *inode, pgoff_t index,
+ bool new_i_size)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ struct dnode_of_data dn;
+ loff_t end_of_page;
+ int err;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, index, 0);
+ if (err)
+ return ERR_PTR(err);
+
+ if (dn.data_blkaddr == NULL_ADDR) {
+ if (reserve_new_block(&dn)) {
+ f2fs_put_dnode(&dn);
+ return ERR_PTR(-ENOSPC);
+ }
+ }
+ /* only dn.data_blkaddr is needed from here on */
+ f2fs_put_dnode(&dn);
+
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ if (PageUptodate(page))
+ return page;
+
+ if (dn.data_blkaddr == NEW_ADDR) {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ } else {
+ err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+ }
+ }
+ SetPageUptodate(page);
+
+ /*
+ * Widen to loff_t before shifting: pgoff_t is an unsigned long, so
+ * with a 32-bit long the byte offset would wrap at 4GB.
+ */
+ end_of_page = ((loff_t)index + 1) << PAGE_CACHE_SHIFT;
+ if (new_i_size && i_size_read(inode) < end_of_page) {
+ i_size_write(inode, end_of_page);
+ mark_inode_dirty_sync(inode);
+ }
+ return page;
+}
+
+/*
+ * Completion handler for read bios set up by f2fs_readpage().
+ *
+ * Walks the bio's pages from last to first, marks each one up to date or
+ * in error according to BIO_UPTODATE, and unlocks it. Then frees
+ * bio->bi_private and drops the bio.
+ */
+static void read_end_io(struct bio *bio, int err)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ /* step to the previous vector and prefetch its page flags */
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+
+ if (uptodate) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ } while (bvec >= bio->bi_io_vec);
+ kfree(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * Fill the locked page with data located in the block address.
+ *
+ * With @type == READ_SYNC the read is synchronous: the function waits for
+ * completion by re-locking the page, and the caller must unlock the page.
+ * With any other @type the function returns after submitting the bio and
+ * the page is unlocked by read_end_io() on completion.
+ *
+ * Returns 0 on success, -EFAULT if the page could not be added to the
+ * bio, or -EIO if a synchronous read finished with PageError set.
+ */
+int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
+ block_t blk_addr, int type)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ bool sync = (type == READ_SYNC);
+ struct bio *bio;
+
+ /* This page can be already read by other threads */
+ if (PageUptodate(page)) {
+ /* keep the locking contract: only sync callers get the page locked */
+ if (!sync)
+ unlock_page(page);
+ return 0;
+ }
+
+ down_read(&sbi->bio_sem);
+
+ /* Allocate a new bio */
+ bio = f2fs_bio_alloc(bdev, 1);
+
+ /* Initialize the bio */
+ bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ bio->bi_end_io = read_end_io;
+
+ if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+ kfree(bio->bi_private);
+ bio_put(bio);
+ up_read(&sbi->bio_sem);
+ return -EFAULT;
+ }
+
+ submit_bio(type, bio);
+ up_read(&sbi->bio_sem);
+
+ /* wait for read completion if sync */
+ if (sync) {
+ /* read_end_io() unlocks the page, so this blocks until then */
+ lock_page(page);
+ if (PageError(page))
+ return -EIO;
+ }
+ return 0;
+}
+
+/*
+ * This function should be used by the data read flow only where it
+ * does not check the "create" flag that indicates block allocation.
+ * The reason for this special functionality is to exploit VFS readahead
+ * mechanism.
+ *
+ * Maps @bh_result to the on-disk block of @iblock when one exists and
+ * sets b_size to the length of the run of consecutive block addresses
+ * found in the same node page, limited by the incoming b_size.
+ *
+ * Returns 0 on success, including holes and a missing node (-ENOENT from
+ * the node lookup), or another error from the node lookup.
+ */
+static int get_data_block_ro(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ unsigned int blkbits = inode->i_sb->s_blocksize_bits;
+ unsigned maxblocks = bh_result->b_size >> blkbits;
+ struct dnode_of_data dn;
+ pgoff_t pgofs;
+ int err;
+
+ /* Get the page offset from the block offset(iblock) */
+ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
+
+ /* an extent cache hit has already mapped bh_result */
+ if (check_extent_cache(inode, pgofs, bh_result))
+ return 0;
+
+ /* When reading holes, we need its node page */
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE);
+ if (err)
+ return (err == -ENOENT) ? 0 : err;
+
+ /* It does not support data allocation */
+ BUG_ON(create);
+
+ /* NEW_ADDR and NULL_ADDR have no on-disk block: bh_result is not mapped */
+ if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
+ int i;
+ unsigned int end_offset;
+
+ /* number of address slots in this kind of node page */
+ end_offset = IS_INODE(dn.node_page) ?
+ ADDRS_PER_INODE :
+ ADDRS_PER_BLOCK;
+
+ clear_buffer_new(bh_result);
+
+ /* Give more consecutive addresses for the read ahead */
+ for (i = 0; i < end_offset - dn.ofs_in_node; i++)
+ if (((datablock_addr(dn.node_page,
+ dn.ofs_in_node + i))
+ != (dn.data_blkaddr + i)) || maxblocks == i)
+ break;
+ map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+ bh_result->b_size = (i << blkbits);
+ }
+ f2fs_put_dnode(&dn);
+ return 0;
+}
+
+/*
+ * ->readpage: read one data page via mpage_readpage(), using
+ * get_data_block_ro() for the block mapping. @file is unused.
+ */
+static int f2fs_read_data_page(struct file *file, struct page *page)
+{
+ return mpage_readpage(page, get_data_block_ro);
+}
+
+/*
+ * ->readpages: read a list of data pages via mpage_readpages(), using
+ * get_data_block_ro() for the block mapping. @file is unused.
+ */
+static int f2fs_read_data_pages(struct file *file,
+ struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro);
+}
+
+/*
+ * Write one data page, either in place or to a newly assigned block.
+ *
+ * Nothing is written when the page's block address is NULL_ADDR. The page
+ * is rewritten in place when it already has a real block, is not cold
+ * data and need_inplace_update() says so. Otherwise it goes through
+ * write_data_page() and the extent cache and the inode's data_version are
+ * updated.
+ *
+ * Returns 0, or the error from the node lookup.
+ */
+int do_write_data_page(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ block_t old_blk_addr, new_blk_addr;
+ struct dnode_of_data dn;
+ int err = 0;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, page->index, RDONLY_NODE);
+ if (err)
+ return err;
+
+ old_blk_addr = dn.data_blkaddr;
+
+ /* This page is already truncated */
+ if (old_blk_addr == NULL_ADDR)
+ goto out_writepage;
+
+ set_page_writeback(page);
+
+ /*
+ * If current allocation needs SSR,
+ * it had better in-place writes for updated data.
+ */
+ if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
+ need_inplace_update(inode)) {
+ rewrite_data_page(F2FS_SB(inode->i_sb), page,
+ old_blk_addr);
+ } else {
+ write_data_page(inode, page, &dn,
+ old_blk_addr, &new_blk_addr);
+ update_extent_cache(new_blk_addr, &dn);
+ /* remember the checkpoint version this data was written under */
+ F2FS_I(inode)->data_version =
+ le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
+ }
+out_writepage:
+ f2fs_put_dnode(&dn);
+ return err;
+}
+
+/*
+ * ->writepage for f2fs data pages.
+ *
+ * Pages entirely beyond i_size are dropped without being written. The
+ * page that straddles i_size has its tail zeroed first. The page is
+ * redirtied and AOP_WRITEPAGE_ACTIVATE is returned, with the page still
+ * locked, while sbi->por_doing is set, or for reclaim writeback of a
+ * non-directory page that is not cold data.
+ *
+ * Returns 0, AOP_WRITEPAGE_ACTIVATE, or 0 after a write error other than
+ * -ENOENT; in that case the page is redirtied and counted in
+ * wbc->pages_skipped.
+ */
+static int f2fs_write_data_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ loff_t i_size = i_size_read(inode);
+ const pgoff_t end_index = ((unsigned long long) i_size)
+ >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+ int err = 0;
+
+ /* page lies completely below i_size: write it as is */
+ if (page->index < end_index)
+ goto out;
+
+ /*
+ * If the offset is out-of-range of file size,
+ * this page does not have to be written to disk.
+ */
+ offset = i_size & (PAGE_CACHE_SIZE - 1);
+ if ((page->index >= end_index + 1) || !offset) {
+ /* the dropped page no longer counts as a dirty dentry page */
+ if (S_ISDIR(inode->i_mode)) {
+ dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_dec_dirty_dents(inode);
+ }
+ goto unlock_out;
+ }
+
+ /* zero the part of the last page that lies beyond i_size */
+ zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+out:
+ if (sbi->por_doing)
+ goto redirty_out;
+
+ if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page))
+ goto redirty_out;
+
+ mutex_lock_op(sbi, DATA_WRITE);
+ if (S_ISDIR(inode->i_mode)) {
+ dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_dec_dirty_dents(inode);
+ }
+ err = do_write_data_page(page);
+ if (err && err != -ENOENT) {
+ wbc->pages_skipped++;
+ set_page_dirty(page);
+ }
+ mutex_unlock_op(sbi, DATA_WRITE);
+
+ if (wbc->for_reclaim)
+ f2fs_submit_bio(sbi, DATA, true);
+
+ /* -ENOENT from the node lookup is reported as success */
+ if (err == -ENOENT)
+ goto unlock_out;
+
+ clear_cold_data(page);
+ unlock_page(page);
+
+ if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode))
+ f2fs_balance_fs(sbi);
+ return 0;
+
+unlock_out:
+ unlock_page(page);
+ return (err == -ENOENT) ? 0 : err;
+
+redirty_out:
+ /* the page is deliberately left locked on this path */
+ wbc->pages_skipped++;
+ set_page_dirty(page);
+ return AOP_WRITEPAGE_ACTIVATE;
+}
+
+#define MAX_DESIRED_PAGES_WP 4096
+
+/*
+ * write_cache_pages() callback: write @page through the mapping's
+ * ->writepage and record the result on the mapping with
+ * mapping_set_error(). @data is the address_space.
+ */
+static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = mapping->a_ops->writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
+
+/*
+ * ->writepages for f2fs data pages.
+ *
+ * The request is raised to at least MAX_DESIRED_PAGES_WP pages; the extra
+ * budget is subtracted from wbc->nr_to_write again before returning.
+ * Writeback of non-directory inodes runs under sbi->writepages.
+ * Afterwards the DATA bio is submitted and the inode is dropped from the
+ * dirty directory list if possible.
+ */
+static int f2fs_write_data_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ const bool is_dir = S_ISDIR(inode->i_mode);
+ long borrowed = 0;
+ int ret;
+
+ if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
+ long wanted = MAX_DESIRED_PAGES_WP;
+
+ borrowed = wanted - wbc->nr_to_write;
+ wbc->nr_to_write = wanted;
+ }
+
+ if (!is_dir)
+ mutex_lock(&sbi->writepages);
+ ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
+ if (!is_dir)
+ mutex_unlock(&sbi->writepages);
+
+ f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
+
+ remove_dirty_dir_inode(inode);
+
+ /* give back the budget that was added on top of the request */
+ wbc->nr_to_write -= borrowed;
+ return ret;
+}
+
+/*
+ * ->write_begin for f2fs data pages.
+ *
+ * Grabs the page cache page that covers @pos, reserves a block for it
+ * under the DATA_NEW operation lock if none is assigned yet, and prepares
+ * the page content for a partial write: it is zeroed around the written
+ * range when it starts at or beyond i_size, zero-filled when its block is
+ * only reserved, or read from disk otherwise.
+ *
+ * On success returns 0 with the page stored in *@pagep. On failure
+ * returns a negative errno; the page has been released and *@pagep is
+ * NULL, except for -ENOMEM from the page grab, where *@pagep is not
+ * written.
+ */
+static int f2fs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct page *page;
+ pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
+ struct dnode_of_data dn;
+ int err = 0;
+
+ /* for nobh_write_end */
+ *fsdata = NULL;
+
+ f2fs_balance_fs(sbi);
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ mutex_lock_op(sbi, DATA_NEW);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, index, 0);
+ if (err) {
+ mutex_unlock_op(sbi, DATA_NEW);
+ goto fail;
+ }
+
+ if (dn.data_blkaddr == NULL_ADDR) {
+ err = reserve_new_block(&dn);
+ if (err) {
+ f2fs_put_dnode(&dn);
+ mutex_unlock_op(sbi, DATA_NEW);
+ goto fail;
+ }
+ }
+ /* only dn.data_blkaddr is needed from here on */
+ f2fs_put_dnode(&dn);
+
+ mutex_unlock_op(sbi, DATA_NEW);
+
+ /* a full-page write or an up-to-date page needs no preparation */
+ if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+ return 0;
+
+ /*
+ * Build the page mask in loff_t: PAGE_CACHE_MASK is an unsigned long
+ * and would clear the upper half of a 64-bit pos on 32-bit builds.
+ */
+ if ((pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) >= i_size_read(inode)) {
+ unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned end = start + len;
+
+ /* Reading beyond i_size is simple: memset to zero */
+ zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+ return 0;
+ }
+
+ if (dn.data_blkaddr == NEW_ADDR) {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ } else {
+ err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ if (err)
+ goto fail;
+ }
+ SetPageUptodate(page);
+ clear_cold_data(page);
+ return 0;
+
+fail:
+ /* do not hand a released page back to the caller */
+ f2fs_put_page(page, 1);
+ *pagep = NULL;
+ return err;
+}
+
+/*
+ * ->direct_IO: direct reads go through blockdev_direct_IO() with
+ * get_data_block_ro() as the block mapper. Direct writes are not
+ * performed here and report 0 bytes written.
+ * NOTE(review): presumably the 0 return makes the caller fall back to a
+ * buffered write - confirm against the generic write path.
+ */
+static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+
+ if (rw == WRITE)
+ return 0;
+
+ /* Needs synchronization with the cleaner */
+ return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ get_data_block_ro);
+}
+
+/*
+ * ->invalidatepage: undo the dirty dentry accounting for a directory page
+ * that is still dirty, then clear PagePrivate. @offset is unused.
+ */
+static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+{
+ struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ if (S_ISDIR(inode->i_mode) && PageDirty(page)) {
+ dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_dec_dirty_dents(inode);
+ }
+ ClearPagePrivate(page);
+}
+
+/*
+ * ->releasepage: clear PagePrivate and return 0. @wait is unused.
+ */
+static int f2fs_release_data_page(struct page *page, gfp_t wait)
+{
+ ClearPagePrivate(page);
+ return 0;
+}
+
+/*
+ * ->set_page_dirty: mark the page up to date and, unless it is already
+ * dirty, dirty it and run the dirty directory page accounting.
+ *
+ * Returns 1 if the page was newly dirtied, 0 if it was dirty already.
+ */
+static int f2fs_set_data_page_dirty(struct page *page)
+{
+ struct inode *host = page->mapping->host;
+
+ SetPageUptodate(page);
+ if (PageDirty(page))
+ return 0;
+
+ __set_page_dirty_nobuffers(page);
+ /* set_dirty_dir_page() ignores non-directory inodes */
+ set_dirty_dir_page(host, page);
+ return 1;
+}
+
+/*
+ * ->bmap: resolve @block through generic_block_bmap(), using
+ * get_data_block_ro() as the block mapper.
+ */
+static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
+{
+ return generic_block_bmap(mapping, block, get_data_block_ro);
+}
+
+/*
+ * Address space operations for f2fs data pages. All handlers are defined
+ * in this file except ->write_end, which uses nobh_write_end.
+ */
+const struct address_space_operations f2fs_dblock_aops = {
+ .readpage = f2fs_read_data_page,
+ .readpages = f2fs_read_data_pages,
+ .writepage = f2fs_write_data_page,
+ .writepages = f2fs_write_data_pages,
+ .write_begin = f2fs_write_begin,
+ .write_end = nobh_write_end,
+ .set_page_dirty = f2fs_set_data_page_dirty,
+ .invalidatepage = f2fs_invalidate_data_page,
+ .releasepage = f2fs_release_data_page,
+ .direct_IO = f2fs_direct_IO,
+ .bmap = f2fs_bmap,
+};
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
new file mode 100644
index 000000000000..025b9e2f935d
--- /dev/null
+++ b/fs/f2fs/debug.c
@@ -0,0 +1,355 @@
+/*
+ * f2fs debugging statistics
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ * Copyright (c) 2012 Linux Foundation
+ * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/backing-dev.h>
+#include <linux/proc_fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "gc.h"
+
+/* List of f2fs_stat_info entries, linked through their stat_list member */
+static LIST_HEAD(f2fs_stat_list);
+/* Dentry returned by debugfs_create_dir("f2fs", NULL) */
+static struct dentry *debugfs_root;
+/* Held around every addition to, removal from and walk of f2fs_stat_list */
+static DEFINE_MUTEX(f2fs_stat_mutex);
+
+/*
+ * Refresh the general counters in sbi->stat_info from the current state of
+ * @sbi.  stat_show() calls this before printing an entry.
+ */
+static void update_general_status(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_stat_info *si = sbi->stat_info;
+ int i;
+
+ /* extent hit counters, dirty page counts and block/segment totals */
+ si->hit_ext = sbi->read_hit_ext;
+ si->total_ext = sbi->total_hit_ext;
+ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
+ si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
+ si->ndirty_dirs = sbi->n_dirty_dirs;
+ si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+ si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
+ si->rsvd_segs = reserved_segments(sbi);
+ si->overp_segs = overprovision_segments(sbi);
+ si->valid_count = valid_user_blocks(sbi);
+ si->valid_node_count = valid_node_count(sbi);
+ si->valid_inode_count = valid_inode_count(sbi);
+ si->utilization = utilization(sbi);
+
+ si->free_segs = free_segments(sbi);
+ si->free_secs = free_sections(sbi);
+ si->prefree_count = prefree_segments(sbi);
+ si->dirty_count = dirty_segments(sbi);
+ si->node_pages = sbi->node_inode->i_mapping->nrpages;
+ si->meta_pages = sbi->meta_inode->i_mapping->nrpages;
+ si->nats = NM_I(sbi)->nat_cnt;
+ si->sits = SIT_I(sbi)->dirty_sentries;
+ si->fnids = NM_I(sbi)->fcnt;
+ si->bg_gc = sbi->bg_gc;
+ /*
+ * util_free and util_valid are percentages of the user segments,
+ * halved by the trailing "/ 2"; util_invalid is what remains of 50.
+ * stat_show() prints one character per unit of each value.
+ * NOTE(review): the divisor is zero if user_block_count is smaller
+ * than one segment -- confirm that cannot happen on a mounted fs.
+ */
+ si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
+ * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
+ / 2;
+ si->util_valid = (int)(written_block_count(sbi) >>
+ sbi->log_blocks_per_seg)
+ * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
+ / 2;
+ si->util_invalid = 50 - si->util_free - si->util_valid;
+ /* record segment, section and zone number of every current segment */
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
+ struct curseg_info *curseg = CURSEG_I(sbi, i);
+ si->curseg[i] = curseg->segno;
+ si->cursec[i] = curseg->segno / sbi->segs_per_sec;
+ si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
+ }
+
+ /* the two slots are printed by stat_show() as [SSR] and [LFS] */
+ for (i = 0; i < 2; i++) {
+ si->segment_count[i] = sbi->segment_count[i];
+ si->block_count[i] = sbi->block_count[i];
+ }
+}
+
+/*
+ * This function calculates the BDF over all sections.
+ *
+ * Every section is visited while sit_i->sentry_lock is held:
+ *  - si->bimodal is the sum of the squared distances between each section's
+ *    valid-block count and half a section, divided by
+ *    (total_sections * half^2 / 100);
+ *  - si->avg_vblocks is the mean valid-block count of the sections that are
+ *    neither empty nor completely valid, or 0 when there is none.
+ */
+static void update_sit_info(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_stat_info *si = sbi->stat_info;
+	unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int segno, vblocks;
+	int ndirty = 0;
+
+	bimodal = 0;
+	total_vblocks = 0;
+	blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
+	hblks_per_sec = blks_per_sec / 2;
+	mutex_lock(&sit_i->sentry_lock);
+	for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+		vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+		dist = abs(vblocks - hblks_per_sec);
+		bimodal += dist * dist;
+
+		/* only partially valid sections count towards the average */
+		if (vblocks > 0 && vblocks < blks_per_sec) {
+			total_vblocks += vblocks;
+			ndirty++;
+		}
+	}
+	mutex_unlock(&sit_i->sentry_lock);
+	dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100;
+	si->bimodal = bimodal / dist;
+	/*
+	 * si->dirty_count is filled in elsewhere, whereas ndirty was counted
+	 * by the loop above, so the two can disagree.  ndirty is the actual
+	 * divisor: never divide when it is zero.
+	 */
+	if (si->dirty_count && ndirty)
+		si->avg_vblocks = total_vblocks / ndirty;
+	else
+		si->avg_vblocks = 0;
+}
+
+/*
+ * This function calculates memory footprint.
+ *
+ * si->base_mem is computed only once (while it is still zero) from the
+ * sizes of the structures listed below; si->cache_mem is recomputed on
+ * every call.
+ */
+static void update_mem_info(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_stat_info *si = sbi->stat_info;
+ unsigned npages;
+
+ /* base_mem already computed by an earlier call */
+ if (si->base_mem)
+ goto get_cache;
+
+ si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
+ si->base_mem += 2 * sizeof(struct f2fs_inode_info);
+ si->base_mem += sizeof(*sbi->ckpt);
+
+ /* build sm */
+ si->base_mem += sizeof(struct f2fs_sm_info);
+
+ /* build sit */
+ si->base_mem += sizeof(struct sit_info);
+ si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
+ si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
+ if (sbi->segs_per_sec > 1)
+ si->base_mem += sbi->total_sections *
+ sizeof(struct sec_entry);
+ si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
+
+ /* build free segmap */
+ si->base_mem += sizeof(struct free_segmap_info);
+ si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ si->base_mem += f2fs_bitmap_size(sbi->total_sections);
+
+ /* build curseg */
+ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
+ si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
+
+ /* build dirty segmap */
+ si->base_mem += sizeof(struct dirty_seglist_info);
+ si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi));
+
+ /* build nm */
+ si->base_mem += sizeof(struct f2fs_nm_info);
+ si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
+
+ /* build gc */
+ si->base_mem += sizeof(struct f2fs_gc_kthread);
+
+get_cache:
+ /*
+ * free nids and cached NAT entries
+ * NOTE(review): fcnt and nat_cnt are added as plain counts, while the
+ * other terms below are byte sizes -- presumably they should be
+ * scaled by the size of one entry; confirm.
+ */
+ si->cache_mem = NM_I(sbi)->fcnt;
+ si->cache_mem += NM_I(sbi)->nat_cnt;
+ /* pages held by the node and meta inode mappings, in bytes */
+ npages = sbi->node_inode->i_mapping->nrpages;
+ si->cache_mem += npages << PAGE_CACHE_SHIFT;
+ npages = sbi->meta_inode->i_mapping->nrpages;
+ si->cache_mem += npages << PAGE_CACHE_SHIFT;
+ si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
+ si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+}
+
+/*
+ * seq_file show callback behind the "status" file.
+ *
+ * Walks f2fs_stat_list under f2fs_stat_mutex and, for every entry,
+ * refreshes its statistics and prints a report to @s.  @v is not used.
+ * Always returns 0.
+ */
+static int stat_show(struct seq_file *s, void *v)
+{
+ struct f2fs_stat_info *si, *next;
+ int i = 0;
+ int j;
+
+ mutex_lock(&f2fs_stat_mutex);
+ list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+ char devname[BDEVNAME_SIZE];
+
+ update_general_status(si->sbi);
+
+ /* i numbers the entries in list order */
+ seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n",
+ bdevname(si->sbi->sb->s_bdev, devname), i++);
+ seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
+ si->sit_area_segs, si->nat_area_segs);
+ seq_printf(s, "[SSA: %d] [MAIN: %d",
+ si->ssa_area_segs, si->main_area_segs);
+ seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
+ si->overp_segs, si->rsvd_segs);
+ seq_printf(s, "Utilization: %d%% (%d valid blocks)\n",
+ si->utilization, si->valid_count);
+ seq_printf(s, " - Node: %u (Inode: %u, ",
+ si->valid_node_count, si->valid_inode_count);
+ seq_printf(s, "Other: %u)\n - Data: %u\n",
+ si->valid_node_count - si->valid_inode_count,
+ si->valid_count - si->valid_node_count);
+ seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
+ si->main_area_segs, si->main_area_sections,
+ si->main_area_zones);
+ /* one line per current segment: segment, section, zone */
+ seq_printf(s, " - COLD data: %d, %d, %d\n",
+ si->curseg[CURSEG_COLD_DATA],
+ si->cursec[CURSEG_COLD_DATA],
+ si->curzone[CURSEG_COLD_DATA]);
+ seq_printf(s, " - WARM data: %d, %d, %d\n",
+ si->curseg[CURSEG_WARM_DATA],
+ si->cursec[CURSEG_WARM_DATA],
+ si->curzone[CURSEG_WARM_DATA]);
+ seq_printf(s, " - HOT data: %d, %d, %d\n",
+ si->curseg[CURSEG_HOT_DATA],
+ si->cursec[CURSEG_HOT_DATA],
+ si->curzone[CURSEG_HOT_DATA]);
+ seq_printf(s, " - Dir dnode: %d, %d, %d\n",
+ si->curseg[CURSEG_HOT_NODE],
+ si->cursec[CURSEG_HOT_NODE],
+ si->curzone[CURSEG_HOT_NODE]);
+ seq_printf(s, " - File dnode: %d, %d, %d\n",
+ si->curseg[CURSEG_WARM_NODE],
+ si->cursec[CURSEG_WARM_NODE],
+ si->curzone[CURSEG_WARM_NODE]);
+ seq_printf(s, " - Indir nodes: %d, %d, %d\n",
+ si->curseg[CURSEG_COLD_NODE],
+ si->cursec[CURSEG_COLD_NODE],
+ si->curzone[CURSEG_COLD_NODE]);
+ /* "Valid" is what is left after dirty, prefree and free */
+ seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n",
+ si->main_area_segs - si->dirty_count -
+ si->prefree_count - si->free_segs,
+ si->dirty_count);
+ seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
+ si->prefree_count, si->free_segs, si->free_secs);
+ seq_printf(s, "GC calls: %d (BG: %d)\n",
+ si->call_count, si->bg_gc);
+ seq_printf(s, " - data segments : %d\n", si->data_segs);
+ seq_printf(s, " - node segments : %d\n", si->node_segs);
+ seq_printf(s, "Try to move %d blocks\n", si->tot_blks);
+ seq_printf(s, " - data blocks : %d\n", si->data_blks);
+ seq_printf(s, " - node blocks : %d\n", si->node_blks);
+ seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
+ si->hit_ext, si->total_ext);
+ seq_printf(s, "\nBalancing F2FS Async:\n");
+ seq_printf(s, " - nodes %4d in %4d\n",
+ si->ndirty_node, si->node_pages);
+ seq_printf(s, " - dents %4d in dirs:%4d\n",
+ si->ndirty_dent, si->ndirty_dirs);
+ seq_printf(s, " - meta %4d in %4d\n",
+ si->ndirty_meta, si->meta_pages);
+ seq_printf(s, " - NATs %5d > %lu\n",
+ si->nats, NM_WOUT_THRESHOLD);
+ seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
+ si->sits, si->fnids);
+ /* bar chart: one '-' per unit of util_valid/invalid/free */
+ seq_printf(s, "\nDistribution of User Blocks:");
+ seq_printf(s, " [ valid | invalid | free ]\n");
+ seq_printf(s, " [");
+
+ for (j = 0; j < si->util_valid; j++)
+ seq_printf(s, "-");
+ seq_printf(s, "|");
+
+ for (j = 0; j < si->util_invalid; j++)
+ seq_printf(s, "-");
+ seq_printf(s, "|");
+
+ for (j = 0; j < si->util_free; j++)
+ seq_printf(s, "-");
+ seq_printf(s, "]\n\n");
+ seq_printf(s, "SSR: %u blocks in %u segments\n",
+ si->block_count[SSR], si->segment_count[SSR]);
+ seq_printf(s, "LFS: %u blocks in %u segments\n",
+ si->block_count[LFS], si->segment_count[LFS]);
+
+ /* segment usage info */
+ update_sit_info(si->sbi);
+ seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n",
+ si->bimodal, si->avg_vblocks);
+
+ /* memory footprint */
+ update_mem_info(si->sbi);
+ seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
+ (si->base_mem + si->cache_mem) >> 10,
+ si->base_mem >> 10, si->cache_mem >> 10);
+ }
+ mutex_unlock(&f2fs_stat_mutex);
+ return 0;
+}
+
+/* open hook of the "status" file: binds stat_show() through single_open(). */
+static int stat_open(struct inode *inode, struct file *file)
+{
+	void *priv = inode->i_private;
+
+	return single_open(file, stat_show, priv);
+}
+
+/*
+ * File operations of the "status" file: stat_open() sets up a single-shot
+ * seq_file, the remaining hooks are the seq_file/single_open helpers.
+ */
+static const struct file_operations stat_fops = {
+ .open = stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Allocate the statistics block of @sbi, seed it with the area sizes read
+ * from the raw superblock and append it to f2fs_stat_list.
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails (in which case
+ * sbi->stat_info is NULL).
+ */
+int f2fs_build_stats(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_super_block *raw = F2FS_RAW_SUPER(sbi);
+	struct f2fs_stat_info *info;
+
+	info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
+	sbi->stat_info = info;
+	if (!info)
+		return -ENOMEM;
+
+	/* segment counts of each on-disk area */
+	info->all_area_segs = le32_to_cpu(raw->segment_count);
+	info->sit_area_segs = le32_to_cpu(raw->segment_count_sit);
+	info->nat_area_segs = le32_to_cpu(raw->segment_count_nat);
+	info->ssa_area_segs = le32_to_cpu(raw->segment_count_ssa);
+	info->main_area_segs = le32_to_cpu(raw->segment_count_main);
+
+	/* main area expressed in sections and zones */
+	info->main_area_sections = le32_to_cpu(raw->section_count);
+	info->main_area_zones = info->main_area_sections /
+				le32_to_cpu(raw->secs_per_zone);
+	info->sbi = sbi;
+
+	mutex_lock(&f2fs_stat_mutex);
+	list_add_tail(&info->stat_list, &f2fs_stat_list);
+	mutex_unlock(&f2fs_stat_mutex);
+
+	return 0;
+}
+
+/*
+ * Unlink the statistics block of @sbi from f2fs_stat_list (under
+ * f2fs_stat_mutex) and free it.
+ */
+void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_stat_info *info = sbi->stat_info;
+
+	mutex_lock(&f2fs_stat_mutex);
+	list_del(&info->stat_list);
+	mutex_unlock(&f2fs_stat_mutex);
+
+	kfree(info);
+}
+
+/*
+ * Create the "f2fs" debugfs directory and, when that yields a non-NULL
+ * dentry, the read-only "status" file inside it.
+ */
+void __init f2fs_create_root_stats(void)
+{
+	debugfs_root = debugfs_create_dir("f2fs", NULL);
+	if (!debugfs_root)
+		return;
+
+	debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops);
+}
+
+/* Remove the debugfs directory tree and forget its dentry. */
+void f2fs_destroy_root_stats(void)
+{
+	struct dentry *root = debugfs_root;
+
+	debugfs_remove_recursive(root);
+	debugfs_root = NULL;
+}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
new file mode 100644
index 000000000000..a1f38443ecee
--- /dev/null
+++ b/fs/f2fs/dir.c
@@ -0,0 +1,671 @@
+/*
+ * fs/f2fs/dir.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "node.h"
+#include "acl.h"
+
+/* Number of page-sized blocks needed to hold i_size bytes, rounded up. */
+static unsigned long dir_blocks(struct inode *inode)
+{
+	unsigned long long bytes;
+
+	bytes = (unsigned long long)(i_size_read(inode) + PAGE_CACHE_SIZE - 1);
+	return bytes >> PAGE_CACHE_SHIFT;
+}
+
+/*
+ * Number of hash buckets at @level: 2^level for levels below
+ * MAX_DIR_HASH_DEPTH / 2, and 2^(MAX_DIR_HASH_DEPTH / 2 - 1) from there on.
+ */
+static unsigned int dir_buckets(unsigned int level)
+{
+	return (level < MAX_DIR_HASH_DEPTH / 2) ?
+			1 << level :
+			1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
+}
+
+/*
+ * Number of dentry blocks per bucket at @level: 2 for levels below
+ * MAX_DIR_HASH_DEPTH / 2, 4 from there on.
+ */
+static unsigned int bucket_blocks(unsigned int level)
+{
+	return (level < MAX_DIR_HASH_DEPTH / 2) ? 2 : 4;
+}
+
+/* Maps an F2FS_FT_* file type to the DT_* value passed to filldir */
+static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
+ [F2FS_FT_UNKNOWN] = DT_UNKNOWN,
+ [F2FS_FT_REG_FILE] = DT_REG,
+ [F2FS_FT_DIR] = DT_DIR,
+ [F2FS_FT_CHRDEV] = DT_CHR,
+ [F2FS_FT_BLKDEV] = DT_BLK,
+ [F2FS_FT_FIFO] = DT_FIFO,
+ [F2FS_FT_SOCK] = DT_SOCK,
+ [F2FS_FT_SYMLINK] = DT_LNK,
+};
+
+/* Shift that turns (mode & S_IFMT) into an index of f2fs_type_by_mode[] */
+#define S_SHIFT 12
+/* Maps the file-type bits of an inode mode to an F2FS_FT_* file type */
+static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = F2FS_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = F2FS_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = F2FS_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = F2FS_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK,
+};
+
+/* Store in @de the F2FS_FT_* type matching the file-type bits of @inode. */
+static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
+{
+	mode_t fmt = inode->i_mode;
+
+	fmt &= S_IFMT;
+	de->file_type = f2fs_type_by_mode[fmt >> S_SHIFT];
+}
+
+/*
+ * Index of the first block of bucket @idx at hash level @level: the blocks
+ * of all lower levels come first, followed by @idx buckets of this level.
+ */
+static unsigned long dir_block_index(unsigned int level, unsigned int idx)
+{
+	unsigned long first = 0;
+	unsigned long lv = 0;
+
+	/* skip every block that belongs to a lower level */
+	while (lv < level) {
+		first += dir_buckets(lv) * bucket_blocks(lv);
+		lv++;
+	}
+
+	first += idx * bucket_blocks(level);
+	return first;
+}
+
+/*
+ * Cheap pre-check before comparing names: true only when both the name
+ * length and the hash code stored in @de equal the wanted ones.  @name
+ * itself is not examined.
+ */
+static bool early_match_name(const char *name, size_t namelen,
+			f2fs_hash_t namehash, struct f2fs_dir_entry *de)
+{
+	return le16_to_cpu(de->name_len) == namelen &&
+			de->hash_code == namehash;
+}
+
+/*
+ * Search one dentry block for @name.
+ *
+ * On a match the entry is returned, *res_page is set to @dentry_page and
+ * the page is left kmapped.  On a miss NULL is returned, the page is
+ * kunmapped and *res_page is not touched.
+ *
+ * While scanning, *max_slots is raised to the longest run of clear bitmap
+ * bits seen between the end of an examined entry and the next set bit (or
+ * the end of the block).
+ */
+static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
+ const char *name, size_t namelen, int *max_slots,
+ f2fs_hash_t namehash, struct page **res_page)
+{
+ struct f2fs_dir_entry *de;
+ unsigned long bit_pos, end_pos, next_pos;
+ struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
+ int slots;
+
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK, 0);
+ while (bit_pos < NR_DENTRY_IN_BLOCK) {
+ de = &dentry_blk->dentry[bit_pos];
+ slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+
+ /* compare the name bytes only if length and hash already match */
+ if (early_match_name(name, namelen, namehash, de)) {
+ if (!memcmp(dentry_blk->filename[bit_pos],
+ name, namelen)) {
+ *res_page = dentry_page;
+ goto found;
+ }
+ }
+ /* jump over the slots of this entry to the next one in use */
+ next_pos = bit_pos + slots;
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK, next_pos);
+ if (bit_pos >= NR_DENTRY_IN_BLOCK)
+ end_pos = NR_DENTRY_IN_BLOCK;
+ else
+ end_pos = bit_pos;
+ if (*max_slots < end_pos - next_pos)
+ *max_slots = end_pos - next_pos;
+ }
+
+ de = NULL;
+ kunmap(dentry_page);
+found:
+ return de;
+}
+
+/*
+ * Search the bucket selected by @namehash at hash level @level of @dir.
+ *
+ * Returns the matching entry (with *res_page set by find_in_block()) or
+ * NULL.  On a miss, when a block of the bucket was unavailable or had a
+ * free run of at least GET_DENTRY_SLOTS(@namelen) slots, the hash and level
+ * are remembered in F2FS_I(dir)->chash / ->clevel.
+ */
+static struct f2fs_dir_entry *find_in_level(struct inode *dir,
+ unsigned int level, const char *name, size_t namelen,
+ f2fs_hash_t namehash, struct page **res_page)
+{
+ int s = GET_DENTRY_SLOTS(namelen);
+ unsigned int nbucket, nblock;
+ unsigned int bidx, end_block;
+ struct page *dentry_page;
+ struct f2fs_dir_entry *de = NULL;
+ bool room = false;
+ int max_slots = 0;
+
+ BUG_ON(level > MAX_DIR_HASH_DEPTH);
+
+ nbucket = dir_buckets(level);
+ nblock = bucket_blocks(level);
+
+ bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket);
+ end_block = bidx + nblock;
+
+ for (; bidx < end_block; bidx++) {
+ /* no need to allocate new dentry pages to all the indices */
+ dentry_page = find_data_page(dir, bidx);
+ if (IS_ERR(dentry_page)) {
+ room = true;
+ continue;
+ }
+
+ de = find_in_block(dentry_page, name, namelen,
+ &max_slots, namehash, res_page);
+ /* on a hit the page reference is kept for the caller */
+ if (de)
+ break;
+
+ if (max_slots >= s)
+ room = true;
+ f2fs_put_page(dentry_page, 0);
+ }
+
+ /* remember where an entry with this hash could be inserted */
+ if (!de && room && F2FS_I(dir)->chash != namehash) {
+ F2FS_I(dir)->chash = namehash;
+ F2FS_I(dir)->clevel = level;
+ }
+
+ return de;
+}
+
+/*
+ * Find an entry in the specified directory with the wanted name.
+ * It returns the page where the entry was found (as a parameter - res_page),
+ * and the entry itself. Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ *
+ * When no entry is found, NULL is returned and *res_page is NULL.  In that
+ * case the name hash and the last searched level are also remembered in
+ * F2FS_I(dir)->chash / ->clevel unless that hash is already recorded.
+ */
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
+			struct qstr *child, struct page **res_page)
+{
+	const char *name = child->name;
+	size_t namelen = child->len;
+	unsigned long npages = dir_blocks(dir);
+	struct f2fs_dir_entry *de = NULL;
+	f2fs_hash_t name_hash;
+	unsigned int max_depth;
+	unsigned int level;
+
+	/*
+	 * Initialise the output before any return so that callers never see
+	 * an indeterminate page pointer, even for an empty directory.
+	 */
+	*res_page = NULL;
+
+	if (npages == 0)
+		return NULL;
+
+	name_hash = f2fs_dentry_hash(name, namelen);
+	max_depth = F2FS_I(dir)->i_current_depth;
+
+	/* probe one bucket per hash level until the name shows up */
+	for (level = 0; level < max_depth; level++) {
+		de = find_in_level(dir, level, name,
+				namelen, name_hash, res_page);
+		if (de)
+			break;
+	}
+	if (!de && F2FS_I(dir)->chash != name_hash) {
+		F2FS_I(dir)->chash = name_hash;
+		F2FS_I(dir)->clevel = level - 1;
+	}
+	return de;
+}
+
+/*
+ * Return the second dentry slot of block 0 of @dir (the slot that
+ * f2fs_make_empty() fills with ".."), or NULL when the block cannot be
+ * obtained.  On success *p receives the page, which is left kmapped and
+ * unlocked.
+ */
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
+{
+	struct f2fs_dentry_block *blk;
+	struct page *first = get_lock_data_page(dir, 0);
+
+	if (IS_ERR(first))
+		return NULL;
+
+	blk = kmap(first);
+	*p = first;
+	unlock_page(first);
+
+	return &blk->dentry[1];
+}
+
+/*
+ * Look @qstr up in @dir and return the inode number stored in its entry,
+ * or 0 when there is no such entry.  The page handed back by
+ * f2fs_find_entry() is unmapped and released before returning.
+ */
+ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
+{
+	struct page *found_page;
+	struct f2fs_dir_entry *entry;
+	ino_t ino;
+
+	entry = f2fs_find_entry(dir, qstr, &found_page);
+	if (!entry)
+		return 0;
+
+	ino = le32_to_cpu(entry->ino);
+	kunmap(found_page);
+	f2fs_put_page(found_page, 0);
+
+	return ino;
+}
+
+/*
+ * Point the existing directory entry @de (located in @page of @dir) at
+ * @inode.
+ *
+ * Runs under the DENTRY_OPS lock with @page locked.  The page is kunmapped
+ * and released with f2fs_put_page(page, 1) before returning, so the caller
+ * must not use @page or @de afterwards.
+ */
+void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
+ struct page *page, struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+
+ mutex_lock_op(sbi, DENTRY_OPS);
+ lock_page(page);
+ /* do not modify the block while it is under writeback */
+ wait_on_page_writeback(page);
+ de->ino = cpu_to_le32(inode->i_ino);
+ set_de_type(de, inode);
+ kunmap(page);
+ set_page_dirty(page);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(dir);
+
+ /* update parent inode number before releasing dentry page */
+ F2FS_I(inode)->i_pino = dir->i_ino;
+
+ f2fs_put_page(page, 1);
+ mutex_unlock_op(sbi, DENTRY_OPS);
+}
+
+/*
+ * Record @name (length and bytes) in the inode stored in node page @ipage
+ * and mark the page dirty.  Does nothing when @ipage is an error pointer.
+ */
+void init_dent_inode(const struct qstr *name, struct page *ipage)
+{
+	struct f2fs_node *node;
+
+	if (IS_ERR(ipage))
+		return;
+
+	/* the page must not be under writeback while it is modified */
+	wait_on_page_writeback(ipage);
+
+	/* copy name info. to this inode page */
+	node = (struct f2fs_node *)page_address(ipage);
+	node->i.i_namelen = cpu_to_le32(name->len);
+	memcpy(node->i.i_name, name->name, name->len);
+
+	set_page_dirty(ipage);
+}
+
+/*
+ * Prepare the on-disk metadata of @inode before it is linked into @dir
+ * under @name.
+ *
+ * For an inode flagged FI_NEW_INODE: create its inode page, its first
+ * dentry block when it is a directory, and its ACL; the inode page is
+ * removed again if one of the later steps fails.  For any other inode only
+ * the name stored in its node page is refreshed.  When FI_INC_LINK is set
+ * the link count is incremented and the inode written.
+ *
+ * Returns 0 on success or the error of the failing step.
+ */
+static int init_inode_metadata(struct inode *inode,
+ struct inode *dir, const struct qstr *name)
+{
+ if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+ int err;
+ err = new_inode_page(inode, name);
+ if (err)
+ return err;
+
+ if (S_ISDIR(inode->i_mode)) {
+ err = f2fs_make_empty(inode, dir);
+ if (err) {
+ remove_inode_page(inode);
+ return err;
+ }
+ }
+
+ err = f2fs_init_acl(inode, dir);
+ if (err) {
+ remove_inode_page(inode);
+ return err;
+ }
+ } else {
+ struct page *ipage;
+ ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+ set_cold_node(inode, ipage);
+ init_dent_inode(name, ipage);
+ f2fs_put_page(ipage, 1);
+ }
+ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+ inc_nlink(inode);
+ f2fs_write_inode(inode, NULL);
+ }
+ return 0;
+}
+
+/*
+ * Update @dir after an entry for @inode has been added to it.
+ *
+ * Bumps the link count of @dir when @inode is a new directory, refreshes
+ * the timestamps of @dir and stores @current_depth.  @dir is written with
+ * f2fs_write_inode() when its link count or depth changed and merely marked
+ * dirty otherwise.  The FI_NEW_INODE and FI_INC_LINK flags of @inode are
+ * cleared.
+ */
+static void update_parent_metadata(struct inode *dir, struct inode *inode,
+ unsigned int current_depth)
+{
+ bool need_dir_update = false;
+
+ if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+ if (S_ISDIR(inode->i_mode)) {
+ inc_nlink(dir);
+ need_dir_update = true;
+ }
+ clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+ }
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ if (F2FS_I(dir)->i_current_depth != current_depth) {
+ F2FS_I(dir)->i_current_depth = current_depth;
+ need_dir_update = true;
+ }
+
+ if (need_dir_update)
+ f2fs_write_inode(dir, NULL);
+ else
+ mark_inode_dirty(dir);
+
+ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
+ clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+}
+
+/*
+ * Look for @slots consecutive clear bits in the dentry bitmap of
+ * @dentry_blk.  Returns the position of the first bit of such a run, or
+ * NR_DENTRY_IN_BLOCK when the block has no run that is long enough.
+ */
+static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots)
+{
+	int from = 0;
+	int gap_start, gap_end;
+
+	for (;;) {
+		gap_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap,
+						NR_DENTRY_IN_BLOCK,
+						from);
+		if (gap_start >= NR_DENTRY_IN_BLOCK)
+			return NR_DENTRY_IN_BLOCK;
+
+		gap_end = find_next_bit_le(&dentry_blk->dentry_bitmap,
+						NR_DENTRY_IN_BLOCK,
+						gap_start);
+		if (gap_end - gap_start >= slots)
+			return gap_start;
+
+		/* resume the search just past the set bit that ended the gap */
+		from = gap_end + 1;
+		if (gap_end + 1 >= NR_DENTRY_IN_BLOCK)
+			return NR_DENTRY_IN_BLOCK;
+	}
+}
+
+/*
+ * Add an entry called @name that refers to @inode to directory @dir.
+ *
+ * Starting at level 0 (or at the level remembered in F2FS_I(dir)->clevel
+ * when the remembered hash matches), the blocks of the bucket selected by
+ * the name hash are probed for enough free slots; if none has room the next
+ * level is tried, growing the directory depth as needed.
+ *
+ * Returns 0 on success, -ENOSPC once the depth has reached
+ * MAX_DIR_HASH_DEPTH, or the error from get_new_data_page() /
+ * init_inode_metadata().
+ */
+int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode)
+{
+ unsigned int bit_pos;
+ unsigned int level;
+ unsigned int current_depth;
+ unsigned long bidx, block;
+ f2fs_hash_t dentry_hash;
+ struct f2fs_dir_entry *de;
+ unsigned int nbucket, nblock;
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ size_t namelen = name->len;
+ struct page *dentry_page = NULL;
+ struct f2fs_dentry_block *dentry_blk = NULL;
+ int slots = GET_DENTRY_SLOTS(namelen);
+ int err = 0;
+ int i;
+
+ dentry_hash = f2fs_dentry_hash(name->name, name->len);
+ level = 0;
+ current_depth = F2FS_I(dir)->i_current_depth;
+ /* reuse the level remembered for this hash, then forget the hint */
+ if (F2FS_I(dir)->chash == dentry_hash) {
+ level = F2FS_I(dir)->clevel;
+ F2FS_I(dir)->chash = 0;
+ }
+
+start:
+ if (current_depth == MAX_DIR_HASH_DEPTH)
+ return -ENOSPC;
+
+ /* Increase the depth, if required */
+ if (level == current_depth)
+ ++current_depth;
+
+ nbucket = dir_buckets(level);
+ nblock = bucket_blocks(level);
+
+ bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
+
+ /* DENTRY_OPS is taken per block and is still held at add_dentry */
+ for (block = bidx; block <= (bidx + nblock - 1); block++) {
+ mutex_lock_op(sbi, DENTRY_OPS);
+ dentry_page = get_new_data_page(dir, block, true);
+ if (IS_ERR(dentry_page)) {
+ mutex_unlock_op(sbi, DENTRY_OPS);
+ return PTR_ERR(dentry_page);
+ }
+
+ dentry_blk = kmap(dentry_page);
+ bit_pos = room_for_filename(dentry_blk, slots);
+ if (bit_pos < NR_DENTRY_IN_BLOCK)
+ goto add_dentry;
+
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ mutex_unlock_op(sbi, DENTRY_OPS);
+ }
+
+ /* Move to next level to find the empty slot for new dentry */
+ ++level;
+ goto start;
+add_dentry:
+ err = init_inode_metadata(inode, dir, name);
+ if (err)
+ goto fail;
+
+ wait_on_page_writeback(dentry_page);
+
+ /* fill the entry, then mark every slot it occupies as in use */
+ de = &dentry_blk->dentry[bit_pos];
+ de->hash_code = dentry_hash;
+ de->name_len = cpu_to_le16(namelen);
+ memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
+ de->ino = cpu_to_le32(inode->i_ino);
+ set_de_type(de, inode);
+ for (i = 0; i < slots; i++)
+ test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+ set_page_dirty(dentry_page);
+
+ update_parent_metadata(dir, inode, current_depth);
+
+ /* update parent inode number before releasing dentry page */
+ F2FS_I(inode)->i_pino = dir->i_ino;
+fail:
+ /* shared exit: unmap and release the page, drop DENTRY_OPS */
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ mutex_unlock_op(sbi, DENTRY_OPS);
+ return err;
+}
+
+/*
+ * It only removes the dentry from the dentry page; the corresponding name
+ * entry in the name page does not need to be touched during deletion.
+ *
+ * @dentry must lie inside @page, which is expected to be still kmapped
+ * (see the kunmap below).  @inode may be NULL; when it is given, its link
+ * count is dropped and it is added to the orphan list once that count
+ * reaches zero.  The page is released before returning.
+ */
+void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
+ struct inode *inode)
+{
+ struct f2fs_dentry_block *dentry_blk;
+ unsigned int bit_pos;
+ struct address_space *mapping = page->mapping;
+ struct inode *dir = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
+ void *kaddr = page_address(page);
+ int i;
+
+ mutex_lock_op(sbi, DENTRY_OPS);
+
+ lock_page(page);
+ wait_on_page_writeback(page);
+
+ /* clear every bitmap bit that belongs to this entry */
+ dentry_blk = (struct f2fs_dentry_block *)kaddr;
+ bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
+ for (i = 0; i < slots; i++)
+ test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+ /* Let's check and deallocate this dentry page */
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK,
+ 0);
+ kunmap(page); /* kunmap - pair of f2fs_find_entry */
+ set_page_dirty(page);
+
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+ /* a removed subdirectory takes one link away from its parent */
+ if (inode && S_ISDIR(inode->i_mode)) {
+ drop_nlink(dir);
+ f2fs_write_inode(dir, NULL);
+ } else {
+ mark_inode_dirty(dir);
+ }
+
+ if (inode) {
+ inode->i_ctime = CURRENT_TIME;
+ drop_nlink(inode);
+ /* a directory loses a second link and is truncated to size 0 */
+ if (S_ISDIR(inode->i_mode)) {
+ drop_nlink(inode);
+ i_size_write(inode, 0);
+ }
+ f2fs_write_inode(inode, NULL);
+ if (inode->i_nlink == 0)
+ add_orphan_inode(sbi, inode->i_ino);
+ }
+
+ /* no entry left in this block: punch it out of the directory */
+ if (bit_pos == NR_DENTRY_IN_BLOCK) {
+ truncate_hole(dir, page->index, page->index + 1);
+ clear_page_dirty_for_io(page);
+ ClearPageUptodate(page);
+ dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_dec_dirty_dents(dir);
+ }
+ f2fs_put_page(page, 1);
+
+ mutex_unlock_op(sbi, DENTRY_OPS);
+}
+
+/*
+ * Initialise block 0 of the new directory @inode with the "." entry
+ * (pointing at @inode) in slot 0 and the ".." entry (pointing at @parent)
+ * in slot 1.
+ *
+ * Returns 0 on success or the error from get_new_data_page().
+ */
+int f2fs_make_empty(struct inode *inode, struct inode *parent)
+{
+ struct page *dentry_page;
+ struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dir_entry *de;
+ void *kaddr;
+
+ dentry_page = get_new_data_page(inode, 0, true);
+ if (IS_ERR(dentry_page))
+ return PTR_ERR(dentry_page);
+
+ kaddr = kmap_atomic(dentry_page);
+ dentry_blk = (struct f2fs_dentry_block *)kaddr;
+
+ /* slot 0: "." */
+ de = &dentry_blk->dentry[0];
+ de->name_len = cpu_to_le16(1);
+ de->hash_code = f2fs_dentry_hash(".", 1);
+ de->ino = cpu_to_le32(inode->i_ino);
+ memcpy(dentry_blk->filename[0], ".", 1);
+ set_de_type(de, inode);
+
+ /* slot 1: ".." -- its type is taken from @inode, not @parent */
+ de = &dentry_blk->dentry[1];
+ de->hash_code = f2fs_dentry_hash("..", 2);
+ de->name_len = cpu_to_le16(2);
+ de->ino = cpu_to_le32(parent->i_ino);
+ memcpy(dentry_blk->filename[1], "..", 2);
+ set_de_type(de, inode);
+
+ test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
+ test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
+ kunmap_atomic(kaddr);
+
+ set_page_dirty(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ return 0;
+}
+
+/*
+ * Return true when @dir contains no entry other than the first two slots
+ * of block 0.  Blocks that are missing (-ENOENT) are skipped; any other
+ * error while reading a block makes the function return false.
+ */
+bool f2fs_empty_dir(struct inode *dir)
+{
+ unsigned long bidx;
+ struct page *dentry_page;
+ unsigned int bit_pos;
+ struct f2fs_dentry_block *dentry_blk;
+ unsigned long nblock = dir_blocks(dir);
+
+ for (bidx = 0; bidx < nblock; bidx++) {
+ void *kaddr;
+ dentry_page = get_lock_data_page(dir, bidx);
+ if (IS_ERR(dentry_page)) {
+ if (PTR_ERR(dentry_page) == -ENOENT)
+ continue;
+ else
+ return false;
+ }
+
+ kaddr = kmap_atomic(dentry_page);
+ dentry_blk = (struct f2fs_dentry_block *)kaddr;
+ /* slots 0 and 1 of block 0 are filled by f2fs_make_empty() */
+ if (bidx == 0)
+ bit_pos = 2;
+ else
+ bit_pos = 0;
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK,
+ bit_pos);
+ kunmap_atomic(kaddr);
+
+ f2fs_put_page(dentry_page, 1);
+
+ /* a set bit means the directory still has an entry */
+ if (bit_pos < NR_DENTRY_IN_BLOCK)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * readdir hook: feed the entries of the directory to @filldir.
+ *
+ * file->f_pos encodes the position as
+ * block_number * NR_DENTRY_IN_BLOCK + slot_in_block.  Iteration stops as
+ * soon as @filldir returns non-zero.  Always returns 0.
+ */
+static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+ unsigned long pos = file->f_pos;
+ struct inode *inode = file_inode(file);
+ unsigned long npages = dir_blocks(inode);
+ unsigned char *types = NULL;
+ unsigned int bit_pos = 0, start_bit_pos = 0;
+ int over = 0;
+ struct f2fs_dentry_block *dentry_blk = NULL;
+ struct f2fs_dir_entry *de = NULL;
+ struct page *dentry_page = NULL;
+ unsigned int n = 0;
+ unsigned char d_type = DT_UNKNOWN;
+ int slots;
+
+ types = f2fs_filetype_table;
+ /* split the saved position into block number and slot */
+ bit_pos = (pos % NR_DENTRY_IN_BLOCK);
+ n = (pos / NR_DENTRY_IN_BLOCK);
+
+ for ( ; n < npages; n++) {
+ dentry_page = get_lock_data_page(inode, n);
+ /*
+ * NOTE(review): a skipped block leaves bit_pos and f_pos as they
+ * are, so a non-zero bit_pos carries over into the next block --
+ * confirm this is intended.
+ */
+ if (IS_ERR(dentry_page))
+ continue;
+
+ start_bit_pos = bit_pos;
+ dentry_blk = kmap(dentry_page);
+ while (bit_pos < NR_DENTRY_IN_BLOCK) {
+ d_type = DT_UNKNOWN;
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK,
+ bit_pos);
+ if (bit_pos >= NR_DENTRY_IN_BLOCK)
+ break;
+
+ de = &dentry_blk->dentry[bit_pos];
+ if (types && de->file_type < F2FS_FT_MAX)
+ d_type = types[de->file_type];
+
+ over = filldir(dirent,
+ dentry_blk->filename[bit_pos],
+ le16_to_cpu(de->name_len),
+ (n * NR_DENTRY_IN_BLOCK) + bit_pos,
+ le32_to_cpu(de->ino), d_type);
+ /* caller's buffer is full: remember where to resume */
+ if (over) {
+ file->f_pos += bit_pos - start_bit_pos;
+ goto success;
+ }
+ slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ bit_pos += slots;
+ }
+ /* block exhausted: continue at slot 0 of the next block */
+ bit_pos = 0;
+ file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK;
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ dentry_page = NULL;
+ }
+success:
+ /* release the block that was being read when filldir stopped us */
+ if (dentry_page && !IS_ERR(dentry_page)) {
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ }
+
+ return 0;
+}
+
+/*
+ * File operations for f2fs directories: f2fs_readdir() lists the entries;
+ * llseek and read use the generic helpers.
+ */
+const struct file_operations f2fs_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .readdir = f2fs_readdir,
+ .fsync = f2fs_sync_file,
+ .unlocked_ioctl = f2fs_ioctl,
+};
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
new file mode 100644
index 000000000000..cc2213afdcc7
--- /dev/null
+++ b/fs/f2fs/f2fs.h
@@ -0,0 +1,1113 @@
+/*
+ * fs/f2fs/f2fs.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _LINUX_F2FS_H
+#define _LINUX_F2FS_H
+
+#include <linux/types.h>
+#include <linux/page-flags.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include <linux/magic.h>
+
+/*
+ * For mount options
+ *
+ * Each F2FS_MOUNT_* value is a single bit in f2fs_mount_info.opt.
+ */
+#define F2FS_MOUNT_BG_GC		0x00000001
+#define F2FS_MOUNT_DISABLE_ROLL_FORWARD	0x00000002
+#define F2FS_MOUNT_DISCARD		0x00000004
+#define F2FS_MOUNT_NOHEAP		0x00000008
+#define F2FS_MOUNT_XATTR_USER		0x00000010
+#define F2FS_MOUNT_POSIX_ACL		0x00000020
+#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY	0x00000040
+
+/*
+ * Clear, set or test one mount option bit.  @sbi is any expression yielding
+ * a pointer to an object with a mount_opt.opt member; @option is the
+ * F2FS_MOUNT_ suffix (e.g. BG_GC).  @sbi is parenthesized so that arguments
+ * such as "&info" or "a + 1" expand correctly.
+ */
+#define clear_opt(sbi, option)	((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
+#define set_opt(sbi, option)	((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
+#define test_opt(sbi, option)	((sbi)->mount_opt.opt & F2FS_MOUNT_##option)
+
+/*
+ * True when version @a is later than version @b.  Both arguments must be
+ * unsigned long long (enforced by typecheck()); the difference is examined
+ * as a signed value, so the comparison tolerates counter wraparound.
+ */
+#define ver_after(a, b) (typecheck(unsigned long long, a) && \
+ typecheck(unsigned long long, b) && \
+ ((long long)((a) - (b)) > 0))
+
+/* used in this header for block addresses and for block counts */
+typedef u64 block_t;
+/* node id */
+typedef u32 nid_t;
+
+/* mount options, kept as a bitmask of F2FS_MOUNT_* values */
+struct f2fs_mount_info {
+ unsigned int opt;
+};
+
+/* CRC32 (little-endian variant) of @len bytes at @buff, seeded with F2FS_SUPER_MAGIC. */
+static inline __u32 f2fs_crc32(void *buff, size_t len)
+{
+	__u32 crc = crc32_le(F2FS_SUPER_MAGIC, buff, len);
+
+	return crc;
+}
+
+/* Return true when the CRC computed over @buff matches the stored @blk_crc. */
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
+{
+	__u32 computed = f2fs_crc32(buff, buff_size);
+
+	return computed == blk_crc;
+}
+
+/*
+ * For checkpoint manager
+ */
+/* selects which version bitmap __bitmap_size()/__bitmap_ptr() refer to */
+enum {
+ NAT_BITMAP,
+ SIT_BITMAP
+};
+
+/* for the list of orphan inodes */
+struct orphan_inode_entry {
+ struct list_head list; /* list head */
+ nid_t ino; /* inode number */
+};
+
+/* for the list of directory inodes */
+struct dir_inode_entry {
+ struct list_head list; /* list head */
+ struct inode *inode; /* vfs inode pointer */
+};
+
+/* for the list of fsync inodes, used only during recovery */
+struct fsync_inode_entry {
+ struct list_head list; /* list head */
+ struct inode *inode; /* vfs inode pointer */
+ block_t blkaddr; /* block address locating the last inode */
+};
+
+/*
+ * Accessors for the NAT/SIT journal held in a summary block.  @sum is a
+ * pointer expression; it is parenthesized so arguments such as "&blk"
+ * expand correctly.
+ */
+/* number of journalled NAT / SIT entries, converted to CPU byte order */
+#define nats_in_cursum(sum)		(le16_to_cpu((sum)->n_nats))
+#define sits_in_cursum(sum)		(le16_to_cpu((sum)->n_sits))
+
+/* i-th journal slot: the entry itself and its key (nid or segment number) */
+#define nat_in_journal(sum, i)		((sum)->nat_j.entries[i].ne)
+#define nid_in_journal(sum, i)		((sum)->nat_j.entries[i].nid)
+#define sit_in_journal(sum, i)		((sum)->sit_j.entries[i].se)
+#define segno_in_journal(sum, i)	((sum)->sit_j.entries[i].segno)
+
+/*
+ * Add @i to the journalled NAT entry count of @rs and return the count as it
+ * was before the update.
+ */
+static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
+{
+	int old_cnt = nats_in_cursum(rs);
+
+	rs->n_nats = cpu_to_le16(old_cnt + i);
+
+	return old_cnt;
+}
+
+/*
+ * Add @i to the journalled SIT entry count of @rs and return the count as it
+ * was before the update.
+ */
+static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
+{
+	int old_cnt = sits_in_cursum(rs);
+
+	rs->n_sits = cpu_to_le16(old_cnt + i);
+
+	return old_cnt;
+}
+
+/*
+ * ioctl commands
+ *
+ * f2fs reuses the generic FS_IOC_* flag ioctl numbers under its own names.
+ */
+#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
+
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+/*
+ * ioctl commands in 32 bit emulation (only defined for CONFIG_COMPAT kernels)
+ */
+#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#endif
+
+/*
+ * For INODE and NODE manager
+ */
+#define XATTR_NODE_OFFSET (-1) /*
+ * xattrs are stored in one node block per
+ * file; that block uses -1 as its node
+ * offset to distinguish it from index
+ * node blocks.
+ */
+#define RDONLY_NODE 1 /*
+ * read-only mode when getting a node
+ * block; 0 means read-write mode.
+ * Used by get_dnode_of_data().
+ */
+#define F2FS_LINK_MAX 32000 /* maximum link count per file */
+
+/* for in-memory extent cache entry */
+struct extent_info {
+ rwlock_t ext_lock; /* rwlock guarding the three fields below */
+ unsigned int fofs; /* start offset in a file */
+ u32 blk_addr; /* start block address of the extent */
+ unsigned int len; /* length of the extent */
+};
+
+/*
+ * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
+ */
+#define FADVISE_COLD_BIT 0x01
+
+/*
+ * f2fs private per-inode data.  The VFS inode is embedded as vfs_inode, so
+ * F2FS_I() can recover this structure from a struct inode pointer.
+ */
+struct f2fs_inode_info {
+ struct inode vfs_inode; /* serve a vfs inode */
+ unsigned long i_flags; /* keep an inode flags for ioctl */
+ unsigned char i_advise; /* use to give file attribute hints */
+ unsigned int i_current_depth; /* use only in directory structure */
+ unsigned int i_pino; /* parent inode number */
+ umode_t i_acl_mode; /* keep file acl mode temporarily */
+
+ /* Use below internally in f2fs */
+ unsigned long flags; /* per-file FI_* flag bits, see set_inode_flag() */
+ unsigned long long data_version;/* latest version of data for fsync */
+ atomic_t dirty_dents; /* # of dirty dentry pages */
+ f2fs_hash_t chash; /* hash value of given file name */
+ unsigned int clevel; /* maximum level of given file name */
+ nid_t i_xattr_nid; /* node id that contains xattrs */
+ struct extent_info ext; /* in-memory extent cache entry */
+};
+
+/*
+ * Load the in-memory extent cache entry @ext from the little-endian raw
+ * extent @i_ext (passed by value).  The store happens under the write lock.
+ */
+static inline void get_extent_info(struct extent_info *ext,
+					struct f2fs_extent i_ext)
+{
+	unsigned int fofs = le32_to_cpu(i_ext.fofs);
+	u32 blk_addr = le32_to_cpu(i_ext.blk_addr);
+	unsigned int len = le32_to_cpu(i_ext.len);
+
+	write_lock(&ext->ext_lock);
+	ext->fofs = fofs;
+	ext->blk_addr = blk_addr;
+	ext->len = len;
+	write_unlock(&ext->ext_lock);
+}
+
+/*
+ * Store the in-memory extent cache entry @ext into the raw extent @i_ext,
+ * converting each field to little-endian.  @ext is read under its read lock.
+ */
+static inline void set_raw_extent(struct extent_info *ext,
+					struct f2fs_extent *i_ext)
+{
+	rwlock_t *lock = &ext->ext_lock;
+
+	read_lock(lock);
+	i_ext->fofs = cpu_to_le32(ext->fofs);
+	i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
+	i_ext->len = cpu_to_le32(ext->len);
+	read_unlock(lock);
+}
+
+/* node manager state, reached from a superblock through NM_I() */
+struct f2fs_nm_info {
+ block_t nat_blkaddr; /* base disk address of NAT */
+ nid_t max_nid; /* maximum possible node ids */
+ nid_t init_scan_nid; /* the first nid to be scanned */
+ nid_t next_scan_nid; /* the next nid to be scanned */
+
+ /* NAT cache management */
+ struct radix_tree_root nat_root;/* root of the nat entry cache */
+ rwlock_t nat_tree_lock; /* presumably protects nat_root - TODO confirm */
+ unsigned int nat_cnt; /* the # of cached nat entries */
+ struct list_head nat_entries; /* cached nat entry list (clean) */
+ struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
+
+ /* free node ids management */
+ struct list_head free_nid_list; /* a list for free nids */
+ spinlock_t free_nid_list_lock; /* protect free nid list */
+ unsigned int fcnt; /* the number of free node id */
+ struct mutex build_lock; /* lock for build free nids */
+
+ /* for checkpoint */
+ char *nat_bitmap; /* NAT bitmap pointer */
+ int bitmap_size; /* bitmap size */
+};
+
+/*
+ * This structure is used as a function parameter.
+ * All of its information is dedicated to a given direct node block,
+ * determined by the data offset in a file.
+ * Initialize it with set_new_dnode() and release its pages with
+ * f2fs_put_dnode().
+ */
+struct dnode_of_data {
+ struct inode *inode; /* vfs inode pointer */
+ struct page *inode_page; /* its inode page, NULL is possible */
+ struct page *node_page; /* cached direct node page */
+ nid_t nid; /* node id of the direct node block */
+ unsigned int ofs_in_node; /* data offset in the node page */
+ bool inode_page_locked; /* inode page is locked or not */
+ /* NOTE(review): callers compare this with NULL_ADDR before reserving a
+ * data block, so it looks like the data block's address - confirm */
+ block_t data_blkaddr; /* block address of the node block */
+};
+
+/*
+ * Reset @dn to all zeroes, then record the target inode, the optional inode
+ * and node pages (@ipage, @npage) and the node id @nid.
+ */
+static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
+		struct page *ipage, struct page *npage, nid_t nid)
+{
+	memset(dn, 0, sizeof(*dn));
+
+	dn->nid = nid;
+	dn->node_page = npage;
+	dn->inode_page = ipage;
+	dn->inode = inode;
+}
+
+/*
+ * For SIT manager
+ *
+ * By default, there are 6 active log areas across the whole main area.
+ * When considering hot and cold data separation to reduce cleaning overhead,
+ * we split 3 for data logs and 3 for node logs as hot, warm, and cold types,
+ * respectively.
+ * In the current design, you should not change the numbers intentionally.
+ * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6
+ * logs individually according to the underlying devices. (default: 6)
+ * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for
+ * data and 8 for node logs.
+ */
+#define NR_CURSEG_DATA_TYPE (3)
+#define NR_CURSEG_NODE_TYPE (3)
+#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
+
+/*
+ * Log types.  The first NR_CURSEG_TYPE enumerators are the three data logs
+ * followed by the three node logs; NO_CHECK_TYPE comes after them.
+ */
+enum {
+ CURSEG_HOT_DATA = 0, /* directory entry blocks */
+ CURSEG_WARM_DATA, /* data blocks */
+ CURSEG_COLD_DATA, /* multimedia or GCed data blocks */
+ CURSEG_HOT_NODE, /* direct node blocks of directory files */
+ CURSEG_WARM_NODE, /* direct node blocks of normal files */
+ CURSEG_COLD_NODE, /* indirect node blocks */
+ NO_CHECK_TYPE
+};
+
+/* segment manager state, reached from a superblock through SM_I() */
+struct f2fs_sm_info {
+ struct sit_info *sit_info; /* whole segment information */
+ struct free_segmap_info *free_info; /* free segment information */
+ struct dirty_seglist_info *dirty_info; /* dirty segment information */
+ struct curseg_info *curseg_array; /* active segment information */
+
+ struct list_head wblist_head; /* list of under-writeback pages */
+ spinlock_t wblist_lock; /* lock for checkpoint */
+
+ block_t seg0_blkaddr; /* block address of 0'th segment */
+ block_t main_blkaddr; /* start block address of main area */
+ block_t ssa_blkaddr; /* start block address of SSA area */
+
+ unsigned int segment_count; /* total # of segments */
+ unsigned int main_segments; /* # of segments in main area */
+ unsigned int reserved_segments; /* # of reserved segments */
+ unsigned int ovp_segments; /* # of overprovision segments */
+};
+
+/*
+ * For directory operation
+ *
+ * Five consecutive offsets starting right after ADDRS_PER_INODE.
+ * NOTE(review): judging by the names these stand for the two direct, two
+ * indirect and one double-indirect node slots of an inode - confirm against
+ * the users in node.c.
+ */
+#define NODE_DIR1_BLOCK (ADDRS_PER_INODE + 1)
+#define NODE_DIR2_BLOCK (ADDRS_PER_INODE + 2)
+#define NODE_IND1_BLOCK (ADDRS_PER_INODE + 3)
+#define NODE_IND2_BLOCK (ADDRS_PER_INODE + 4)
+#define NODE_DIND_BLOCK (ADDRS_PER_INODE + 5)
+
+/*
+ * For superblock
+ */
+/*
+ * COUNT_TYPE for monitoring
+ *
+ * f2fs monitors the number of several block types such as on-writeback,
+ * dirty dentry blocks, dirty node blocks, and dirty meta blocks.
+ * The values index f2fs_sb_info.nr_pages[]; see inc_page_count(),
+ * dec_page_count() and get_pages().
+ */
+enum count_type {
+ F2FS_WRITEBACK,
+ F2FS_DIRTY_DENTS,
+ F2FS_DIRTY_NODES,
+ F2FS_DIRTY_META,
+ NR_COUNT_TYPE,
+};
+
+/*
+ * FS_LOCK nesting subclasses for the lock validator:
+ *
+ * The locking order between these classes is
+ * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW
+ * -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC
+ *
+ * Each value indexes f2fs_sb_info.fs_lock[] and is also passed as the
+ * nesting subclass by mutex_lock_op().
+ *
+ * NOTE(review): the order documented above puts NODE_WRITE before NODE_NEW
+ * and NODE_TRUNC, but the enumerators below list NODE_WRITE last - confirm
+ * which of the two is intended.
+ */
+enum lock_type {
+ RENAME, /* for renaming operations */
+ DENTRY_OPS, /* for directory operations */
+ DATA_WRITE, /* for data write */
+ DATA_NEW, /* for data allocation */
+ DATA_TRUNC, /* for data truncate */
+ NODE_NEW, /* for node allocation */
+ NODE_TRUNC, /* for node truncate */
+ NODE_WRITE, /* for node write */
+ NR_LOCK_TYPE,
+};
+
+/*
+ * The below are the page types of bios used in submit_bio().
+ * The available types are:
+ * DATA User data pages. It operates as async mode.
+ * NODE Node pages. It operates as async mode.
+ * META FS metadata pages such as SIT, NAT, CP.
+ * NR_PAGE_TYPE The number of page types.
+ * META_FLUSH Make sure the previous pages are written
+ * with waiting the bio's completion
+ * ... Only can be used with META.
+ *
+ * META_FLUSH is placed after NR_PAGE_TYPE, so it is not a valid index into
+ * the per-type arrays sized by NR_PAGE_TYPE.
+ */
+enum page_type {
+ DATA,
+ NODE,
+ META,
+ NR_PAGE_TYPE,
+ META_FLUSH,
+};
+
+/*
+ * f2fs private per-superblock data.  F2FS_SB() returns it from the VFS
+ * super block's s_fs_info pointer.
+ */
+struct f2fs_sb_info {
+ struct super_block *sb; /* pointer to VFS super block */
+ struct buffer_head *raw_super_buf; /* buffer head of raw sb */
+ struct f2fs_super_block *raw_super; /* raw super block pointer */
+ int s_dirty; /* dirty flag for checkpoint */
+
+ /* for node-related operations */
+ struct f2fs_nm_info *nm_info; /* node manager */
+ struct inode *node_inode; /* cache node blocks */
+
+ /* for segment-related operations */
+ struct f2fs_sm_info *sm_info; /* segment manager */
+ struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */
+ sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */
+ struct rw_semaphore bio_sem; /* IO semaphore */
+
+ /* for checkpoint */
+ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
+ struct inode *meta_inode; /* cache meta blocks */
+ struct mutex cp_mutex; /* for checkpoint procedure */
+ struct mutex fs_lock[NR_LOCK_TYPE]; /* for blocking FS operations */
+ struct mutex write_inode; /* mutex for write inode */
+ struct mutex writepages; /* mutex for writepages() */
+ int por_doing; /* recovery is doing or not */
+
+ /* for orphan inode management */
+ struct list_head orphan_inode_list; /* orphan inode list */
+ struct mutex orphan_inode_mutex; /* for orphan inode list */
+ unsigned int n_orphans; /* # of orphan inodes */
+
+ /* for directory inode management */
+ struct list_head dir_inode_list; /* dir inode list */
+ spinlock_t dir_inode_lock; /* for dir inode list lock */
+ unsigned int n_dirty_dirs; /* # of dir inodes */
+
+ /* basic file system units */
+ unsigned int log_sectors_per_block; /* log2 sectors per block */
+ unsigned int log_blocksize; /* log2 block size */
+ unsigned int blocksize; /* block size */
+ unsigned int root_ino_num; /* root inode number */
+ unsigned int node_ino_num; /* node inode number */
+ unsigned int meta_ino_num; /* meta inode number */
+ unsigned int log_blocks_per_seg; /* log2 blocks per segment */
+ unsigned int blocks_per_seg; /* blocks per segment */
+ unsigned int segs_per_sec; /* segments per section */
+ unsigned int secs_per_zone; /* sections per zone */
+ unsigned int total_sections; /* total section count */
+ unsigned int total_node_count; /* total node block count */
+ unsigned int total_valid_node_count; /* valid node block count */
+ unsigned int total_valid_inode_count; /* valid inode count */
+ int active_logs; /* # of active logs */
+
+ /* the counters below are updated under stat_lock by the inline helpers */
+ block_t user_block_count; /* # of user blocks */
+ block_t total_valid_block_count; /* # of valid blocks */
+ block_t alloc_valid_block_count; /* # of allocated blocks */
+ block_t last_valid_block_count; /* for recovery */
+ u32 s_next_generation; /* for NFS support */
+ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */
+
+ struct f2fs_mount_info mount_opt; /* mount options */
+
+ /* for cleaning operations */
+ struct mutex gc_mutex; /* mutex for GC */
+ struct f2fs_gc_kthread *gc_thread; /* GC thread */
+
+ /*
+ * for stat information.
+ * one is for the LFS mode, and the other is for the SSR mode.
+ */
+ struct f2fs_stat_info *stat_info; /* FS status information */
+ unsigned int segment_count[2]; /* # of allocated segments */
+ unsigned int block_count[2]; /* # of allocated blocks */
+ unsigned int last_victim[2]; /* last victim segment # */
+ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
+ int bg_gc; /* background gc calls */
+ spinlock_t stat_lock; /* lock for stat operations */
+};
+
+/*
+ * Inline functions
+ */
+/* Map a VFS inode to the f2fs_inode_info that embeds it as vfs_inode. */
+static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
+{
+	struct f2fs_inode_info *fi;
+
+	fi = container_of(inode, struct f2fs_inode_info, vfs_inode);
+	return fi;
+}
+
+/* Return the f2fs private data stored in the VFS super block's s_fs_info. */
+static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
+{
+	struct f2fs_sb_info *sbi = sb->s_fs_info;
+
+	return sbi;
+}
+
+/* Return the raw super block pointer kept in @sbi. */
+static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_super_block *raw = sbi->raw_super;
+
+	return raw;
+}
+
+/* Return the raw checkpoint pointer kept in @sbi. */
+static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_checkpoint *cp = sbi->ckpt;
+
+	return cp;
+}
+
+/* Return the node manager of @sbi. */
+static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm = sbi->nm_info;
+
+	return nm;
+}
+
+/* Return the segment manager of @sbi. */
+static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_sm_info *sm = sbi->sm_info;
+
+	return sm;
+}
+
+/* Return the SIT information hanging off the segment manager of @sbi. */
+static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_sm_info *sm = SM_I(sbi);
+
+	return sm->sit_info;
+}
+
+/* Return the free segment map hanging off the segment manager of @sbi. */
+static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_sm_info *sm = SM_I(sbi);
+
+	return sm->free_info;
+}
+
+/* Return the dirty segment list info hanging off the segment manager of @sbi. */
+static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_sm_info *sm = SM_I(sbi);
+
+	return sm->dirty_info;
+}
+
+/* Raise the superblock dirty flag (s_dirty = 1). */
+static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
+{
+	const int dirty = 1;
+
+	sbi->s_dirty = dirty;
+}
+
+/* Drop the superblock dirty flag (s_dirty = 0). */
+static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
+{
+	const int clean = 0;
+
+	sbi->s_dirty = clean;
+}
+
+/* Return true when any bit of @f is set in the checkpoint's ckpt_flags. */
+static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+	unsigned int masked = le32_to_cpu(cp->ckpt_flags) & f;
+
+	return masked != 0;
+}
+
+/* Set the bits of @f in the checkpoint's little-endian ckpt_flags word. */
+static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+	unsigned int cur = le32_to_cpu(cp->ckpt_flags);
+
+	cp->ckpt_flags = cpu_to_le32(cur | f);
+}
+
+/* Clear the bits of @f in the checkpoint's little-endian ckpt_flags word. */
+static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+	unsigned int cur = le32_to_cpu(cp->ckpt_flags);
+
+	cp->ckpt_flags = cpu_to_le32(cur & ~f);
+}
+
+/*
+ * Take the per-operation mutex fs_lock[@t]; @t is also passed as the
+ * nesting subclass to mutex_lock_nested().
+ */
+static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+{
+	struct mutex *lock = &sbi->fs_lock[t];
+
+	mutex_lock_nested(lock, t);
+}
+
+/* Release the per-operation mutex fs_lock[@t]. */
+static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+{
+	struct mutex *lock = &sbi->fs_lock[t];
+
+	mutex_unlock(lock);
+}
+
+/*
+ * Sanity check: trigger BUG() when @nid is not below the node manager's
+ * max_nid.
+ */
+static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	nid_t limit = NM_I(sbi)->max_nid;
+
+	BUG_ON(nid >= limit);
+}
+
+#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1
+
+/*
+ * Return non-zero when the inode's i_blocks exceeds its baseline:
+ * F2FS_DEFAULT_ALLOCATED_BLOCKS, plus one more when the inode has an xattr
+ * node (i_xattr_nid is non-zero).
+ */
+static inline int F2FS_HAS_BLOCKS(struct inode *inode)
+{
+	int baseline = F2FS_DEFAULT_ALLOCATED_BLOCKS;
+
+	if (F2FS_I(inode)->i_xattr_nid)
+		baseline = F2FS_DEFAULT_ALLOCATED_BLOCKS + 1;
+
+	return (inode->i_blocks > baseline);
+}
+
+/*
+ * Try to account @count more valid blocks to @inode and to the filesystem.
+ * Returns false, changing nothing, when the new total would exceed
+ * user_block_count; otherwise updates i_blocks and the valid/allocated
+ * block counters and returns true.  All of it runs under stat_lock.
+ */
+static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
+				struct inode *inode, blkcnt_t count)
+{
+	block_t new_total;
+	bool fits;
+
+	spin_lock(&sbi->stat_lock);
+	new_total = sbi->total_valid_block_count + (block_t)count;
+	fits = !(new_total > sbi->user_block_count);
+	if (fits) {
+		inode->i_blocks += count;
+		sbi->total_valid_block_count = new_total;
+		sbi->alloc_valid_block_count += (block_t)count;
+	}
+	spin_unlock(&sbi->stat_lock);
+
+	return fits;
+}
+
+/*
+ * Remove @count valid blocks from @inode and from the filesystem total,
+ * under stat_lock.  Underflow of either counter triggers BUG().
+ * Always returns 0.
+ */
+static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
+						struct inode *inode,
+						blkcnt_t count)
+{
+	block_t nr = (block_t)count;
+
+	spin_lock(&sbi->stat_lock);
+	BUG_ON(sbi->total_valid_block_count < nr);
+	BUG_ON(inode->i_blocks < count);
+	inode->i_blocks -= count;
+	sbi->total_valid_block_count -= nr;
+	spin_unlock(&sbi->stat_lock);
+
+	return 0;
+}
+
+/* Bump the nr_pages[@count_type] counter and mark the superblock dirty. */
+static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
+{
+	atomic_t *counter = &sbi->nr_pages[count_type];
+
+	atomic_inc(counter);
+	F2FS_SET_SB_DIRT(sbi);
+}
+
+/* Increment the inode's count of dirty dentry pages. */
+static inline void inode_inc_dirty_dents(struct inode *inode)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+
+	atomic_inc(&fi->dirty_dents);
+}
+
+/* Decrement the nr_pages[@count_type] counter. */
+static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
+{
+	atomic_t *counter = &sbi->nr_pages[count_type];
+
+	atomic_dec(counter);
+}
+
+/* Decrement the inode's count of dirty dentry pages. */
+static inline void inode_dec_dirty_dents(struct inode *inode)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+
+	atomic_dec(&fi->dirty_dents);
+}
+
+/* Read the current value of the nr_pages[@count_type] counter. */
+static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
+{
+	atomic_t *counter = &sbi->nr_pages[count_type];
+
+	return atomic_read(counter);
+}
+
+/*
+ * Convert the nr_pages[@block_type] counter into a number of sections:
+ * the page count is padded by (pages per section - 1), shifted down to
+ * segments and then divided by the segments per section.
+ */
+static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
+{
+	unsigned int pages_per_sec = sbi->segs_per_sec *
+					(1 << sbi->log_blocks_per_seg);
+	unsigned int padded = get_pages(sbi, block_type) + pages_per_sec - 1;
+	unsigned int segs = padded >> sbi->log_blocks_per_seg;
+
+	return segs / sbi->segs_per_sec;
+}
+
+/* Read total_valid_block_count under stat_lock. */
+static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
+{
+	block_t snapshot;
+
+	spin_lock(&sbi->stat_lock);
+	snapshot = sbi->total_valid_block_count;
+	spin_unlock(&sbi->stat_lock);
+
+	return snapshot;
+}
+
+/*
+ * Return the byte size of the NAT or SIT version bitmap recorded in the
+ * checkpoint, selected by @flag (NAT_BITMAP or SIT_BITMAP); 0 for any
+ * other @flag.
+ */
+static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+
+	switch (flag) {
+	case NAT_BITMAP:
+		return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize);
+	case SIT_BITMAP:
+		return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Return a pointer into the checkpoint's sit_nat_version_bitmap area.
+ * For NAT_BITMAP the pointer is advanced by sit_ver_bitmap_bytesize; for
+ * any other @flag it is the start of the area.
+ */
+static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	int offset = 0;
+
+	if (flag == NAT_BITMAP)
+		offset = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
+
+	return &ckpt->sit_nat_version_bitmap + offset;
+}
+
+/*
+ * Return the start block address of the current checkpoint pack.
+ *
+ * The base is the super block's cp_blkaddr.  A checkpoint with an odd
+ * version starts there; one with an even version starts blocks_per_seg
+ * blocks later.
+ */
+static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	unsigned long long version = le64_to_cpu(ckpt->checkpoint_ver);
+	block_t addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+
+	if ((version & 1) == 0)
+		addr += sbi->blocks_per_seg;
+
+	return addr;
+}
+
+/* Return the checkpoint's cp_pack_start_sum field in CPU byte order. */
+static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+
+	return le32_to_cpu(ckpt->cp_pack_start_sum);
+}
+
+/*
+ * Try to account @count more valid node blocks.
+ *
+ * @inode may be NULL; when given, its i_blocks is increased as well.
+ * Returns false when either the valid block total would exceed
+ * user_block_count or the valid node total would exceed total_node_count.
+ * On failure no counter is modified; in particular alloc_valid_block_count
+ * is only bumped once both limit checks have passed.
+ * Runs under stat_lock.
+ */
+static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
+						struct inode *inode,
+						unsigned int count)
+{
+	block_t new_blocks;
+	unsigned int new_nodes;
+
+	spin_lock(&sbi->stat_lock);
+
+	new_blocks = sbi->total_valid_block_count + (block_t)count;
+	new_nodes = sbi->total_valid_node_count + count;
+
+	if (new_blocks > sbi->user_block_count ||
+			new_nodes > sbi->total_node_count) {
+		spin_unlock(&sbi->stat_lock);
+		return false;
+	}
+
+	if (inode)
+		inode->i_blocks += count;
+	sbi->alloc_valid_block_count += (block_t)count;
+	sbi->total_valid_node_count = new_nodes;
+	sbi->total_valid_block_count = new_blocks;
+	spin_unlock(&sbi->stat_lock);
+
+	return true;
+}
+
+/*
+ * Remove @count valid node blocks from @inode and from the filesystem's
+ * node and block totals, under stat_lock.  Underflow of any of the three
+ * counters triggers BUG().
+ */
+static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
+						struct inode *inode,
+						unsigned int count)
+{
+	spinlock_t *lock = &sbi->stat_lock;
+
+	spin_lock(lock);
+
+	BUG_ON(sbi->total_valid_block_count < count);
+	BUG_ON(sbi->total_valid_node_count < count);
+	BUG_ON(inode->i_blocks < count);
+
+	inode->i_blocks -= count;
+	sbi->total_valid_node_count -= count;
+	sbi->total_valid_block_count -= (block_t)count;
+
+	spin_unlock(lock);
+}
+
+/* Read total_valid_node_count under stat_lock. */
+static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
+{
+	unsigned int snapshot;
+
+	spin_lock(&sbi->stat_lock);
+	snapshot = sbi->total_valid_node_count;
+	spin_unlock(&sbi->stat_lock);
+
+	return snapshot;
+}
+
+/*
+ * Increment total_valid_inode_count under stat_lock.  Triggers BUG() when
+ * the count already equals total_node_count.
+ */
+static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
+{
+	spinlock_t *lock = &sbi->stat_lock;
+
+	spin_lock(lock);
+	BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count);
+	sbi->total_valid_inode_count += 1;
+	spin_unlock(lock);
+}
+
+/*
+ * Decrement total_valid_inode_count under stat_lock.  Triggers BUG() when
+ * the count is already zero.  Always returns 0.
+ */
+static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi)
+{
+	spinlock_t *lock = &sbi->stat_lock;
+
+	spin_lock(lock);
+	BUG_ON(!sbi->total_valid_inode_count);
+	sbi->total_valid_inode_count -= 1;
+	spin_unlock(lock);
+
+	return 0;
+}
+
+/* Read total_valid_inode_count under stat_lock. */
+static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
+{
+	unsigned int snapshot;
+
+	spin_lock(&sbi->stat_lock);
+	snapshot = sbi->total_valid_inode_count;
+	spin_unlock(&sbi->stat_lock);
+
+	return snapshot;
+}
+
+/*
+ * Drop a page cache reference on @page; NULL and error pointers are
+ * ignored.  When @unlock is non-zero the page must currently be locked
+ * (checked with BUG_ON) and is unlocked before the reference is released.
+ */
+static inline void f2fs_put_page(struct page *page, int unlock)
+{
+	if (page && !IS_ERR(page)) {
+		if (unlock) {
+			BUG_ON(!PageLocked(page));
+			unlock_page(page);
+		}
+		page_cache_release(page);
+	}
+}
+
+/*
+ * Release the pages held by @dn and clear both pointers.  The node page is
+ * unlocked and put; the inode page is put without unlocking, and only when
+ * it is a different page from the node page.
+ */
+static inline void f2fs_put_dnode(struct dnode_of_data *dn)
+{
+	struct page *npage = dn->node_page;
+	struct page *ipage = dn->inode_page;
+
+	if (npage)
+		f2fs_put_page(npage, 1);
+	if (ipage && npage != ipage)
+		f2fs_put_page(ipage, 0);
+
+	dn->node_page = NULL;
+	dn->inode_page = NULL;
+}
+
+/*
+ * Create a slab cache called @name for objects of @size bytes, with
+ * alignment 0, the SLAB_RECLAIM_ACCOUNT flag and the optional
+ * constructor @ctor.
+ */
+static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
+					size_t size, void (*ctor)(void *))
+{
+	struct kmem_cache *cache;
+
+	cache = kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
+	return cache;
+}
+
+/* a raw node is an inode when its footer's nid and ino are equal */
+#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino)
+
+/* Apply RAW_IS_INODE() to the node stored in @page. */
+static inline bool IS_INODE(struct page *page)
+{
+	struct f2fs_node *raw = (struct f2fs_node *)page_address(page);
+
+	return RAW_IS_INODE(raw);
+}
+
+/*
+ * Return the block address array of @node: i.i_addr for an inode,
+ * dn.addr otherwise.
+ */
+static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
+{
+	if (RAW_IS_INODE(node))
+		return node->i.i_addr;
+
+	return node->dn.addr;
+}
+
+/*
+ * Return, in CPU byte order, entry @offset of the block address array of
+ * the node stored in @node_page.
+ */
+static inline block_t datablock_addr(struct page *node_page,
+		unsigned int offset)
+{
+	struct f2fs_node *raw = (struct f2fs_node *)page_address(node_page);
+	__le32 *addrs = blkaddr_in_node(raw);
+
+	return le32_to_cpu(addrs[offset]);
+}
+
+/*
+ * Test bit @nr of the bitmap at @addr.  Bits are numbered from the most
+ * significant bit of each byte (bit 0 is mask 0x80 of byte 0).
+ * Returns the masked byte value: non-zero when the bit is set.
+ */
+static inline int f2fs_test_bit(unsigned int nr, char *addr)
+{
+	char *byte = addr + (nr >> 3);
+	int mask = 1 << (7 - (nr & 0x07));
+
+	return mask & *byte;
+}
+
+/*
+ * Set bit @nr of the bitmap at @addr (most significant bit of each byte
+ * first) and return the bit's previous state as the masked byte value.
+ */
+static inline int f2fs_set_bit(unsigned int nr, char *addr)
+{
+	char *byte = addr + (nr >> 3);
+	int mask = 1 << (7 - (nr & 0x07));
+	int was_set = mask & *byte;
+
+	*byte |= mask;
+
+	return was_set;
+}
+
+/*
+ * Clear bit @nr of the bitmap at @addr (most significant bit of each byte
+ * first) and return the bit's previous state as the masked byte value.
+ */
+static inline int f2fs_clear_bit(unsigned int nr, char *addr)
+{
+	char *byte = addr + (nr >> 3);
+	int mask = 1 << (7 - (nr & 0x07));
+	int was_set = mask & *byte;
+
+	*byte &= ~mask;
+
+	return was_set;
+}
+
+/*
+ * used for f2fs_inode_info->flags
+ *
+ * Each value is a bit number, not a mask: it is passed to set_bit(),
+ * clear_bit() and test_bit() by the *_inode_flag() helpers.
+ */
+enum {
+ FI_NEW_INODE, /* indicate newly allocated inode */
+ FI_NEED_CP, /* need to do checkpoint during fsync */
+ FI_INC_LINK, /* need to increment i_nlink */
+ FI_ACL_MODE, /* indicate acl mode */
+ FI_NO_ALLOC, /* should not allocate any blocks */
+};
+
+/* Set bit number @flag in fi->flags. */
+static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+	unsigned long *bits = &fi->flags;
+
+	set_bit(flag, bits);
+}
+
+/* Return the result of test_bit() for bit number @flag in fi->flags. */
+static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
+{
+	unsigned long *bits = &fi->flags;
+
+	return test_bit(flag, bits);
+}
+
+/* Clear bit number @flag in fi->flags. */
+static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+	unsigned long *bits = &fi->flags;
+
+	clear_bit(flag, bits);
+}
+
+/* Store @mode in fi->i_acl_mode, then raise the FI_ACL_MODE flag. */
+static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
+{
+	umode_t acl_mode = mode;
+
+	fi->i_acl_mode = acl_mode;
+	set_inode_flag(fi, FI_ACL_MODE);
+}
+
+/*
+ * Clear bit number @flag in fi->flags if it is currently set.
+ *
+ * Returns 1 when the flag was set (and has now been cleared), 0 otherwise.
+ * The flag named by the caller is honoured; callers passing FI_ACL_MODE see
+ * exactly the previous behaviour.
+ */
+static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+	if (!is_inode_flag_set(fi, flag))
+		return 0;
+
+	clear_inode_flag(fi, flag);
+	return 1;
+}
+
+/*
+ * file.c
+ */
+int f2fs_sync_file(struct file *, loff_t, loff_t, int);
+void truncate_data_blocks(struct dnode_of_data *);
+void f2fs_truncate(struct inode *);
+int f2fs_setattr(struct dentry *, struct iattr *);
+int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+long f2fs_ioctl(struct file *, unsigned int, unsigned long);
+long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/*
+ * inode.c
+ */
+void f2fs_set_inode_flags(struct inode *);
+struct inode *f2fs_iget(struct super_block *, unsigned long);
+void update_inode(struct inode *, struct page *);
+int f2fs_write_inode(struct inode *, struct writeback_control *);
+void f2fs_evict_inode(struct inode *);
+
+/*
+ * namei.c
+ */
+struct dentry *f2fs_get_parent(struct dentry *child);
+
+/*
+ * dir.c
+ */
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *,
+ struct page **);
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
+ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
+void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
+ struct page *, struct inode *);
+void init_dent_inode(const struct qstr *, struct page *);
+int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
+void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
+int f2fs_make_empty(struct inode *, struct inode *);
+bool f2fs_empty_dir(struct inode *);
+
+/*
+ * Link @inode under the name of @dentry in the directory that is the inode
+ * of @dentry's parent; thin wrapper around __f2fs_add_link().
+ */
+static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+
+	return __f2fs_add_link(dir, &dentry->d_name, inode);
+}
+
+/*
+ * super.c
+ */
+int f2fs_sync_fs(struct super_block *, int);
+extern __printf(3, 4)
+void f2fs_msg(struct super_block *, const char *, const char *, ...);
+
+/*
+ * hash.c
+ */
+f2fs_hash_t f2fs_dentry_hash(const char *, size_t);
+
+/*
+ * node.c
+ */
+struct dnode_of_data;
+struct node_info;
+
+int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
+int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
+int truncate_inode_blocks(struct inode *, pgoff_t);
+int remove_inode_page(struct inode *);
+int new_inode_page(struct inode *, const struct qstr *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int);
+void ra_node_page(struct f2fs_sb_info *, nid_t);
+struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_node_page_ra(struct page *, int);
+void sync_inode_page(struct dnode_of_data *);
+int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
+bool alloc_nid(struct f2fs_sb_info *, nid_t *);
+void alloc_nid_done(struct f2fs_sb_info *, nid_t);
+void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
+void recover_node_page(struct f2fs_sb_info *, struct page *,
+ struct f2fs_summary *, struct node_info *, block_t);
+int recover_inode_page(struct f2fs_sb_info *, struct page *);
+int restore_node_summary(struct f2fs_sb_info *, unsigned int,
+ struct f2fs_summary_block *);
+void flush_nat_entries(struct f2fs_sb_info *);
+int build_node_manager(struct f2fs_sb_info *);
+void destroy_node_manager(struct f2fs_sb_info *);
+int __init create_node_manager_caches(void);
+void destroy_node_manager_caches(void);
+
+/*
+ * segment.c
+ */
+void f2fs_balance_fs(struct f2fs_sb_info *);
+void invalidate_blocks(struct f2fs_sb_info *, block_t);
+void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
+void clear_prefree_segments(struct f2fs_sb_info *);
+int npages_for_summary_flush(struct f2fs_sb_info *);
+void allocate_new_segments(struct f2fs_sb_info *);
+struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
+struct bio *f2fs_bio_alloc(struct block_device *, int);
+void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync);
+void write_meta_page(struct f2fs_sb_info *, struct page *);
+void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
+ block_t, block_t *);
+void write_data_page(struct inode *, struct page *, struct dnode_of_data*,
+ block_t, block_t *);
+void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t);
+void recover_data_page(struct f2fs_sb_info *, struct page *,
+ struct f2fs_summary *, block_t, block_t);
+void rewrite_node_page(struct f2fs_sb_info *, struct page *,
+ struct f2fs_summary *, block_t, block_t);
+void write_data_summaries(struct f2fs_sb_info *, block_t);
+void write_node_summaries(struct f2fs_sb_info *, block_t);
+int lookup_journal_in_cursum(struct f2fs_summary_block *,
+ int, unsigned int, int);
+void flush_sit_entries(struct f2fs_sb_info *);
+int build_segment_manager(struct f2fs_sb_info *);
+void reset_victim_segmap(struct f2fs_sb_info *);
+void destroy_segment_manager(struct f2fs_sb_info *);
+
+/*
+ * checkpoint.c
+ */
+struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
+int check_orphan_space(struct f2fs_sb_info *);
+void add_orphan_inode(struct f2fs_sb_info *, nid_t);
+void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
+int recover_orphan_inodes(struct f2fs_sb_info *);
+int get_valid_checkpoint(struct f2fs_sb_info *);
+void set_dirty_dir_page(struct inode *, struct page *);
+void remove_dirty_dir_inode(struct inode *);
+void sync_dirty_dir_inodes(struct f2fs_sb_info *);
+void write_checkpoint(struct f2fs_sb_info *, bool);
+void init_orphan_info(struct f2fs_sb_info *);
+int __init create_checkpoint_caches(void);
+void destroy_checkpoint_caches(void);
+
+/*
+ * data.c
+ */
+int reserve_new_block(struct dnode_of_data *);
+void update_extent_cache(block_t, struct dnode_of_data *);
+struct page *find_data_page(struct inode *, pgoff_t);
+struct page *get_lock_data_page(struct inode *, pgoff_t);
+struct page *get_new_data_page(struct inode *, pgoff_t, bool);
+int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
+int do_write_data_page(struct page *);
+
+/*
+ * gc.c
+ */
+int start_gc_thread(struct f2fs_sb_info *);
+void stop_gc_thread(struct f2fs_sb_info *);
+block_t start_bidx_of_node(unsigned int);
+int f2fs_gc(struct f2fs_sb_info *);
+void build_gc_manager(struct f2fs_sb_info *);
+int __init create_gc_caches(void);
+void destroy_gc_caches(void);
+
+/*
+ * recovery.c
+ */
+void recover_fsync_data(struct f2fs_sb_info *);
+bool space_for_roll_forward(struct f2fs_sb_info *);
+
+/*
+ * debug.c
+ */
+#ifdef CONFIG_F2FS_STAT_FS
+/*
+ * Statistics record for one mounted filesystem; only defined when
+ * CONFIG_F2FS_STAT_FS is enabled.  f2fs_sb_info.stat_info points at it and
+ * the stat_inc_*() macros below update its counters.
+ */
+struct f2fs_stat_info {
+ struct list_head stat_list;
+ struct f2fs_sb_info *sbi;
+ struct mutex stat_lock;
+ int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
+ int main_area_segs, main_area_sections, main_area_zones;
+ int hit_ext, total_ext;
+ int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
+ int nats, sits, fnids;
+ int total_count, utilization;
+ int bg_gc;
+ unsigned int valid_count, valid_node_count, valid_inode_count;
+ unsigned int bimodal, avg_vblocks;
+ int util_free, util_valid, util_invalid;
+ int rsvd_segs, overp_segs;
+ int dirty_count, node_pages, meta_pages;
+ int prefree_count, call_count;
+ /* updated by stat_inc_seg_count() */
+ int tot_segs, node_segs, data_segs, free_segs, free_secs;
+ /* updated by stat_inc_{tot,data,node}_blk_count() */
+ int tot_blks, data_blks, node_blks;
+ int curseg[NR_CURSEG_TYPE];
+ int cursec[NR_CURSEG_TYPE];
+ int curzone[NR_CURSEG_TYPE];
+
+ unsigned int segment_count[2];
+ unsigned int block_count[2];
+ unsigned base_mem, cache_mem;
+};
+
+/*
+ * Statistics update helpers.  Macro arguments are parenthesized so that
+ * pointer expressions such as "&obj" expand correctly.
+ */
+
+/* count one more call in @si (struct f2fs_stat_info *) */
+#define stat_inc_call_count(si)	((si)->call_count++)
+
+/*
+ * count one more segment in sbi->stat_info: always in tot_segs, and in
+ * data_segs when @type is SUM_TYPE_DATA, node_segs otherwise
+ */
+#define stat_inc_seg_count(sbi, type)					\
+	do {								\
+		struct f2fs_stat_info *si = (sbi)->stat_info;		\
+		(si)->tot_segs++;					\
+		if ((type) == SUM_TYPE_DATA)				\
+			si->data_segs++;				\
+		else							\
+			si->node_segs++;				\
+	} while (0)
+
+/* add @blks to the total block counter of @si */
+#define stat_inc_tot_blk_count(si, blks)				\
+	((si)->tot_blks += (blks))
+
+/* add @blks to both the total and the data block counters */
+#define stat_inc_data_blk_count(sbi, blks)				\
+	do {								\
+		struct f2fs_stat_info *si = (sbi)->stat_info;		\
+		stat_inc_tot_blk_count(si, blks);			\
+		si->data_blks += (blks);				\
+	} while (0)
+
+/* add @blks to both the total and the node block counters */
+#define stat_inc_node_blk_count(sbi, blks)				\
+	do {								\
+		struct f2fs_stat_info *si = (sbi)->stat_info;		\
+		stat_inc_tot_blk_count(si, blks);			\
+		si->node_blks += (blks);				\
+	} while (0)
+
+int f2fs_build_stats(struct f2fs_sb_info *);
+void f2fs_destroy_stats(struct f2fs_sb_info *);
+void __init f2fs_create_root_stats(void);
+void f2fs_destroy_root_stats(void);
+#else
+/*
+ * CONFIG_F2FS_STAT_FS is not set: the stat_inc_*() macros expand to
+ * nothing and the build/destroy entry points are no-op stubs
+ * (f2fs_build_stats() reports success by returning 0).
+ */
+#define stat_inc_call_count(si)
+#define stat_inc_seg_count(si, type)
+#define stat_inc_tot_blk_count(si, blks)
+#define stat_inc_data_blk_count(si, blks)
+#define stat_inc_node_blk_count(sbi, blks)
+
+static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
+static inline void __init f2fs_create_root_stats(void) { }
+static inline void f2fs_destroy_root_stats(void) { }
+#endif
+
+extern const struct file_operations f2fs_dir_operations;
+extern const struct file_operations f2fs_file_operations;
+extern const struct inode_operations f2fs_file_inode_operations;
+extern const struct address_space_operations f2fs_dblock_aops;
+extern const struct address_space_operations f2fs_node_aops;
+extern const struct address_space_operations f2fs_meta_aops;
+extern const struct inode_operations f2fs_dir_inode_operations;
+extern const struct inode_operations f2fs_symlink_inode_operations;
+extern const struct inode_operations f2fs_special_inode_operations;
+#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
new file mode 100644
index 000000000000..958a46da19ae
--- /dev/null
+++ b/fs/f2fs/file.c
@@ -0,0 +1,671 @@
+/*
+ * fs/f2fs/file.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/stat.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/types.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+#include <linux/mount.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * ->page_mkwrite handler: a read-only mapped page is about to become
+ * writable.
+ *
+ * First makes sure a block is reserved for the page (a new one is reserved
+ * when the dnode still records NULL_ADDR), then re-validates the page under
+ * the page lock, zeroes the part of the page beyond EOF and dirties it.
+ *
+ * Returns the VM_FAULT_* code that block_page_mkwrite_return() derives from
+ * the internal errno.  On success (including the "already mapped to disk"
+ * shortcut) the page is returned locked; on every failure path it is not.
+ */
+static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
+						struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	block_t old_blk_addr;
+	struct dnode_of_data dn;
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	sb_start_pagefault(inode->i_sb);
+
+	mutex_lock_op(sbi, DATA_NEW);
+
+	/* block allocation */
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, page->index, 0);
+	if (err) {
+		mutex_unlock_op(sbi, DATA_NEW);
+		goto out;
+	}
+
+	old_blk_addr = dn.data_blkaddr;
+
+	if (old_blk_addr == NULL_ADDR) {
+		err = reserve_new_block(&dn);
+		if (err) {
+			f2fs_put_dnode(&dn);
+			mutex_unlock_op(sbi, DATA_NEW);
+			goto out;
+		}
+	}
+	f2fs_put_dnode(&dn);
+
+	mutex_unlock_op(sbi, DATA_NEW);
+
+	lock_page(page);
+	/* the page may have been truncated or invalidated while unlocked */
+	if (page->mapping != inode->i_mapping ||
+			page_offset(page) >= i_size_read(inode) ||
+			!PageUptodate(page)) {
+		unlock_page(page);
+		err = -EFAULT;
+		goto out;
+	}
+
+	/*
+	 * check to see if the page is mapped already (no holes)
+	 */
+	if (PageMappedToDisk(page))
+		goto out;
+
+	/* fill the page */
+	wait_on_page_writeback(page);
+
+	/*
+	 * page is wholly or partially inside EOF; widen to loff_t before
+	 * shifting so the byte offset cannot overflow a 32-bit pgoff_t
+	 */
+	if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) >
+						i_size_read(inode)) {
+		unsigned offset;
+		offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
+		zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	}
+	set_page_dirty(page);
+	SetPageUptodate(page);
+
+	file_update_time(vma->vm_file);
+out:
+	sb_end_pagefault(inode->i_sb);
+	return block_page_mkwrite_return(err);
+}
+
+/* VM operations installed on mappings by f2fs_file_mmap(). */
+static const struct vm_operations_struct f2fs_file_vm_ops = {
+	.page_mkwrite	= f2fs_vm_page_mkwrite,
+	.fault		= filemap_fault,
+	.remap_pages	= generic_file_remap_pages,
+};
+
+/*
+ * Decide whether the directory holding @inode still has to be made durable.
+ *
+ * Looks up any dentry alias of @inode, takes the inode number of that
+ * dentry's parent and returns non-zero when is_checkpointed_node() says the
+ * parent is not checkpointed yet.  Returns 0 when the inode has no alias.
+ *
+ * The parent is pinned with dget_parent() so that a concurrent rename
+ * cannot change or free dentry->d_parent while it is being dereferenced.
+ */
+static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode)
+{
+	struct dentry *dentry, *parent;
+	nid_t pino;
+
+	inode = igrab(inode);
+	dentry = d_find_any_alias(inode);
+	if (!dentry) {
+		iput(inode);
+		return 0;
+	}
+	parent = dget_parent(dentry);
+	pino = parent->d_inode->i_ino;
+	dput(parent);
+	dput(dentry);
+	iput(inode);
+	return !is_checkpointed_node(sbi, pino);
+}
+
+/*
+ * fsync()/fdatasync() implementation.
+ *
+ * Writes back and waits for the data pages in [@start, @end], then makes
+ * the node pages durable: either through f2fs_sync_fs() when a checkpoint
+ * is required, or by syncing this inode's node pages and waiting for node
+ * page writeback.
+ *
+ * Returns 0 on success or a negative errno; a read-only mount returns 0
+ * immediately.  The result of waiting on node page writeback is propagated
+ * so that an I/O error on a node page is reported to the caller.
+ */
+int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	unsigned long long cur_version;
+	int ret = 0;
+	bool need_cp = false;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.for_reclaim = 0,
+	};
+
+	if (inode->i_sb->s_flags & MS_RDONLY)
+		return 0;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
+	/* guarantee free sections for fsync */
+	f2fs_balance_fs(sbi);
+
+	mutex_lock(&inode->i_mutex);
+
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		goto out;
+
+	mutex_lock(&sbi->cp_mutex);
+	cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
+	mutex_unlock(&sbi->cp_mutex);
+
+	/* nothing to do: data belongs to another version and inode is clean */
+	if (F2FS_I(inode)->data_version != cur_version &&
+					!(inode->i_state & I_DIRTY))
+		goto out;
+	F2FS_I(inode)->data_version--;
+
+	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
+		need_cp = true;
+	else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
+		need_cp = true;
+	else if (!space_for_roll_forward(sbi))
+		need_cp = true;
+	else if (need_to_sync_dir(sbi, inode))
+		need_cp = true;
+
+	if (need_cp) {
+		/* all the dirty node pages should be flushed for POR */
+		ret = f2fs_sync_fs(inode->i_sb, 1);
+		clear_inode_flag(F2FS_I(inode), FI_NEED_CP);
+	} else {
+		/* if there is no written node page, write its inode page */
+		while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+			ret = f2fs_write_inode(inode, NULL);
+			if (ret)
+				goto out;
+		}
+		/* report node page writeback errors instead of dropping them */
+		ret = filemap_fdatawait_range(sbi->node_inode->i_mapping,
+							0, LONG_MAX);
+	}
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+/*
+ * ->mmap for regular files: record the access and install the f2fs VM
+ * operations on the new mapping.  Always returns 0.
+ */
+static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);
+
+	vma->vm_ops = &f2fs_file_vm_ops;
+
+	return 0;
+}
+
+/*
+ * Release up to @count block address slots of dn->node_page, starting at
+ * dn->ofs_in_node.  Slots that already hold NULL_ADDR are skipped.
+ *
+ * dn->ofs_in_node is advanced while walking (update_extent_cache() is
+ * called with the dnode positioned on the current slot) and restored to its
+ * initial value before returning.  When at least one block was released the
+ * node page is dirtied and sync_inode_page() is called.
+ *
+ * Returns the number of blocks released.
+ */
+static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	const int first_ofs = dn->ofs_in_node;
+	struct f2fs_node *node = page_address(dn->node_page);
+	__le32 *slot = blkaddr_in_node(node) + first_ofs;
+	int freed = 0;
+
+	while (count > 0) {
+		block_t blkaddr = le32_to_cpu(*slot);
+
+		if (blkaddr != NULL_ADDR) {
+			update_extent_cache(NULL_ADDR, dn);
+			invalidate_blocks(sbi, blkaddr);
+			dec_valid_block_count(sbi, dn->inode, 1);
+			freed++;
+		}
+
+		count--;
+		slot++;
+		dn->ofs_in_node++;
+	}
+
+	if (freed) {
+		set_page_dirty(dn->node_page);
+		sync_inode_page(dn);
+	}
+
+	dn->ofs_in_node = first_ofs;
+	return freed;
+}
+
+/*
+ * Release ADDRS_PER_BLOCK address slots of the node in @dn, starting at
+ * dn->ofs_in_node; the number of blocks released is not reported.
+ */
+void truncate_data_blocks(struct dnode_of_data *dn)
+{
+	(void)truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
+}
+
+/*
+ * Zero the tail of the data page containing byte offset @from, i.e. the
+ * bytes from @from up to the end of that page.  Nothing is done when @from
+ * is page aligned or when find_data_page() returns an error.
+ */
+static void truncate_partial_data_page(struct inode *inode, u64 from)
+{
+	unsigned in_page = from & (PAGE_CACHE_SIZE - 1);
+	struct page *page;
+
+	if (in_page == 0)
+		return;
+
+	page = find_data_page(inode, from >> PAGE_CACHE_SHIFT);
+	if (!IS_ERR(page)) {
+		lock_page(page);
+		wait_on_page_writeback(page);
+		zero_user(page, in_page, PAGE_CACHE_SIZE - in_page);
+		set_page_dirty(page);
+		/* second argument 1: presumably also unlocks -- TODO confirm */
+		f2fs_put_page(page, 1);
+	}
+}
+
+/*
+ * Release all blocks of @inode from byte offset @from (rounded up to the
+ * next block boundary) to the end of the file.
+ *
+ * Under the DATA_TRUNC operation lock the remaining slots of the dnode
+ * covering the first block to free are released, then
+ * truncate_inode_blocks() handles everything behind it.  A missing dnode
+ * (-ENOENT) is not an error; the partial step is simply skipped.  Finally
+ * the partial page at @from is zeroed.
+ *
+ * Returns 0 or a negative errno from get_dnode_of_data() or
+ * truncate_inode_blocks().
+ */
+static int truncate_blocks(struct inode *inode, u64 from)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	unsigned int blocksize = inode->i_sb->s_blocksize;
+	struct dnode_of_data dn;
+	pgoff_t free_from;
+	int count = 0;
+	int err;
+
+	free_from = (pgoff_t)
+			((from + blocksize - 1) >> (sbi->log_blocksize));
+
+	mutex_lock_op(sbi, DATA_TRUNC);
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, free_from, RDONLY_NODE);
+	if (err && err != -ENOENT) {
+		mutex_unlock_op(sbi, DATA_TRUNC);
+		return err;
+	}
+
+	if (!err) {
+		count = IS_INODE(dn.node_page) ?
+				ADDRS_PER_INODE : ADDRS_PER_BLOCK;
+		count -= dn.ofs_in_node;
+		BUG_ON(count < 0);
+
+		if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
+			truncate_data_blocks_range(&dn, count);
+			free_from += count;
+		}
+
+		f2fs_put_dnode(&dn);
+	}
+
+	err = truncate_inode_blocks(inode, free_from);
+	mutex_unlock_op(sbi, DATA_TRUNC);
+
+	/* lastly zero out the first data page */
+	truncate_partial_data_page(inode, from);
+
+	return err;
+}
+
+/*
+ * Drop every block beyond the current i_size of @inode.  Only regular
+ * files, directories and symlinks are handled.  When the truncation
+ * succeeds, mtime and ctime are set to the current time and the inode is
+ * marked dirty; a failure is silently ignored.
+ */
+void f2fs_truncate(struct inode *inode)
+{
+	umode_t mode = inode->i_mode;
+
+	if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
+		return;
+
+	if (truncate_blocks(inode, i_size_read(inode)))
+		return;
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+}
+
+/*
+ * ->getattr: fill @stat from the generic inode fields, then multiply the
+ * block count by 8 (presumably converting filesystem blocks into 512-byte
+ * units -- TODO confirm).  Always returns 0.
+ */
+static int f2fs_getattr(struct vfsmount *mnt,
+			struct dentry *dentry, struct kstat *stat)
+{
+	generic_fillattr(dentry->d_inode, stat);
+	stat->blocks <<= 3;
+
+	return 0;
+}
+
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+/*
+ * ACL-aware counterpart of setattr_copy(): copies the attributes selected
+ * in attr->ia_valid into @inode.  Timestamps are truncated to the
+ * superblock's time granularity.  A new mode is not written to i_mode
+ * directly; it is handed to set_acl_inode() after S_ISGID is stripped for
+ * callers that are neither in the owning group nor hold CAP_FSETID.
+ */
+static void __setattr_copy(struct inode *inode, const struct iattr *attr)
+{
+	const unsigned int valid = attr->ia_valid;
+	const unsigned int gran = inode->i_sb->s_time_gran;
+
+	if (valid & ATTR_UID)
+		inode->i_uid = attr->ia_uid;
+	if (valid & ATTR_GID)
+		inode->i_gid = attr->ia_gid;
+
+	if (valid & ATTR_ATIME)
+		inode->i_atime = timespec_trunc(attr->ia_atime, gran);
+	if (valid & ATTR_MTIME)
+		inode->i_mtime = timespec_trunc(attr->ia_mtime, gran);
+	if (valid & ATTR_CTIME)
+		inode->i_ctime = timespec_trunc(attr->ia_ctime, gran);
+
+	if (valid & ATTR_MODE) {
+		umode_t new_mode = attr->ia_mode;
+
+		/* uses the group id as updated above, if it was changed */
+		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+			new_mode &= ~S_ISGID;
+		set_acl_inode(F2FS_I(inode), new_mode);
+	}
+}
+#else
+/* without ACL support the generic helper is used as is */
+#define __setattr_copy setattr_copy
+#endif
+
+/*
+ * ->setattr: validate the request, apply a size change (page cache
+ * truncation followed by block truncation and a balance pass), copy the
+ * remaining attributes and, for a mode change, run the ACL chmod hook.
+ *
+ * If the ACL hook fails or the FI_ACL_MODE flag is set, i_mode is taken
+ * from fi->i_acl_mode and the flag is cleared.
+ *
+ * Returns the inode_change_ok() error, or the result of f2fs_acl_chmod()
+ * for mode changes, or 0.
+ */
+int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	int err = inode_change_ok(inode, attr);
+
+	if (err)
+		return err;
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (attr->ia_size != i_size_read(inode)) {
+			truncate_setsize(inode, attr->ia_size);
+			f2fs_truncate(inode);
+			f2fs_balance_fs(F2FS_SB(inode->i_sb));
+		}
+	}
+
+	__setattr_copy(inode, attr);
+
+	if (attr->ia_valid & ATTR_MODE) {
+		bool take_acl_mode;
+
+		err = f2fs_acl_chmod(inode);
+		take_acl_mode = err || is_inode_flag_set(fi, FI_ACL_MODE);
+		if (take_acl_mode) {
+			inode->i_mode = fi->i_acl_mode;
+			clear_inode_flag(fi, FI_ACL_MODE);
+		}
+	}
+
+	mark_inode_dirty(inode);
+	return err;
+}
+
+/* Inode operations for regular files; xattr hooks are optional. */
+const struct inode_operations f2fs_file_inode_operations = {
+	.setattr	= f2fs_setattr,
+	.getattr	= f2fs_getattr,
+	.get_acl	= f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+	.getxattr	= generic_getxattr,
+	.setxattr	= generic_setxattr,
+	.removexattr	= generic_removexattr,
+	.listxattr	= f2fs_listxattr,
+#endif
+};
+
+/*
+ * Zero @len bytes starting at byte @start inside page @index of @inode.
+ * The page is obtained through get_new_data_page() under the DATA_NEW
+ * operation lock.  Nothing happens when @len is 0, and a failure to get
+ * the page is silently ignored.
+ */
+static void fill_zero(struct inode *inode, pgoff_t index,
+					loff_t start, loff_t len)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct page *page;
+
+	if (len == 0)
+		return;
+
+	f2fs_balance_fs(sbi);
+
+	mutex_lock_op(sbi, DATA_NEW);
+	page = get_new_data_page(inode, index, false);
+	mutex_unlock_op(sbi, DATA_NEW);
+
+	if (IS_ERR(page))
+		return;
+
+	wait_on_page_writeback(page);
+	zero_user(page, start, len);
+	set_page_dirty(page);
+	f2fs_put_page(page, 1);
+}
+
+/*
+ * Release the data blocks backing pages [@pg_start, @pg_end) of @inode,
+ * one page index at a time, each under the DATA_TRUNC operation lock.
+ * Indexes without a dnode (-ENOENT) are skipped.
+ *
+ * Returns 0, or the first other error from get_dnode_of_data().
+ */
+int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	pgoff_t index;
+	int err;
+
+	for (index = pg_start; index < pg_end; index++) {
+		struct dnode_of_data dn;
+
+		f2fs_balance_fs(sbi);
+
+		mutex_lock_op(sbi, DATA_TRUNC);
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+		if (!err) {
+			if (dn.data_blkaddr != NULL_ADDR)
+				truncate_data_blocks_range(&dn, 1);
+			f2fs_put_dnode(&dn);
+			mutex_unlock_op(sbi, DATA_TRUNC);
+			continue;
+		}
+
+		mutex_unlock_op(sbi, DATA_TRUNC);
+		if (err != -ENOENT)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * Punch a hole of @len bytes at byte @offset into @inode.
+ *
+ * Partial pages at either end are zeroed through fill_zero(); the whole
+ * pages in between are dropped from the page cache and their blocks are
+ * released with truncate_hole().  Unless FALLOC_FL_KEEP_SIZE is set in
+ * @mode, i_size is cut back to @offset when the hole reaches EOF.
+ *
+ * Returns 0 or the error from truncate_hole().
+ */
+static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
+{
+	pgoff_t pg_start, pg_end;
+	loff_t off_start, off_end;
+	int ret = 0;
+
+	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
+	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+
+	off_start = offset & (PAGE_CACHE_SIZE - 1);
+	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+
+	if (pg_start == pg_end) {
+		fill_zero(inode, pg_start, off_start,
+						off_end - off_start);
+	} else {
+		/* a partially covered first page is zeroed, not released */
+		if (off_start)
+			fill_zero(inode, pg_start++, off_start,
+					PAGE_CACHE_SIZE - off_start);
+		if (off_end)
+			fill_zero(inode, pg_end, 0, off_end);
+
+		if (pg_start < pg_end) {
+			struct address_space *mapping = inode->i_mapping;
+			loff_t blk_start, blk_end;
+
+			/*
+			 * widen before shifting: pgoff_t may be 32 bits wide
+			 * and the byte offset would overflow
+			 */
+			blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT;
+			blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT;
+			truncate_inode_pages_range(mapping, blk_start,
+					blk_end - 1);
+			ret = truncate_hole(inode, pg_start, pg_end);
+		}
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+		i_size_read(inode) <= (offset + len)) {
+		i_size_write(inode, offset);
+		mark_inode_dirty(inode);
+	}
+
+	return ret;
+}
+
+/*
+ * Preallocate blocks for the byte range [@offset, @offset + @len) of
+ * @inode.
+ *
+ * For every page index in the range a block is reserved when the dnode
+ * still records NULL_ADDR, each step under the DATA_NEW operation lock.
+ * new_size tracks the end of the range covered so far: the end of the
+ * current page, or @offset + @len for the last page.  It is an absolute
+ * position, so a failure midway can never push i_size beyond what was
+ * actually reserved, and it is computed in loff_t so the shift cannot
+ * overflow a 32-bit pgoff_t.
+ *
+ * Unless FALLOC_FL_KEEP_SIZE is set in @mode, i_size is raised to new_size
+ * when that is larger.
+ *
+ * Returns 0, or a negative errno from inode_newsize_ok(),
+ * get_dnode_of_data() or reserve_new_block().
+ */
+static int expand_inode_data(struct inode *inode, loff_t offset,
+					loff_t len, int mode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	pgoff_t index, pg_start, pg_end;
+	loff_t new_size = i_size_read(inode);
+	loff_t off_end;
+	int ret = 0;
+
+	ret = inode_newsize_ok(inode, (len + offset));
+	if (ret)
+		return ret;
+
+	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
+	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+
+	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+
+	for (index = pg_start; index <= pg_end; index++) {
+		struct dnode_of_data dn;
+
+		mutex_lock_op(sbi, DATA_NEW);
+
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		ret = get_dnode_of_data(&dn, index, 0);
+		if (ret) {
+			mutex_unlock_op(sbi, DATA_NEW);
+			break;
+		}
+
+		if (dn.data_blkaddr == NULL_ADDR) {
+			ret = reserve_new_block(&dn);
+			if (ret) {
+				f2fs_put_dnode(&dn);
+				mutex_unlock_op(sbi, DATA_NEW);
+				break;
+			}
+		}
+		f2fs_put_dnode(&dn);
+
+		mutex_unlock_op(sbi, DATA_NEW);
+
+		if (pg_start == pg_end)
+			new_size = offset + len;
+		else if (index == pg_end)
+			new_size = ((loff_t)index << PAGE_CACHE_SHIFT) +
+								off_end;
+		else
+			new_size = ((loff_t)index + 1) << PAGE_CACHE_SHIFT;
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+		i_size_read(inode) < new_size) {
+		i_size_write(inode, new_size);
+		mark_inode_dirty(inode);
+	}
+
+	return ret;
+}
+
+/*
+ * ->fallocate: only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are
+ * accepted, any other mode bit yields -EOPNOTSUPP.  Dispatches to
+ * punch_hole() or expand_inode_data(); on success mtime and ctime are
+ * updated and the inode is marked dirty.
+ */
+static long f2fs_fallocate(struct file *file, int mode,
+				loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	long ret;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	ret = (mode & FALLOC_FL_PUNCH_HOLE) ?
+			punch_hole(inode, offset, len, mode) :
+			expand_inode_data(inode, offset, len, mode);
+	if (ret)
+		return ret;
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	return ret;
+}
+
+/* flags a regular file may carry: everything except the directory-only ones */
+#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+/* flags any other (non-directory, non-regular) inode may carry */
+#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+
+/*
+ * Restrict @flags to those permitted for an inode of type @mode:
+ * directories keep every flag, regular files are masked with
+ * F2FS_REG_FLMASK and everything else with F2FS_OTHER_FLMASK.
+ */
+static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+
+	if (S_ISREG(mode))
+		return flags & F2FS_REG_FLMASK;
+
+	return flags & F2FS_OTHER_FLMASK;
+}
+
+/*
+ * ioctl entry point for f2fs files.
+ *
+ * FS_IOC_GETFLAGS copies the user-visible part of fi->i_flags to user space.
+ * FS_IOC_SETFLAGS replaces the user-modifiable part of fi->i_flags with the
+ * value read from user space (masked by inode type), keeping all other bits.
+ * Any other command returns -ENOTTY.
+ *
+ * FS_IOC_SETFLAGS fails with -EACCES unless inode_owner_or_capable() holds,
+ * with -EFAULT when the argument cannot be read, and with -EPERM when the
+ * append/immutable bits would change without CAP_LINUX_IMMUTABLE.
+ */
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ unsigned int flags;
+ int ret;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ return put_user(flags, (int __user *) arg);
+ case FS_IOC_SETFLAGS:
+ {
+ unsigned int oldflags;
+
+ /* write access to the mount is held until the out: label */
+ ret = mnt_want_write(filp->f_path.mnt);
+ if (ret)
+ return ret;
+
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EACCES;
+ goto out;
+ }
+
+ if (get_user(flags, (int __user *) arg)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ flags = f2fs_mask_flags(inode->i_mode, flags);
+
+ mutex_lock(&inode->i_mutex);
+
+ oldflags = fi->i_flags;
+
+ /* changing append/immutable needs CAP_LINUX_IMMUTABLE */
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ mutex_unlock(&inode->i_mutex);
+ ret = -EPERM;
+ goto out;
+ }
+ }
+
+ /* take modifiable bits from the request, keep the rest as they were */
+ flags = flags & FS_FL_USER_MODIFIABLE;
+ flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
+ fi->i_flags = flags;
+ mutex_unlock(&inode->i_mutex);
+
+ f2fs_set_inode_flags(inode);
+ inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+out:
+ mnt_drop_write(filp->f_path.mnt);
+ return ret;
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl entry point: translate the two compat flag commands to
+ * their native counterparts and forward them to f2fs_ioctl() with the
+ * argument converted through compat_ptr().  Every other command returns
+ * -ENOIOCTLCMD.
+ */
+long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == F2FS_IOC32_GETFLAGS)
+		cmd = F2FS_IOC_GETFLAGS;
+	else if (cmd == F2FS_IOC32_SETFLAGS)
+		cmd = F2FS_IOC_SETFLAGS;
+	else
+		return -ENOIOCTLCMD;
+
+	return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+/*
+ * File operations for regular files: f2fs supplies mmap, fsync, fallocate
+ * and the ioctl handlers, the rest are generic helpers.
+ */
+const struct file_operations f2fs_file_operations = {
+	/* f2fs specific handlers */
+	.mmap		= f2fs_file_mmap,
+	.fsync		= f2fs_sync_file,
+	.fallocate	= f2fs_fallocate,
+	.unlocked_ioctl	= f2fs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= f2fs_compat_ioctl,
+#endif
+	/* generic helpers */
+	.open		= generic_file_open,
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= generic_file_aio_write,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+};
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
new file mode 100644
index 000000000000..94b8a0c48453
--- /dev/null
+++ b/fs/f2fs/gc.c
@@ -0,0 +1,698 @@
+/*
+ * fs/f2fs/gc.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/f2fs_fs.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/blkdev.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "gc.h"
+
+/*
+ * Slab cache for struct inode_entry objects: created by create_gc_caches(),
+ * allocated from in add_gc_inode() and freed in put_gc_inode().
+ */
+static struct kmem_cache *winode_slab;
+
+/*
+ * Main loop of the background GC kernel thread; @data is the f2fs_sb_info.
+ *
+ * Each round sleeps for wait_ms milliseconds (or until the thread is asked
+ * to stop), then, if the filesystem is not frozen for writes, gc_mutex can
+ * be taken without blocking and is_idle() holds, runs f2fs_gc().  wait_ms
+ * is adapted after every round.  Returns 0 when the thread is stopped.
+ */
+static int gc_thread_func(void *data)
+{
+ struct f2fs_sb_info *sbi = data;
+ wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
+ long wait_ms;
+
+ wait_ms = GC_THREAD_MIN_SLEEP_TIME;
+
+ do {
+ if (try_to_freeze())
+ continue;
+ else
+ wait_event_interruptible_timeout(*wq,
+ kthread_should_stop(),
+ msecs_to_jiffies(wait_ms));
+ if (kthread_should_stop())
+ break;
+
+ /* writes are frozen: back off to the longest sleep interval */
+ if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
+ wait_ms = GC_THREAD_MAX_SLEEP_TIME;
+ continue;
+ }
+
+ /*
+ * [GC triggering condition]
+ * 0. GC is not conducted currently.
+ * 1. There are enough dirty segments.
+ * 2. IO subsystem is idle by checking the # of writeback pages.
+ * 3. IO subsystem is idle by checking the # of requests in
+ * bdev's request list.
+ *
+ * Note) We have to avoid triggering GCs too much frequently.
+ * Because it is possible that some segments can be
+ * invalidated soon after by user update or deletion.
+ * So, I'd like to wait some time to collect dirty segments.
+ */
+ if (!mutex_trylock(&sbi->gc_mutex))
+ continue;
+
+ if (!is_idle(sbi)) {
+ wait_ms = increase_sleep_time(wait_ms);
+ mutex_unlock(&sbi->gc_mutex);
+ continue;
+ }
+
+ if (has_enough_invalid_blocks(sbi))
+ wait_ms = decrease_sleep_time(wait_ms);
+ else
+ wait_ms = increase_sleep_time(wait_ms);
+
+ sbi->bg_gc++;
+
+ /* f2fs_gc() releases gc_mutex itself before it returns */
+ /* if return value is not zero, no victim was selected */
+ if (f2fs_gc(sbi))
+ wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
+ else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME)
+ wait_ms = GC_THREAD_MAX_SLEEP_TIME;
+
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+/*
+ * Start the background GC thread for @sbi when the BG_GC mount option is
+ * set; without it nothing is started and 0 is returned.
+ *
+ * Returns 0 on success, -ENOMEM when the thread descriptor cannot be
+ * allocated, or the error reported by kthread_run().  On failure
+ * sbi->gc_thread is left NULL.
+ */
+int start_gc_thread(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_gc_kthread *gc_th;
+	dev_t dev = sbi->sb->s_bdev->bd_dev;
+	int err;
+
+	if (!test_opt(sbi, BG_GC))
+		return 0;
+	gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
+	if (!gc_th)
+		return -ENOMEM;
+
+	/* publish first: gc_thread_func() reads sbi->gc_thread on entry */
+	sbi->gc_thread = gc_th;
+	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
+	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
+				"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
+	if (IS_ERR(gc_th->f2fs_gc_task)) {
+		/* fetch the real error before the descriptor is freed */
+		err = PTR_ERR(gc_th->f2fs_gc_task);
+		kfree(gc_th);
+		sbi->gc_thread = NULL;
+		return err;
+	}
+	return 0;
+}
+
+/*
+ * Stop the background GC thread of @sbi, if there is one, and free its
+ * descriptor.  sbi->gc_thread is NULL afterwards.
+ */
+void stop_gc_thread(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
+
+	if (gc_th) {
+		kthread_stop(gc_th->f2fs_gc_task);
+		kfree(gc_th);
+		sbi->gc_thread = NULL;
+	}
+}
+
+/* Map a GC type to a victim selection mode: BG_GC -> GC_CB, else GC_GREEDY. */
+static int select_gc_type(int gc_type)
+{
+	if (gc_type == BG_GC)
+		return GC_CB;
+
+	return GC_GREEDY;
+}
+
+/*
+ * Fill in the search parameters of @p; p->alloc_mode must be set by the
+ * caller.
+ *
+ * A zero alloc_mode searches the DIRTY segment map in units of one section
+ * with the mode chosen by select_gc_type(); a non-zero alloc_mode searches
+ * the dirty map of @type segment by segment in greedy mode.  The search
+ * starts at the last victim recorded for the chosen mode.
+ */
+static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
+			int type, struct victim_sel_policy *p)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+	if (!p->alloc_mode) {
+		p->gc_mode = select_gc_type(gc_type);
+		p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
+		p->ofs_unit = sbi->segs_per_sec;
+	} else {
+		p->gc_mode = GC_GREEDY;
+		p->dirty_segmap = dirty_i->dirty_segmap[type];
+		p->ofs_unit = 1;
+	}
+
+	p->offset = sbi->last_victim[p->gc_mode];
+}
+
+/*
+ * Upper bound of the cost value for policy @p: the blocks of one segment
+ * for SSR, the blocks of one search unit for greedy GC, UINT_MAX for
+ * cost-benefit GC and 0 for any other mode.
+ */
+static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
+				struct victim_sel_policy *p)
+{
+	const int seg_blocks = 1 << sbi->log_blocks_per_seg;
+
+	/* SSR allocates in a segment unit */
+	if (p->alloc_mode == SSR)
+		return seg_blocks;
+
+	if (p->gc_mode == GC_GREEDY)
+		return seg_blocks * p->ofs_unit;
+
+	if (p->gc_mode == GC_CB)
+		return UINT_MAX;
+
+	/* No other gc_mode */
+	return 0;
+}
+
+/*
+ * Take the first segment marked in the background-GC victim map, clear its
+ * bit and return its number; NULL_SEGNO when the map is empty.
+ *
+ * Used for foreground GC: segments chosen earlier by background GC can be
+ * reused as victims, as they are expected to hold few valid blocks.
+ */
+static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned int segno;
+
+	segno = find_next_bit(dirty_i->victim_segmap[BG_GC],
+						TOTAL_SEGS(sbi), 0);
+	if (segno >= TOTAL_SEGS(sbi))
+		return NULL_SEGNO;
+
+	clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
+	return segno;
+}
+
+/*
+ * Cost-benefit cost of the section that contains @segno; the victim search
+ * keeps the segment with the smallest value.
+ *
+ * u is the average utilisation of the section's segments in percent and age
+ * is 100 minus the section's average mtime scaled into the range
+ * [min_mtime, max_mtime].  The result is UINT_MAX minus
+ * 100 * (100 - u) * age / (100 + u).
+ *
+ * NOTE(review): u and age are unsigned char, so the assignments truncate
+ * implicitly; presumably both stay within 0..100 -- confirm.
+ */
+static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int secno = GET_SECNO(sbi, segno);
+ unsigned int start = secno * sbi->segs_per_sec;
+ unsigned long long mtime = 0;
+ unsigned int vblocks;
+ unsigned char age = 0;
+ unsigned char u;
+ unsigned int i;
+
+ /* sum mtime over the section, then average mtime and valid blocks */
+ for (i = 0; i < sbi->segs_per_sec; i++)
+ mtime += get_seg_entry(sbi, start + i)->mtime;
+ vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+
+ mtime = div_u64(mtime, sbi->segs_per_sec);
+ vblocks = div_u64(vblocks, sbi->segs_per_sec);
+
+ u = (vblocks * 100) >> sbi->log_blocks_per_seg;
+
+ /* Handle if the system time is changed by user */
+ if (mtime < sit_i->min_mtime)
+ sit_i->min_mtime = mtime;
+ if (mtime > sit_i->max_mtime)
+ sit_i->max_mtime = mtime;
+ /* age stays 0 when all recorded mtimes are equal (avoids division by 0) */
+ if (sit_i->max_mtime != sit_i->min_mtime)
+ age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
+ sit_i->max_mtime - sit_i->min_mtime);
+
+ return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
+}
+
+/*
+ * Cost of choosing @segno under policy @p: the checkpointed valid block
+ * count for SSR, the valid block count of the section for greedy GC, and
+ * the cost-benefit value otherwise.
+ */
+static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
+					struct victim_sel_policy *p)
+{
+	if (p->alloc_mode == SSR)
+		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+
+	/* alloc_mode == LFS */
+	if (p->gc_mode != GC_GREEDY)
+		return get_cb_cost(sbi, segno);
+
+	return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+}
+
+/*
+ * This function is called from two paths.
+ * One is garbage collection and the other is SSR segment selection.
+ * When it is called during GC, it just gets a victim segment
+ * and it does not remove it from dirty seglist.
+ * When it is called from SSR segment selection, it finds a segment
+ * which has minimum valid blocks and removes it from dirty seglist.
+ *
+ * On success the first segment number of the chosen unit is stored in
+ * *result and 1 is returned; 0 is returned when no victim was found.
+ * The search runs under dirty_i->seglist_lock.
+ */
+static int get_victim_by_default(struct f2fs_sb_info *sbi,
+ unsigned int *result, int gc_type, int type, char alloc_mode)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct victim_sel_policy p;
+ unsigned int segno;
+ int nsearched = 0;
+
+ p.alloc_mode = alloc_mode;
+ select_policy(sbi, gc_type, type, &p);
+
+ p.min_segno = NULL_SEGNO;
+ p.min_cost = get_max_cost(sbi, &p);
+
+ mutex_lock(&dirty_i->seglist_lock);
+
+ /* foreground GC first reuses victims picked by background GC */
+ if (p.alloc_mode == LFS && gc_type == FG_GC) {
+ p.min_segno = check_bg_victims(sbi);
+ if (p.min_segno != NULL_SEGNO)
+ goto got_it;
+ }
+
+ while (1) {
+ unsigned long cost;
+
+ segno = find_next_bit(p.dirty_segmap,
+ TOTAL_SEGS(sbi), p.offset);
+ if (segno >= TOTAL_SEGS(sbi)) {
+ /* end of map: wrap around once if the search did not start at 0 */
+ if (sbi->last_victim[p.gc_mode]) {
+ sbi->last_victim[p.gc_mode] = 0;
+ p.offset = 0;
+ continue;
+ }
+ break;
+ }
+ /* next search position: start of the following unit */
+ p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
+
+ if (test_bit(segno, dirty_i->victim_segmap[FG_GC]))
+ continue;
+ if (gc_type == BG_GC &&
+ test_bit(segno, dirty_i->victim_segmap[BG_GC]))
+ continue;
+ if (IS_CURSEC(sbi, GET_SECNO(sbi, segno)))
+ continue;
+
+ cost = get_gc_cost(sbi, segno, &p);
+
+ if (p.min_cost > cost) {
+ p.min_segno = segno;
+ p.min_cost = cost;
+ }
+
+ /* candidates at maximum cost do not count towards the search limit */
+ if (cost == get_max_cost(sbi, &p))
+ continue;
+
+ if (nsearched++ >= MAX_VICTIM_SEARCH) {
+ sbi->last_victim[p.gc_mode] = segno;
+ break;
+ }
+ }
+got_it:
+ if (p.min_segno != NULL_SEGNO) {
+ *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+ /* in LFS mode mark every segment of the unit as a victim of gc_type */
+ if (p.alloc_mode == LFS) {
+ int i;
+ for (i = 0; i < p.ofs_unit; i++)
+ set_bit(*result + i,
+ dirty_i->victim_segmap[gc_type]);
+ }
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
+
+ return (p.min_segno == NULL_SEGNO) ? 0 : 1;
+}
+
+/* Victim selection operations installed by build_gc_manager(). */
+static const struct victim_selection default_v_ops = {
+ .get_victim = get_victim_by_default,
+};
+
+/*
+ * Look up the inode with number @ino in the GC inode list @ilist.
+ * Returns the inode, or NULL when it is not on the list.  No additional
+ * reference is taken.
+ */
+static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
+{
+	struct inode_entry *entry;
+
+	list_for_each_entry(entry, ilist, list) {
+		if (entry->inode->i_ino == ino)
+			return entry->inode;
+	}
+
+	return NULL;
+}
+
+/*
+ * Put @inode on the GC inode list @ilist, taking over the caller's
+ * reference.  If the inode is already listed, the extra reference is
+ * dropped with iput() instead.  Allocation of the list entry is retried
+ * until it succeeds.
+ */
+static void add_gc_inode(struct inode *inode, struct list_head *ilist)
+{
+	struct inode_entry *entry;
+
+	list_for_each_entry(entry, ilist, list) {
+		if (entry->inode == inode) {
+			iput(inode);
+			return;
+		}
+	}
+
+	for (;;) {
+		entry = kmem_cache_alloc(winode_slab, GFP_NOFS);
+		if (entry)
+			break;
+		cond_resched();
+	}
+
+	entry->inode = inode;
+	list_add_tail(&entry->list, ilist);
+}
+
+/*
+ * Empty the GC inode list @ilist: drop the inode reference held by each
+ * entry, unlink the entry and return it to the slab cache.
+ */
+static void put_gc_inode(struct list_head *ilist)
+{
+	while (!list_empty(ilist)) {
+		struct inode_entry *entry;
+
+		entry = list_entry(ilist->next, struct inode_entry, list);
+		iput(entry->inode);
+		list_del(&entry->list);
+		kmem_cache_free(winode_slab, entry);
+	}
+}
+
+/*
+ * Test, under sit_i->sentry_lock, whether block @offset of segment @segno
+ * is set in the segment's current valid map.  Returns the bit test result
+ * (0 when the block is not valid).
+ */
+static int check_valid_map(struct f2fs_sb_info *sbi,
+				unsigned int segno, int offset)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	int valid;
+
+	mutex_lock(&sit_i->sentry_lock);
+	valid = f2fs_test_bit(offset,
+			get_seg_entry(sbi, segno)->cur_valid_map);
+	mutex_unlock(&sit_i->sentry_lock);
+
+	return valid;
+}
+
+/*
+ * Collect the node segment @segno described by the summary entries @sum.
+ *
+ * The segment is walked twice.  The first pass only issues readahead for
+ * the node page of every block still marked valid; the second pass gets
+ * each of those node pages and dirties it (unless it is under writeback)
+ * so that it gets written again.  Blocks not marked valid are ignored.
+ *
+ * Background GC stops as soon as free sections run short.  Foreground GC
+ * finishes by syncing the node pages.
+ */
+static void gc_node_segment(struct f2fs_sb_info *sbi,
+		struct f2fs_summary *sum, unsigned int segno, int gc_type)
+{
+	int pass;
+
+	for (pass = 0; pass < 2; pass++) {
+		const bool readahead_only = (pass == 0);
+		struct f2fs_summary *entry = sum;
+		int off;
+
+		for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
+			nid_t nid = le32_to_cpu(entry->nid);
+			struct page *node_page;
+
+			/* stop BG_GC if there is not enough free sections. */
+			if (gc_type == BG_GC &&
+					has_not_enough_free_secs(sbi, 0))
+				return;
+
+			if (check_valid_map(sbi, segno, off) == 0)
+				continue;
+
+			if (readahead_only) {
+				ra_node_page(sbi, nid);
+				continue;
+			}
+
+			node_page = get_node_page(sbi, nid);
+			if (IS_ERR(node_page))
+				continue;
+
+			/* set page dirty and write it */
+			if (!PageWriteback(node_page))
+				set_page_dirty(node_page);
+			f2fs_put_page(node_page, 1);
+			stat_inc_node_blk_count(sbi, 1);
+		}
+	}
+
+	if (gc_type == FG_GC) {
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+			.nr_to_write = LONG_MAX,
+			.for_reclaim = 0,
+		};
+		sync_node_pages(sbi, 0, &wbc);
+	}
+}
+
+/*
+ * Return the index of the first data block addressed by the node at node
+ * offset @node_ofs (0 is the inode itself).
+ *
+ * Only offsets of direct node blocks may be passed.  Offsets of indirect
+ * or double indirect node blocks are a caller bug and yield a meaningless
+ * result.
+ */
+block_t start_bidx_of_node(unsigned int node_ofs)
+{
+	const unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
+	unsigned int dnode_idx;
+
+	if (!node_ofs)
+		return 0;
+
+	if (node_ofs <= 2) {
+		dnode_idx = node_ofs - 1;
+	} else if (node_ofs <= indirect_blks) {
+		/* non-direct node blocks to leave out of the count */
+		int skipped = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
+
+		dnode_idx = node_ofs - 2 - skipped;
+	} else {
+		int skipped = (node_ofs - indirect_blks - 3) /
+						(NIDS_PER_BLOCK + 1);
+
+		dnode_idx = node_ofs - 5 - skipped;
+	}
+
+	return dnode_idx * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
+}
+
+/*
+ * Check whether summary entry @sum still describes the data block at
+ * @blkaddr.
+ *
+ * The node page named by the summary is read, its node info is stored in
+ * @dni, and the entry is accepted only if the version in the summary
+ * matches the node info and the address recorded in the node at the
+ * summary's offset equals @blkaddr.  *nofs receives the node's offset once
+ * the version check has passed.
+ *
+ * Returns 1 when the block is still referenced, 0 otherwise (including
+ * when the node page cannot be read).
+ */
+static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
+{
+	nid_t nid = le32_to_cpu(sum->nid);
+	unsigned int ofs_in_node = le16_to_cpu(sum->ofs_in_node);
+	struct page *node_page;
+	block_t recorded;
+
+	node_page = get_node_page(sbi, nid);
+	if (IS_ERR(node_page))
+		return 0;
+
+	get_node_info(sbi, nid, dni);
+
+	if (sum->version != dni->version) {
+		f2fs_put_page(node_page, 1);
+		return 0;
+	}
+
+	*nofs = ofs_of_node(node_page);
+	recorded = datablock_addr(node_page, ofs_in_node);
+	f2fs_put_page(node_page, 1);
+
+	return (recorded == blkaddr) ? 1 : 0;
+}
+
+/*
+ * Move one data page of @inode as part of garbage collection and release
+ * the page through f2fs_put_page(page, 1).
+ *
+ * Nothing is moved when the page no longer belongs to @inode's mapping or
+ * is under writeback.  Background GC only dirties the page and marks it
+ * cold; foreground GC writes it out immediately under the DATA_WRITE
+ * operation lock.
+ */
+static void move_data_page(struct inode *inode, struct page *page, int gc_type)
+{
+ if (page->mapping != inode->i_mapping)
+ goto out;
+
+ if (inode != page->mapping->host)
+ goto out;
+
+ if (PageWriteback(page))
+ goto out;
+
+ if (gc_type == BG_GC) {
+ set_page_dirty(page);
+ set_cold_data(page);
+ } else {
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ mutex_lock_op(sbi, DATA_WRITE);
+ /* a dirty directory page leaves the dirty-dentry accounting here */
+ if (clear_page_dirty_for_io(page) &&
+ S_ISDIR(inode->i_mode)) {
+ dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_dec_dirty_dents(inode);
+ }
+ set_cold_data(page);
+ do_write_data_page(page);
+ mutex_unlock_op(sbi, DATA_WRITE);
+ clear_cold_data(page);
+ }
+out:
+ f2fs_put_page(page, 1);
+}
+
+/*
+ * This function tries to get parent node of victim data block, and identifies
+ * data block validity. If the block is valid, copy that with cold status and
+ * modify parent node.
+ * If the parent node is not valid or the data block address is different,
+ * the victim data block is ignored.
+ *
+ * The segment is walked four times (phase 0..3):
+ *  0: readahead the node page named by each valid summary entry
+ *  1: validate the entry with check_dnode() and readahead the inode's node
+ *     page
+ *  2: get the inode, check that the data page can be found and put the
+ *     inode on @ilist
+ *  3: for inodes found on @ilist, lock the data page and move it
+ * Foreground GC submits the DATA bio at the end.
+ */
+static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+ struct list_head *ilist, unsigned int segno, int gc_type)
+{
+ struct super_block *sb = sbi->sb;
+ struct f2fs_summary *entry;
+ block_t start_addr;
+ int off;
+ int phase = 0;
+
+ start_addr = START_BLOCK(sbi, segno);
+
+next_step:
+ entry = sum;
+ for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
+ struct page *data_page;
+ struct inode *inode;
+ struct node_info dni; /* dnode info for the data */
+ unsigned int ofs_in_node, nofs;
+ block_t start_bidx;
+
+ /* stop BG_GC if there is not enough free sections. */
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+ return;
+
+ if (check_valid_map(sbi, segno, off) == 0)
+ continue;
+
+ if (phase == 0) {
+ ra_node_page(sbi, le32_to_cpu(entry->nid));
+ continue;
+ }
+
+ /* Get an inode by ino with checking validity */
+ if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0)
+ continue;
+
+ if (phase == 1) {
+ ra_node_page(sbi, dni.ino);
+ continue;
+ }
+
+ /* file block index = first index of the dnode + slot within the dnode */
+ start_bidx = start_bidx_of_node(nofs);
+ ofs_in_node = le16_to_cpu(entry->ofs_in_node);
+
+ if (phase == 2) {
+ inode = f2fs_iget(sb, dni.ino);
+ if (IS_ERR(inode))
+ continue;
+
+ data_page = find_data_page(inode,
+ start_bidx + ofs_in_node);
+ if (IS_ERR(data_page))
+ goto next_iput;
+
+ f2fs_put_page(data_page, 0);
+ /* the inode reference is handed over to the list */
+ add_gc_inode(inode, ilist);
+ } else {
+ inode = find_gc_inode(dni.ino, ilist);
+ if (inode) {
+ data_page = get_lock_data_page(inode,
+ start_bidx + ofs_in_node);
+ if (IS_ERR(data_page))
+ continue;
+ move_data_page(inode, data_page, gc_type);
+ stat_inc_data_blk_count(sbi, 1);
+ }
+ }
+ continue;
+next_iput:
+ iput(inode);
+ }
+ if (++phase < 4)
+ goto next_step;
+
+ if (gc_type == FG_GC)
+ f2fs_submit_bio(sbi, DATA, true);
+}
+
+/*
+ * Select a GC victim in LFS mode through the installed victim selection
+ * operations, holding sit_i->sentry_lock during the call.  The victim
+ * segment number is stored in *victim; the callback's result is returned.
+ */
+static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
+						int gc_type, int type)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	int found;
+
+	mutex_lock(&sit_i->sentry_lock);
+	found = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
+							type, LFS);
+	mutex_unlock(&sit_i->sentry_lock);
+
+	return found;
+}
+
+/*
+ * Collect one victim segment: read its summary block and hand the entries
+ * to the node or data collector according to the summary type, then update
+ * the GC statistics.  Nothing is done when the summary page cannot be read.
+ */
+static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
+				struct list_head *ilist, int gc_type)
+{
+	struct f2fs_summary_block *sum;
+	struct page *sum_page = get_sum_page(sbi, segno);
+
+	/* read segment summary of victim */
+	if (IS_ERR(sum_page))
+		return;
+
+	/*
+	 * The summary page is used unlocked: checkpointing needs to lock it,
+	 * and the page neither goes away nor is updated before GC is done.
+	 */
+	unlock_page(sum_page);
+	sum = page_address(sum_page);
+
+	switch (GET_SUM_TYPE((&sum->footer))) {
+	case SUM_TYPE_DATA:
+		gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
+		break;
+	case SUM_TYPE_NODE:
+		gc_node_segment(sbi, sum->entries, segno, gc_type);
+		break;
+	}
+
+	stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
+	stat_inc_call_count(sbi->stat_info);
+
+	f2fs_put_page(sum_page, 0);
+}
+
+/*
+ * Run garbage collection until enough free sections are available.
+ *
+ * Must be called with sbi->gc_mutex held; the mutex is released before
+ * returning.  Collection starts as background GC and switches to foreground
+ * GC once free sections run short.  A checkpoint is written after
+ * foreground GC.
+ *
+ * Returns 0 when at least one victim was selected, -1 otherwise.
+ */
+int f2fs_gc(struct f2fs_sb_info *sbi)
+{
+ struct list_head ilist;
+ unsigned int segno, i;
+ int gc_type = BG_GC;
+ int nfree = 0;
+ int ret = -1;
+
+ INIT_LIST_HEAD(&ilist);
+gc_more:
+ if (!(sbi->sb->s_flags & MS_ACTIVE))
+ goto stop;
+
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree))
+ gc_type = FG_GC;
+
+ if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
+ goto stop;
+ ret = 0;
+
+ /* collect every segment of the victim section */
+ for (i = 0; i < sbi->segs_per_sec; i++)
+ do_garbage_collect(sbi, segno + i, &ilist, gc_type);
+
+ /* count the section as freed only if no valid block is left in it */
+ if (gc_type == FG_GC &&
+ get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
+ nfree++;
+
+ if (has_not_enough_free_secs(sbi, nfree))
+ goto gc_more;
+
+ if (gc_type == FG_GC)
+ write_checkpoint(sbi, false);
+stop:
+ mutex_unlock(&sbi->gc_mutex);
+
+ put_gc_inode(&ilist);
+ return ret;
+}
+
+/* Install the default victim selection operations for @sbi. */
+void build_gc_manager(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+	dirty_i->v_ops = &default_v_ops;
+}
+
+/*
+ * Create the slab cache for GC inode list entries.
+ * Returns 0 on success or -ENOMEM when the cache cannot be created.
+ */
+int __init create_gc_caches(void)
+{
+	winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
+			sizeof(struct inode_entry), NULL);
+
+	return winode_slab ? 0 : -ENOMEM;
+}
+
+/* Destroy the slab cache created by create_gc_caches(). */
+void destroy_gc_caches(void)
+{
+	struct kmem_cache *cache = winode_slab;
+
+	kmem_cache_destroy(cache);
+}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
new file mode 100644
index 000000000000..30b2db003acd
--- /dev/null
+++ b/fs/f2fs/gc.h
@@ -0,0 +1,96 @@
+/*
+ * fs/f2fs/gc.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define GC_THREAD_MIN_WB_PAGES 1 /*
+ * a threshold to determine
+ * whether IO subsystem is idle
+ * or not
+ */
+#define GC_THREAD_MIN_SLEEP_TIME 10000 /* milliseconds */
+#define GC_THREAD_MAX_SLEEP_TIME 30000 /* same unit as MIN: upper clamp in increase_sleep_time() */
+#define GC_THREAD_NOGC_SLEEP_TIME 10000 /* presumably milliseconds too -- TODO confirm */
+#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
+#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
+
+/* Search max. number of dirty segments to select a victim segment */
+#define MAX_VICTIM_SEARCH 20
+
+/* Per-filesystem state of the GC thread (its users are not visible in this header). */
+struct f2fs_gc_kthread {
+ struct task_struct *f2fs_gc_task; /* task of the GC thread */
+ wait_queue_head_t gc_wait_queue_head; /* wait queue head, presumably where the thread sleeps -- TODO confirm */
+};
+
+/* List node that carries one inode pointer. */
+struct inode_entry {
+ struct list_head list; /* list linkage */
+ struct inode *inode; /* inode tracked by this entry */
+};
+
+/*
+ * inline functions
+ */
+/*
+ * Number of blocks in the free segments that exceed the overprovision
+ * reserve; 0 when the free segments do not even cover the reserve.
+ */
+static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
+{
+	if (free_segments(sbi) >= overprovision_segments(sbi))
+		return (free_segments(sbi) - overprovision_segments(sbi))
+			<< sbi->log_blocks_per_seg;
+
+	return 0;
+}
+
+/* LIMIT_INVALID_BLOCK percent of the user block count. */
+static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
+{
+	long scaled = (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK);
+
+	return scaled / 100;
+}
+
+/*
+ * LIMIT_FREE_BLOCK percent of the blocks that are not currently written
+ * (user_block_count minus written_block_count()).
+ */
+static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
+{
+	block_t unwritten = sbi->user_block_count - written_block_count(sbi);
+	long scaled = (long)(unwritten * LIMIT_FREE_BLOCK);
+
+	return scaled / 100;
+}
+
+/*
+ * Lengthen the GC sleep interval by GC_THREAD_MIN_SLEEP_TIME, capped at
+ * GC_THREAD_MAX_SLEEP_TIME.
+ */
+static inline long increase_sleep_time(long wait)
+{
+	long next = wait + GC_THREAD_MIN_SLEEP_TIME;
+
+	if (next > GC_THREAD_MAX_SLEEP_TIME)
+		return GC_THREAD_MAX_SLEEP_TIME;
+	return next;
+}
+
+/*
+ * Shorten the GC sleep interval by GC_THREAD_MIN_SLEEP_TIME, never going
+ * below GC_THREAD_MIN_SLEEP_TIME.
+ */
+static inline long decrease_sleep_time(long wait)
+{
+	long next = wait - GC_THREAD_MIN_SLEEP_TIME;
+
+	if (next > GC_THREAD_MIN_SLEEP_TIME)
+		return next;
+	return GC_THREAD_MIN_SLEEP_TIME;
+}
+
+/*
+ * Decide whether background GC should run.  Both conditions must hold:
+ * 1. the blocks not currently written exceed limit_invalid_user_blocks(),
+ * 2. the free user blocks are below limit_free_user_blocks().
+ */
+static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
+{
+	block_t invalid = sbi->user_block_count - written_block_count(sbi);
+
+	return invalid > limit_invalid_user_blocks(sbi) &&
+		free_user_blocks(sbi) < limit_free_user_blocks(sbi);
+}
+
+/*
+ * Report whether the backing block device looks idle: returns 1 when the
+ * root request list of its queue holds neither sync nor async requests,
+ * 0 otherwise.
+ */
+static inline int is_idle(struct f2fs_sb_info *sbi)
+{
+	struct request_queue *queue = bdev_get_queue(sbi->sb->s_bdev);
+	struct request_list *root = &queue->root_rl;
+
+	if (root->count[BLK_RW_SYNC])
+		return 0;
+	return !root->count[BLK_RW_ASYNC];
+}
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
new file mode 100644
index 000000000000..6eb8d269b53b
--- /dev/null
+++ b/fs/f2fs/hash.c
@@ -0,0 +1,101 @@
+/*
+ * fs/f2fs/hash.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext3/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/cryptohash.h>
+#include <linux/pagemap.h>
+
+#include "f2fs.h"
+
+/*
+ * Hashing code copied from ext3
+ */
+#define DELTA 0x9E3779B9
+
+/*
+ * Mix the four key words in @in into buf[0] and buf[1] using 16 rounds of
+ * the TEA round function.  buf[2] and buf[3] are left untouched.
+ */
+static void TEA_transform(unsigned int buf[4], unsigned int const in[])
+{
+	const __u32 k0 = in[0], k1 = in[1], k2 = in[2], k3 = in[3];
+	__u32 x = buf[0], y = buf[1];
+	__u32 sum = 0;
+	int round;
+
+	for (round = 0; round < 16; round++) {
+		sum += DELTA;
+		x += ((y << 4) + k0) ^ (y + sum) ^ ((y >> 5) + k1);
+		y += ((x << 4) + k2) ^ (x + sum) ^ ((x >> 5) + k3);
+	}
+
+	buf[0] += x;
+	buf[1] += y;
+}
+
+/*
+ * Pack up to @num * 4 bytes of @msg into @num 32-bit words at @buf.
+ *
+ * Bytes are shifted in most-significant first, four per word.  A partial
+ * last word starts from the padding pattern, and words that receive no
+ * message byte are set to the padding pattern, which is the low byte of
+ * @len replicated into all four bytes of a word.
+ * NOTE(review): msg[i] is a plain char, so bytes >= 0x80 contribute
+ * according to the platform's char signedness -- confirm this is intended.
+ */
+static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num)
+{
+ unsigned pad, val;
+ int i;
+
+ pad = (__u32)len | ((__u32)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ /* never consume more input than fits into the output words */
+ if (len > num * 4)
+ len = num * 4;
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ val = msg[i] + (val << 8);
+ /* a word is complete after every fourth byte */
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num--;
+ }
+ }
+ /* flush the (possibly partial) word, then pad the remaining ones */
+ if (--num >= 0)
+ *buf++ = val;
+ while (--num >= 0)
+ *buf++ = pad;
+}
+
+/*
+ * Compute the directory-entry hash of the @len bytes at @name.
+ *
+ * "." and ".." always hash to 0.  Every other name is fed through
+ * str2hashbuf()/TEA_transform() in 16-byte chunks; the result is buf[0]
+ * with F2FS_HASH_COL_BIT cleared, converted with cpu_to_le32().
+ *
+ * @name need not be NUL-terminated: only the first @len bytes are read.
+ */
+f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len)
+{
+	__u32 hash;
+	f2fs_hash_t f2fs_hash;
+	const char *p;
+	__u32 in[8], buf[4];
+
+	/*
+	 * Test the length before touching name[0]/name[1] so that a
+	 * one-byte (or empty) name is never read past its end.
+	 */
+	if (len == 1 && name[0] == '.')
+		return 0;
+	if (len == 2 && name[0] == '.' &&
+			(name[1] == '.' || name[1] == '\0'))
+		return 0;
+
+	/* Initialize the default seed for the hash checksum functions */
+	buf[0] = 0x67452301;
+	buf[1] = 0xefcdab89;
+	buf[2] = 0x98badcfe;
+	buf[3] = 0x10325476;
+
+	p = name;
+	while (1) {
+		/* str2hashbuf() clamps to 16 bytes per call by itself */
+		str2hashbuf(p, len, in, 4);
+		TEA_transform(buf, in);
+		p += 16;
+		if (len <= 16)
+			break;
+		len -= 16;
+	}
+	hash = buf[0];
+	f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT);
+	return f2fs_hash;
+}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
new file mode 100644
index 000000000000..ddae412d30c8
--- /dev/null
+++ b/fs/f2fs/inode.c
@@ -0,0 +1,259 @@
+/*
+ * fs/f2fs/inode.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+
+#include "f2fs.h"
+#include "node.h"
+
+/*
+ * Mirror the FS_*_FL bits kept in the f2fs inode info into the S_* bits of
+ * the VFS inode.  All five S_* bits are cleared first, then each one is set
+ * again if its FS_*_FL counterpart is present.
+ */
+void f2fs_set_inode_flags(struct inode *inode)
+{
+	static const struct {
+		unsigned int f2fs_flag;
+		unsigned int vfs_flag;
+	} flag_map[] = {
+		{ FS_SYNC_FL,		S_SYNC },
+		{ FS_APPEND_FL,		S_APPEND },
+		{ FS_IMMUTABLE_FL,	S_IMMUTABLE },
+		{ FS_NOATIME_FL,	S_NOATIME },
+		{ FS_DIRSYNC_FL,	S_DIRSYNC },
+	};
+	unsigned int flags = F2FS_I(inode)->i_flags;
+	unsigned int idx;
+
+	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE |
+			S_NOATIME | S_DIRSYNC);
+
+	for (idx = 0; idx < sizeof(flag_map) / sizeof(flag_map[0]); idx++) {
+		if (flags & flag_map[idx].f2fs_flag)
+			inode->i_flags |= flag_map[idx].vfs_flag;
+	}
+}
+
+/*
+ * Fill the VFS inode and its f2fs_inode_info from the raw inode stored in
+ * the node page of inode->i_ino.
+ *
+ * Returns 0 on success or the error carried by get_node_page().
+ */
+static int do_read_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct page *node_page;
+ struct f2fs_node *rn;
+ struct f2fs_inode *ri;
+
+ /* Check if ino is within scope */
+ check_nid_range(sbi, inode->i_ino);
+
+ node_page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(node_page))
+ return PTR_ERR(node_page);
+
+ rn = page_address(node_page);
+ ri = &(rn->i);
+
+ /* on-disk fields are little-endian; convert each one */
+ inode->i_mode = le16_to_cpu(ri->i_mode);
+ i_uid_write(inode, le32_to_cpu(ri->i_uid));
+ i_gid_write(inode, le32_to_cpu(ri->i_gid));
+ set_nlink(inode, le32_to_cpu(ri->i_links));
+ inode->i_size = le64_to_cpu(ri->i_size);
+ inode->i_blocks = le64_to_cpu(ri->i_blocks);
+
+ inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
+ inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
+ inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
+ inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
+ inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
+ inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+ inode->i_generation = le32_to_cpu(ri->i_generation);
+ /*
+ * i_addr[0] non-zero: device number in the old encoding;
+ * otherwise i_addr[1] is decoded with the new encoding
+ * (mirrors what update_inode() stores).
+ */
+ if (ri->i_addr[0])
+ inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0]));
+ else
+ inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1]));
+
+ fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
+ fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
+ fi->i_flags = le32_to_cpu(ri->i_flags);
+ fi->flags = 0;
+ /* start one behind the current checkpoint version */
+ fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1;
+ fi->i_advise = ri->i_advise;
+ fi->i_pino = le32_to_cpu(ri->i_pino);
+ get_extent_info(&fi->ext, ri->i_ext);
+ f2fs_put_page(node_page, 1);
+ return 0;
+}
+
+/*
+ * Return the inode numbered @ino on @sb, reading it in if it is new.
+ *
+ * Returns the inode or an ERR_PTR():
+ *   -ENOMEM  iget_locked() failed,
+ *   -ENOENT  the inode has no links and sbi->por_doing is not set,
+ *   -EIO     the mode read from disk is not a known file type,
+ *   or the error returned by do_read_inode().
+ */
+struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ int ret;
+
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ /* not I_NEW: the inode was already set up, hand it out as is */
+ if (!(inode->i_state & I_NEW))
+ return inode;
+ /* the node and meta inodes are not read through do_read_inode() */
+ if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
+ goto make_now;
+
+ ret = do_read_inode(inode);
+ if (ret)
+ goto bad_inode;
+
+ if (!sbi->por_doing && inode->i_nlink == 0) {
+ ret = -ENOENT;
+ goto bad_inode;
+ }
+
+make_now:
+ /* wire up the operation tables that match the inode type */
+ if (ino == F2FS_NODE_INO(sbi)) {
+ inode->i_mapping->a_ops = &f2fs_node_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+ } else if (ino == F2FS_META_INO(sbi)) {
+ inode->i_mapping->a_ops = &f2fs_meta_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+ } else if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &f2fs_file_inode_operations;
+ inode->i_fop = &f2fs_file_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &f2fs_dir_inode_operations;
+ inode->i_fop = &f2fs_dir_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
+ __GFP_ZERO);
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = &f2fs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+ inode->i_op = &f2fs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ } else {
+ ret = -EIO;
+ goto bad_inode;
+ }
+ unlock_new_inode(inode);
+
+ return inode;
+
+bad_inode:
+ iget_failed(inode);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Copy the in-memory state of @inode into the raw inode held in
+ * @node_page and mark the page dirty.  Waits for writeback of the page
+ * to finish before modifying it.
+ */
+void update_inode(struct inode *inode, struct page *node_page)
+{
+ struct f2fs_node *rn;
+ struct f2fs_inode *ri;
+
+ wait_on_page_writeback(node_page);
+
+ rn = page_address(node_page);
+ ri = &(rn->i);
+
+ /* on-disk fields are little-endian */
+ ri->i_mode = cpu_to_le16(inode->i_mode);
+ ri->i_advise = F2FS_I(inode)->i_advise;
+ ri->i_uid = cpu_to_le32(i_uid_read(inode));
+ ri->i_gid = cpu_to_le32(i_gid_read(inode));
+ ri->i_links = cpu_to_le32(inode->i_nlink);
+ ri->i_size = cpu_to_le64(i_size_read(inode));
+ ri->i_blocks = cpu_to_le64(inode->i_blocks);
+ set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
+
+ ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+ ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+ ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+ ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth);
+ ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid);
+ ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
+ ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
+ ri->i_generation = cpu_to_le32(inode->i_generation);
+
+ /*
+ * Device nodes keep their device number in i_addr[]: slot 0 for
+ * the old encoding, slot 1 for the new one; the unused slot is
+ * zeroed so do_read_inode() can tell the two apart.
+ */
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (old_valid_dev(inode->i_rdev)) {
+ ri->i_addr[0] =
+ cpu_to_le32(old_encode_dev(inode->i_rdev));
+ ri->i_addr[1] = 0;
+ } else {
+ ri->i_addr[0] = 0;
+ ri->i_addr[1] =
+ cpu_to_le32(new_encode_dev(inode->i_rdev));
+ ri->i_addr[2] = 0;
+ }
+ }
+
+ set_cold_node(inode, node_page);
+ set_page_dirty(node_page);
+}
+
+/*
+ * Store the in-memory inode into its node page via update_inode().
+ *
+ * The node and meta inodes are skipped.  @wbc may be NULL (f2fs_rename()
+ * calls it that way); f2fs_balance_fs() is only invoked when it is set.
+ * Returns 0 on success or the error carried by get_node_page().
+ */
+int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct page *node_page;
+ bool need_lock = false;
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+ inode->i_ino == F2FS_META_INO(sbi))
+ return 0;
+
+ if (wbc)
+ f2fs_balance_fs(sbi);
+
+ node_page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(node_page))
+ return PTR_ERR(node_page);
+
+ /*
+ * A clean node page is about to become dirty: release it, take
+ * sbi->write_inode and fetch the page again before updating it.
+ * NOTE(review): presumably this serializes against checkpointing --
+ * confirm against the other users of sbi->write_inode.
+ */
+ if (!PageDirty(node_page)) {
+ need_lock = true;
+ f2fs_put_page(node_page, 1);
+ mutex_lock(&sbi->write_inode);
+ node_page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(node_page)) {
+ mutex_unlock(&sbi->write_inode);
+ return PTR_ERR(node_page);
+ }
+ }
+ update_inode(inode, node_page);
+ f2fs_put_page(node_page, 1);
+ if (need_lock)
+ mutex_unlock(&sbi->write_inode);
+ return 0;
+}
+
+/*
+ * Evict @inode from memory.
+ *
+ * The page cache of the inode is always dropped.  The on-disk inode is
+ * deleted as well (truncate + remove_inode_page()) only when the inode is
+ * neither the node nor the meta inode, has no links left and is not a bad
+ * inode.
+ */
+void f2fs_evict_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+
+ truncate_inode_pages(&inode->i_data, 0);
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+ inode->i_ino == F2FS_META_INO(sbi))
+ goto no_delete;
+
+ /* no dirty dentry pages may be left at this point */
+ BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents));
+ remove_dirty_dir_inode(inode);
+
+ /* still linked, or never set up properly: keep the on-disk inode */
+ if (inode->i_nlink || is_bad_inode(inode))
+ goto no_delete;
+
+ sb_start_intwrite(inode->i_sb);
+ set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
+ i_size_write(inode, 0);
+
+ if (F2FS_HAS_BLOCKS(inode))
+ f2fs_truncate(inode);
+
+ remove_inode_page(inode);
+ sb_end_intwrite(inode->i_sb);
+no_delete:
+ clear_inode(inode);
+}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
new file mode 100644
index 000000000000..1a49b881bac0
--- /dev/null
+++ b/fs/f2fs/namei.c
@@ -0,0 +1,503 @@
+/*
+ * fs/f2fs/namei.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Allocate a new in-memory inode under @dir with a freshly allocated nid,
+ * initialise owner, mode, timestamps and generation, and insert it locked
+ * into the inode hash.
+ *
+ * Returns the inode, or ERR_PTR(-ENOMEM) when new_inode() fails,
+ * ERR_PTR(-ENOSPC) when no nid is available, ERR_PTR(-EINVAL) when
+ * insert_inode_locked() fails.
+ */
+static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ nid_t ino;
+ struct inode *inode;
+ bool nid_free = false;
+ int err;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_lock_op(sbi, NODE_NEW);
+ if (!alloc_nid(sbi, &ino)) {
+ mutex_unlock_op(sbi, NODE_NEW);
+ err = -ENOSPC;
+ goto fail;
+ }
+ mutex_unlock_op(sbi, NODE_NEW);
+
+ inode->i_uid = current_fsuid();
+
+ /* setgid directory: inherit the group, and the bit for new dirs */
+ if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else {
+ inode->i_gid = current_fsgid();
+ }
+
+ inode->i_ino = ino;
+ inode->i_mode = mode;
+ inode->i_blocks = 0;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_generation = sbi->s_next_generation++;
+
+ err = insert_inode_locked(inode);
+ if (err) {
+ err = -EINVAL;
+ nid_free = true;
+ goto out;
+ }
+
+ mark_inode_dirty(inode);
+ return inode;
+
+out:
+ /*
+ * NOTE(review): this label is only reached after
+ * insert_inode_locked() failed -- confirm that calling
+ * unlock_new_inode() is valid for an inode in that state.
+ */
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+fail:
+ iput(inode);
+ /* the nid is handed back only when it had been allocated */
+ if (nid_free)
+ alloc_nid_failed(sbi, ino);
+ return ERR_PTR(err);
+}
+
+/*
+ * Check whether file name @s ends with extension @sub, either exactly as
+ * given or in its upper-case form.
+ *
+ * Returns 0 on a match and non-zero otherwise (memcmp() convention), so
+ * callers test "!is_multimedia_file(...)".  Also returns non-zero when
+ * @sub is longer than @s.
+ */
+static int is_multimedia_file(const unsigned char *s, const char *sub)
+{
+	size_t slen = strlen((const char *)s);
+	size_t sublen = strlen(sub);
+	char upper_sub[8];
+	size_t i;
+	int ret;
+
+	if (sublen > slen)
+		return 1;
+
+	ret = memcmp(s + slen - sublen, sub, sublen);
+	if (!ret)
+		return 0;
+
+	/*
+	 * An extension that does not fit into upper_sub[] cannot be
+	 * upper-cased completely.  Report "no match" rather than letting
+	 * memcmp() read past the initialised part of the buffer.
+	 */
+	if (sublen > sizeof(upper_sub))
+		return ret;
+
+	/* compare upper case; toupper() needs an unsigned char value */
+	for (i = 0; i < sublen; i++)
+		upper_sub[i] = toupper((unsigned char)sub[i]);
+	return memcmp(s + slen - sublen, upper_sub, sublen);
+}
+
+/*
+ * Hot/cold data separation: flag @inode as cold (FADVISE_COLD_BIT) when
+ * @name ends with one of the extensions listed in the raw superblock.
+ */
+static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
+		const unsigned char *name)
+{
+	__u8 (*ext)[8] = sbi->raw_super->extension_list;
+	int nr_ext = le32_to_cpu(sbi->raw_super->extension_count);
+	int idx;
+
+	for (idx = 0; idx < nr_ext; idx++) {
+		/* non-zero means "no match", try the next extension */
+		if (is_multimedia_file(name, ext[idx]))
+			continue;
+
+		F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+		return;
+	}
+}
+
+/*
+ * ->create: make a new regular file @dentry in @dir.
+ *
+ * Returns 0 on success, or the error from f2fs_new_inode() or
+ * f2fs_add_link().  On a link failure the new inode is dropped and its
+ * nid is handed back with alloc_nid_failed().
+ */
+static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ nid_t ino = 0;
+ int err;
+
+ f2fs_balance_fs(sbi);
+
+ inode = f2fs_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ /* extension-based cold marking unless disabled by mount option */
+ if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
+ set_cold_file(sbi, inode, dentry->d_name.name);
+
+ inode->i_op = &f2fs_file_inode_operations;
+ inode->i_fop = &f2fs_file_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ /* cached so the error path need not touch the inode after iput() */
+ ino = inode->i_ino;
+
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out;
+
+ alloc_nid_done(sbi, ino);
+
+ if (!sbi->por_doing)
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+ return 0;
+out:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ alloc_nid_failed(sbi, ino);
+ return err;
+}
+
+/*
+ * ->link: add @dentry in @dir as another name for the inode behind
+ * @old_dentry.  Takes an extra inode reference that is dropped again if
+ * f2fs_add_link() fails.  Returns 0 or the error from f2fs_add_link().
+ */
+static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	inode->i_ctime = CURRENT_TIME;
+	atomic_inc(&inode->i_count);
+
+	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	err = f2fs_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+
+	/* undo the flag and the reference taken above */
+	clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	iput(inode);
+	return err;
+}
+
+/*
+ * Find the parent of @child by looking up ".." in it.  Returns a dentry
+ * from d_obtain_alias(), or ERR_PTR(-ENOENT) when ".." has no inode
+ * number.
+ */
+struct dentry *f2fs_get_parent(struct dentry *child)
+{
+	struct inode *child_inode = child->d_inode;
+	struct qstr dotdot = QSTR_INIT("..", 2);
+	unsigned long parent_ino;
+
+	parent_ino = f2fs_inode_by_name(child_inode, &dotdot);
+	if (!parent_ino)
+		return ERR_PTR(-ENOENT);
+
+	return d_obtain_alias(f2fs_iget(child_inode->i_sb, parent_ino));
+}
+
+/*
+ * ->lookup: resolve @dentry in @dir.  A missing name results in
+ * d_splice_alias(NULL, ...).  Returns ERR_PTR(-ENAMETOOLONG) for names
+ * longer than F2FS_MAX_NAME_LEN, or the error from f2fs_iget().
+ */
+static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
+		unsigned int flags)
+{
+	struct f2fs_dir_entry *de;
+	struct inode *inode;
+	struct page *page;
+	nid_t ino;
+
+	if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	de = f2fs_find_entry(dir, &dentry->d_name, &page);
+	if (!de)
+		return d_splice_alias(NULL, dentry);
+
+	/* copy the inode number out before the entry's page is released */
+	ino = le32_to_cpu(de->ino);
+	kunmap(page);
+	f2fs_put_page(page, 0);
+
+	inode = f2fs_iget(dir->i_sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	return d_splice_alias(inode, dentry);
+}
+
+/*
+ * ->unlink: remove the entry @dentry from @dir.
+ *
+ * Returns 0 on success, -ENOENT when the entry is not found, or the error
+ * from check_orphan_space().
+ */
+static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct inode *inode = dentry->d_inode;
+	struct f2fs_dir_entry *de;
+	struct page *page;
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	de = f2fs_find_entry(dir, &dentry->d_name, &page);
+	if (!de)
+		return -ENOENT;
+
+	err = check_orphan_space(sbi);
+	if (err) {
+		/* give the entry's page back before bailing out */
+		kunmap(page);
+		f2fs_put_page(page, 0);
+		return err;
+	}
+
+	f2fs_delete_entry(de, page, inode);
+
+	/* In order to evict this inode, we set it dirty */
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+/*
+ * ->symlink: create the symbolic link @dentry in @dir pointing at
+ * @symname.
+ *
+ * Returns 0 on success, or the error from f2fs_new_inode(),
+ * f2fs_add_link() or page_symlink().
+ * NOTE(review): a page_symlink() failure is returned to the caller although
+ * the dentry has been instantiated by then -- confirm this is intended.
+ */
+static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
+		const char *symname)
+{
+	struct super_block *sb = dir->i_sb;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct inode *inode;
+	size_t symlen = strlen(symname) + 1;
+	nid_t ino;
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &f2fs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &f2fs_dblock_aops;
+	/* cache the nid: the inode must not be touched after iput() */
+	ino = inode->i_ino;
+
+	err = f2fs_add_link(dentry, inode);
+	if (err)
+		goto out;
+
+	err = page_symlink(inode, symname, symlen);
+	alloc_nid_done(sbi, ino);
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+	return err;
+out:
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	alloc_nid_failed(sbi, ino);
+	return err;
+}
+
+/*
+ * ->mkdir: create the directory @dentry in @dir.
+ *
+ * Returns 0 on success, or the error from f2fs_new_inode() or
+ * f2fs_add_link().  On a link failure the new inode is dropped and its
+ * nid is handed back with alloc_nid_failed().
+ */
+static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct inode *inode;
+	nid_t ino;
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	inode = f2fs_new_inode(dir, S_IFDIR | mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &f2fs_dir_inode_operations;
+	inode->i_fop = &f2fs_dir_operations;
+	inode->i_mapping->a_ops = &f2fs_dblock_aops;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+	/* cache the nid: the inode must not be touched after iput() */
+	ino = inode->i_ino;
+
+	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	err = f2fs_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+
+	alloc_nid_done(sbi, ino);
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	return 0;
+
+out_fail:
+	clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	alloc_nid_failed(sbi, ino);
+	return err;
+}
+
+/*
+ * ->rmdir: remove the directory @dentry from @dir.  Returns -ENOTEMPTY if
+ * the directory still has entries, otherwise the result of f2fs_unlink().
+ */
+static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	if (!f2fs_empty_dir(dentry->d_inode))
+		return -ENOTEMPTY;
+
+	return f2fs_unlink(dir, dentry);
+}
+
+/*
+ * ->mknod: create the special file @dentry in @dir for device @rdev.
+ *
+ * Returns 0 on success, -EINVAL when @rdev fails new_valid_dev(), or the
+ * error from f2fs_new_inode() or f2fs_add_link().  On a link failure the
+ * new inode is dropped and its nid is handed back with alloc_nid_failed().
+ */
+static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
+		umode_t mode, dev_t rdev)
+{
+	struct super_block *sb = dir->i_sb;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct inode *inode;
+	nid_t ino;
+	int err = 0;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	f2fs_balance_fs(sbi);
+
+	inode = f2fs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	init_special_inode(inode, inode->i_mode, rdev);
+	inode->i_op = &f2fs_special_inode_operations;
+	/* cache the nid: the inode must not be touched after iput() */
+	ino = inode->i_ino;
+
+	err = f2fs_add_link(dentry, inode);
+	if (err)
+		goto out;
+
+	alloc_nid_done(sbi, ino);
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+	return 0;
+out:
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	alloc_nid_failed(sbi, ino);
+	return err;
+}
+
+/*
+ * ->rename: move @old_dentry in @old_dir to @new_dentry in @new_dir,
+ * replacing an existing target if there is one.
+ *
+ * Returns 0 on success, or:
+ *   -ENOENT     the source entry (or an expected target entry) is missing,
+ *   -EIO        a directory source has no parent entry per f2fs_parent_dir(),
+ *   -ENOTEMPTY  a directory would replace a non-empty directory,
+ *   or the error from f2fs_add_link().
+ */
+static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct super_block *sb = old_dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct page *old_dir_page;
+ struct page *old_page;
+ struct f2fs_dir_entry *old_dir_entry = NULL;
+ struct f2fs_dir_entry *old_entry;
+ struct f2fs_dir_entry *new_entry;
+ int err = -ENOENT;
+
+ f2fs_balance_fs(sbi);
+
+ old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+ if (!old_entry)
+ goto out;
+
+ /* a directory is being moved: fetch its parent entry as well */
+ if (S_ISDIR(old_inode->i_mode)) {
+ err = -EIO;
+ old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
+ if (!old_dir_entry)
+ goto out_old;
+ }
+
+ mutex_lock_op(sbi, RENAME);
+
+ if (new_inode) {
+ /* the target exists: redirect its entry to old_inode */
+ struct page *new_page;
+
+ err = -ENOTEMPTY;
+ if (old_dir_entry && !f2fs_empty_dir(new_inode))
+ goto out_dir;
+
+ err = -ENOENT;
+ new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
+ &new_page);
+ if (!new_entry)
+ goto out_dir;
+
+ f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+
+ new_inode->i_ctime = CURRENT_TIME;
+ /* a replaced directory loses one link more than a file */
+ if (old_dir_entry)
+ drop_nlink(new_inode);
+ drop_nlink(new_inode);
+ if (!new_inode->i_nlink)
+ add_orphan_inode(sbi, new_inode->i_ino);
+ f2fs_write_inode(new_inode, NULL);
+ } else {
+ /* no target yet: add a fresh entry for old_inode */
+ err = f2fs_add_link(new_dentry, old_inode);
+ if (err)
+ goto out_dir;
+
+ if (old_dir_entry) {
+ inc_nlink(new_dir);
+ f2fs_write_inode(new_dir, NULL);
+ }
+ }
+
+ old_inode->i_ctime = CURRENT_TIME;
+ set_inode_flag(F2FS_I(old_inode), FI_NEED_CP);
+ mark_inode_dirty(old_inode);
+
+ f2fs_delete_entry(old_entry, old_page, NULL);
+
+ if (old_dir_entry) {
+ /* repoint the moved directory's parent entry if the parent changed */
+ if (old_dir != new_dir) {
+ f2fs_set_link(old_inode, old_dir_entry,
+ old_dir_page, new_dir);
+ } else {
+ kunmap(old_dir_page);
+ f2fs_put_page(old_dir_page, 0);
+ }
+ drop_nlink(old_dir);
+ f2fs_write_inode(old_dir, NULL);
+ }
+
+ mutex_unlock_op(sbi, RENAME);
+ return 0;
+
+out_dir:
+ if (old_dir_entry) {
+ kunmap(old_dir_page);
+ f2fs_put_page(old_dir_page, 0);
+ }
+ mutex_unlock_op(sbi, RENAME);
+out_old:
+ kunmap(old_page);
+ f2fs_put_page(old_page, 0);
+out:
+ return err;
+}
+
+/* Inode operations for directories (installed by f2fs_iget() and f2fs_mkdir()). */
+const struct inode_operations f2fs_dir_inode_operations = {
+ .create = f2fs_create,
+ .lookup = f2fs_lookup,
+ .link = f2fs_link,
+ .unlink = f2fs_unlink,
+ .symlink = f2fs_symlink,
+ .mkdir = f2fs_mkdir,
+ .rmdir = f2fs_rmdir,
+ .mknod = f2fs_mknod,
+ .rename = f2fs_rename,
+ .setattr = f2fs_setattr,
+ .get_acl = f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
+/* Inode operations for symbolic links (installed by f2fs_iget() and f2fs_symlink()). */
+const struct inode_operations f2fs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+ .setattr = f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
+/* Inode operations for char/block devices, FIFOs and sockets (installed by f2fs_iget() and f2fs_mknod()). */
+const struct inode_operations f2fs_special_inode_operations = {
+ .setattr = f2fs_setattr,
+ .get_acl = f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
new file mode 100644
index 000000000000..e275218904ed
--- /dev/null
+++ b/fs/f2fs/node.c
@@ -0,0 +1,1756 @@
+/*
+ * fs/f2fs/node.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/mpage.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+static struct kmem_cache *nat_entry_slab;
+static struct kmem_cache *free_nid_slab;
+
+/*
+ * Drop the dirty state of a node page, if any, and always mark the page
+ * not up to date.  For a dirty page this clears the dirty tag in the
+ * mapping's radix tree, clears the page flag and decrements the
+ * F2FS_DIRTY_NODES counter.
+ */
+static void clear_node_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+	unsigned long flags;
+
+	if (!PageDirty(page))
+		goto not_uptodate;
+
+	spin_lock_irqsave(&mapping->tree_lock, flags);
+	radix_tree_tag_clear(&mapping->page_tree, page_index(page),
+				PAGECACHE_TAG_DIRTY);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+	clear_page_dirty_for_io(page);
+	dec_page_count(sbi, F2FS_DIRTY_NODES);
+
+not_uptodate:
+	ClearPageUptodate(page);
+}
+
+/* Return the meta page at the current NAT address of @nid. */
+static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ pgoff_t index = current_nat_addr(sbi, nid);
+ return get_meta_page(sbi, index);
+}
+
+/*
+ * Return the NAT page that updates for @nid should go to.
+ *
+ * If the current NAT page is already dirty it is returned as is.
+ * Otherwise its contents are copied into the page at next_nat_addr(),
+ * that copy is marked dirty, set_to_next_nat() is called for @nid, and
+ * the copy is returned.
+ */
+static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ struct page *src_page;
+ struct page *dst_page;
+ pgoff_t src_off;
+ pgoff_t dst_off;
+ void *src_addr;
+ void *dst_addr;
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ src_off = current_nat_addr(sbi, nid);
+ dst_off = next_nat_addr(sbi, src_off);
+
+ /* get current nat block page with lock */
+ src_page = get_meta_page(sbi, src_off);
+
+ /* Dirty src_page means that it is already the new target NAT page. */
+ if (PageDirty(src_page))
+ return src_page;
+
+ dst_page = grab_meta_page(sbi, dst_off);
+
+ /* duplicate the whole block into the next location */
+ src_addr = page_address(src_page);
+ dst_addr = page_address(dst_page);
+ memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+ set_page_dirty(dst_page);
+ f2fs_put_page(src_page, 1);
+
+ set_to_next_nat(nm_i, nid);
+
+ return dst_page;
+}
+
+/*
+ * Readahead NAT pages
+ *
+ * Starts at the NAT block of @nid and issues reads for FREE_NID_PAGES
+ * consecutive NAT blocks, wrapping to nid 0 once max_nid is reached.
+ * Pages that cannot be grabbed are skipped.
+ */
+static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
+{
+ struct address_space *mapping = sbi->meta_inode->i_mapping;
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct page *page;
+ pgoff_t index;
+ int i;
+
+ for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
+ if (nid >= nm_i->max_nid)
+ nid = 0;
+ index = current_nat_addr(sbi, nid);
+
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ continue;
+ /*
+ * NOTE(review): the second argument of f2fs_put_page() differs
+ * between the two outcomes of f2fs_readpage(); presumably it
+ * says whether the page still has to be unlocked here --
+ * confirm against f2fs_readpage()/f2fs_put_page().
+ */
+ if (f2fs_readpage(sbi, page, index, READ)) {
+ f2fs_put_page(page, 1);
+ continue;
+ }
+ f2fs_put_page(page, 0);
+ }
+}
+
+/* Look up nid @n in the NAT cache radix tree; NULL when it is not cached. */
+static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
+{
+ return radix_tree_lookup(&nm_i->nat_root, n);
+}
+
+/*
+ * Collect up to @nr cached NAT entries with nid >= @start into @ep.
+ * Returns the number of entries stored.
+ */
+static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
+ nid_t start, unsigned int nr, struct nat_entry **ep)
+{
+ return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
+}
+
+/*
+ * Remove @e from its list and from the NAT cache radix tree, then free it.
+ * The visible caller (try_to_free_nats()) holds nat_tree_lock for writing.
+ */
+static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
+{
+ list_del(&e->list);
+ radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
+ nm_i->nat_cnt--;
+ /* free last: the nid of @e is still needed for the tree deletion */
+ kmem_cache_free(nat_entry_slab, e);
+}
+
+/*
+ * Returns 0 only when @nid has a cached NAT entry that is not flagged as
+ * checkpointed; returns 1 in every other case, including "not cached".
+ */
+int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct nat_entry *entry;
+	int checkpointed;
+
+	read_lock(&nm_i->nat_tree_lock);
+	entry = __lookup_nat_cache(nm_i, nid);
+	checkpointed = !entry || entry->checkpointed;
+	read_unlock(&nm_i->nat_tree_lock);
+
+	return checkpointed;
+}
+
+/*
+ * Allocate a zeroed NAT cache entry for @nid, insert it into the radix
+ * tree and append it to nm_i->nat_entries.
+ *
+ * Returns the entry, or NULL when the allocation or the radix tree insert
+ * fails.  The callers visible here hold nat_tree_lock for writing, which
+ * fits the GFP_ATOMIC allocation.
+ */
+static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+ struct nat_entry *new;
+
+ new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
+ if (!new)
+ return NULL;
+ if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
+ kmem_cache_free(nat_entry_slab, new);
+ return NULL;
+ }
+ /* the entry is initialised only after it has been inserted */
+ memset(new, 0, sizeof(struct nat_entry));
+ nat_set_nid(new, nid);
+ list_add_tail(&new->list, &nm_i->nat_entries);
+ nm_i->nat_cnt++;
+ return new;
+}
+
+/*
+ * Add the raw NAT entry @ne for @nid to the NAT cache unless an entry for
+ * @nid is cached already.  A newly added entry is flagged checkpointed.
+ *
+ * NOTE(review): when grab_nat_entry() fails the lock is dropped and the
+ * whole operation is retried immediately, without sleeping or yielding.
+ */
+static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
+ struct f2fs_nat_entry *ne)
+{
+ struct nat_entry *e;
+retry:
+ write_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, nid);
+ if (!e) {
+ e = grab_nat_entry(nm_i, nid);
+ if (!e) {
+ write_unlock(&nm_i->nat_tree_lock);
+ goto retry;
+ }
+ nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
+ nat_set_ino(e, le32_to_cpu(ne->ino));
+ nat_set_version(e, ne->version);
+ e->checkpointed = true;
+ }
+ write_unlock(&nm_i->nat_tree_lock);
+}
+
+/*
+ * Record @new_blkaddr as the block address of node @ni->nid in the NAT
+ * cache and mark the cache entry dirty.
+ *
+ * @ni must describe the state before the change; this is enforced by the
+ * BUG_ON() sanity checks below.  A missing cache entry is created first.
+ * NOTE(review): as in cache_nat_entry(), a failed grab_nat_entry() is
+ * retried immediately, without sleeping or yielding.
+ */
+static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
+ block_t new_blkaddr)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct nat_entry *e;
+retry:
+ write_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, ni->nid);
+ if (!e) {
+ e = grab_nat_entry(nm_i, ni->nid);
+ if (!e) {
+ write_unlock(&nm_i->nat_tree_lock);
+ goto retry;
+ }
+ e->ni = *ni;
+ e->checkpointed = true;
+ BUG_ON(ni->blk_addr == NEW_ADDR);
+ } else if (new_blkaddr == NEW_ADDR) {
+ /*
+ * when nid is reallocated,
+ * previous nat entry can be remained in nat cache.
+ * So, reinitialize it with new information.
+ */
+ e->ni = *ni;
+ BUG_ON(ni->blk_addr != NULL_ADDR);
+ }
+
+ /* a node that is being newly allocated is not checkpointed yet */
+ if (new_blkaddr == NEW_ADDR)
+ e->checkpointed = false;
+
+ /* sanity check */
+ BUG_ON(nat_get_blkaddr(e) != ni->blk_addr);
+ BUG_ON(nat_get_blkaddr(e) == NULL_ADDR &&
+ new_blkaddr == NULL_ADDR);
+ BUG_ON(nat_get_blkaddr(e) == NEW_ADDR &&
+ new_blkaddr == NEW_ADDR);
+ BUG_ON(nat_get_blkaddr(e) != NEW_ADDR &&
+ nat_get_blkaddr(e) != NULL_ADDR &&
+ new_blkaddr == NEW_ADDR);
+
+ /* increment version no as node is removed */
+ if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
+ unsigned char version = nat_get_version(e);
+ nat_set_version(e, inc_node_version(version));
+ }
+
+ /* change address */
+ nat_set_blkaddr(e, new_blkaddr);
+ __set_nat_cache_dirty(nm_i, e);
+ write_unlock(&nm_i->nat_tree_lock);
+}
+
+/*
+ * Free up to @nr_shrink entries from the head of nm_i->nat_entries.
+ *
+ * Nothing is freed, and 0 is returned, while fewer than
+ * 2 * NM_WOUT_THRESHOLD entries are cached.  Otherwise the return value is
+ * the part of @nr_shrink that could not be satisfied.
+ */
+static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+	if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD)
+		return 0;
+
+	write_lock(&nm_i->nat_tree_lock);
+	for (; nr_shrink && !list_empty(&nm_i->nat_entries); nr_shrink--) {
+		struct nat_entry *victim;
+
+		victim = list_first_entry(&nm_i->nat_entries,
+					struct nat_entry, list);
+		__del_from_nat_cache(nm_i, victim);
+	}
+	write_unlock(&nm_i->nat_tree_lock);
+
+	return nr_shrink;
+}
+
+/*
+ * Fill @ni with the node information (ino, block address, version) of
+ * @nid.  No error is reported to the caller.
+ *
+ * Lookup order:
+ *   1. the NAT cache,
+ *   2. the NAT journal in the summary block of the hot-data current
+ *      segment,
+ *   3. the on-disk NAT page.
+ * A result found in step 2 or 3 is added to the NAT cache afterwards.
+ */
+void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ nid_t start_nid = START_NID(nid);
+ struct f2fs_nat_block *nat_blk;
+ struct page *page = NULL;
+ struct f2fs_nat_entry ne;
+ struct nat_entry *e;
+ int i;
+
+ memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+ ni->nid = nid;
+
+ /* Check nat cache */
+ read_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, nid);
+ if (e) {
+ ni->ino = nat_get_ino(e);
+ ni->blk_addr = nat_get_blkaddr(e);
+ ni->version = nat_get_version(e);
+ }
+ read_unlock(&nm_i->nat_tree_lock);
+ if (e)
+ return;
+
+ /* Check current segment summary */
+ mutex_lock(&curseg->curseg_mutex);
+ i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
+ if (i >= 0) {
+ ne = nat_in_journal(sum, i);
+ node_info_from_raw_nat(ni, &ne);
+ }
+ mutex_unlock(&curseg->curseg_mutex);
+ if (i >= 0)
+ goto cache;
+
+ /* Fill node_info from nat page */
+ page = get_current_nat_page(sbi, start_nid);
+ nat_blk = (struct f2fs_nat_block *)page_address(page);
+ /* entries are indexed relative to the first nid of the block */
+ ne = nat_blk->entries[nid - start_nid];
+ node_info_from_raw_nat(ni, &ne);
+ f2fs_put_page(page, 1);
+cache:
+ /* cache nat entry */
+ cache_nat_entry(NM_I(sbi), nid, &ne);
+}
+
+/*
+ * Translate a file block index into the path of node offsets leading to it.
+ *
+ * offset[0] is the slot inside the inode (a data slot for level 0, otherwise
+ * one of the NODE_*_BLOCK slots); offset[1..level] are the slots inside each
+ * following node block.  noffset[i] is the node offset of the i-th node on
+ * the path (noffset[0] is always 0).  Returns the depth of the path, which is
+ * at most three (so four entries per array are enough).
+ */
+static int get_node_path(long block, int offset[4], unsigned int noffset[4])
+{
+	const long inode_addrs = ADDRS_PER_INODE;
+	const long dnode_addrs = ADDRS_PER_BLOCK;
+	const long nids_per_blk = NIDS_PER_BLOCK;
+	const long ind_addrs = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
+	const long dind_addrs = ind_addrs * NIDS_PER_BLOCK;
+
+	noffset[0] = 0;
+
+	/* addressed directly from the inode */
+	if (block < inode_addrs) {
+		offset[0] = block;
+		return 0;
+	}
+	block -= inode_addrs;
+
+	/* first direct node */
+	if (block < dnode_addrs) {
+		offset[0] = NODE_DIR1_BLOCK;
+		noffset[1] = 1;
+		offset[1] = block;
+		return 1;
+	}
+	block -= dnode_addrs;
+
+	/* second direct node */
+	if (block < dnode_addrs) {
+		offset[0] = NODE_DIR2_BLOCK;
+		noffset[1] = 2;
+		offset[1] = block;
+		return 1;
+	}
+	block -= dnode_addrs;
+
+	/* first indirect node */
+	if (block < ind_addrs) {
+		offset[0] = NODE_IND1_BLOCK;
+		noffset[1] = 3;
+		offset[1] = block / dnode_addrs;
+		noffset[2] = 4 + offset[1];
+		offset[2] = block % dnode_addrs;
+		return 2;
+	}
+	block -= ind_addrs;
+
+	/* second indirect node */
+	if (block < ind_addrs) {
+		offset[0] = NODE_IND2_BLOCK;
+		noffset[1] = 4 + nids_per_blk;
+		offset[1] = block / dnode_addrs;
+		noffset[2] = 5 + nids_per_blk + offset[1];
+		offset[2] = block % dnode_addrs;
+		return 2;
+	}
+	block -= ind_addrs;
+
+	/* double indirect node */
+	if (block < dind_addrs) {
+		offset[0] = NODE_DIND_BLOCK;
+		noffset[1] = 5 + (nids_per_blk * 2);
+		offset[1] = block / ind_addrs;
+		noffset[2] = 6 + (nids_per_blk * 2) +
+				offset[1] * (nids_per_blk + 1);
+		offset[2] = (block / dnode_addrs) % nids_per_blk;
+		noffset[3] = 7 + (nids_per_blk * 2) +
+				offset[1] * (nids_per_blk + 1) +
+				offset[2];
+		offset[3] = block % dnode_addrs;
+		return 3;
+	}
+
+	/* block index is beyond what the node tree can address */
+	BUG();
+	return 0;
+}
+
+/*
+ * Walk from the inode page down to the node page that holds the block
+ * address for file block @index, and describe it in @dn (nid, ofs_in_node,
+ * node_page, data_blkaddr).
+ *
+ * When @ro is zero, missing nodes on the path are allocated under the
+ * NODE_NEW operation lock.  When @ro is non-zero and the path is deeper than
+ * one level, the last node is fetched through get_node_page_ra().
+ *
+ * Returns 0 on success or a negative errno; on failure dn->inode_page and
+ * dn->node_page are reset to NULL.
+ * Caller should call f2fs_put_dnode(dn).
+ */
+int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct page *npage[4];
+ struct page *parent;
+ int offset[4];
+ unsigned int noffset[4];
+ nid_t nids[4];
+ int level, i;
+ int err = 0;
+
+ level = get_node_path(index, offset, noffset);
+
+ nids[0] = dn->inode->i_ino;
+ npage[0] = get_node_page(sbi, nids[0]);
+ if (IS_ERR(npage[0]))
+ return PTR_ERR(npage[0]);
+
+ parent = npage[0];
+ nids[1] = get_nid(parent, offset[0], true);
+ dn->inode_page = npage[0];
+ dn->inode_page_locked = true;
+
+ /* get indirect or direct nodes */
+ for (i = 1; i <= level; i++) {
+ bool done = false;
+
+ if (!nids[i] && !ro) {
+ mutex_lock_op(sbi, NODE_NEW);
+
+ /* alloc new node */
+ if (!alloc_nid(sbi, &(nids[i]))) {
+ mutex_unlock_op(sbi, NODE_NEW);
+ err = -ENOSPC;
+ goto release_pages;
+ }
+
+ dn->nid = nids[i];
+ npage[i] = new_node_page(dn, noffset[i]);
+ if (IS_ERR(npage[i])) {
+ alloc_nid_failed(sbi, nids[i]);
+ mutex_unlock_op(sbi, NODE_NEW);
+ err = PTR_ERR(npage[i]);
+ goto release_pages;
+ }
+
+ /* link the new node into its parent before publishing the nid */
+ set_nid(parent, offset[i - 1], nids[i], i == 1);
+ alloc_nid_done(sbi, nids[i]);
+ mutex_unlock_op(sbi, NODE_NEW);
+ done = true;
+ } else if (ro && i == level && level > 1) {
+ npage[i] = get_node_page_ra(parent, offset[i - 1]);
+ if (IS_ERR(npage[i])) {
+ err = PTR_ERR(npage[i]);
+ goto release_pages;
+ }
+ done = true;
+ }
+ /*
+ * The inode page (i == 1) is only unlocked, its reference stays with
+ * dn->inode_page; intermediate parents are unlocked and released.
+ */
+ if (i == 1) {
+ dn->inode_page_locked = false;
+ unlock_page(parent);
+ } else {
+ f2fs_put_page(parent, 1);
+ }
+
+ if (!done) {
+ npage[i] = get_node_page(sbi, nids[i]);
+ if (IS_ERR(npage[i])) {
+ err = PTR_ERR(npage[i]);
+ /* parent was already unlocked/released above */
+ f2fs_put_page(npage[0], 0);
+ goto release_out;
+ }
+ }
+ if (i < level) {
+ parent = npage[i];
+ nids[i + 1] = get_nid(parent, offset[i], false);
+ }
+ }
+ dn->nid = nids[level];
+ dn->ofs_in_node = offset[level];
+ dn->node_page = npage[level];
+ dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+ return 0;
+
+release_pages:
+ /* parent is still locked here; the inode page needs a separate put */
+ f2fs_put_page(parent, 1);
+ if (i > 1)
+ f2fs_put_page(npage[0], 0);
+release_out:
+ dn->inode_page = NULL;
+ dn->node_page = NULL;
+ return err;
+}
+
+/*
+ * Free the node identified by dn->nid whose page is dn->node_page.
+ *
+ * Unless the inode has no blocks at all, the node's block is invalidated,
+ * the valid node count is decremented and the NAT entry is set to NULL_ADDR.
+ * For the inode node itself the orphan entry and the valid inode count are
+ * dropped as well; for any other node the inode page is synced.
+ * The node page is always cleaned, released, and dn->node_page set to NULL.
+ */
+static void truncate_node(struct dnode_of_data *dn)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct node_info ni;
+
+ get_node_info(sbi, dn->nid, &ni);
+ /* an inode with no blocks must not own an allocated node block */
+ if (dn->inode->i_blocks == 0) {
+ BUG_ON(ni.blk_addr != NULL_ADDR);
+ goto invalidate;
+ }
+ BUG_ON(ni.blk_addr == NULL_ADDR);
+
+ /* Deallocate node address */
+ invalidate_blocks(sbi, ni.blk_addr);
+ dec_valid_node_count(sbi, dn->inode, 1);
+ set_node_addr(sbi, &ni, NULL_ADDR);
+
+ if (dn->nid == dn->inode->i_ino) {
+ remove_orphan_inode(sbi, dn->nid);
+ dec_valid_inode_count(sbi);
+ } else {
+ sync_inode_page(dn);
+ }
+invalidate:
+ clear_node_page_dirty(dn->node_page);
+ F2FS_SET_SB_DIRT(sbi);
+
+ f2fs_put_page(dn->node_page, 1);
+ dn->node_page = NULL;
+}
+
+/*
+ * Truncate the direct node dn->nid: free every data block it addresses and
+ * then the node itself.  Returns 1 when the node is gone (also when the nid
+ * is 0 or the node does not exist), or a negative errno when the node page
+ * cannot be read.
+ */
+static int truncate_dnode(struct dnode_of_data *dn)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct page *dpage;
+
+	if (!dn->nid)
+		return 1;
+
+	/* get direct node */
+	dpage = get_node_page(sbi, dn->nid);
+	if (IS_ERR(dpage)) {
+		/* a missing node counts as already truncated */
+		if (PTR_ERR(dpage) == -ENOENT)
+			return 1;
+		return PTR_ERR(dpage);
+	}
+
+	/* Make dnode_of_data for parameter */
+	dn->node_page = dpage;
+	dn->ofs_in_node = 0;
+	truncate_data_blocks(dn);
+	truncate_node(dn);
+
+	return 1;
+}
+
+/*
+ * Truncate the children of the indirect node dn->nid starting at slot @ofs.
+ *
+ * @nofs:  node offset of this indirect node
+ * @ofs:   first child slot to free
+ * @depth: 2 for an indirect node (children are direct nodes), 3 for a
+ *         double indirect node (children are indirect nodes, handled by
+ *         recursion)
+ *
+ * When @ofs is 0 the indirect node itself is freed too.  Returns a
+ * non-negative count used by the caller to advance its node offset, or a
+ * negative errno.
+ */
+static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
+ int ofs, int depth)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct dnode_of_data rdn = *dn;
+ struct page *page;
+ struct f2fs_node *rn;
+ nid_t child_nid;
+ unsigned int child_nofs;
+ int freed = 0;
+ int i, ret;
+
+ /* an absent indirect node is reported as a fully freed subtree */
+ if (dn->nid == 0)
+ return NIDS_PER_BLOCK + 1;
+
+ page = get_node_page(sbi, dn->nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ rn = (struct f2fs_node *)page_address(page);
+ if (depth < 3) {
+ /* children are direct nodes; freed counts every slot visited */
+ for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
+ child_nid = le32_to_cpu(rn->in.nid[i]);
+ if (child_nid == 0)
+ continue;
+ rdn.nid = child_nid;
+ ret = truncate_dnode(&rdn);
+ if (ret < 0)
+ goto out_err;
+ set_nid(page, i, 0, false);
+ }
+ } else {
+ /* children are indirect nodes; recurse one level down */
+ child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
+ for (i = ofs; i < NIDS_PER_BLOCK; i++) {
+ child_nid = le32_to_cpu(rn->in.nid[i]);
+ if (child_nid == 0) {
+ child_nofs += NIDS_PER_BLOCK + 1;
+ continue;
+ }
+ rdn.nid = child_nid;
+ ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
+ if (ret == (NIDS_PER_BLOCK + 1)) {
+ set_nid(page, i, 0, false);
+ child_nofs += ret;
+ } else if (ret < 0 && ret != -ENOENT) {
+ goto out_err;
+ }
+ }
+ freed = child_nofs;
+ }
+
+ if (!ofs) {
+ /* remove current indirect node */
+ dn->node_page = page;
+ truncate_node(dn);
+ freed++;
+ } else {
+ f2fs_put_page(page, 1);
+ }
+ return freed;
+
+out_err:
+ f2fs_put_page(page, 1);
+ return ret;
+}
+
+/*
+ * Free the direct nodes referenced by a partially truncated indirect node.
+ *
+ * @dn:     dnode descriptor of the inode being truncated
+ * @ri:     raw inode, used to read the top-level nid for offset[0]
+ * @offset: node path produced by get_node_path(); advanced on success so
+ *          that the caller continues with the next full subtree
+ * @depth:  depth of the path (2 or 3)
+ *
+ * Walks down depth - 1 indirect nodes, then truncates every direct node
+ * from slot offset[depth - 1] onwards in the last one.  That last indirect
+ * node is freed as well when truncation starts at its slot 0.
+ *
+ * Returns the last truncate_dnode() result (>= 0) or a negative errno.
+ * Every node page still held is released on all exit paths.
+ */
+static int truncate_partial_nodes(struct dnode_of_data *dn,
+			struct f2fs_inode *ri, int *offset, int depth)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct page *pages[2];
+	nid_t nid[3];
+	nid_t child_nid;
+	int err = 0;
+	int i;
+	/* index of the deepest indirect node page currently held */
+	int idx = depth - 2;
+
+	nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
+	if (!nid[0])
+		return 0;
+
+	/* get indirect nodes in the path */
+	for (i = 0; i < depth - 1; i++) {
+		/* reference count'll be increased */
+		pages[i] = get_node_page(sbi, nid[i]);
+		if (IS_ERR(pages[i])) {
+			err = PTR_ERR(pages[i]);
+			/* only pages[0..i-1] are held at this point */
+			idx = i - 1;
+			goto fail;
+		}
+		nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
+	}
+
+	/* free direct nodes linked to a partial indirect node */
+	for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) {
+		child_nid = get_nid(pages[idx], i, false);
+		if (!child_nid)
+			continue;
+		dn->nid = child_nid;
+		err = truncate_dnode(dn);
+		if (err < 0)
+			goto fail;	/* pages[0..idx] are still held */
+		set_nid(pages[idx], i, 0, false);
+	}
+
+	if (offset[depth - 1] == 0) {
+		/* the whole indirect node became empty: free it too */
+		dn->node_page = pages[idx];
+		dn->nid = nid[idx];
+		truncate_node(dn);
+	} else {
+		f2fs_put_page(pages[idx], 1);
+	}
+	offset[idx]++;
+	offset[depth - 1] = 0;
+	/* pages[idx] has been released just above */
+	idx--;
+fail:
+	for (i = idx; i >= 0; i--)
+		f2fs_put_page(pages[i], 1);
+	return err;
+}
+
+/*
+ * Free every node (and the data blocks below it) that covers file blocks
+ * at or after @from.
+ * All the block addresses of data and nodes should be nullified.
+ *
+ * A partially covered indirect subtree is handled first by
+ * truncate_partial_nodes(); the remaining top-level nids of the inode are
+ * then freed one by one.  Returns 0 on success or a negative errno.
+ *
+ * NOTE(review): the loop indexes i_nid[] with offset[0] - NODE_DIR1_BLOCK,
+ * so @from is presumably expected to lie beyond the blocks addressed
+ * directly by the inode (level >= 1) -- confirm against callers.
+ */
+int truncate_inode_blocks(struct inode *inode, pgoff_t from)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ int err = 0, cont = 1;
+ int level, offset[4], noffset[4];
+ unsigned int nofs = 0;
+ struct f2fs_node *rn;
+ struct dnode_of_data dn;
+ struct page *page;
+
+ level = get_node_path(from, offset, noffset);
+
+ page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ /* keep the reference on the inode page but do not hold its lock */
+ set_new_dnode(&dn, inode, page, NULL, 0);
+ unlock_page(page);
+
+ rn = page_address(page);
+ switch (level) {
+ case 0:
+ case 1:
+ nofs = noffset[1];
+ break;
+ case 2:
+ nofs = noffset[1];
+ if (!offset[level - 1])
+ goto skip_partial;
+ err = truncate_partial_nodes(&dn, &rn->i, offset, level);
+ if (err < 0 && err != -ENOENT)
+ goto fail;
+ nofs += 1 + NIDS_PER_BLOCK;
+ break;
+ case 3:
+ nofs = 5 + 2 * NIDS_PER_BLOCK;
+ if (!offset[level - 1])
+ goto skip_partial;
+ err = truncate_partial_nodes(&dn, &rn->i, offset, level);
+ if (err < 0 && err != -ENOENT)
+ goto fail;
+ break;
+ default:
+ BUG();
+ }
+
+skip_partial:
+ /* free the remaining top-level nids; the double indirect one is last */
+ while (cont) {
+ dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]);
+ switch (offset[0]) {
+ case NODE_DIR1_BLOCK:
+ case NODE_DIR2_BLOCK:
+ err = truncate_dnode(&dn);
+ break;
+
+ case NODE_IND1_BLOCK:
+ case NODE_IND2_BLOCK:
+ err = truncate_nodes(&dn, nofs, offset[1], 2);
+ break;
+
+ case NODE_DIND_BLOCK:
+ err = truncate_nodes(&dn, nofs, offset[1], 3);
+ cont = 0;
+ break;
+
+ default:
+ BUG();
+ }
+ if (err < 0 && err != -ENOENT)
+ goto fail;
+ /* the subtree was freed from its first slot: clear the nid in the inode */
+ if (offset[1] == 0 &&
+ rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) {
+ lock_page(page);
+ wait_on_page_writeback(page);
+ rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
+ set_page_dirty(page);
+ unlock_page(page);
+ }
+ offset[1] = 0;
+ offset[0]++;
+ nofs += err;
+ }
+fail:
+ f2fs_put_page(page, 0);
+ /* positive values are internal counts, not errors */
+ return err > 0 ? 0 : err;
+}
+
+/*
+ * Free the inode node of @inode together with its xattr node, if any.
+ *
+ * Runs under the NODE_TRUNC operation lock.  Returns 0 on success or a
+ * negative errno when the inode page or the xattr node page cannot be read;
+ * in the latter case the already acquired inode page is released before
+ * returning.
+ */
+int remove_inode_page(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct page *page;
+	nid_t ino = inode->i_ino;
+	struct dnode_of_data dn;
+
+	mutex_lock_op(sbi, NODE_TRUNC);
+	page = get_node_page(sbi, ino);
+	if (IS_ERR(page)) {
+		mutex_unlock_op(sbi, NODE_TRUNC);
+		return PTR_ERR(page);
+	}
+
+	if (F2FS_I(inode)->i_xattr_nid) {
+		nid_t nid = F2FS_I(inode)->i_xattr_nid;
+		struct page *npage = get_node_page(sbi, nid);
+
+		if (IS_ERR(npage)) {
+			/* do not leak the locked inode page */
+			f2fs_put_page(page, 1);
+			mutex_unlock_op(sbi, NODE_TRUNC);
+			return PTR_ERR(npage);
+		}
+
+		F2FS_I(inode)->i_xattr_nid = 0;
+		set_new_dnode(&dn, inode, page, npage, nid);
+		/* the inode page is held locked by this function */
+		dn.inode_page_locked = 1;
+		truncate_node(&dn);
+	}
+
+	/* 0 is possible, after f2fs_new_inode() is failed */
+	BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1);
+	set_new_dnode(&dn, inode, page, page, ino);
+	truncate_node(&dn);
+
+	mutex_unlock_op(sbi, NODE_TRUNC);
+	return 0;
+}
+
+/*
+ * Allocate and initialise the inode node page for a newly created @inode.
+ *
+ * @name is handed to init_dent_inode() together with the new page.  The
+ * allocation runs under the NODE_NEW operation lock.  Returns 0 on success
+ * or the negative errno reported by new_node_page().
+ */
+int new_inode_page(struct inode *inode, const struct qstr *name)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct page *page;
+	struct dnode_of_data dn;
+
+	/* allocate inode page for new inode */
+	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
+	mutex_lock_op(sbi, NODE_NEW);
+	page = new_node_page(&dn, 0);
+	if (IS_ERR(page)) {
+		/* never hand an error pointer to init_dent_inode() */
+		mutex_unlock_op(sbi, NODE_NEW);
+		return PTR_ERR(page);
+	}
+	init_dent_inode(name, page);
+	mutex_unlock_op(sbi, NODE_NEW);
+
+	f2fs_put_page(page, 1);
+	return 0;
+}
+
+/*
+ * Create a fresh node page for dn->nid at node offset @ofs.
+ *
+ * The page is grabbed from the node inode's mapping, marked uptodate, its
+ * footer is filled in and the NAT entry is switched to NEW_ADDR.  On success
+ * the page is returned (dn->node_page is set to it) and dirtied.
+ * Returns ERR_PTR(-EPERM) when the inode has FI_NO_ALLOC set,
+ * ERR_PTR(-ENOMEM) when no page can be grabbed and ERR_PTR(-ENOSPC) when
+ * the valid node count cannot be increased.
+ */
+struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+ struct node_info old_ni, new_ni;
+ struct page *page;
+ int err;
+
+ if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
+ return ERR_PTR(-EPERM);
+
+ page = grab_cache_page(mapping, dn->nid);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ get_node_info(sbi, dn->nid, &old_ni);
+
+ SetPageUptodate(page);
+ fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
+
+ /* Reinitialize old_ni with new node page */
+ /* a nid handed out for allocation must not have a block yet */
+ BUG_ON(old_ni.blk_addr != NULL_ADDR);
+ new_ni = old_ni;
+ new_ni.ino = dn->inode->i_ino;
+
+ if (!inc_valid_node_count(sbi, dn->inode, 1)) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ set_node_addr(sbi, &new_ni, NEW_ADDR);
+ set_cold_node(dn->inode, page);
+
+ dn->node_page = page;
+ sync_inode_page(dn);
+ set_page_dirty(page);
+ /* node offset 0 is the inode node itself */
+ if (ofs == 0)
+ inc_valid_inode_count(sbi);
+
+ return page;
+
+fail:
+ clear_node_page_dirty(page);
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+}
+
+/*
+ * Start reading the node whose nid equals page->index into @page.
+ * Returns -ENOENT when the NAT says the node has no block address,
+ * otherwise whatever f2fs_readpage() returns for the given request @type.
+ */
+static int read_node_page(struct page *page, int type)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+	struct node_info ni;
+
+	get_node_info(sbi, page->index, &ni);
+	if (ni.blk_addr != NULL_ADDR)
+		return f2fs_readpage(sbi, page, ni.blk_addr, type);
+
+	return -ENOENT;
+}
+
+/*
+ * Readahead a node page
+ *
+ * Nothing is done when the page for @nid is already cached and uptodate.
+ * Otherwise a READA request is issued through read_node_page(); the page
+ * reference taken here is always dropped before returning.
+ */
+void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+ struct page *apage;
+
+ apage = find_get_page(mapping, nid);
+ if (apage && PageUptodate(apage))
+ goto release_out;
+ f2fs_put_page(apage, 0);
+
+ apage = grab_cache_page(mapping, nid);
+ if (!apage)
+ return;
+
+ /* on failure the page is still locked by us, so unlock it here */
+ if (read_node_page(apage, READA))
+ unlock_page(apage);
+
+release_out:
+ f2fs_put_page(apage, 0);
+ return;
+}
+
+/*
+ * Get the node page for @nid, reading it synchronously (READ_SYNC).
+ * Returns the page, or ERR_PTR(-ENOMEM) when no page can be grabbed, or
+ * the error reported by read_node_page() (the page is released then).
+ */
+struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+ int err;
+ struct page *page;
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+
+ page = grab_cache_page(mapping, nid);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ err = read_node_page(page, READ_SYNC);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+ }
+
+ /* the footer of the page read must name the nid that was asked for */
+ BUG_ON(nid != nid_of_node(page));
+ mark_page_accessed(page);
+ return page;
+}
+
+/*
+ * Return a locked page for the desired node page.
+ * And, readahead MAX_RA_NODE number of node pages.
+ *
+ * @parent: node page holding the nid table
+ * @start:  slot in @parent of the wanted node
+ *
+ * Returns the locked page, or ERR_PTR(-ENOENT) when the slot is empty,
+ * ERR_PTR(-ENOMEM) when no page can be grabbed, ERR_PTR(-EIO) when the
+ * page is flagged with an error, or the error of read_node_page().
+ */
+struct page *get_node_page_ra(struct page *parent, int start)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
+	struct address_space *mapping = sbi->node_inode->i_mapping;
+	int i, end;
+	int err = 0;
+	nid_t nid;
+	struct page *page;
+
+	/* First, try getting the desired direct node. */
+	nid = get_nid(parent, start, false);
+	if (!nid)
+		return ERR_PTR(-ENOENT);
+
+	page = find_get_page(mapping, nid);
+	if (page && PageUptodate(page))
+		goto page_hit;
+	f2fs_put_page(page, 0);
+
+repeat:
+	page = grab_cache_page(mapping, nid);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	err = read_node_page(page, READA);
+	if (err) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(err);
+	}
+
+	/* Then, try readahead for siblings of the desired node */
+	end = start + MAX_RA_NODE;
+	end = min(end, NIDS_PER_BLOCK);
+	for (i = start + 1; i < end; i++) {
+		/*
+		 * Use a separate variable: "nid" must keep naming the wanted
+		 * node because the retry path below grabs the page again.
+		 */
+		nid_t ra_nid = get_nid(parent, i, false);
+
+		if (!ra_nid)
+			continue;
+		ra_node_page(sbi, ra_nid);
+	}
+
+page_hit:
+	lock_page(page);
+	if (PageError(page)) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-EIO);
+	}
+
+	/* Has the page been truncated? */
+	if (page->mapping != mapping) {
+		f2fs_put_page(page, 1);
+		goto repeat;
+	}
+	return page;
+}
+
+/*
+ * Propagate the in-memory inode of @dn into its on-disk inode page.
+ *
+ * If the node page is the inode page, update it directly.  Otherwise update
+ * dn->inode_page, taking its lock when @dn does not already hold it.  With
+ * no inode page at hand, fall back to f2fs_write_inode().
+ */
+void sync_inode_page(struct dnode_of_data *dn)
+{
+	bool take_lock;
+
+	if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
+		update_inode(dn->inode, dn->node_page);
+		return;
+	}
+
+	if (!dn->inode_page) {
+		f2fs_write_inode(dn->inode, NULL);
+		return;
+	}
+
+	take_lock = !dn->inode_page_locked;
+	if (take_lock)
+		lock_page(dn->inode_page);
+	update_inode(dn->inode, dn->inode_page);
+	if (take_lock)
+		unlock_page(dn->inode_page);
+}
+
+/*
+ * Write back dirty node pages of the node inode's mapping.
+ *
+ * @ino: 0 to flush all dirty node pages in three passes (see the step
+ *       comment below); otherwise only the pages owned by that inode are
+ *       written (fsync mode), starting directly at the last pass.
+ * @wbc: writeback control; nr_to_write is decremented per page written.
+ *
+ * Returns the number of dnode pages written in fsync mode (always 0 when
+ * @ino is 0).
+ */
+int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
+ struct writeback_control *wbc)
+{
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+ pgoff_t index, end;
+ struct pagevec pvec;
+ int step = ino ? 2 : 0;
+ int nwritten = 0, wrote = 0;
+
+ pagevec_init(&pvec, 0);
+
+next_step:
+ index = 0;
+ end = LONG_MAX;
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * flushing sequence with step:
+ * 0. indirect nodes
+ * 1. dentry dnodes
+ * 2. file dnodes
+ */
+ if (step == 0 && IS_DNODE(page))
+ continue;
+ if (step == 1 && (!IS_DNODE(page) ||
+ is_cold_node(page)))
+ continue;
+ if (step == 2 && (!IS_DNODE(page) ||
+ !is_cold_node(page)))
+ continue;
+
+ /*
+ * If an fsync mode,
+ * we should not skip writing node pages.
+ */
+ if (ino && ino_of_node(page) == ino)
+ lock_page(page);
+ else if (!trylock_page(page))
+ continue;
+
+ /* re-check everything now that the page lock is held */
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+ if (ino && ino_of_node(page) != ino)
+ goto continue_unlock;
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ /* called by fsync() */
+ if (ino && IS_DNODE(page)) {
+ int mark = !is_checkpointed_node(sbi, ino);
+ set_fsync_mark(page, 1);
+ if (IS_INODE(page))
+ set_dentry_mark(page, mark);
+ nwritten++;
+ } else {
+ set_fsync_mark(page, 0);
+ set_dentry_mark(page, 0);
+ }
+ mapping->a_ops->writepage(page, wbc);
+ wrote++;
+
+ if (--wbc->nr_to_write == 0)
+ break;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+
+ /* budget exhausted: force step to 2 so no further pass is started */
+ if (wbc->nr_to_write == 0) {
+ step = 2;
+ break;
+ }
+ }
+
+ if (step < 2) {
+ step++;
+ goto next_step;
+ }
+
+ if (wrote)
+ f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL);
+
+ return nwritten;
+}
+
+/*
+ * ->writepage for the node inode: write one locked, dirty node page to a
+ * newly allocated block and record the new address in the NAT.
+ *
+ * Pages written for reclaim are not written here: they are re-dirtied and
+ * AOP_WRITEPAGE_ACTIVATE is returned.  A page whose node has already been
+ * truncated (NULL_ADDR in the NAT) is dropped without I/O.  Returns 0
+ * otherwise, with the page unlocked.
+ */
+static int f2fs_write_node_page(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+	nid_t nid;
+	block_t new_addr;
+	struct node_info ni;
+
+	if (wbc->for_reclaim) {
+		/* set_page_dirty() below counts the page again */
+		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		wbc->pages_skipped++;
+		set_page_dirty(page);
+		return AOP_WRITEPAGE_ACTIVATE;
+	}
+
+	wait_on_page_writeback(page);
+
+	mutex_lock_op(sbi, NODE_WRITE);
+
+	/* get old block addr of this node page */
+	nid = nid_of_node(page);
+	BUG_ON(page->index != nid);
+
+	get_node_info(sbi, nid, &ni);
+
+	/* This page is already truncated */
+	if (ni.blk_addr == NULL_ADDR) {
+		/*
+		 * Leave in the same state as the normal path: dirty count
+		 * balanced, operation lock dropped and page unlocked.
+		 */
+		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		mutex_unlock_op(sbi, NODE_WRITE);
+		unlock_page(page);
+		return 0;
+	}
+
+	set_page_writeback(page);
+
+	/* insert node offset */
+	write_node_page(sbi, page, nid, ni.blk_addr, &new_addr);
+	set_node_addr(sbi, &ni, new_addr);
+	dec_page_count(sbi, F2FS_DIRTY_NODES);
+
+	mutex_unlock_op(sbi, NODE_WRITE);
+	unlock_page(page);
+	return 0;
+}
+
+/*
+ * It is very important to gather dirty pages and write them at once, so that
+ * we can submit a big bio without interfering with other data writes.
+ * By default, 512 pages (2MB), a segment size, is quite reasonable.
+ */
+#define COLLECT_DIRTY_NODES 512
+/*
+ * ->writepages for the node inode.
+ *
+ * If try_to_free_nats() returns non-zero, a checkpoint is written instead
+ * of flushing node pages.  Otherwise nothing is written until at least
+ * COLLECT_DIRTY_NODES dirty node pages have accumulated.  Always returns 0.
+ */
+static int f2fs_write_node_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ struct block_device *bdev = sbi->sb->s_bdev;
+ long nr_to_write = wbc->nr_to_write;
+
+ /* First check balancing cached NAT entries */
+ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
+ write_checkpoint(sbi, false);
+ return 0;
+ }
+
+ /* collect a number of dirty node pages and write together */
+ if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES)
+ return 0;
+
+ /*
+ * Limit this pass to bio_get_nr_vecs(bdev) pages, then charge the
+ * number actually consumed against the caller's original budget.
+ */
+ wbc->nr_to_write = bio_get_nr_vecs(bdev);
+ sync_node_pages(sbi, 0, wbc);
+ wbc->nr_to_write = nr_to_write -
+ (bio_get_nr_vecs(bdev) - wbc->nr_to_write);
+ return 0;
+}
+
+/*
+ * ->set_page_dirty for the node inode.  The page is always marked uptodate.
+ * If it was clean it is dirtied, counted in F2FS_DIRTY_NODES and flagged
+ * PagePrivate.  Returns 1 when the page was newly dirtied, 0 otherwise.
+ */
+static int f2fs_set_node_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+
+	SetPageUptodate(page);
+
+	/* already dirty: the page has been accounted for */
+	if (PageDirty(page))
+		return 0;
+
+	__set_page_dirty_nobuffers(page);
+	inc_page_count(sbi, F2FS_DIRTY_NODES);
+	SetPagePrivate(page);
+	return 1;
+}
+
+/*
+ * ->invalidatepage for the node inode: drop the dirty-node accounting of a
+ * page that is still dirty and clear its PagePrivate flag.  @offset is not
+ * used.
+ */
+static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+
+	if (PageDirty(page))
+		dec_page_count(sbi, F2FS_DIRTY_NODES);
+
+	ClearPagePrivate(page);
+}
+
+/*
+ * ->releasepage for the node inode: clears PagePrivate and returns 0.
+ * @wait is not used.
+ */
+static int f2fs_release_node_page(struct page *page, gfp_t wait)
+{
+ ClearPagePrivate(page);
+ return 0;
+}
+
+/*
+ * Structure of the f2fs node operations
+ *
+ * Address space operations of the node inode (sbi->node_inode), whose page
+ * cache holds the node pages indexed by nid.
+ */
+const struct address_space_operations f2fs_node_aops = {
+ .writepage = f2fs_write_node_page,
+ .writepages = f2fs_write_node_pages,
+ .set_page_dirty = f2fs_set_node_page_dirty,
+ .invalidatepage = f2fs_invalidate_node_page,
+ .releasepage = f2fs_release_node_page,
+};
+
+/*
+ * Find the free_nid entry for nid @n on the list @head.
+ * Returns the entry, or NULL when @n is not on the list.
+ */
+static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
+{
+	struct free_nid *fnid;
+
+	list_for_each_entry(fnid, head, list) {
+		if (fnid->nid == n)
+			return fnid;
+	}
+	return NULL;
+}
+
+/*
+ * Unlink @i from the free nid list and return it to free_nid_slab.
+ * The free nid counter is not touched here.
+ */
+static void __del_from_free_nid_list(struct free_nid *i)
+{
+ list_del(&i->list);
+ kmem_cache_free(free_nid_slab, i);
+}
+
+/*
+ * Add @nid to the tail of the free nid list in state NID_NEW.
+ * Returns 1 when the nid was added, 0 when the list already holds more
+ * than 2 * MAX_FREE_NIDS entries or already contains @nid.
+ */
+static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+ struct free_nid *i;
+
+ if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
+ return 0;
+retry:
+ /* allocation failure is not reported: keep retrying after yielding */
+ i = kmem_cache_alloc(free_nid_slab, GFP_NOFS);
+ if (!i) {
+ cond_resched();
+ goto retry;
+ }
+ i->nid = nid;
+ i->state = NID_NEW;
+
+ spin_lock(&nm_i->free_nid_list_lock);
+ if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) {
+ spin_unlock(&nm_i->free_nid_list_lock);
+ kmem_cache_free(free_nid_slab, i);
+ return 0;
+ }
+ list_add_tail(&i->list, &nm_i->free_nid_list);
+ nm_i->fcnt++;
+ spin_unlock(&nm_i->free_nid_list_lock);
+ return 1;
+}
+
+/*
+ * Take @nid off the free nid list, but only while it is still in state
+ * NID_NEW; entries in any other state are left alone.
+ */
+static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+	struct free_nid *fnid;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	fnid = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+	if (fnid && fnid->state == NID_NEW) {
+		__del_from_free_nid_list(fnid);
+		nm_i->fcnt--;
+	}
+	spin_unlock(&nm_i->free_nid_list_lock);
+}
+
+/*
+ * Scan one NAT block from @start_nid to the end of the block and add every
+ * nid whose block address is NULL_ADDR to the free nid list.
+ * Returns the number of nids actually added.
+ */
+static int scan_nat_page(struct f2fs_nm_info *nm_i,
+ struct page *nat_page, nid_t start_nid)
+{
+ struct f2fs_nat_block *nat_blk = page_address(nat_page);
+ block_t blk_addr;
+ int fcnt = 0;
+ int i;
+
+ /* 0 nid should not be used */
+ if (start_nid == 0)
+ ++start_nid;
+
+ i = start_nid % NAT_ENTRY_PER_BLOCK;
+
+ for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
+ blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
+ /* NEW_ADDR must never appear in an on-disk NAT entry */
+ BUG_ON(blk_addr == NEW_ADDR);
+ if (blk_addr == NULL_ADDR)
+ fcnt += add_free_nid(nm_i, start_nid);
+ }
+ return fcnt;
+}
+
+/*
+ * Refill the free nid list.
+ *
+ * NAT blocks are scanned starting at nm_i->next_scan_nid, wrapping at
+ * max_nid, until more than MAX_FREE_NIDS nids were collected or the scan
+ * has come back to where it started.  The result is then corrected with
+ * the NAT journal of the current summary block and with the nat cache.
+ */
+static void build_free_nids(struct f2fs_sb_info *sbi)
+{
+ struct free_nid *fnid, *next_fnid;
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ nid_t nid = 0;
+ bool is_cycled = false;
+ int fcnt = 0;
+ int i;
+
+ nid = nm_i->next_scan_nid;
+ nm_i->init_scan_nid = nid;
+
+ ra_nat_pages(sbi, nid);
+
+ while (1) {
+ struct page *page = get_current_nat_page(sbi, nid);
+
+ fcnt += scan_nat_page(nm_i, page, nid);
+ f2fs_put_page(page, 1);
+
+ /* advance to the first nid of the next NAT block */
+ nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
+
+ if (nid >= nm_i->max_nid) {
+ nid = 0;
+ is_cycled = true;
+ }
+ if (fcnt > MAX_FREE_NIDS)
+ break;
+ if (is_cycled && nm_i->init_scan_nid <= nid)
+ break;
+ }
+
+ nm_i->next_scan_nid = nid;
+
+ /* find free nids from current sum_pages */
+ mutex_lock(&curseg->curseg_mutex);
+ for (i = 0; i < nats_in_cursum(sum); i++) {
+ block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
+ nid = le32_to_cpu(nid_in_journal(sum, i));
+ if (addr == NULL_ADDR)
+ add_free_nid(nm_i, nid);
+ else
+ remove_free_nid(nm_i, nid);
+ }
+ mutex_unlock(&curseg->curseg_mutex);
+
+ /* remove the free nids from current allocated nids */
+ list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) {
+ struct nat_entry *ne;
+
+ read_lock(&nm_i->nat_tree_lock);
+ ne = __lookup_nat_cache(nm_i, fnid->nid);
+ if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
+ remove_free_nid(nm_i, fnid->nid);
+ read_unlock(&nm_i->nat_tree_lock);
+ }
+}
+
+/*
+ * If this function returns success, caller can obtain a new nid
+ * from second parameter of this function.
+ * The returned nid could be used ino as well as nid when inode is created.
+ *
+ * Returns false only when no free nid is left even after rebuilding the
+ * free nid list.  The chosen entry stays on the list in state NID_ALLOC
+ * until alloc_nid_done() or alloc_nid_failed() is called for it.
+ */
+bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct free_nid *i = NULL;
+ struct list_head *this;
+retry:
+ mutex_lock(&nm_i->build_lock);
+ if (!nm_i->fcnt) {
+ /* scan NAT in order to build free nid list */
+ build_free_nids(sbi);
+ if (!nm_i->fcnt) {
+ mutex_unlock(&nm_i->build_lock);
+ return false;
+ }
+ }
+ mutex_unlock(&nm_i->build_lock);
+
+ /*
+ * We check fcnt again since previous check is racy as
+ * we didn't hold free_nid_list_lock. So other thread
+ * could consume all of free nids.
+ */
+ spin_lock(&nm_i->free_nid_list_lock);
+ if (!nm_i->fcnt) {
+ spin_unlock(&nm_i->free_nid_list_lock);
+ goto retry;
+ }
+
+ BUG_ON(list_empty(&nm_i->free_nid_list));
+ /* pick the first entry that has not been handed out yet */
+ list_for_each(this, &nm_i->free_nid_list) {
+ i = list_entry(this, struct free_nid, list);
+ if (i->state == NID_NEW)
+ break;
+ }
+
+ BUG_ON(i->state != NID_NEW);
+ *nid = i->nid;
+ i->state = NID_ALLOC;
+ nm_i->fcnt--;
+ spin_unlock(&nm_i->free_nid_list_lock);
+ return true;
+}
+
+/*
+ * Finish an allocation started by alloc_nid(): the entry for @nid, which
+ * must be in state NID_ALLOC, is removed from the free nid list.
+ * alloc_nid() should be called prior to this function.
+ */
+void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *fnid;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	fnid = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+	if (fnid) {
+		BUG_ON(fnid->state != NID_ALLOC);
+		__del_from_free_nid_list(fnid);
+	}
+	spin_unlock(&nm_i->free_nid_list_lock);
+}
+
+/*
+ * Give back a nid obtained from alloc_nid() that could not be used: its
+ * NID_ALLOC entry is dropped and the nid is queued again as a new free nid.
+ * alloc_nid() should be called prior to this function.
+ */
+void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ alloc_nid_done(sbi, nid);
+ add_free_nid(NM_I(sbi), nid);
+}
+
+/*
+ * Recovery helper: rewrite @page from ni->blk_addr to @new_blkaddr with
+ * summary @sum, record the new address in the NAT and clean the page.
+ */
+void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
+ struct f2fs_summary *sum, struct node_info *ni,
+ block_t new_blkaddr)
+{
+ rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
+ set_node_addr(sbi, ni, new_blkaddr);
+ clear_node_page_dirty(page);
+}
+
+/*
+ * Recovery helper: rebuild an inode node page from the inode image found
+ * in @page.
+ *
+ * The inode fields preceding i_ext are copied; size is reset to 0, blocks
+ * and links to 1 and the xattr nid to 0.  The NAT entry of the inode is set
+ * to NEW_ADDR and the valid inode count is incremented.
+ * Returns 0 on success or -ENOMEM when no page can be grabbed.
+ */
+int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+ struct f2fs_node *src, *dst;
+ nid_t ino = ino_of_node(page);
+ struct node_info old_ni, new_ni;
+ struct page *ipage;
+
+ ipage = grab_cache_page(mapping, ino);
+ if (!ipage)
+ return -ENOMEM;
+
+ /* Should not use this inode from free nid list */
+ remove_free_nid(NM_I(sbi), ino);
+
+ get_node_info(sbi, ino, &old_ni);
+ SetPageUptodate(ipage);
+ fill_node_footer(ipage, ino, ino, 0, true);
+
+ src = (struct f2fs_node *)page_address(page);
+ dst = (struct f2fs_node *)page_address(ipage);
+
+ /* copy only the leading part of the raw inode, up to (not including) i_ext */
+ memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i);
+ dst->i.i_size = 0;
+ dst->i.i_blocks = cpu_to_le64(1);
+ dst->i.i_links = cpu_to_le32(1);
+ dst->i.i_xattr_nid = 0;
+
+ new_ni = old_ni;
+ new_ni.ino = ino;
+
+ set_node_addr(sbi, &new_ni, NEW_ADDR);
+ inc_valid_inode_count(sbi);
+
+ f2fs_put_page(ipage, 1);
+ return 0;
+}
+
+/*
+ * Rebuild the summary entries of node segment @segno by reading every
+ * block of the segment and taking the nid from its node footer.
+ *
+ * @sum: summary block whose entries are filled in (version and
+ *       ofs_in_node are set to 0)
+ *
+ * Returns -ENOMEM when the temporary page cannot be allocated and 0
+ * otherwise; a failed block read stops the scan early without reporting an
+ * error.
+ */
+int restore_node_summary(struct f2fs_sb_info *sbi,
+			unsigned int segno, struct f2fs_summary_block *sum)
+{
+	struct f2fs_node *rn;
+	struct f2fs_summary *sum_entry;
+	struct page *page;
+	block_t addr;
+	int i, last_offset;
+
+	/* alloc temporal page for read node */
+	page = alloc_page(GFP_NOFS | __GFP_ZERO);
+	/* alloc_page() reports failure with NULL, not with an ERR_PTR */
+	if (!page)
+		return -ENOMEM;
+	lock_page(page);
+
+	/* scan the node segment */
+	last_offset = sbi->blocks_per_seg;
+	addr = START_BLOCK(sbi, segno);
+	sum_entry = &sum->entries[0];
+
+	for (i = 0; i < last_offset; i++, sum_entry++) {
+		if (f2fs_readpage(sbi, page, addr, READ_SYNC))
+			goto out;
+
+		rn = (struct f2fs_node *)page_address(page);
+		sum_entry->nid = rn->footer.nid;
+		sum_entry->version = 0;
+		sum_entry->ofs_in_node = 0;
+		addr++;
+
+		/*
+		 * In order to read next node page,
+		 * we must clear PageUptodate flag.
+		 */
+		ClearPageUptodate(page);
+	}
+out:
+	unlock_page(page);
+	__free_pages(page, 0);
+	return 0;
+}
+
+/*
+ * If the NAT journal in the current summary block is full, move all of its
+ * entries into the nat cache as dirty entries and empty the journal.
+ *
+ * Returns true when the journal was emptied; in that case curseg_mutex is
+ * released before returning.  Returns false (nothing done) while the
+ * journal still has room.
+ */
+static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ int i;
+
+ mutex_lock(&curseg->curseg_mutex);
+
+ if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
+ mutex_unlock(&curseg->curseg_mutex);
+ return false;
+ }
+
+ for (i = 0; i < nats_in_cursum(sum); i++) {
+ struct nat_entry *ne;
+ struct f2fs_nat_entry raw_ne;
+ nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
+
+ raw_ne = nat_in_journal(sum, i);
+retry:
+ write_lock(&nm_i->nat_tree_lock);
+ ne = __lookup_nat_cache(nm_i, nid);
+ if (ne) {
+ /* the cached entry is kept as is; only its dirty state changes */
+ __set_nat_cache_dirty(nm_i, ne);
+ write_unlock(&nm_i->nat_tree_lock);
+ continue;
+ }
+ ne = grab_nat_entry(nm_i, nid);
+ if (!ne) {
+ write_unlock(&nm_i->nat_tree_lock);
+ goto retry;
+ }
+ nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr));
+ nat_set_ino(ne, le32_to_cpu(raw_ne.ino));
+ nat_set_version(ne, raw_ne.version);
+ __set_nat_cache_dirty(nm_i, ne);
+ write_unlock(&nm_i->nat_tree_lock);
+ }
+ /* i equals the number of journal entries consumed */
+ update_nats_in_cursum(sum, -i);
+ mutex_unlock(&curseg->curseg_mutex);
+ return true;
+}
+
+/*
+ * This function is called during the checkpointing process.
+ *
+ * Every dirty nat cache entry (except those still at NEW_ADDR) is written
+ * either into the NAT journal of the current summary block or, when the
+ * journal was just flushed or has no room, into its NAT page.  Entries that
+ * became NULL_ADDR are dropped from the cache and their nid is made
+ * available again; the others are marked clean and checkpointed.  Finally
+ * the nat cache is shrunk if necessary.
+ */
+void flush_nat_entries(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct list_head *cur, *n;
+ struct page *page = NULL;
+ struct f2fs_nat_block *nat_blk = NULL;
+ nid_t start_nid = 0, end_nid = 0;
+ bool flushed;
+
+ flushed = flush_nats_in_journal(sbi);
+
+ /* the journal is only used (and thus locked) when it was not flushed */
+ if (!flushed)
+ mutex_lock(&curseg->curseg_mutex);
+
+ /* 1) flush dirty nat caches */
+ list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) {
+ struct nat_entry *ne;
+ nid_t nid;
+ struct f2fs_nat_entry raw_ne;
+ int offset = -1;
+ block_t new_blkaddr;
+
+ ne = list_entry(cur, struct nat_entry, list);
+ nid = nat_get_nid(ne);
+
+ if (nat_get_blkaddr(ne) == NEW_ADDR)
+ continue;
+ if (flushed)
+ goto to_nat_page;
+
+ /* if there is room for nat entries in curseg->sumpage */
+ offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
+ if (offset >= 0) {
+ raw_ne = nat_in_journal(sum, offset);
+ goto flush_now;
+ }
+to_nat_page:
+ /* switch NAT page only when nid falls outside the one held */
+ if (!page || (start_nid > nid || nid > end_nid)) {
+ if (page) {
+ f2fs_put_page(page, 1);
+ page = NULL;
+ }
+ start_nid = START_NID(nid);
+ end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
+
+ /*
+ * get nat block with dirty flag, increased reference
+ * count, mapped and lock
+ */
+ page = get_next_nat_page(sbi, start_nid);
+ nat_blk = page_address(page);
+ }
+
+ BUG_ON(!nat_blk);
+ raw_ne = nat_blk->entries[nid - start_nid];
+flush_now:
+ new_blkaddr = nat_get_blkaddr(ne);
+
+ raw_ne.ino = cpu_to_le32(nat_get_ino(ne));
+ raw_ne.block_addr = cpu_to_le32(new_blkaddr);
+ raw_ne.version = nat_get_version(ne);
+
+ /* offset < 0 means the entry goes to the NAT page, not the journal */
+ if (offset < 0) {
+ nat_blk->entries[nid - start_nid] = raw_ne;
+ } else {
+ nat_in_journal(sum, offset) = raw_ne;
+ nid_in_journal(sum, offset) = cpu_to_le32(nid);
+ }
+
+ if (nat_get_blkaddr(ne) == NULL_ADDR) {
+ write_lock(&nm_i->nat_tree_lock);
+ __del_from_nat_cache(nm_i, ne);
+ write_unlock(&nm_i->nat_tree_lock);
+
+ /* We can reuse this freed nid at this point */
+ add_free_nid(NM_I(sbi), nid);
+ } else {
+ write_lock(&nm_i->nat_tree_lock);
+ __clear_nat_cache_dirty(nm_i, ne);
+ ne->checkpointed = true;
+ write_unlock(&nm_i->nat_tree_lock);
+ }
+ }
+ if (!flushed)
+ mutex_unlock(&curseg->curseg_mutex);
+ f2fs_put_page(page, 1);
+
+ /* 2) shrink nat caches if necessary */
+ try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
+}
+
+/*
+ * Initialise the node manager (NM_I(sbi)) from the raw super block and the
+ * checkpoint: NAT geometry, counters, lists, locks, the scan start position
+ * and a private copy of the NAT version bitmap.
+ * Returns 0 on success, -ENOMEM when the bitmap cannot be allocated, or
+ * -EFAULT when the checkpoint provides no NAT bitmap.
+ */
+static int init_node_manager(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned char *version_bitmap;
+ unsigned int nat_segs, nat_blocks;
+
+ nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
+
+ /* segment_count_nat includes pair segment so divide to 2. */
+ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
+ nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
+ nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
+ nm_i->fcnt = 0;
+ nm_i->nat_cnt = 0;
+
+ INIT_LIST_HEAD(&nm_i->free_nid_list);
+ INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
+ INIT_LIST_HEAD(&nm_i->nat_entries);
+ INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
+
+ mutex_init(&nm_i->build_lock);
+ spin_lock_init(&nm_i->free_nid_list_lock);
+ rwlock_init(&nm_i->nat_tree_lock);
+
+ nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
+ nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
+ nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
+
+ nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL);
+ if (!nm_i->nat_bitmap)
+ return -ENOMEM;
+ version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
+ /* nat_bitmap is not freed here; presumably left to destroy_node_manager() -- confirm */
+ if (!version_bitmap)
+ return -EFAULT;
+
+ /* copy version bitmap */
+ memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size);
+ return 0;
+}
+
+/*
+ * Allocate and initialise sbi->nm_info, then build the initial free nid
+ * list.  Returns 0 on success, -ENOMEM when the structure cannot be
+ * allocated, or the error of init_node_manager().
+ */
+int build_node_manager(struct f2fs_sb_info *sbi)
+{
+	int ret;
+
+	sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
+	if (sbi->nm_info == NULL)
+		return -ENOMEM;
+
+	ret = init_node_manager(sbi);
+	if (ret != 0)
+		return ret;
+
+	build_free_nids(sbi);
+
+	return 0;
+}
+
+/*
+ * Tear down the node manager: free every free nid entry and every nat
+ * cache entry, then the NAT bitmap and the manager structure itself.
+ * Does nothing when no node manager exists.
+ */
+void destroy_node_manager(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct free_nid *i, *next_i;
+ struct nat_entry *natvec[NATVEC_SIZE];
+ nid_t nid = 0;
+ unsigned int found;
+
+ if (!nm_i)
+ return;
+
+ /* destroy free nid list */
+ spin_lock(&nm_i->free_nid_list_lock);
+ list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
+ /* no allocation may still be in flight at this point */
+ BUG_ON(i->state == NID_ALLOC);
+ __del_from_free_nid_list(i);
+ nm_i->fcnt--;
+ }
+ BUG_ON(nm_i->fcnt);
+ spin_unlock(&nm_i->free_nid_list_lock);
+
+ /* destroy nat cache */
+ write_lock(&nm_i->nat_tree_lock);
+ /* look entries up in batches of NATVEC_SIZE, resuming after the last nid */
+ while ((found = __gang_lookup_nat_cache(nm_i,
+ nid, NATVEC_SIZE, natvec))) {
+ unsigned idx;
+ for (idx = 0; idx < found; idx++) {
+ struct nat_entry *e = natvec[idx];
+ nid = nat_get_nid(e) + 1;
+ __del_from_nat_cache(nm_i, e);
+ }
+ }
+ BUG_ON(nm_i->nat_cnt);
+ write_unlock(&nm_i->nat_tree_lock);
+
+ kfree(nm_i->nat_bitmap);
+ sbi->nm_info = NULL;
+ kfree(nm_i);
+}
+
+/*
+ * Create the slab caches for nat entries and free nid entries.
+ * Returns 0 on success or -ENOMEM; nothing stays allocated on failure.
+ */
+int __init create_node_manager_caches(void)
+{
+	nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
+			sizeof(struct nat_entry), NULL);
+	if (!nat_entry_slab)
+		goto fail;
+
+	free_nid_slab = f2fs_kmem_cache_create("free_nid",
+			sizeof(struct free_nid), NULL);
+	if (!free_nid_slab)
+		goto destroy_nat_entry;
+
+	return 0;
+
+destroy_nat_entry:
+	kmem_cache_destroy(nat_entry_slab);
+fail:
+	return -ENOMEM;
+}
+
+/*
+ * Destroy the slab caches created by create_node_manager_caches(), in
+ * reverse order of creation.
+ */
+void destroy_node_manager_caches(void)
+{
+ kmem_cache_destroy(free_nid_slab);
+ kmem_cache_destroy(nat_entry_slab);
+}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
new file mode 100644
index 000000000000..afdb130f782e
--- /dev/null
+++ b/fs/f2fs/node.h
@@ -0,0 +1,353 @@
+/*
+ * fs/f2fs/node.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+/*
+ * start node id of a node block dedicated to the given node id
+ * (the argument is parenthesized so that expressions such as
+ * START_NID(nid + 1) expand correctly)
+ */
+#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
+
+/* node block offset on the NAT area dedicated to the given start node id */
+#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK)
+
+/* # of pages to perform readahead before building free nids */
+#define FREE_NID_PAGES 4
+
+/* maximum # of free node ids to produce during build_free_nids */
+#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
+
+/* maximum readahead size for node during getting data blocks */
+#define MAX_RA_NODE 128
+
+/* maximum cached nat entries to manage memory footprint */
+#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK)
+
+/* vector size for gang look-up from nat cache that consists of radix tree */
+#define NATVEC_SIZE 64
+
+/*
+ * For node information
+ */
+/*
+ * In-memory description of one node. node_info_from_raw_nat() fills ino,
+ * blk_addr and version from an on-disk NAT entry; nid is not set there.
+ */
+struct node_info {
+ nid_t nid; /* node id */
+ nid_t ino; /* inode number of the node's owner */
+ block_t blk_addr; /* block address of the node */
+ unsigned char version; /* version of the node */
+};
+
+/*
+ * One cached NAT entry. @list links the entry into either the
+ * nat_entries or the dirty_nat_entries list of the node manager (see
+ * __clear_nat_cache_dirty / __set_nat_cache_dirty).
+ */
+struct nat_entry {
+ struct list_head list; /* for clean or dirty nat list */
+ bool checkpointed; /* whether it is checkpointed or not */
+ struct node_info ni; /* in-memory node information */
+};
+
+/*
+ * Accessors for the node_info embedded in a struct nat_entry pointer.
+ * All macro arguments are parenthesized so that arbitrary expressions can
+ * be passed safely.
+ */
+#define nat_get_nid(nat)		((nat)->ni.nid)
+#define nat_set_nid(nat, n)		((nat)->ni.nid = (n))
+#define nat_get_blkaddr(nat)		((nat)->ni.blk_addr)
+#define nat_set_blkaddr(nat, b)		((nat)->ni.blk_addr = (b))
+#define nat_get_ino(nat)		((nat)->ni.ino)
+#define nat_set_ino(nat, i)		((nat)->ni.ino = (i))
+#define nat_get_version(nat)		((nat)->ni.version)
+#define nat_set_version(nat, v)		((nat)->ni.version = (v))
+
+/*
+ * Move a cached nat entry to the tail of the dirty / clean list of the
+ * node manager. The expansion keeps its original trailing ';' for
+ * compatibility with existing callers, so use these as full statements only.
+ */
+#define __set_nat_cache_dirty(nm_i, ne)					\
+	list_move_tail(&(ne)->list, &(nm_i)->dirty_nat_entries);
+#define __clear_nat_cache_dirty(nm_i, ne)				\
+	list_move_tail(&(ne)->list, &(nm_i)->nat_entries);
+/* pre-increment @version in place; evaluates to the new value */
+#define inc_node_version(version)	(++(version))
+
+/*
+ * Fill @ni from the on-disk NAT entry @raw_ne, converting the
+ * little-endian ino and block address to CPU byte order. ni->nid is left
+ * untouched.
+ */
+static inline void node_info_from_raw_nat(struct node_info *ni,
+ struct f2fs_nat_entry *raw_ne)
+{
+ ni->ino = le32_to_cpu(raw_ne->ino);
+ ni->blk_addr = le32_to_cpu(raw_ne->block_addr);
+ ni->version = raw_ne->version;
+}
+
+/*
+ * For free nid management
+ */
+/*
+ * State of an entry on the free nid list. destroy_node_manager() asserts
+ * that no entry is still NID_ALLOC when the list is torn down.
+ */
+enum nid_state {
+ NID_NEW, /* newly added to free nid list */
+ NID_ALLOC /* it is allocated */
+};
+
+/* One entry of the node manager's free_nid_list. */
+struct free_nid {
+ struct list_head list; /* for free node id list */
+ nid_t nid; /* node id */
+ int state; /* in use or not: NID_NEW or NID_ALLOC */
+};
+
+/*
+ * Peek at the first entry of the free nid list.
+ *
+ * On success stores its node id in @nid and returns 0; returns -1 (leaving
+ * @nid untouched) when no free nid is available. The entry is not removed
+ * from the list.
+ *
+ * fcnt is checked while holding free_nid_list_lock: it is modified under
+ * that lock, so testing it beforehand could let the list drain in between
+ * and make list_entry() interpret the list head itself as a free_nid.
+ */
+static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *fnid;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	if (nm_i->fcnt <= 0) {
+		spin_unlock(&nm_i->free_nid_list_lock);
+		return -1;
+	}
+	fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
+	*nid = fnid->nid;
+	spin_unlock(&nm_i->free_nid_list_lock);
+	return 0;
+}
+
+/*
+ * inline functions
+ */
+/*
+ * Copy the node manager's NAT bitmap (nm_i->bitmap_size bytes) into @addr.
+ * The caller must provide a buffer of at least that size.
+ */
+static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size);
+}
+
+/*
+ * Return the block address of the NAT block that currently holds the
+ * entries starting at node id @start.
+ *
+ * Each segment's worth of NAT blocks occupies two consecutive segments on
+ * disk (hence the "<< 1"); the bit for the block in nat_bitmap selects the
+ * second of the two by adding blocks_per_seg.
+ */
+static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ pgoff_t block_off;
+ pgoff_t block_addr;
+ int seg_off;
+
+ /* index of the NAT block, and of the segment it falls into */
+ block_off = NAT_BLOCK_OFFSET(start);
+ seg_off = block_off >> sbi->log_blocks_per_seg;
+
+ /* base + (segment index * 2 segments) + offset inside the segment */
+ block_addr = (pgoff_t)(nm_i->nat_blkaddr +
+ (seg_off << sbi->log_blocks_per_seg << 1) +
+ (block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
+
+ /* bit set: the valid copy lives one segment further */
+ if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
+ block_addr += sbi->blocks_per_seg;
+
+ return block_addr;
+}
+
+/*
+ * Given the address of a NAT block, return the address of its counterpart
+ * in the paired segment: blocks in an odd segment (relative to
+ * nat_blkaddr) map one segment back, blocks in an even segment map one
+ * segment forward.
+ */
+static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi,
+						pgoff_t block_addr)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	pgoff_t rel = block_addr - nm_i->nat_blkaddr;
+
+	if ((rel >> sbi->log_blocks_per_seg) % 2 == 0)
+		rel += sbi->blocks_per_seg;
+	else
+		rel -= sbi->blocks_per_seg;
+
+	return rel + nm_i->nat_blkaddr;
+}
+
+/*
+ * Toggle the nat_bitmap bit of the NAT block that covers @start_nid, i.e.
+ * switch which of the two copies current_nat_addr() will pick.
+ */
+static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
+{
+	unsigned int nat_blk = NAT_BLOCK_OFFSET(start_nid);
+
+	if (!f2fs_test_bit(nat_blk, nm_i->nat_bitmap))
+		f2fs_set_bit(nat_blk, nm_i->nat_bitmap);
+	else
+		f2fs_clear_bit(nat_blk, nm_i->nat_bitmap);
+}
+
+/*
+ * Write @nid, @ino and the node offset @ofs into the footer of the node
+ * block held in @page. With @reset the whole f2fs_node is zeroed first.
+ * The flag field is overwritten with only the shifted offset.
+ */
+static inline void fill_node_footer(struct page *page, nid_t nid,
+				nid_t ino, unsigned int ofs, bool reset)
+{
+	struct f2fs_node *node = page_address(page);
+
+	if (reset)
+		memset(node, 0, sizeof(*node));
+
+	node->footer.nid = cpu_to_le32(nid);
+	node->footer.ino = cpu_to_le32(ino);
+	node->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT);
+}
+
+/* Copy the whole node footer from the node block in @src to the one in @dst. */
+static inline void copy_node_footer(struct page *dst, struct page *src)
+{
+	struct f2fs_node *from = page_address(src);
+	struct f2fs_node *to = page_address(dst);
+
+	memcpy(&to->footer, &from->footer, sizeof(struct node_footer));
+}
+
+/*
+ * Stamp the footer of the node block in @page with the current checkpoint
+ * version (copied as stored in the checkpoint, without byte swapping) and
+ * with @blkaddr as the address of the next node block.
+ */
+static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	struct f2fs_node *node = page_address(page);
+
+	node->footer.cp_ver = ckpt->checkpoint_ver;
+	node->footer.next_blkaddr = cpu_to_le32(blkaddr);
+}
+
+/* Return the owner inode number recorded in the footer of @node_page. */
+static inline nid_t ino_of_node(struct page *node_page)
+{
+	struct f2fs_node *node = page_address(node_page);
+
+	return le32_to_cpu(node->footer.ino);
+}
+
+/* Return the node id recorded in the footer of @node_page. */
+static inline nid_t nid_of_node(struct page *node_page)
+{
+	struct f2fs_node *node = page_address(node_page);
+
+	return le32_to_cpu(node->footer.nid);
+}
+
+/*
+ * Return the node offset of @node_page, i.e. the footer flag word with the
+ * low OFFSET_BIT_SHIFT bits shifted out.
+ */
+static inline unsigned int ofs_of_node(struct page *node_page)
+{
+	struct f2fs_node *node = page_address(node_page);
+	unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+
+	return footer_flag >> OFFSET_BIT_SHIFT;
+}
+
+/* Return the checkpoint version recorded in the footer of @node_page. */
+static inline unsigned long long cpver_of_node(struct page *node_page)
+{
+	struct f2fs_node *node = page_address(node_page);
+
+	return le64_to_cpu(node->footer.cp_ver);
+}
+
+/* Return the next-node block address recorded in the footer of @node_page. */
+static inline block_t next_blkaddr_of_node(struct page *node_page)
+{
+	struct f2fs_node *node = page_address(node_page);
+
+	return le32_to_cpu(node->footer.next_blkaddr);
+}
+
+/*
+ * f2fs assigns the following node offsets described as (num).
+ * N = NIDS_PER_BLOCK
+ *
+ * Inode block (0)
+ * |- direct node (1)
+ * |- direct node (2)
+ * |- indirect node (3)
+ * | `- direct node (4 => 4 + N - 1)
+ * |- indirect node (4 + N)
+ * | `- direct node (5 + N => 5 + 2N - 1)
+ * `- double indirect node (5 + 2N)
+ * `- indirect node (6 + 2N)
+ * `- direct node (x(N + 1))
+ */
+/*
+ * Return true when @node_page is a direct node (or the inode block) and
+ * false when it is an indirect or double indirect node, judged by its
+ * node offset.
+ *
+ * With N = NIDS_PER_BLOCK, offsets 3 and 4 + N are the single indirect
+ * nodes and 5 + 2N is the double indirect node. Below the double indirect
+ * node, each indirect node is followed by its N direct nodes, so indirect
+ * nodes sit at 6 + 2N + k * (N + 1): a remainder of zero marks an
+ * indirect node, any other remainder a direct node.
+ */
+static inline bool IS_DNODE(struct page *node_page)
+{
+	unsigned int ofs = ofs_of_node(node_page);
+
+	if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
+			ofs == 5 + 2 * NIDS_PER_BLOCK)
+		return false;
+	if (ofs >= 6 + 2 * NIDS_PER_BLOCK) {
+		ofs -= 6 + 2 * NIDS_PER_BLOCK;
+		/* multiples of (N + 1) are the indirect nodes */
+		if (!((long int)ofs % (NIDS_PER_BLOCK + 1)))
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Store @nid at slot @off of the node block in @p and mark the page dirty.
+ * When @i is true the page is an inode block and @off is translated into
+ * an index of i_nid[] by subtracting NODE_DIR1_BLOCK; otherwise @off
+ * indexes the nid array of an indirect node directly. Waits for any
+ * writeback of the page to finish before modifying it.
+ */
+static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
+{
+ struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
+
+ wait_on_page_writeback(p);
+
+ if (i)
+ rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
+ else
+ rn->in.nid[off] = cpu_to_le32(nid);
+ set_page_dirty(p);
+}
+
+/*
+ * Read the node id stored at slot @off of the node block in @p. For an
+ * inode block (@i true) the slot is i_nid[off - NODE_DIR1_BLOCK], for an
+ * indirect node it is nid[off].
+ */
+static inline nid_t get_nid(struct page *p, int off, bool i)
+{
+	struct f2fs_node *node = page_address(p);
+
+	if (!i)
+		return le32_to_cpu(node->in.nid[off]);
+
+	return le32_to_cpu(node->i.i_nid[off - NODE_DIR1_BLOCK]);
+}
+
+/*
+ * Coldness identification:
+ * - Mark cold files in f2fs_inode_info
+ * - Mark cold node blocks in their node footer
+ * - Mark cold data pages in page cache
+ */
+/* Non-zero when FADVISE_COLD_BIT is set in the inode's i_advise field. */
+static inline int is_cold_file(struct inode *inode)
+{
+ return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
+}
+
+/* Cold data pages are tracked with the page's Checked flag. */
+static inline int is_cold_data(struct page *page)
+{
+ return PageChecked(page);
+}
+
+/* Mark @page as cold data by setting its Checked flag. */
+static inline void set_cold_data(struct page *page)
+{
+ SetPageChecked(page);
+}
+
+/* Drop the cold data mark of @page by clearing its Checked flag. */
+static inline void clear_cold_data(struct page *page)
+{
+ ClearPageChecked(page);
+}
+
+/* Non-zero when the cold bit is set in the footer flag of the node @page. */
+static inline int is_cold_node(struct page *page)
+{
+	struct f2fs_node *node = page_address(page);
+	unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+
+	return footer_flag & (0x1 << COLD_BIT_SHIFT);
+}
+
+/*
+ * Non-zero when the fsync bit is set in the footer flag of the node @page.
+ * The masked value is returned as unsigned char, so this relies on
+ * FSYNC_BIT_SHIFT being below 8.
+ */
+static inline unsigned char is_fsync_dnode(struct page *page)
+{
+	struct f2fs_node *node = page_address(page);
+	unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+
+	return footer_flag & (0x1 << FSYNC_BIT_SHIFT);
+}
+
+/*
+ * Non-zero when the dentry bit is set in the footer flag of the node @page.
+ * The masked value is returned as unsigned char, so this relies on
+ * DENT_BIT_SHIFT being below 8.
+ */
+static inline unsigned char is_dent_dnode(struct page *page)
+{
+	struct f2fs_node *node = page_address(page);
+	unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+
+	return footer_flag & (0x1 << DENT_BIT_SHIFT);
+}
+
+/*
+ * Update the cold bit in the footer of the node @page according to its
+ * owner: node blocks of directories get the bit cleared, those of every
+ * other inode type get it set.
+ */
+static inline void set_cold_node(struct inode *inode, struct page *page)
+{
+	struct f2fs_node *node = page_address(page);
+	unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+
+	if (!S_ISDIR(inode->i_mode))
+		footer_flag |= (0x1 << COLD_BIT_SHIFT);
+	else
+		footer_flag &= ~(0x1 << COLD_BIT_SHIFT);
+
+	node->footer.flag = cpu_to_le32(footer_flag);
+}
+
+/*
+ * Set (non-zero @mark) or clear (@mark == 0) the fsync bit in the footer
+ * flag of the node @page.
+ */
+static inline void set_fsync_mark(struct page *page, int mark)
+{
+	struct f2fs_node *node = page_address(page);
+	unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+
+	if (!mark)
+		footer_flag &= ~(0x1 << FSYNC_BIT_SHIFT);
+	else
+		footer_flag |= (0x1 << FSYNC_BIT_SHIFT);
+
+	node->footer.flag = cpu_to_le32(footer_flag);
+}
+
+/*
+ * Set (non-zero @mark) or clear (@mark == 0) the dentry bit in the footer
+ * flag of the node @page.
+ */
+static inline void set_dentry_mark(struct page *page, int mark)
+{
+	struct f2fs_node *node = page_address(page);
+	unsigned int footer_flag = le32_to_cpu(node->footer.flag);
+
+	if (!mark)
+		footer_flag &= ~(0x1 << DENT_BIT_SHIFT);
+	else
+		footer_flag |= (0x1 << DENT_BIT_SHIFT);
+
+	node->footer.flag = cpu_to_le32(footer_flag);
+}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
new file mode 100644
index 000000000000..b235215ac138
--- /dev/null
+++ b/fs/f2fs/recovery.c
@@ -0,0 +1,375 @@
+/*
+ * fs/f2fs/recovery.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+static struct kmem_cache *fsync_entry_slab;
+
+/*
+ * Tell whether roll-forward recovery still has room: true as long as the
+ * blocks valid at the last checkpoint plus the blocks allocated since do
+ * not exceed the user block count.
+ */
+bool space_for_roll_forward(struct f2fs_sb_info *sbi)
+{
+	return sbi->last_valid_block_count + sbi->alloc_valid_block_count
+						<= sbi->user_block_count;
+}
+
+/*
+ * Look up the fsync inode entry for inode number @ino on the list @head.
+ * Returns the first matching entry, or NULL when there is none.
+ */
+static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
+								nid_t ino)
+{
+	struct fsync_inode_entry *cur;
+
+	list_for_each_entry(cur, head, list) {
+		if (cur->inode->i_ino == ino)
+			return cur;
+	}
+	return NULL;
+}
+
+/*
+ * Make sure the directory entry of @inode exists, based on the inode
+ * block @ipage found during recovery.
+ *
+ * Nothing is done unless the dentry mark is set in the footer of @ipage.
+ * Otherwise the parent directory is fetched via the on-disk i_pino and,
+ * when it has no entry with the on-disk name, a link to @inode is added.
+ *
+ * Returns 0 when nothing had to be done or the entry already exists,
+ * -EINVAL when the parent inode cannot be obtained, otherwise the result
+ * of __f2fs_add_link().
+ */
+static int recover_dentry(struct page *ipage, struct inode *inode)
+{
+ struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
+ struct f2fs_inode *raw_inode = &(raw_node->i);
+ struct qstr name;
+ struct f2fs_dir_entry *de;
+ struct page *page;
+ struct inode *dir;
+ int err = 0;
+
+ if (!is_dent_dnode(ipage))
+ goto out;
+
+ dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
+ if (IS_ERR(dir)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* NOTE(review): i_namelen comes from disk and is not bounded here */
+ name.len = le32_to_cpu(raw_inode->i_namelen);
+ name.name = raw_inode->i_name;
+
+ de = f2fs_find_entry(dir, &name, &page);
+ if (de) {
+ /* entry already present: just release the dentry page */
+ kunmap(page);
+ f2fs_put_page(page, 0);
+ } else {
+ err = __f2fs_add_link(dir, &name, inode);
+ }
+ iput(dir);
+out:
+ /* pairs with the kmap() in the declaration of raw_node */
+ kunmap(ipage);
+ return err;
+}
+
+/*
+ * Refresh the in-memory @inode (mode, size and timestamps) from the
+ * on-disk inode stored in @node_page, then recover its directory entry.
+ * Returns the result of recover_dentry().
+ */
+static int recover_inode(struct inode *inode, struct page *node_page)
+{
+ void *kaddr = page_address(node_page);
+ struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
+ struct f2fs_inode *raw_inode = &(raw_node->i);
+
+ inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+ i_size_write(inode, le64_to_cpu(raw_inode->i_size));
+ /*
+ * NOTE(review): i_atime is loaded from the on-disk i_mtime fields (both
+ * seconds and nanoseconds) rather than from an atime field - confirm
+ * whether this is intentional.
+ */
+ inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+ inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
+ inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+ inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+ inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
+ inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+
+ return recover_dentry(node_page, inode);
+}
+
+/*
+ * Step 1 of roll-forward recovery: walk the chain of node blocks that
+ * starts at the next free block of the warm node segment and collect, on
+ * @head, one fsync_inode_entry per inode that owns an fsync-marked node.
+ *
+ * The walk follows the next_blkaddr stored in each node footer and stops
+ * at the first block that cannot be read or whose checkpoint version
+ * differs from the current one. For every entry, blkaddr records the last
+ * fsync-marked node block seen for that inode.
+ *
+ * Returns 0 when the walk ends normally, or a negative errno (-ENOMEM,
+ * the f2fs_iget() error, or the recover_inode() error) on failure. Entries
+ * already added to @head are left for the caller to release.
+ */
+static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
+{
+	unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
+	struct curseg_info *curseg;
+	struct page *page;
+	block_t blkaddr;
+	int err = 0;
+
+	/* get node pages in the current segment */
+	curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
+	blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff;
+
+	/* read node page */
+	page = alloc_page(GFP_F2FS_ZERO);
+	/* alloc_page() reports failure with NULL, never with an ERR_PTR */
+	if (!page)
+		return -ENOMEM;
+	lock_page(page);
+
+	while (1) {
+		struct fsync_inode_entry *entry;
+
+		if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+			goto out;
+
+		/* a block from another checkpoint ends the chain */
+		if (cp_ver != cpver_of_node(page))
+			goto out;
+
+		if (!is_fsync_dnode(page))
+			goto next;
+
+		entry = get_fsync_inode(head, ino_of_node(page));
+		if (entry) {
+			entry->blkaddr = blkaddr;
+			if (IS_INODE(page) && is_dent_dnode(page))
+				set_inode_flag(F2FS_I(entry->inode),
+							FI_INC_LINK);
+		} else {
+			if (IS_INODE(page) && is_dent_dnode(page)) {
+				if (recover_inode_page(sbi, page)) {
+					err = -ENOMEM;
+					goto out;
+				}
+			}
+
+			/* add this fsync inode to the list */
+			entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
+			if (!entry) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
+			if (IS_ERR(entry->inode)) {
+				err = PTR_ERR(entry->inode);
+				kmem_cache_free(fsync_entry_slab, entry);
+				goto out;
+			}
+
+			list_add_tail(&entry->list, head);
+			entry->blkaddr = blkaddr;
+		}
+		if (IS_INODE(page)) {
+			err = recover_inode(entry->inode, page);
+			if (err)
+				goto out;
+		}
+next:
+		/* check next segment */
+		blkaddr = next_blkaddr_of_node(page);
+		ClearPageUptodate(page);
+	}
+out:
+	unlock_page(page);
+	__free_pages(page, 0);
+	return err;
+}
+
+/*
+ * Release every entry still on the fsync inode list @head: drop the inode
+ * reference, unlink the entry and return it to fsync_entry_slab.
+ * @sbi is not used.
+ */
+static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
+ struct list_head *head)
+{
+ struct fsync_inode_entry *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, head, list) {
+ iput(entry->inode);
+ list_del(&entry->list);
+ kmem_cache_free(fsync_entry_slab, entry);
+ }
+}
+
+/*
+ * If the block at @blkaddr is currently marked valid, find the node that
+ * references it (through the segment summary) and punch that single block
+ * index out of the owning inode with truncate_hole().
+ *
+ * Returns silently when the block is not valid or when the owning inode
+ * cannot be obtained.
+ */
+static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ struct seg_entry *sentry;
+ unsigned int segno = GET_SEGNO(sbi, blkaddr);
+ unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
+ (sbi->blocks_per_seg - 1);
+ struct f2fs_summary sum;
+ nid_t ino;
+ void *kaddr;
+ struct inode *inode;
+ struct page *node_page;
+ block_t bidx;
+ int i;
+
+ sentry = get_seg_entry(sbi, segno);
+ if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
+ return;
+
+ /* Get the previous summary */
+ /*
+ * Prefer the in-memory summary when the segment is a current data
+ * segment.
+ * NOTE(review): the scan starts at CURSEG_WARM_DATA - confirm that
+ * skipping any earlier data curseg type is intended.
+ */
+ for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
+ struct curseg_info *curseg = CURSEG_I(sbi, i);
+ if (curseg->segno == segno) {
+ sum = curseg->sum_blk->entries[blkoff];
+ break;
+ }
+ }
+ /* not a scanned current segment: read the summary block instead */
+ if (i > CURSEG_COLD_DATA) {
+ struct page *sum_page = get_sum_page(sbi, segno);
+ struct f2fs_summary_block *sum_node;
+ kaddr = page_address(sum_page);
+ sum_node = (struct f2fs_summary_block *)kaddr;
+ sum = sum_node->entries[blkoff];
+ f2fs_put_page(sum_page, 1);
+ }
+
+ /* Get the node page */
+ /* NOTE(review): the result of get_node_page() is used unchecked */
+ node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
+ bidx = start_bidx_of_node(ofs_of_node(node_page)) +
+ le16_to_cpu(sum.ofs_in_node);
+ ino = ino_of_node(node_page);
+ f2fs_put_page(node_page, 1);
+
+ /* Deallocate previous index in the node page */
+ inode = f2fs_iget(sbi->sb, ino);
+ if (IS_ERR(inode))
+ return;
+
+ truncate_hole(inode, bidx, bidx + 1);
+ iput(inode);
+}
+
+/*
+ * Replay one fsynced node block during roll-forward recovery.
+ *
+ * @page is the node block read from @blkaddr; @inode is its owner. Every
+ * data block address in @page that differs from the one in the current
+ * node page (and is a real address, i.e. neither NEW_ADDR nor NULL_ADDR)
+ * is adopted: any previous owner of that block is detached, a block is
+ * reserved if the slot was empty, and the data block is recovered at the
+ * new address. Afterwards the footer of @page is copied over and the node
+ * page itself is recovered at @blkaddr.
+ *
+ * Returns silently when the current dnode cannot be obtained.
+ */
+static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
+ struct page *page, block_t blkaddr)
+{
+ unsigned int start, end;
+ struct dnode_of_data dn;
+ struct f2fs_summary sum;
+ struct node_info ni;
+
+ /* range of block indices covered by this node block */
+ start = start_bidx_of_node(ofs_of_node(page));
+ if (IS_INODE(page))
+ end = start + ADDRS_PER_INODE;
+ else
+ end = start + ADDRS_PER_BLOCK;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ if (get_dnode_of_data(&dn, start, 0))
+ return;
+
+ wait_on_page_writeback(dn.node_page);
+
+ /* the current node must describe the same node as the replayed one */
+ get_node_info(sbi, dn.nid, &ni);
+ BUG_ON(ni.ino != ino_of_node(page));
+ BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page));
+
+ for (; start < end; start++) {
+ block_t src, dest;
+
+ /* src: address currently in use, dest: address that was fsynced */
+ src = datablock_addr(dn.node_page, dn.ofs_in_node);
+ dest = datablock_addr(page, dn.ofs_in_node);
+
+ if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) {
+ if (src == NULL_ADDR) {
+ int err = reserve_new_block(&dn);
+ /* We should not get -ENOSPC */
+ BUG_ON(err);
+ }
+
+ /* Check the previous node page having this index */
+ check_index_in_prev_nodes(sbi, dest);
+
+ set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+
+ /* write dummy data page */
+ recover_data_page(sbi, NULL, &sum, src, dest);
+ update_extent_cache(dest, &dn);
+ }
+ dn.ofs_in_node++;
+ }
+
+ /* write node page in place */
+ set_summary(&sum, dn.nid, 0, 0);
+ if (IS_INODE(dn.node_page))
+ sync_inode_page(&dn);
+
+ /* take over the replayed footer, then re-stamp nid/ino/offset */
+ copy_node_footer(dn.node_page, page);
+ fill_node_footer(dn.node_page, dn.nid, ni.ino,
+ ofs_of_node(page), false);
+ set_page_dirty(dn.node_page);
+
+ recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
+ f2fs_put_dnode(&dn);
+}
+
+/*
+ * Step 2 of roll-forward recovery: walk the node chain of the current
+ * segment of @type again and replay, via do_recover_data(), every node
+ * block whose owner is on the fsync inode list @head.
+ *
+ * An entry is released (inode reference dropped, entry freed) once the
+ * block recorded in entry->blkaddr, the last fsynced node of that inode,
+ * has been replayed. The walk stops at the first unreadable block or at a
+ * block carrying a different checkpoint version; new segments are then
+ * allocated.
+ *
+ * If the scratch page cannot be allocated the function returns without
+ * doing anything, leaving @head untouched.
+ */
+static void recover_data(struct f2fs_sb_info *sbi,
+				struct list_head *head, int type)
+{
+	unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
+	struct curseg_info *curseg;
+	struct page *page;
+	block_t blkaddr;
+
+	/* get node pages in the current segment */
+	curseg = CURSEG_I(sbi, type);
+	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+
+	/* read node page */
+	page = alloc_page(GFP_NOFS | __GFP_ZERO);
+	/* alloc_page() reports failure with NULL, never with an ERR_PTR */
+	if (!page)
+		return;
+	lock_page(page);
+
+	while (1) {
+		struct fsync_inode_entry *entry;
+
+		if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+			goto out;
+
+		/* a block from another checkpoint ends the chain */
+		if (cp_ver != cpver_of_node(page))
+			goto out;
+
+		entry = get_fsync_inode(head, ino_of_node(page));
+		if (!entry)
+			goto next;
+
+		do_recover_data(sbi, entry->inode, page, blkaddr);
+
+		/* last fsynced node of this inode: the entry is done */
+		if (entry->blkaddr == blkaddr) {
+			iput(entry->inode);
+			list_del(&entry->list);
+			kmem_cache_free(fsync_entry_slab, entry);
+		}
+next:
+		/* check next segment */
+		blkaddr = next_blkaddr_of_node(page);
+		ClearPageUptodate(page);
+	}
+out:
+	unlock_page(page);
+	__free_pages(page, 0);
+
+	allocate_new_segments(sbi);
+}
+
+/*
+ * Entry point of roll-forward recovery.
+ *
+ * Creates a temporary slab cache for fsync inode entries, collects the
+ * fsynced inodes (step 1) and, if any were found, replays their node and
+ * data blocks (step 2) with sbi->por_doing set. The list is expected to be
+ * empty after step 2. A checkpoint is written at the end on every path
+ * except when the slab cache cannot be created.
+ */
+void recover_fsync_data(struct f2fs_sb_info *sbi)
+{
+ struct list_head inode_list;
+
+ fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
+ sizeof(struct fsync_inode_entry), NULL);
+ if (unlikely(!fsync_entry_slab))
+ return;
+
+ INIT_LIST_HEAD(&inode_list);
+
+ /* step #1: find fsynced inode numbers */
+ if (find_fsync_dnodes(sbi, &inode_list))
+ goto out;
+
+ if (list_empty(&inode_list))
+ goto out;
+
+ /* step #2: recover data */
+ sbi->por_doing = 1;
+ recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+ sbi->por_doing = 0;
+ BUG_ON(!list_empty(&inode_list));
+out:
+ /* releases whatever step 1 left behind when it failed part-way */
+ destroy_fsync_dnodes(sbi, &inode_list);
+ kmem_cache_destroy(fsync_entry_slab);
+ write_checkpoint(sbi, false);
+}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
new file mode 100644
index 000000000000..777f17e496e6
--- /dev/null
+++ b/fs/f2fs/segment.c
@@ -0,0 +1,1770 @@
+/*
+ * fs/f2fs/segment.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/prefetch.h>
+#include <linux/vmalloc.h>
+
+#include "f2fs.h"
+#include "segment.h"
+#include "node.h"
+
+/*
+ * This function balances dirty node and dentry pages.
+ * In addition, it controls garbage collection.
+ */
+/*
+ * Trigger garbage collection when has_not_enough_free_secs() reports a
+ * shortage of free sections.
+ */
+void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+{
+ /*
+ * We should do GC or end up with checkpoint, if there are so many dirty
+ * dir/node pages without enough free segments.
+ */
+ if (has_not_enough_free_secs(sbi, 0)) {
+ /*
+ * NOTE(review): gc_mutex is taken here but not released in this
+ * function; presumably f2fs_gc() drops it - confirm.
+ */
+ mutex_lock(&sbi->gc_mutex);
+ f2fs_gc(sbi);
+ }
+}
+
+/*
+ * Mark @segno in the dirty segment map of @dirty_type and bump the
+ * matching counter if the bit was not set before. For the generic DIRTY
+ * type the segment is additionally recorded in the map indexed by the
+ * segment's own type (sentry->type). Current segments are never added.
+ */
+static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
+ enum dirty_type dirty_type)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+ /* need not be added */
+ if (IS_CURSEG(sbi, segno))
+ return;
+
+ if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+ dirty_i->nr_dirty[dirty_type]++;
+
+ if (dirty_type == DIRTY) {
+ struct seg_entry *sentry = get_seg_entry(sbi, segno);
+ /* second bookkeeping pass, keyed by the segment's type */
+ dirty_type = sentry->type;
+ if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+ dirty_i->nr_dirty[dirty_type]++;
+ }
+}
+
+/*
+ * Counterpart of __locate_dirty_segment(): clear @segno in the dirty
+ * segment map of @dirty_type and decrement the counter if the bit was set.
+ * For DIRTY the per-type map is cleared as well, and the segment is
+ * removed from both the foreground and background GC victim maps.
+ */
+static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
+ enum dirty_type dirty_type)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+ if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+ dirty_i->nr_dirty[dirty_type]--;
+
+ if (dirty_type == DIRTY) {
+ struct seg_entry *sentry = get_seg_entry(sbi, segno);
+ dirty_type = sentry->type;
+ if (test_and_clear_bit(segno,
+ dirty_i->dirty_segmap[dirty_type]))
+ dirty_i->nr_dirty[dirty_type]--;
+ clear_bit(segno, dirty_i->victim_segmap[FG_GC]);
+ clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
+ }
+}
+
+/*
+ * This must not fail with an error such as -ENOMEM.
+ * Adding a dirty entry to the seglist is not a critical operation.
+ * If the given segment is one of the current working segments, it is not added.
+ */
+void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned short nr_valid;
+
+	if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
+		return;
+
+	mutex_lock(&dirty_i->seglist_lock);
+
+	nr_valid = get_valid_blocks(sbi, segno, 0);
+
+	if (nr_valid > 0 && nr_valid < sbi->blocks_per_seg) {
+		/* partially valid segment: it is dirty */
+		__locate_dirty_segment(sbi, segno, DIRTY);
+	} else {
+		/* no valid block left: the segment becomes prefree */
+		if (nr_valid == 0)
+			__locate_dirty_segment(sbi, segno, PRE);
+		/*
+		 * Empty or completely valid segments leave the dirty list;
+		 * for the full case, recovery routine with SSR needs this.
+		 */
+		__remove_dirty_segment(sbi, segno, DIRTY);
+	}
+
+	mutex_unlock(&dirty_i->seglist_lock);
+}
+
+/*
+ * Should call clear_prefree_segments after checkpoint is done.
+ */
+static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned int nr_segs = TOTAL_SEGS(sbi);
+	unsigned int segno;
+
+	mutex_lock(&dirty_i->seglist_lock);
+	/* visit every segment whose bit is set in the prefree map */
+	for (segno = find_next_bit(dirty_i->dirty_segmap[PRE], nr_segs, 0);
+	     segno < nr_segs;
+	     segno = find_next_bit(dirty_i->dirty_segmap[PRE], nr_segs,
+							segno + 1))
+		__set_test_and_free(sbi, segno);
+	mutex_unlock(&dirty_i->seglist_lock);
+}
+
+/*
+ * Clear every bit of the prefree segment map, decrementing the PRE
+ * counter for each one, and - when the DISCARD mount option is set -
+ * issue a discard covering the whole segment. Runs under seglist_lock.
+ */
+void clear_prefree_segments(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned int segno, offset = 0;
+ unsigned int total_segs = TOTAL_SEGS(sbi);
+
+ mutex_lock(&dirty_i->seglist_lock);
+ while (1) {
+ segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
+ offset);
+ if (segno >= total_segs)
+ break;
+
+ offset = segno + 1;
+ if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
+ dirty_i->nr_dirty[PRE]--;
+
+ /* Let's use trim */
+ /*
+ * Start sector and sector count are derived from the segment's
+ * first block; the result of the discard is ignored.
+ */
+ if (test_opt(sbi, DISCARD))
+ blkdev_issue_discard(sbi->sb->s_bdev,
+ START_BLOCK(sbi, segno) <<
+ sbi->log_sectors_per_block,
+ 1 << (sbi->log_sectors_per_block +
+ sbi->log_blocks_per_seg),
+ GFP_NOFS, 0);
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
+}
+
+/*
+ * Flag the SIT entry of @segno as dirty; the dirty entry counter is only
+ * incremented when the bit was not already set.
+ */
+static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap))
+ sit_i->dirty_sentries++;
+}
+
+/*
+ * Record @type in the segment entry of @segno; when @modified is non-zero
+ * the SIT entry is also marked dirty.
+ */
+static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
+ unsigned int segno, int modified)
+{
+ struct seg_entry *se = get_seg_entry(sbi, segno);
+ se->type = type;
+ if (modified)
+ __mark_sit_entry_dirty(sbi, segno);
+}
+
+/*
+ * Account for one block at @blkaddr becoming valid (@del > 0) or invalid
+ * (otherwise) in the segment information table.
+ *
+ * Updates the segment's valid block count, modification time and current
+ * validity bitmap, marks the SIT entry dirty and adjusts the global (and,
+ * with multi-segment sections, the per-section) valid block counters.
+ * BUGs when the new count is out of range or when the bitmap bit is
+ * already in the requested state.
+ */
+static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
+{
+ struct seg_entry *se;
+ unsigned int segno, offset;
+ long int new_vblocks;
+
+ segno = GET_SEGNO(sbi, blkaddr);
+
+ se = get_seg_entry(sbi, segno);
+ new_vblocks = se->valid_blocks + del;
+ /* block offset inside its segment */
+ offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1);
+
+ /* the count must fit in 16 bits and not exceed the segment size */
+ BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) ||
+ (new_vblocks > sbi->blocks_per_seg)));
+
+ se->valid_blocks = new_vblocks;
+ se->mtime = get_mtime(sbi);
+ SIT_I(sbi)->max_mtime = se->mtime;
+
+ /* Update valid block bitmap */
+ if (del > 0) {
+ if (f2fs_set_bit(offset, se->cur_valid_map))
+ BUG();
+ } else {
+ if (!f2fs_clear_bit(offset, se->cur_valid_map))
+ BUG();
+ }
+ /* blocks valid at the last checkpoint stay counted for it */
+ if (!f2fs_test_bit(offset, se->ckpt_valid_map))
+ se->ckpt_valid_blocks += del;
+
+ __mark_sit_entry_dirty(sbi, segno);
+
+ /* update total number of valid blocks to be written in ckpt area */
+ SIT_I(sbi)->written_valid_blocks += del;
+
+ if (sbi->segs_per_sec > 1)
+ get_sec_entry(sbi, segno)->valid_blocks += del;
+}
+
+/*
+ * Account for a block moving from @old_blkaddr to @new_blkaddr: the new
+ * address becomes valid and, provided the old address maps to a real
+ * segment, the old one becomes invalid.
+ */
+static void refresh_sit_entry(struct f2fs_sb_info *sbi,
+ block_t old_blkaddr, block_t new_blkaddr)
+{
+ update_sit_entry(sbi, new_blkaddr, 1);
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ update_sit_entry(sbi, old_blkaddr, -1);
+}
+
+/*
+ * Invalidate the block at @addr: drop it from the SIT and put its segment
+ * on the appropriate dirty list, all under sentry_lock.
+ * @addr must not be NULL_ADDR; NEW_ADDR is silently ignored.
+ */
+void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
+{
+ unsigned int segno = GET_SEGNO(sbi, addr);
+ struct sit_info *sit_i = SIT_I(sbi);
+
+ BUG_ON(addr == NULL_ADDR);
+ if (addr == NEW_ADDR)
+ return;
+
+ /* add it into sit main buffer */
+ mutex_lock(&sit_i->sentry_lock);
+
+ update_sit_entry(sbi, addr, -1);
+
+ /* add it into dirty seglist */
+ locate_dirty_segment(sbi, segno);
+
+ mutex_unlock(&sit_i->sentry_lock);
+}
+
+/*
+ * This function must be called with the curseg_mutex lock held
+ */
+static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
+			struct f2fs_summary *sum, unsigned short offset)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	/* slot @offset of the summary block of the current segment */
+	void *slot = (void *)curseg->sum_blk +
+				offset * sizeof(struct f2fs_summary);
+
+	memcpy(slot, sum, sizeof(struct f2fs_summary));
+}
+
+/*
+ * Calculate the number of current summary pages for writing
+ */
+int npages_for_summary_flush(struct f2fs_sb_info *sbi)
+{
+	int nr_entries = 0;
+	int nr_bytes;
+	int page_room;
+	int type;
+
+	/* count the summary entries of all current data segments */
+	for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_DATA; type++) {
+		if (sbi->ckpt->alloc_type[type] == SSR)
+			nr_entries += sbi->blocks_per_seg;
+		else
+			nr_entries += curseg_blkoff(sbi, type);
+	}
+
+	/* entries plus both journals, each followed by two extra bytes */
+	nr_bytes = nr_entries * (SUMMARY_SIZE + 1)
+			+ sizeof(struct nat_journal) + 2
+			+ sizeof(struct sit_journal) + 2;
+	page_room = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE;
+
+	if (nr_bytes < page_room)
+		return 1;
+	if (nr_bytes < 2 * page_room)
+		return 2;
+	return 3;
+}
+
+/*
+ * Caller should put this summary page
+ */
+struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ /* the summary block of @segno is fetched as a meta page */
+ return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno));
+}
+
+/*
+ * Copy the summary block @sum_blk (one page worth of data) into the meta
+ * page at @blk_addr, mark that page dirty and release it.
+ */
+static void write_sum_page(struct f2fs_sb_info *sbi,
+			struct f2fs_summary_block *sum_blk, block_t blk_addr)
+{
+	struct page *meta_page = grab_meta_page(sbi, blk_addr);
+
+	memcpy(page_address(meta_page), sum_blk, PAGE_CACHE_SIZE);
+	set_page_dirty(meta_page);
+	f2fs_put_page(meta_page, 1);
+}
+
+/*
+ * Look for a section made up entirely of prefree segments that can be
+ * reused for a current segment of @type.
+ *
+ * @ofs_unit is the number of segments per section. A candidate must start
+ * on a section boundary, have all @ofs_unit segments marked prefree, and
+ * none of those segments may have held valid blocks at the last
+ * checkpoint. Returns the first segment number of such a section, or
+ * NULL_SEGNO when there is none, when free sections are short, or when
+ * @type is a node segment type.
+ */
+static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
+					int ofs_unit, int type)
+{
+	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+	unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
+	unsigned int segno, next_segno, i;
+	int ofs = 0;
+
+	/*
+	 * If there is not enough reserved sections,
+	 * we should not reuse prefree segments.
+	 */
+	if (has_not_enough_free_secs(sbi, 0))
+		return NULL_SEGNO;
+
+	/*
+	 * NODE page should not reuse prefree segment,
+	 * since that information is used for SPOR.
+	 */
+	if (IS_NODESEG(type))
+		return NULL_SEGNO;
+next:
+	segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs);
+	/* the next search resumes at the following section boundary */
+	ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit;
+	if (segno < TOTAL_SEGS(sbi)) {
+		/* skip intermediate segments in a section */
+		if (segno % ofs_unit)
+			goto next;
+
+		/* skip if whole section is not prefree */
+		next_segno = find_next_zero_bit(prefree_segmap,
+						TOTAL_SEGS(sbi), segno + 1);
+		if (next_segno - segno < ofs_unit)
+			goto next;
+
+		/*
+		 * skip if whole section was not free at the last checkpoint;
+		 * every segment of the section has to be examined, not only
+		 * the first one
+		 */
+		for (i = 0; i < ofs_unit; i++)
+			if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks)
+				goto next;
+		return segno;
+	}
+	return NULL_SEGNO;
+}
+
+/*
+ * Find a new segment from the free segment bitmap in the right order.
+ * This function must succeed; otherwise it hits BUG.
+ */
+/*
+ * Pick a free segment, mark it in use and return it through @newseg.
+ *
+ * @newseg:  in: the segment currently in use (search hint),
+ *           out: the newly allocated segment
+ * @new_sec: when false, first try the next free segment after *newseg,
+ *           unless *newseg is the last segment of its section
+ * @dir:     ALLOC_RIGHT searches free sections upwards from the hint and
+ *           wraps to the start; otherwise, once nothing is free at or
+ *           above the hint, the search walks downwards from it
+ *
+ * When a section in a different zone is selected and that zone is
+ * already used by a current segment, one more search is made from the
+ * neighbouring zone; its result is accepted unconditionally.
+ * The whole search runs under segmap_lock held for writing.
+ */
+static void get_new_segment(struct f2fs_sb_info *sbi,
+ unsigned int *newseg, bool new_sec, int dir)
+{
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int total_secs = sbi->total_sections;
+ unsigned int segno, secno, zoneno;
+ unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone;
+ unsigned int hint = *newseg / sbi->segs_per_sec;
+ unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
+ unsigned int left_start = hint;
+ bool init = true;
+ int go_left = 0;
+ int i;
+
+ write_lock(&free_i->segmap_lock);
+
+ /* stay in the current section if it still has a free segment */
+ if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
+ segno = find_next_zero_bit(free_i->free_segmap,
+ TOTAL_SEGS(sbi), *newseg + 1);
+ if (segno < TOTAL_SEGS(sbi))
+ goto got_it;
+ }
+find_other_zone:
+ secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint);
+ if (secno >= total_secs) {
+ if (dir == ALLOC_RIGHT) {
+ /* wrap around to the first free section */
+ secno = find_next_zero_bit(free_i->free_secmap,
+ total_secs, 0);
+ BUG_ON(secno >= total_secs);
+ } else {
+ go_left = 1;
+ left_start = hint - 1;
+ }
+ }
+ if (go_left == 0)
+ goto skip_left;
+
+ /* walk downwards until a free section is found */
+ while (test_bit(left_start, free_i->free_secmap)) {
+ if (left_start > 0) {
+ left_start--;
+ continue;
+ }
+ /* reached section 0: fall back to a search from the start */
+ left_start = find_next_zero_bit(free_i->free_secmap,
+ total_secs, 0);
+ BUG_ON(left_start >= total_secs);
+ break;
+ }
+ secno = left_start;
+skip_left:
+ hint = secno;
+ segno = secno * sbi->segs_per_sec;
+ zoneno = secno / sbi->secs_per_zone;
+
+ /* give up on finding another zone */
+ if (!init)
+ goto got_it;
+ if (sbi->secs_per_zone == 1)
+ goto got_it;
+ if (zoneno == old_zoneno)
+ goto got_it;
+ if (dir == ALLOC_LEFT) {
+ if (!go_left && zoneno + 1 >= total_zones)
+ goto got_it;
+ if (go_left && zoneno == 0)
+ goto got_it;
+ }
+ /* is the chosen zone already used by one of the current segments? */
+ for (i = 0; i < NR_CURSEG_TYPE; i++)
+ if (CURSEG_I(sbi, i)->zone == zoneno)
+ break;
+
+ if (i < NR_CURSEG_TYPE) {
+ /* zone is in use, try another */
+ if (go_left)
+ hint = zoneno * sbi->secs_per_zone - 1;
+ else if (zoneno + 1 >= total_zones)
+ hint = 0;
+ else
+ hint = (zoneno + 1) * sbi->secs_per_zone;
+ /* only one retry: the next result is taken as is */
+ init = false;
+ goto find_other_zone;
+ }
+got_it:
+ /* set it as dirty segment in free segmap */
+ BUG_ON(test_bit(segno, free_i->free_segmap));
+ __set_inuse(sbi, segno);
+ *newseg = segno;
+ write_unlock(&free_i->segmap_lock);
+}
+
+/*
+ * Switch the current segment of @type over to curseg->next_segno: reset
+ * the block offset, recompute the zone, clear the summary footer and set
+ * its type (data or node), and record @type in the SIT entry of the new
+ * segment (marking the entry dirty when @modified is non-zero).
+ */
+static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ struct summary_footer *sum_footer;
+
+ curseg->segno = curseg->next_segno;
+ curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
+ curseg->next_blkoff = 0;
+ curseg->next_segno = NULL_SEGNO;
+
+ sum_footer = &(curseg->sum_blk->footer);
+ memset(sum_footer, 0, sizeof(struct summary_footer));
+ if (IS_DATASEG(type))
+ SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
+ if (IS_NODESEG(type))
+ SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
+ __set_sit_entry_type(sbi, type, curseg->segno, modified);
+}
+
+/*
+ * Allocate a current working segment.
+ * The replacement always comes from get_new_segment() and the log is marked
+ * as LFS afterwards.
+ */
+static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
+{
+	struct curseg_info *cs = CURSEG_I(sbi, type);
+	unsigned int new_segno = cs->segno;
+	int dir;
+
+	/* flush the summary block of the segment being left */
+	write_sum_page(sbi, cs->sum_blk, GET_SUM_BLOCK(sbi, cs->segno));
+
+	/* warm/cold data logs and NOHEAP mounts use ALLOC_RIGHT */
+	if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA ||
+			test_opt(sbi, NOHEAP))
+		dir = ALLOC_RIGHT;
+	else
+		dir = ALLOC_LEFT;
+
+	/* the search is seeded with the current segment number */
+	get_new_segment(sbi, &new_segno, new_sec, dir);
+	cs->next_segno = new_segno;
+	reset_curseg(sbi, type, 1);
+	cs->alloc_type = LFS;
+}
+
+/*
+ * Starting at @start, find the first block offset whose bit is clear in both
+ * the checkpointed and the current valid maps of seg's segment, and store it
+ * in seg->next_blkoff. The scan stops at sbi->blocks_per_seg.
+ */
+static void __next_free_blkoff(struct f2fs_sb_info *sbi,
+			struct curseg_info *seg, block_t start)
+{
+	struct seg_entry *entry = get_seg_entry(sbi, seg->segno);
+	block_t pos = start;
+
+	/* skip every offset that is marked in either map */
+	while (pos < sbi->blocks_per_seg &&
+			(f2fs_test_bit(pos, entry->ckpt_valid_map) ||
+			 f2fs_test_bit(pos, entry->cur_valid_map)))
+		pos++;
+
+	seg->next_blkoff = pos;
+}
+
+/*
+ * Advance seg->next_blkoff past the block that was just used.
+ * An LFS-written segment simply moves to the following offset; an SSR-written
+ * segment has to search for the next free slot via __next_free_blkoff().
+ */
+static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
+				struct curseg_info *seg)
+{
+	if (seg->alloc_type != SSR) {
+		seg->next_blkoff++;
+		return;
+	}
+
+	__next_free_blkoff(sbi, seg, seg->next_blkoff + 1);
+}
+
+/*
+ * This function always allocates a used segment (from dirty seglist) by SSR
+ * manner, so it should recover the existing segment information of valid blocks
+ *
+ * The target segment is taken from curseg->next_segno, which the caller must
+ * have set beforehand. When @reuse is true, the summary entries obtained
+ * through get_sum_page() for that segment are copied into the in-memory
+ * summary block of the log.
+ */
+static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned int new_segno = curseg->next_segno;
+ struct f2fs_summary_block *sum_node;
+ struct page *sum_page;
+
+ /* write out the summary block of the segment we are leaving */
+ write_sum_page(sbi, curseg->sum_blk,
+ GET_SUM_BLOCK(sbi, curseg->segno));
+ __set_test_and_inuse(sbi, new_segno);
+
+ /* the segment becomes a current one: drop it from the PRE and DIRTY lists */
+ mutex_lock(&dirty_i->seglist_lock);
+ __remove_dirty_segment(sbi, new_segno, PRE);
+ __remove_dirty_segment(sbi, new_segno, DIRTY);
+ mutex_unlock(&dirty_i->seglist_lock);
+
+ /* reset_curseg() consumes next_segno; then look for the first free offset */
+ reset_curseg(sbi, type, 1);
+ curseg->alloc_type = SSR;
+ __next_free_blkoff(sbi, curseg, 0);
+
+ if (reuse) {
+ /* only the entry area (SUM_ENTRY_SIZE bytes) is copied */
+ sum_page = get_sum_page(sbi, new_segno);
+ sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+ memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
+ f2fs_put_page(sum_page, 1);
+ }
+}
+
+/*
+ * Ask the victim-selection ops for an SSR segment for the @type log; the
+ * chosen segment number is stored in that log's next_segno.
+ * Returns 1 from the data-log retry loop when a victim was found and 0 when
+ * none was; on the single-try path the get_victim() result is returned as is.
+ */
+static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
+	int t;
+
+	/* node logs, or enough free sections: a single try with the own type */
+	if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0))
+		return v_ops->get_victim(sbi, &curseg->next_segno,
+						BG_GC, type, SSR);
+
+	/* For data segments, let's do SSR more intensively */
+	for (t = type; t >= CURSEG_HOT_DATA; t--) {
+		if (v_ops->get_victim(sbi, &curseg->next_segno,
+						BG_GC, t, SSR))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * flush out current segment and replace it with new segment
+ * This function should be returned with success, otherwise BUG
+ *
+ * @type: index of the current-segment log to replace.
+ * @force: when true, skip every reuse attempt and take a fresh segment via
+ * new_curseg() with new_sec set.
+ *
+ * The per-alloc_type counter sbi->segment_count[] is bumped on every path.
+ */
+static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
+ int type, bool force)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned int ofs_unit;
+
+ if (force) {
+ new_curseg(sbi, type, true);
+ goto out;
+ }
+
+ /* ofs_unit is 1 under need_SSR(), otherwise segs_per_sec */
+ ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec;
+ curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type);
+
+ /* 1) a prefree segment was found: switch to it without loading summaries */
+ if (curseg->next_segno != NULL_SEGNO)
+ change_curseg(sbi, type, false);
+ /* 2) the warm node log always takes a new segment */
+ else if (type == CURSEG_WARM_NODE)
+ new_curseg(sbi, type, false);
+ /* 3) SSR is needed and a victim exists: reuse it, loading its summaries */
+ else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
+ change_curseg(sbi, type, true);
+ /* 4) fall back to a new segment */
+ else
+ new_curseg(sbi, type, false);
+out:
+ /* alloc_type was set to LFS or SSR by the helper that ran above */
+ sbi->segment_count[curseg->alloc_type]++;
+}
+
+/*
+ * Force every data log (CURSEG_HOT_DATA .. CURSEG_COLD_DATA) onto a newly
+ * allocated segment, then let locate_dirty_segment() re-evaluate the segment
+ * each log has just left.
+ */
+void allocate_new_segments(struct f2fs_sb_info *sbi)
+{
+	int type = CURSEG_HOT_DATA;
+
+	while (type <= CURSEG_COLD_DATA) {
+		/* remember the segment we are about to leave */
+		unsigned int prev_segno = CURSEG_I(sbi, type)->segno;
+
+		SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
+		locate_dirty_segment(sbi, prev_segno);
+		type++;
+	}
+}
+
+/*
+ * Segment allocation operations; build_sit_info() installs this table as
+ * sit_info->s_ops, so ->allocate_segment() calls end up in
+ * allocate_segment_by_default().
+ */
+static const struct segment_allocation default_salloc_ops = {
+ .allocate_segment = allocate_segment_by_default,
+};
+
+/*
+ * Write-completion callback installed on bios by do_submit_bio().
+ * Walks the bio's vectors from the last one down to the first, ends writeback
+ * on every page and decrements the F2FS_WRITEBACK page count. If the bio is
+ * not up to date, each page and its mapping are flagged with an error,
+ * CP_ERROR_FLAG is set on the checkpoint and the superblock is marked
+ * MS_RDONLY. Finally a synchronous submitter is completed (if any) and the
+ * private data and the bio reference are released.
+ */
+static void f2fs_end_io_write(struct bio *bio, int err)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ /* start at the last vector of the bio */
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_private *p = bio->bi_private;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ /* step to the previous vector and prefetch its page flags if it exists */
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+ if (!uptodate) {
+ SetPageError(page);
+ if (page->mapping)
+ set_bit(AS_EIO, &page->mapping->flags);
+ set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
+ p->sbi->sb->s_flags |= MS_RDONLY;
+ }
+ end_page_writeback(page);
+ dec_page_count(p->sbi, F2FS_WRITEBACK);
+ } while (bvec >= bio->bi_io_vec);
+
+ /* wake the waiter set up by do_submit_bio() for synchronous submissions */
+ if (p->is_sync)
+ complete(p->wait);
+ kfree(p);
+ bio_put(bio);
+}
+
+/*
+ * Allocate a bio with room for @npages vectors, aimed at @bdev, with a
+ * bio_private attached as bi_private.
+ * The private data allocation is retried (yielding the CPU in between) until
+ * it succeeds, so this function does not report failure.
+ */
+struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
+{
+	struct bio_private *priv;
+	struct bio *bio;
+
+	/* keep trying until the private data is available */
+	for (;;) {
+		priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
+		if (priv)
+			break;
+		cond_resched();
+	}
+
+	/* No failure on bio allocation */
+	bio = bio_alloc(GFP_NOIO, npages);
+	bio->bi_private = priv;
+	bio->bi_bdev = bdev;
+	return bio;
+}
+
+/*
+ * Submit the pending bio of @type, if there is one, and clear the slot.
+ * Types above META share the META slot. META_FLUSH (and anything above it)
+ * is written with WRITE_FLUSH_FUA; a META_FLUSH submission additionally waits
+ * for the I/O to complete before returning.
+ * Both callers in this file hold sbi->bio_sem for writing around the call.
+ */
+static void do_submit_bio(struct f2fs_sb_info *sbi,
+				enum page_type type, bool sync)
+{
+	enum page_type btype = type > META ? META : type;
+	int rw = sync ? WRITE_SYNC : WRITE;
+	struct bio_private *priv;
+	struct bio *bio;
+
+	if (type >= META_FLUSH)
+		rw = WRITE_FLUSH_FUA;
+
+	bio = sbi->bio[btype];
+	if (!bio)
+		return;
+
+	/* the completion handler is attached only now, at submission time */
+	priv = bio->bi_private;
+	priv->sbi = sbi;
+	bio->bi_end_io = f2fs_end_io_write;
+
+	if (type == META_FLUSH) {
+		DECLARE_COMPLETION_ONSTACK(wait);
+
+		priv->is_sync = true;
+		priv->wait = &wait;
+		submit_bio(rw, bio);
+		wait_for_completion(&wait);
+	} else {
+		priv->is_sync = false;
+		submit_bio(rw, bio);
+	}
+
+	sbi->bio[btype] = NULL;
+}
+
+/*
+ * Locked wrapper around do_submit_bio(): submits the pending bio of @type
+ * (if any) while holding sbi->bio_sem for writing.
+ */
+void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync)
+{
+ down_write(&sbi->bio_sem);
+ do_submit_bio(sbi, type, sync);
+ up_write(&sbi->bio_sem);
+}
+
+/*
+ * Queue @page for writing at block address @blk_addr on the pending bio of
+ * @type, under sbi->bio_sem. The pending bio is submitted first when
+ * @blk_addr does not directly follow the last block queued on it, and also
+ * when bio_add_page() cannot take the whole page; in both cases a new bio is
+ * allocated and the page is added to that one.
+ * The F2FS_WRITEBACK page count is incremented here; f2fs_end_io_write()
+ * decrements it again.
+ */
+static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
+ block_t blk_addr, enum page_type type)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+
+ verify_block_addr(sbi, blk_addr);
+
+ down_write(&sbi->bio_sem);
+
+ inc_page_count(sbi, F2FS_WRITEBACK);
+
+ /* not contiguous with the pending bio: flush that one first */
+ if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
+ do_submit_bio(sbi, type, false);
+alloc_new:
+ if (sbi->bio[type] == NULL) {
+ sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev));
+ sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ /*
+ * The end_io will be assigned at the submission phase.
+ * Until then, let bio_add_page() merge consecutive IOs as much
+ * as possible.
+ */
+ }
+
+ /* the bio could not take the full page: submit it and retry on a new one */
+ if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ do_submit_bio(sbi, type, false);
+ goto alloc_new;
+ }
+
+ sbi->last_block_in_bio[type] = blk_addr;
+
+ up_write(&sbi->bio_sem);
+}
+
+/* True while the @type log's next block offset is still inside its segment. */
+static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
+{
+	return CURSEG_I(sbi, type)->next_blkoff < sbi->blocks_per_seg;
+}
+
+/* Two active logs: all data goes to HOT_DATA, everything else to HOT_NODE. */
+static int __get_segment_type_2(struct page *page, enum page_type p_type)
+{
+	return (p_type == DATA) ? CURSEG_HOT_DATA : CURSEG_HOT_NODE;
+}
+
+/*
+ * Four active logs.
+ * Data: directory pages are hot, all other data is cold.
+ * Node: direct nodes that are not marked cold are hot, the rest is cold.
+ */
+static int __get_segment_type_4(struct page *page, enum page_type p_type)
+{
+	struct inode *inode;
+
+	if (p_type != DATA) {
+		if (IS_DNODE(page) && !is_cold_node(page))
+			return CURSEG_HOT_NODE;
+		return CURSEG_COLD_NODE;
+	}
+
+	inode = page->mapping->host;
+	return S_ISDIR(inode->i_mode) ? CURSEG_HOT_DATA : CURSEG_COLD_DATA;
+}
+
+/*
+ * Six active logs.
+ * Data: directory pages are hot; cold pages or pages of cold files are cold;
+ * anything else is warm.
+ * Node: direct nodes are warm when marked cold and hot otherwise; all other
+ * nodes are cold.
+ */
+static int __get_segment_type_6(struct page *page, enum page_type p_type)
+{
+	struct inode *inode;
+
+	if (p_type != DATA) {
+		if (!IS_DNODE(page))
+			return CURSEG_COLD_NODE;
+		if (is_cold_node(page))
+			return CURSEG_WARM_NODE;
+		return CURSEG_HOT_NODE;
+	}
+
+	inode = page->mapping->host;
+	if (S_ISDIR(inode->i_mode))
+		return CURSEG_HOT_DATA;
+	if (is_cold_data(page) || is_cold_file(inode))
+		return CURSEG_COLD_DATA;
+	return CURSEG_WARM_DATA;
+}
+
+/*
+ * Pick the current-segment log for @page according to the number of active
+ * logs configured on the filesystem that owns the page.
+ */
+static int __get_segment_type(struct page *page, enum page_type p_type)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+
+	switch (sbi->active_logs) {
+	case 2:
+		return __get_segment_type_2(page, p_type);
+	case 4:
+		return __get_segment_type_4(page, p_type);
+	default:
+		break;
+	}
+
+	/* NR_CURSEG_TYPE(6) logs by default */
+	BUG_ON(sbi->active_logs != NR_CURSEG_TYPE);
+	return __get_segment_type_6(page, p_type);
+}
+
+/*
+ * Allocate the next free block of the log chosen for @page, record @sum for
+ * it, update the SIT for the old and new addresses and queue the page for
+ * writing.
+ *
+ * @old_blkaddr: previous address of the block, passed to refresh_sit_entry().
+ * @new_blkaddr: output; receives the newly assigned block address.
+ * @p_type: DATA or NODE; for NODE pages the footer additionally gets the
+ * address of the log's next free block.
+ *
+ * Lock order: curseg_mutex of the chosen log, then sit_i->sentry_lock.
+ */
+static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
+ block_t old_blkaddr, block_t *new_blkaddr,
+ struct f2fs_summary *sum, enum page_type p_type)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct curseg_info *curseg;
+ unsigned int old_cursegno;
+ int type;
+
+ type = __get_segment_type(page, p_type);
+ curseg = CURSEG_I(sbi, type);
+
+ mutex_lock(&curseg->curseg_mutex);
+
+ *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+ /* remembered because the log may move to another segment below */
+ old_cursegno = curseg->segno;
+
+ /*
+ * __add_sum_entry should be resided under the curseg_mutex
+ * because, this function updates a summary entry in the
+ * current summary block.
+ */
+ __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+
+ mutex_lock(&sit_i->sentry_lock);
+ __refresh_next_blkoff(sbi, curseg);
+ sbi->block_count[curseg->alloc_type]++;
+
+ /*
+ * SIT information should be updated before segment allocation,
+ * since SSR needs latest valid block information.
+ */
+ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
+
+ /* the segment is full: switch the log to another segment */
+ if (!__has_curseg_space(sbi, type))
+ sit_i->s_ops->allocate_segment(sbi, type, false);
+
+ locate_dirty_segment(sbi, old_cursegno);
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+ mutex_unlock(&sit_i->sentry_lock);
+
+ if (p_type == NODE)
+ fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
+
+ /* writeout dirty page into bdev */
+ submit_write_page(sbi, page, *new_blkaddr, p_type);
+
+ mutex_unlock(&curseg->curseg_mutex);
+}
+
+/*
+ * Mark a meta page as under writeback and queue it on the META bio; the
+ * page's index is used directly as its block address.
+ */
+void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
+{
+ set_page_writeback(page);
+ submit_write_page(sbi, page, page->index, META);
+}
+
+/*
+ * Write a node page to a newly allocated block.
+ * The summary is built from @nid with the other two set_summary() arguments
+ * set to 0; *@new_blkaddr receives the address chosen by do_write_page().
+ */
+void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
+ unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
+{
+ struct f2fs_summary sum;
+ set_summary(&sum, nid, 0, 0);
+ do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE);
+}
+
+/*
+ * Write a data page to a newly allocated block.
+ * The summary is built from the dnode's nid, the offset within that node and
+ * the version reported by get_node_info(). @old_blkaddr must not be
+ * NULL_ADDR; *@new_blkaddr receives the address chosen by do_write_page().
+ */
+void write_data_page(struct inode *inode, struct page *page,
+ struct dnode_of_data *dn, block_t old_blkaddr,
+ block_t *new_blkaddr)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_summary sum;
+ struct node_info ni;
+
+ BUG_ON(old_blkaddr == NULL_ADDR);
+ get_node_info(sbi, dn->nid, &ni);
+ set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
+
+ do_write_page(sbi, page, old_blkaddr,
+ new_blkaddr, &sum, DATA);
+}
+
+/*
+ * Queue a data page for writing at its existing address @old_blk_addr;
+ * no new block is allocated and no summary or SIT update is done here.
+ */
+void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page,
+ block_t old_blk_addr)
+{
+ submit_write_page(sbi, page, old_blk_addr, DATA);
+}
+
+/*
+ * Re-establish segment state for a data block located at @new_blkaddr:
+ * the matching log is switched to the segment containing @new_blkaddr (if it
+ * is not already there), positioned at the block's offset, @sum is recorded
+ * for that offset and the SIT is updated for the old and new addresses.
+ * No page I/O is issued here; @page is not used by this function.
+ *
+ * Lock order: curseg_mutex of the chosen log, then sit_i->sentry_lock.
+ */
+void recover_data_page(struct f2fs_sb_info *sbi,
+ struct page *page, struct f2fs_summary *sum,
+ block_t old_blkaddr, block_t new_blkaddr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct curseg_info *curseg;
+ unsigned int segno, old_cursegno;
+ struct seg_entry *se;
+ int type;
+
+ segno = GET_SEGNO(sbi, new_blkaddr);
+ se = get_seg_entry(sbi, segno);
+ type = se->type;
+
+ /*
+ * An empty segment that is not a current one: the recorded type is
+ * not used; pick cold data for blocks without an old address and
+ * warm data otherwise.
+ */
+ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
+ if (old_blkaddr == NULL_ADDR)
+ type = CURSEG_COLD_DATA;
+ else
+ type = CURSEG_WARM_DATA;
+ }
+ curseg = CURSEG_I(sbi, type);
+
+ mutex_lock(&curseg->curseg_mutex);
+ mutex_lock(&sit_i->sentry_lock);
+
+ old_cursegno = curseg->segno;
+
+ /* change the current segment */
+ if (segno != curseg->segno) {
+ curseg->next_segno = segno;
+ change_curseg(sbi, type, true);
+ }
+
+ /* offset of the block inside its segment */
+ curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
+ (sbi->blocks_per_seg - 1);
+ __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+
+ refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+
+ locate_dirty_segment(sbi, old_cursegno);
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+
+ mutex_unlock(&sit_i->sentry_lock);
+ mutex_unlock(&curseg->curseg_mutex);
+}
+
+/*
+ * Write a node page at the fixed address @new_blkaddr using the WARM_NODE
+ * log: the log is moved to the segment of @new_blkaddr to record @sum, then
+ * moved on to the position of the page's next block address (as returned by
+ * next_blkaddr_of_node()). The page is submitted and the NODE bio flushed
+ * synchronously before the SIT is updated.
+ *
+ * Lock order: curseg_mutex of the WARM_NODE log, then sit_i->sentry_lock;
+ * both are held across the bio submission.
+ */
+void rewrite_node_page(struct f2fs_sb_info *sbi,
+ struct page *page, struct f2fs_summary *sum,
+ block_t old_blkaddr, block_t new_blkaddr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ int type = CURSEG_WARM_NODE;
+ struct curseg_info *curseg;
+ unsigned int segno, old_cursegno;
+ block_t next_blkaddr = next_blkaddr_of_node(page);
+ unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
+
+ curseg = CURSEG_I(sbi, type);
+
+ mutex_lock(&curseg->curseg_mutex);
+ mutex_lock(&sit_i->sentry_lock);
+
+ segno = GET_SEGNO(sbi, new_blkaddr);
+ old_cursegno = curseg->segno;
+
+ /* change the current segment */
+ if (segno != curseg->segno) {
+ curseg->next_segno = segno;
+ change_curseg(sbi, type, true);
+ }
+ /* offset of the rewritten block inside its segment */
+ curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
+ (sbi->blocks_per_seg - 1);
+ __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+
+ /* change the current log to the next block addr in advance */
+ if (next_segno != segno) {
+ curseg->next_segno = next_segno;
+ change_curseg(sbi, type, true);
+ }
+ curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) &
+ (sbi->blocks_per_seg - 1);
+
+ /* rewrite node page */
+ set_page_writeback(page);
+ submit_write_page(sbi, page, new_blkaddr, NODE);
+ f2fs_submit_bio(sbi, NODE, true);
+ refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+
+ locate_dirty_segment(sbi, old_cursegno);
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+
+ mutex_unlock(&sit_i->sentry_lock);
+ mutex_unlock(&curseg->curseg_mutex);
+}
+
+/*
+ * Restore the data logs from the compacted summary area that starts at
+ * start_sum_block(). Layout as read here: the first page begins with the NAT
+ * journal (SUM_JOURNAL_SIZE bytes), followed by the SIT journal (same size),
+ * followed by the summary entries of the data logs packed back to back.
+ * Entries continue on the following meta pages; the last SUM_FOOTER_SIZE
+ * bytes of every page are not used for entries.
+ * Always returns 0.
+ */
+static int read_compacted_summaries(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct curseg_info *seg_i;
+ unsigned char *kaddr;
+ struct page *page;
+ block_t start;
+ int i, j, offset;
+
+ start = start_sum_block(sbi);
+
+ page = get_meta_page(sbi, start++);
+ kaddr = (unsigned char *)page_address(page);
+
+ /* Step 1: restore nat cache */
+ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE);
+
+ /* Step 2: restore sit cache */
+ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
+ memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE,
+ SUM_JOURNAL_SIZE);
+ offset = 2 * SUM_JOURNAL_SIZE;
+
+ /* Step 3: restore summary entries */
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ unsigned short blk_off;
+ unsigned int segno;
+
+ seg_i = CURSEG_I(sbi, i);
+ segno = le32_to_cpu(ckpt->cur_data_segno[i]);
+ blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]);
+ seg_i->next_segno = segno;
+ reset_curseg(sbi, i, 0);
+ seg_i->alloc_type = ckpt->alloc_type[i];
+ seg_i->next_blkoff = blk_off;
+
+ /* an SSR log has entries for the whole segment, not just up to blk_off */
+ if (seg_i->alloc_type == SSR)
+ blk_off = sbi->blocks_per_seg;
+
+ for (j = 0; j < blk_off; j++) {
+ struct f2fs_summary *s;
+ s = (struct f2fs_summary *)(kaddr + offset);
+ seg_i->sum_blk->entries[j] = *s;
+ offset += SUMMARY_SIZE;
+ /* another entry still fits on this page: keep going */
+ if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+ SUM_FOOTER_SIZE)
+ continue;
+
+ /* page exhausted: move on to the next meta page */
+ f2fs_put_page(page, 1);
+ page = NULL;
+
+ page = get_meta_page(sbi, start++);
+ kaddr = (unsigned char *)page_address(page);
+ offset = 0;
+ }
+ }
+ f2fs_put_page(page, 1);
+ return 0;
+}
+
+/*
+ * Restore one log (@type) from its full-size summary block.
+ * The segment number and block offset come from the checkpoint; where the
+ * summary block is read from depends on the log kind and on whether the
+ * checkpoint carries CP_UMOUNT_FLAG. For node logs without CP_UMOUNT_FLAG the
+ * summary is rebuilt through restore_node_summary().
+ * Returns 0 on success, or -EINVAL if restore_node_summary() fails.
+ */
+static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_summary_block *sum;
+ struct curseg_info *curseg;
+ struct page *new;
+ unsigned short blk_off;
+ unsigned int segno = 0;
+ block_t blk_addr = 0;
+
+ /* get segment number and block addr */
+ if (IS_DATASEG(type)) {
+ segno = le32_to_cpu(ckpt->cur_data_segno[type]);
+ blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
+ CURSEG_HOT_DATA]);
+ if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+ blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
+ else
+ blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
+ } else {
+ segno = le32_to_cpu(ckpt->cur_node_segno[type -
+ CURSEG_HOT_NODE]);
+ blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
+ CURSEG_HOT_NODE]);
+ if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+ blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
+ type - CURSEG_HOT_NODE);
+ else
+ /* no node summaries in the checkpoint: use the segment's own summary block */
+ blk_addr = GET_SUM_BLOCK(sbi, segno);
+ }
+
+ new = get_meta_page(sbi, blk_addr);
+ sum = (struct f2fs_summary_block *)page_address(new);
+
+ if (IS_NODESEG(type)) {
+ if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) {
+ /* clear version and ofs_in_node of every entry */
+ struct f2fs_summary *ns = &sum->entries[0];
+ int i;
+ for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
+ ns->version = 0;
+ ns->ofs_in_node = 0;
+ }
+ } else {
+ if (restore_node_summary(sbi, segno, sum)) {
+ f2fs_put_page(new, 1);
+ return -EINVAL;
+ }
+ }
+ }
+
+ /* set uncompleted segment to curseg */
+ curseg = CURSEG_I(sbi, type);
+ mutex_lock(&curseg->curseg_mutex);
+ /* copy first: reset_curseg() below re-initialises the footer of sum_blk */
+ memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE);
+ curseg->next_segno = segno;
+ reset_curseg(sbi, type, 0);
+ curseg->alloc_type = ckpt->alloc_type[type];
+ curseg->next_blkoff = blk_off;
+ mutex_unlock(&curseg->curseg_mutex);
+ f2fs_put_page(new, 1);
+ return 0;
+}
+
+/*
+ * Restore the summaries of all current segments from the checkpoint.
+ * With CP_COMPACT_SUM_FLAG set, the data logs come from the compacted area
+ * and only the node logs are read as normal summary blocks; otherwise every
+ * log is read as a normal summary block.
+ * Returns 0 on success, -EINVAL if any reader fails.
+ */
+static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
+{
+	int first = CURSEG_HOT_DATA;
+	int t;
+
+	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+		/* restore for compacted data summary */
+		if (read_compacted_summaries(sbi))
+			return -EINVAL;
+		first = CURSEG_HOT_NODE;
+	}
+
+	for (t = first; t <= CURSEG_COLD_NODE; t++) {
+		if (read_normal_summaries(sbi, t))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Write the data logs in compacted form, starting at meta block @blkaddr.
+ * The first page gets the NAT journal followed by the SIT journal
+ * (SUM_JOURNAL_SIZE bytes each); the summary entries of the data logs are
+ * then packed back to back, continuing on following meta pages. The last
+ * SUM_FOOTER_SIZE bytes of every page are left unused by entries.
+ * This is the counterpart of read_compacted_summaries().
+ */
+static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ struct page *page;
+ unsigned char *kaddr;
+ struct f2fs_summary *summary;
+ struct curseg_info *seg_i;
+ int written_size = 0;
+ int i, j;
+
+ page = grab_meta_page(sbi, blkaddr++);
+ kaddr = (unsigned char *)page_address(page);
+
+ /* Step 1: write nat cache */
+ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE);
+ written_size += SUM_JOURNAL_SIZE;
+
+ /* Step 2: write sit cache */
+ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
+ memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits,
+ SUM_JOURNAL_SIZE);
+ written_size += SUM_JOURNAL_SIZE;
+
+ set_page_dirty(page);
+
+ /* Step 3: write summary entries */
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ unsigned short blkoff;
+ seg_i = CURSEG_I(sbi, i);
+ /* an SSR log writes entries for the whole segment */
+ if (sbi->ckpt->alloc_type[i] == SSR)
+ blkoff = sbi->blocks_per_seg;
+ else
+ blkoff = curseg_blkoff(sbi, i);
+
+ for (j = 0; j < blkoff; j++) {
+ /* previous page was filled and released: start a new one */
+ if (!page) {
+ page = grab_meta_page(sbi, blkaddr++);
+ kaddr = (unsigned char *)page_address(page);
+ written_size = 0;
+ }
+ summary = (struct f2fs_summary *)(kaddr + written_size);
+ *summary = seg_i->sum_blk->entries[j];
+ written_size += SUMMARY_SIZE;
+ set_page_dirty(page);
+
+ /* another entry still fits on this page: keep going */
+ if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+ SUM_FOOTER_SIZE)
+ continue;
+
+ f2fs_put_page(page, 1);
+ page = NULL;
+ }
+ }
+ /* release a partially filled last page */
+ if (page)
+ f2fs_put_page(page, 1);
+}
+
+/*
+ * Write the summary blocks of all data logs or all node logs, one block per
+ * log, to consecutive addresses starting at @blkaddr. @type is the first log
+ * of the group (data or node) to be written.
+ */
+static void write_normal_summaries(struct f2fs_sb_info *sbi,
+					block_t blkaddr, int type)
+{
+	int nr_logs = IS_DATASEG(type) ? NR_CURSEG_DATA_TYPE :
+					NR_CURSEG_NODE_TYPE;
+	int off;
+
+	for (off = 0; off < nr_logs; off++) {
+		struct curseg_info *seg = CURSEG_I(sbi, type + off);
+
+		/* the summary block must not change while it is written out */
+		mutex_lock(&seg->curseg_mutex);
+		write_sum_page(sbi, seg->sum_blk, blkaddr + off);
+		mutex_unlock(&seg->curseg_mutex);
+	}
+}
+
+/*
+ * Write the data log summaries starting at @start_blk, in compacted form when
+ * the checkpoint has CP_COMPACT_SUM_FLAG set and as normal summary blocks
+ * otherwise.
+ */
+void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+		write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
+		return;
+	}
+
+	write_compacted_summaries(sbi, start_blk);
+}
+
+/*
+ * Write the node log summaries starting at @start_blk, but only when the
+ * checkpoint has CP_UMOUNT_FLAG set; otherwise nothing is written.
+ */
+void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
+		return;
+
+	write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
+}
+
+/*
+ * Look up @val (a nid for NAT_JOURNAL, a segment number for SIT_JOURNAL) in
+ * the journal area of summary block @sum.
+ * Returns the index of the matching journal entry. If there is no match,
+ * @alloc is non-zero and the journal is not full, the journal count is grown
+ * by one and the result of update_{nats,sits}_in_cursum() is returned.
+ * Returns -1 in every other case, including an unknown @type.
+ */
+int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
+					unsigned int val, int alloc)
+{
+	int idx;
+
+	if (type == NAT_JOURNAL) {
+		for (idx = 0; idx < nats_in_cursum(sum); idx++)
+			if (le32_to_cpu(nid_in_journal(sum, idx)) == val)
+				return idx;
+
+		if (!alloc || nats_in_cursum(sum) >= NAT_JOURNAL_ENTRIES)
+			return -1;
+		return update_nats_in_cursum(sum, 1);
+	}
+
+	if (type == SIT_JOURNAL) {
+		for (idx = 0; idx < sits_in_cursum(sum); idx++)
+			if (le32_to_cpu(segno_in_journal(sum, idx)) == val)
+				return idx;
+
+		if (!alloc || sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES)
+			return -1;
+		return update_sits_in_cursum(sum, 1);
+	}
+
+	return -1;
+}
+
+/*
+ * Return the meta page holding the currently valid SIT block for @segno.
+ * Each SIT block has two possible locations, sit_blocks apart; the bit for
+ * the block in sit_bitmap selects the second one.
+ * The page is obtained through get_meta_page(); callers in this file release
+ * it with f2fs_put_page(page, 1).
+ */
+static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno);
+ block_t blk_addr = sit_i->sit_base_addr + offset;
+
+ check_seg_range(sbi, segno);
+
+ /* calculate sit block address */
+ if (f2fs_test_bit(offset, sit_i->sit_bitmap))
+ blk_addr += sit_i->sit_blocks;
+
+ return get_meta_page(sbi, blk_addr);
+}
+
+/*
+ * Prepare the alternate copy of the SIT block that covers segment @start:
+ * the current block is copied in full to the address returned by
+ * next_sit_addr(), the copy is marked dirty and set_to_next_sit() is called
+ * for @start. Returns the destination page; the caller releases it with
+ * f2fs_put_page().
+ */
+static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
+ unsigned int start)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct page *src_page, *dst_page;
+ pgoff_t src_off, dst_off;
+ void *src_addr, *dst_addr;
+
+ src_off = current_sit_addr(sbi, start);
+ dst_off = next_sit_addr(sbi, src_off);
+
+ /* get current sit block page without lock */
+ src_page = get_meta_page(sbi, src_off);
+ dst_page = grab_meta_page(sbi, dst_off);
+ /* the current copy is expected to be clean at this point */
+ BUG_ON(PageDirty(src_page));
+
+ src_addr = page_address(src_page);
+ dst_addr = page_address(dst_page);
+ memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+
+ set_page_dirty(dst_page);
+ f2fs_put_page(src_page, 1);
+
+ set_to_next_sit(sit_i, start);
+
+ return dst_page;
+}
+
+/*
+ * If the SIT journal in the cold-data summary block is full, mark every
+ * journalled segment's SIT entry dirty, empty the journal and return true.
+ * Otherwise leave the journal untouched and return false.
+ */
+static bool flush_sits_in_journal(struct f2fs_sb_info *sbi)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+	struct f2fs_summary_block *sum = curseg->sum_blk;
+	int idx;
+
+	/*
+	 * Only a completely full journal is flushed; a journal that still
+	 * has room keeps its entries, since otherwise they could not be
+	 * replaced with newly hot sit entries.
+	 */
+	if (sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
+		return false;
+
+	/* walk the journal from its last entry down to the first */
+	idx = sits_in_cursum(sum);
+	while (--idx >= 0) {
+		unsigned int segno = le32_to_cpu(segno_in_journal(sum, idx));
+
+		__mark_sit_entry_dirty(sbi, segno);
+	}
+
+	update_sits_in_cursum(sum, -sits_in_cursum(sum));
+	return true;
+}
+
+/*
+ * CP calls this function, which flushes SIT entries including sit_journal,
+ * and moves prefree segs to free segs.
+ *
+ * Every segment marked in dirty_sentries_bitmap is written either into the
+ * SIT journal of the cold-data summary block (when the journal was not just
+ * flushed and has a slot) or into its SIT block page obtained through
+ * get_next_sit_page().
+ *
+ * Lock order: curseg_mutex of the cold-data log, then sit_i->sentry_lock.
+ */
+void flush_sit_entries(struct f2fs_sb_info *sbi)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ unsigned long nsegs = TOTAL_SEGS(sbi);
+ struct page *page = NULL;
+ struct f2fs_sit_block *raw_sit = NULL;
+ unsigned int start = 0, end = 0;
+ /* -1 so that the first find_next_bit() call starts at bit 0 */
+ unsigned int segno = -1;
+ bool flushed;
+
+ mutex_lock(&curseg->curseg_mutex);
+ mutex_lock(&sit_i->sentry_lock);
+
+ /*
+ * "flushed" indicates whether sit entries in journal are flushed
+ * to the SIT area or not.
+ */
+ flushed = flush_sits_in_journal(sbi);
+
+ while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) {
+ struct seg_entry *se = get_seg_entry(sbi, segno);
+ int sit_offset, offset;
+
+ sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+
+ /* the journal was just emptied: write straight to the SIT block */
+ if (flushed)
+ goto to_sit_page;
+
+ /* find the segment's journal slot, allocating one if there is room */
+ offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1);
+ if (offset >= 0) {
+ segno_in_journal(sum, offset) = cpu_to_le32(segno);
+ seg_info_to_raw_sit(se, &sit_in_journal(sum, offset));
+ goto flush_done;
+ }
+to_sit_page:
+ /* segno is outside the SIT block currently held: switch pages */
+ if (!page || (start > segno) || (segno > end)) {
+ if (page) {
+ f2fs_put_page(page, 1);
+ page = NULL;
+ }
+
+ start = START_SEGNO(sit_i, segno);
+ end = start + SIT_ENTRY_PER_BLOCK - 1;
+
+ /* read sit block that will be updated */
+ page = get_next_sit_page(sbi, start);
+ raw_sit = page_address(page);
+ }
+
+ /* update entry in SIT block */
+ seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]);
+flush_done:
+ __clear_bit(segno, bitmap);
+ sit_i->dirty_sentries--;
+ }
+ mutex_unlock(&sit_i->sentry_lock);
+ mutex_unlock(&curseg->curseg_mutex);
+
+ /* writeout last modified SIT block */
+ /* NOTE(review): page is still NULL if no SIT block was touched; presumably f2fs_put_page() tolerates NULL - confirm */
+ f2fs_put_page(page, 1);
+
+ set_prefree_as_free_segments(sbi);
+}
+
+/*
+ * Allocate and initialise the in-memory SIT information (sit_info) and hook
+ * it into the segment manager: the per-segment entry array with both valid
+ * maps, the dirty-entries bitmap, optional per-section entries (only when
+ * segs_per_sec > 1) and a private copy of the SIT bitmap from the checkpoint.
+ * Returns 0 on success or -ENOMEM.
+ *
+ * NOTE(review): the error paths return without freeing what was allocated
+ * before; sit_i is already reachable through SM_I(sbi)->sit_info, so the
+ * caller is presumably expected to tear it down - confirm.
+ */
+static int build_sit_info(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct sit_info *sit_i;
+ unsigned int sit_segs, start;
+ char *src_bitmap, *dst_bitmap;
+ unsigned int bitmap_size;
+
+ /* allocate memory for SIT information */
+ sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL);
+ if (!sit_i)
+ return -ENOMEM;
+
+ SM_I(sbi)->sit_info = sit_i;
+
+ sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry));
+ if (!sit_i->sentries)
+ return -ENOMEM;
+
+ bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!sit_i->dirty_sentries_bitmap)
+ return -ENOMEM;
+
+ /* two valid-block maps per segment: current and checkpointed */
+ for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ sit_i->sentries[start].cur_valid_map
+ = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ sit_i->sentries[start].ckpt_valid_map
+ = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ if (!sit_i->sentries[start].cur_valid_map
+ || !sit_i->sentries[start].ckpt_valid_map)
+ return -ENOMEM;
+ }
+
+ if (sbi->segs_per_sec > 1) {
+ sit_i->sec_entries = vzalloc(sbi->total_sections *
+ sizeof(struct sec_entry));
+ if (!sit_i->sec_entries)
+ return -ENOMEM;
+ }
+
+ /* get information related with SIT */
+ /* half of segment_count_sit: the SIT area holds two copies of each block */
+ sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1;
+
+ /* setup SIT bitmap from checkpoint pack */
+ bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
+ src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
+
+ dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dst_bitmap)
+ return -ENOMEM;
+ memcpy(dst_bitmap, src_bitmap, bitmap_size);
+
+ /* init SIT information */
+ sit_i->s_ops = &default_salloc_ops;
+
+ sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
+ sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
+ sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
+ sit_i->sit_bitmap = dst_bitmap;
+ sit_i->bitmap_size = bitmap_size;
+ sit_i->dirty_sentries = 0;
+ sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
+ sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
+ sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
+ mutex_init(&sit_i->sentry_lock);
+ return 0;
+}
+
+/*
+ * Allocate the free segment/section bitmaps and hook them into the segment
+ * manager. All bits start out set and both free counters start at 0; the
+ * real state is filled in later by init_free_segmap().
+ * Returns 0 on success or -ENOMEM.
+ *
+ * NOTE(review): the error paths return without freeing what was allocated
+ * before; free_i is already reachable through SM_I(sbi)->free_info, so the
+ * caller is presumably expected to tear it down - confirm.
+ */
+static int build_free_segmap(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_sm_info *sm_info = SM_I(sbi);
+ struct free_segmap_info *free_i;
+ unsigned int bitmap_size, sec_bitmap_size;
+
+ /* allocate memory for free segmap information */
+ free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL);
+ if (!free_i)
+ return -ENOMEM;
+
+ SM_I(sbi)->free_info = free_i;
+
+ bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
+ if (!free_i->free_segmap)
+ return -ENOMEM;
+
+ sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections);
+ free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
+ if (!free_i->free_secmap)
+ return -ENOMEM;
+
+ /* set all segments as dirty temporarily */
+ memset(free_i->free_segmap, 0xff, bitmap_size);
+ memset(free_i->free_secmap, 0xff, sec_bitmap_size);
+
+ /* init free segmap information */
+ free_i->start_segno =
+ (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
+ free_i->free_segments = 0;
+ free_i->free_sections = 0;
+ rwlock_init(&free_i->segmap_lock);
+ return 0;
+}
+
+/*
+ * Allocate the array of NR_CURSEG_TYPE current-segment descriptors, each with
+ * a page-sized, zeroed summary block, and then restore their contents from
+ * the checkpoint via restore_curseg_summaries().
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error returned
+ * by restore_curseg_summaries().
+ *
+ * NOTE(review): on failure nothing is freed here; the array is already
+ * reachable through SM_I(sbi)->curseg_array, so the caller is presumably
+ * expected to tear it down - confirm.
+ */
+static int build_curseg(struct f2fs_sb_info *sbi)
+{
+ struct curseg_info *array;
+ int i;
+
+ array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ SM_I(sbi)->curseg_array = array;
+
+ for (i = 0; i < NR_CURSEG_TYPE; i++) {
+ mutex_init(&array[i].curseg_mutex);
+ array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ if (!array[i].sum_blk)
+ return -ENOMEM;
+ /* no segment assigned yet */
+ array[i].segno = NULL_SEGNO;
+ array[i].next_blkoff = 0;
+ }
+ return restore_curseg_summaries(sbi);
+}
+
+/*
+ * Fill the in-memory segment entries for every segment.
+ * For each segment the raw SIT entry is taken from the SIT journal in the
+ * cold-data summary block if the segment is listed there, and from the
+ * current SIT block page otherwise. The entry is passed to
+ * check_block_count() and converted with seg_info_from_raw_sit(). When
+ * segs_per_sec > 1 the per-section valid block count is accumulated too.
+ */
+static void build_sit_entries(struct f2fs_sb_info *sbi)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ unsigned int start;
+
+ for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ struct seg_entry *se = &sit_i->sentries[start];
+ struct f2fs_sit_block *sit_blk;
+ struct f2fs_sit_entry sit;
+ struct page *page;
+ int i;
+
+ /* a journalled entry takes precedence over the SIT block */
+ mutex_lock(&curseg->curseg_mutex);
+ for (i = 0; i < sits_in_cursum(sum); i++) {
+ if (le32_to_cpu(segno_in_journal(sum, i)) == start) {
+ sit = sit_in_journal(sum, i);
+ mutex_unlock(&curseg->curseg_mutex);
+ goto got_it;
+ }
+ }
+ mutex_unlock(&curseg->curseg_mutex);
+ /* not journalled: copy the entry out of its SIT block page */
+ page = get_current_sit_page(sbi, start);
+ sit_blk = (struct f2fs_sit_block *)page_address(page);
+ sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
+ f2fs_put_page(page, 1);
+got_it:
+ check_block_count(sbi, start, &sit);
+ seg_info_from_raw_sit(se, &sit);
+ if (sbi->segs_per_sec > 1) {
+ struct sec_entry *e = get_sec_entry(sbi, start);
+ e->valid_blocks += se->valid_blocks;
+ }
+ }
+}
+
+/*
+ * Derive the free segment map from the SIT entries: every segment without
+ * valid blocks is marked free, after which the segments used by the current
+ * logs are marked in use again.
+ */
+static void init_free_segmap(struct f2fs_sb_info *sbi)
+{
+	unsigned int segno;
+	int type;
+
+	for (segno = 0; segno < TOTAL_SEGS(sbi); segno++) {
+		if (get_seg_entry(sbi, segno)->valid_blocks)
+			continue;
+		__set_free(sbi, segno);
+	}
+
+	/* set use the current segments */
+	for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++)
+		__set_test_and_inuse(sbi, CURSEG_I(sbi, type)->segno);
+}
+
+/*
+ * Populate the DIRTY segment list: walk the in-use segments (according to the
+ * free segmap) and register every segment that is partially valid, i.e. has
+ * at least one but fewer than blocks_per_seg valid blocks.
+ */
+static void init_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int segno = 0, offset = 0;
+ unsigned short valid_blocks;
+
+ while (segno < TOTAL_SEGS(sbi)) {
+ /* find dirty segment based on free segmap */
+ segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
+ if (segno >= TOTAL_SEGS(sbi))
+ break;
+ /* resume the next search right after this segment */
+ offset = segno + 1;
+ valid_blocks = get_valid_blocks(sbi, segno, 0);
+ /* completely full or completely empty segments are not dirty */
+ if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks)
+ continue;
+ mutex_lock(&dirty_i->seglist_lock);
+ __locate_dirty_segment(sbi, segno, DIRTY);
+ mutex_unlock(&dirty_i->seglist_lock);
+ }
+}
+
+/*
+ * Allocate the zeroed victim segment bitmaps for FG_GC and BG_GC, one bit per
+ * segment. Returns 0 on success or -ENOMEM if either allocation failed.
+ *
+ * NOTE(review): when only one of the two allocations fails, the other one is
+ * not freed here; presumably the caller's teardown handles it - confirm.
+ */
+static int init_victim_segmap(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+
+ dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC])
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Allocate the dirty segment list information with one zeroed bitmap per
+ * dirty type, hook it into the segment manager, register the partially valid
+ * segments through init_dirty_segmap() and allocate the victim bitmaps.
+ * Returns 0 on success or -ENOMEM.
+ *
+ * NOTE(review): the error paths return without freeing what was allocated
+ * before; dirty_i is already reachable through SM_I(sbi)->dirty_info, so the
+ * caller is presumably expected to tear it down - confirm.
+ */
+static int build_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i;
+ unsigned int bitmap_size, i;
+
+ /* allocate memory for dirty segments list information */
+ dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL);
+ if (!dirty_i)
+ return -ENOMEM;
+
+ SM_I(sbi)->dirty_info = dirty_i;
+ mutex_init(&dirty_i->seglist_lock);
+
+ bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+
+ for (i = 0; i < NR_DIRTY_TYPE; i++) {
+ dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dirty_i->dirty_segmap[i])
+ return -ENOMEM;
+ }
+
+ init_dirty_segmap(sbi);
+ return init_victim_segmap(sbi);
+}
+
+/*
+ * Initialise the min/max modification times used by the cost-benefit GC
+ * algorithm: min_mtime is the smallest per-section average of the segment
+ * mtimes, max_mtime is the current value of get_mtime().
+ */
+static void init_min_max_mtime(struct f2fs_sb_info *sbi)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int sec_start;
+
+	mutex_lock(&sit_i->sentry_lock);
+
+	sit_i->min_mtime = LLONG_MAX;
+
+	sec_start = 0;
+	while (sec_start < TOTAL_SEGS(sbi)) {
+		unsigned long long sum = 0, avg;
+		unsigned int ofs = 0;
+
+		/* average the mtime over every segment of this section */
+		while (ofs < sbi->segs_per_sec) {
+			sum += get_seg_entry(sbi, sec_start + ofs)->mtime;
+			ofs++;
+		}
+		avg = div_u64(sum, sbi->segs_per_sec);
+
+		if (avg < sit_i->min_mtime)
+			sit_i->min_mtime = avg;
+
+		sec_start += sbi->segs_per_sec;
+	}
+
+	sit_i->max_mtime = get_mtime(sbi);
+	mutex_unlock(&sit_i->sentry_lock);
+}
+
+/*
+ * Allocate the segment manager info, fill it from the raw super block and
+ * the checkpoint, then build its sub-structures in order: SIT info, free
+ * segmap, current segments, SIT entries, dirty segmap and mtime bounds.
+ *
+ * Returns 0 on success, -ENOMEM if sm_info cannot be allocated, or the
+ * first error reported by a build step.  Nothing is released here on
+ * failure; sbi->sm_info is already set by then.
+ * NOTE(review): presumably the caller cleans up with
+ * destroy_segment_manager() - confirm.
+ */
+int build_segment_manager(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_sm_info *sm_info;
+ int err;
+
+ sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL);
+ if (!sm_info)
+ return -ENOMEM;
+
+ /* init sm info */
+ sbi->sm_info = sm_info;
+ INIT_LIST_HEAD(&sm_info->wblist_head);
+ spin_lock_init(&sm_info->wblist_lock);
+ /* layout fields come from the super block, segment budgets from the checkpoint */
+ sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
+ sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
+ sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
+ sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
+ sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
+ sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
+ sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
+
+ err = build_sit_info(sbi);
+ if (err)
+ return err;
+ err = build_free_segmap(sbi);
+ if (err)
+ return err;
+ err = build_curseg(sbi);
+ if (err)
+ return err;
+
+ /* reinit free segmap based on SIT */
+ build_sit_entries(sbi);
+
+ init_free_segmap(sbi);
+ /* the dirty segmap is derived from the free segmap initialised above */
+ err = build_dirty_segmap(sbi);
+ if (err)
+ return err;
+
+ init_min_max_mtime(sbi);
+ return 0;
+}
+
+/* Free the bitmap of one dirty type and zero its counter under seglist_lock. */
+static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
+		enum dirty_type dirty_type)
+{
+	struct dirty_seglist_info *info = DIRTY_I(sbi);
+	struct mutex *lock = &info->seglist_lock;
+
+	mutex_lock(lock);
+	kfree(info->dirty_segmap[dirty_type]);
+	info->nr_dirty[dirty_type] = 0;
+	mutex_unlock(lock);
+}
+
+/* Clear every bit of the foreground-GC victim bitmap. */
+void reset_victim_segmap(struct f2fs_sb_info *sbi)
+{
+	unsigned long *fg_map = DIRTY_I(sbi)->victim_segmap[FG_GC];
+
+	memset(fg_map, 0, f2fs_bitmap_size(TOTAL_SEGS(sbi)));
+}
+
+/* Free the foreground and background GC victim bitmaps. */
+static void destroy_victim_segmap(struct f2fs_sb_info *sbi)
+{
+	unsigned long **maps = DIRTY_I(sbi)->victim_segmap;
+
+	kfree(maps[FG_GC]);
+	kfree(maps[BG_GC]);
+}
+
+/*
+ * Tear down the dirty segment list info: free every per-type dirty bitmap
+ * and both victim bitmaps, detach dirty_info from the segment manager and
+ * free it.  Does nothing if the info is NULL.
+ */
+static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+	struct dirty_seglist_info *info = DIRTY_I(sbi);
+	int type = 0;
+
+	if (info == NULL)
+		return;
+
+	/* discard pre-free/dirty segments list */
+	while (type < NR_DIRTY_TYPE) {
+		discard_dirty_segmap(sbi, type);
+		type++;
+	}
+
+	destroy_victim_segmap(sbi);
+	SM_I(sbi)->dirty_info = NULL;
+	kfree(info);
+}
+
+/* Release the cached summary block of every active log, then the array. */
+static void destroy_curseg(struct f2fs_sb_info *sbi)
+{
+	struct curseg_info *logs = SM_I(sbi)->curseg_array;
+	struct curseg_info *cur;
+
+	if (logs == NULL)
+		return;
+
+	SM_I(sbi)->curseg_array = NULL;
+	for (cur = logs; cur < logs + NR_CURSEG_TYPE; cur++)
+		kfree(cur->sum_blk);
+	kfree(logs);
+}
+
+/* Detach and free the free-segment info together with both of its bitmaps. */
+static void destroy_free_segmap(struct f2fs_sb_info *sbi)
+{
+	struct free_segmap_info *info = SM_I(sbi)->free_info;
+
+	if (info == NULL)
+		return;
+
+	SM_I(sbi)->free_info = NULL;
+	kfree(info->free_segmap);
+	kfree(info->free_secmap);
+	kfree(info);
+}
+
+/*
+ * Free the SIT cache: the per-segment validity bitmaps, the segment and
+ * section entry arrays, the dirty-sentries bitmap, the SIT bitmap and the
+ * sit_info itself.  Does nothing if sit_info is NULL.
+ */
+static void destroy_sit_info(struct f2fs_sb_info *sbi)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned int segno;
+
+	if (sit_i == NULL)
+		return;
+
+	/* the validity bitmaps hang off the entries of the sentries array */
+	for (segno = 0; sit_i->sentries && segno < TOTAL_SEGS(sbi); segno++) {
+		struct seg_entry *se = &sit_i->sentries[segno];
+
+		kfree(se->cur_valid_map);
+		kfree(se->ckpt_valid_map);
+	}
+	vfree(sit_i->sentries);
+	vfree(sit_i->sec_entries);
+	kfree(sit_i->dirty_sentries_bitmap);
+
+	SM_I(sbi)->sit_info = NULL;
+	kfree(sit_i->sit_bitmap);
+	kfree(sit_i);
+}
+
+/*
+ * Tear down everything build_segment_manager() set up: dirty segmap,
+ * current segments, free segmap and SIT info, then detach and free
+ * sbi->sm_info.
+ */
+void destroy_segment_manager(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_sm_info *sm_info = SM_I(sbi);
+ /* each helper returns early when its own structure pointer is NULL */
+ destroy_dirty_segmap(sbi);
+ destroy_curseg(sbi);
+ destroy_free_segmap(sbi);
+ destroy_sit_info(sbi);
+ sbi->sm_info = NULL;
+ kfree(sm_info);
+}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
new file mode 100644
index 000000000000..552dadbb2327
--- /dev/null
+++ b/fs/f2fs/segment.h
@@ -0,0 +1,618 @@
+/*
+ * fs/f2fs/segment.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+/* constant macro */
+#define NULL_SEGNO ((unsigned int)(~0))
+
+/*
+ * L: Logical segment # in volume, R: Relative segment # in main area.
+ * Convert between the two by removing/adding free_i->start_segno.
+ */
+#define GET_L2R_SEGNO(free_i, segno)	((segno) - (free_i)->start_segno)
+#define GET_R2L_SEGNO(free_i, segno)	((segno) + (free_i)->start_segno)
+
+/* true if log type @t is one of the three data logs */
+#define IS_DATASEG(t)	\
+	(((t) == CURSEG_HOT_DATA) || ((t) == CURSEG_COLD_DATA) ||	\
+	((t) == CURSEG_WARM_DATA))
+
+/* true if log type @t is one of the three node logs */
+#define IS_NODESEG(t)	\
+	(((t) == CURSEG_HOT_NODE) || ((t) == CURSEG_COLD_NODE) ||	\
+	((t) == CURSEG_WARM_NODE))
+
+/* true if @segno is the segment currently used by any of the six logs */
+#define IS_CURSEG(sbi, segno)	\
+	(((segno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) ||	\
+	((segno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) ||	\
+	((segno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) ||	\
+	((segno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) ||	\
+	((segno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) ||	\
+	((segno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+
+/*
+ * true if @secno is the section containing the segment currently used by
+ * any of the six logs.  The last line deliberately has no trailing
+ * backslash so the macro ends here instead of swallowing the next line.
+ */
+#define IS_CURSEC(sbi, secno)	\
+	(((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno /	\
+	(sbi)->segs_per_sec) ||	\
+	((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno /	\
+	(sbi)->segs_per_sec) ||	\
+	((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno /	\
+	(sbi)->segs_per_sec) ||	\
+	((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno /	\
+	(sbi)->segs_per_sec) ||	\
+	((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno /	\
+	(sbi)->segs_per_sec) ||	\
+	((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno /	\
+	(sbi)->segs_per_sec))
+
+/*
+ * Address helpers.  Every macro argument is parenthesized so that callers
+ * may pass arbitrary expressions without precedence surprises.
+ */
+
+/* first block address of relative segment @segno */
+#define START_BLOCK(sbi, segno)	\
+	(SM_I(sbi)->seg0_blkaddr +	\
+	(GET_R2L_SEGNO(FREE_I(sbi), (segno)) << (sbi)->log_blocks_per_seg))
+/* block address the log @curseg will write next */
+#define NEXT_FREE_BLKADDR(sbi, curseg)	\
+	(START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)
+
+#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr)
+
+/* block offset / logical segment number counted from segment 0 */
+#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr)	\
+	((blk_addr) - SM_I(sbi)->seg0_blkaddr)
+#define GET_SEGNO_FROM_SEG0(sbi, blk_addr)	\
+	(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
+/* relative segment number of @blk_addr, NULL_SEGNO for NULL_ADDR/NEW_ADDR */
+#define GET_SEGNO(sbi, blk_addr)	\
+	((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ?	\
+	NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi),	\
+	GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
+#define GET_SECNO(sbi, segno)	\
+	((segno) / (sbi)->segs_per_sec)
+#define GET_ZONENO_FROM_SEGNO(sbi, segno)	\
+	(((segno) / (sbi)->segs_per_sec) / (sbi)->secs_per_zone)
+
+/* block address of the summary block belonging to @segno */
+#define GET_SUM_BLOCK(sbi, segno)	\
+	((sbi)->sm_info->ssa_blkaddr + (segno))
+
+#define GET_SUM_TYPE(footer) ((footer)->entry_type)
+#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))
+
+/* position of @segno's entry inside the SIT area */
+#define SIT_ENTRY_OFFSET(sit_i, segno)	\
+	((segno) % (sit_i)->sents_per_block)
+#define SIT_BLOCK_OFFSET(sit_i, segno)	\
+	((segno) / SIT_ENTRY_PER_BLOCK)
+#define START_SEGNO(sit_i, segno)	\
+	(SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
+/* bytes needed for a bitmap of @nr bits, rounded up to whole longs */
+#define f2fs_bitmap_size(nr)	\
+	(BITS_TO_LONGS(nr) * sizeof(unsigned long))
+#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
+
+/*
+ * Convert a block address into a sector number.  The address is widened to
+ * sector_t before shifting, so that a block address type narrower than
+ * sector_t cannot lose its high bits in the shift.
+ */
+#define SECTOR_FROM_BLOCK(sbi, blk_addr)	\
+	(((sector_t)(blk_addr)) <<	\
+	((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
+
+/* during checkpoint, bio_private is used to synchronize the last bio */
+struct bio_private {
+ struct f2fs_sb_info *sbi;
+ bool is_sync;
+ void *wait;
+};
+
+/*
+ * indicate a block allocation direction: RIGHT and LEFT.
+ * RIGHT means allocating new sections towards the end of volume.
+ * LEFT means the opposite direction.
+ */
+enum {
+ ALLOC_RIGHT = 0,
+ ALLOC_LEFT
+};
+
+/*
+ * In the victim_sel_policy->alloc_mode, there are two block allocation modes.
+ * LFS writes data sequentially with cleaning operations.
+ * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
+ */
+enum {
+ LFS = 0,
+ SSR
+};
+
+/*
+ * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes.
+ * GC_CB is based on cost-benefit algorithm.
+ * GC_GREEDY is based on greedy algorithm.
+ */
+enum {
+ GC_CB = 0,
+ GC_GREEDY
+};
+
+/*
+ * BG_GC means the background cleaning job.
+ * FG_GC means the on-demand cleaning job.
+ */
+enum {
+ BG_GC = 0,
+ FG_GC
+};
+
+/* for a function parameter to select a victim segment */
+struct victim_sel_policy {
+ int alloc_mode; /* LFS or SSR */
+ int gc_mode; /* GC_CB or GC_GREEDY */
+ unsigned long *dirty_segmap; /* dirty segment bitmap */
+ unsigned int offset; /* last scanned bitmap offset */
+ unsigned int ofs_unit; /* bitmap search unit */
+ unsigned int min_cost; /* minimum cost */
+ unsigned int min_segno; /* segment # having min. cost */
+};
+
+/* in-memory copy of one segment's SIT entry */
+struct seg_entry {
+ unsigned short valid_blocks; /* # of valid blocks */
+ unsigned char *cur_valid_map; /* validity bitmap of blocks */
+ /*
+ * # of valid blocks and the validity bitmap stored in the last
+ * checkpoint pack. This information is used by the SSR mode.
+ */
+ unsigned short ckpt_valid_blocks;
+ unsigned char *ckpt_valid_map;
+ unsigned char type; /* segment type like CURSEG_XXX_TYPE */
+ unsigned long long mtime; /* modification time of the segment */
+};
+
+struct sec_entry {
+ unsigned int valid_blocks; /* # of valid blocks in a section */
+};
+
+struct segment_allocation {
+ void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
+};
+
+struct sit_info {
+ const struct segment_allocation *s_ops;
+
+ block_t sit_base_addr; /* start block address of SIT area */
+ block_t sit_blocks; /* # of blocks used by SIT area */
+ block_t written_valid_blocks; /* # of valid blocks in main area */
+ char *sit_bitmap; /* SIT bitmap pointer */
+ unsigned int bitmap_size; /* SIT bitmap size */
+
+ unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */
+ unsigned int dirty_sentries; /* # of dirty sentries */
+ unsigned int sents_per_block; /* # of SIT entries per block */
+ struct mutex sentry_lock; /* to protect SIT cache */
+ struct seg_entry *sentries; /* SIT segment-level cache */
+ struct sec_entry *sec_entries; /* SIT section-level cache */
+
+ /* for cost-benefit algorithm in cleaning procedure */
+ unsigned long long elapsed_time; /* elapsed time after mount */
+ unsigned long long mounted_time; /* mount time */
+ unsigned long long min_mtime; /* min. modification time */
+ unsigned long long max_mtime; /* max. modification time */
+};
+
+struct free_segmap_info {
+ unsigned int start_segno; /* start segment number logically */
+ unsigned int free_segments; /* # of free segments */
+ unsigned int free_sections; /* # of free sections */
+ rwlock_t segmap_lock; /* free segmap lock */
+ unsigned long *free_segmap; /* free segment bitmap */
+ unsigned long *free_secmap; /* free section bitmap */
+};
+
+/* Notice: The order of dirty type is same with CURSEG_XXX in f2fs.h */
+enum dirty_type {
+ DIRTY_HOT_DATA, /* dirty segments assigned as hot data logs */
+ DIRTY_WARM_DATA, /* dirty segments assigned as warm data logs */
+ DIRTY_COLD_DATA, /* dirty segments assigned as cold data logs */
+ DIRTY_HOT_NODE, /* dirty segments assigned as hot node logs */
+ DIRTY_WARM_NODE, /* dirty segments assigned as warm node logs */
+ DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
+ DIRTY, /* to count # of dirty segments */
+ PRE, /* to count # of entirely obsolete segments */
+ NR_DIRTY_TYPE
+};
+
+/* bookkeeping for dirty segments and GC victim candidates */
+struct dirty_seglist_info {
+ const struct victim_selection *v_ops; /* victim selection operation */
+ unsigned long *dirty_segmap[NR_DIRTY_TYPE]; /* one bitmap per dirty type */
+ struct mutex seglist_lock; /* lock for segment bitmaps */
+ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */
+ unsigned long *victim_segmap[2]; /* BG_GC, FG_GC */
+};
+
+/* victim selection function for cleaning and SSR */
+struct victim_selection {
+ int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
+ int, int, char);
+};
+
+/* for active log information */
+struct curseg_info {
+ struct mutex curseg_mutex; /* lock for consistency */
+ struct f2fs_summary_block *sum_blk; /* cached summary block */
+ unsigned char alloc_type; /* current allocation type */
+ unsigned int segno; /* current segment number */
+ unsigned short next_blkoff; /* next block offset to write */
+ unsigned int zone; /* current zone number */
+ unsigned int next_segno; /* preallocated segment */
+};
+
+/*
+ * inline functions
+ */
+/* Return the active log descriptor of the given log @type. */
+static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
+{
+	return (struct curseg_info *)&SM_I(sbi)->curseg_array[type];
+}
+
+/* Return the cached SIT entry of segment @segno (no range check). */
+static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi,
+		unsigned int segno)
+{
+	return SIT_I(sbi)->sentries + segno;
+}
+
+/* Return the cached entry of the section that contains segment @segno. */
+static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
+		unsigned int segno)
+{
+	return SIT_I(sbi)->sec_entries + GET_SECNO(sbi, segno);
+}
+
+/*
+ * Number of valid blocks for @segno.  f2fs keeps a per-segment and a
+ * per-section counter so a section total is available without summing its
+ * segments: @section > 1 selects the section counter, anything else the
+ * segment counter.
+ */
+static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
+		unsigned int segno, int section)
+{
+	if (section <= 1)
+		return get_seg_entry(sbi, segno)->valid_blocks;
+
+	return get_sec_entry(sbi, segno)->valid_blocks;
+}
+
+/*
+ * Load the in-memory entry @se from the on-disk SIT entry @rs.  The current
+ * and the checkpointed state both start out as copies of the on-disk data.
+ */
+static inline void seg_info_from_raw_sit(struct seg_entry *se,
+		struct f2fs_sit_entry *rs)
+{
+	unsigned short vblocks = GET_SIT_VBLOCKS(rs);
+
+	se->valid_blocks = vblocks;
+	se->ckpt_valid_blocks = vblocks;
+
+	memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+	memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+
+	se->type = GET_SIT_TYPE(rs);
+	se->mtime = le64_to_cpu(rs->mtime);
+}
+
+/*
+ * Encode the in-memory entry @se into the on-disk SIT entry @rs.
+ * As a side effect the checkpointed copy kept in @se (ckpt_valid_map and
+ * ckpt_valid_blocks) is refreshed to match what was just written to @rs.
+ */
+static inline void seg_info_to_raw_sit(struct seg_entry *se,
+ struct f2fs_sit_entry *rs)
+{
+ /* segment type and valid block count share one 16-bit on-disk field */
+ unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) |
+ se->valid_blocks;
+ rs->vblocks = cpu_to_le16(raw_vblocks);
+ memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
+ /* rs->valid_map now equals cur_valid_map, so this syncs ckpt with cur */
+ memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+ se->ckpt_valid_blocks = se->valid_blocks;
+ rs->mtime = cpu_to_le64(se->mtime);
+}
+
+/*
+ * Return the first in-use segment (set bit in the free segmap) at or after
+ * @segno, or a value >= @max if there is none.  Takes segmap_lock for
+ * reading.
+ */
+static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
+		unsigned int max, unsigned int segno)
+{
+	unsigned int found;
+
+	read_lock(&free_i->segmap_lock);
+	found = find_next_bit(free_i->free_segmap, max, segno);
+	read_unlock(&free_i->segmap_lock);
+
+	return found;
+}
+
+/*
+ * Mark @segno free: clear its bit in the free segmap and bump the free
+ * segment counter unconditionally.  If no segment of the enclosing section
+ * is left in use, the section is marked free as well.  Takes segmap_lock
+ * for writing.
+ */
+static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int secno = segno / sbi->segs_per_sec;
+ unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int next;
+
+ write_lock(&free_i->segmap_lock);
+ clear_bit(segno, free_i->free_segmap);
+ free_i->free_segments++;
+
+ /* first in-use segment at or after the start of this section */
+ next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno);
+ /* it lies beyond the section, so the whole section is free */
+ if (next >= start_segno + sbi->segs_per_sec) {
+ clear_bit(secno, free_i->free_secmap);
+ free_i->free_sections++;
+ }
+ write_unlock(&free_i->segmap_lock);
+}
+
+/*
+ * Mark @segno in use: set its bit in the free segmap and decrement the free
+ * segment counter unconditionally; the section counter is only decremented
+ * when the section bit was not set before.
+ * NOTE(review): unlike its siblings this helper does not take segmap_lock;
+ * presumably the caller already holds it - confirm.
+ */
+static inline void __set_inuse(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int secno = segno / sbi->segs_per_sec;
+ set_bit(segno, free_i->free_segmap);
+ free_i->free_segments--;
+ if (!test_and_set_bit(secno, free_i->free_secmap))
+ free_i->free_sections--;
+}
+
+/*
+ * Like __set_free(), but the counters are only touched when the segment
+ * (and then the section) bit was actually set.  Takes segmap_lock for
+ * writing.
+ */
+static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
+		unsigned int segno)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int secno = segno / sbi->segs_per_sec;
+	unsigned int sec_first = secno * sbi->segs_per_sec;
+	unsigned int inuse;
+
+	write_lock(&free_i->segmap_lock);
+
+	/* already free: nothing to account */
+	if (!test_and_clear_bit(segno, free_i->free_segmap))
+		goto unlock;
+
+	free_i->free_segments++;
+
+	/* section becomes free once none of its segments is in use */
+	inuse = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), sec_first);
+	if (inuse >= sec_first + sbi->segs_per_sec &&
+			test_and_clear_bit(secno, free_i->free_secmap))
+		free_i->free_sections++;
+unlock:
+	write_unlock(&free_i->segmap_lock);
+}
+
+/*
+ * Mark @segno in use unless it already is; the free segment/section
+ * counters are only decremented for bits that were clear before.  Takes
+ * segmap_lock for writing.
+ */
+static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
+		unsigned int segno)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int secno = segno / sbi->segs_per_sec;
+
+	write_lock(&free_i->segmap_lock);
+
+	if (test_and_set_bit(segno, free_i->free_segmap))
+		goto unlock;
+
+	free_i->free_segments--;
+	if (!test_and_set_bit(secno, free_i->free_secmap))
+		free_i->free_sections--;
+unlock:
+	write_unlock(&free_i->segmap_lock);
+}
+
+static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
+ void *dst_addr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size);
+}
+
+/* Snapshot written_valid_blocks of the SIT info under sentry_lock. */
+static inline block_t written_block_count(struct f2fs_sb_info *sbi)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	block_t count;
+
+	mutex_lock(&sit_i->sentry_lock);
+	count = sit_i->written_valid_blocks;
+	mutex_unlock(&sit_i->sentry_lock);
+
+	return count;
+}
+
+/* Snapshot the free segment counter under the segmap read lock. */
+static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int count;
+
+	read_lock(&free_i->segmap_lock);
+	count = free_i->free_segments;
+	read_unlock(&free_i->segmap_lock);
+
+	return count;
+}
+
+static inline int reserved_segments(struct f2fs_sb_info *sbi)
+{
+ return SM_I(sbi)->reserved_segments;
+}
+
+/* Snapshot the free section counter under the segmap read lock. */
+static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
+{
+	struct free_segmap_info *free_i = FREE_I(sbi);
+	unsigned int count;
+
+	read_lock(&free_i->segmap_lock);
+	count = free_i->free_sections;
+	read_unlock(&free_i->segmap_lock);
+
+	return count;
+}
+
+static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
+{
+ return DIRTY_I(sbi)->nr_dirty[PRE];
+}
+
+/*
+ * Total number of dirty segments over the six per-log dirty types
+ * (DIRTY_HOT_DATA .. DIRTY_COLD_NODE); the DIRTY and PRE counters are not
+ * included.
+ */
+static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
+{
+	int type, total = 0;
+
+	for (type = DIRTY_HOT_DATA; type <= DIRTY_COLD_NODE; type++)
+		total += DIRTY_I(sbi)->nr_dirty[type];
+
+	return total;
+}
+
+static inline int overprovision_segments(struct f2fs_sb_info *sbi)
+{
+ return SM_I(sbi)->ovp_segments;
+}
+
+static inline int overprovision_sections(struct f2fs_sb_info *sbi)
+{
+ return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
+}
+
+static inline int reserved_sections(struct f2fs_sb_info *sbi)
+{
+ return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
+}
+
+/* SSR is wanted once fewer free sections remain than are overprovisioned. */
+static inline bool need_SSR(struct f2fs_sb_info *sbi)
+{
+	if (free_sections(sbi) < overprovision_sections(sbi))
+		return true;
+	return false;
+}
+
+/*
+ * Return true when the free sections plus @freed do not exceed what is
+ * needed for the dirty node sections, twice the dirty dentry sections and
+ * the reserved sections.  Always false while sbi->por_doing is set.
+ */
+static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
+{
+ int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
+ int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+
+ if (sbi->por_doing)
+ return false;
+
+ /* dentry sections are counted twice in the requirement */
+ return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
+ reserved_sections(sbi)));
+}
+
+/*
+ * Percentage of the user block count that currently holds valid data.
+ * The multiplication by 100 is carried out in 64 bits: done in a long it
+ * overflows on 32-bit machines once more than LONG_MAX / 100 blocks are
+ * valid.  div_u64() keeps the 64-bit division usable on 32-bit kernels.
+ */
+static inline int utilization(struct f2fs_sb_info *sbi)
+{
+	return div_u64((u64)valid_user_blocks(sbi) * 100,
+				sbi->user_block_count);
+}
+
+/*
+ * Sometimes f2fs may be better to drop out-of-place update policy.
+ * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write
+ * data in the original place likewise other traditional file systems.
+ * But, currently set 100 in percentage, which means it is disabled.
+ * See below need_inplace_update().
+ */
+#define MIN_IPU_UTIL 100
+/*
+ * Decide whether @inode's data should be updated in place: never for
+ * directories, otherwise only when SSR is needed and the utilization
+ * exceeds MIN_IPU_UTIL.
+ */
+static inline bool need_inplace_update(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+
+	if (S_ISDIR(inode->i_mode))
+		return false;
+
+	return need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL;
+}
+
+/* Segment number currently used by the log of the given @type. */
+static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
+		int type)
+{
+	return CURSEG_I(sbi, type)->segno;
+}
+
+/* Allocation type recorded for the log of the given @type. */
+static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi,
+		int type)
+{
+	return CURSEG_I(sbi, type)->alloc_type;
+}
+
+/* Next block offset to be written in the log of the given @type. */
+static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
+{
+	return CURSEG_I(sbi, type)->next_blkoff;
+}
+
+/* BUG if @segno lies beyond the last segment counted by segment_count. */
+static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+	unsigned int last = SM_I(sbi)->segment_count - 1;
+
+	BUG_ON(last < segno);
+}
+
+/*
+ * This function is used for only debugging.
+ * NOTE: In future, we have to remove this function.
+ *
+ * BUG unless @blk_addr lies inside the range that starts at segment 0 and
+ * spans segment_count segments.
+ */
+static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
+{
+ struct f2fs_sm_info *sm_info = SM_I(sbi);
+ /* number of blocks covered by all segments */
+ block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
+ block_t start_addr = sm_info->seg0_blkaddr;
+ block_t end_addr = start_addr + total_blks - 1;
+ BUG_ON(blk_addr < start_addr);
+ BUG_ON(blk_addr > end_addr);
+}
+
+/*
+ * Summary block is always treated as invalid block
+ *
+ * Sanity-check the on-disk SIT entry @raw_sit of segment @segno: BUG if its
+ * valid block count exceeds blocks_per_seg, if @segno is out of range, or if
+ * the count disagrees with the number of bits set in the validity bitmap.
+ */
+static inline void check_block_count(struct f2fs_sb_info *sbi,
+ int segno, struct f2fs_sit_entry *raw_sit)
+{
+ struct f2fs_sm_info *sm_info = SM_I(sbi);
+ unsigned int end_segno = sm_info->segment_count - 1;
+ int valid_blocks = 0;
+ int i;
+
+ /* check segment usage */
+ BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
+
+ /* check boundary of a given segment number */
+ BUG_ON(segno > end_segno);
+
+ /* check bitmap with valid block count */
+ for (i = 0; i < sbi->blocks_per_seg; i++)
+ if (f2fs_test_bit(i, raw_sit->valid_map))
+ valid_blocks++;
+ BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
+}
+
+/*
+ * Return the address of the SIT block that currently holds the entry of
+ * segment @start.  Each SIT block has two possible locations sit_blocks
+ * apart; the corresponding bit in sit_bitmap selects the second one.
+ */
+static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
+ unsigned int start)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start);
+ block_t blk_addr = sit_i->sit_base_addr + offset;
+
+ check_seg_range(sbi, start);
+
+ /* calculate sit block address */
+ if (f2fs_test_bit(offset, sit_i->sit_bitmap))
+ blk_addr += sit_i->sit_blocks;
+
+ return blk_addr;
+}
+
+/*
+ * Given the address of a SIT block, return the address of its counterpart
+ * in the other copy of the SIT area (sit_blocks further on or back).
+ */
+static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
+		pgoff_t block_addr)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	pgoff_t rel = block_addr - sit_i->sit_base_addr;
+
+	/* flip between the first and the second copy */
+	if (rel >= sit_i->sit_blocks)
+		rel -= sit_i->sit_blocks;
+	else
+		rel += sit_i->sit_blocks;
+
+	return rel + sit_i->sit_base_addr;
+}
+
+/* Toggle the sit_bitmap bit of the SIT block that holds segment @start. */
+static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
+{
+	unsigned int blk = SIT_BLOCK_OFFSET(sit_i, start);
+
+	if (!f2fs_test_bit(blk, sit_i->sit_bitmap))
+		f2fs_set_bit(blk, sit_i->sit_bitmap);
+	else
+		f2fs_clear_bit(blk, sit_i->sit_bitmap);
+}
+
+/*
+ * Current f2fs time: the elapsed time recorded in the SIT info plus the
+ * seconds that have passed since mounted_time.
+ */
+static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
+{
+	struct sit_info *sit_i = SIT_I(sbi);
+	unsigned long long now = CURRENT_TIME_SEC.tv_sec;
+
+	return sit_i->elapsed_time + now - sit_i->mounted_time;
+}
+
+static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
+ unsigned int ofs_in_node, unsigned char version)
+{
+ sum->nid = cpu_to_le32(nid);
+ sum->ofs_in_node = cpu_to_le16(ofs_in_node);
+ sum->version = version;
+}
+
+static inline block_t start_sum_block(struct f2fs_sb_info *sbi)
+{
+ return __start_cp_addr(sbi) +
+ le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
+}
+
+/*
+ * Address of a summary block located near the end of the checkpoint pack:
+ * the pack start plus cp_pack_total_block_count, minus (@base + 1), plus
+ * @type.
+ */
+static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
+{
+ return __start_cp_addr(sbi) +
+ le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count)
+ - (base + 1) + type;
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
new file mode 100644
index 000000000000..8c117649a035
--- /dev/null
+++ b/fs/f2fs/super.c
@@ -0,0 +1,749 @@
+/*
+ * fs/f2fs/super.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/statfs.h>
+#include <linux/proc_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/parser.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/random.h>
+#include <linux/exportfs.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "xattr.h"
+
+static struct kmem_cache *f2fs_inode_cachep;
+
+enum {
+ Opt_gc_background_off,
+ Opt_disable_roll_forward,
+ Opt_discard,
+ Opt_noheap,
+ Opt_nouser_xattr,
+ Opt_noacl,
+ Opt_active_logs,
+ Opt_disable_ext_identify,
+ Opt_err,
+};
+
+static match_table_t f2fs_tokens = {
+ {Opt_gc_background_off, "background_gc_off"},
+ {Opt_disable_roll_forward, "disable_roll_forward"},
+ {Opt_discard, "discard"},
+ {Opt_noheap, "no_heap"},
+ {Opt_nouser_xattr, "nouser_xattr"},
+ {Opt_noacl, "noacl"},
+ {Opt_active_logs, "active_logs=%u"},
+ {Opt_disable_ext_identify, "disable_ext_identify"},
+ {Opt_err, NULL},
+};
+
+/*
+ * Print a kernel message prefixed with "F2FS-fs (<device id>): ".
+ * @level is a printk level string such as KERN_INFO.  A newline is appended
+ * here, so @fmt should not end with one.
+ */
+void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ /* %pV expands the caller's format and arguments in place */
+ printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf);
+ va_end(args);
+}
+
+/* Run the VFS one-time inode initialisation on an f2fs inode object. */
+static void init_once(void *foo)
+{
+	struct f2fs_inode_info *info = foo;
+
+	inode_init_once(&info->vfs_inode);
+}
+
+/*
+ * Allocate a zeroed f2fs inode from the inode cache and initialise its
+ * f2fs-specific fields.  Returns the embedded VFS inode, or NULL if the
+ * allocation fails.
+ */
+static struct inode *f2fs_alloc_inode(struct super_block *sb)
+{
+ struct f2fs_inode_info *fi;
+
+ fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO);
+ if (!fi)
+ return NULL;
+
+ /* the object was just zeroed, so redo the one-time VFS initialisation */
+ init_once((void *) fi);
+
+ /* Initialize f2fs-specific inode info */
+ fi->vfs_inode.i_version = 1;
+ atomic_set(&fi->dirty_dents, 0);
+ fi->i_current_depth = 1;
+ fi->i_advise = 0;
+ rwlock_init(&fi->ext.ext_lock);
+
+ set_inode_flag(fi, FI_NEW_INODE);
+
+ return &fi->vfs_inode;
+}
+
+static void f2fs_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode));
+}
+
+static void f2fs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, f2fs_i_callback);
+}
+
+/*
+ * Tear down a mounted f2fs instance: stop statistics and the GC thread,
+ * write a checkpoint, drop the internal node/meta inodes and free the node
+ * manager, the segment manager, the checkpoint copy, the raw super block
+ * buffer and finally sbi itself.
+ */
+static void f2fs_put_super(struct super_block *sb)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ f2fs_destroy_stats(sbi);
+ stop_gc_thread(sbi);
+
+ /* NOTE(review): 'true' presumably marks an unmount-time checkpoint - confirm */
+ write_checkpoint(sbi, true);
+
+ iput(sbi->node_inode);
+ iput(sbi->meta_inode);
+
+ /* destroy f2fs internal modules */
+ destroy_node_manager(sbi);
+ destroy_segment_manager(sbi);
+
+ kfree(sbi->ckpt);
+
+ sb->s_fs_info = NULL;
+ brelse(sbi->raw_super_buf);
+ kfree(sbi);
+}
+
+/*
+ * Sync the file system.  Nothing is done unless the super block is dirty or
+ * there are dirty node pages; otherwise a checkpoint is written when @sync
+ * is set and the file system is merely balanced when it is not.
+ * Always returns 0.
+ */
+int f2fs_sync_fs(struct super_block *sb, int sync)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+	if (sbi->s_dirty || get_pages(sbi, F2FS_DIRTY_NODES)) {
+		if (!sync)
+			f2fs_balance_fs(sbi);
+		else
+			write_checkpoint(sbi, false);
+	}
+
+	return 0;
+}
+
+/*
+ * Freeze the file system: a read-only mount needs no work, anything else
+ * gets a synchronous f2fs_sync_fs() whose result is returned.
+ */
+static int f2fs_freeze(struct super_block *sb)
+{
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	return f2fs_sync_fs(sb, 1);
+}
+
+static int f2fs_unfreeze(struct super_block *sb)
+{
+ return 0;
+}
+
+/*
+ * Fill @buf with the file system statistics of @dentry's super block.
+ * Always returns 0.
+ */
+static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+ block_t total_count, user_block_count, start_count, ovp_count;
+
+ total_count = le64_to_cpu(sbi->raw_super->block_count);
+ user_block_count = sbi->user_block_count;
+ start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr);
+ /* overprovisioned segments expressed in blocks */
+ ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
+ buf->f_type = F2FS_SUPER_MAGIC;
+ buf->f_bsize = sbi->blocksize;
+
+ /* blocks from segment 0 onwards; free excludes valid and overprovisioned blocks */
+ buf->f_blocks = total_count - start_count;
+ buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
+ buf->f_bavail = user_block_count - valid_user_blocks(sbi);
+
+ buf->f_files = sbi->total_node_count;
+ buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi);
+
+ buf->f_namelen = F2FS_MAX_NAME_LEN;
+ /* file system id: the encoded device number split into two 32-bit halves */
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
+
+ return 0;
+}
+
+/*
+ * Print the mount options of @root's super block into @seq.
+ * Always returns 0.
+ *
+ * NOTE(review): "background_gc_on", "user_xattr" and "acl" are printed
+ * below but have no matching entry in f2fs_tokens, so they cannot be passed
+ * back to mount as they are.
+ */
+static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
+
+	if (test_opt(sbi, BG_GC))
+		seq_puts(seq, ",background_gc_on");
+	else
+		seq_puts(seq, ",background_gc_off");
+	if (test_opt(sbi, DISABLE_ROLL_FORWARD))
+		seq_puts(seq, ",disable_roll_forward");
+	if (test_opt(sbi, DISCARD))
+		seq_puts(seq, ",discard");
+	/* must match the "no_heap" token accepted by parse_options() */
+	if (test_opt(sbi, NOHEAP))
+		seq_puts(seq, ",no_heap");
+#ifdef CONFIG_F2FS_FS_XATTR
+	if (test_opt(sbi, XATTR_USER))
+		seq_puts(seq, ",user_xattr");
+	else
+		seq_puts(seq, ",nouser_xattr");
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+	if (test_opt(sbi, POSIX_ACL))
+		seq_puts(seq, ",acl");
+	else
+		seq_puts(seq, ",noacl");
+#endif
+	if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
+		seq_puts(seq, ",disable_ext_identify");
+
+	seq_printf(seq, ",active_logs=%u", sbi->active_logs);
+
+	return 0;
+}
+
+/* Super block operations implemented by f2fs; the table is read-only. */
+static const struct super_operations f2fs_sops = {
+	.alloc_inode	= f2fs_alloc_inode,
+	.destroy_inode	= f2fs_destroy_inode,
+	.write_inode	= f2fs_write_inode,
+	.show_options	= f2fs_show_options,
+	.evict_inode	= f2fs_evict_inode,
+	.put_super	= f2fs_put_super,
+	.sync_fs	= f2fs_sync_fs,
+	.freeze_fs	= f2fs_freeze,
+	.unfreeze_fs	= f2fs_unfreeze,
+	.statfs		= f2fs_statfs,
+};
+
+/*
+ * Look up the inode for an NFS file handle.  Returns the inode, the error
+ * pointer from f2fs_iget(), or ERR_PTR(-ESTALE) when @ino is below the root
+ * inode number or a non-zero @generation does not match the inode.
+ */
+static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct inode *found;
+
+	if (ino < F2FS_ROOT_INO(sbi))
+		return ERR_PTR(-ESTALE);
+
+	/*
+	 * f2fs_iget isn't quite right if the inode is currently unallocated!
+	 * However f2fs_iget currently does appropriate checks to handle stale
+	 * inodes so everything is OK.
+	 */
+	found = f2fs_iget(sb, ino);
+	if (IS_ERR(found))
+		return ERR_CAST(found);
+
+	/* a zero generation matches any inode */
+	if (!generation || found->i_generation == generation)
+		return found;
+
+	/* we didn't find the right inode.. */
+	iput(found);
+	return ERR_PTR(-ESTALE);
+}
+
+static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ f2fs_nfs_get_inode);
+}
+
+static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ f2fs_nfs_get_inode);
+}
+
+static const struct export_operations f2fs_export_ops = {
+ .fh_to_dentry = f2fs_fh_to_dentry,
+ .fh_to_parent = f2fs_fh_to_parent,
+ .get_parent = f2fs_get_parent,
+};
+
+/*
+ * Parse the comma separated mount option string @options and apply each
+ * option to @sbi.  @options is modified in place by strsep().
+ * Returns 0 on success (including a NULL @options) or -EINVAL for an
+ * unrecognized option or an invalid active_logs value.
+ */
+static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
+ char *options)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *p;
+ int arg = 0;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ /* skip empty items such as the one produced by ",," */
+ if (!*p)
+ continue;
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, f2fs_tokens, args);
+
+ switch (token) {
+ case Opt_gc_background_off:
+ clear_opt(sbi, BG_GC);
+ break;
+ case Opt_disable_roll_forward:
+ set_opt(sbi, DISABLE_ROLL_FORWARD);
+ break;
+ case Opt_discard:
+ set_opt(sbi, DISCARD);
+ break;
+ case Opt_noheap:
+ set_opt(sbi, NOHEAP);
+ break;
+#ifdef CONFIG_F2FS_FS_XATTR
+ case Opt_nouser_xattr:
+ clear_opt(sbi, XATTR_USER);
+ break;
+#else
+ case Opt_nouser_xattr:
+ f2fs_msg(sb, KERN_INFO,
+ "nouser_xattr options not supported");
+ break;
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ case Opt_noacl:
+ clear_opt(sbi, POSIX_ACL);
+ break;
+#else
+ case Opt_noacl:
+ f2fs_msg(sb, KERN_INFO, "noacl options not supported");
+ break;
+#endif
+ case Opt_active_logs:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ /* only 2, 4 or NR_CURSEG_TYPE active logs are accepted */
+ if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
+ return -EINVAL;
+ sbi->active_logs = arg;
+ break;
+ case Opt_disable_ext_identify:
+ set_opt(sbi, DISABLE_EXT_IDENTIFY);
+ break;
+ default:
+ f2fs_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" or missing value",
+ p);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+ /*
+  * Largest file size in bytes for a block size of (1 << bits).
+  *
+  * The block count is the sum of the addresses held in the inode, two
+  * direct node blocks, two indirect node blocks and one double indirect
+  * node block.
+  */
+ static loff_t max_file_size(unsigned bits)
+ {
+ 	/* data blocks reachable through one node at the current depth */
+ 	loff_t per_node = ADDRS_PER_BLOCK;
+ 	loff_t blocks = ADDRS_PER_INODE;
+
+ 	/* two direct node blocks */
+ 	blocks += 2 * per_node;
+
+ 	/* two indirect node blocks */
+ 	per_node *= NIDS_PER_BLOCK;
+ 	blocks += 2 * per_node;
+
+ 	/* one double indirect node block */
+ 	per_node *= NIDS_PER_BLOCK;
+ 	blocks += per_node;
+
+ 	return blocks << bits;
+ }
+
+ /*
+  * Validate the raw super block fields this driver relies on: magic number,
+  * page cache size, block size, sector size and sectors per block.
+  *
+  * Returns 0 when the super block is acceptable, 1 otherwise (a message
+  * describing the mismatch is logged).
+  */
+ static int sanity_check_raw_super(struct super_block *sb,
+ 			struct f2fs_super_block *raw_super)
+ {
+ 	unsigned int log_blocksize;
+ 	unsigned int blocksize;
+
+ 	if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
+ 		f2fs_msg(sb, KERN_INFO,
+ 			"Magic Mismatch, valid(0x%x) - read(0x%x)",
+ 			F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
+ 		return 1;
+ 	}
+
+ 	/* Currently, support only 4KB page cache size */
+ 	if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) {
+ 		f2fs_msg(sb, KERN_INFO,
+ 			"Invalid page_cache_size (%lu), supports only 4KB\n",
+ 			PAGE_CACHE_SIZE);
+ 		return 1;
+ 	}
+
+ 	/*
+ 	 * Currently, support only 4KB block size.
+ 	 *
+ 	 * log_blocksize is read straight from disk, so bound it before
+ 	 * shifting: a shift count at or beyond the width of the type is
+ 	 * undefined.  An unrepresentable value is reported as 0, which
+ 	 * can never equal F2FS_BLKSIZE and is therefore rejected.
+ 	 */
+ 	log_blocksize = le32_to_cpu(raw_super->log_blocksize);
+ 	if (log_blocksize < 8 * sizeof(blocksize))
+ 		blocksize = 1U << log_blocksize;
+ 	else
+ 		blocksize = 0;
+ 	if (blocksize != F2FS_BLKSIZE) {
+ 		f2fs_msg(sb, KERN_INFO,
+ 			"Invalid blocksize (%u), supports only 4KB\n",
+ 			blocksize);
+ 		return 1;
+ 	}
+
+ 	if (le32_to_cpu(raw_super->log_sectorsize) !=
+ 					F2FS_LOG_SECTOR_SIZE) {
+ 		f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize");
+ 		return 1;
+ 	}
+ 	if (le32_to_cpu(raw_super->log_sectors_per_block) !=
+ 					F2FS_LOG_SECTORS_PER_BLOCK) {
+ 		f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block");
+ 		return 1;
+ 	}
+ 	return 0;
+ }
+
+ /*
+  * Check the loaded checkpoint against the super block.
+  *
+  * Returns 1 when the metadata segments (checkpoint, SIT, NAT, reserved and
+  * SSA) are not fewer than the total segment count, or when the checkpoint
+  * carries CP_ERROR_FLAG; returns 0 otherwise.
+  */
+ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
+ {
+ 	struct f2fs_super_block *raw = F2FS_RAW_SUPER(sbi);
+ 	struct f2fs_checkpoint *cp = F2FS_CKPT(sbi);
+ 	unsigned int all_segs = le32_to_cpu(raw->segment_count);
+ 	unsigned int meta_segs;
+
+ 	meta_segs = le32_to_cpu(raw->segment_count_ckpt);
+ 	meta_segs += le32_to_cpu(raw->segment_count_sit);
+ 	meta_segs += le32_to_cpu(raw->segment_count_nat);
+ 	meta_segs += le32_to_cpu(cp->rsvd_segment_count);
+ 	meta_segs += le32_to_cpu(raw->segment_count_ssa);
+
+ 	if (all_segs <= meta_segs)
+ 		return 1;
+
+ 	if (!is_set_ckpt_flags(cp, CP_ERROR_FLAG))
+ 		return 0;
+
+ 	f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
+ 	return 1;
+ }
+
+ /*
+  * Populate the in-memory super block info from sbi->raw_super: block and
+  * segment geometry, section/zone counts, the total node count and the
+  * root/node/meta inode numbers.  Also zeroes every nr_pages[] counter.
+  * sbi->raw_super must already be set by the caller.
+  */
+ static void init_sb_info(struct f2fs_sb_info *sbi)
+ {
+ struct f2fs_super_block *raw_super = sbi->raw_super;
+ int i;
+
+ sbi->log_sectors_per_block =
+ le32_to_cpu(raw_super->log_sectors_per_block);
+ sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize);
+ sbi->blocksize = 1 << sbi->log_blocksize;
+ sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+ sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg;
+ sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
+ sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
+ sbi->total_sections = le32_to_cpu(raw_super->section_count);
+ /* node capacity is derived from half of the NAT segment count */
+ sbi->total_node_count =
+ (le32_to_cpu(raw_super->segment_count_nat) / 2)
+ * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
+ sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
+ sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
+ sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
+
+ for (i = 0; i < NR_COUNT_TYPE; i++)
+ atomic_set(&sbi->nr_pages[i], 0);
+ }
+
+ /*
+  * Read the super block stored in @block (0 = first copy, otherwise the
+  * second copy) and sanity-check it.
+  *
+  * *raw_super_buf always receives the sb_bread() result (NULL on read
+  * failure); *raw_super is set whenever the read succeeded.  The caller is
+  * responsible for releasing the buffer.  Returns 0 when the copy is valid
+  * and 1 otherwise.
+  */
+ static int validate_superblock(struct super_block *sb,
+ 		struct f2fs_super_block **raw_super,
+ 		struct buffer_head **raw_super_buf, sector_t block)
+ {
+ 	const char *which = block ? "second" : "first";
+ 	struct buffer_head *bh;
+
+ 	/* read f2fs raw super block */
+ 	bh = sb_bread(sb, block);
+ 	*raw_super_buf = bh;
+ 	if (bh == NULL) {
+ 		f2fs_msg(sb, KERN_ERR, "unable to read %s superblock",
+ 							which);
+ 		return 1;
+ 	}
+
+ 	*raw_super = (struct f2fs_super_block *)
+ 		((char *)bh->b_data + F2FS_SUPER_OFFSET);
+
+ 	/* sanity checking of raw super */
+ 	if (sanity_check_raw_super(sb, *raw_super)) {
+ 		f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem "
+ 					"in %s superblock", which);
+ 		return 1;
+ 	}
+ 	return 0;
+ }
+
+ /*
+  * Fill in @sb for an f2fs mount: read and validate a super block copy,
+  * parse the mount options in @data, load the checkpoint, build the segment
+  * and node managers, read the node and root inodes, run recovery and start
+  * the background GC thread.
+  *
+  * Returns 0 on success or a negative errno; on failure everything acquired
+  * so far is released through the unwind labels at the end.
+  */
+ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
+ {
+ 	struct f2fs_sb_info *sbi;
+ 	struct f2fs_super_block *raw_super;
+ 	struct buffer_head *raw_super_buf;
+ 	struct inode *root;
+ 	long err = -EINVAL;
+ 	int i;
+
+ 	/* allocate memory for f2fs-specific super block info */
+ 	sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
+ 	if (!sbi)
+ 		return -ENOMEM;
+
+ 	/* set a block size */
+ 	if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) {
+ 		f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
+ 		goto free_sbi;
+ 	}
+
+ 	/* fall back to the second super block copy if the first is bad */
+ 	if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) {
+ 		brelse(raw_super_buf);
+ 		if (validate_superblock(sb, &raw_super, &raw_super_buf, 1))
+ 			goto free_sb_buf;
+ 	}
+ 	/* init some FS parameters */
+ 	sbi->active_logs = NR_CURSEG_TYPE;
+
+ 	set_opt(sbi, BG_GC);
+
+ #ifdef CONFIG_F2FS_FS_XATTR
+ 	set_opt(sbi, XATTR_USER);
+ #endif
+ #ifdef CONFIG_F2FS_FS_POSIX_ACL
+ 	set_opt(sbi, POSIX_ACL);
+ #endif
+ 	/* parse mount options */
+ 	if (parse_options(sb, sbi, (char *)data))
+ 		goto free_sb_buf;
+
+ 	sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
+ 	sb->s_max_links = F2FS_LINK_MAX;
+ 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+
+ 	sb->s_op = &f2fs_sops;
+ 	sb->s_xattr = f2fs_xattr_handlers;
+ 	sb->s_export_op = &f2fs_export_ops;
+ 	sb->s_magic = F2FS_SUPER_MAGIC;
+ 	sb->s_fs_info = sbi;
+ 	sb->s_time_gran = 1;
+ 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ 		(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+ 	memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+
+ 	/* init f2fs-specific super block info */
+ 	sbi->sb = sb;
+ 	sbi->raw_super = raw_super;
+ 	sbi->raw_super_buf = raw_super_buf;
+ 	mutex_init(&sbi->gc_mutex);
+ 	mutex_init(&sbi->write_inode);
+ 	mutex_init(&sbi->writepages);
+ 	mutex_init(&sbi->cp_mutex);
+ 	for (i = 0; i < NR_LOCK_TYPE; i++)
+ 		mutex_init(&sbi->fs_lock[i]);
+ 	sbi->por_doing = 0;
+ 	spin_lock_init(&sbi->stat_lock);
+ 	init_rwsem(&sbi->bio_sem);
+ 	init_sb_info(sbi);
+
+ 	/* get an inode for meta space */
+ 	sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
+ 	if (IS_ERR(sbi->meta_inode)) {
+ 		f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
+ 		err = PTR_ERR(sbi->meta_inode);
+ 		goto free_sb_buf;
+ 	}
+
+ 	err = get_valid_checkpoint(sbi);
+ 	if (err) {
+ 		f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint");
+ 		goto free_meta_inode;
+ 	}
+
+ 	/* sanity checking of checkpoint */
+ 	err = -EINVAL;
+ 	if (sanity_check_ckpt(sbi)) {
+ 		f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
+ 		goto free_cp;
+ 	}
+
+ 	sbi->total_valid_node_count =
+ 				le32_to_cpu(sbi->ckpt->valid_node_count);
+ 	sbi->total_valid_inode_count =
+ 				le32_to_cpu(sbi->ckpt->valid_inode_count);
+ 	sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
+ 	sbi->total_valid_block_count =
+ 				le64_to_cpu(sbi->ckpt->valid_block_count);
+ 	sbi->last_valid_block_count = sbi->total_valid_block_count;
+ 	sbi->alloc_valid_block_count = 0;
+ 	INIT_LIST_HEAD(&sbi->dir_inode_list);
+ 	spin_lock_init(&sbi->dir_inode_lock);
+
+ 	init_orphan_info(sbi);
+
+ 	/* setup f2fs internal modules */
+ 	err = build_segment_manager(sbi);
+ 	if (err) {
+ 		f2fs_msg(sb, KERN_ERR,
+ 			"Failed to initialize F2FS segment manager");
+ 		goto free_sm;
+ 	}
+ 	err = build_node_manager(sbi);
+ 	if (err) {
+ 		f2fs_msg(sb, KERN_ERR,
+ 			"Failed to initialize F2FS node manager");
+ 		goto free_nm;
+ 	}
+
+ 	build_gc_manager(sbi);
+
+ 	/* get an inode for node space */
+ 	sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
+ 	if (IS_ERR(sbi->node_inode)) {
+ 		f2fs_msg(sb, KERN_ERR, "Failed to read node inode");
+ 		err = PTR_ERR(sbi->node_inode);
+ 		goto free_nm;
+ 	}
+
+ 	/* if there are any orphan nodes, free them */
+ 	err = -EINVAL;
+ 	if (recover_orphan_inodes(sbi))
+ 		goto free_node_inode;
+
+ 	/* read root inode and dentry */
+ 	root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
+ 	if (IS_ERR(root)) {
+ 		f2fs_msg(sb, KERN_ERR, "Failed to read root inode");
+ 		err = PTR_ERR(root);
+ 		goto free_node_inode;
+ 	}
+ 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+ 		/*
+ 		 * No dentry owns the root inode yet, so dput(sb->s_root)
+ 		 * in the unwind path would not release it; drop our
+ 		 * reference here instead of leaking it.
+ 		 */
+ 		iput(root);
+ 		err = -EINVAL;
+ 		goto free_node_inode;
+ 	}
+
+ 	sb->s_root = d_make_root(root); /* allocate root dentry */
+ 	if (!sb->s_root) {
+ 		err = -ENOMEM;
+ 		goto free_root_inode;
+ 	}
+
+ 	/* recover fsynced data */
+ 	if (!test_opt(sbi, DISABLE_ROLL_FORWARD))
+ 		recover_fsync_data(sbi);
+
+ 	/* After POR, we can run background GC thread */
+ 	err = start_gc_thread(sbi);
+ 	if (err)
+ 		goto fail;
+
+ 	err = f2fs_build_stats(sbi);
+ 	if (err)
+ 		goto fail;
+
+ 	return 0;
+ fail:
+ 	stop_gc_thread(sbi);
+ free_root_inode:
+ 	dput(sb->s_root);
+ 	sb->s_root = NULL;
+ free_node_inode:
+ 	iput(sbi->node_inode);
+ free_nm:
+ 	destroy_node_manager(sbi);
+ free_sm:
+ 	destroy_segment_manager(sbi);
+ free_cp:
+ 	kfree(sbi->ckpt);
+ free_meta_inode:
+ 	make_bad_inode(sbi->meta_inode);
+ 	iput(sbi->meta_inode);
+ free_sb_buf:
+ 	brelse(raw_super_buf);
+ free_sbi:
+ 	kfree(sbi);
+ 	return err;
+ }
+
+ /* Mount entry point: a block-device mount that uses f2fs_fill_super(). */
+ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
+ {
+ return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
+ }
+
+ /* Filesystem type descriptor registered by init_f2fs_fs(). */
+ static struct file_system_type f2fs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "f2fs",
+ .mount = f2fs_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+ };
+
+ /*
+  * Create the slab cache backing struct f2fs_inode_info.
+  * Returns 0 on success or -ENOMEM if the cache could not be created.
+  */
+ static int __init init_inodecache(void)
+ {
+ 	f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
+ 			sizeof(struct f2fs_inode_info), NULL);
+ 	return f2fs_inode_cachep ? 0 : -ENOMEM;
+ }
+
+ /* Tear down the inode slab cache created by init_inodecache(). */
+ static void destroy_inodecache(void)
+ {
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(f2fs_inode_cachep);
+ }
+
+ /*
+  * Module init: create the slab caches, register the filesystem type and
+  * set up the root stats entry.
+  *
+  * Returns 0 on success or the first error encountered.  On failure every
+  * cache created so far is destroyed again, in reverse order of creation,
+  * so a failed load does not leak them.
+  */
+ static int __init init_f2fs_fs(void)
+ {
+ 	int err;
+
+ 	err = init_inodecache();
+ 	if (err)
+ 		goto fail;
+ 	err = create_node_manager_caches();
+ 	if (err)
+ 		goto free_inodecache;
+ 	err = create_gc_caches();
+ 	if (err)
+ 		goto free_node_manager_caches;
+ 	err = create_checkpoint_caches();
+ 	if (err)
+ 		goto free_gc_caches;
+ 	err = register_filesystem(&f2fs_fs_type);
+ 	if (err)
+ 		goto free_checkpoint_caches;
+ 	f2fs_create_root_stats();
+ 	return 0;
+
+ free_checkpoint_caches:
+ 	destroy_checkpoint_caches();
+ free_gc_caches:
+ 	destroy_gc_caches();
+ free_node_manager_caches:
+ 	destroy_node_manager_caches();
+ free_inodecache:
+ 	destroy_inodecache();
+ fail:
+ 	return err;
+ }
+
+ /*
+  * Module exit: undo init_f2fs_fs() in reverse order - remove the root
+  * stats, unregister the filesystem type, then destroy the caches.
+  */
+ static void __exit exit_f2fs_fs(void)
+ {
+ f2fs_destroy_root_stats();
+ unregister_filesystem(&f2fs_fs_type);
+ destroy_checkpoint_caches();
+ destroy_gc_caches();
+ destroy_node_manager_caches();
+ destroy_inodecache();
+ }
+
+module_init(init_f2fs_fs)
+module_exit(exit_f2fs_fs)
+
+MODULE_AUTHOR("Samsung Electronics's Praesto Team");
+MODULE_DESCRIPTION("Flash Friendly File System");
+MODULE_LICENSE("GPL");
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
new file mode 100644
index 000000000000..8038c0496504
--- /dev/null
+++ b/fs/f2fs/xattr.c
@@ -0,0 +1,443 @@
+/*
+ * fs/f2fs/xattr.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/xattr.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher <agruen@suse.de>
+ *
+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
+ * Extended attributes for symlinks and special files added per
+ * suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ * Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/rwsem.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "xattr.h"
+
+ /*
+  * List callback shared by the "user." and "trusted." handlers.
+  *
+  * Writes "<prefix><name>\0" into @list when it is non-NULL and large
+  * enough, and returns the number of bytes the entry needs.
+  *
+  * Returns 0 when the entry must not be shown to the caller (user xattrs
+  * disabled by mount option, caller lacks CAP_SYS_ADMIN for trusted
+  * attributes, or unknown @type).  The return type is unsigned, so a
+  * negative errno cannot be reported here: f2fs_listxattr() would read it
+  * as an enormous length and either fail with -ERANGE or corrupt its
+  * running total.  Hidden entries are therefore simply skipped.
+  */
+ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
+ 		size_t list_size, const char *name, size_t name_len, int type)
+ {
+ 	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ 	size_t total_len, prefix_len = 0;
+ 	const char *prefix = NULL;
+
+ 	switch (type) {
+ 	case F2FS_XATTR_INDEX_USER:
+ 		if (!test_opt(sbi, XATTR_USER))
+ 			return 0;
+ 		prefix = XATTR_USER_PREFIX;
+ 		prefix_len = XATTR_USER_PREFIX_LEN;
+ 		break;
+ 	case F2FS_XATTR_INDEX_TRUSTED:
+ 		if (!capable(CAP_SYS_ADMIN))
+ 			return 0;
+ 		prefix = XATTR_TRUSTED_PREFIX;
+ 		prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+ 		break;
+ 	default:
+ 		return 0;
+ 	}
+
+ 	/* prefix + name + terminating NUL */
+ 	total_len = prefix_len + name_len + 1;
+ 	if (list && total_len <= list_size) {
+ 		memcpy(list, prefix, prefix_len);
+ 		memcpy(list + prefix_len, name, name_len);
+ 		list[prefix_len + name_len] = '\0';
+ 	}
+ 	return total_len;
+ }
+
+ /*
+  * Get callback shared by the "user." and "trusted." handlers.
+  *
+  * Returns -EOPNOTSUPP if user xattrs are disabled, -EPERM if a trusted
+  * attribute is requested without CAP_SYS_ADMIN, -EINVAL for an unknown
+  * @type or an empty @name, and otherwise the result of f2fs_getxattr().
+  */
+ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
+ 		void *buffer, size_t size, int type)
+ {
+ 	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+
+ 	if (type == F2FS_XATTR_INDEX_USER) {
+ 		if (!test_opt(sbi, XATTR_USER))
+ 			return -EOPNOTSUPP;
+ 	} else if (type == F2FS_XATTR_INDEX_TRUSTED) {
+ 		if (!capable(CAP_SYS_ADMIN))
+ 			return -EPERM;
+ 	} else {
+ 		return -EINVAL;
+ 	}
+
+ 	/* the bare prefix with nothing after it is not a valid name */
+ 	if (name[0] == '\0')
+ 		return -EINVAL;
+
+ 	return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
+ }
+
+ /*
+  * Set callback shared by the "user." and "trusted." handlers.
+  *
+  * Performs the same permission and name checks as the get callback, then
+  * forwards to f2fs_setxattr().  @flags is not used.
+  */
+ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
+ 		const void *value, size_t size, int flags, int type)
+ {
+ 	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+
+ 	if (type == F2FS_XATTR_INDEX_USER) {
+ 		if (!test_opt(sbi, XATTR_USER))
+ 			return -EOPNOTSUPP;
+ 	} else if (type == F2FS_XATTR_INDEX_TRUSTED) {
+ 		if (!capable(CAP_SYS_ADMIN))
+ 			return -EPERM;
+ 	} else {
+ 		return -EINVAL;
+ 	}
+
+ 	/* the bare prefix with nothing after it is not a valid name */
+ 	if (name[0] == '\0')
+ 		return -EINVAL;
+
+ 	return f2fs_setxattr(dentry->d_inode, type, name, value, size);
+ }
+
+ /*
+  * List callback for the advise attribute: emits the fixed name
+  * F2FS_SYSTEM_ADVISE_PREFIX (including its NUL) when @list has room and
+  * returns the size needed.  Returns 0 for any other @type.
+  */
+ static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
+ 		size_t list_size, const char *name, size_t name_len, int type)
+ {
+ 	/* sizeof on the literal counts the terminating NUL */
+ 	const size_t needed = sizeof(F2FS_SYSTEM_ADVISE_PREFIX);
+
+ 	if (type != F2FS_XATTR_INDEX_ADVISE)
+ 		return 0;
+
+ 	if (list != NULL && needed <= list_size)
+ 		memcpy(list, F2FS_SYSTEM_ADVISE_PREFIX, needed);
+ 	return needed;
+ }
+
+ /*
+  * Get callback for the advise attribute: the value is the single byte
+  * F2FS_I(inode)->i_advise.
+  *
+  * @name must be empty (the attribute has no suffix), else -EINVAL.
+  * A NULL @buffer is a size query: nothing is written and the required
+  * size is returned.  A non-NULL @buffer smaller than one byte yields
+  * -ERANGE.  Returns sizeof(char) on success.
+  */
+ static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
+ 		void *buffer, size_t size, int type)
+ {
+ 	struct inode *inode = dentry->d_inode;
+
+ 	if (strcmp(name, "") != 0)
+ 		return -EINVAL;
+
+ 	if (buffer) {
+ 		if (size < sizeof(char))
+ 			return -ERANGE;
+ 		*((char *)buffer) = F2FS_I(inode)->i_advise;
+ 	}
+ 	return sizeof(char);
+ }
+
+ /*
+  * Set callback for the advise attribute: ORs the first byte of @value
+  * into F2FS_I(inode)->i_advise.
+  *
+  * Returns -EINVAL if @name is not empty or @value is NULL, -EPERM if the
+  * caller fails inode_owner_or_capable(), and 0 on success.
+  */
+ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
+ 		const void *value, size_t size, int flags, int type)
+ {
+ 	struct inode *inode = dentry->d_inode;
+ 	const char *advice = value;
+
+ 	if (name[0] != '\0')
+ 		return -EINVAL;
+ 	if (!inode_owner_or_capable(inode))
+ 		return -EPERM;
+ 	if (!advice)
+ 		return -EINVAL;
+
+ 	F2FS_I(inode)->i_advise |= *advice;
+ 	return 0;
+ }
+
+ /* "user." attributes: served by the generic list/get/set callbacks. */
+ const struct xattr_handler f2fs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .flags = F2FS_XATTR_INDEX_USER,
+ .list = f2fs_xattr_generic_list,
+ .get = f2fs_xattr_generic_get,
+ .set = f2fs_xattr_generic_set,
+ };
+
+ /* "trusted." attributes: same callbacks, gated on CAP_SYS_ADMIN there. */
+ const struct xattr_handler f2fs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .flags = F2FS_XATTR_INDEX_TRUSTED,
+ .list = f2fs_xattr_generic_list,
+ .get = f2fs_xattr_generic_get,
+ .set = f2fs_xattr_generic_set,
+ };
+
+ /* The advise attribute, backed by the inode's i_advise byte. */
+ const struct xattr_handler f2fs_xattr_advise_handler = {
+ .prefix = F2FS_SYSTEM_ADVISE_PREFIX,
+ .flags = F2FS_XATTR_INDEX_ADVISE,
+ .list = f2fs_xattr_advise_list,
+ .get = f2fs_xattr_advise_get,
+ .set = f2fs_xattr_advise_set,
+ };
+
+ /*
+  * Handler lookup table indexed by the on-disk e_name_index value.
+  * Indexes without an initializer are NULL; see f2fs_xattr_handler().
+  */
+ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
+ [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
+ #ifdef CONFIG_F2FS_FS_POSIX_ACL
+ [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler,
+ [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
+ #endif
+ [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
+ [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
+ };
+
+ /* NULL-terminated handler list; installed as sb->s_xattr at mount time. */
+ const struct xattr_handler *f2fs_xattr_handlers[] = {
+ &f2fs_xattr_user_handler,
+ #ifdef CONFIG_F2FS_FS_POSIX_ACL
+ &f2fs_xattr_acl_access_handler,
+ &f2fs_xattr_acl_default_handler,
+ #endif
+ &f2fs_xattr_trusted_handler,
+ &f2fs_xattr_advise_handler,
+ NULL,
+ };
+
+ /*
+  * Map an on-disk name index to its handler.  Returns NULL when the index
+  * is not positive, lies beyond the table, or has no handler registered.
+  */
+ static inline const struct xattr_handler *f2fs_xattr_handler(int name_index)
+ {
+ 	if (name_index <= 0 ||
+ 			name_index >= ARRAY_SIZE(f2fs_xattr_handler_map))
+ 		return NULL;
+
+ 	return f2fs_xattr_handler_map[name_index];
+ }
+
+ /*
+  * Look up the extended attribute (@name_index, @name) of @inode.
+  *
+  * When @buffer is non-NULL the value is copied into it; when it is NULL
+  * only the value length is reported.
+  *
+  * Returns the value length on success, -EINVAL if @name is NULL, -ENODATA
+  * if the inode has no xattr block or the attribute is absent, -ERANGE if
+  * @buffer is too small, or the error from reading the xattr node page.
+  */
+ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
+ 		void *buffer, size_t buffer_size)
+ {
+ 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ 	struct f2fs_inode_info *fi = F2FS_I(inode);
+ 	struct f2fs_xattr_entry *entry;
+ 	struct page *page;
+ 	void *base_addr;
+ 	int error = 0, found = 0;
+ 	size_t value_len, name_len;
+
+ 	if (name == NULL)
+ 		return -EINVAL;
+ 	name_len = strlen(name);
+
+ 	if (!fi->i_xattr_nid)
+ 		return -ENODATA;
+
+ 	/*
+ 	 * get_node_page() can fail with an ERR_PTR (f2fs_setxattr() checks
+ 	 * for it too); never hand that to page_address().
+ 	 */
+ 	page = get_node_page(sbi, fi->i_xattr_nid);
+ 	if (IS_ERR(page))
+ 		return PTR_ERR(page);
+ 	base_addr = page_address(page);
+
+ 	list_for_each_xattr(entry, base_addr) {
+ 		if (entry->e_name_index != name_index)
+ 			continue;
+ 		if (entry->e_name_len != name_len)
+ 			continue;
+ 		if (!memcmp(entry->e_name, name, name_len)) {
+ 			found = 1;
+ 			break;
+ 		}
+ 	}
+ 	if (!found) {
+ 		error = -ENODATA;
+ 		goto cleanup;
+ 	}
+
+ 	value_len = le16_to_cpu(entry->e_value_size);
+
+ 	if (buffer && value_len > buffer_size) {
+ 		error = -ERANGE;
+ 		goto cleanup;
+ 	}
+
+ 	if (buffer) {
+ 		/* the value is stored immediately after the name */
+ 		char *pval = entry->e_name + entry->e_name_len;
+ 		memcpy(buffer, pval, value_len);
+ 	}
+ 	error = value_len;
+
+ cleanup:
+ 	f2fs_put_page(page, 1);
+ 	return error;
+ }
+
+ /*
+  * List the extended attribute names of @dentry's inode.
+  *
+  * Each entry of the xattr block is handed to its handler's ->list
+  * callback; entries whose name index has no handler are skipped.  When
+  * @buffer is NULL only the total size is computed.
+  *
+  * Returns the number of bytes used (0 if the inode has no xattr block),
+  * -ERANGE if @buffer is too small, or the error from reading the xattr
+  * node page.
+  */
+ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+ {
+ 	struct inode *inode = dentry->d_inode;
+ 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ 	struct f2fs_inode_info *fi = F2FS_I(inode);
+ 	struct f2fs_xattr_entry *entry;
+ 	struct page *page;
+ 	void *base_addr;
+ 	int error = 0;
+ 	size_t rest = buffer_size;
+
+ 	if (!fi->i_xattr_nid)
+ 		return 0;
+
+ 	/*
+ 	 * get_node_page() can fail with an ERR_PTR (f2fs_setxattr() checks
+ 	 * for it too); never hand that to page_address().
+ 	 */
+ 	page = get_node_page(sbi, fi->i_xattr_nid);
+ 	if (IS_ERR(page))
+ 		return PTR_ERR(page);
+ 	base_addr = page_address(page);
+
+ 	list_for_each_xattr(entry, base_addr) {
+ 		const struct xattr_handler *handler =
+ 			f2fs_xattr_handler(entry->e_name_index);
+ 		size_t size;
+
+ 		if (!handler)
+ 			continue;
+
+ 		size = handler->list(dentry, buffer, rest, entry->e_name,
+ 				entry->e_name_len, handler->flags);
+ 		if (buffer && size > rest) {
+ 			error = -ERANGE;
+ 			goto cleanup;
+ 		}
+
+ 		if (buffer)
+ 			buffer += size;
+ 		/*
+ 		 * In size-query mode rest wraps below zero; the unsigned
+ 		 * difference taken below still yields the total.
+ 		 */
+ 		rest -= size;
+ 	}
+ 	error = buffer_size - rest;
+ cleanup:
+ 	f2fs_put_page(page, 1);
+ 	return error;
+ }
+
+ /*
+  * Create, replace or remove the extended attribute (@name_index, @name)
+  * of @inode.  A NULL @value removes the attribute; otherwise any existing
+  * entry with the same index and name is dropped and a new entry is
+  * appended after the last one.  The xattr node block is allocated on
+  * first use.
+  *
+  * Returns 0 on success, -EINVAL if @name is NULL, -ERANGE if the name is
+  * longer than 255 bytes or the value exceeds MAX_VALUE_LEN, -ENOSPC if no
+  * node id or not enough room in the block is available, -EIO if the block
+  * magic is wrong, or the error from obtaining the node page.
+  */
+ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
+ const void *value, size_t value_len)
+ {
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_xattr_header *header = NULL;
+ struct f2fs_xattr_entry *here, *last;
+ struct page *page;
+ void *base_addr;
+ int error, found, free, newsize;
+ size_t name_len;
+ char *pval;
+
+ if (name == NULL)
+ return -EINVAL;
+ name_len = strlen(name);
+
+ if (value == NULL)
+ value_len = 0;
+
+ /* e_name_len is a single byte on disk, hence the 255 limit */
+ if (name_len > 255 || value_len > MAX_VALUE_LEN)
+ return -ERANGE;
+
+ f2fs_balance_fs(sbi);
+
+ mutex_lock_op(sbi, NODE_NEW);
+ if (!fi->i_xattr_nid) {
+ /* Allocate new attribute block */
+ struct dnode_of_data dn;
+
+ if (!alloc_nid(sbi, &fi->i_xattr_nid)) {
+ mutex_unlock_op(sbi, NODE_NEW);
+ return -ENOSPC;
+ }
+ set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
+ mark_inode_dirty(inode);
+
+ page = new_node_page(&dn, XATTR_NODE_OFFSET);
+ if (IS_ERR(page)) {
+ /* give the node id back and forget it */
+ alloc_nid_failed(sbi, fi->i_xattr_nid);
+ fi->i_xattr_nid = 0;
+ mutex_unlock_op(sbi, NODE_NEW);
+ return PTR_ERR(page);
+ }
+
+ alloc_nid_done(sbi, fi->i_xattr_nid);
+ base_addr = page_address(page);
+ header = XATTR_HDR(base_addr);
+ header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
+ header->h_refcount = cpu_to_le32(1);
+ } else {
+ /* The inode already has an extended attribute block. */
+ page = get_node_page(sbi, fi->i_xattr_nid);
+ if (IS_ERR(page)) {
+ mutex_unlock_op(sbi, NODE_NEW);
+ return PTR_ERR(page);
+ }
+
+ base_addr = page_address(page);
+ header = XATTR_HDR(base_addr);
+ }
+
+ if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) {
+ error = -EIO;
+ goto cleanup;
+ }
+
+ /* find entry with wanted name. */
+ found = 0;
+ list_for_each_xattr(here, base_addr) {
+ if (here->e_name_index != name_index)
+ continue;
+ if (here->e_name_len != name_len)
+ continue;
+ if (!memcmp(here->e_name, name, name_len)) {
+ found = 1;
+ break;
+ }
+ }
+
+ /* continue from the match (or the end) to the terminating entry */
+ last = here;
+
+ while (!IS_XATTR_LAST_ENTRY(last))
+ last = XATTR_NEXT_ENTRY(last);
+
+ newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) +
+ name_len + value_len);
+
+ /* 1. Check space */
+ if (value) {
+ /* If value is NULL, it is remove operation.
+ * In case of update operation, we calculate free.
+ */
+ free = MIN_OFFSET - ((char *)last - (char *)header);
+ /* the entry being replaced gives its space back */
+ if (found)
+ free = free - ENTRY_SIZE(here);
+
+ if (free < newsize) {
+ error = -ENOSPC;
+ goto cleanup;
+ }
+ }
+
+ /* 2. Remove old entry */
+ if (found) {
+ /* If entry is found, remove old entry.
+ * If not found, remove operation is not needed.
+ */
+ struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here);
+ int oldsize = ENTRY_SIZE(here);
+
+ /* slide the following entries down, then zero the vacated tail */
+ memmove(here, next, (char *)last - (char *)next);
+ last = (struct f2fs_xattr_entry *)((char *)last - oldsize);
+ memset(last, 0, oldsize);
+ }
+
+ /* 3. Write new entry */
+ if (value) {
+ /* Before we come here, old entry is removed.
+ * We just write new entry. */
+ memset(last, 0, newsize);
+ last->e_name_index = name_index;
+ last->e_name_len = name_len;
+ memcpy(last->e_name, name, name_len);
+ /* the value is stored immediately after the name */
+ pval = last->e_name + name_len;
+ memcpy(pval, value, value_len);
+ last->e_value_size = cpu_to_le16(value_len);
+ }
+
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+
+ /* apply a mode change left pending in i_acl_mode */
+ if (is_inode_flag_set(fi, FI_ACL_MODE)) {
+ inode->i_mode = fi->i_acl_mode;
+ inode->i_ctime = CURRENT_TIME;
+ clear_inode_flag(fi, FI_ACL_MODE);
+ }
+ f2fs_write_inode(inode, NULL);
+ mutex_unlock_op(sbi, NODE_NEW);
+
+ return 0;
+ cleanup:
+ f2fs_put_page(page, 1);
+ mutex_unlock_op(sbi, NODE_NEW);
+ return error;
+ }
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
new file mode 100644
index 000000000000..49c9558305e3
--- /dev/null
+++ b/fs/f2fs/xattr.h
@@ -0,0 +1,145 @@
+/*
+ * fs/f2fs/xattr.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/xattr.h
+ *
+ * On-disk format of extended attributes for the ext2 filesystem.
+ *
+ * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_XATTR_H__
+#define __F2FS_XATTR_H__
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+/* Magic value in attribute blocks */
+#define F2FS_XATTR_MAGIC 0xF2F52011
+
+/* Maximum number of references to one attribute block */
+#define F2FS_XATTR_REFCOUNT_MAX 1024
+
+/* Name indexes */
+#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise"
+#define F2FS_XATTR_INDEX_USER 1
+#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2
+#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
+#define F2FS_XATTR_INDEX_TRUSTED 4
+#define F2FS_XATTR_INDEX_LUSTRE 5
+#define F2FS_XATTR_INDEX_SECURITY 6
+#define F2FS_XATTR_INDEX_ADVISE 7
+
+ /* Header at the start of the xattr node block. */
+ struct f2fs_xattr_header {
+ __le32 h_magic; /* magic number for identification */
+ __le32 h_refcount; /* reference count */
+ __u32 h_reserved[4]; /* zero right now */
+ };
+
+ /*
+  * One attribute entry; the name bytes are followed directly by the value
+  * bytes.  An entry whose first 32 bits are zero terminates the list.
+  */
+ struct f2fs_xattr_entry {
+ __u8 e_name_index;
+ __u8 e_name_len;
+ __le16 e_value_size; /* size of attribute value */
+ char e_name[0]; /* attribute name */
+ };
+
+ /* Accessors for the xattr block: header, then entries right behind it. */
+ #define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr))
+ #define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr))
+ #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr)+1))
+ #define XATTR_ROUND (3)
+
+ /*
+  * Round size up to a multiple of 4.  The argument is parenthesized so
+  * that expressions using low-precedence operators (|, ?:, ...) are
+  * rounded as a whole.
+  */
+ #define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND)
+
+ /* Aligned on-disk size of an entry: fixed part + name + value. */
+ #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
+ 	(entry)->e_name_len + le16_to_cpu((entry)->e_value_size)))
+
+ #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\
+ 	ENTRY_SIZE(entry)))
+
+ /* The list ends at the first entry whose leading 32 bits are all zero. */
+ #define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+ /* Iterate over all entries of the xattr block at addr; entry is an lvalue. */
+ #define list_for_each_xattr(entry, addr) \
+ 	for (entry = XATTR_FIRST_ENTRY(addr);\
+ 		!IS_XATTR_LAST_ENTRY(entry);\
+ 		entry = XATTR_NEXT_ENTRY(entry))
+
+
+ /* Usable bytes: the page minus the node footer and the terminator word. */
+ #define MIN_OFFSET XATTR_ALIGN(PAGE_SIZE - \
+ 	sizeof(struct node_footer) - \
+ 	sizeof(__u32))
+
+ /* Largest value that fits next to the header and one entry's fixed part. */
+ #define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \
+ 	sizeof(struct f2fs_xattr_entry))
+
+/*
+ * On-disk structure of f2fs_xattr
+ * We use only 1 block for xattr.
+ *
+ * +--------------------+
+ * | f2fs_xattr_header |
+ * | |
+ * +--------------------+
+ * | f2fs_xattr_entry |
+ * | .e_name_index = 1 |
+ * | .e_name_len = 3 |
+ * | .e_value_size = 14 |
+ * | .e_name = "foo" |
+ * | "value_of_xattr" |<- value_offs = e_name + e_name_len
+ * +--------------------+
+ * | f2fs_xattr_entry |
+ * | .e_name_index = 4 |
+ * | .e_name = "bar" |
+ * +--------------------+
+ * | |
+ * | Free |
+ * | |
+ * +--------------------+<- MIN_OFFSET
+ * | node_footer |
+ * | (nid, ino, offset) |
+ * +--------------------+
+ *
+ **/
+
+#ifdef CONFIG_F2FS_FS_XATTR
+extern const struct xattr_handler f2fs_xattr_user_handler;
+extern const struct xattr_handler f2fs_xattr_trusted_handler;
+extern const struct xattr_handler f2fs_xattr_acl_access_handler;
+extern const struct xattr_handler f2fs_xattr_acl_default_handler;
+extern const struct xattr_handler f2fs_xattr_advise_handler;
+
+extern const struct xattr_handler *f2fs_xattr_handlers[];
+
+extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
+ const void *value, size_t value_len);
+extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
+ void *buffer, size_t buffer_size);
+extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
+ size_t buffer_size);
+
+#else
+
+#define f2fs_xattr_handlers NULL
+ /* Stubs for builds without CONFIG_F2FS_FS_XATTR: each reports -EOPNOTSUPP. */
+ static inline int f2fs_setxattr(struct inode *inode, int name_index,
+ const char *name, const void *value, size_t value_len)
+ {
+ return -EOPNOTSUPP;
+ }
+ static inline int f2fs_getxattr(struct inode *inode, int name_index,
+ const char *name, void *buffer, size_t buffer_size)
+ {
+ return -EOPNOTSUPP;
+ }
+ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
+ size_t buffer_size)
+ {
+ return -EOPNOTSUPP;
+ }
+#endif
+
+#endif /* __F2FS_XATTR_H__ */
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 2a182342442e..165012ef363a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -461,8 +461,7 @@ static int fat_parse_short(struct super_block *sb,
}
/*
- * Return values: negative -> error, 0 -> not found, positive -> found,
- * value is the total amount of slots, including the shortname entry.
+ * Return values: negative -> error/not found, 0 -> found.
*/
int fat_search_long(struct inode *inode, const unsigned char *name,
int name_len, struct fat_slot_info *sinfo)
@@ -699,7 +698,7 @@ out:
static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
}
@@ -780,7 +779,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
int short_only, both;
@@ -820,7 +819,7 @@ FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent)
static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct compat_dirent __user *d1 = compat_ptr(arg);
int short_only, both;
@@ -1255,7 +1254,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
sinfo->nr_slots = nr_slots;
- /* First stage: search free direcotry entries */
+ /* First stage: search free directory entries */
free_slots = nr_bhs = 0;
bh = prev = NULL;
pos = 0;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 12701a567752..e9cc3f0d58e2 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -95,6 +95,8 @@ struct msdos_sb_info {
spinlock_t dir_hash_lock;
struct hlist_head dir_hashtable[FAT_HASH_SIZE];
+
+ unsigned int dirty; /* fs state before mount */
};
#define FAT_CACHE_VALID 0 /* special case for valid cache */
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a62e0ecbe2db..3978f8ca1823 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -32,7 +32,7 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
int is_dir = S_ISDIR(inode->i_mode);
u32 attr, oldattr;
@@ -116,7 +116,7 @@ out:
long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
u32 __user *user_attr = (u32 __user *)arg;
switch (cmd) {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 35806813ea4e..acf6e479b443 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -341,12 +341,11 @@ struct inode *fat_iget(struct super_block *sb, loff_t i_pos)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
- struct hlist_node *_p;
struct msdos_inode_info *i;
struct inode *inode = NULL;
spin_lock(&sbi->inode_hash_lock);
- hlist_for_each_entry(i, _p, head, i_fat_hash) {
+ hlist_for_each_entry(i, head, i_fat_hash) {
BUG_ON(i->vfs_inode.i_sb != sb);
if (i->i_pos != i_pos)
continue;
@@ -488,10 +487,59 @@ static void fat_evict_inode(struct inode *inode)
fat_detach(inode);
}
+/*
+ * Set (@set != 0) or clear (@set == 0) FAT_STATE_DIRTY in the on-disk boot
+ * sector and write the sector out synchronously.
+ *
+ * Nothing is written on a read-only mount unless @force is set, and
+ * nothing is written when the volume was already dirty at mount time
+ * (sbi->dirty); in that case a warning is logged when @set is non-zero.
+ */
+static void fat_set_state(struct super_block *sb,
+			unsigned int set, unsigned int force)
+{
+	struct msdos_sb_info *sbi = sb->s_fs_info;
+	struct fat_boot_sector *boot;
+	struct buffer_head *bh;
+
+	/* do not change any thing if mounted read only */
+	if (!force && (sb->s_flags & MS_RDONLY))
+		return;
+
+	/* do not change state if fs was dirty */
+	if (sbi->dirty) {
+		/* warn only on set (mount). */
+		if (set)
+			fat_msg(sb, KERN_WARNING, "Volume was not properly "
+				"unmounted. Some data may be corrupt. "
+				"Please run fsck.");
+		return;
+	}
+
+	bh = sb_bread(sb, 0);
+	if (!bh) {
+		fat_msg(sb, KERN_ERR, "unable to read boot sector "
+			"to mark fs as dirty");
+		return;
+	}
+
+	boot = (struct fat_boot_sector *) bh->b_data;
+
+	/* the state field sits in the fat32 or the fat16 (and 12) layout */
+	if (set) {
+		if (sbi->fat_bits == 32)
+			boot->fat32.state |= FAT_STATE_DIRTY;
+		else
+			boot->fat16.state |= FAT_STATE_DIRTY;
+	} else {
+		if (sbi->fat_bits == 32)
+			boot->fat32.state &= ~FAT_STATE_DIRTY;
+		else
+			boot->fat16.state &= ~FAT_STATE_DIRTY;
+	}
+
+	mark_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	brelse(bh);
+}
+
static void fat_put_super(struct super_block *sb)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ fat_set_state(sb, 0, 0);
+
iput(sbi->fsinfo_inode);
iput(sbi->fat_inode);
@@ -566,8 +614,18 @@ static void __exit fat_destroy_inodecache(void)
static int fat_remount(struct super_block *sb, int *flags, char *data)
{
+ int new_rdonly;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
*flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
+
+ /* make sure we update state on remount. */
+ new_rdonly = *flags & MS_RDONLY;
+ if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
+ if (new_rdonly)
+ fat_set_state(sb, 0, 0);
+ else
+ fat_set_state(sb, 1, 1);
+ }
return 0;
}
@@ -1298,17 +1356,17 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
sbi->prev_free = FAT_START_ENT;
sb->s_maxbytes = 0xffffffff;
- if (!sbi->fat_length && b->fat32_length) {
+ if (!sbi->fat_length && b->fat32.length) {
struct fat_boot_fsinfo *fsinfo;
struct buffer_head *fsinfo_bh;
/* Must be FAT32 */
sbi->fat_bits = 32;
- sbi->fat_length = le32_to_cpu(b->fat32_length);
- sbi->root_cluster = le32_to_cpu(b->root_cluster);
+ sbi->fat_length = le32_to_cpu(b->fat32.length);
+ sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster);
/* MC - if info_sector is 0, don't multiply by 0 */
- sbi->fsinfo_sector = le16_to_cpu(b->info_sector);
+ sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector);
if (sbi->fsinfo_sector == 0)
sbi->fsinfo_sector = 1;
@@ -1344,7 +1402,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
if (!silent)
- fat_msg(sb, KERN_ERR, "bogus directroy-entries per block"
+ fat_msg(sb, KERN_ERR, "bogus directory-entries per block"
" (%u)", sbi->dir_entries);
brelse(bh);
goto out_invalid;
@@ -1362,6 +1420,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
if (sbi->fat_bits != 32)
sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12;
+ /* some OSes set FAT_STATE_DIRTY and clear it on unmount. */
+ if (sbi->fat_bits == 32)
+ sbi->dirty = b->fat32.state & FAT_STATE_DIRTY;
+ else /* fat 16 or 12 */
+ sbi->dirty = b->fat16.state & FAT_STATE_DIRTY;
+
/* check that FAT table does not overflow */
fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
@@ -1456,6 +1520,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
"the device does not support discard");
}
+ fat_set_state(sb, 1, 0);
return 0;
out_invalid:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 5eb600dc43a9..359d307b5507 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -135,6 +135,10 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
}
if (ret < 0)
return ret;
+ /*
+ * FIXME: Although we could add this to the cache, fat_cache_add()
+ * assumes it is called after a linear search with fat_cache_id.
+ */
// fat_cache_add(inode, new_fclus, new_dclus);
} else {
MSDOS_I(inode)->i_start = new_dclus;
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index ef4b5faba87b..499c10438ca2 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -21,13 +21,12 @@ static struct inode *fat_dget(struct super_block *sb, int i_logstart)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct hlist_head *head;
- struct hlist_node *_p;
struct msdos_inode_info *i;
struct inode *inode = NULL;
head = sbi->dir_hashtable + fat_dir_hash(i_logstart);
spin_lock(&sbi->dir_hash_lock);
- hlist_for_each_entry(i, _p, head, i_dir_hash) {
+ hlist_for_each_entry(i, head, i_dir_hash) {
BUG_ON(i->vfs_inode.i_sb != sb);
if (i->i_logstart != i_logstart)
continue;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 71a600a19f06..6599222536eb 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -30,7 +30,7 @@
static int setfl(int fd, struct file * filp, unsigned long arg)
{
- struct inode * inode = filp->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(filp);
int error = 0;
/*
diff --git a/fs/fhandle.c b/fs/fhandle.c
index cccdc874bb55..999ff5c3cab0 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -52,7 +52,7 @@ static long do_sys_name_to_handle(struct path *path,
handle_bytes = handle_dwords * sizeof(u32);
handle->handle_bytes = handle_bytes;
if ((handle->handle_bytes > f_handle.handle_bytes) ||
- (retval == 255) || (retval == -ENOSPC)) {
+ (retval == FILEID_INVALID) || (retval == -ENOSPC)) {
/* As per old exportfs_encode_fh documentation
* we could return ENOSPC to indicate overflow
* But file system returned 255 always. So handle
diff --git a/fs/file.c b/fs/file.c
index 15cb8618e95d..3906d9577a18 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -490,7 +490,7 @@ void exit_files(struct task_struct *tsk)
}
}
-static void __devinit fdtable_defer_list_init(int cpu)
+static void fdtable_defer_list_init(int cpu)
{
struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
spin_lock_init(&fddef->lock);
@@ -516,7 +516,7 @@ struct files_struct init_files = {
.close_on_exec = init_files.close_on_exec_init,
.open_fds = init_files.open_fds_init,
},
- .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
+ .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
};
/*
diff --git a/fs/file_table.c b/fs/file_table.c
index a72bf9ddd0d2..cd4d87a82951 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -94,8 +94,8 @@ int proc_nr_files(ctl_table *table, int write,
#endif
/* Find an unused file structure and return a pointer to it.
- * Returns NULL, if there are no more free file structures or
- * we run out of memory.
+ * Returns an error pointer if some error happened, e.g. we are over the
+ * file structures limit, run out of memory or the operation is not permitted.
*
* Be very careful using this. You are responsible for
* getting write access to any mount that you might assign
@@ -107,7 +107,8 @@ struct file *get_empty_filp(void)
{
const struct cred *cred = current_cred();
static long old_max;
- struct file * f;
+ struct file *f;
+ int error;
/*
* Privileged users can go above max_files
@@ -122,13 +123,16 @@ struct file *get_empty_filp(void)
}
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
- if (f == NULL)
- goto fail;
+ if (unlikely(!f))
+ return ERR_PTR(-ENOMEM);
percpu_counter_inc(&nr_files);
f->f_cred = get_cred(cred);
- if (security_file_alloc(f))
- goto fail_sec;
+ error = security_file_alloc(f);
+ if (unlikely(error)) {
+ file_free(f);
+ return ERR_PTR(error);
+ }
INIT_LIST_HEAD(&f->f_u.fu_list);
atomic_long_set(&f->f_count, 1);
@@ -144,12 +148,7 @@ over:
pr_info("VFS: file-max limit %lu reached\n", get_max_files());
old_max = get_nr_files();
}
- goto fail;
-
-fail_sec:
- file_free(f);
-fail:
- return NULL;
+ return ERR_PTR(-ENFILE);
}
/**
@@ -173,10 +172,11 @@ struct file *alloc_file(struct path *path, fmode_t mode,
struct file *file;
file = get_empty_filp();
- if (!file)
- return NULL;
+ if (IS_ERR(file))
+ return file;
file->f_path = *path;
+ file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
file->f_mode = mode;
file->f_op = fop;
@@ -259,6 +259,7 @@ static void __fput(struct file *file)
drop_file_write_access(file);
file->f_path.dentry = NULL;
file->f_path.mnt = NULL;
+ file->f_inode = NULL;
file_free(file);
dput(dentry);
mntput(mnt);
@@ -447,7 +448,7 @@ void mark_files_ro(struct super_block *sb)
lg_global_lock(&files_lglock);
do_file_list_for_each_entry(sb, f) {
- if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+ if (!S_ISREG(file_inode(f)->i_mode))
continue;
if (!file_count(f))
continue;
@@ -458,8 +459,8 @@ void mark_files_ro(struct super_block *sb)
spin_unlock(&f->f_lock);
if (file_check_writeable(f) != 0)
continue;
+ __mnt_drop_write(f->f_path.mnt);
file_release_write(f);
- mnt_drop_write_file(f);
} while_file_list_for_each_entry;
lg_global_unlock(&files_lglock);
}
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index bd447e88f208..664b07a53870 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -237,7 +237,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
static int
vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
{
- struct inode *ip = fp->f_path.dentry->d_inode;
+ struct inode *ip = file_inode(fp);
struct super_block *sbp = ip->i_sb;
u_long bsize = sbp->s_blocksize;
u_long page, npages, block, pblocks, nblocks, offset;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 310972b72a66..21f46fb3a101 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -318,8 +318,14 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
- return inode->i_sb->s_op->write_inode(inode, wbc);
+ int ret;
+
+ if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
+ trace_writeback_write_inode_start(inode, wbc);
+ ret = inode->i_sb->s_op->write_inode(inode, wbc);
+ trace_writeback_write_inode(inode, wbc);
+ return ret;
+ }
return 0;
}
@@ -450,6 +456,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
WARN_ON(!(inode->i_state & I_SYNC));
+ trace_writeback_single_inode_start(inode, wbc, nr_to_write);
+
ret = do_writepages(mapping, wbc);
/*
@@ -1150,8 +1158,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* dirty the inode itself
*/
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ trace_writeback_dirty_inode_start(inode, flags);
+
if (sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode, flags);
+
+ trace_writeback_dirty_inode(inode, flags);
}
/*
@@ -1332,47 +1344,43 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
EXPORT_SYMBOL(writeback_inodes_sb);
/**
- * writeback_inodes_sb_if_idle - start writeback if none underway
+ * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
* @sb: the superblock
- * @reason: reason why some writeback work was initiated
+ * @nr: the number of pages to write
+ * @reason: the reason of writeback
*
- * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
* Returns 1 if writeback was started, 0 if not.
*/
-int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
+int try_to_writeback_inodes_sb_nr(struct super_block *sb,
+ unsigned long nr,
+ enum wb_reason reason)
{
- if (!writeback_in_progress(sb->s_bdi)) {
- down_read(&sb->s_umount);
- writeback_inodes_sb(sb, reason);
- up_read(&sb->s_umount);
+ if (writeback_in_progress(sb->s_bdi))
return 1;
- } else
+
+ if (!down_read_trylock(&sb->s_umount))
return 0;
+
+ writeback_inodes_sb_nr(sb, nr, reason);
+ up_read(&sb->s_umount);
+ return 1;
}
-EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
/**
- * writeback_inodes_sb_nr_if_idle - start writeback if none underway
+ * try_to_writeback_inodes_sb - try to start writeback if none underway
* @sb: the superblock
- * @nr: the number of pages to write
* @reason: reason why some writeback work was initiated
*
- * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Implemented by try_to_writeback_inodes_sb_nr()
* Returns 1 if writeback was started, 0 if not.
*/
-int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
- unsigned long nr,
- enum wb_reason reason)
+int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
- if (!writeback_in_progress(sb->s_bdi)) {
- down_read(&sb->s_umount);
- writeback_inodes_sb_nr(sb, nr, reason);
- up_read(&sb->s_umount);
- return 1;
- } else
- return 0;
+ return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
-EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
+EXPORT_SYMBOL(try_to_writeback_inodes_sb);
/**
* sync_inodes_sb - sync sb inode pages
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index fe6ca583bbc0..d8ac61d0c932 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -10,7 +10,7 @@
* Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
* It can block.
*/
-void set_fs_root(struct fs_struct *fs, struct path *path)
+void set_fs_root(struct fs_struct *fs, const struct path *path)
{
struct path old_root;
@@ -29,7 +29,7 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
* Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
* It can block.
*/
-void set_fs_pwd(struct fs_struct *fs, struct path *path)
+void set_fs_pwd(struct fs_struct *fs, const struct path *path)
{
struct path old_pwd;
@@ -53,7 +53,7 @@ static inline int replace_path(struct path *p, const struct path *old, const str
return 1;
}
-void chroot_fs_refs(struct path *old_root, struct path *new_root)
+void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
{
struct task_struct *g, *p;
struct fs_struct *fs;
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index 6a3c48abd677..b52aed1dca97 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -314,10 +314,10 @@ EXPORT_SYMBOL(fscache_add_cache);
*/
void fscache_io_error(struct fscache_cache *cache)
{
- set_bit(FSCACHE_IOERROR, &cache->flags);
-
- printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
- cache->ops->name);
+ if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags))
+ printk(KERN_ERR "FS-Cache:"
+ " Cache '%s' stopped due to I/O error\n",
+ cache->ops->name);
}
EXPORT_SYMBOL(fscache_io_error);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 990535071a8a..e2cba1f60c21 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -237,13 +237,12 @@ static int fscache_alloc_object(struct fscache_cache *cache,
struct fscache_cookie *cookie)
{
struct fscache_object *object;
- struct hlist_node *_n;
int ret;
_enter("%p,%p{%s}", cache, cookie, cookie->def->name);
spin_lock(&cookie->lock);
- hlist_for_each_entry(object, _n, &cookie->backing_objects,
+ hlist_for_each_entry(object, &cookie->backing_objects,
cookie_link) {
if (object->cache == cache)
goto object_already_extant;
@@ -311,7 +310,6 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
{
struct fscache_object *p;
struct fscache_cache *cache = object->cache;
- struct hlist_node *_n;
int ret;
_enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
@@ -321,7 +319,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
/* there may be multiple initial creations of this object, but we only
* want one */
ret = -EEXIST;
- hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
+ hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
if (p->cache == object->cache) {
if (p->state >= FSCACHE_OBJECT_DYING)
ret = -ENOBUFS;
@@ -331,7 +329,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
/* pin the parent object */
spin_lock_nested(&cookie->parent->lock, 1);
- hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
+ hlist_for_each_entry(p, &cookie->parent->backing_objects,
cookie_link) {
if (p->cache == object->cache) {
if (p->state >= FSCACHE_OBJECT_DYING) {
@@ -370,12 +368,71 @@ cant_attach_object:
}
/*
+ * Invalidate an object. Callable with spinlocks held.
+ */
+void __fscache_invalidate(struct fscache_cookie *cookie)
+{
+	struct fscache_object *obj;
+
+	_enter("{%s}", cookie->def->name);
+
+	fscache_stat(&fscache_n_invalidates);
+
+	/* Invalidation is restricted to data files.  To invalidate an index
+	 * the caller would have to drop every attachment to the tree rooted
+	 * at it, at which point retiring the cookie is just as good.
+	 */
+	ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+
+	/* The cookie is going to be updated as well, so get_aux is needed. */
+	BUG_ON(!cookie->def->get_aux);
+
+	/* Unlocked peek: without a backing object there is nothing to do. */
+	if (hlist_empty(&cookie->backing_objects))
+		goto out;
+
+	spin_lock(&cookie->lock);
+
+	/* Recheck under the lock; the INVALIDATING bit is only taken when an
+	 * object is present, and only the caller that sets it raises the
+	 * event.
+	 */
+	if (hlist_empty(&cookie->backing_objects) ||
+	    test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+		goto unlock;
+
+	/* Hand the work to the state machine of the first backing object. */
+	obj = hlist_entry(cookie->backing_objects.first,
+			  struct fscache_object, cookie_link);
+	if (obj->state < FSCACHE_OBJECT_DYING)
+		fscache_raise_event(obj, FSCACHE_OBJECT_EV_INVALIDATE);
+
+unlock:
+	spin_unlock(&cookie->lock);
+out:
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_invalidate);
+
+/*
+ * Wait for object invalidation to complete.
+ */
+void __fscache_wait_on_invalidate(struct fscache_cookie *cookie)
+{
+	_enter("%p", cookie);
+
+	/* Sleep until FSCACHE_COOKIE_INVALIDATING is cleared and woken by
+	 * the object state machine.  The sleep is TASK_UNINTERRUPTIBLE and
+	 * this function cannot report an early exit to its caller, so use
+	 * the same plain wait action as the other TASK_UNINTERRUPTIBLE wait
+	 * in this file rather than the _interruptible variant.
+	 */
+	wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING,
+		    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_wait_on_invalidate);
+
+/*
* update the index entries backing a cookie
*/
void __fscache_update_cookie(struct fscache_cookie *cookie)
{
struct fscache_object *object;
- struct hlist_node *_p;
fscache_stat(&fscache_n_updates);
@@ -392,7 +449,7 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
spin_lock(&cookie->lock);
/* update the index entry on disk in each cache backing this cookie */
- hlist_for_each_entry(object, _p,
+ hlist_for_each_entry(object,
&cookie->backing_objects, cookie_link) {
fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
}
@@ -442,16 +499,34 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+try_again:
spin_lock(&cookie->lock);
/* break links with all the active objects */
while (!hlist_empty(&cookie->backing_objects)) {
+ int n_reads;
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object,
cookie_link);
_debug("RELEASE OBJ%x", object->debug_id);
+ set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
+ n_reads = atomic_read(&object->n_reads);
+ if (n_reads) {
+ int n_ops = object->n_ops;
+ int n_in_progress = object->n_in_progress;
+ spin_unlock(&cookie->lock);
+ printk(KERN_ERR "FS-Cache:"
+ " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
+ cookie->def->name,
+ n_reads, n_ops, n_in_progress);
+ wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+ printk("Wait finished\n");
+ goto try_again;
+ }
+
/* detach each cache object from the object cookie */
spin_lock(&object->lock);
hlist_del_init(&object->cookie_link);
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index f6aad48d38a8..ee38fef4be51 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -121,12 +121,19 @@ extern int fscache_submit_exclusive_op(struct fscache_object *,
struct fscache_operation *);
extern int fscache_submit_op(struct fscache_object *,
struct fscache_operation *);
-extern int fscache_cancel_op(struct fscache_operation *);
+extern int fscache_cancel_op(struct fscache_operation *,
+ void (*)(struct fscache_operation *));
+extern void fscache_cancel_all_ops(struct fscache_object *);
extern void fscache_abort_object(struct fscache_object *);
extern void fscache_start_operations(struct fscache_object *);
extern void fscache_operation_gc(struct work_struct *);
/*
+ * page.c
+ */
+extern void fscache_invalidate_writes(struct fscache_cookie *);
+
+/*
* proc.c
*/
#ifdef CONFIG_PROC_FS
@@ -194,6 +201,7 @@ extern atomic_t fscache_n_store_vmscan_not_storing;
extern atomic_t fscache_n_store_vmscan_gone;
extern atomic_t fscache_n_store_vmscan_busy;
extern atomic_t fscache_n_store_vmscan_cancelled;
+extern atomic_t fscache_n_store_vmscan_wait;
extern atomic_t fscache_n_marks;
extern atomic_t fscache_n_uncaches;
@@ -205,6 +213,9 @@ extern atomic_t fscache_n_acquires_ok;
extern atomic_t fscache_n_acquires_nobufs;
extern atomic_t fscache_n_acquires_oom;
+extern atomic_t fscache_n_invalidates;
+extern atomic_t fscache_n_invalidates_run;
+
extern atomic_t fscache_n_updates;
extern atomic_t fscache_n_updates_null;
extern atomic_t fscache_n_updates_run;
@@ -237,6 +248,7 @@ extern atomic_t fscache_n_cop_alloc_object;
extern atomic_t fscache_n_cop_lookup_object;
extern atomic_t fscache_n_cop_lookup_complete;
extern atomic_t fscache_n_cop_grab_object;
+extern atomic_t fscache_n_cop_invalidate_object;
extern atomic_t fscache_n_cop_update_object;
extern atomic_t fscache_n_cop_drop_object;
extern atomic_t fscache_n_cop_put_object;
@@ -278,6 +290,7 @@ extern const struct file_operations fscache_stats_fops;
static inline void fscache_raise_event(struct fscache_object *object,
unsigned event)
{
+ BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
if (!test_and_set_bit(event, &object->events) &&
test_bit(event, &object->event_mask))
fscache_enqueue_object(object);
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index ebe29c581380..f27c89d17885 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -245,7 +245,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
obj->n_in_progress,
obj->n_exclusive,
atomic_read(&obj->n_reads),
- obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
+ obj->event_mask,
obj->events,
obj->flags,
work_busy(&obj->work));
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index b6b897c550ac..50d41c180211 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,6 +14,7 @@
#define FSCACHE_DEBUG_LEVEL COOKIE
#include <linux/module.h>
+#include <linux/slab.h>
#include "internal.h"
const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -22,6 +23,7 @@ const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
[FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
[FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
[FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
+ [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING",
[FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
[FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
[FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
@@ -39,6 +41,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
[FSCACHE_OBJECT_CREATING] = "CRTN",
[FSCACHE_OBJECT_AVAILABLE] = "AVBL",
[FSCACHE_OBJECT_ACTIVE] = "ACTV",
+ [FSCACHE_OBJECT_INVALIDATING] = "INVL",
[FSCACHE_OBJECT_UPDATING] = "UPDT",
[FSCACHE_OBJECT_DYING] = "DYNG",
[FSCACHE_OBJECT_LC_DYING] = "LCDY",
@@ -54,6 +57,7 @@ static void fscache_put_object(struct fscache_object *);
static void fscache_initialise_object(struct fscache_object *);
static void fscache_lookup_object(struct fscache_object *);
static void fscache_object_available(struct fscache_object *);
+static void fscache_invalidate_object(struct fscache_object *);
static void fscache_release_object(struct fscache_object *);
static void fscache_withdraw_object(struct fscache_object *);
static void fscache_enqueue_dependents(struct fscache_object *);
@@ -79,6 +83,15 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
}
/*
+ * Notify netfs of invalidation completion.
+ */
+static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+{
+	/* Nothing to do unless an invalidation was flagged on this cookie. */
+	if (!test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+		return;
+
+	/* We were the ones to clear the bit: wake anyone sleeping on it. */
+	wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+}
+
+/*
* process events that have been sent to an object's state machine
* - initiates parent lookup
* - does object lookup
@@ -90,6 +103,7 @@ static void fscache_object_state_machine(struct fscache_object *object)
{
enum fscache_object_state new_state;
struct fscache_cookie *cookie;
+ int event;
ASSERT(object != NULL);
@@ -101,7 +115,8 @@ static void fscache_object_state_machine(struct fscache_object *object)
/* wait for the parent object to become ready */
case FSCACHE_OBJECT_INIT:
object->event_mask =
- ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+ FSCACHE_OBJECT_EVENTS_MASK &
+ ~(1 << FSCACHE_OBJECT_EV_CLEARED);
fscache_initialise_object(object);
goto done;
@@ -125,6 +140,16 @@ static void fscache_object_state_machine(struct fscache_object *object)
case FSCACHE_OBJECT_ACTIVE:
goto active_transit;
+ /* Invalidate an object on disk */
+ case FSCACHE_OBJECT_INVALIDATING:
+ clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
+ fscache_stat(&fscache_n_invalidates_run);
+ fscache_stat(&fscache_n_cop_invalidate_object);
+ fscache_invalidate_object(object);
+ fscache_stat_d(&fscache_n_cop_invalidate_object);
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+ goto active_transit;
+
/* update the object metadata on disk */
case FSCACHE_OBJECT_UPDATING:
clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
@@ -251,13 +276,17 @@ static void fscache_object_state_machine(struct fscache_object *object)
/* determine the transition from a lookup state */
lookup_transit:
- switch (fls(object->events & object->event_mask) - 1) {
+ event = fls(object->events & object->event_mask) - 1;
+ switch (event) {
case FSCACHE_OBJECT_EV_WITHDRAW:
case FSCACHE_OBJECT_EV_RETIRE:
case FSCACHE_OBJECT_EV_RELEASE:
case FSCACHE_OBJECT_EV_ERROR:
new_state = FSCACHE_OBJECT_LC_DYING;
goto change_state;
+ case FSCACHE_OBJECT_EV_INVALIDATE:
+ new_state = FSCACHE_OBJECT_INVALIDATING;
+ goto change_state;
case FSCACHE_OBJECT_EV_REQUEUE:
goto done;
case -1:
@@ -268,13 +297,17 @@ lookup_transit:
/* determine the transition from an active state */
active_transit:
- switch (fls(object->events & object->event_mask) - 1) {
+ event = fls(object->events & object->event_mask) - 1;
+ switch (event) {
case FSCACHE_OBJECT_EV_WITHDRAW:
case FSCACHE_OBJECT_EV_RETIRE:
case FSCACHE_OBJECT_EV_RELEASE:
case FSCACHE_OBJECT_EV_ERROR:
new_state = FSCACHE_OBJECT_DYING;
goto change_state;
+ case FSCACHE_OBJECT_EV_INVALIDATE:
+ new_state = FSCACHE_OBJECT_INVALIDATING;
+ goto change_state;
case FSCACHE_OBJECT_EV_UPDATE:
new_state = FSCACHE_OBJECT_UPDATING;
goto change_state;
@@ -287,7 +320,8 @@ active_transit:
/* determine the transition from a terminal state */
terminal_transit:
- switch (fls(object->events & object->event_mask) - 1) {
+ event = fls(object->events & object->event_mask) - 1;
+ switch (event) {
case FSCACHE_OBJECT_EV_WITHDRAW:
new_state = FSCACHE_OBJECT_WITHDRAWING;
goto change_state;
@@ -320,8 +354,8 @@ done:
unsupported_event:
printk(KERN_ERR "FS-Cache:"
- " Unsupported event %lx [mask %lx] in state %s\n",
- object->events, object->event_mask,
+ " Unsupported event %d [%lx/%lx] in state %s\n",
+ event, object->events, object->event_mask,
fscache_object_states[object->state]);
BUG();
}
@@ -587,8 +621,6 @@ static void fscache_object_available(struct fscache_object *object)
if (object->n_in_progress == 0) {
if (object->n_ops > 0) {
ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
- ASSERTIF(object->n_ops > object->n_obj_ops,
- !list_empty(&object->pending_ops));
fscache_start_operations(object);
} else {
ASSERT(list_empty(&object->pending_ops));
@@ -681,6 +713,7 @@ static void fscache_withdraw_object(struct fscache_object *object)
if (object->cookie == cookie) {
hlist_del_init(&object->cookie_link);
object->cookie = NULL;
+ fscache_invalidation_complete(cookie);
detached = true;
}
spin_unlock(&cookie->lock);
@@ -890,3 +923,55 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
return result;
}
EXPORT_SYMBOL(fscache_check_aux);
+
+/*
+ * Asynchronously invalidate an object.
+ *
+ * Called by the object state machine while the object is in the
+ * FSCACHE_OBJECT_INVALIDATING state.  Cancels the operations pending on
+ * the object, then queues an exclusive operation whose processor is the
+ * cache backend's invalidate_object op.  If that operation cannot be
+ * allocated or submitted, FSCACHE_OBJECT_EV_ERROR is raised on the object
+ * instead and the cookie's INVALIDATING bit is left for another path to
+ * clear.
+ */
+static void fscache_invalidate_object(struct fscache_object *object)
+{
+ struct fscache_operation *op;
+ struct fscache_cookie *cookie = object->cookie;
+
+ _enter("{OBJ%x}", object->debug_id);
+
+ /* Reject any new read/write ops and abort any that are pending. */
+ fscache_invalidate_writes(cookie);
+ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+ fscache_cancel_all_ops(object);
+
+ /* Now we have to wait for in-progress reads and writes */
+ op = kzalloc(sizeof(*op), GFP_KERNEL);
+ if (!op) {
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
+ _leave(" [ENOMEM]");
+ return;
+ }
+
+ fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
+ op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); /* async, exclusive op */
+
+ spin_lock(&cookie->lock);
+ if (fscache_submit_exclusive_op(object, op) < 0)
+ goto submit_op_failed;
+ spin_unlock(&cookie->lock);
+ fscache_put_operation(op); /* drop this function's reference to op */
+
+ /* Once we've completed the invalidation, we know there will be no data
+ * stored in the cache and thus we can reinstate the data-check-skip
+ * optimisation.
+ */
+ set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+ /* We can allow read and write requests to come in once again. They'll
+ * queue up behind our exclusive invalidation operation.
+ */
+ fscache_invalidation_complete(cookie);
+ _leave("");
+ return;
+
+submit_op_failed:
+ spin_unlock(&cookie->lock);
+ kfree(op); /* submission was refused, so op is freed directly here */
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
+ _leave(" [EIO]");
+}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 30afdfa7aec7..762a9ec4ffa4 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -37,6 +37,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
ASSERT(op->processor != NULL);
ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
ASSERTCMP(atomic_read(&op->usage), >, 0);
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
fscache_stat(&fscache_n_op_enqueue);
switch (op->flags & FSCACHE_OP_TYPE) {
@@ -64,6 +65,9 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
static void fscache_run_op(struct fscache_object *object,
struct fscache_operation *op)
{
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+
+ op->state = FSCACHE_OP_ST_IN_PROGRESS;
object->n_in_progress++;
if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -84,18 +88,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
+ ASSERTCMP(atomic_read(&op->usage), >, 0);
+
spin_lock(&object->lock);
ASSERTCMP(object->n_ops, >=, object->n_in_progress);
ASSERTCMP(object->n_ops, >=, object->n_exclusive);
ASSERT(list_empty(&op->pend_link));
- ret = -ENOBUFS;
+ op->state = FSCACHE_OP_ST_PENDING;
if (fscache_object_is_active(object)) {
op->object = object;
object->n_ops++;
object->n_exclusive++; /* reads and writes must wait */
- if (object->n_ops > 1) {
+ if (object->n_in_progress > 0) {
atomic_inc(&op->usage);
list_add_tail(&op->pend_link, &object->pending_ops);
fscache_stat(&fscache_n_op_pend);
@@ -121,8 +128,11 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
fscache_stat(&fscache_n_op_pend);
ret = 0;
} else {
- /* not allowed to submit ops in any other state */
- BUG();
+ /* If we're in any other state, there must have been an I/O
+ * error of some nature.
+ */
+ ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags));
+ ret = -EIO;
}
spin_unlock(&object->lock);
@@ -186,6 +196,7 @@ int fscache_submit_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},{%u}",
object->debug_id, op->debug_id, atomic_read(&op->usage));
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
ASSERTCMP(atomic_read(&op->usage), >, 0);
spin_lock(&object->lock);
@@ -196,6 +207,7 @@ int fscache_submit_op(struct fscache_object *object,
ostate = object->state;
smp_rmb();
+ op->state = FSCACHE_OP_ST_PENDING;
if (fscache_object_is_active(object)) {
op->object = object;
object->n_ops++;
@@ -225,12 +237,15 @@ int fscache_submit_op(struct fscache_object *object,
object->state == FSCACHE_OBJECT_LC_DYING ||
object->state == FSCACHE_OBJECT_WITHDRAWING) {
fscache_stat(&fscache_n_op_rejected);
+ op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
} else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
fscache_report_unexpected_submission(object, op, ostate);
ASSERT(!fscache_object_is_active(object));
+ op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
} else {
+ op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
}
@@ -283,20 +298,28 @@ void fscache_start_operations(struct fscache_object *object)
/*
* cancel an operation that's pending on an object
*/
-int fscache_cancel_op(struct fscache_operation *op)
+int fscache_cancel_op(struct fscache_operation *op,
+ void (*do_cancel)(struct fscache_operation *))
{
struct fscache_object *object = op->object;
int ret;
_enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
+ ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING);
+ ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED);
+ ASSERTCMP(atomic_read(&op->usage), >, 0);
+
spin_lock(&object->lock);
ret = -EBUSY;
- if (!list_empty(&op->pend_link)) {
+ if (op->state == FSCACHE_OP_ST_PENDING) {
+ ASSERT(!list_empty(&op->pend_link));
fscache_stat(&fscache_n_op_cancelled);
list_del_init(&op->pend_link);
- object->n_ops--;
+ if (do_cancel)
+ do_cancel(op);
+ op->state = FSCACHE_OP_ST_CANCELLED;
if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
object->n_exclusive--;
if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
@@ -311,6 +334,70 @@ int fscache_cancel_op(struct fscache_operation *op)
}
/*
+ * Cancel all pending operations on an object
+ */
+void fscache_cancel_all_ops(struct fscache_object *object)
+{
+ struct fscache_operation *op;
+
+ _enter("OBJ%x", object->debug_id);
+
+ spin_lock(&object->lock);
+
+ while (!list_empty(&object->pending_ops)) {
+ op = list_entry(object->pending_ops.next,
+ struct fscache_operation, pend_link);
+ fscache_stat(&fscache_n_op_cancelled);
+ list_del_init(&op->pend_link);
+
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+ op->state = FSCACHE_OP_ST_CANCELLED;
+
+ if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+ object->n_exclusive--;
+ if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+ wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+ fscache_put_operation(op);
+ cond_resched_lock(&object->lock);
+ }
+
+ spin_unlock(&object->lock);
+ _leave("");
+}
+
+/*
+ * Record the completion or cancellation of an in-progress operation.
+ */
+void fscache_op_complete(struct fscache_operation *op, bool cancelled)
+{
+ struct fscache_object *object = op->object;
+
+ _enter("OBJ%x", object->debug_id);
+
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
+ ASSERTCMP(object->n_in_progress, >, 0);
+ ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
+ object->n_exclusive, >, 0);
+ ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
+ object->n_in_progress, ==, 1);
+
+ spin_lock(&object->lock);
+
+ op->state = cancelled ?
+ FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE;
+
+ if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+ object->n_exclusive--;
+ object->n_in_progress--;
+ if (object->n_in_progress == 0)
+ fscache_start_operations(object);
+
+ spin_unlock(&object->lock);
+ _leave("");
+}
+EXPORT_SYMBOL(fscache_op_complete);
+
+/*
* release an operation
* - queues pending ops if this is the last in-progress op
*/
@@ -328,8 +415,9 @@ void fscache_put_operation(struct fscache_operation *op)
return;
_debug("PUT OP");
- if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
- BUG();
+ ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE,
+ op->state, ==, FSCACHE_OP_ST_CANCELLED);
+ op->state = FSCACHE_OP_ST_DEAD;
fscache_stat(&fscache_n_op_release);
@@ -340,8 +428,14 @@ void fscache_put_operation(struct fscache_operation *op)
object = op->object;
- if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
- atomic_dec(&object->n_reads);
+ if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
+ if (atomic_dec_and_test(&object->n_reads)) {
+ clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
+ &object->cookie->flags);
+ wake_up_bit(&object->cookie->flags,
+ FSCACHE_COOKIE_WAITING_ON_READS);
+ }
+ }
/* now... we may get called with the object spinlock held, so we
* complete the cleanup here only if we can immediately acquire the
@@ -359,16 +453,6 @@ void fscache_put_operation(struct fscache_operation *op)
return;
}
- if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
- ASSERTCMP(object->n_exclusive, >, 0);
- object->n_exclusive--;
- }
-
- ASSERTCMP(object->n_in_progress, >, 0);
- object->n_in_progress--;
- if (object->n_in_progress == 0)
- fscache_start_operations(object);
-
ASSERTCMP(object->n_ops, >, 0);
object->n_ops--;
if (object->n_ops == 0)
@@ -407,23 +491,14 @@ void fscache_operation_gc(struct work_struct *work)
spin_unlock(&cache->op_gc_list_lock);
object = op->object;
+ spin_lock(&object->lock);
_debug("GC DEFERRED REL OBJ%x OP%x",
object->debug_id, op->debug_id);
fscache_stat(&fscache_n_op_gc);
ASSERTCMP(atomic_read(&op->usage), ==, 0);
-
- spin_lock(&object->lock);
- if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
- ASSERTCMP(object->n_exclusive, >, 0);
- object->n_exclusive--;
- }
-
- ASSERTCMP(object->n_in_progress, >, 0);
- object->n_in_progress--;
- if (object->n_in_progress == 0)
- fscache_start_operations(object);
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD);
ASSERTCMP(object->n_ops, >, 0);
object->n_ops--;
@@ -431,6 +506,7 @@ void fscache_operation_gc(struct work_struct *work)
fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
spin_unlock(&object->lock);
+ kfree(op);
} while (count++ < 20);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 3f7a59bfa7ad..ff000e52072d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -56,6 +56,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
_enter("%p,%p,%x", cookie, page, gfp);
+try_again:
rcu_read_lock();
val = radix_tree_lookup(&cookie->stores, page->index);
if (!val) {
@@ -104,11 +105,19 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
return true;
page_busy:
- /* we might want to wait here, but that could deadlock the allocator as
- * the work threads writing to the cache may all end up sleeping
- * on memory allocation */
- fscache_stat(&fscache_n_store_vmscan_busy);
- return false;
+ /* We will wait here if we're allowed to, but that could deadlock the
+ * allocator as the work threads writing to the cache may all end up
+ * sleeping on memory allocation, so we may need to impose a timeout
+ * too. */
+ if (!(gfp & __GFP_WAIT)) {
+ fscache_stat(&fscache_n_store_vmscan_busy);
+ return false;
+ }
+
+ fscache_stat(&fscache_n_store_vmscan_wait);
+ __fscache_wait_on_page_write(cookie, page);
+ gfp &= ~__GFP_WAIT;
+ goto try_again;
}
EXPORT_SYMBOL(__fscache_maybe_release_page);
@@ -162,6 +171,7 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_abort_object(object);
}
+ fscache_op_complete(op, true);
_leave("");
}
@@ -223,6 +233,8 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
_enter("{OP%x}", op->op.debug_id);
+ ASSERTCMP(op->n_pages, ==, 0);
+
fscache_hist(fscache_retrieval_histogram, op->start_time);
if (op->context)
fscache_put_context(op->op.object->cookie, op->context);
@@ -291,6 +303,17 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
}
/*
+ * Handle cancellation of a pending retrieval op
+ */
+static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
+{
+ struct fscache_retrieval *op =
+ container_of(_op, struct fscache_retrieval, op);
+
+ op->n_pages = 0;
+}
+
+/*
* wait for an object to become active (or dead)
*/
static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
@@ -307,8 +330,8 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
fscache_stat(stat_op_waits);
if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
fscache_wait_bit_interruptible,
- TASK_INTERRUPTIBLE) < 0) {
- ret = fscache_cancel_op(&op->op);
+ TASK_INTERRUPTIBLE) != 0) {
+ ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
if (ret == 0)
return -ERESTARTSYS;
@@ -320,7 +343,14 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
_debug("<<< GO");
check_if_dead:
+ if (op->op.state == FSCACHE_OP_ST_CANCELLED) {
+ fscache_stat(stat_object_dead);
+ _leave(" = -ENOBUFS [cancelled]");
+ return -ENOBUFS;
+ }
if (unlikely(fscache_object_is_dead(object))) {
+ pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state);
+ fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
fscache_stat(stat_object_dead);
return -ENOBUFS;
}
@@ -353,6 +383,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
if (hlist_empty(&cookie->backing_objects))
goto nobufs;
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+ _leave(" = -ENOBUFS [invalidating]");
+ return -ENOBUFS;
+ }
+
ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
ASSERTCMP(page, !=, NULL);
@@ -364,6 +399,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
_leave(" = -ENOMEM");
return -ENOMEM;
}
+ op->n_pages = 1;
spin_lock(&cookie->lock);
@@ -375,10 +411,10 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
atomic_inc(&object->n_reads);
- set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+ __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
if (fscache_submit_op(object, &op->op) < 0)
- goto nobufs_unlock;
+ goto nobufs_unlock_dec;
spin_unlock(&cookie->lock);
fscache_stat(&fscache_n_retrieval_ops);
@@ -425,6 +461,8 @@ error:
_leave(" = %d", ret);
return ret;
+nobufs_unlock_dec:
+ atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
kfree(op);
@@ -472,6 +510,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
if (hlist_empty(&cookie->backing_objects))
goto nobufs;
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+ _leave(" = -ENOBUFS [invalidating]");
+ return -ENOBUFS;
+ }
+
ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
ASSERTCMP(*nr_pages, >, 0);
ASSERT(!list_empty(pages));
@@ -482,6 +525,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
op = fscache_alloc_retrieval(mapping, end_io_func, context);
if (!op)
return -ENOMEM;
+ op->n_pages = *nr_pages;
spin_lock(&cookie->lock);
@@ -491,10 +535,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
struct fscache_object, cookie_link);
atomic_inc(&object->n_reads);
- set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+ __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
if (fscache_submit_op(object, &op->op) < 0)
- goto nobufs_unlock;
+ goto nobufs_unlock_dec;
spin_unlock(&cookie->lock);
fscache_stat(&fscache_n_retrieval_ops);
@@ -541,6 +585,8 @@ error:
_leave(" = %d", ret);
return ret;
+nobufs_unlock_dec:
+ atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
kfree(op);
@@ -577,12 +623,18 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
ASSERTCMP(page, !=, NULL);
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+ _leave(" = -ENOBUFS [invalidating]");
+ return -ENOBUFS;
+ }
+
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
if (!op)
return -ENOMEM;
+ op->n_pages = 1;
spin_lock(&cookie->lock);
@@ -658,9 +710,27 @@ static void fscache_write_op(struct fscache_operation *_op)
spin_lock(&object->lock);
cookie = object->cookie;
- if (!fscache_object_is_active(object) || !cookie) {
+ if (!fscache_object_is_active(object)) {
+ /* If we get here, then the on-disk cache object likely longer
+ * exists, so we should just cancel this write operation.
+ */
+ spin_unlock(&object->lock);
+ fscache_op_complete(&op->op, false);
+ _leave(" [inactive]");
+ return;
+ }
+
+ if (!cookie) {
+ /* If we get here, then the cookie belonging to the object was
+ * detached, probably by the cookie being withdrawn due to
+ * memory pressure, which means that the pages we might write
+ * to the cache from no longer exist - therefore, we can just
+ * cancel this write operation.
+ */
spin_unlock(&object->lock);
- _leave("");
+ fscache_op_complete(&op->op, false);
+ _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
+ _op->flags, _op->state, object->state, object->flags);
return;
}
@@ -696,6 +766,7 @@ static void fscache_write_op(struct fscache_operation *_op)
fscache_end_page_write(object, page);
if (ret < 0) {
fscache_abort_object(object);
+ fscache_op_complete(&op->op, true);
} else {
fscache_enqueue_operation(&op->op);
}
@@ -710,6 +781,38 @@ superseded:
spin_unlock(&cookie->stores_lock);
clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
spin_unlock(&object->lock);
+ fscache_op_complete(&op->op, true);
+ _leave("");
+}
+
+/*
+ * Clear the pages pending writing for invalidation
+ */
+void fscache_invalidate_writes(struct fscache_cookie *cookie)
+{
+ struct page *page;
+ void *results[16];
+ int n, i;
+
+ _enter("");
+
+ while (spin_lock(&cookie->stores_lock),
+ n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
+ ARRAY_SIZE(results),
+ FSCACHE_COOKIE_PENDING_TAG),
+ n > 0) {
+ for (i = n - 1; i >= 0; i--) {
+ page = results[i];
+ radix_tree_delete(&cookie->stores, page->index);
+ }
+
+ spin_unlock(&cookie->stores_lock);
+
+ for (i = n - 1; i >= 0; i--)
+ page_cache_release(results[i]);
+ }
+
+ spin_unlock(&cookie->stores_lock);
_leave("");
}
@@ -759,7 +862,12 @@ int __fscache_write_page(struct fscache_cookie *cookie,
fscache_stat(&fscache_n_stores);
- op = kzalloc(sizeof(*op), GFP_NOIO);
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+ _leave(" = -ENOBUFS [invalidating]");
+ return -ENOBUFS;
+ }
+
+ op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
if (!op)
goto nomem;
@@ -915,6 +1023,40 @@ done:
EXPORT_SYMBOL(__fscache_uncache_page);
/**
+ * fscache_mark_page_cached - Mark a page as being cached
+ * @op: The retrieval op pages are being marked for
+ * @page: The page to be marked
+ *
+ * Mark a netfs page as being cached. After this is called, the netfs
+ * must call fscache_uncache_page() to remove the mark.
+ */
+void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
+{
+ struct fscache_cookie *cookie = op->op.object->cookie;
+
+#ifdef CONFIG_FSCACHE_STATS
+ atomic_inc(&fscache_n_marks);
+#endif
+
+ _debug("- mark %p{%lx}", page, page->index);
+ if (TestSetPageFsCache(page)) {
+ static bool once_only;
+ if (!once_only) {
+ once_only = true;
+ printk(KERN_WARNING "FS-Cache:"
+ " Cookie type %s marked page %lx"
+ " multiple times\n",
+ cookie->def->name, page->index);
+ }
+ }
+
+ if (cookie->def->mark_page_cached)
+ cookie->def->mark_page_cached(cookie->netfs_data,
+ op->mapping, page);
+}
+EXPORT_SYMBOL(fscache_mark_page_cached);
+
+/**
* fscache_mark_pages_cached - Mark pages as being cached
* @op: The retrieval op pages are being marked for
* @pagevec: The pages to be marked
@@ -925,32 +1067,11 @@ EXPORT_SYMBOL(__fscache_uncache_page);
void fscache_mark_pages_cached(struct fscache_retrieval *op,
struct pagevec *pagevec)
{
- struct fscache_cookie *cookie = op->op.object->cookie;
unsigned long loop;
-#ifdef CONFIG_FSCACHE_STATS
- atomic_add(pagevec->nr, &fscache_n_marks);
-#endif
-
- for (loop = 0; loop < pagevec->nr; loop++) {
- struct page *page = pagevec->pages[loop];
-
- _debug("- mark %p{%lx}", page, page->index);
- if (TestSetPageFsCache(page)) {
- static bool once_only;
- if (!once_only) {
- once_only = true;
- printk(KERN_WARNING "FS-Cache:"
- " Cookie type %s marked page %lx"
- " multiple times\n",
- cookie->def->name, page->index);
- }
- }
- }
+ for (loop = 0; loop < pagevec->nr; loop++)
+ fscache_mark_page_cached(op, pagevec->pages[loop]);
- if (cookie->def->mark_pages_cached)
- cookie->def->mark_pages_cached(cookie->netfs_data,
- op->mapping, pagevec);
pagevec_reinit(pagevec);
}
EXPORT_SYMBOL(fscache_mark_pages_cached);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 4765190d537f..8179e8bc4a3d 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -69,6 +69,7 @@ atomic_t fscache_n_store_vmscan_not_storing;
atomic_t fscache_n_store_vmscan_gone;
atomic_t fscache_n_store_vmscan_busy;
atomic_t fscache_n_store_vmscan_cancelled;
+atomic_t fscache_n_store_vmscan_wait;
atomic_t fscache_n_marks;
atomic_t fscache_n_uncaches;
@@ -80,6 +81,9 @@ atomic_t fscache_n_acquires_ok;
atomic_t fscache_n_acquires_nobufs;
atomic_t fscache_n_acquires_oom;
+atomic_t fscache_n_invalidates;
+atomic_t fscache_n_invalidates_run;
+
atomic_t fscache_n_updates;
atomic_t fscache_n_updates_null;
atomic_t fscache_n_updates_run;
@@ -112,6 +116,7 @@ atomic_t fscache_n_cop_alloc_object;
atomic_t fscache_n_cop_lookup_object;
atomic_t fscache_n_cop_lookup_complete;
atomic_t fscache_n_cop_grab_object;
+atomic_t fscache_n_cop_invalidate_object;
atomic_t fscache_n_cop_update_object;
atomic_t fscache_n_cop_drop_object;
atomic_t fscache_n_cop_put_object;
@@ -168,6 +173,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_object_created),
atomic_read(&fscache_n_object_lookups_timed_out));
+ seq_printf(m, "Invals : n=%u run=%u\n",
+ atomic_read(&fscache_n_invalidates),
+ atomic_read(&fscache_n_invalidates_run));
+
seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
atomic_read(&fscache_n_updates),
atomic_read(&fscache_n_updates_null),
@@ -224,11 +233,12 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_store_radix_deletes),
atomic_read(&fscache_n_store_pages_over_limit));
- seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n",
+ seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n",
atomic_read(&fscache_n_store_vmscan_not_storing),
atomic_read(&fscache_n_store_vmscan_gone),
atomic_read(&fscache_n_store_vmscan_busy),
- atomic_read(&fscache_n_store_vmscan_cancelled));
+ atomic_read(&fscache_n_store_vmscan_cancelled),
+ atomic_read(&fscache_n_store_vmscan_wait));
seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n",
atomic_read(&fscache_n_op_pend),
@@ -246,7 +256,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_cop_lookup_object),
atomic_read(&fscache_n_cop_lookup_complete),
atomic_read(&fscache_n_cop_grab_object));
- seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n",
+ seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n",
+ atomic_read(&fscache_n_cop_invalidate_object),
atomic_read(&fscache_n_cop_update_object),
atomic_read(&fscache_n_cop_drop_object),
atomic_read(&fscache_n_cop_put_object),
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 0cf160a94eda..1b2f6c2c3aaf 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -4,12 +4,24 @@ config FUSE_FS
With FUSE it is possible to implement a fully functional filesystem
in a userspace program.
- There's also companion library: libfuse. This library along with
- utilities is available from the FUSE homepage:
+ There's also a companion library: libfuse2. This library is available
+ from the FUSE homepage:
<http://fuse.sourceforge.net/>
+ although chances are your distribution already has that library
+ installed if you've installed the "fuse" package itself.
See <file:Documentation/filesystems/fuse.txt> for more information.
See <file:Documentation/Changes> for needed library/utility version.
If you want to develop a userspace FS, or if you want to use
a filesystem based on FUSE, answer Y or M.
+
+config CUSE
+ tristate "Character device in Userspace support"
+ depends on FUSE_FS
+ help
+ This FUSE extension allows character devices to be
+ implemented in userspace.
+
+ If you want to develop or use a userspace character device
+ based on CUSE, answer Y or M.
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 75a20c092dd4..b7978b9f75ef 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -23,7 +23,7 @@ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
{
struct fuse_conn *fc;
mutex_lock(&fuse_mutex);
- fc = file->f_path.dentry->d_inode->i_private;
+ fc = file_inode(file)->i_private;
if (fc)
fc = fuse_conn_get(fc);
mutex_unlock(&fuse_mutex);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index ee8d55042298..6f96a8def147 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -45,7 +45,6 @@
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/module.h>
@@ -63,7 +62,7 @@ struct cuse_conn {
bool unrestricted_ioctl;
};
-static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */
+static DEFINE_MUTEX(cuse_lock); /* protects registration */
static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
static struct class *cuse_class;
@@ -92,19 +91,22 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
loff_t *ppos)
{
loff_t pos = 0;
+ struct iovec iov = { .iov_base = buf, .iov_len = count };
- return fuse_direct_io(file, buf, count, &pos, 0);
+ return fuse_direct_io(file, &iov, 1, count, &pos, 0);
}
static ssize_t cuse_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
loff_t pos = 0;
+ struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
+
/*
* No locking or generic_write_checks(), the server is
* responsible for locking and sanity checks.
*/
- return fuse_direct_io(file, buf, count, &pos, 1);
+ return fuse_direct_io(file, &iov, 1, count, &pos, 1);
}
static int cuse_open(struct inode *inode, struct file *file)
@@ -114,14 +116,14 @@ static int cuse_open(struct inode *inode, struct file *file)
int rc;
/* look up and get the connection */
- spin_lock(&cuse_lock);
+ mutex_lock(&cuse_lock);
list_for_each_entry(pos, cuse_conntbl_head(devt), list)
if (pos->dev->devt == devt) {
fuse_conn_get(&pos->fc);
cc = pos;
break;
}
- spin_unlock(&cuse_lock);
+ mutex_unlock(&cuse_lock);
/* dead? */
if (!cc)
@@ -267,7 +269,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
{
char *end = p + len;
- char *key, *val;
+ char *uninitialized_var(key), *uninitialized_var(val);
int rc;
while (true) {
@@ -305,14 +307,14 @@ static void cuse_gendev_release(struct device *dev)
*/
static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
- struct cuse_conn *cc = fc_to_cc(fc);
+ struct cuse_conn *cc = fc_to_cc(fc), *pos;
struct cuse_init_out *arg = req->out.args[0].value;
struct page *page = req->pages[0];
struct cuse_devinfo devinfo = { };
struct device *dev;
struct cdev *cdev;
dev_t devt;
- int rc;
+ int rc, i;
if (req->out.h.error ||
arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
@@ -356,15 +358,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
dev_set_drvdata(dev, cc);
dev_set_name(dev, "%s", devinfo.name);
+ mutex_lock(&cuse_lock);
+
+ /* make sure the device-name is unique */
+ for (i = 0; i < CUSE_CONNTBL_LEN; ++i) {
+ list_for_each_entry(pos, &cuse_conntbl[i], list)
+ if (!strcmp(dev_name(pos->dev), dev_name(dev)))
+ goto err_unlock;
+ }
+
rc = device_add(dev);
if (rc)
- goto err_device;
+ goto err_unlock;
/* register cdev */
rc = -ENOMEM;
cdev = cdev_alloc();
if (!cdev)
- goto err_device;
+ goto err_unlock;
cdev->owner = THIS_MODULE;
cdev->ops = &cuse_frontend_fops;
@@ -377,9 +388,8 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
cc->cdev = cdev;
/* make the device available */
- spin_lock(&cuse_lock);
list_add(&cc->list, cuse_conntbl_head(devt));
- spin_unlock(&cuse_lock);
+ mutex_unlock(&cuse_lock);
/* announce device availability */
dev_set_uevent_suppress(dev, 0);
@@ -391,7 +401,8 @@ out:
err_cdev:
cdev_del(cdev);
-err_device:
+err_unlock:
+ mutex_unlock(&cuse_lock);
put_device(dev);
err_region:
unregister_chrdev_region(devt, 1);
@@ -411,7 +422,7 @@ static int cuse_send_init(struct cuse_conn *cc)
BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, 1);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
goto err;
@@ -441,6 +452,7 @@ static int cuse_send_init(struct cuse_conn *cc)
req->out.argvar = 1;
req->out.argpages = 1;
req->pages[0] = page;
+ req->page_descs[0].length = req->out.args[1].size;
req->num_pages = 1;
req->end = cuse_process_init_reply;
fuse_request_send_background(fc, req);
@@ -520,9 +532,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
int rc;
/* remove from the conntbl, no more access from this point on */
- spin_lock(&cuse_lock);
+ mutex_lock(&cuse_lock);
list_del_init(&cc->list);
- spin_unlock(&cuse_lock);
+ mutex_unlock(&cuse_lock);
/* remove device */
if (cc->dev)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c16335315e5d..11dfa0c3fb46 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -34,34 +34,67 @@ static struct fuse_conn *fuse_get_conn(struct file *file)
return file->private_data;
}
-static void fuse_request_init(struct fuse_req *req)
+static void fuse_request_init(struct fuse_req *req, struct page **pages,
+ struct fuse_page_desc *page_descs,
+ unsigned npages)
{
memset(req, 0, sizeof(*req));
+ memset(pages, 0, sizeof(*pages) * npages);
+ memset(page_descs, 0, sizeof(*page_descs) * npages);
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
atomic_set(&req->count, 1);
+ req->pages = pages;
+ req->page_descs = page_descs;
+ req->max_pages = npages;
}
-struct fuse_req *fuse_request_alloc(void)
+static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
{
- struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL);
- if (req)
- fuse_request_init(req);
+ struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
+ if (req) {
+ struct page **pages;
+ struct fuse_page_desc *page_descs;
+
+ if (npages <= FUSE_REQ_INLINE_PAGES) {
+ pages = req->inline_pages;
+ page_descs = req->inline_page_descs;
+ } else {
+ pages = kmalloc(sizeof(struct page *) * npages, flags);
+ page_descs = kmalloc(sizeof(struct fuse_page_desc) *
+ npages, flags);
+ }
+
+ if (!pages || !page_descs) {
+ kfree(pages);
+ kfree(page_descs);
+ kmem_cache_free(fuse_req_cachep, req);
+ return NULL;
+ }
+
+ fuse_request_init(req, pages, page_descs, npages);
+ }
return req;
}
+
+struct fuse_req *fuse_request_alloc(unsigned npages)
+{
+ return __fuse_request_alloc(npages, GFP_KERNEL);
+}
EXPORT_SYMBOL_GPL(fuse_request_alloc);
-struct fuse_req *fuse_request_alloc_nofs(void)
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
{
- struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS);
- if (req)
- fuse_request_init(req);
- return req;
+ return __fuse_request_alloc(npages, GFP_NOFS);
}
void fuse_request_free(struct fuse_req *req)
{
+ if (req->pages != req->inline_pages) {
+ kfree(req->pages);
+ kfree(req->page_descs);
+ }
kmem_cache_free(fuse_req_cachep, req);
}
@@ -97,7 +130,7 @@ static void fuse_req_init_context(struct fuse_req *req)
req->in.h.pid = current->pid;
}
-struct fuse_req *fuse_get_req(struct fuse_conn *fc)
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
{
struct fuse_req *req;
sigset_t oldset;
@@ -116,7 +149,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
if (!fc->connected)
goto out;
- req = fuse_request_alloc();
+ req = fuse_request_alloc(npages);
err = -ENOMEM;
if (!req)
goto out;
@@ -165,7 +198,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
struct fuse_file *ff = file->private_data;
spin_lock(&fc->lock);
- fuse_request_init(req);
+ fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
BUG_ON(ff->reserved_req);
ff->reserved_req = req;
wake_up_all(&fc->reserved_req_waitq);
@@ -186,13 +219,14 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
* filesystem should not have it's own file open. If deadlock is
* intentional, it can still be broken by "aborting" the filesystem.
*/
-struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file)
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+ struct file *file)
{
struct fuse_req *req;
atomic_inc(&fc->num_waiting);
wait_event(fc->blocked_waitq, !fc->blocked);
- req = fuse_request_alloc();
+ req = fuse_request_alloc(0);
if (!req)
req = get_reserved_req(fc, file);
@@ -406,9 +440,8 @@ __acquires(fc->lock)
}
}
-void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
{
- req->isreply = 1;
spin_lock(&fc->lock);
if (!fc->connected)
req->out.h.error = -ENOTCONN;
@@ -425,6 +458,12 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
}
spin_unlock(&fc->lock);
}
+
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+ req->isreply = 1;
+ __fuse_request_send(fc, req);
+}
EXPORT_SYMBOL_GPL(fuse_request_send);
static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
@@ -491,6 +530,27 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,
fuse_request_send_nowait_locked(fc, req);
}
+void fuse_force_forget(struct file *file, u64 nodeid)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ struct fuse_forget_in inarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.nlookup = 1;
+ req = fuse_get_req_nofail_nopages(fc, file);
+ req->in.h.opcode = FUSE_FORGET;
+ req->in.h.nodeid = nodeid;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].value = &inarg;
+ req->isreply = 0;
+ __fuse_request_send(fc, req);
+ /* ignore errors */
+ fuse_put_request(fc, req);
+}
+
/*
* Lock the request. Up to the next unlock_request() there mustn't be
* anything that could cause a page-fault. If the request was already
@@ -692,8 +752,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
struct page *oldpage = *pagep;
struct page *newpage;
struct pipe_buffer *buf = cs->pipebufs;
- struct address_space *mapping;
- pgoff_t index;
unlock_request(cs->fc, cs->req);
fuse_copy_finish(cs);
@@ -724,9 +782,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (fuse_check_page(newpage) != 0)
goto out_fallback_unlock;
- mapping = oldpage->mapping;
- index = oldpage->index;
-
/*
* This is a new and locked page, it shouldn't be mapped or
* have any special flags on it
@@ -855,11 +910,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
{
unsigned i;
struct fuse_req *req = cs->req;
- unsigned offset = req->page_offset;
- unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
int err;
+ unsigned offset = req->page_descs[i].offset;
+ unsigned count = min(nbytes, req->page_descs[i].length);
err = fuse_copy_page(cs, &req->pages[i], offset, count,
zeroing);
@@ -867,8 +922,6 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
return err;
nbytes -= count;
- count = min(nbytes, (unsigned) PAGE_SIZE);
- offset = 0;
}
return 0;
}
@@ -1541,29 +1594,34 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
unsigned int num;
unsigned int offset;
size_t total_len = 0;
+ int num_pages;
+
+ offset = outarg->offset & ~PAGE_CACHE_MASK;
+ file_size = i_size_read(inode);
+
+ num = outarg->size;
+ if (outarg->offset > file_size)
+ num = 0;
+ else if (outarg->offset + num > file_size)
+ num = file_size - outarg->offset;
- req = fuse_get_req(fc);
+ num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ);
+
+ req = fuse_get_req(fc, num_pages);
if (IS_ERR(req))
return PTR_ERR(req);
- offset = outarg->offset & ~PAGE_CACHE_MASK;
-
req->in.h.opcode = FUSE_NOTIFY_REPLY;
req->in.h.nodeid = outarg->nodeid;
req->in.numargs = 2;
req->in.argpages = 1;
- req->page_offset = offset;
+ req->page_descs[0].offset = offset;
req->end = fuse_retrieve_end;
index = outarg->offset >> PAGE_CACHE_SHIFT;
- file_size = i_size_read(inode);
- num = outarg->size;
- if (outarg->offset > file_size)
- num = 0;
- else if (outarg->offset + num > file_size)
- num = file_size - outarg->offset;
- while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) {
+ while (num && req->num_pages < num_pages) {
struct page *page;
unsigned int this_num;
@@ -1573,6 +1631,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].length = this_num;
req->num_pages++;
offset = 0;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b7c09f9eb40c..ff15522481d4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,6 +14,29 @@
#include <linux/namei.h>
#include <linux/slab.h>
+static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
+{
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_inode *fi = get_fuse_inode(dir);
+
+ if (!fc->do_readdirplus)
+ return false;
+ if (!fc->readdirplus_auto)
+ return true;
+ if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
+ return true;
+ if (filp->f_pos == 0)
+ return true;
+ return false;
+}
+
+static void fuse_advise_use_readdirplus(struct inode *dir)
+{
+ struct fuse_inode *fi = get_fuse_inode(dir);
+
+ set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
+}
+
#if BITS_PER_LONG >= 64
static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
{
@@ -178,7 +201,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
return -ECHILD;
fc = get_fuse_conn(inode);
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return 0;
@@ -219,6 +242,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
attr_version);
fuse_change_entry_timeout(entry, &outarg);
}
+ fuse_advise_use_readdirplus(inode);
return 1;
}
@@ -271,7 +295,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
if (name->len > FUSE_NAME_MAX)
goto out;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
err = PTR_ERR(req);
if (IS_ERR(req))
goto out;
@@ -355,6 +379,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
else
fuse_invalidate_entry_cache(entry);
+ fuse_advise_use_readdirplus(dir);
return newent;
out_iput:
@@ -391,7 +416,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
if (!forget)
goto out_err;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
err = PTR_ERR(req);
if (IS_ERR(req))
goto out_put_forget_req;
@@ -592,7 +617,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
{
struct fuse_mknod_in inarg;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -623,7 +648,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -647,7 +672,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
{
struct fuse_conn *fc = get_fuse_conn(dir);
unsigned len = strlen(link) + 1;
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -664,7 +689,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -682,7 +707,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
spin_lock(&fc->lock);
fi->attr_version = ++fc->attr_version;
- drop_nlink(inode);
+ /*
+ * If i_nlink == 0 then unlink doesn't make sense, yet this can
+ * happen if userspace filesystem is careless. It would be
+ * difficult to enforce correct nlink usage so just ignore this
+ * condition here
+ */
+ if (inode->i_nlink > 0)
+ drop_nlink(inode);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
fuse_invalidate_attr(dir);
@@ -696,7 +728,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
{
int err;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -723,7 +755,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
int err;
struct fuse_rename_in inarg;
struct fuse_conn *fc = get_fuse_conn(olddir);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -776,7 +808,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
struct fuse_link_in inarg;
struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -848,7 +880,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
struct fuse_req *req;
u64 attr_version;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -985,7 +1017,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
/*
* Calling into a user-controlled filesystem gives the filesystem
- * daemon ptrace-like capabilities over the requester process. This
+ * daemon ptrace-like capabilities over the current process. This
* means, that the filesystem daemon is able to record the exact
* filesystem operations performed, and can also control the behavior
* of the requester process in otherwise impossible ways. For example
@@ -996,27 +1028,23 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
* for which the owner of the mount has ptrace privilege. This
* excludes processes started by other users, suid or sgid processes.
*/
-int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
+int fuse_allow_current_process(struct fuse_conn *fc)
{
const struct cred *cred;
- int ret;
if (fc->flags & FUSE_ALLOW_OTHER)
return 1;
- rcu_read_lock();
- ret = 0;
- cred = __task_cred(task);
+ cred = current_cred();
if (uid_eq(cred->euid, fc->user_id) &&
uid_eq(cred->suid, fc->user_id) &&
uid_eq(cred->uid, fc->user_id) &&
gid_eq(cred->egid, fc->group_id) &&
gid_eq(cred->sgid, fc->group_id) &&
gid_eq(cred->gid, fc->group_id))
- ret = 1;
- rcu_read_unlock();
+ return 1;
- return ret;
+ return 0;
}
static int fuse_access(struct inode *inode, int mask)
@@ -1029,7 +1057,7 @@ static int fuse_access(struct inode *inode, int mask)
if (fc->no_access)
return 0;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1077,7 +1105,7 @@ static int fuse_permission(struct inode *inode, int mask)
bool refreshed = false;
int err = 0;
- if (!fuse_allow_task(fc, current))
+ if (!fuse_allow_current_process(fc))
return -EACCES;
/*
@@ -1155,19 +1183,157 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
return 0;
}
-static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+static int fuse_direntplus_link(struct file *file,
+ struct fuse_direntplus *direntplus,
+ u64 attr_version)
{
int err;
+ struct fuse_entry_out *o = &direntplus->entry_out;
+ struct fuse_dirent *dirent = &direntplus->dirent;
+ struct dentry *parent = file->f_path.dentry;
+ struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
+ struct dentry *dentry;
+ struct dentry *alias;
+ struct inode *dir = parent->d_inode;
+ struct fuse_conn *fc;
+ struct inode *inode;
+
+ if (!o->nodeid) {
+ /*
+ * Unlike in the case of fuse_lookup, zero nodeid does not mean
+ * ENOENT. Instead, it only means the userspace filesystem did
+ * not want to return attributes/handle for this entry.
+ *
+ * So do nothing.
+ */
+ return 0;
+ }
+
+ if (name.name[0] == '.') {
+ /*
+ * We could potentially refresh the attributes of the directory
+ * and its parent?
+ */
+ if (name.len == 1)
+ return 0;
+ if (name.name[1] == '.' && name.len == 2)
+ return 0;
+ }
+ fc = get_fuse_conn(dir);
+
+ name.hash = full_name_hash(name.name, name.len);
+ dentry = d_lookup(parent, &name);
+ if (dentry && dentry->d_inode) {
+ inode = dentry->d_inode;
+ if (get_node_id(inode) == o->nodeid) {
+ struct fuse_inode *fi;
+ fi = get_fuse_inode(inode);
+ spin_lock(&fc->lock);
+ fi->nlookup++;
+ spin_unlock(&fc->lock);
+
+ /*
+ * The other branch to 'found' comes via fuse_iget()
+ * which bumps nlookup inside
+ */
+ goto found;
+ }
+ err = d_invalidate(dentry);
+ if (err)
+ goto out;
+ dput(dentry);
+ dentry = NULL;
+ }
+
+ dentry = d_alloc(parent, &name);
+ err = -ENOMEM;
+ if (!dentry)
+ goto out;
+
+ inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+ &o->attr, entry_attr_timeout(o), attr_version);
+ if (!inode)
+ goto out;
+
+ alias = d_materialise_unique(dentry, inode);
+ err = PTR_ERR(alias);
+ if (IS_ERR(alias))
+ goto out;
+ if (alias) {
+ dput(dentry);
+ dentry = alias;
+ }
+
+found:
+ fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o),
+ attr_version);
+
+ fuse_change_entry_timeout(dentry, o);
+
+ err = 0;
+out:
+ if (dentry)
+ dput(dentry);
+ return err;
+}
+
+static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
+ void *dstbuf, filldir_t filldir, u64 attr_version)
+{
+ struct fuse_direntplus *direntplus;
+ struct fuse_dirent *dirent;
+ size_t reclen;
+ int over = 0;
+ int ret;
+
+ while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
+ direntplus = (struct fuse_direntplus *) buf;
+ dirent = &direntplus->dirent;
+ reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
+
+ if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+ return -EIO;
+ if (reclen > nbytes)
+ break;
+
+ if (!over) {
+ /* We fill entries into dstbuf only as much as
+ it can hold. But we still continue iterating
+ over remaining entries to link them. If not,
+ we need to send a FORGET for each of those
+ which we did not link.
+ */
+ over = filldir(dstbuf, dirent->name, dirent->namelen,
+ file->f_pos, dirent->ino,
+ dirent->type);
+ file->f_pos = dirent->off;
+ }
+
+ buf += reclen;
+ nbytes -= reclen;
+
+ ret = fuse_direntplus_link(file, direntplus, attr_version);
+ if (ret)
+ fuse_force_forget(file, direntplus->entry_out.nodeid);
+ }
+
+ return 0;
+}
+
+static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+{
+ int plus, err;
size_t nbytes;
struct page *page;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
+ u64 attr_version = 0;
if (is_bad_inode(inode))
return -EIO;
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, 1);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1176,17 +1342,34 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
fuse_put_request(fc, req);
return -ENOMEM;
}
+
+ plus = fuse_use_readdirplus(inode, file);
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
- fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR);
+ req->page_descs[0].length = PAGE_SIZE;
+ if (plus) {
+ attr_version = fuse_get_attr_version(fc);
+ fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+ FUSE_READDIRPLUS);
+ } else {
+ fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+ FUSE_READDIR);
+ }
fuse_request_send(fc, req);
nbytes = req->out.args[0].size;
err = req->out.h.error;
fuse_put_request(fc, req);
- if (!err)
- err = parse_dirfile(page_address(page), nbytes, file, dstbuf,
- filldir);
+ if (!err) {
+ if (plus) {
+ err = parse_dirplusfile(page_address(page), nbytes,
+ file, dstbuf, filldir,
+ attr_version);
+ } else {
+ err = parse_dirfile(page_address(page), nbytes, file,
+ dstbuf, filldir);
+ }
+ }
__free_page(page);
fuse_invalidate_attr(inode); /* atime changed */
@@ -1197,7 +1380,7 @@ static char *read_link(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
char *link;
if (IS_ERR(req))
@@ -1391,7 +1574,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
loff_t oldsize;
int err;
- if (!fuse_allow_task(fc, current))
+ if (!fuse_allow_current_process(fc))
return -EACCES;
if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
@@ -1410,7 +1593,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
if (attr->ia_valid & ATTR_SIZE)
is_truncate = true;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1500,7 +1683,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!fuse_allow_task(fc, current))
+ if (!fuse_allow_current_process(fc))
return -EACCES;
return fuse_update_attributes(inode, stat, NULL, NULL);
@@ -1518,7 +1701,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
if (fc->no_setxattr)
return -EOPNOTSUPP;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1557,7 +1740,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
if (fc->no_getxattr)
return -EOPNOTSUPP;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1603,13 +1786,13 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
struct fuse_getxattr_out outarg;
ssize_t ret;
- if (!fuse_allow_task(fc, current))
+ if (!fuse_allow_current_process(fc))
return -EACCES;
if (fc->no_listxattr)
return -EOPNOTSUPP;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1654,7 +1837,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
if (fc->no_removexattr)
return -EOPNOTSUPP;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e21d4d8f87e3..34b80ba95bad 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -25,7 +25,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
struct fuse_req *req;
int err;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -57,7 +57,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
return NULL;
ff->fc = fc;
- ff->reserved_req = fuse_request_alloc();
+ ff->reserved_req = fuse_request_alloc(0);
if (unlikely(!ff->reserved_req)) {
kfree(ff);
return NULL;
@@ -355,7 +355,7 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
static int fuse_flush(struct file *file, fl_owner_t id)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_file *ff = file->private_data;
struct fuse_req *req;
@@ -368,7 +368,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (fc->no_flush)
return 0;
- req = fuse_get_req_nofail(fc, file);
+ req = fuse_get_req_nofail_nopages(fc, file);
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
inarg.lock_owner = fuse_lock_owner_id(fc, id);
@@ -436,7 +436,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
fuse_sync_writes(inode);
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
@@ -544,7 +544,7 @@ static int fuse_readpage(struct file *file, struct page *page)
*/
fuse_wait_on_page_writeback(inode, page->index);
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, 1);
err = PTR_ERR(req);
if (IS_ERR(req))
goto out;
@@ -555,6 +555,7 @@ static int fuse_readpage(struct file *file, struct page *page)
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
+ req->page_descs[0].length = count;
num_read = fuse_send_read(req, file, pos, count, NULL);
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -641,6 +642,7 @@ struct fuse_fill_data {
struct fuse_req *req;
struct file *file;
struct inode *inode;
+ unsigned nr_pages;
};
static int fuse_readpages_fill(void *_data, struct page *page)
@@ -656,16 +658,26 @@ static int fuse_readpages_fill(void *_data, struct page *page)
(req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
(req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+ int nr_alloc = min_t(unsigned, data->nr_pages,
+ FUSE_MAX_PAGES_PER_REQ);
fuse_send_readpages(req, data->file);
- data->req = req = fuse_get_req(fc);
+ data->req = req = fuse_get_req(fc, nr_alloc);
if (IS_ERR(req)) {
unlock_page(page);
return PTR_ERR(req);
}
}
+
+ if (WARN_ON(req->num_pages >= req->max_pages)) {
+ fuse_put_request(fc, req);
+ return -EIO;
+ }
+
page_cache_get(page);
req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].length = PAGE_SIZE;
req->num_pages++;
+ data->nr_pages--;
return 0;
}
@@ -676,6 +688,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_data data;
int err;
+ int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);
err = -EIO;
if (is_bad_inode(inode))
@@ -683,7 +696,8 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
data.file = file;
data.inode = inode;
- data.req = fuse_get_req(fc);
+ data.req = fuse_get_req(fc, nr_alloc);
+ data.nr_pages = nr_pages;
err = PTR_ERR(data.req);
if (IS_ERR(data.req))
goto out;
@@ -786,7 +800,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
res = fuse_send_write(req, file, pos, count, NULL);
- offset = req->page_offset;
+ offset = req->page_descs[0].offset;
count = res;
for (i = 0; i < req->num_pages; i++) {
struct page *page = req->pages[i];
@@ -817,7 +831,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
int err;
req->in.argpages = 1;
- req->page_offset = offset;
+ req->page_descs[0].offset = offset;
do {
size_t tmp;
@@ -857,6 +871,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
err = 0;
req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].length = tmp;
req->num_pages++;
iov_iter_advance(ii, tmp);
@@ -869,11 +884,19 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
if (!fc->big_writes)
break;
} while (iov_iter_count(ii) && count < fc->max_write &&
- req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0);
+ req->num_pages < req->max_pages && offset == 0);
return count > 0 ? count : err;
}
+static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
+{
+ return min_t(unsigned,
+ ((pos + len - 1) >> PAGE_CACHE_SHIFT) -
+ (pos >> PAGE_CACHE_SHIFT) + 1,
+ FUSE_MAX_PAGES_PER_REQ);
+}
+
static ssize_t fuse_perform_write(struct file *file,
struct address_space *mapping,
struct iov_iter *ii, loff_t pos)
@@ -889,8 +912,9 @@ static ssize_t fuse_perform_write(struct file *file,
do {
struct fuse_req *req;
ssize_t count;
+ unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii));
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, nr_pages);
if (IS_ERR(req)) {
err = PTR_ERR(req);
break;
@@ -1023,47 +1047,110 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
}
}
-static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
+static inline void fuse_page_descs_length_init(struct fuse_req *req,
+ unsigned index, unsigned nr_pages)
+{
+ int i;
+
+ for (i = index; i < index + nr_pages; i++)
+ req->page_descs[i].length = PAGE_SIZE -
+ req->page_descs[i].offset;
+}
+
+static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
+{
+ return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+}
+
+static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
+ size_t max_size)
+{
+ return min(iov_iter_single_seg_count(ii), max_size);
+}
+
+static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
size_t *nbytesp, int write)
{
- size_t nbytes = *nbytesp;
- unsigned long user_addr = (unsigned long) buf;
- unsigned offset = user_addr & ~PAGE_MASK;
- int npages;
+ size_t nbytes = 0; /* # bytes already packed in req */
/* Special case for kernel I/O: can copy directly into the buffer */
if (segment_eq(get_fs(), KERNEL_DS)) {
+ unsigned long user_addr = fuse_get_user_addr(ii);
+ size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
+
if (write)
req->in.args[1].value = (void *) user_addr;
else
req->out.args[0].value = (void *) user_addr;
+ iov_iter_advance(ii, frag_size);
+ *nbytesp = frag_size;
return 0;
}
- nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
- npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
- npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
- npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
- if (npages < 0)
- return npages;
+ while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
+ unsigned npages;
+ unsigned long user_addr = fuse_get_user_addr(ii);
+ unsigned offset = user_addr & ~PAGE_MASK;
+ size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes);
+ int ret;
+
+ unsigned n = req->max_pages - req->num_pages;
+ frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT);
+
+ npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ npages = clamp(npages, 1U, n);
+
+ ret = get_user_pages_fast(user_addr, npages, !write,
+ &req->pages[req->num_pages]);
+ if (ret < 0)
+ return ret;
- req->num_pages = npages;
- req->page_offset = offset;
+ npages = ret;
+ frag_size = min_t(size_t, frag_size,
+ (npages << PAGE_SHIFT) - offset);
+ iov_iter_advance(ii, frag_size);
+
+ req->page_descs[req->num_pages].offset = offset;
+ fuse_page_descs_length_init(req, req->num_pages, npages);
+
+ req->num_pages += npages;
+ req->page_descs[req->num_pages - 1].length -=
+ (npages << PAGE_SHIFT) - offset - frag_size;
+
+ nbytes += frag_size;
+ }
if (write)
req->in.argpages = 1;
else
req->out.argpages = 1;
- nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
- *nbytesp = min(*nbytesp, nbytes);
+ *nbytesp = nbytes;
return 0;
}
-ssize_t fuse_direct_io(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos, int write)
+static inline int fuse_iter_npages(const struct iov_iter *ii_p)
+{
+ struct iov_iter ii = *ii_p;
+ int npages = 0;
+
+ while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) {
+ unsigned long user_addr = fuse_get_user_addr(&ii);
+ unsigned offset = user_addr & ~PAGE_MASK;
+ size_t frag_size = iov_iter_single_seg_count(&ii);
+
+ npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ iov_iter_advance(&ii, frag_size);
+ }
+
+ return min(npages, FUSE_MAX_PAGES_PER_REQ);
+}
+
+ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, size_t count, loff_t *ppos,
+ int write)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
@@ -1071,8 +1158,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
loff_t pos = *ppos;
ssize_t res = 0;
struct fuse_req *req;
+ struct iov_iter ii;
+
+ iov_iter_init(&ii, iov, nr_segs, count, 0);
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, fuse_iter_npages(&ii));
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1080,7 +1170,7 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
size_t nres;
fl_owner_t owner = current->files;
size_t nbytes = min(count, nmax);
- int err = fuse_get_user_pages(req, buf, &nbytes, write);
+ int err = fuse_get_user_pages(req, &ii, &nbytes, write);
if (err) {
res = err;
break;
@@ -1103,12 +1193,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
count -= nres;
res += nres;
pos += nres;
- buf += nres;
if (nres != nbytes)
break;
if (count) {
fuse_put_request(fc, req);
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, fuse_iter_npages(&ii));
if (IS_ERR(req))
break;
}
@@ -1122,31 +1211,40 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
-static ssize_t fuse_direct_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
{
ssize_t res;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
if (is_bad_inode(inode))
return -EIO;
- res = fuse_direct_io(file, buf, count, ppos, 0);
+ res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs),
+ ppos, 0);
fuse_invalidate_attr(inode);
return res;
}
-static ssize_t __fuse_direct_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t fuse_direct_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct iovec iov = { .iov_base = buf, .iov_len = count };
+ return __fuse_direct_read(file, &iov, 1, ppos);
+}
+
+static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct inode *inode = file_inode(file);
+ size_t count = iov_length(iov, nr_segs);
ssize_t res;
res = generic_write_checks(file, ppos, &count, 0);
if (!res) {
- res = fuse_direct_io(file, buf, count, ppos, 1);
+ res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1);
if (res > 0)
fuse_write_update_size(inode, *ppos);
}
@@ -1159,7 +1257,8 @@ static ssize_t __fuse_direct_write(struct file *file, const char __user *buf,
static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
+ struct inode *inode = file_inode(file);
ssize_t res;
if (is_bad_inode(inode))
@@ -1167,7 +1266,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
/* Don't allow parallel writes to the same file */
mutex_lock(&inode->i_mutex);
- res = __fuse_direct_write(file, buf, count, ppos);
+ res = __fuse_direct_write(file, &iov, 1, ppos);
mutex_unlock(&inode->i_mutex);
return res;
@@ -1272,7 +1371,7 @@ static int fuse_writepage_locked(struct page *page)
set_page_writeback(page);
- req = fuse_request_alloc_nofs();
+ req = fuse_request_alloc_nofs(1);
if (!req)
goto err;
@@ -1293,7 +1392,8 @@ static int fuse_writepage_locked(struct page *page)
req->in.argpages = 1;
req->num_pages = 1;
req->pages[0] = tmp_page;
- req->page_offset = 0;
+ req->page_descs[0].offset = 0;
+ req->page_descs[0].length = PAGE_SIZE;
req->end = fuse_writepage_end;
req->inode = inode;
@@ -1385,7 +1485,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff = file->private_data;
@@ -1443,7 +1543,7 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file,
const struct file_lock *fl, int opcode, pid_t pid,
int flock)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_file *ff = file->private_data;
struct fuse_lk_in *arg = &req->misc.lk_in;
@@ -1465,13 +1565,13 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file,
static int fuse_getlk(struct file *file, struct file_lock *fl)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
struct fuse_lk_out outarg;
int err;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1490,7 +1590,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
@@ -1506,7 +1606,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
if (fl->fl_flags & FL_CLOSE)
return 0;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1522,7 +1622,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
@@ -1545,7 +1645,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
@@ -1575,7 +1675,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
if (!inode->i_sb->s_bdev || fc->no_bmap)
return 0;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return 0;
@@ -1602,7 +1702,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
{
loff_t retval;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
/* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
if (whence == SEEK_CUR || whence == SEEK_SET)
@@ -1873,7 +1973,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
num_pages++;
}
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, num_pages);
if (IS_ERR(req)) {
err = PTR_ERR(req);
req = NULL;
@@ -1881,6 +1981,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
}
memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
req->num_pages = num_pages;
+ fuse_page_descs_length_init(req, 0, req->num_pages);
/* okay, let's send it to the client */
req->in.h.opcode = FUSE_IOCTL;
@@ -1978,10 +2079,10 @@ EXPORT_SYMBOL_GPL(fuse_do_ioctl);
long fuse_ioctl_common(struct file *file, unsigned int cmd,
unsigned long arg, unsigned int flags)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!fuse_allow_task(fc, current))
+ if (!fuse_allow_current_process(fc))
return -EACCES;
if (is_bad_inode(inode))
@@ -2066,6 +2167,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)
return DEFAULT_POLLMASK;
poll_wait(file, &ff->poll_wait, wait);
+ inarg.events = (__u32)poll_requested_events(wait);
/*
* Ask for notification iff there's someone waiting for it.
@@ -2076,7 +2178,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)
fuse_register_polled_file(fc, ff);
}
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return POLLERR;
@@ -2126,41 +2228,6 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,
return 0;
}
-static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos, int rw)
-{
- const struct iovec *vector = iov;
- ssize_t ret = 0;
-
- while (nr_segs > 0) {
- void __user *base;
- size_t len;
- ssize_t nr;
-
- base = vector->iov_base;
- len = vector->iov_len;
- vector++;
- nr_segs--;
-
- if (rw == WRITE)
- nr = __fuse_direct_write(filp, base, len, ppos);
- else
- nr = fuse_direct_read(filp, base, len, ppos);
-
- if (nr < 0) {
- if (!ret)
- ret = nr;
- break;
- }
- ret += nr;
- if (nr != len)
- break;
- }
-
- return ret;
-}
-
-
static ssize_t
fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
@@ -2172,13 +2239,16 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
file = iocb->ki_filp;
pos = offset;
- ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw);
+ if (rw == WRITE)
+ ret = __fuse_direct_write(file, iov, nr_segs, &pos);
+ else
+ ret = __fuse_direct_read(file, iov, nr_segs, &pos);
return ret;
}
-long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
- loff_t length)
+static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t length)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
@@ -2194,7 +2264,7 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
if (fc->no_fallocate)
return -EOPNOTSUPP;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -2213,7 +2283,6 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
return err;
}
-EXPORT_SYMBOL_GPL(fuse_file_fallocate);
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e105a53fc72d..6aeba864f070 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -44,6 +44,9 @@
doing the mount will be allowed to access the filesystem */
#define FUSE_ALLOW_OTHER (1 << 1)
+/** Number of page pointers embedded in fuse_req */
+#define FUSE_REQ_INLINE_PAGES 1
+
/** List of active connections */
extern struct list_head fuse_conn_list;
@@ -103,6 +106,15 @@ struct fuse_inode {
/** List of writepage requestst (pending or sent) */
struct list_head writepages;
+
+ /** Miscellaneous bits describing inode state */
+ unsigned long state;
+};
+
+/** FUSE inode state bits */
+enum {
+ /** Advise readdirplus */
+ FUSE_I_ADVISE_RDPLUS,
};
struct fuse_conn;
@@ -200,6 +212,12 @@ struct fuse_out {
struct fuse_arg args[3];
};
+/** FUSE page descriptor */
+struct fuse_page_desc {
+ unsigned int length;
+ unsigned int offset;
+};
+
/** The request state */
enum fuse_req_state {
FUSE_REQ_INIT = 0,
@@ -291,14 +309,23 @@ struct fuse_req {
} misc;
/** page vector */
- struct page *pages[FUSE_MAX_PAGES_PER_REQ];
+ struct page **pages;
+
+ /** page-descriptor vector */
+ struct fuse_page_desc *page_descs;
+
+ /** size of the 'pages' array */
+ unsigned max_pages;
+
+ /** inline page vector */
+ struct page *inline_pages[FUSE_REQ_INLINE_PAGES];
+
+ /** inline page-descriptor vector */
+ struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
/** number of pages in vector */
unsigned num_pages;
- /** offset of data on first page */
- unsigned page_offset;
-
/** File used in the request (or NULL) */
struct fuse_file *ff;
@@ -487,6 +514,12 @@ struct fuse_conn {
/** Use enhanced/automatic page cache invalidation. */
unsigned auto_inval_data:1;
+ /** Does the filesystem support readdirplus? */
+ unsigned do_readdirplus:1;
+
+ /** Does the filesystem want adaptive readdirplus? */
+ unsigned readdirplus_auto:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -578,6 +611,9 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
struct fuse_forget_link *fuse_alloc_forget(void);
+/* Used by READDIRPLUS */
+void fuse_force_forget(struct file *file, u64 nodeid);
+
/**
* Initialize READ or READDIR request
*/
@@ -658,9 +694,9 @@ void fuse_ctl_cleanup(void);
/**
* Allocate a request
*/
-struct fuse_req *fuse_request_alloc(void);
+struct fuse_req *fuse_request_alloc(unsigned npages);
-struct fuse_req *fuse_request_alloc_nofs(void);
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
/**
* Free a request
@@ -668,14 +704,25 @@ struct fuse_req *fuse_request_alloc_nofs(void);
void fuse_request_free(struct fuse_req *req);
/**
- * Get a request, may fail with -ENOMEM
+ * Get a request, may fail with -ENOMEM,
+ * caller should specify # elements in req->pages[] explicitly
*/
-struct fuse_req *fuse_get_req(struct fuse_conn *fc);
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages);
+
+/**
+ * Get a request, may fail with -ENOMEM,
+ * useful for callers who don't use req->pages[]
+ */
+static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc)
+{
+ return fuse_get_req(fc, 0);
+}
/**
* Gets a requests for a file operation, always succeeds
*/
-struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file);
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+ struct file *file);
/**
* Decrement reference count of a request. If count goes to zero free
@@ -739,9 +786,9 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc);
int fuse_valid_type(int m);
/**
- * Is task allowed to perform filesystem operation?
+ * Is current process allowed to perform filesystem operation?
*/
-int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task);
+int fuse_allow_current_process(struct fuse_conn *fc);
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
@@ -776,8 +823,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
bool isdir);
-ssize_t fuse_direct_io(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos, int write);
+ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, size_t count, loff_t *ppos,
+ int write);
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags);
long fuse_ioctl_common(struct file *file, unsigned int cmd,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 73ca6b72beaf..df00993ed108 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -92,6 +92,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->attr_version = 0;
fi->writectr = 0;
fi->orig_ino = 0;
+ fi->state = 0;
INIT_LIST_HEAD(&fi->write_files);
INIT_LIST_HEAD(&fi->queued_writes);
INIT_LIST_HEAD(&fi->writepages);
@@ -408,12 +409,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
struct fuse_statfs_out outarg;
int err;
- if (!fuse_allow_task(fc, current)) {
+ if (!fuse_allow_current_process(fc)) {
buf->f_type = FUSE_SUPER_MAGIC;
return 0;
}
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -678,7 +679,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
if (*max_len < len) {
*max_len = len;
- return 255;
+ return FILEID_INVALID;
}
nodeid = get_fuse_inode(inode)->nodeid;
@@ -863,6 +864,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->dont_mask = 1;
if (arg->flags & FUSE_AUTO_INVAL_DATA)
fc->auto_inval_data = 1;
+ if (arg->flags & FUSE_DO_READDIRPLUS)
+ fc->do_readdirplus = 1;
+ if (arg->flags & FUSE_READDIRPLUS_AUTO)
+ fc->readdirplus_auto = 1;
} else {
ra_pages = fc->max_read / PAGE_CACHE_SIZE;
fc->no_lock = 1;
@@ -889,7 +894,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
- FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA;
+ FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
+ FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -1034,12 +1040,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
/* only now - we want root dentry with NULL ->d_op */
sb->s_d_op = &fuse_dentry_operations;
- init_req = fuse_request_alloc();
+ init_req = fuse_request_alloc(0);
if (!init_req)
goto err_put_root;
if (is_bdev) {
- fc->destroy_req = fuse_request_alloc();
+ fc->destroy_req = fuse_request_alloc(0);
if (!fc->destroy_req)
goto err_free_init_req;
}
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index f850020ad906..f69ac0af5496 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -237,7 +237,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
return -EINVAL;
if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
return value ? -EACCES : 0;
- if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+ if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER))
return -EPERM;
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 30de4f2a2ea9..24f414f0ce61 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -51,7 +51,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
continue;
if (gfs2_is_jdata(ip))
set_buffer_uptodate(bh);
- gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ gfs2_trans_add_data(ip->i_gl, bh);
}
}
@@ -230,16 +230,14 @@ out_ignore:
}
/**
- * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk
+ * gfs2_writepages - Write a bunch of dirty pages back to disk
* @mapping: The mapping to write
* @wbc: Write-back control
*
- * For the data=writeback case we can already ignore buffer heads
- * and write whole extents at once. This is a big reduction in the
- * number of I/O requests we send and the bmap calls we make in this case.
+ * Used for both ordered and writeback modes.
*/
-static int gfs2_writeback_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int gfs2_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
}
@@ -852,7 +850,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
goto failed;
}
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
if (gfs2_is_stuffed(ip))
return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
@@ -1102,7 +1100,7 @@ cannot_release:
static const struct address_space_operations gfs2_writeback_aops = {
.writepage = gfs2_writeback_writepage,
- .writepages = gfs2_writeback_writepages,
+ .writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readpages = gfs2_readpages,
.write_begin = gfs2_write_begin,
@@ -1118,6 +1116,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
static const struct address_space_operations gfs2_ordered_aops = {
.writepage = gfs2_ordered_writepage,
+ .writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readpages = gfs2_readpages,
.write_begin = gfs2_write_begin,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index a68e91bcef3d..5e83657f046e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -22,6 +22,7 @@
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
+#include "log.h"
#include "super.h"
#include "trans.h"
#include "dir.h"
@@ -93,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
if (!gfs2_is_jdata(ip))
mark_buffer_dirty(bh);
if (!gfs2_is_writeback(ip))
- gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ gfs2_trans_add_data(ip->i_gl, bh);
if (release) {
unlock_page(page);
@@ -153,7 +154,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
/* Set up the pointer to the new block */
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
di = (struct gfs2_dinode *)dibh->b_data;
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -405,7 +406,7 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp,
BUG_ON(i < 1);
BUG_ON(mp->mp_bh[i] != NULL);
mp->mp_bh[i] = gfs2_meta_new(gl, bn);
- gfs2_trans_add_bh(gl, mp->mp_bh[i], 1);
+ gfs2_trans_add_meta(gl, mp->mp_bh[i]);
gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
ptr += offset;
@@ -468,7 +469,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
BUG_ON(sheight < 1);
BUG_ON(dibh == NULL);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
if (height == sheight) {
struct buffer_head *bh;
@@ -544,7 +545,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
/* Branching from existing tree */
case ALLOC_GROW_DEPTH:
if (i > 1 && i < height)
- gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1);
+ gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
for (; i < height && n > 0; i++, n--)
gfs2_indirect_init(mp, ip->i_gl, i,
mp->mp_list[i-1], bn++);
@@ -556,7 +557,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
case ALLOC_DATA:
BUG_ON(n > dblks);
BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
- gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1);
+ gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
dblks = n;
ptr = metapointer(end_of_metadata, mp);
dblock = bn;
@@ -796,8 +797,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
down_write(&ip->i_rw_mutex);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_trans_add_meta(ip->i_gl, bh);
bstart = 0;
blen = 0;
@@ -981,7 +982,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
}
if (!gfs2_is_writeback(ip))
- gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ gfs2_trans_add_data(ip->i_gl, bh);
zero_user(page, offset, length);
mark_buffer_dirty(bh);
@@ -1046,7 +1047,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
if (error)
goto out;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
if (gfs2_is_stuffed(ip)) {
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
@@ -1098,7 +1099,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
if (error)
return error;
- error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
return error;
@@ -1137,11 +1138,12 @@ static int trunc_end(struct gfs2_inode *ip)
ip->i_height = 0;
ip->i_goal = ip->i_no_addr;
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+ gfs2_ordered_del_inode(ip);
}
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1246,7 +1248,7 @@ static int do_grow(struct inode *inode, u64 size)
i_size_write(inode, size);
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1286,6 +1288,10 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
inode_dio_wait(inode);
+ ret = gfs2_rs_alloc(GFS2_I(inode));
+ if (ret)
+ return ret;
+
oldsize = inode->i_size;
if (newsize >= oldsize)
return do_grow(inode, newsize);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 9a35670fdc38..c3e82bd23179 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -93,7 +93,7 @@ int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
struct buffer_head *bh;
bh = gfs2_meta_new(ip->i_gl, block);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
*bhp = bh;
@@ -127,7 +127,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
if (error)
return error;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
if (ip->i_inode.i_size < offset + size)
i_size_write(&ip->i_inode, offset + size);
@@ -209,7 +209,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
if (error)
goto fail;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
memcpy(bh->b_data + o, buf, amount);
brelse(bh);
@@ -231,7 +231,7 @@ out:
i_size_write(&ip->i_inode, offset + copied);
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -647,7 +647,7 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
return;
}
- gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ gfs2_trans_add_meta(dip->i_gl, bh);
/* If there is no prev entry, this is the first entry in the block.
The de_rec_len is already as big as it needs to be. Just zero
@@ -690,7 +690,7 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
totlen = be16_to_cpu(dent->de_rec_len);
BUG_ON(offset + name->len > totlen);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
ndent = (struct gfs2_dirent *)((char *)dent + offset);
dent->de_rec_len = cpu_to_be16(offset);
gfs2_qstr2dirent(name, totlen - offset, ndent);
@@ -831,7 +831,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
return NULL;
gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
leaf = (struct gfs2_leaf *)bh->b_data;
leaf->lf_depth = cpu_to_be16(depth);
@@ -916,7 +916,7 @@ static int dir_make_exhash(struct inode *inode)
/* We're done with the new leaf block, now setup the new
hash table. */
- gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(dip->i_gl, dibh);
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
@@ -976,7 +976,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
return 1; /* can't split */
}
- gfs2_trans_add_bh(dip->i_gl, obh, 1);
+ gfs2_trans_add_meta(dip->i_gl, obh);
nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
if (!nleaf) {
@@ -1069,7 +1069,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
error = gfs2_meta_inode_buffer(dip, &dibh);
if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
- gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(dip->i_gl, dibh);
gfs2_add_inode_blocks(&dip->i_inode, 1);
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
@@ -1622,7 +1622,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
return error;
} while(1);
- gfs2_trans_add_bh(ip->i_gl, obh, 1);
+ gfs2_trans_add_meta(ip->i_gl, obh);
leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
if (!leaf) {
@@ -1636,7 +1636,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
error = gfs2_meta_inode_buffer(ip, &bh);
if (error)
return error;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
gfs2_add_inode_blocks(&ip->i_inode, 1);
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
@@ -1795,7 +1795,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
if (IS_ERR(dent))
return PTR_ERR(dent);
- gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ gfs2_trans_add_meta(dip->i_gl, bh);
gfs2_inum_out(nip, dent);
dent->de_type = cpu_to_be16(new_type);
@@ -1804,7 +1804,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
error = gfs2_meta_inode_buffer(dip, &bh);
if (error)
return error;
- gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ gfs2_trans_add_meta(dip->i_gl, bh);
}
dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
@@ -1849,7 +1849,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (!ht)
return -ENOMEM;
- error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
goto out;
@@ -1917,7 +1917,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (error)
goto out_end_trans;
- gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(dip->i_gl, dibh);
/* On the last dealloc, make this a regular file in case we crash.
(We don't want to free these blocks a second time.) */
if (last_dealloc)
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 4767774a5f3e..9973df4ff565 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -37,10 +37,10 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
if (parent && (*len < GFS2_LARGE_FH_SIZE)) {
*len = GFS2_LARGE_FH_SIZE;
- return 255;
+ return FILEID_INVALID;
} else if (*len < GFS2_SMALL_FH_SIZE) {
*len = GFS2_SMALL_FH_SIZE;
- return 255;
+ return FILEID_INVALID;
}
fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 991ab2d484dd..019f45e45097 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -157,7 +157,7 @@ static const u32 gfs2_to_fsflags[32] = {
static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int error;
@@ -217,7 +217,7 @@ void gfs2_set_inode_flags(struct inode *inode)
*/
static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct buffer_head *bh;
@@ -276,7 +276,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
error = gfs2_meta_inode_buffer(ip, &bh);
if (error)
goto out_trans_end;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
ip->i_diskflags = new_flags;
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
@@ -293,7 +293,7 @@ out_drop_write:
static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
u32 fsflags, gfsflags;
if (get_user(fsflags, ptr))
@@ -336,7 +336,7 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
{
- struct inode *inode = filep->f_dentry->d_inode;
+ struct inode *inode = file_inode(filep);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *ip = GFS2_I(inode);
size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
@@ -386,7 +386,7 @@ static int gfs2_allocate_page_backing(struct page *page)
static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
unsigned long last_index;
@@ -483,7 +483,7 @@ out:
gfs2_holder_uninit(&gh);
if (ret == 0) {
set_page_dirty(page);
- wait_on_page_writeback(page);
+ wait_for_stable_page(page);
}
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
@@ -673,8 +673,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
{
struct file *file = iocb->ki_filp;
size_t writesize = iov_length(iov, nr_segs);
- struct dentry *dentry = file->f_dentry;
- struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ struct gfs2_inode *ip = GFS2_I(file_inode(file));
int ret;
ret = gfs2_rs_alloc(ip);
@@ -709,7 +708,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
if (unlikely(error))
return error;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
if (gfs2_is_stuffed(ip)) {
error = gfs2_unstuff_dinode(ip, NULL);
@@ -772,7 +771,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *ip = GFS2_I(inode);
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
@@ -938,7 +937,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct gfs2_file *fp = file->private_data;
struct gfs2_holder *fl_gh = &fp->f_fl_gh;
- struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode);
+ struct gfs2_inode *ip = GFS2_I(file_inode(file));
struct gfs2_glock *gl;
unsigned int state;
int flags;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 992c5c0cb504..cf3515546739 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -30,6 +30,7 @@
#include <linux/rculist_bl.h>
#include <linux/bit_spinlock.h>
#include <linux/percpu.h>
+#include <linux/list_sort.h>
#include "gfs2.h"
#include "incore.h"
@@ -1376,56 +1377,105 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
gfs2_glock_put(gl);
}
+static int glock_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct gfs2_glock *gla, *glb;
-static int gfs2_shrink_glock_memory(struct shrinker *shrink,
- struct shrink_control *sc)
+ gla = list_entry(a, struct gfs2_glock, gl_lru);
+ glb = list_entry(b, struct gfs2_glock, gl_lru);
+
+ if (gla->gl_name.ln_number > glb->gl_name.ln_number)
+ return 1;
+ if (gla->gl_name.ln_number < glb->gl_name.ln_number)
+ return -1;
+
+ return 0;
+}
+
+/**
+ * gfs2_dispose_glock_lru - Demote a list of glocks
+ * @list: The list to dispose of
+ *
+ * Disposing of glocks may involve disk accesses, so that here we sort
+ * the glocks by number (i.e. disk location of the inodes) so that if
+ * there are any such accesses, they'll be sent in order (mostly).
+ *
+ * Must be called under the lru_lock, but may drop and retake this
+ * lock. While the lru_lock is dropped, entries may vanish from the
+ * list, but no new entries will appear on the list (since it is
+ * private)
+ */
+
+static void gfs2_dispose_glock_lru(struct list_head *list)
+__releases(&lru_lock)
+__acquires(&lru_lock)
{
struct gfs2_glock *gl;
- int may_demote;
- int nr_skipped = 0;
- int nr = sc->nr_to_scan;
- gfp_t gfp_mask = sc->gfp_mask;
- LIST_HEAD(skipped);
- if (nr == 0)
- goto out;
+ list_sort(NULL, list, glock_cmp);
- if (!(gfp_mask & __GFP_FS))
- return -1;
+ while(!list_empty(list)) {
+ gl = list_entry(list->next, struct gfs2_glock, gl_lru);
+ list_del_init(&gl->gl_lru);
+ clear_bit(GLF_LRU, &gl->gl_flags);
+ gfs2_glock_hold(gl);
+ spin_unlock(&lru_lock);
+ spin_lock(&gl->gl_spin);
+ if (demote_ok(gl))
+ handle_callback(gl, LM_ST_UNLOCKED, 0);
+ WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
+ smp_mb__after_clear_bit();
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put_nolock(gl);
+ spin_unlock(&gl->gl_spin);
+ spin_lock(&lru_lock);
+ }
+}
+
+/**
+ * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote
+ * @nr: The number of entries to scan
+ *
+ * This function selects the entries on the LRU which are able to
+ * be demoted, and then kicks off the process by calling
+ * gfs2_dispose_glock_lru() above.
+ */
+
+static void gfs2_scan_glock_lru(int nr)
+{
+ struct gfs2_glock *gl;
+ LIST_HEAD(skipped);
+ LIST_HEAD(dispose);
spin_lock(&lru_lock);
while(nr && !list_empty(&lru_list)) {
gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
- list_del_init(&gl->gl_lru);
- clear_bit(GLF_LRU, &gl->gl_flags);
- atomic_dec(&lru_count);
/* Test for being demotable */
if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- gfs2_glock_hold(gl);
- spin_unlock(&lru_lock);
- spin_lock(&gl->gl_spin);
- may_demote = demote_ok(gl);
- if (may_demote) {
- handle_callback(gl, LM_ST_UNLOCKED, 0);
- nr--;
- }
- clear_bit(GLF_LOCK, &gl->gl_flags);
- smp_mb__after_clear_bit();
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put_nolock(gl);
- spin_unlock(&gl->gl_spin);
- spin_lock(&lru_lock);
+ list_move(&gl->gl_lru, &dispose);
+ atomic_dec(&lru_count);
+ nr--;
continue;
}
- nr_skipped++;
- list_add(&gl->gl_lru, &skipped);
- set_bit(GLF_LRU, &gl->gl_flags);
+
+ list_move(&gl->gl_lru, &skipped);
}
list_splice(&skipped, &lru_list);
- atomic_add(nr_skipped, &lru_count);
+ if (!list_empty(&dispose))
+ gfs2_dispose_glock_lru(&dispose);
spin_unlock(&lru_lock);
-out:
+}
+
+static int gfs2_shrink_glock_memory(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ if (sc->nr_to_scan) {
+ if (!(sc->gfp_mask & __GFP_FS))
+ return -1;
+ gfs2_scan_glock_lru(sc->nr_to_scan);
+ }
+
return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78d4184ffc7d..444b6503ebc4 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -322,8 +322,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
break;
};
- ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
- ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
+ i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid));
+ i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid));
gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index c373a24fedd9..156e42ec84ea 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -52,7 +52,6 @@ struct gfs2_log_header_host {
*/
struct gfs2_log_operations {
- void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
void (*lo_before_commit) (struct gfs2_sbd *sdp);
void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
void (*lo_before_scan) (struct gfs2_jdesc *jd,
@@ -341,6 +340,7 @@ enum {
GIF_QD_LOCKED = 1,
GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
+ GIF_ORDERED = 4,
};
struct gfs2_inode {
@@ -357,6 +357,7 @@ struct gfs2_inode {
struct gfs2_rgrpd *i_rgd;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
+ struct list_head i_ordered;
struct list_head i_trunc_list;
__be64 *i_hash_cache;
u32 i_entries;
@@ -391,7 +392,6 @@ struct gfs2_revoke_replay {
};
enum {
- QDF_USER = 0,
QDF_CHANGE = 1,
QDF_LOCKED = 2,
QDF_REFRESH = 3,
@@ -403,7 +403,7 @@ struct gfs2_quota_data {
atomic_t qd_count;
- u32 qd_id;
+ struct kqid qd_id;
unsigned long qd_flags; /* QDF_... */
s64 qd_change;
@@ -641,6 +641,7 @@ struct gfs2_sbd {
wait_queue_head_t sd_glock_wait;
atomic_t sd_glock_disposal;
struct completion sd_locking_init;
+ struct completion sd_wdack;
struct delayed_work sd_control_work;
/* Inode Stuff */
@@ -723,6 +724,7 @@ struct gfs2_sbd {
struct list_head sd_log_le_revoke;
struct list_head sd_log_le_databuf;
struct list_head sd_log_le_ordered;
+ spinlock_t sd_ordered_lock;
atomic_t sd_log_thresh1;
atomic_t sd_log_thresh2;
@@ -758,10 +760,7 @@ struct gfs2_sbd {
unsigned int sd_replayed_blocks;
/* For quiescing the filesystem */
-
struct gfs2_holder sd_freeze_gh;
- struct mutex sd_freeze_lock;
- unsigned int sd_freeze_count;
char sd_fsname[GFS2_FSNAME_LEN];
char sd_table_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2b6f5698ef18..cc00bd1d1f87 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -368,10 +368,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,
struct inode *inode)
{
if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
- (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) {
+ (dip->i_inode.i_mode & S_ISUID) &&
+ !uid_eq(dip->i_inode.i_uid, GLOBAL_ROOT_UID)) {
if (S_ISDIR(inode->i_mode))
inode->i_mode |= S_ISUID;
- else if (dip->i_inode.i_uid != current_fsuid())
+ else if (!uid_eq(dip->i_inode.i_uid, current_fsuid()))
inode->i_mode &= ~07111;
inode->i_uid = dip->i_inode.i_uid;
} else
@@ -447,7 +448,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
struct timespec tv = CURRENT_TIME;
dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
di = (struct gfs2_dinode *)dibh->b_data;
@@ -455,8 +456,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
di->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
di->di_mode = cpu_to_be32(ip->i_inode.i_mode);
- di->di_uid = cpu_to_be32(ip->i_inode.i_uid);
- di->di_gid = cpu_to_be32(ip->i_inode.i_gid);
+ di->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode));
+ di->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));
di->di_nlink = 0;
di->di_size = cpu_to_be64(ip->i_inode.i_size);
di->di_blocks = cpu_to_be64(1);
@@ -548,7 +549,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
if (error)
return error;
- error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_lock(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
goto fail;
@@ -584,7 +585,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
if (error)
goto fail_end_trans;
set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
return 0;
@@ -931,7 +932,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (error)
goto out_brelse;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
inc_nlink(&ip->i_inode);
ip->i_inode.i_ctime = CURRENT_TIME;
ihold(inode);
@@ -978,8 +979,8 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
return -EPERM;
if ((dip->i_inode.i_mode & S_ISVTX) &&
- dip->i_inode.i_uid != current_fsuid() &&
- ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
+ !uid_eq(dip->i_inode.i_uid, current_fsuid()) &&
+ !uid_eq(ip->i_inode.i_uid, current_fsuid()) && !capable(CAP_FOWNER))
return -EPERM;
if (IS_APPEND(&dip->i_inode))
@@ -1412,7 +1413,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (error)
goto out_end_trans;
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
}
@@ -1580,7 +1581,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- u32 ouid, ogid, nuid, ngid;
+ kuid_t ouid, nuid;
+ kgid_t ogid, ngid;
int error;
ouid = inode->i_uid;
@@ -1588,16 +1590,17 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
nuid = attr->ia_uid;
ngid = attr->ia_gid;
- if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
- ouid = nuid = NO_QUOTA_CHANGE;
- if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
- ogid = ngid = NO_QUOTA_CHANGE;
+ if (!(attr->ia_valid & ATTR_UID) || uid_eq(ouid, nuid))
+ ouid = nuid = NO_UID_QUOTA_CHANGE;
+ if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
+ ogid = ngid = NO_GID_QUOTA_CHANGE;
error = gfs2_quota_lock(ip, nuid, ngid);
if (error)
return error;
- if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+ if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
+ !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
error = gfs2_quota_check(ip, nuid, ngid);
if (error)
goto out_gunlock_q;
@@ -1611,7 +1614,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (error)
goto out_end_trans;
- if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+ if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
+ !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
gfs2_quota_change(ip, -blocks, ouid, ogid);
gfs2_quota_change(ip, blocks, nuid, ngid);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 8dad6b093716..9802de0f85e6 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -241,6 +241,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
static void gfs2_reverse_hex(char *c, u64 value)
{
+ *c = '0';
while (value) {
*c-- = hex_asc[value & 0x0f];
value >>= 4;
@@ -280,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int lvb_needs_unlock = 0;
int error;
if (gl->gl_lksb.sb_lkid == 0) {
@@ -293,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
gfs2_update_request_times(gl);
/* don't want to skip dlm_unlock writing the lvb when lock is ex */
+
+ if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE))
+ lvb_needs_unlock = 1;
+
if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
- gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) {
+ !lvb_needs_unlock) {
gfs2_glock_free(gl);
return;
}
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index f4beeb9c81c1..9a2ca8be7647 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -482,70 +482,66 @@ static void log_flush_wait(struct gfs2_sbd *sdp)
}
}
-static int bd_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int ip_cmp(void *priv, struct list_head *a, struct list_head *b)
{
- struct gfs2_bufdata *bda, *bdb;
+ struct gfs2_inode *ipa, *ipb;
- bda = list_entry(a, struct gfs2_bufdata, bd_list);
- bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+ ipa = list_entry(a, struct gfs2_inode, i_ordered);
+ ipb = list_entry(b, struct gfs2_inode, i_ordered);
- if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+ if (ipa->i_no_addr < ipb->i_no_addr)
return -1;
- if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+ if (ipa->i_no_addr > ipb->i_no_addr)
return 1;
return 0;
}
static void gfs2_ordered_write(struct gfs2_sbd *sdp)
{
- struct gfs2_bufdata *bd;
- struct buffer_head *bh;
+ struct gfs2_inode *ip;
LIST_HEAD(written);
- gfs2_log_lock(sdp);
- list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp);
+ spin_lock(&sdp->sd_ordered_lock);
+ list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp);
while (!list_empty(&sdp->sd_log_le_ordered)) {
- bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list);
- list_move(&bd->bd_list, &written);
- bh = bd->bd_bh;
- if (!buffer_dirty(bh))
+ ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered);
+ list_move(&ip->i_ordered, &written);
+ if (ip->i_inode.i_mapping->nrpages == 0)
continue;
- get_bh(bh);
- gfs2_log_unlock(sdp);
- lock_buffer(bh);
- if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
- bh->b_end_io = end_buffer_write_sync;
- submit_bh(WRITE_SYNC, bh);
- } else {
- unlock_buffer(bh);
- brelse(bh);
- }
- gfs2_log_lock(sdp);
+ spin_unlock(&sdp->sd_ordered_lock);
+ filemap_fdatawrite(ip->i_inode.i_mapping);
+ spin_lock(&sdp->sd_ordered_lock);
}
list_splice(&written, &sdp->sd_log_le_ordered);
- gfs2_log_unlock(sdp);
+ spin_unlock(&sdp->sd_ordered_lock);
}
static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
{
- struct gfs2_bufdata *bd;
- struct buffer_head *bh;
+ struct gfs2_inode *ip;
- gfs2_log_lock(sdp);
+ spin_lock(&sdp->sd_ordered_lock);
while (!list_empty(&sdp->sd_log_le_ordered)) {
- bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list);
- bh = bd->bd_bh;
- if (buffer_locked(bh)) {
- get_bh(bh);
- gfs2_log_unlock(sdp);
- wait_on_buffer(bh);
- brelse(bh);
- gfs2_log_lock(sdp);
+ ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered);
+ list_del(&ip->i_ordered);
+ WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags));
+ if (ip->i_inode.i_mapping->nrpages == 0)
continue;
- }
- list_del_init(&bd->bd_list);
+ spin_unlock(&sdp->sd_ordered_lock);
+ filemap_fdatawait(ip->i_inode.i_mapping);
+ spin_lock(&sdp->sd_ordered_lock);
}
- gfs2_log_unlock(sdp);
+ spin_unlock(&sdp->sd_ordered_lock);
+}
+
+void gfs2_ordered_del_inode(struct gfs2_inode *ip) /* unlink ip from its ordered list if GIF_ORDERED is set, clearing the flag */
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ spin_lock(&sdp->sd_ordered_lock);
+ if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) /* flag already clear: nothing is unlinked */
+ list_del(&ip->i_ordered);
+ spin_unlock(&sdp->sd_ordered_lock);
}
/**
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3fd5215ea25f..3566f35915e0 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -48,6 +48,18 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
sdp->sd_log_head = sdp->sd_log_tail = value;
}
+static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) /* add ip to sdp->sd_log_le_ordered unless GIF_ORDERED is already set */
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ if (!test_bit(GIF_ORDERED, &ip->i_flags)) { /* unlocked pre-check: the lock is not taken when the flag is already set */
+ spin_lock(&sdp->sd_ordered_lock);
+ if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) /* re-tested under the lock; only the caller that sets the flag adds the entry */
+ list_add(&ip->i_ordered, &sdp->sd_log_le_ordered);
+ spin_unlock(&sdp->sd_ordered_lock);
+ }
+}
+extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
unsigned int ssize);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9ceccb1595a3..a5055977a214 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -37,7 +37,7 @@
*
* The log lock must be held when calling this function
*/
-static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
{
struct gfs2_bufdata *bd;
@@ -388,32 +388,6 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
return page;
}
-static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
- struct gfs2_meta_header *mh;
- struct gfs2_trans *tr;
-
- tr = current->journal_info;
- tr->tr_touched = 1;
- if (!list_empty(&bd->bd_list))
- return;
- set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
- set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
- mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
- if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
- printk(KERN_ERR
- "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
- (unsigned long long)bd->bd_bh->b_blocknr);
- BUG();
- }
- gfs2_pin(sdp, bd->bd_bh);
- mh->__pad0 = cpu_to_be64(0);
- mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
- sdp->sd_log_num_buf++;
- list_add(&bd->bd_list, &sdp->sd_log_le_buf);
- tr->tr_num_buf_new++;
-}
-
static void gfs2_check_magic(struct buffer_head *bh)
{
void *kaddr;
@@ -600,20 +574,6 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
}
-static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
- struct gfs2_glock *gl = bd->bd_gl;
- struct gfs2_trans *tr;
-
- tr = current->journal_info;
- tr->tr_touched = 1;
- tr->tr_num_revoke++;
- sdp->sd_log_num_revoke++;
- atomic_inc(&gl->gl_revokes);
- set_bit(GLF_LFLUSH, &gl->gl_flags);
- list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
-}
-
static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
{
struct gfs2_meta_header *mh;
@@ -749,44 +709,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
}
/**
- * databuf_lo_add - Add a databuf to the transaction.
- *
- * This is used in two distinct cases:
- * i) In ordered write mode
- * We put the data buffer on a list so that we can ensure that its
- * synced to disk at the right time
- * ii) In journaled data mode
- * We need to journal the data block in the same way as metadata in
- * the functions above. The difference is that here we have a tag
- * which is two __be64's being the block number (as per meta data)
- * and a flag which says whether the data block needs escaping or
- * not. This means we need a new log entry for each 251 or so data
- * blocks, which isn't an enormous overhead but twice as much as
- * for normal metadata blocks.
- */
-static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
- struct gfs2_trans *tr = current->journal_info;
- struct address_space *mapping = bd->bd_bh->b_page->mapping;
- struct gfs2_inode *ip = GFS2_I(mapping->host);
-
- if (tr)
- tr->tr_touched = 1;
- if (!list_empty(&bd->bd_list))
- return;
- set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
- set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
- if (gfs2_is_jdata(ip)) {
- gfs2_pin(sdp, bd->bd_bh);
- tr->tr_num_databuf_new++;
- sdp->sd_log_num_databuf++;
- list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
- } else {
- list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
- }
-}
-
-/**
* databuf_lo_before_commit - Scan the data buffers, writing as we go
*
*/
@@ -885,7 +807,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
const struct gfs2_log_operations gfs2_buf_lops = {
- .lo_add = buf_lo_add,
.lo_before_commit = buf_lo_before_commit,
.lo_after_commit = buf_lo_after_commit,
.lo_before_scan = buf_lo_before_scan,
@@ -895,7 +816,6 @@ const struct gfs2_log_operations gfs2_buf_lops = {
};
const struct gfs2_log_operations gfs2_revoke_lops = {
- .lo_add = revoke_lo_add,
.lo_before_commit = revoke_lo_before_commit,
.lo_after_commit = revoke_lo_after_commit,
.lo_before_scan = revoke_lo_before_scan,
@@ -909,7 +829,6 @@ const struct gfs2_log_operations gfs2_rg_lops = {
};
const struct gfs2_log_operations gfs2_databuf_lops = {
- .lo_add = databuf_lo_add,
.lo_before_commit = databuf_lo_before_commit,
.lo_after_commit = databuf_lo_after_commit,
.lo_scan_elements = databuf_lo_scan_elements,
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 954a330585f4..ba77b7da8325 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -29,6 +29,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops;
extern const struct gfs2_log_operations *gfs2_log_ops[];
extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw);
+extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
@@ -46,19 +47,6 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
return limit;
}
-static inline void lops_init_le(struct gfs2_bufdata *bd,
- const struct gfs2_log_operations *lops)
-{
- INIT_LIST_HEAD(&bd->bd_list);
- bd->bd_ops = lops;
-}
-
-static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
- if (bd->bd_ops->lo_add)
- bd->bd_ops->lo_add(sdp, bd);
-}
-
static inline void lops_before_commit(struct gfs2_sbd *sdp)
{
int x;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 22255d96b27e..b059bbb5059e 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -271,41 +271,6 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
return 0;
}
-/**
- * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
- * @gl: the glock the buffer belongs to
- * @bh: The buffer to be attached to
- * @meta: Flag to indicate whether its metadata or not
- */
-
-void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
- int meta)
-{
- struct gfs2_bufdata *bd;
-
- if (meta)
- lock_page(bh->b_page);
-
- if (bh->b_private) {
- if (meta)
- unlock_page(bh->b_page);
- return;
- }
-
- bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
- bd->bd_bh = bh;
- bd->bd_gl = gl;
-
- if (meta)
- lops_init_le(bd, &gfs2_buf_lops);
- else
- lops_init_le(bd, &gfs2_databuf_lops);
- bh->b_private = bd;
-
- if (meta)
- unlock_page(bh->b_page);
-}
-
void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
{
struct address_space *mapping = bh->b_page->mapping;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index c30973b07a7c..0d4c843b6f8e 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -56,9 +56,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
-void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
- int meta);
-
void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
int meta);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 0e3554edb8f2..1b612be4b873 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -81,6 +81,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
init_waitqueue_head(&sdp->sd_glock_wait);
atomic_set(&sdp->sd_glock_disposal, 0);
init_completion(&sdp->sd_locking_init);
+ init_completion(&sdp->sd_wdack);
spin_lock_init(&sdp->sd_statfs_spin);
spin_lock_init(&sdp->sd_rindex_spin);
@@ -102,6 +103,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
+ spin_lock_init(&sdp->sd_ordered_lock);
init_waitqueue_head(&sdp->sd_log_waitq);
init_waitqueue_head(&sdp->sd_logd_waitq);
@@ -115,8 +117,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
INIT_LIST_HEAD(&sdp->sd_revoke_list);
- mutex_init(&sdp->sd_freeze_lock);
-
return sdp;
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index ae55e248c3b7..c7c840e916f8 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -65,13 +65,10 @@
#include "inode.h"
#include "util.h"
-#define QUOTA_USER 1
-#define QUOTA_GROUP 0
-
struct gfs2_quota_change_host {
u64 qc_change;
u32 qc_flags; /* GFS2_QCF_... */
- u32 qc_id;
+ struct kqid qc_id;
};
static LIST_HEAD(qd_lru_list);
@@ -120,17 +117,24 @@ out:
return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100;
}
+static u64 qd2index(struct gfs2_quota_data *qd) /* index of qd: 2 * id for a user quota, 2 * id + 1 otherwise */
+{
+ struct kqid qid = qd->qd_id;
+ return (2 * (u64)from_kqid(&init_user_ns, qid)) +
+ ((qid.type == USRQUOTA) ? 0 : 1); /* parenthesised: ?: binds looser than +, so without them the whole sum became the condition and the result was only ever 0 or 1 */
+}
+
static u64 qd2offset(struct gfs2_quota_data *qd)
{
u64 offset;
- offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
+ offset = qd2index(qd);
offset *= sizeof(struct gfs2_quota);
return offset;
}
-static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
+static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
struct gfs2_quota_data **qdp)
{
struct gfs2_quota_data *qd;
@@ -141,13 +145,11 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
return -ENOMEM;
atomic_set(&qd->qd_count, 1);
- qd->qd_id = id;
- if (user)
- set_bit(QDF_USER, &qd->qd_flags);
+ qd->qd_id = qid;
qd->qd_slot = -1;
INIT_LIST_HEAD(&qd->qd_reclaim);
- error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
+ error = gfs2_glock_get(sdp, qd2index(qd),
&gfs2_quota_glops, CREATE, &qd->qd_gl);
if (error)
goto fail;
@@ -161,7 +163,7 @@ fail:
return error;
}
-static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
+static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
struct gfs2_quota_data **qdp)
{
struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
@@ -173,8 +175,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
found = 0;
spin_lock(&qd_lru_lock);
list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
- if (qd->qd_id == id &&
- !test_bit(QDF_USER, &qd->qd_flags) == !user) {
+ if (qid_eq(qd->qd_id, qid)) {
if (!atomic_read(&qd->qd_count) &&
!list_empty(&qd->qd_reclaim)) {
/* Remove it from reclaim list */
@@ -208,7 +209,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
return 0;
}
- error = qd_alloc(sdp, user, id, &new_qd);
+ error = qd_alloc(sdp, qid, &new_qd);
if (error)
return error;
}
@@ -458,12 +459,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)
qd_put(qd);
}
-static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id,
+static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid,
struct gfs2_quota_data **qdp)
{
int error;
- error = qd_get(sdp, user, id, qdp);
+ error = qd_get(sdp, qid, qdp);
if (error)
return error;
@@ -491,7 +492,7 @@ static void qdsb_put(struct gfs2_quota_data *qd)
qd_put(qd);
}
-int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
+int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data **qd;
@@ -512,28 +513,30 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
+ error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);
if (error)
goto out;
ip->i_res->rs_qa_qd_num++;
qd++;
- error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
+ error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);
if (error)
goto out;
ip->i_res->rs_qa_qd_num++;
qd++;
- if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
- error = qdsb_get(sdp, QUOTA_USER, uid, qd);
+ if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) &&
+ !uid_eq(uid, ip->i_inode.i_uid)) {
+ error = qdsb_get(sdp, make_kqid_uid(uid), qd);
if (error)
goto out;
ip->i_res->rs_qa_qd_num++;
qd++;
}
- if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
- error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
+ if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) &&
+ !gid_eq(gid, ip->i_inode.i_gid)) {
+ error = qdsb_get(sdp, make_kqid_gid(gid), qd);
if (error)
goto out;
ip->i_res->rs_qa_qd_num++;
@@ -567,18 +570,10 @@ static int sort_qd(const void *a, const void *b)
const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a;
const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b;
- if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
- !test_bit(QDF_USER, &qd_b->qd_flags)) {
- if (test_bit(QDF_USER, &qd_a->qd_flags))
- return -1;
- else
- return 1;
- }
- if (qd_a->qd_id < qd_b->qd_id)
+ if (qid_lt(qd_a->qd_id, qd_b->qd_id))
return -1;
- if (qd_a->qd_id > qd_b->qd_id)
+ if (qid_lt(qd_b->qd_id, qd_a->qd_id))
return 1;
-
return 0;
}
@@ -590,14 +585,14 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
s64 x;
mutex_lock(&sdp->sd_quota_mutex);
- gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, qd->qd_bh);
if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
qc->qc_change = 0;
qc->qc_flags = 0;
- if (test_bit(QDF_USER, &qd->qd_flags))
+ if (qd->qd_id.type == USRQUOTA)
qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
- qc->qc_id = cpu_to_be32(qd->qd_id);
+ qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id));
}
x = be64_to_cpu(qc->qc_change) + change;
@@ -726,7 +721,7 @@ get_a_page:
goto unlock_out;
}
- gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ gfs2_trans_add_meta(ip->i_gl, bh);
kaddr = kmap_atomic(page);
if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
@@ -925,7 +920,7 @@ fail:
return error;
}
-int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
+int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
@@ -1040,13 +1035,13 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
sdp->sd_fsname, type,
- (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
- qd->qd_id);
+ (qd->qd_id.type == USRQUOTA) ? "user" : "group",
+ from_kqid(&init_user_ns, qd->qd_id));
return 0;
}
-int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
+int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
@@ -1063,8 +1058,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
qd = ip->i_res->rs_qa_qd[x];
- if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
- (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
+ if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
+ qid_eq(qd->qd_id, make_kqid_gid(gid))))
continue;
value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
@@ -1074,10 +1069,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
print_message(qd, "exceeded");
- quota_send_warning(make_kqid(&init_user_ns,
- test_bit(QDF_USER, &qd->qd_flags) ?
- USRQUOTA : GRPQUOTA,
- qd->qd_id),
+ quota_send_warning(qd->qd_id,
sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
error = -EDQUOT;
@@ -1087,10 +1079,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
time_after_eq(jiffies, qd->qd_last_warn +
gfs2_tune_get(sdp,
gt_quota_warn_period) * HZ)) {
- quota_send_warning(make_kqid(&init_user_ns,
- test_bit(QDF_USER, &qd->qd_flags) ?
- USRQUOTA : GRPQUOTA,
- qd->qd_id),
+ quota_send_warning(qd->qd_id,
sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
error = print_message(qd, "warning");
qd->qd_last_warn = jiffies;
@@ -1101,7 +1090,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
}
void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
- u32 uid, u32 gid)
+ kuid_t uid, kgid_t gid)
{
struct gfs2_quota_data *qd;
unsigned int x;
@@ -1114,8 +1103,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
qd = ip->i_res->rs_qa_qd[x];
- if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
- (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
+ if (qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
+ qid_eq(qd->qd_id, make_kqid_gid(gid))) {
do_qc(qd, change);
}
}
@@ -1170,13 +1159,13 @@ static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
return gfs2_quota_sync(sb, type);
}
-int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
{
struct gfs2_quota_data *qd;
struct gfs2_holder q_gh;
int error;
- error = qd_get(sdp, user, id, &qd);
+ error = qd_get(sdp, qid, &qd);
if (error)
return error;
@@ -1194,7 +1183,9 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
qc->qc_change = be64_to_cpu(str->qc_change);
qc->qc_flags = be32_to_cpu(str->qc_flags);
- qc->qc_id = be32_to_cpu(str->qc_id);
+ qc->qc_id = make_kqid(&init_user_ns,
+ (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA,
+ be32_to_cpu(str->qc_id));
}
int gfs2_quota_init(struct gfs2_sbd *sdp)
@@ -1257,8 +1248,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
if (!qc.qc_change)
continue;
- error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
- qc.qc_id, &qd);
+ error = qd_alloc(sdp, qc.qc_id, &qd);
if (error) {
brelse(bh);
goto fail;
@@ -1485,21 +1475,17 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
struct gfs2_quota_data *qd;
struct gfs2_holder q_gh;
int error;
- int type;
memset(fdq, 0, sizeof(struct fs_disk_quota));
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return -ESRCH; /* Crazy XFS error code */
- if (qid.type == USRQUOTA)
- type = QUOTA_USER;
- else if (qid.type == GRPQUOTA)
- type = QUOTA_GROUP;
- else
+ if ((qid.type != USRQUOTA) &&
+ (qid.type != GRPQUOTA))
return -EINVAL;
- error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd);
+ error = qd_get(sdp, qid, &qd);
if (error)
return error;
error = do_glock(qd, FORCE, &q_gh);
@@ -1508,8 +1494,8 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
fdq->d_version = FS_DQUOT_VERSION;
- fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
- fdq->d_id = from_kqid(&init_user_ns, qid);
+ fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
+ fdq->d_id = from_kqid_munged(current_user_ns(), qid);
fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
@@ -1535,32 +1521,18 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
int alloc_required;
loff_t offset;
int error;
- int type;
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return -ESRCH; /* Crazy XFS error code */
- switch(qid.type) {
- case USRQUOTA:
- type = QUOTA_USER;
- if (fdq->d_flags != FS_USER_QUOTA)
- return -EINVAL;
- break;
- case GRPQUOTA:
- type = QUOTA_GROUP;
- if (fdq->d_flags != FS_GROUP_QUOTA)
- return -EINVAL;
- break;
- default:
+ if ((qid.type != USRQUOTA) &&
+ (qid.type != GRPQUOTA))
return -EINVAL;
- }
if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
return -EINVAL;
- if (fdq->d_id != from_kqid(&init_user_ns, qid))
- return -EINVAL;
- error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd);
+ error = qd_get(sdp, qid, &qd);
if (error)
return error;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index f25d98b87904..4f5e6e44ed83 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -14,20 +14,21 @@ struct gfs2_inode;
struct gfs2_sbd;
struct shrink_control;
-#define NO_QUOTA_CHANGE ((u32)-1)
+#define NO_UID_QUOTA_CHANGE INVALID_UID
+#define NO_GID_QUOTA_CHANGE INVALID_GID
-extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_unhold(struct gfs2_inode *ip);
-extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_unlock(struct gfs2_inode *ip);
-extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
- u32 uid, u32 gid);
+ kuid_t uid, kgid_t gid);
extern int gfs2_quota_sync(struct super_block *sb, int type);
-extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
+extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);
extern int gfs2_quota_init(struct gfs2_sbd *sdp);
extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
@@ -41,7 +42,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
int ret;
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (ret)
return ret;
if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 37ee061d899e..d1f51fd73f86 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -350,10 +350,14 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
BUG_ON(len < chunk_size);
len -= chunk_size;
block = gfs2_rbm_to_block(&rbm);
- gfs2_rbm_from_block(&rbm, block + chunk_size);
- n_unaligned = 3;
- if (ptr)
+ if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
+ n_unaligned = 0;
break;
+ }
+ if (ptr) {
+ n_unaligned = 3;
+ break;
+ }
n_unaligned = len & 3;
}
@@ -557,22 +561,20 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
*/
int gfs2_rs_alloc(struct gfs2_inode *ip)
{
- struct gfs2_blkreserv *res;
+ int error = 0;
+ down_write(&ip->i_rw_mutex);
if (ip->i_res)
- return 0;
-
- res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
- if (!res)
- return -ENOMEM;
+ goto out;
- RB_CLEAR_NODE(&res->rs_node);
+ ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
+ if (!ip->i_res) {
+ error = -ENOMEM;
+ goto out;
+ }
- down_write(&ip->i_rw_mutex);
- if (ip->i_res)
- kmem_cache_free(gfs2_rsrv_cachep, res);
- else
- ip->i_res = res;
+ RB_CLEAR_NODE(&ip->i_res->rs_node);
+out:
up_write(&ip->i_rw_mutex);
- return 0;
+ return error; /* 0 when ip->i_res exists or was allocated, -ENOMEM when the allocation failed (previously reported as success) */
}
@@ -1255,7 +1257,7 @@ fail:
int gfs2_fitrim(struct file *filp, void __user *argp)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
struct buffer_head *bh;
@@ -1321,7 +1323,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
if (ret == 0) {
bh = rgd->rd_bits[0].bi_bh;
rgd->rd_flags |= GFS2_RGF_TRIMMED;
- gfs2_trans_add_bh(rgd->rd_gl, bh, 1);
+ gfs2_trans_add_meta(rgd->rd_gl, bh);
gfs2_rgrp_out(rgd, bh->b_data);
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data);
gfs2_trans_end(sdp);
@@ -1424,6 +1426,9 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
rs->rs_free = extlen;
rs->rs_inum = ip->i_no_addr;
rs_insert(ip);
+ } else {
+ if (goal == rgd->rd_last_alloc + rgd->rd_data0)
+ rgd->rd_last_alloc = 0;
}
}
@@ -1963,14 +1968,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
*n = 1;
block = gfs2_rbm_to_block(rbm);
- gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1);
+ gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh);
gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
block++;
while (*n < elen) {
ret = gfs2_rbm_from_block(&pos, block);
if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
break;
- gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1);
+ gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh);
gfs2_setbit(&pos, true, GFS2_BLKST_USED);
(*n)++;
block++;
@@ -2009,7 +2014,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
rbm.bi->bi_bh->b_data + rbm.bi->bi_offset,
rbm.bi->bi_len);
}
- gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1);
+ gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh);
gfs2_setbit(&rbm, false, new_state);
}
@@ -2152,7 +2157,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
if (error == 0) {
struct gfs2_dinode *di =
(struct gfs2_dinode *)dibh->b_data;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
di->di_goal_meta = di->di_goal_data =
cpu_to_be64(ip->i_goal);
brelse(dibh);
@@ -2171,7 +2176,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
*generation = rbm.rgd->rd_igeneration++;
}
- gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1);
+ gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data);
@@ -2218,7 +2223,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
rgd->rd_free += blen;
rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
- gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
@@ -2255,7 +2260,7 @@ void gfs2_unlink_di(struct inode *inode)
if (!rgd)
return;
trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
- gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
update_rgrp_lvb_unlinked(rgd, 1);
@@ -2276,7 +2281,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
rgd->rd_dinodes--;
rgd->rd_free++;
- gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
update_rgrp_lvb_unlinked(rgd, -1);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d6488674d916..cab77b8ba84f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -500,7 +500,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
if (error)
return;
- gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+ gfs2_trans_add_meta(l_ip->i_gl, l_bh);
spin_lock(&sdp->sd_statfs_spin);
l_sc->sc_total += total;
@@ -528,7 +528,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
- gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+ gfs2_trans_add_meta(l_ip->i_gl, l_bh);
spin_lock(&sdp->sd_statfs_spin);
m_sc->sc_total += l_sc->sc_total;
@@ -539,7 +539,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
0, sizeof(struct gfs2_statfs_change));
spin_unlock(&sdp->sd_statfs_spin);
- gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+ gfs2_trans_add_meta(m_ip->i_gl, m_bh);
gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
}
@@ -663,54 +663,6 @@ out:
return error;
}
-/**
- * gfs2_freeze_fs - freezes the file system
- * @sdp: the file system
- *
- * This function flushes data and meta data for all machines by
- * acquiring the transaction log exclusively. All journals are
- * ensured to be in a clean state as well.
- *
- * Returns: errno
- */
-
-int gfs2_freeze_fs(struct gfs2_sbd *sdp)
-{
- int error = 0;
-
- mutex_lock(&sdp->sd_freeze_lock);
-
- if (!sdp->sd_freeze_count++) {
- error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
- if (error)
- sdp->sd_freeze_count--;
- }
-
- mutex_unlock(&sdp->sd_freeze_lock);
-
- return error;
-}
-
-/**
- * gfs2_unfreeze_fs - unfreezes the file system
- * @sdp: the file system
- *
- * This function allows the file system to proceed by unlocking
- * the exclusively held transaction lock. Other GFS2 nodes are
- * now free to acquire the lock shared and go on with their lives.
- *
- */
-
-void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
-{
- mutex_lock(&sdp->sd_freeze_lock);
-
- if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
- gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
-
- mutex_unlock(&sdp->sd_freeze_lock);
-}
-
void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
{
struct gfs2_dinode *str = buf;
@@ -721,8 +673,8 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
- str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
- str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
+ str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode));
+ str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));
str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -824,7 +776,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
ret = gfs2_meta_inode_buffer(ip, &bh);
if (ret == 0) {
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
}
@@ -888,13 +840,6 @@ static void gfs2_put_super(struct super_block *sb)
int error;
struct gfs2_jdesc *jd;
- /* Unfreeze the filesystem, if we need to */
-
- mutex_lock(&sdp->sd_freeze_lock);
- if (sdp->sd_freeze_count)
- gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
- mutex_unlock(&sdp->sd_freeze_lock);
-
/* No more recovery requests */
set_bit(SDF_NORECOVERY, &sdp->sd_flags);
smp_mb();
@@ -985,7 +930,7 @@ static int gfs2_freeze(struct super_block *sb)
return -EINVAL;
for (;;) {
- error = gfs2_freeze_fs(sdp);
+ error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
if (!error)
break;
@@ -1013,7 +958,9 @@ static int gfs2_freeze(struct super_block *sb)
static int gfs2_unfreeze(struct super_block *sb)
{
- gfs2_unfreeze_fs(sb->s_fs_info);
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+
+ /*
+ * Release sd_freeze_gh, the holder that gfs2_freeze() hands to
+ * gfs2_lock_fs_check_clean(). Always reports success.
+ */
+ gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
return 0;
}
@@ -1429,7 +1376,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
if (error)
return error;
- error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
return error;
@@ -1577,6 +1524,7 @@ out:
/* Case 3 starts here */
truncate_inode_pages(&inode->i_data, 0);
gfs2_rs_delete(ip);
+ gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
ip->i_gl->gl_object = NULL;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index a0464680af0b..90e3322ffa10 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -46,9 +46,6 @@ extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct buffer_head *l_bh);
extern int gfs2_statfs_sync(struct super_block *sb, int type);
-extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
-extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
-
extern struct file_system_type gfs2_fs_type;
extern struct file_system_type gfs2meta_fs_type;
extern const struct export_operations gfs2_export_ops;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 8056b7b7238e..aa5c48044966 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -91,39 +91,37 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
+/*
+ * freeze_show - sysfs read handler reporting the freeze state.
+ *
+ * Prints "0" when the superblock's s_writers.frozen equals SB_UNFROZEN and
+ * "1" for every other state. Returns the snprintf() result.
+ */
static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
{
- unsigned int count;
-
- mutex_lock(&sdp->sd_freeze_lock);
- count = sdp->sd_freeze_count;
- mutex_unlock(&sdp->sd_freeze_lock);
+ struct super_block *sb = sdp->sd_vfs;
+ int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
- return snprintf(buf, PAGE_SIZE, "%u\n", count);
+ return snprintf(buf, PAGE_SIZE, "%u\n", frozen);
}
+/*
+ * freeze_store - sysfs write handler that freezes or thaws the filesystem.
+ *
+ * The written text is parsed with simple_strtol() (base auto-detected):
+ * 0 calls thaw_super(), 1 calls freeze_super(), anything else is -EINVAL.
+ * Callers without CAP_SYS_ADMIN get -EPERM. A failure from thaw_super() or
+ * freeze_super() is logged with fs_warn() and returned; otherwise the
+ * number of bytes written (@len) is returned.
+ */
static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
- ssize_t ret = len;
- int error = 0;
+ int error;
int n = simple_strtol(buf, NULL, 0);
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
switch (n) {
case 0:
- gfs2_unfreeze_fs(sdp);
+ error = thaw_super(sdp->sd_vfs);
break;
case 1:
- error = gfs2_freeze_fs(sdp);
+ error = freeze_super(sdp->sd_vfs);
break;
default:
- ret = -EINVAL;
+ return -EINVAL;
}
- if (error)
+ if (error) {
fs_warn(sdp, "freeze %d error %d", n, error);
+ return error;
+ }
- return ret;
+ return len;
}
static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
@@ -135,7 +133,7 @@ static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
if (simple_strtol(buf, NULL, 0) != 1)
return -EINVAL;
@@ -150,7 +148,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
if (simple_strtol(buf, NULL, 0) != 1)
return -EINVAL;
@@ -163,7 +161,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
if (simple_strtol(buf, NULL, 0) != 1)
return -EINVAL;
@@ -175,30 +173,40 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
+/*
+ * quota_refresh_user_store - sysfs write handler refreshing one user quota.
+ *
+ * Requires CAP_SYS_ADMIN (-EPERM otherwise). The written number is turned
+ * into a USRQUOTA kqid relative to the writer's user namespace; an id that
+ * does not map to a valid kqid yields -EINVAL. Returns the error from
+ * gfs2_quota_refresh(), or @len on success.
+ */
static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
+ struct kqid qid;
int error;
u32 id;
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
id = simple_strtoul(buf, NULL, 0);
- error = gfs2_quota_refresh(sdp, 1, id);
+ qid = make_kqid(current_user_ns(), USRQUOTA, id);
+ if (!qid_valid(qid))
+ return -EINVAL;
+
+ error = gfs2_quota_refresh(sdp, qid);
return error ? error : len;
}
+/*
+ * quota_refresh_group_store - sysfs write handler refreshing one group quota.
+ *
+ * Requires CAP_SYS_ADMIN (-EPERM otherwise). The written number is turned
+ * into a GRPQUOTA kqid relative to the writer's user namespace; an id that
+ * does not map to a valid kqid yields -EINVAL. Returns the error from
+ * gfs2_quota_refresh(), or @len on success.
+ */
static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
+ struct kqid qid;
int error;
u32 id;
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
id = simple_strtoul(buf, NULL, 0);
- error = gfs2_quota_refresh(sdp, 0, id);
+ qid = make_kqid(current_user_ns(), GRPQUOTA, id);
+ if (!qid_valid(qid))
+ return -EINVAL;
+
+ error = gfs2_quota_refresh(sdp, qid);
return error ? error : len;
}
@@ -213,7 +221,7 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
int rv;
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum,
mode);
@@ -332,6 +340,28 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
return ret;
}
+/*
+ * wdack_show - sysfs read handler for the withdraw acknowledgement.
+ *
+ * Prints "1" when the sd_wdack completion has been completed and "0"
+ * otherwise.
+ */
+static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf)
+{
+ int val = completion_done(&sdp->sd_wdack) ? 1 : 0;
+
+ return sprintf(buf, "%d\n", val);
+}
+
+/*
+ * wdack_store - sysfs write handler acknowledging a withdraw.
+ *
+ * Only the value 1 is accepted, and only when the lock protocol name is
+ * "lock_dlm"; in that case the sd_wdack completion is completed and @len
+ * is returned. Every other input returns -EINVAL.
+ * gfs2_lm_withdraw() waits on sd_wdack for the same protocol.
+ */
+static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+ ssize_t ret = len;
+ int val;
+
+ val = simple_strtol(buf, NULL, 0);
+
+ if ((val == 1) &&
+ !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
+ complete(&sdp->sd_wdack);
+ else
+ ret = -EINVAL;
+ return ret;
+}
+
static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -463,7 +493,7 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
GDLM_ATTR(block, 0644, block_show, block_store);
-GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
+GDLM_ATTR(withdraw, 0644, wdack_show, wdack_store);
GDLM_ATTR(jid, 0644, jid_show, jid_store);
GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store);
GDLM_ATTR(first_done, 0444, first_done_show, NULL);
@@ -502,7 +532,7 @@ static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
unsigned int x, y;
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
return -EINVAL;
@@ -521,7 +551,7 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
unsigned int x;
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
x = simple_strtoul(buf, NULL, 0);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 413627072f36..88162fae27a5 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -18,6 +18,7 @@
#include "gfs2.h"
#include "incore.h"
#include "glock.h"
+#include "inode.h"
#include "log.h"
#include "lops.h"
#include "meta_io.h"
@@ -142,44 +143,143 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
sb_end_intwrite(sdp->sd_vfs);
}
+/*
+ * gfs2_alloc_bufdata - allocate a gfs2_bufdata and attach it to a buffer.
+ * @gl: glock recorded in the new bufdata
+ * @bh: buffer head that will own the bufdata through bh->b_private
+ * @lops: log operations recorded in the new bufdata
+ *
+ * The object comes zeroed from gfs2_bufdata_cachep and is requested with
+ * GFP_NOFS | __GFP_NOFAIL, so the result is used without a NULL check.
+ * Returns the new bufdata.
+ */
+static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
+ struct buffer_head *bh,
+ const struct gfs2_log_operations *lops)
+{
+ struct gfs2_bufdata *bd;
+
+ bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
+ bd->bd_bh = bh;
+ bd->bd_gl = gl;
+ bd->bd_ops = lops;
+ INIT_LIST_HEAD(&bd->bd_list);
+ bh->b_private = bd;
+ return bd;
+}
+
/**
- * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
- * @gl: the glock the buffer belongs to
+ * gfs2_trans_add_data - Add a databuf to the transaction.
+ * @gl: The inode glock associated with the buffer
* @bh: The buffer to add
- * @meta: True in the case of adding metadata
*
+ * This is used in two distinct cases:
+ * i) In ordered write mode
+ * We put the data buffer on a list so that we can ensure that it is
+ * synced to disk at the right time
+ * ii) In journaled data mode
+ * We need to journal the data block in the same way as metadata in
+ * the functions above. The difference is that here we have a tag
+ * which is two __be64's being the block number (as per meta data)
+ * and a flag which says whether the data block needs escaping or
+ * not. This means we need a new log entry for each 251 or so data
+ * blocks, which isn't an enormous overhead but twice as much as
+ * for normal metadata blocks.
*/
+void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
+{
+ struct gfs2_trans *tr = current->journal_info;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct address_space *mapping = bh->b_page->mapping;
+ struct gfs2_inode *ip = GFS2_I(mapping->host);
+ struct gfs2_bufdata *bd;
-void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
+ /* Non-jdata inodes only need the inode queued via gfs2_ordered_add_inode(). */
+ if (!gfs2_is_jdata(ip)) {
+ gfs2_ordered_add_inode(ip);
+ return;
+ }
+
+ lock_buffer(bh);
+ gfs2_log_lock(sdp);
+ bd = bh->b_private;
+ if (bd == NULL) {
+ /*
+ * Both locks are dropped around the allocation, so another
+ * task may attach a bufdata to this buffer in the meantime.
+ * In that case use the one it attached instead of leaving
+ * bd NULL, which would be dereferenced just below.
+ */
+ gfs2_log_unlock(sdp);
+ unlock_buffer(bh);
+ if (bh->b_private == NULL)
+ bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+ else
+ bd = bh->b_private;
+ lock_buffer(bh);
+ gfs2_log_lock(sdp);
+ }
+ gfs2_assert(sdp, bd->bd_gl == gl);
+ tr->tr_touched = 1;
+ /* Only pin and queue the buffer if it is not already on a log list. */
+ if (list_empty(&bd->bd_list)) {
+ set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+ set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
+ gfs2_pin(sdp, bd->bd_bh);
+ tr->tr_num_databuf_new++;
+ sdp->sd_log_num_databuf++;
+ list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
+ }
+ gfs2_log_unlock(sdp);
+ unlock_buffer(bh);
+}
+
+/*
+ * meta_lo_add - queue a metadata buffer on the log's buffer list.
+ * @sdp: the filesystem
+ * @bd: bufdata of the metadata buffer
+ *
+ * Marks the current transaction as touched. If @bd is already on a list
+ * nothing else happens. Otherwise the glock is flagged GLF_LFLUSH and
+ * GLF_DIRTY, the buffer is pinned, its meta header is stamped with this
+ * node's journal id, and the per-log and per-transaction counters are
+ * bumped. A buffer whose header lacks GFS2_MAGIC triggers BUG().
+ */
+static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
+ struct gfs2_meta_header *mh;
+ struct gfs2_trans *tr;
+
+ tr = current->journal_info;
+ tr->tr_touched = 1;
+ if (!list_empty(&bd->bd_list))
+ return;
+ set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+ set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
+ mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+ if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
+ printk(KERN_ERR
+ "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
+ (unsigned long long)bd->bd_bh->b_blocknr);
+ BUG();
+ }
+ gfs2_pin(sdp, bd->bd_bh);
+ mh->__pad0 = cpu_to_be64(0);
+ mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
+ sdp->sd_log_num_buf++;
+ list_add(&bd->bd_list, &sdp->sd_log_le_buf);
+ tr->tr_num_buf_new++;
+}
+
+/*
+ * gfs2_trans_add_meta - add a metadata buffer to the current transaction.
+ * @gl: the glock the buffer belongs to
+ * @bh: the buffer to add
+ *
+ * Attaches a gfs2_bufdata to @bh if it has none yet, asserts that the
+ * bufdata belongs to @gl, and queues the buffer through meta_lo_add().
+ */
+void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
+{
+
struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_bufdata *bd;
lock_buffer(bh);
gfs2_log_lock(sdp);
bd = bh->b_private;
- if (bd)
- gfs2_assert(sdp, bd->bd_gl == gl);
- else {
+ if (bd == NULL) {
+ /*
+ * Both locks are dropped around the allocation, so another
+ * task may attach a bufdata to this buffer in the meantime.
+ * In that case use the one it attached instead of leaving
+ * bd NULL, which would be dereferenced just below.
+ */
gfs2_log_unlock(sdp);
unlock_buffer(bh);
- gfs2_attach_bufdata(gl, bh, meta);
- bd = bh->b_private;
+ lock_page(bh->b_page);
+ if (bh->b_private == NULL)
+ bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+ else
+ bd = bh->b_private;
+ unlock_page(bh->b_page);
lock_buffer(bh);
gfs2_log_lock(sdp);
}
- lops_add(sdp, bd);
+ gfs2_assert(sdp, bd->bd_gl == gl);
+ meta_lo_add(sdp, bd);
gfs2_log_unlock(sdp);
unlock_buffer(bh);
}
+/*
+ * gfs2_trans_add_revoke - queue a revoke for a bufdata in this transaction.
+ * @sdp: the filesystem
+ * @bd: the bufdata to revoke; must not be on bd_list or either AIL list
+ *
+ * Switches @bd to the revoke log operations, marks the transaction as
+ * touched, bumps the transaction, log and glock revoke counters, flags the
+ * glock GLF_LFLUSH and adds @bd to sd_log_le_revoke.
+ */
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
+ struct gfs2_glock *gl = bd->bd_gl;
+ struct gfs2_trans *tr = current->journal_info;
+
BUG_ON(!list_empty(&bd->bd_list));
BUG_ON(!list_empty(&bd->bd_ail_st_list));
BUG_ON(!list_empty(&bd->bd_ail_gl_list));
- lops_init_le(bd, &gfs2_revoke_lops);
- lops_add(sdp, bd);
+ bd->bd_ops = &gfs2_revoke_lops;
+ tr->tr_touched = 1;
+ tr->tr_num_revoke++;
+ sdp->sd_log_num_revoke++;
+ atomic_inc(&gl->gl_revokes);
+ set_bit(GLF_LFLUSH, &gl->gl_flags);
+ list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
}
void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index bf2ae9aeee7a..1e6e7da25a17 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -39,7 +39,8 @@ extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
unsigned int revokes);
extern void gfs2_trans_end(struct gfs2_sbd *sdp);
-extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
+extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
+extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f00d7c5744f6..6402fb69d71b 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -54,6 +54,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+ if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
+ wait_for_completion(&sdp->sd_wdack);
+
if (lm->lm_unmount) {
fs_err(sdp, "telling LM to unmount\n");
lm->lm_unmount(sdp);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 76c144b3c9bb..ecd37f30ab91 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -270,7 +270,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
if (error)
goto out_gunlock;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
dataptrs = GFS2_EA2DATAPTRS(ea);
for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
@@ -309,7 +309,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
}
@@ -331,7 +331,7 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
if (error)
return error;
- error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
goto out_alloc;
@@ -509,7 +509,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
}
if (din) {
- gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
+ gfs2_trans_add_meta(ip->i_gl, bh[x]);
memcpy(pos, din, cp_size);
din += sdp->sd_jbsize;
}
@@ -629,7 +629,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
return error;
gfs2_trans_add_unrevoke(sdp, block, 1);
*bhp = gfs2_meta_new(ip->i_gl, block);
- gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
+ gfs2_trans_add_meta(ip->i_gl, *bhp);
gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
@@ -691,7 +691,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
return error;
gfs2_trans_add_unrevoke(sdp, block, 1);
bh = gfs2_meta_new(ip->i_gl, block);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
gfs2_add_inode_blocks(&ip->i_inode, 1);
@@ -751,7 +751,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
}
@@ -834,7 +834,7 @@ static void ea_set_remove_stuffed(struct gfs2_inode *ip,
struct gfs2_ea_header *prev = el->el_prev;
u32 len;
- gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, el->el_bh);
if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
ea->ea_type = GFS2_EATYPE_UNUSED;
@@ -872,7 +872,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
if (error)
return error;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
if (es->ea_split)
ea = ea_split_ea(ea);
@@ -886,7 +886,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
if (error)
goto out;
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
out:
@@ -901,7 +901,7 @@ static int ea_set_simple_alloc(struct gfs2_inode *ip,
struct gfs2_ea_header *ea = es->es_ea;
int error;
- gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, es->es_bh);
if (es->ea_split)
ea = ea_split_ea(ea);
@@ -997,7 +997,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
goto out;
}
- gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+ gfs2_trans_add_meta(ip->i_gl, indbh);
} else {
u64 blk;
unsigned int n = 1;
@@ -1006,7 +1006,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
return error;
gfs2_trans_add_unrevoke(sdp, blk, 1);
indbh = gfs2_meta_new(ip->i_gl, blk);
- gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+ gfs2_trans_add_meta(ip->i_gl, indbh);
gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
gfs2_buffer_clear_tail(indbh, mh_size);
@@ -1092,7 +1092,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
if (error)
return error;
- gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, el->el_bh);
if (prev) {
u32 len;
@@ -1109,7 +1109,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
}
@@ -1265,7 +1265,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
if (GFS2_EA_IS_STUFFED(el.el_ea)) {
error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
if (error == 0) {
- gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, el.el_bh);
memcpy(GFS2_EA2DATA(el.el_ea), data,
GFS2_EA_DATA_LEN(el.el_ea));
}
@@ -1352,7 +1352,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
if (error)
goto out_gunlock;
- gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+ gfs2_trans_add_meta(ip->i_gl, indbh);
eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
bstart = 0;
@@ -1384,7 +1384,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
}
@@ -1434,7 +1434,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
}
@@ -1461,7 +1461,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
if (error)
return error;
- error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
return error;
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
index b77c5bc20f8a..998e3a6decf3 100644
--- a/fs/hfs/Kconfig
+++ b/fs/hfs/Kconfig
@@ -1,6 +1,6 @@
config HFS_FS
- tristate "Apple Macintosh file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "Apple Macintosh file system support"
+ depends on BLOCK
select NLS
help
If you say Y here, you will be able to mount Macintosh-formatted
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 422dde2ec0a1..5f7f1abd5f6d 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -51,7 +51,7 @@ done:
*/
static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
int len, err;
char strbuf[HFS_MAX_NAMELEN];
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 0b35903219bc..3031dfdd2358 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -35,6 +35,16 @@ static int hfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, hfs_get_block);
}
+/*
+ * hfs_write_failed - undo the effects of a failed write past i_size.
+ * @mapping: address space of the file that was being written
+ * @to: end offset the failed write would have reached
+ *
+ * When @to lies beyond the current inode size, the page cache is trimmed
+ * back to i_size and hfs_file_truncate() is run on the inode. Nothing is
+ * done when the write stayed within i_size.
+ */
+static void hfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ hfs_file_truncate(inode);
+ }
+}
+
static int hfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -45,11 +55,8 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
hfs_get_block,
&HFS_I(mapping->host)->phys_size);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ hfs_write_failed(mapping, pos + len);
return ret;
}
@@ -120,7 +127,8 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = file_inode(file)->i_mapping->host;
ssize_t ret;
ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
@@ -135,7 +143,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
- vmtruncate(inode, isize);
+ hfs_write_failed(mapping, end);
}
return ret;
@@ -617,9 +625,12 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
attr->ia_size != i_size_read(inode)) {
inode_dio_wait(inode);
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+
+ truncate_setsize(inode, attr->ia_size);
+ hfs_file_truncate(inode);
}
setattr_copy(inode, attr);
@@ -668,7 +679,6 @@ static const struct file_operations hfs_file_operations = {
static const struct inode_operations hfs_file_inode_operations = {
.lookup = hfs_file_lookup,
- .truncate = hfs_file_truncate,
.setattr = hfs_inode_setattr,
.setxattr = hfs_setxattr,
.getxattr = hfs_getxattr,
diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile
index 3cc0df730156..09d278bb7b91 100644
--- a/fs/hfsplus/Makefile
+++ b/fs/hfsplus/Makefile
@@ -5,5 +5,5 @@
obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o
hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \
- bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o
-
+ bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \
+ attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
new file mode 100644
index 000000000000..8d691f124714
--- /dev/null
+++ b/fs/hfsplus/attributes.c
@@ -0,0 +1,399 @@
+/*
+ * linux/fs/hfsplus/attributes.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handling of records in attributes tree
+ */
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+static struct kmem_cache *hfsplus_attr_tree_cachep;
+
+/*
+ * Create the slab cache used for hfsplus_attr_entry objects.
+ *
+ * Returns 0 on success, -EEXIST if the cache pointer is already set, or
+ * -ENOMEM if kmem_cache_create() fails.
+ */
+int hfsplus_create_attr_tree_cache(void)
+{
+ if (hfsplus_attr_tree_cachep)
+ return -EEXIST;
+
+ hfsplus_attr_tree_cachep =
+ kmem_cache_create("hfsplus_attr_cache",
+ sizeof(hfsplus_attr_entry), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!hfsplus_attr_tree_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Destroy the slab cache for hfsplus_attr_entry objects.
+ *
+ * NOTE(review): hfsplus_attr_tree_cachep is not reset to NULL here, so a
+ * later hfsplus_create_attr_tree_cache() in the same module lifetime would
+ * return -EEXIST - confirm this is only called on module teardown.
+ */
+void hfsplus_destroy_attr_tree_cache(void)
+{
+ kmem_cache_destroy(hfsplus_attr_tree_cachep);
+}
+
+/*
+ * Compare two attributes-tree keys.
+ *
+ * Keys are ordered first by CNID (compared as CPU-endian numbers), giving
+ * -1 or 1 when the CNIDs differ. Keys with equal CNIDs are ordered by the
+ * result of hfsplus_strcmp() on their names.
+ */
+int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1,
+ const hfsplus_btree_key *k2)
+{
+ __be32 k1_cnid, k2_cnid;
+
+ k1_cnid = k1->attr.cnid;
+ k2_cnid = k2->attr.cnid;
+ /* Raw big-endian values suffice for the equality test. */
+ if (k1_cnid != k2_cnid)
+ return be32_to_cpu(k1_cnid) < be32_to_cpu(k2_cnid) ? -1 : 1;
+
+ return hfsplus_strcmp(
+ (const struct hfsplus_unistr *)&k1->attr.key_name,
+ (const struct hfsplus_unistr *)&k2->attr.key_name);
+}
+
+/*
+ * Build an attributes-tree search key from a CNID and an ASCII name.
+ * @sb: superblock, passed through to hfsplus_asc2uni()
+ * @key: key buffer to fill; its attr part is zeroed first
+ * @cnid: catalog node ID the attribute belongs to
+ * @name: attribute name, or NULL for a key with an empty name
+ *
+ * Returns 0 on success or -EINVAL when @name is longer than
+ * HFSPLUS_ATTR_MAX_STRLEN.
+ *
+ * NOTE(review): the return value of hfsplus_asc2uni() is not checked;
+ * confirm that a conversion failure cannot leave the key inconsistent.
+ */
+int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key,
+ u32 cnid, const char *name)
+{
+ int len;
+
+ memset(key, 0, sizeof(struct hfsplus_attr_key));
+ key->attr.cnid = cpu_to_be32(cnid);
+ if (name) {
+ len = strlen(name);
+ if (len > HFSPLUS_ATTR_MAX_STRLEN) {
+ printk(KERN_ERR "hfs: invalid xattr name's length\n");
+ return -EINVAL;
+ }
+ hfsplus_asc2uni(sb,
+ (struct hfsplus_unistr *)&key->attr.key_name,
+ HFSPLUS_ATTR_MAX_STRLEN, name, len);
+ /* From here on len is the converted length read back from the key. */
+ len = be16_to_cpu(key->attr.key_name.length);
+ } else {
+ key->attr.key_name.length = 0;
+ len = 0;
+ }
+
+ /* The length of the key, as stored in key_len field, does not include
+ * the size of the key_len field itself.
+ * So, offsetof(hfsplus_attr_key, key_name) is a trick because
+ * it takes into consideration key_len field (__be16) of
+ * hfsplus_attr_key structure instead of length field (__be16) of
+ * hfsplus_attr_unistr structure.
+ */
+ key->key_len =
+ cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) +
+ 2 * len);
+
+ return 0;
+}
+
+/*
+ * Build an attributes-tree search key from a CNID and a unicode name.
+ * @key: key buffer to fill; its attr part is zeroed first
+ * @cnid: catalog node ID the attribute belongs to
+ * @name: attribute name, already in on-disk unicode form
+ *
+ * NOTE(review): name->length is not bounds-checked before the memcpy();
+ * presumably callers pass names read from a validated record - confirm.
+ */
+void hfsplus_attr_build_key_uni(hfsplus_btree_key *key,
+ u32 cnid,
+ struct hfsplus_attr_unistr *name)
+{
+ int ustrlen;
+
+ memset(key, 0, sizeof(struct hfsplus_attr_key));
+ ustrlen = be16_to_cpu(name->length);
+ key->attr.cnid = cpu_to_be32(cnid);
+ key->attr.key_name.length = cpu_to_be16(ustrlen);
+ /* Doubled so that ustrlen can serve as the byte count below. */
+ ustrlen *= 2;
+ memcpy(key->attr.key_name.unicode, name->unicode, ustrlen);
+
+ /* The length of the key, as stored in key_len field, does not include
+ * the size of the key_len field itself.
+ * So, offsetof(hfsplus_attr_key, key_name) is a trick because
+ * it takes into consideration key_len field (__be16) of
+ * hfsplus_attr_key structure instead of length field (__be16) of
+ * hfsplus_attr_unistr structure.
+ */
+ key->key_len =
+ cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) +
+ ustrlen);
+}
+
+/*
+ * Allocate one attribute entry from the attributes slab cache with
+ * GFP_KERNEL. The memory is not zeroed here. Returns NULL on failure.
+ */
+hfsplus_attr_entry *hfsplus_alloc_attr_entry(void)
+{
+ return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL);
+}
+
+/*
+ * Return an attribute entry to the attributes slab cache.
+ * A NULL @entry is ignored.
+ */
+void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry)
+{
+ if (entry)
+ kmem_cache_free(hfsplus_attr_tree_cachep, entry);
+}
+
+#define HFSPLUS_INVALID_ATTR_RECORD -1
+
+/*
+ * Fill an attribute record of the requested type.
+ * @entry: record buffer to fill
+ * @record_type: one of the HFSPLUS_ATTR_* record types
+ * @cnid: not used by this function
+ * @value: attribute data, copied only for inline-data records
+ * @size: number of bytes in @value
+ *
+ * Fork-data and extents records are only zeroed. Inline-data records get
+ * the type, length and raw bytes filled in. Returns the record size in
+ * bytes, or HFSPLUS_INVALID_ATTR_RECORD for an unknown @record_type or for
+ * inline data larger than HFSPLUS_MAX_INLINE_DATA_SIZE.
+ */
+static int hfsplus_attr_build_record(hfsplus_attr_entry *entry, int record_type,
+ u32 cnid, const void *value, size_t size)
+{
+ if (record_type == HFSPLUS_ATTR_FORK_DATA) {
+ /*
+ * Mac OS X supports only inline data attributes.
+ * Do nothing
+ */
+ memset(entry, 0, sizeof(*entry));
+ return sizeof(struct hfsplus_attr_fork_data);
+ } else if (record_type == HFSPLUS_ATTR_EXTENTS) {
+ /*
+ * Mac OS X supports only inline data attributes.
+ * Do nothing.
+ */
+ memset(entry, 0, sizeof(*entry));
+ return sizeof(struct hfsplus_attr_extents);
+ } else if (record_type == HFSPLUS_ATTR_INLINE_DATA) {
+ u16 len;
+
+ memset(entry, 0, sizeof(struct hfsplus_attr_inline_data));
+ entry->inline_data.record_type = cpu_to_be32(record_type);
+ if (size <= HFSPLUS_MAX_INLINE_DATA_SIZE)
+ len = size;
+ else
+ return HFSPLUS_INVALID_ATTR_RECORD;
+ entry->inline_data.length = cpu_to_be16(len);
+ memcpy(entry->inline_data.raw_bytes, value, len);
+ /*
+ * Align len on two-byte boundary.
+ * It needs to add pad byte if we have odd len.
+ */
+ len = round_up(len, 2);
+ return offsetof(struct hfsplus_attr_inline_data, raw_bytes) +
+ len;
+ } else /* invalid input */
+ memset(entry, 0, sizeof(*entry));
+
+ return HFSPLUS_INVALID_ATTR_RECORD;
+}
+
+/*
+ * Look up an attribute record in the attributes tree.
+ * @sb: superblock of the volume
+ * @cnid: catalog node ID whose attribute is wanted
+ * @name: attribute name, or NULL to find the first record for @cnid
+ * @fd: initialised find data; its search_key is overwritten
+ *
+ * Returns 0 when a record was found, -EINVAL when the volume has no
+ * attributes tree, or the error from hfsplus_attr_build_key() or
+ * hfs_brec_find().
+ */
+int hfsplus_find_attr(struct super_block *sb, u32 cnid,
+ const char *name, struct hfs_find_data *fd)
+{
+ int err = 0;
+
+ dprint(DBG_ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid);
+
+ if (!HFSPLUS_SB(sb)->attr_tree) {
+ printk(KERN_ERR "hfs: attributes file doesn't exist\n");
+ return -EINVAL;
+ }
+
+ if (name) {
+ /* Exact match on (cnid, name). */
+ err = hfsplus_attr_build_key(sb, fd->search_key, cnid, name);
+ if (err)
+ goto failed_find_attr;
+ err = hfs_brec_find(fd, hfs_find_rec_by_key);
+ if (err)
+ goto failed_find_attr;
+ } else {
+ /* No name given: search by CNID only. */
+ err = hfsplus_attr_build_key(sb, fd->search_key, cnid, NULL);
+ if (err)
+ goto failed_find_attr;
+ err = hfs_brec_find(fd, hfs_find_1st_rec_by_cnid);
+ if (err)
+ goto failed_find_attr;
+ }
+
+failed_find_attr:
+ return err;
+}
+
+/*
+ * Test whether an inode has the named attribute.
+ *
+ * Returns 1 when hfsplus_find_attr() succeeds for (inode->i_ino, @name).
+ * Returns 0 in every other case: no attributes tree, failure to
+ * initialise the find data, or any lookup error.
+ */
+int hfsplus_attr_exists(struct inode *inode, const char *name)
+{
+ int err = 0;
+ struct super_block *sb = inode->i_sb;
+ struct hfs_find_data fd;
+
+ if (!HFSPLUS_SB(sb)->attr_tree)
+ return 0;
+
+ err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd);
+ if (err)
+ return 0;
+
+ err = hfsplus_find_attr(sb, inode->i_ino, name, &fd);
+ if (err)
+ goto attr_not_found;
+
+ hfs_find_exit(&fd);
+ return 1;
+
+attr_not_found:
+ hfs_find_exit(&fd);
+ return 0;
+}
+
+int hfsplus_create_attr(struct inode *inode,
+ const char *name,
+ const void *value, size_t size)
+{
+ struct super_block *sb = inode->i_sb;
+ struct hfs_find_data fd;
+ hfsplus_attr_entry *entry_ptr;
+ int entry_size;
+ int err;
+
+ dprint(DBG_ATTR_MOD, "create_attr: %s,%ld\n",
+ name ? name : NULL, inode->i_ino);
+
+ if (!HFSPLUS_SB(sb)->attr_tree) {
+ printk(KERN_ERR "hfs: attributes file doesn't exist\n");
+ return -EINVAL;
+ }
+
+ entry_ptr = hfsplus_alloc_attr_entry();
+ if (!entry_ptr)
+ return -ENOMEM;
+
+ err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd);
+ if (err)
+ goto failed_init_create_attr;
+
+ if (name) {
+ err = hfsplus_attr_build_key(sb, fd.search_key,
+ inode->i_ino, name);
+ if (err)
+ goto failed_create_attr;
+ } else {
+ err = -EINVAL;
+ goto failed_create_attr;
+ }
+
+ /* Mac OS X supports only inline data attributes. */
+ entry_size = hfsplus_attr_build_record(entry_ptr,
+ HFSPLUS_ATTR_INLINE_DATA,
+ inode->i_ino,
+ value, size);
+ if (entry_size == HFSPLUS_INVALID_ATTR_RECORD) {
+ err = -EINVAL;
+ goto failed_create_attr;
+ }
+
+ err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+ if (err != -ENOENT) {
+ if (!err)
+ err = -EEXIST;
+ goto failed_create_attr;
+ }
+
+ err = hfs_brec_insert(&fd, entry_ptr, entry_size);
+ if (err)
+ goto failed_create_attr;
+
+ hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY);
+
+failed_create_attr:
+ hfs_find_exit(&fd);
+
+failed_init_create_attr:
+ hfsplus_destroy_attr_entry(entry_ptr);
+ return err;
+}
+
+static int __hfsplus_delete_attr(struct inode *inode, u32 cnid,
+ struct hfs_find_data *fd)
+{
+ int err = 0;
+ __be32 found_cnid, record_type;
+
+ hfs_bnode_read(fd->bnode, &found_cnid,
+ fd->keyoffset +
+ offsetof(struct hfsplus_attr_key, cnid),
+ sizeof(__be32));
+ if (cnid != be32_to_cpu(found_cnid))
+ return -ENOENT;
+
+ hfs_bnode_read(fd->bnode, &record_type,
+ fd->entryoffset, sizeof(record_type));
+
+ switch (be32_to_cpu(record_type)) {
+ case HFSPLUS_ATTR_INLINE_DATA:
+ /* All is OK. Do nothing. */
+ break;
+ case HFSPLUS_ATTR_FORK_DATA:
+ case HFSPLUS_ATTR_EXTENTS:
+ printk(KERN_ERR "hfs: only inline data xattr are supported\n");
+ return -EOPNOTSUPP;
+ default:
+ printk(KERN_ERR "hfs: invalid extended attribute record\n");
+ return -ENOENT;
+ }
+
+ err = hfs_brec_remove(fd);
+ if (err)
+ return err;
+
+ hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY);
+ return err;
+}
+
+int hfsplus_delete_attr(struct inode *inode, const char *name)
+{
+ int err = 0;
+ struct super_block *sb = inode->i_sb;
+ struct hfs_find_data fd;
+
+ dprint(DBG_ATTR_MOD, "delete_attr: %s,%ld\n",
+ name ? name : NULL, inode->i_ino);
+
+ if (!HFSPLUS_SB(sb)->attr_tree) {
+ printk(KERN_ERR "hfs: attributes file doesn't exist\n");
+ return -EINVAL;
+ }
+
+ err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd);
+ if (err)
+ return err;
+
+ if (name) {
+ err = hfsplus_attr_build_key(sb, fd.search_key,
+ inode->i_ino, name);
+ if (err)
+ goto out;
+ } else {
+ printk(KERN_ERR "hfs: invalid extended attribute name\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+ if (err)
+ goto out;
+
+ err = __hfsplus_delete_attr(inode, inode->i_ino, &fd);
+ if (err)
+ goto out;
+
+out:
+ hfs_find_exit(&fd);
+ return err;
+}
+
+int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid)
+{
+ int err = 0;
+ struct hfs_find_data fd;
+
+ dprint(DBG_ATTR_MOD, "delete_all_attrs: %d\n", cnid);
+
+ if (!HFSPLUS_SB(dir->i_sb)->attr_tree) {
+ printk(KERN_ERR "hfs: attributes file doesn't exist\n");
+ return -EINVAL;
+ }
+
+ err = hfs_find_init(HFSPLUS_SB(dir->i_sb)->attr_tree, &fd);
+ if (err)
+ return err;
+
+ for (;;) {
+ err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd);
+ if (err) {
+ if (err != -ENOENT)
+ printk(KERN_ERR "hfs: xattr search failed.\n");
+ goto end_delete_all;
+ }
+
+ err = __hfsplus_delete_attr(dir, cnid, &fd);
+ if (err)
+ goto end_delete_all;
+ }
+
+end_delete_all:
+ hfs_find_exit(&fd);
+ return err;
+}
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5d799c13205f..d73c98d1ee99 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -24,7 +24,19 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->key = ptr + tree->max_key_len + 2;
dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n",
tree->cnid, __builtin_return_address(0));
- mutex_lock(&tree->tree_lock);
+ switch (tree->cnid) {
+ case HFSPLUS_CAT_CNID:
+ mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX);
+ break;
+ case HFSPLUS_EXT_CNID:
+ mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX);
+ break;
+ case HFSPLUS_ATTR_CNID:
+ mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX);
+ break;
+ default:
+ BUG();
+ }
return 0;
}
@@ -38,15 +50,73 @@ void hfs_find_exit(struct hfs_find_data *fd)
fd->tree = NULL;
}
-/* Find the record in bnode that best matches key (not greater than...)*/
-int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
+int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode,
+ struct hfs_find_data *fd,
+ int *begin,
+ int *end,
+ int *cur_rec)
+{
+ __be32 cur_cnid, search_cnid;
+
+ if (bnode->tree->cnid == HFSPLUS_EXT_CNID) {
+ cur_cnid = fd->key->ext.cnid;
+ search_cnid = fd->search_key->ext.cnid;
+ } else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) {
+ cur_cnid = fd->key->cat.parent;
+ search_cnid = fd->search_key->cat.parent;
+ } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) {
+ cur_cnid = fd->key->attr.cnid;
+ search_cnid = fd->search_key->attr.cnid;
+ } else
+ BUG();
+
+ if (cur_cnid == search_cnid) {
+ (*end) = (*cur_rec);
+ if ((*begin) == (*end))
+ return 1;
+ } else {
+ if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid))
+ (*begin) = (*cur_rec) + 1;
+ else
+ (*end) = (*cur_rec) - 1;
+ }
+
+ return 0;
+}
+
+int hfs_find_rec_by_key(struct hfs_bnode *bnode,
+ struct hfs_find_data *fd,
+ int *begin,
+ int *end,
+ int *cur_rec)
{
int cmpval;
+
+ cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
+ if (!cmpval) {
+ (*end) = (*cur_rec);
+ return 1;
+ }
+ if (cmpval < 0)
+ (*begin) = (*cur_rec) + 1;
+ else
+ *(end) = (*cur_rec) - 1;
+
+ return 0;
+}
+
+/* Find the record in bnode that best matches key (not greater than...)*/
+int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd,
+ search_strategy_t rec_found)
+{
u16 off, len, keylen;
int rec;
int b, e;
int res;
+ if (!rec_found)
+ BUG();
+
b = 0;
e = bnode->num_recs - 1;
res = -ENOENT;
@@ -59,17 +129,12 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
goto fail;
}
hfs_bnode_read(bnode, fd->key, off, keylen);
- cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
- if (!cmpval) {
- e = rec;
+ if (rec_found(bnode, fd, &b, &e, &rec)) {
res = 0;
goto done;
}
- if (cmpval < 0)
- b = rec + 1;
- else
- e = rec - 1;
} while (b <= e);
+
if (rec != e && e >= 0) {
len = hfs_brec_lenoff(bnode, e, &off);
keylen = hfs_brec_keylen(bnode, e);
@@ -79,19 +144,21 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
}
hfs_bnode_read(bnode, fd->key, off, keylen);
}
+
done:
fd->record = e;
fd->keyoffset = off;
fd->keylength = keylen;
fd->entryoffset = off + keylen;
fd->entrylength = len - keylen;
+
fail:
return res;
}
/* Traverse a B*Tree from the root to a leaf finding best fit to key */
/* Return allocated copy of node found, set recnum to best record */
-int hfs_brec_find(struct hfs_find_data *fd)
+int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare)
{
struct hfs_btree *tree;
struct hfs_bnode *bnode;
@@ -122,7 +189,7 @@ int hfs_brec_find(struct hfs_find_data *fd)
goto invalid;
bnode->parent = parent;
- res = __hfs_brec_find(bnode, fd);
+ res = __hfs_brec_find(bnode, fd, do_key_compare);
if (!height)
break;
if (fd->record < 0)
@@ -149,7 +216,7 @@ int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len)
{
int res;
- res = hfs_brec_find(fd);
+ res = hfs_brec_find(fd, hfs_find_rec_by_key);
if (res)
return res;
if (fd->entrylength > rec_len)
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index 4cfbe2edd296..6feefc0cb48a 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -176,12 +176,14 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
/* are all of the bits in range? */
if ((offset + count) > sbi->total_blocks)
- return -2;
+ return -ENOENT;
mutex_lock(&sbi->alloc_mutex);
mapping = sbi->alloc_file->i_mapping;
pnr = offset / PAGE_CACHE_BITS;
page = read_mapping_page(mapping, pnr, NULL);
+ if (IS_ERR(page))
+ goto kaboom;
pptr = kmap(page);
curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
end = pptr + PAGE_CACHE_BITS / 32;
@@ -214,6 +216,8 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
set_page_dirty(page);
kunmap(page);
page = read_mapping_page(mapping, ++pnr, NULL);
+ if (IS_ERR(page))
+ goto kaboom;
pptr = kmap(page);
curr = pptr;
end = pptr + PAGE_CACHE_BITS / 32;
@@ -232,4 +236,11 @@ out:
mutex_unlock(&sbi->alloc_mutex);
return 0;
+
+kaboom:
+ printk(KERN_CRIT "hfsplus: unable to mark blocks free: error %ld\n",
+ PTR_ERR(page));
+ mutex_unlock(&sbi->alloc_mutex);
+
+ return -EIO;
}
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 1c42cc5b899f..f31ac6f404f1 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -62,7 +62,8 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
tree = node->tree;
if (node->type == HFS_NODE_LEAF ||
- tree->attributes & HFS_TREE_VARIDXKEYS)
+ tree->attributes & HFS_TREE_VARIDXKEYS ||
+ node->tree->cnid == HFSPLUS_ATTR_CNID)
key_len = hfs_bnode_read_u16(node, off) + 2;
else
key_len = tree->max_key_len + 2;
@@ -314,7 +315,8 @@ void hfs_bnode_dump(struct hfs_bnode *node)
if (i && node->type == HFS_NODE_INDEX) {
int tmp;
- if (node->tree->attributes & HFS_TREE_VARIDXKEYS)
+ if (node->tree->attributes & HFS_TREE_VARIDXKEYS ||
+ node->tree->cnid == HFSPLUS_ATTR_CNID)
tmp = hfs_bnode_read_u16(node, key_off) + 2;
else
tmp = node->tree->max_key_len + 2;
@@ -646,6 +648,8 @@ void hfs_bnode_put(struct hfs_bnode *node)
if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
hfs_bnode_unhash(node);
spin_unlock(&tree->hash_lock);
+ hfs_bnode_clear(node, 0,
+ PAGE_CACHE_SIZE * tree->pages_per_bnode);
hfs_bmap_free(node);
hfs_bnode_free(node);
return;
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 2a734cfccc92..298d4e45604b 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -36,7 +36,8 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
return 0;
if ((node->type == HFS_NODE_INDEX) &&
- !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) {
+ !(node->tree->attributes & HFS_TREE_VARIDXKEYS) &&
+ (node->tree->cnid != HFSPLUS_ATTR_CNID)) {
retval = node->tree->max_key_len + 2;
} else {
recoff = hfs_bnode_read_u16(node,
@@ -151,12 +152,13 @@ skip:
/* get index key */
hfs_bnode_read_key(new_node, fd->search_key, 14);
- __hfs_brec_find(fd->bnode, fd);
+ __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key);
hfs_bnode_put(new_node);
new_node = NULL;
- if (tree->attributes & HFS_TREE_VARIDXKEYS)
+ if ((tree->attributes & HFS_TREE_VARIDXKEYS) ||
+ (tree->cnid == HFSPLUS_ATTR_CNID))
key_len = be16_to_cpu(fd->search_key->key_len) + 2;
else {
fd->search_key->key_len =
@@ -201,7 +203,7 @@ again:
hfs_bnode_put(node);
node = fd->bnode = parent;
- __hfs_brec_find(node, fd);
+ __hfs_brec_find(node, fd, hfs_find_rec_by_key);
goto again;
}
hfs_bnode_write_u16(node,
@@ -367,12 +369,13 @@ again:
parent = hfs_bnode_find(tree, node->parent);
if (IS_ERR(parent))
return PTR_ERR(parent);
- __hfs_brec_find(parent, fd);
+ __hfs_brec_find(parent, fd, hfs_find_rec_by_key);
hfs_bnode_dump(parent);
rec = fd->record;
/* size difference between old and new key */
- if (tree->attributes & HFS_TREE_VARIDXKEYS)
+ if ((tree->attributes & HFS_TREE_VARIDXKEYS) ||
+ (tree->cnid == HFSPLUS_ATTR_CNID))
newkeylen = hfs_bnode_read_u16(node, 14) + 2;
else
fd->keylength = newkeylen = tree->max_key_len + 2;
@@ -427,7 +430,7 @@ skip:
hfs_bnode_read_key(new_node, fd->search_key, 14);
cnid = cpu_to_be32(new_node->this);
- __hfs_brec_find(fd->bnode, fd);
+ __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key);
hfs_brec_insert(fd, &cnid, sizeof(cnid));
hfs_bnode_put(fd->bnode);
hfs_bnode_put(new_node);
@@ -495,13 +498,15 @@ static int hfs_btree_inc_height(struct hfs_btree *tree)
/* insert old root idx into new root */
node->parent = tree->root;
if (node->type == HFS_NODE_LEAF ||
- tree->attributes & HFS_TREE_VARIDXKEYS)
+ tree->attributes & HFS_TREE_VARIDXKEYS ||
+ tree->cnid == HFSPLUS_ATTR_CNID)
key_size = hfs_bnode_read_u16(node, 14) + 2;
else
key_size = tree->max_key_len + 2;
hfs_bnode_copy(new_node, 14, node, 14, key_size);
- if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+ if (!(tree->attributes & HFS_TREE_VARIDXKEYS) &&
+ (tree->cnid != HFSPLUS_ATTR_CNID)) {
key_size = tree->max_key_len + 2;
hfs_bnode_write_u16(new_node, 14, tree->max_key_len);
}
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 21023d9f8ff3..efb689c21a95 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -98,6 +98,14 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
}
break;
+ case HFSPLUS_ATTR_CNID:
+ if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) {
+ printk(KERN_ERR "hfs: invalid attributes max_key_len %d\n",
+ tree->max_key_len);
+ goto fail_page;
+ }
+ tree->keycmp = hfsplus_attr_bin_cmp_key;
+ break;
default:
printk(KERN_ERR "hfs: unknown B*Tree requested\n");
goto fail_page;
@@ -159,7 +167,7 @@ void hfs_btree_close(struct hfs_btree *tree)
kfree(tree);
}
-void hfs_btree_write(struct hfs_btree *tree)
+int hfs_btree_write(struct hfs_btree *tree)
{
struct hfs_btree_header_rec *head;
struct hfs_bnode *node;
@@ -168,7 +176,7 @@ void hfs_btree_write(struct hfs_btree *tree)
node = hfs_bnode_find(tree, 0);
if (IS_ERR(node))
/* panic? */
- return;
+ return -EIO;
/* Load the header */
page = node->page[0];
head = (struct hfs_btree_header_rec *)(kmap(page) +
@@ -186,6 +194,7 @@ void hfs_btree_write(struct hfs_btree *tree)
kunmap(page);
set_page_dirty(page);
hfs_bnode_put(node);
+ return 0;
}
static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 798d9c4c5e71..840d71edd193 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -45,7 +45,8 @@ void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
key->cat.parent = cpu_to_be32(parent);
if (str) {
- hfsplus_asc2uni(sb, &key->cat.name, str->name, str->len);
+ hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN,
+ str->name, str->len);
len = be16_to_cpu(key->cat.name.length);
} else {
key->cat.name.length = 0;
@@ -167,7 +168,8 @@ static int hfsplus_fill_cat_thread(struct super_block *sb,
entry->type = cpu_to_be16(type);
entry->thread.reserved = 0;
entry->thread.parentID = cpu_to_be32(parentid);
- hfsplus_asc2uni(sb, &entry->thread.nodeName, str->name, str->len);
+ hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN,
+ str->name, str->len);
return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2;
}
@@ -198,7 +200,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
hfsplus_cat_build_key_uni(fd->search_key,
be32_to_cpu(tmp.thread.parentID),
&tmp.thread.nodeName);
- return hfs_brec_find(fd);
+ return hfs_brec_find(fd, hfs_find_rec_by_key);
}
int hfsplus_create_cat(u32 cnid, struct inode *dir,
@@ -221,7 +223,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
S_ISDIR(inode->i_mode) ?
HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
dir->i_ino, str);
- err = hfs_brec_find(&fd);
+ err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
err = -EEXIST;
@@ -233,7 +235,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
entry_size = hfsplus_cat_build_record(&entry, cnid, inode);
- err = hfs_brec_find(&fd);
+ err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
/* panic? */
if (!err)
@@ -253,7 +255,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
err1:
hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
- if (!hfs_brec_find(&fd))
+ if (!hfs_brec_find(&fd, hfs_find_rec_by_key))
hfs_brec_remove(&fd);
err2:
hfs_find_exit(&fd);
@@ -279,7 +281,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
int len;
hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
- err = hfs_brec_find(&fd);
+ err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -296,7 +298,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
} else
hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
- err = hfs_brec_find(&fd);
+ err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -326,7 +328,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
goto out;
hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
- err = hfs_brec_find(&fd);
+ err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -337,6 +339,12 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
dir->i_size--;
dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
+
+ if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) {
+ if (HFSPLUS_SB(sb)->attr_tree)
+ hfsplus_delete_all_attrs(dir, cnid);
+ }
+
out:
hfs_find_exit(&fd);
@@ -363,7 +371,7 @@ int hfsplus_rename_cat(u32 cnid,
/* find the old dir entry and read the data */
hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
- err = hfs_brec_find(&src_fd);
+ err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) {
@@ -376,7 +384,7 @@ int hfsplus_rename_cat(u32 cnid,
/* create new dir entry with the data from the old entry */
hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
- err = hfs_brec_find(&dst_fd);
+ err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
err = -EEXIST;
@@ -391,7 +399,7 @@ int hfsplus_rename_cat(u32 cnid,
/* finally remove the old entry */
hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
- err = hfs_brec_find(&src_fd);
+ err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
err = hfs_brec_remove(&src_fd);
@@ -402,7 +410,7 @@ int hfsplus_rename_cat(u32 cnid,
/* remove old thread entry */
hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
- err = hfs_brec_find(&src_fd);
+ err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
if (err)
goto out;
type = hfs_bnode_read_u16(src_fd.bnode, src_fd.entryoffset);
@@ -414,7 +422,7 @@ int hfsplus_rename_cat(u32 cnid,
hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
dst_dir->i_ino, dst_name);
- err = hfs_brec_find(&dst_fd);
+ err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
if (err != -ENOENT) {
if (!err)
err = -EEXIST;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 6b9f921ef2fa..031c24e50521 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -15,6 +15,7 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
+#include "xattr.h"
static inline void hfsplus_instantiate(struct dentry *dentry,
struct inode *inode, u32 cnid)
@@ -122,7 +123,7 @@ fail:
static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
int len, err;
char strbuf[HFSPLUS_MAX_STRLEN + 1];
@@ -138,7 +139,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (err)
return err;
hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
- err = hfs_brec_find(&fd);
+ err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
goto out;
@@ -421,6 +422,15 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
if (res)
goto out_err;
+ res = hfsplus_init_inode_security(inode, dir, &dentry->d_name);
+ if (res == -EOPNOTSUPP)
+ res = 0; /* Operation is not supported. */
+ else if (res) {
+ /* Try to delete anyway without error analysis. */
+ hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
+ goto out_err;
+ }
+
hfsplus_instantiate(dentry, inode, inode->i_ino);
mark_inode_dirty(inode);
goto out;
@@ -450,15 +460,26 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
init_special_inode(inode, mode, rdev);
res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
- if (res) {
- clear_nlink(inode);
- hfsplus_delete_inode(inode);
- iput(inode);
- goto out;
+ if (res)
+ goto failed_mknod;
+
+ res = hfsplus_init_inode_security(inode, dir, &dentry->d_name);
+ if (res == -EOPNOTSUPP)
+ res = 0; /* Operation is not supported. */
+ else if (res) {
+ /* Try to delete anyway without error analysis. */
+ hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
+ goto failed_mknod;
}
hfsplus_instantiate(dentry, inode, inode->i_ino);
mark_inode_dirty(inode);
+ goto out;
+
+failed_mknod:
+ clear_nlink(inode);
+ hfsplus_delete_inode(inode);
+ iput(inode);
out:
mutex_unlock(&sbi->vh_mutex);
return res;
@@ -499,15 +520,19 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
}
const struct inode_operations hfsplus_dir_inode_operations = {
- .lookup = hfsplus_lookup,
- .create = hfsplus_create,
- .link = hfsplus_link,
- .unlink = hfsplus_unlink,
- .mkdir = hfsplus_mkdir,
- .rmdir = hfsplus_rmdir,
- .symlink = hfsplus_symlink,
- .mknod = hfsplus_mknod,
- .rename = hfsplus_rename,
+ .lookup = hfsplus_lookup,
+ .create = hfsplus_create,
+ .link = hfsplus_link,
+ .unlink = hfsplus_unlink,
+ .mkdir = hfsplus_mkdir,
+ .rmdir = hfsplus_rmdir,
+ .symlink = hfsplus_symlink,
+ .mknod = hfsplus_mknod,
+ .rename = hfsplus_rename,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = hfsplus_listxattr,
+ .removexattr = hfsplus_removexattr,
};
const struct file_operations hfsplus_dir_operations = {
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 5849e3ef35cc..a94f0f779d5e 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -95,7 +95,7 @@ static void __hfsplus_ext_write_extent(struct inode *inode,
HFSPLUS_IS_RSRC(inode) ?
HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
- res = hfs_brec_find(fd);
+ res = hfs_brec_find(fd, hfs_find_rec_by_key);
if (hip->extent_state & HFSPLUS_EXT_NEW) {
if (res != -ENOENT)
return;
@@ -154,7 +154,7 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
hfsplus_ext_build_key(fd->search_key, cnid, block, type);
fd->key->ext.cnid = 0;
- res = hfs_brec_find(fd);
+ res = hfs_brec_find(fd, hfs_find_rec_by_key);
if (res && res != -ENOENT)
return res;
if (fd->key->ext.cnid != fd->search_key->ext.cnid ||
@@ -329,6 +329,7 @@ static int hfsplus_free_extents(struct super_block *sb,
{
u32 count, start;
int i;
+ int err = 0;
hfsplus_dump_extent(extent);
for (i = 0; i < 8; extent++, i++) {
@@ -345,18 +346,33 @@ found:
for (;;) {
start = be32_to_cpu(extent->start_block);
if (count <= block_nr) {
- hfsplus_block_free(sb, start, count);
+ err = hfsplus_block_free(sb, start, count);
+ if (err) {
+ printk(KERN_ERR "hfs: can't free extent\n");
+ dprint(DBG_EXTENT, " start: %u count: %u\n",
+ start, count);
+ }
extent->block_count = 0;
extent->start_block = 0;
block_nr -= count;
} else {
count -= block_nr;
- hfsplus_block_free(sb, start + count, block_nr);
+ err = hfsplus_block_free(sb, start + count, block_nr);
+ if (err) {
+ printk(KERN_ERR "hfs: can't free extent\n");
+ dprint(DBG_EXTENT, " start: %u count: %u\n",
+ start, count);
+ }
extent->block_count = cpu_to_be32(count);
block_nr = 0;
}
- if (!block_nr || !i)
- return 0;
+ if (!block_nr || !i) {
+ /*
+ * Try to free all extents and
+ * return only last error
+ */
+ return err;
+ }
i--;
extent--;
count = be32_to_cpu(extent->block_count);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index c571de224b15..05b11f36024c 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -23,6 +23,7 @@
#define DBG_SUPER 0x00000010
#define DBG_EXTENT 0x00000020
#define DBG_BITMAP 0x00000040
+#define DBG_ATTR_MOD 0x00000080
#if 0
#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
@@ -46,6 +47,13 @@ typedef int (*btree_keycmp)(const hfsplus_btree_key *,
#define NODE_HASH_SIZE 256
+/* B-tree mutex nested subclasses */
+enum hfsplus_btree_mutex_classes {
+ CATALOG_BTREE_MUTEX,
+ EXTENTS_BTREE_MUTEX,
+ ATTR_BTREE_MUTEX,
+};
+
/* An HFS+ BTree held in memory */
struct hfs_btree {
struct super_block *sb;
@@ -223,6 +231,7 @@ struct hfsplus_inode_info {
#define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */
#define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */
#define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */
+#define HFSPLUS_I_ATTR_DIRTY 4 /* has changes in the attributes tree */
#define HFSPLUS_IS_RSRC(inode) \
test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags)
@@ -302,7 +311,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
#define hfs_brec_remove hfsplus_brec_remove
#define hfs_find_init hfsplus_find_init
#define hfs_find_exit hfsplus_find_exit
-#define __hfs_brec_find __hplusfs_brec_find
+#define __hfs_brec_find __hfsplus_brec_find
#define hfs_brec_find hfsplus_brec_find
#define hfs_brec_read hfsplus_brec_read
#define hfs_brec_goto hfsplus_brec_goto
@@ -324,10 +333,33 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
*/
#define HFSPLUS_IOC_BLESS _IO('h', 0x80)
+typedef int (*search_strategy_t)(struct hfs_bnode *,
+ struct hfs_find_data *,
+ int *, int *, int *);
+
/*
* Functions in any *.c used in other files
*/
+/* attributes.c */
+int hfsplus_create_attr_tree_cache(void);
+void hfsplus_destroy_attr_tree_cache(void);
+hfsplus_attr_entry *hfsplus_alloc_attr_entry(void);
+void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p);
+int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *,
+ const hfsplus_btree_key *);
+int hfsplus_attr_build_key(struct super_block *, hfsplus_btree_key *,
+ u32, const char *);
+void hfsplus_attr_build_key_uni(hfsplus_btree_key *key,
+ u32 cnid,
+ struct hfsplus_attr_unistr *name);
+int hfsplus_find_attr(struct super_block *, u32,
+ const char *, struct hfs_find_data *);
+int hfsplus_attr_exists(struct inode *inode, const char *name);
+int hfsplus_create_attr(struct inode *, const char *, const void *, size_t);
+int hfsplus_delete_attr(struct inode *, const char *);
+int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid);
+
/* bitmap.c */
int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *);
int hfsplus_block_free(struct super_block *, u32, u32);
@@ -335,7 +367,7 @@ int hfsplus_block_free(struct super_block *, u32, u32);
/* btree.c */
struct hfs_btree *hfs_btree_open(struct super_block *, u32);
void hfs_btree_close(struct hfs_btree *);
-void hfs_btree_write(struct hfs_btree *);
+int hfs_btree_write(struct hfs_btree *);
struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *);
void hfs_bmap_free(struct hfs_bnode *);
@@ -369,8 +401,15 @@ int hfs_brec_remove(struct hfs_find_data *);
/* bfind.c */
int hfs_find_init(struct hfs_btree *, struct hfs_find_data *);
void hfs_find_exit(struct hfs_find_data *);
-int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *);
-int hfs_brec_find(struct hfs_find_data *);
+int hfs_find_1st_rec_by_cnid(struct hfs_bnode *,
+ struct hfs_find_data *,
+ int *, int *, int *);
+int hfs_find_rec_by_key(struct hfs_bnode *,
+ struct hfs_find_data *,
+ int *, int *, int *);
+int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *,
+ search_strategy_t);
+int hfs_brec_find(struct hfs_find_data *, search_strategy_t);
int hfs_brec_read(struct hfs_find_data *, void *, int);
int hfs_brec_goto(struct hfs_find_data *, int);
@@ -417,11 +456,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
/* ioctl.c */
long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags);
-ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
- void *value, size_t size);
-ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* options.c */
int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
@@ -446,7 +480,7 @@ int hfsplus_strcmp(const struct hfsplus_unistr *,
int hfsplus_uni2asc(struct super_block *,
const struct hfsplus_unistr *, char *, int *);
int hfsplus_asc2uni(struct super_block *,
- struct hfsplus_unistr *, const char *, int);
+ struct hfsplus_unistr *, int, const char *, int);
int hfsplus_hash_dentry(const struct dentry *dentry,
const struct inode *inode, struct qstr *str);
int hfsplus_compare_dentry(const struct dentry *parent,
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 921967e5abb1..452ede01b036 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -52,13 +52,23 @@
typedef __be32 hfsplus_cnid;
typedef __be16 hfsplus_unichr;
+#define HFSPLUS_MAX_STRLEN 255
+#define HFSPLUS_ATTR_MAX_STRLEN 127
+
/* A "string" as used in filenames, etc. */
struct hfsplus_unistr {
__be16 length;
- hfsplus_unichr unicode[255];
+ hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN];
} __packed;
-#define HFSPLUS_MAX_STRLEN 255
+/*
+ * A "string" is used in attributes file
+ * for name of extended attribute
+ */
+struct hfsplus_attr_unistr {
+ __be16 length;
+ hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN];
+} __packed;
/* POSIX permissions */
struct hfsplus_perm {
@@ -291,6 +301,8 @@ struct hfsplus_cat_file {
/* File attribute bits */
#define HFSPLUS_FILE_LOCKED 0x0001
#define HFSPLUS_FILE_THREAD_EXISTS 0x0002
+#define HFSPLUS_XATTR_EXISTS 0x0004
+#define HFSPLUS_ACL_EXISTS 0x0008
/* HFS+ catalog thread (part of a cat_entry) */
struct hfsplus_cat_thread {
@@ -327,11 +339,63 @@ struct hfsplus_ext_key {
#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key)
+#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo"
+#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security"
+
+#define HFSPLUS_ATTR_INLINE_DATA 0x10
+#define HFSPLUS_ATTR_FORK_DATA 0x20
+#define HFSPLUS_ATTR_EXTENTS 0x30
+
+/* HFS+ attributes tree key */
+struct hfsplus_attr_key {
+ __be16 key_len;
+ __be16 pad;
+ hfsplus_cnid cnid;
+ __be32 start_block;
+ struct hfsplus_attr_unistr key_name;
+} __packed;
+
+#define HFSPLUS_ATTR_KEYLEN sizeof(struct hfsplus_attr_key)
+
+/* HFS+ fork data attribute */
+struct hfsplus_attr_fork_data {
+ __be32 record_type;
+ __be32 reserved;
+ struct hfsplus_fork_raw the_fork;
+} __packed;
+
+/* HFS+ extension attribute */
+struct hfsplus_attr_extents {
+ __be32 record_type;
+ __be32 reserved;
+ struct hfsplus_extent extents;
+} __packed;
+
+#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802
+
+/* HFS+ attribute inline data */
+struct hfsplus_attr_inline_data {
+ __be32 record_type;
+ __be32 reserved1;
+ u8 reserved2[6];
+ __be16 length;
+ u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE];
+} __packed;
+
+/* A data record in the attributes tree */
+typedef union {
+ __be32 record_type;
+ struct hfsplus_attr_fork_data fork_data;
+ struct hfsplus_attr_extents extents;
+ struct hfsplus_attr_inline_data inline_data;
+} __packed hfsplus_attr_entry;
+
/* HFS+ generic BTree key */
typedef union {
__be16 key_len;
struct hfsplus_cat_key cat;
struct hfsplus_ext_key ext;
+ struct hfsplus_attr_key attr;
} __packed hfsplus_btree_key;
#endif
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 2172aa5976f5..160ccc9cdb4b 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -17,6 +17,7 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
+#include "xattr.h"
static int hfsplus_readpage(struct file *file, struct page *page)
{
@@ -28,6 +29,16 @@ static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, hfsplus_get_block, wbc);
}
+/*
+ * Clean up after a write that failed while trying to reach offset @to.
+ * When @to lies beyond the current i_size, truncate_pagecache() is
+ * called for the range between @to and i_size and
+ * hfsplus_file_truncate() is run on the inode; otherwise nothing is
+ * done.
+ */
+static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *host = mapping->host;
+
+	/* The failed write did not go past EOF: nothing to roll back. */
+	if (to <= host->i_size)
+		return;
+
+	truncate_pagecache(host, to, host->i_size);
+	hfsplus_file_truncate(host);
+}
+
static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -38,11 +49,8 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
hfsplus_get_block,
&HFSPLUS_I(mapping->host)->phys_size);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ hfsplus_write_failed(mapping, pos + len);
return ret;
}
@@ -116,7 +124,8 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = file_inode(file)->i_mapping->host;
ssize_t ret;
ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
@@ -131,7 +140,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
- vmtruncate(inode, isize);
+ hfsplus_write_failed(mapping, end);
}
return ret;
@@ -300,10 +309,8 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
inode_dio_wait(inode);
-
- error = vmtruncate(inode, attr->ia_size);
- if (error)
- return error;
+ truncate_setsize(inode, attr->ia_size);
+ hfsplus_file_truncate(inode);
}
setattr_copy(inode, attr);
@@ -342,6 +349,18 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
error = error2;
}
+ if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags)) {
+ if (sbi->attr_tree) {
+ error2 =
+ filemap_write_and_wait(
+ sbi->attr_tree->inode->i_mapping);
+ if (!error)
+ error = error2;
+ } else {
+ printk(KERN_ERR "hfs: sync non-existent attributes tree\n");
+ }
+ }
+
if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
if (!error)
@@ -358,11 +377,11 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
static const struct inode_operations hfsplus_file_inode_operations = {
.lookup = hfsplus_file_lookup,
- .truncate = hfsplus_file_truncate,
.setattr = hfsplus_setattr,
- .setxattr = hfsplus_setxattr,
- .getxattr = hfsplus_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = hfsplus_listxattr,
+ .removexattr = hfsplus_removexattr,
};
static const struct file_operations hfsplus_file_operations = {
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 09addc8615fa..d3ff5cc317d7 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -16,7 +16,6 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/sched.h>
-#include <linux/xattr.h>
#include <asm/uaccess.h>
#include "hfsplus_fs.h"
@@ -59,7 +58,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
unsigned int flags = 0;
@@ -75,7 +74,7 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
unsigned int flags;
int err = 0;
@@ -151,110 +150,3 @@ long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return -ENOTTY;
}
}
-
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
-{
- struct inode *inode = dentry->d_inode;
- struct hfs_find_data fd;
- hfsplus_cat_entry entry;
- struct hfsplus_cat_file *file;
- int res;
-
- if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
- return -EOPNOTSUPP;
-
- res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
- if (res)
- return res;
- res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
- if (res)
- goto out;
- hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
- sizeof(struct hfsplus_cat_file));
- file = &entry.file;
-
- if (!strcmp(name, "hfs.type")) {
- if (size == 4)
- memcpy(&file->user_info.fdType, value, 4);
- else
- res = -ERANGE;
- } else if (!strcmp(name, "hfs.creator")) {
- if (size == 4)
- memcpy(&file->user_info.fdCreator, value, 4);
- else
- res = -ERANGE;
- } else
- res = -EOPNOTSUPP;
- if (!res) {
- hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
- sizeof(struct hfsplus_cat_file));
- hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
- }
-out:
- hfs_find_exit(&fd);
- return res;
-}
-
-ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
- void *value, size_t size)
-{
- struct inode *inode = dentry->d_inode;
- struct hfs_find_data fd;
- hfsplus_cat_entry entry;
- struct hfsplus_cat_file *file;
- ssize_t res = 0;
-
- if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
- return -EOPNOTSUPP;
-
- if (size) {
- res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
- if (res)
- return res;
- res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
- if (res)
- goto out;
- hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
- sizeof(struct hfsplus_cat_file));
- }
- file = &entry.file;
-
- if (!strcmp(name, "hfs.type")) {
- if (size >= 4) {
- memcpy(value, &file->user_info.fdType, 4);
- res = 4;
- } else
- res = size ? -ERANGE : 4;
- } else if (!strcmp(name, "hfs.creator")) {
- if (size >= 4) {
- memcpy(value, &file->user_info.fdCreator, 4);
- res = 4;
- } else
- res = size ? -ERANGE : 4;
- } else
- res = -EOPNOTSUPP;
-out:
- if (size)
- hfs_find_exit(&fd);
- return res;
-}
-
-#define HFSPLUS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type"))
-
-ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
- struct inode *inode = dentry->d_inode;
-
- if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
- return -EOPNOTSUPP;
-
- if (!buffer || !size)
- return HFSPLUS_ATTRLIST_SIZE;
- if (size < HFSPLUS_ATTRLIST_SIZE)
- return -ERANGE;
- strcpy(buffer, "hfs.type");
- strcpy(buffer + sizeof("hfs.type"), "hfs.creator");
-
- return HFSPLUS_ATTRLIST_SIZE;
-}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 811a84d2d964..974c26f96fae 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -20,6 +20,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb);
static void hfsplus_destroy_inode(struct inode *inode);
#include "hfsplus_fs.h"
+#include "xattr.h"
static int hfsplus_system_read_inode(struct inode *inode)
{
@@ -118,6 +119,7 @@ static int hfsplus_system_write_inode(struct inode *inode)
case HFSPLUS_ATTR_CNID:
fork = &vhdr->attr_file;
tree = sbi->attr_tree;
+ break;
default:
return -EIO;
}
@@ -127,8 +129,14 @@ static int hfsplus_system_write_inode(struct inode *inode)
hfsplus_mark_mdb_dirty(inode->i_sb);
}
hfsplus_inode_write_fork(inode, fork);
- if (tree)
- hfs_btree_write(tree);
+ if (tree) {
+ int err = hfs_btree_write(tree);
+ if (err) {
+ printk(KERN_ERR "hfs: b-tree write err: %d, ino %lu\n",
+ err, inode->i_ino);
+ return err;
+ }
+ }
return 0;
}
@@ -185,6 +193,12 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
if (!error)
error = error2;
+ if (sbi->attr_tree) {
+ error2 =
+ filemap_write_and_wait(sbi->attr_tree->inode->i_mapping);
+ if (!error)
+ error = error2;
+ }
error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
if (!error)
error = error2;
@@ -226,6 +240,7 @@ out:
static void delayed_sync_fs(struct work_struct *work)
{
+ int err;
struct hfsplus_sb_info *sbi;
sbi = container_of(work, struct hfsplus_sb_info, sync_work.work);
@@ -234,7 +249,9 @@ static void delayed_sync_fs(struct work_struct *work)
sbi->work_queued = 0;
spin_unlock(&sbi->work_lock);
- hfsplus_sync_fs(sbi->alloc_file->i_sb, 1);
+ err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1);
+ if (err)
+ printk(KERN_ERR "hfs: delayed sync fs err %d\n", err);
}
void hfsplus_mark_mdb_dirty(struct super_block *sb)
@@ -272,6 +289,7 @@ static void hfsplus_put_super(struct super_block *sb)
hfsplus_sync_fs(sb, 1);
}
+ hfs_btree_close(sbi->attr_tree);
hfs_btree_close(sbi->cat_tree);
hfs_btree_close(sbi->ext_tree);
iput(sbi->alloc_file);
@@ -468,12 +486,20 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
printk(KERN_ERR "hfs: failed to load catalog file\n");
goto out_close_ext_tree;
}
+ if (vhdr->attr_file.total_blocks != 0) {
+ sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
+ if (!sbi->attr_tree) {
+ printk(KERN_ERR "hfs: failed to load attributes file\n");
+ goto out_close_cat_tree;
+ }
+ }
+ sb->s_xattr = hfsplus_xattr_handlers;
inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
if (IS_ERR(inode)) {
printk(KERN_ERR "hfs: failed to load allocation file\n");
err = PTR_ERR(inode);
- goto out_close_cat_tree;
+ goto out_close_attr_tree;
}
sbi->alloc_file = inode;
@@ -533,10 +559,27 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
}
err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root,
&str, sbi->hidden_dir);
- mutex_unlock(&sbi->vh_mutex);
- if (err)
+ if (err) {
+ mutex_unlock(&sbi->vh_mutex);
+ goto out_put_hidden_dir;
+ }
+
+ err = hfsplus_init_inode_security(sbi->hidden_dir,
+ root, &str);
+ if (err == -EOPNOTSUPP)
+ err = 0; /* Operation is not supported. */
+ else if (err) {
+ /*
+ * Try to delete anyway without
+ * error analysis.
+ */
+ hfsplus_delete_cat(sbi->hidden_dir->i_ino,
+ root, &str);
+ mutex_unlock(&sbi->vh_mutex);
goto out_put_hidden_dir;
+ }
+ mutex_unlock(&sbi->vh_mutex);
hfsplus_mark_inode_dirty(sbi->hidden_dir,
HFSPLUS_I_CAT_DIRTY);
}
@@ -553,6 +596,8 @@ out_put_root:
sb->s_root = NULL;
out_put_alloc_file:
iput(sbi->alloc_file);
+out_close_attr_tree:
+ hfs_btree_close(sbi->attr_tree);
out_close_cat_tree:
hfs_btree_close(sbi->cat_tree);
out_close_ext_tree:
@@ -626,9 +671,20 @@ static int __init init_hfsplus_fs(void)
hfsplus_init_once);
if (!hfsplus_inode_cachep)
return -ENOMEM;
+ err = hfsplus_create_attr_tree_cache();
+ if (err)
+ goto destroy_inode_cache;
err = register_filesystem(&hfsplus_fs_type);
if (err)
- kmem_cache_destroy(hfsplus_inode_cachep);
+ goto destroy_attr_tree_cache;
+ return 0;
+
+destroy_attr_tree_cache:
+ hfsplus_destroy_attr_tree_cache();
+
+destroy_inode_cache:
+ kmem_cache_destroy(hfsplus_inode_cachep);
+
return err;
}
@@ -641,6 +697,7 @@ static void __exit exit_hfsplus_fs(void)
* destroy cache.
*/
rcu_barrier();
+ hfsplus_destroy_attr_tree_cache();
kmem_cache_destroy(hfsplus_inode_cachep);
}
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index a32998f29f0b..2c2e47dcfdd8 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -295,7 +295,8 @@ static inline u16 *decompose_unichar(wchar_t uc, int *size)
return hfsplus_decompose_table + (off / 4);
}
-int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
+int hfsplus_asc2uni(struct super_block *sb,
+ struct hfsplus_unistr *ustr, int max_unistr_len,
const char *astr, int len)
{
int size, dsize, decompose;
@@ -303,7 +304,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
wchar_t c;
decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
- while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
+ while (outlen < max_unistr_len && len > 0) {
size = asc2unichar(sb, astr, len, &c);
if (decompose)
@@ -311,7 +312,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
else
dstr = NULL;
if (dstr) {
- if (outlen + dsize > HFSPLUS_MAX_STRLEN)
+ if (outlen + dsize > max_unistr_len)
break;
do {
ustr->unicode[outlen++] = cpu_to_be16(*dstr++);
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
new file mode 100644
index 000000000000..e8a4b0815c61
--- /dev/null
+++ b/fs/hfsplus/xattr.c
@@ -0,0 +1,709 @@
+/*
+ * linux/fs/hfsplus/xattr.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Logic of processing extended attributes
+ */
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+const struct xattr_handler *hfsplus_xattr_handlers[] = { /* NULL-terminated
+  * handler table; hfsplus_fill_super() stores it in sb->s_xattr */
+ &hfsplus_xattr_osx_handler, /* defined at the end of this file */
+ &hfsplus_xattr_user_handler,
+ &hfsplus_xattr_trusted_handler,
+ &hfsplus_xattr_security_handler,
+ NULL /* end-of-table marker */
+};
+
+/*
+ * Compare @name with HFSPLUS_XATTR_FINDER_INFO_NAME.
+ * The comparison length is the sizeof() of the literal, so the
+ * terminating NUL takes part and only an exact match yields 0.
+ * Returns the strncmp() result, or -1 when @name is NULL.
+ */
+static int strcmp_xattr_finder_info(const char *name)
+{
+	if (!name)
+		return -1;
+
+	return strncmp(name, HFSPLUS_XATTR_FINDER_INFO_NAME,
+		       sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME));
+}
+
+/*
+ * Compare @name with HFSPLUS_XATTR_ACL_NAME.
+ * The comparison length is the sizeof() of the literal, so the
+ * terminating NUL takes part and only an exact match yields 0.
+ * Returns the strncmp() result, or -1 when @name is NULL.
+ */
+static int strcmp_xattr_acl(const char *name)
+{
+	if (!name)
+		return -1;
+
+	return strncmp(name, HFSPLUS_XATTR_ACL_NAME,
+		       sizeof(HFSPLUS_XATTR_ACL_NAME));
+}
+
+/*
+ * Tell whether @name starts with one of the system, user, security or
+ * trusted namespace prefixes (checked in that order).
+ * Returns true if it does, false otherwise.
+ */
+static inline int is_known_namespace(const char *name)
+{
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return true;
+	if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+		return true;
+	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+		return true;
+	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+		return true;
+
+	return false;
+}
+
+/*
+ * Decide from its name alone whether an attribute may be set/removed.
+ * @inode, @value and @value_len are accepted but not examined.
+ *
+ * Returns 0 when the name is acceptable and -EOPNOTSUPP when it is in
+ * the system namespace, hides a known namespace behind the OS X prefix,
+ * or belongs to no namespace handled here.
+ */
+static int can_set_xattr(struct inode *inode, const char *name,
+				const void *value, size_t value_len)
+{
+	const char *suffix;
+
+	if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0)
+		return -EOPNOTSUPP; /* TODO: implement ACL support */
+
+	if (strncmp(name, XATTR_MAC_OSX_PREFIX,
+		    XATTR_MAC_OSX_PREFIX_LEN) == 0) {
+		/*
+		 * Refuse names that smuggle another namespace in behind
+		 * the "osx." prefix.
+		 */
+		suffix = name + XATTR_MAC_OSX_PREFIX_LEN;
+		return is_known_namespace(suffix) ? -EOPNOTSUPP : 0;
+	}
+
+	/* Outside "osx.", only trusted, security and user are allowed. */
+	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+	    !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+		return 0;
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * __hfsplus_setxattr - create or replace an extended attribute
+ * @inode: regular file or directory; resource forks are rejected
+ * @name: full attribute name; a leading "osx." prefix is stripped
+ * @value: attribute value, NULL is handled as an empty value
+ * @size: length of @value in bytes
+ * @flags: XATTR_CREATE and/or XATTR_REPLACE, or 0
+ *
+ * The Finder Info attribute is stored inside the catalog record and is
+ * rewritten in place. Every other attribute is stored in the attributes
+ * tree, after which the HFSPLUS_XATTR_EXISTS (and, for the ACL
+ * attribute, HFSPLUS_ACL_EXISTS) bit is set in the catalog record.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EOPNOTSUPP  unsupported inode or namespace, or no attributes tree
+ *   -EEXIST      XATTR_CREATE was given but the attribute exists
+ *   -ENODATA     XATTR_REPLACE was given but the attribute is missing
+ *   -ERANGE      Finder Info value of the wrong size
+ *   -EIO         catalog record is neither a folder nor a file
+ * or whatever the catalog/attribute helpers report.
+ */
+int __hfsplus_setxattr(struct inode *inode, const char *name,
+			const void *value, size_t size, int flags)
+{
+	int err = 0;
+	struct hfs_find_data cat_fd;
+	hfsplus_cat_entry entry;
+	u16 cat_entry_flags, cat_entry_type;
+	u16 folder_finderinfo_len = sizeof(struct DInfo) +
+					sizeof(struct DXInfo);
+	u16 file_finderinfo_len = sizeof(struct FInfo) +
+					sizeof(struct FXInfo);
+
+	if ((!S_ISREG(inode->i_mode) &&
+			!S_ISDIR(inode->i_mode)) ||
+				HFSPLUS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	err = can_set_xattr(inode, name, value, size);
+	if (err)
+		return err;
+
+	if (strncmp(name, XATTR_MAC_OSX_PREFIX,
+				XATTR_MAC_OSX_PREFIX_LEN) == 0)
+		name += XATTR_MAC_OSX_PREFIX_LEN;
+
+	if (value == NULL) {
+		value = "";
+		size = 0;
+	}
+
+	err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
+	if (err) {
+		printk(KERN_ERR "hfs: can't init xattr find struct\n");
+		return err;
+	}
+
+	err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd);
+	if (err) {
+		printk(KERN_ERR "hfs: catalog searching failed\n");
+		goto end_setxattr;
+	}
+
+	if (!strcmp_xattr_finder_info(name)) {
+		/*
+		 * Finder Info is part of every catalog record, so from the
+		 * caller's point of view it always exists already.
+		 */
+		if (flags & XATTR_CREATE) {
+			printk(KERN_ERR "hfs: xattr exists yet\n");
+			err = -EEXIST;
+			goto end_setxattr;
+		}
+		hfs_bnode_read(cat_fd.bnode, &entry, cat_fd.entryoffset,
+					sizeof(hfsplus_cat_entry));
+		if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) {
+			if (size == folder_finderinfo_len) {
+				memcpy(&entry.folder.user_info, value,
+						folder_finderinfo_len);
+				hfs_bnode_write(cat_fd.bnode, &entry,
+					cat_fd.entryoffset,
+					sizeof(struct hfsplus_cat_folder));
+				hfsplus_mark_inode_dirty(inode,
+						HFSPLUS_I_CAT_DIRTY);
+			} else {
+				err = -ERANGE;
+				goto end_setxattr;
+			}
+		} else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) {
+			if (size == file_finderinfo_len) {
+				memcpy(&entry.file.user_info, value,
+						file_finderinfo_len);
+				hfs_bnode_write(cat_fd.bnode, &entry,
+					cat_fd.entryoffset,
+					sizeof(struct hfsplus_cat_file));
+				hfsplus_mark_inode_dirty(inode,
+						HFSPLUS_I_CAT_DIRTY);
+			} else {
+				err = -ERANGE;
+				goto end_setxattr;
+			}
+		} else {
+			err = -EOPNOTSUPP;
+			goto end_setxattr;
+		}
+		goto end_setxattr;
+	}
+
+	if (!HFSPLUS_SB(inode->i_sb)->attr_tree) {
+		err = -EOPNOTSUPP;
+		goto end_setxattr;
+	}
+
+	if (hfsplus_attr_exists(inode, name)) {
+		/* setxattr(2): XATTR_CREATE on an existing name is EEXIST. */
+		if (flags & XATTR_CREATE) {
+			printk(KERN_ERR "hfs: xattr exists yet\n");
+			err = -EEXIST;
+			goto end_setxattr;
+		}
+		/*
+		 * Replace = delete + create. The two steps are separate, so
+		 * a failing create leaves the attribute deleted.
+		 */
+		err = hfsplus_delete_attr(inode, name);
+		if (err)
+			goto end_setxattr;
+		err = hfsplus_create_attr(inode, name, value, size);
+		if (err)
+			goto end_setxattr;
+	} else {
+		/* setxattr(2): XATTR_REPLACE on a missing name is ENODATA. */
+		if (flags & XATTR_REPLACE) {
+			printk(KERN_ERR "hfs: cannot replace xattr\n");
+			err = -ENODATA;
+			goto end_setxattr;
+		}
+		err = hfsplus_create_attr(inode, name, value, size);
+		if (err)
+			goto end_setxattr;
+	}
+
+	/* Record in the catalog entry that the object now has xattrs. */
+	cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset);
+	if (cat_entry_type == HFSPLUS_FOLDER) {
+		cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode,
+				cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, flags));
+		cat_entry_flags |= HFSPLUS_XATTR_EXISTS;
+		if (!strcmp_xattr_acl(name))
+			cat_entry_flags |= HFSPLUS_ACL_EXISTS;
+		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, flags),
+				cat_entry_flags);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	} else if (cat_entry_type == HFSPLUS_FILE) {
+		cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode,
+				cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_file, flags));
+		cat_entry_flags |= HFSPLUS_XATTR_EXISTS;
+		if (!strcmp_xattr_acl(name))
+			cat_entry_flags |= HFSPLUS_ACL_EXISTS;
+		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_file, flags),
+				cat_entry_flags);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	} else {
+		printk(KERN_ERR "hfs: invalid catalog entry type\n");
+		err = -EIO;
+		goto end_setxattr;
+	}
+
+end_setxattr:
+	hfs_find_exit(&cat_fd);
+	return err;
+}
+
+/*
+ * A name that carries none of the known namespace prefixes is treated
+ * as an OS X attribute. Returns 1 for such names and 0 otherwise.
+ */
+static inline int is_osx_xattr(const char *xattr_name)
+{
+	if (is_known_namespace(xattr_name))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Number of bytes the attribute name occupies in a listxattr buffer:
+ * the name itself, its terminating NUL, and the "osx." prefix when the
+ * name is an OS X attribute.
+ */
+static int name_len(const char *xattr_name, int xattr_name_len)
+{
+	int total = xattr_name_len + 1;
+
+	if (!is_osx_xattr(xattr_name))
+		return total;
+
+	total += XATTR_MAC_OSX_PREFIX_LEN;
+	return total;
+}
+
+/*
+ * Write the attribute name into @buffer in listxattr form: the "osx."
+ * prefix for OS X attributes, then up to @name_len characters of
+ * @xattr_name, then a NUL byte.
+ * Returns the number of bytes accounted for, terminator included.
+ */
+static int copy_name(char *buffer, const char *xattr_name, int name_len)
+{
+	char *dst = buffer;
+	int written = name_len;
+
+	if (is_osx_xattr(xattr_name)) {
+		strncpy(dst, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
+		dst += XATTR_MAC_OSX_PREFIX_LEN;
+		written += XATTR_MAC_OSX_PREFIX_LEN;
+	}
+
+	strncpy(dst, xattr_name, name_len);
+	dst[name_len] = 0;
+
+	return written + 1;
+}
+
+static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry,
+ void *value, size_t size)
+{
+ ssize_t res = 0;
+ struct inode *inode = dentry->d_inode;
+ struct hfs_find_data fd;
+ u16 entry_type;
+ u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo);
+ u16 file_rec_len = sizeof(struct FInfo) + sizeof(struct FXInfo);
+ u16 record_len = max(folder_rec_len, file_rec_len);
+ u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)];
+ u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)];
+ /*
+  * Copy the Finder Info (the user_info area of the inode's catalog
+  * record) into @value and return its length.
+  * The catalog is searched only when @size can hold the larger of the
+  * folder/file layouts; otherwise size == 0 returns that length as a
+  * size query and any other size returns -ERANGE.
+  */
+ if (size >= record_len) {
+ res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+ if (res) {
+ printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ return res; /* fd was not set up, so no hfs_find_exit() */
+ }
+ res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+ if (res)
+ goto end_getxattr_finder_info;
+ entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset);
+
+ if (entry_type == HFSPLUS_FOLDER) {
+ hfs_bnode_read(fd.bnode, folder_finder_info,
+ fd.entryoffset +
+ offsetof(struct hfsplus_cat_folder, user_info),
+ folder_rec_len);
+ memcpy(value, folder_finder_info, folder_rec_len);
+ res = folder_rec_len;
+ } else if (entry_type == HFSPLUS_FILE) {
+ hfs_bnode_read(fd.bnode, file_finder_info,
+ fd.entryoffset +
+ offsetof(struct hfsplus_cat_file, user_info),
+ file_rec_len);
+ memcpy(value, file_finder_info, file_rec_len);
+ res = file_rec_len;
+ } else {
+ res = -EOPNOTSUPP; /* neither a folder nor a file record */
+ goto end_getxattr_finder_info;
+ }
+ } else
+ res = size ? -ERANGE : record_len;
+
+end_getxattr_finder_info:
+ if (size >= record_len) /* mirrors the condition guarding hfs_find_init() */
+ hfs_find_exit(&fd);
+ return res;
+}
+
+ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size)
+{
+ struct inode *inode = dentry->d_inode;
+ struct hfs_find_data fd;
+ hfsplus_attr_entry *entry;
+ __be32 xattr_record_type;
+ u32 record_type;
+ u16 record_length = 0;
+ ssize_t res = 0;
+ /*
+  * Read extended attribute @name of the dentry's inode into @value.
+  * Returns the value length on success. With size == 0 the length is
+  * returned without copying; a non-zero @size that is too small gives
+  * -ERANGE. Other errors: -EOPNOTSUPP (unsupported inode, forbidden
+  * name, no attributes tree, non-inline record), -ENOMEM, -ENODATA
+  * (attribute not found), -EIO (bad record).
+  */
+ if ((!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode)) ||
+ HFSPLUS_IS_RSRC(inode))
+ return -EOPNOTSUPP;
+
+ if (strncmp(name, XATTR_MAC_OSX_PREFIX,
+ XATTR_MAC_OSX_PREFIX_LEN) == 0) {
+ /* skip "osx." prefix */
+ name += XATTR_MAC_OSX_PREFIX_LEN;
+ /*
+ * Don't allow retrieving properly prefixed attributes
+ * by prepending them with "osx."
+ */
+ if (is_known_namespace(name))
+ return -EOPNOTSUPP;
+ }
+
+ if (!strcmp_xattr_finder_info(name)) /* served from the catalog record */
+ return hfsplus_getxattr_finder_info(dentry, value, size);
+
+ if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
+ return -EOPNOTSUPP;
+
+ entry = hfsplus_alloc_attr_entry();
+ if (!entry) {
+ printk(KERN_ERR "hfs: can't allocate xattr entry\n");
+ return -ENOMEM;
+ }
+
+ res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd);
+ if (res) {
+ printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ goto failed_getxattr_init; /* skip hfs_find_exit(), free entry only */
+ }
+
+ res = hfsplus_find_attr(inode->i_sb, inode->i_ino, name, &fd);
+ if (res) {
+ if (res == -ENOENT)
+ res = -ENODATA; /* "no such attribute" for the xattr API */
+ else
+ printk(KERN_ERR "hfs: xattr searching failed\n");
+ goto out;
+ }
+ /* Peek at the record type before reading the whole record. */
+ hfs_bnode_read(fd.bnode, &xattr_record_type,
+ fd.entryoffset, sizeof(xattr_record_type));
+ record_type = be32_to_cpu(xattr_record_type);
+ if (record_type == HFSPLUS_ATTR_INLINE_DATA) {
+ record_length = hfs_bnode_read_u16(fd.bnode,
+ fd.entryoffset +
+ offsetof(struct hfsplus_attr_inline_data,
+ length));
+ if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) {
+ printk(KERN_ERR "hfs: invalid xattr record size\n");
+ res = -EIO;
+ goto out;
+ }
+ } else if (record_type == HFSPLUS_ATTR_FORK_DATA ||
+ record_type == HFSPLUS_ATTR_EXTENTS) {
+ printk(KERN_ERR "hfs: only inline data xattr are supported\n");
+ res = -EOPNOTSUPP;
+ goto out;
+ } else {
+ printk(KERN_ERR "hfs: invalid xattr record\n");
+ res = -EIO;
+ goto out;
+ }
+ /* The record body is fetched only when the caller supplied a buffer. */
+ if (size) {
+ hfs_bnode_read(fd.bnode, entry, fd.entryoffset,
+ offsetof(struct hfsplus_attr_inline_data,
+ raw_bytes) + record_length);
+ }
+
+ if (size >= record_length) {
+ memcpy(value, entry->inline_data.raw_bytes, record_length);
+ res = record_length;
+ } else
+ res = size ? -ERANGE : record_length;
+
+out:
+ hfs_find_exit(&fd);
+
+failed_getxattr_init:
+ hfsplus_destroy_attr_entry(entry);
+ return res;
+}
+
+/*
+ * Decide whether an attribute name may be reported by listxattr.
+ * A NULL name is never listed. Names outside the trusted namespace are
+ * always listed; trusted names only when the caller has CAP_SYS_ADMIN.
+ * Returns 1 when the name may be listed and 0 otherwise.
+ */
+static inline int can_list(const char *xattr_name)
+{
+	if (!xattr_name)
+		return 0;
+
+	if (strncmp(xattr_name, XATTR_TRUSTED_PREFIX,
+		    XATTR_TRUSTED_PREFIX_LEN) != 0)
+		return 1;
+
+	return capable(CAP_SYS_ADMIN) ? 1 : 0;
+}
+
+static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry,
+ char *buffer, size_t size)
+{
+ ssize_t res = 0;
+ struct inode *inode = dentry->d_inode;
+ struct hfs_find_data fd;
+ u16 entry_type;
+ u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)];
+ u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)];
+ unsigned long len, found_bit;
+ int xattr_name_len, symbols_count;
+ /*
+  * Report the Finder Info attribute name for listxattr, but only when
+  * the Finder Info area of the catalog record has at least one bit
+  * set. Returns 0 when nothing is listed, the number of bytes needed
+  * or written for the name otherwise, or a negative errno.
+  */
+ res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+ if (res) {
+ printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ return res;
+ }
+
+ res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+ if (res)
+ goto end_listxattr_finder_info;
+
+ entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset);
+ if (entry_type == HFSPLUS_FOLDER) {
+ len = sizeof(struct DInfo) + sizeof(struct DXInfo);
+ hfs_bnode_read(fd.bnode, folder_finder_info,
+ fd.entryoffset +
+ offsetof(struct hfsplus_cat_folder, user_info),
+ len);
+ found_bit = find_first_bit((void *)folder_finder_info, len*8); /*
+  * NOTE(review): a u8 array is scanned as a bitmap here; confirm
+  * its alignment and size suit find_first_bit() */
+ } else if (entry_type == HFSPLUS_FILE) {
+ len = sizeof(struct FInfo) + sizeof(struct FXInfo);
+ hfs_bnode_read(fd.bnode, file_finder_info,
+ fd.entryoffset +
+ offsetof(struct hfsplus_cat_file, user_info),
+ len);
+ found_bit = find_first_bit((void *)file_finder_info, len*8);
+ } else {
+ res = -EOPNOTSUPP;
+ goto end_listxattr_finder_info;
+ }
+
+ if (found_bit >= (len*8)) /* no bit set: Finder Info is all zeroes */
+ res = 0;
+ else {
+ symbols_count = sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME) - 1;
+ xattr_name_len =
+ name_len(HFSPLUS_XATTR_FINDER_INFO_NAME, symbols_count);
+ if (!buffer || !size) { /* size query: report the space needed */
+ if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME))
+ res = xattr_name_len;
+ } else if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) {
+ if (size < xattr_name_len)
+ res = -ERANGE;
+ else {
+ res = copy_name(buffer,
+ HFSPLUS_XATTR_FINDER_INFO_NAME,
+ symbols_count);
+ }
+ }
+ }
+
+end_listxattr_finder_info:
+ hfs_find_exit(&fd);
+
+ return res;
+}
+
+ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ ssize_t err;
+ ssize_t res = 0; /* running byte count; also the write offset in buffer */
+ struct inode *inode = dentry->d_inode;
+ struct hfs_find_data fd;
+ u16 key_len = 0;
+ struct hfsplus_attr_key attr_key;
+ char strbuf[HFSPLUS_ATTR_MAX_STRLEN +
+ XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
+ int xattr_name_len;
+ /*
+  * List the attribute names of the dentry's inode as NUL-terminated
+  * strings in @buffer: first the Finder Info name (if reported), then
+  * one name per attributes-tree record of this inode.
+  * With a NULL @buffer or size == 0 only the required size is
+  * computed. Returns the byte count or a negative errno.
+  */
+ if ((!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode)) ||
+ HFSPLUS_IS_RSRC(inode))
+ return -EOPNOTSUPP;
+
+ res = hfsplus_listxattr_finder_info(dentry, buffer, size);
+ if (res < 0)
+ return res;
+ else if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
+ return (res == 0) ? -EOPNOTSUPP : res; /* no tree: only Finder Info can be listed */
+
+ err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd);
+ if (err) {
+ printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ return err;
+ }
+ /* A NULL name positions fd at the inode's first attribute record. */
+ err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd);
+ if (err) {
+ if (err == -ENOENT) {
+ if (res == 0) /* nothing listed at all, not even Finder Info */
+ res = -ENODATA;
+ goto end_listxattr;
+ } else {
+ res = err;
+ goto end_listxattr;
+ }
+ }
+
+ for (;;) {
+ key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset);
+ if (key_len == 0 || key_len > fd.tree->max_key_len) {
+ printk(KERN_ERR "hfs: invalid xattr key length: %d\n",
+ key_len);
+ res = -EIO;
+ goto end_listxattr;
+ }
+ /*
+  * NOTE(review): attr_key is a fixed-size stack object; this relies
+  * on max_key_len never exceeding it - confirm where the tree is
+  * opened.
+  */
+ hfs_bnode_read(fd.bnode, &attr_key,
+ fd.keyoffset, key_len + sizeof(key_len));
+
+ if (be32_to_cpu(attr_key.cnid) != inode->i_ino) /* walked past this inode's records */
+ goto end_listxattr;
+
+ xattr_name_len = HFSPLUS_ATTR_MAX_STRLEN;
+ if (hfsplus_uni2asc(inode->i_sb,
+ (const struct hfsplus_unistr *)&fd.key->attr.key_name,
+ strbuf, &xattr_name_len)) {
+ printk(KERN_ERR "hfs: unicode conversion failed\n");
+ res = -EIO;
+ goto end_listxattr;
+ }
+
+ if (!buffer || !size) { /* size query: only add up the lengths */
+ if (can_list(strbuf))
+ res += name_len(strbuf, xattr_name_len);
+ } else if (can_list(strbuf)) {
+ if (size < (res + name_len(strbuf, xattr_name_len))) {
+ res = -ERANGE;
+ goto end_listxattr;
+ } else
+ res += copy_name(buffer + res,
+ strbuf, xattr_name_len);
+ }
+
+ if (hfs_brec_goto(&fd, 1)) /* no next record: done */
+ goto end_listxattr;
+ }
+
+end_listxattr:
+ hfs_find_exit(&fd);
+ return res;
+}
+
+int hfsplus_removexattr(struct dentry *dentry, const char *name)
+{
+ int err = 0;
+ struct inode *inode = dentry->d_inode;
+ struct hfs_find_data cat_fd;
+ u16 flags;
+ u16 cat_entry_type;
+ int is_xattr_acl_deleted = 0;
+ int is_all_xattrs_deleted = 0;
+ /*
+  * Remove extended attribute @name from the dentry's inode and clear
+  * the HFSPLUS_ACL_EXISTS / HFSPLUS_XATTR_EXISTS bits of its catalog
+  * record when they no longer apply.
+  * Returns 0 on success or a negative errno; the Finder Info
+  * attribute cannot be removed (-EOPNOTSUPP).
+  */
+ if ((!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode)) ||
+ HFSPLUS_IS_RSRC(inode))
+ return -EOPNOTSUPP;
+
+ if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
+ return -EOPNOTSUPP;
+
+ err = can_set_xattr(inode, name, NULL, 0); /* same name rules as for setting */
+ if (err)
+ return err;
+
+ if (strncmp(name, XATTR_MAC_OSX_PREFIX,
+ XATTR_MAC_OSX_PREFIX_LEN) == 0)
+ name += XATTR_MAC_OSX_PREFIX_LEN;
+
+ if (!strcmp_xattr_finder_info(name))
+ return -EOPNOTSUPP;
+
+ err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
+ if (err) {
+ printk(KERN_ERR "hfs: can't init xattr find struct\n");
+ return err;
+ }
+
+ err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd);
+ if (err) {
+ printk(KERN_ERR "hfs: catalog searching failed\n");
+ goto end_removexattr;
+ }
+
+ err = hfsplus_delete_attr(inode, name);
+ if (err)
+ goto end_removexattr;
+
+ is_xattr_acl_deleted = !strcmp_xattr_acl(name);
+ is_all_xattrs_deleted = !hfsplus_attr_exists(inode, NULL); /*
+  * presumably a NULL name asks whether any attribute of the inode
+  * is left - confirm against hfsplus_attr_exists() */
+
+ if (!is_xattr_acl_deleted && !is_all_xattrs_deleted) /* flags stay as they are */
+ goto end_removexattr;
+
+ cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset);
+
+ if (cat_entry_type == HFSPLUS_FOLDER) {
+ flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset +
+ offsetof(struct hfsplus_cat_folder, flags));
+ if (is_xattr_acl_deleted)
+ flags &= ~HFSPLUS_ACL_EXISTS;
+ if (is_all_xattrs_deleted)
+ flags &= ~HFSPLUS_XATTR_EXISTS;
+ hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+ offsetof(struct hfsplus_cat_folder, flags),
+ flags);
+ hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+ } else if (cat_entry_type == HFSPLUS_FILE) {
+ flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset +
+ offsetof(struct hfsplus_cat_file, flags));
+ if (is_xattr_acl_deleted)
+ flags &= ~HFSPLUS_ACL_EXISTS;
+ if (is_all_xattrs_deleted)
+ flags &= ~HFSPLUS_XATTR_EXISTS;
+ hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+ offsetof(struct hfsplus_cat_file, flags),
+ flags);
+ hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+ } else {
+ printk(KERN_ERR "hfs: invalid catalog entry type\n");
+ err = -EIO;
+ goto end_removexattr;
+ }
+
+end_removexattr:
+ hfs_find_exit(&cat_fd);
+ return err;
+}
+
+/*
+ * xattr_handler .get callback for the "osx." namespace.
+ * @name arrives without the prefix; the prefix is put back in front of
+ * it and the lookup is handed to hfsplus_getxattr().
+ * Returns -EINVAL for an empty name, -EOPNOTSUPP for a name longer
+ * than HFSPLUS_ATTR_MAX_STRLEN, else the hfsplus_getxattr() result.
+ * @type is not used.
+ */
+static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	char full_name[HFSPLUS_ATTR_MAX_STRLEN +
+				XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
+	size_t suffix_len = strlen(name);
+
+	if (suffix_len == 0)
+		return -EINVAL;
+
+	if (suffix_len > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	/* Prefix first, then the caller's name with its terminator. */
+	memcpy(full_name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
+	memcpy(full_name + XATTR_MAC_OSX_PREFIX_LEN, name, suffix_len + 1);
+
+	return hfsplus_getxattr(dentry, full_name, buffer, size);
+}
+
+/*
+ * xattr_handler .set callback for the "osx." namespace.
+ * @name arrives without the prefix; the prefix is put back in front of
+ * it and the update is handed to hfsplus_setxattr().
+ * Returns -EINVAL for an empty name, -EOPNOTSUPP for a name longer
+ * than HFSPLUS_ATTR_MAX_STRLEN, else the hfsplus_setxattr() result.
+ * @type is not used.
+ */
+static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	char full_name[HFSPLUS_ATTR_MAX_STRLEN +
+				XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
+	size_t suffix_len = strlen(name);
+
+	if (suffix_len == 0)
+		return -EINVAL;
+
+	if (suffix_len > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	/* Prefix first, then the caller's name with its terminator. */
+	memcpy(full_name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
+	memcpy(full_name + XATTR_MAC_OSX_PREFIX_LEN, name, suffix_len + 1);
+
+	return hfsplus_setxattr(dentry, full_name, buffer, size, flags);
+}
+
+static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ /*
+ * This callback is never invoked: listing is done by
+ * hfsplus_listxattr() rather than by generic_listxattr().
+ */
+ return -EOPNOTSUPP; /* converted to an unsigned value by the size_t return type */
+}
+
+const struct xattr_handler hfsplus_xattr_osx_handler = {
+ .prefix = XATTR_MAC_OSX_PREFIX, /* the "osx." namespace */
+ .list = hfsplus_osx_listxattr, /* stub, always -EOPNOTSUPP */
+ .get = hfsplus_osx_getxattr, /* re-adds the prefix, calls hfsplus_getxattr() */
+ .set = hfsplus_osx_setxattr, /* re-adds the prefix, calls hfsplus_setxattr() */
+};
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
new file mode 100644
index 000000000000..847b695b984d
--- /dev/null
+++ b/fs/hfsplus/xattr.h
@@ -0,0 +1,60 @@
+/*
+ * linux/fs/hfsplus/xattr.h
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Logic of processing extended attributes
+ */
+
+#ifndef _LINUX_HFSPLUS_XATTR_H
+#define _LINUX_HFSPLUS_XATTR_H
+
+#include <linux/xattr.h>
+
+extern const struct xattr_handler hfsplus_xattr_osx_handler;
+extern const struct xattr_handler hfsplus_xattr_user_handler;
+extern const struct xattr_handler hfsplus_xattr_trusted_handler;
+/*extern const struct xattr_handler hfsplus_xattr_acl_access_handler;*/
+/*extern const struct xattr_handler hfsplus_xattr_acl_default_handler;*/
+extern const struct xattr_handler hfsplus_xattr_security_handler;
+
+extern const struct xattr_handler *hfsplus_xattr_handlers[];
+
+int __hfsplus_setxattr(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
+
+/* Dentry-based front end for __hfsplus_setxattr(); acts on dentry->d_inode. */
+static inline int hfsplus_setxattr(struct dentry *dentry, const char *name,
+					const void *value, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+
+	return __hfsplus_setxattr(inode, name, value, size, flags);
+}
+
+ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size);
+
+ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+int hfsplus_removexattr(struct dentry *dentry, const char *name);
+
+int hfsplus_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr);
+
+static inline int hfsplus_init_acl(struct inode *inode, struct inode *dir)
+{
+ /* TODO: implement; for now this is a no-op that reports success. */
+ return 0;
+}
+
+/* Run hfsplus_init_acl(), then hfsplus_init_security(); stop on first error. */
+static inline int hfsplus_init_inode_security(struct inode *inode,
+						struct inode *dir,
+						const struct qstr *qstr)
+{
+	int err = hfsplus_init_acl(inode, dir);
+
+	if (err)
+		return err;
+	return hfsplus_init_security(inode, dir, qstr);
+}
+
+#endif
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
new file mode 100644
index 000000000000..83b842f113c5
--- /dev/null
+++ b/fs/hfsplus/xattr_security.c
@@ -0,0 +1,104 @@
+/*
+ * linux/fs/hfsplus/xattr_security.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for storing security labels as extended attributes.
+ */
+
+#include <linux/security.h>
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+/* .get hook: forward to hfsplus_getxattr() as XATTR_SECURITY_PREFIX + name. */
+static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	size_t name_len = strlen(name);
+	char full_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+
+	if (name_len == 0)
+		return -EINVAL;
+	/* Prefix and suffix together must fit in HFSPLUS_ATTR_MAX_STRLEN. */
+	if (XATTR_SECURITY_PREFIX_LEN + name_len > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(full_name, XATTR_SECURITY_PREFIX);
+	strcpy(&full_name[XATTR_SECURITY_PREFIX_LEN], name);
+	return hfsplus_getxattr(dentry, full_name, buffer, size);
+}
+
+/* .set hook: forward to hfsplus_setxattr() as XATTR_SECURITY_PREFIX + name. */
+static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	size_t name_len = strlen(name);
+	char full_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+
+	if (name_len == 0)
+		return -EINVAL;
+	/* Prefix and suffix together must fit in HFSPLUS_ATTR_MAX_STRLEN. */
+	if (XATTR_SECURITY_PREFIX_LEN + name_len > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(full_name, XATTR_SECURITY_PREFIX);
+	strcpy(&full_name[XATTR_SECURITY_PREFIX_LEN], name);
+	return hfsplus_setxattr(dentry, full_name, buffer, size, flags);
+}
+
+static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ /*
+ * This method is not used: hfsplus_listxattr() is used instead of
+ * generic_listxattr(). NOTE(review): the errno is returned as size_t.
+ */
+ return -EOPNOTSUPP;
+}
+
+/*
+ * Callback handed to security_inode_init_security(): store each entry of
+ * @xattr_array on @inode under XATTR_SECURITY_PREFIX + entry name.
+ *
+ * @xattr_array is walked until an entry with a NULL name; entries with an
+ * empty name are skipped. @fs_info is unused.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if a prefixed name would exceed
+ * HFSPLUS_ATTR_MAX_STRLEN, or the first error from __hfsplus_setxattr().
+ */
+static int hfsplus_initxattrs(struct inode *inode,
+				const struct xattr *xattr_array,
+				void *fs_info)
+{
+	const struct xattr *xattr;
+	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+	size_t xattr_name_len;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		xattr_name_len = strlen(xattr->name);
+
+		if (xattr_name_len == 0)
+			continue;
+
+		if (xattr_name_len + XATTR_SECURITY_PREFIX_LEN >
+				HFSPLUS_ATTR_MAX_STRLEN)
+			return -EOPNOTSUPP;
+
+		/*
+		 * The second strcpy() writes the terminating NUL, so the
+		 * buffer is a valid string even when reused across iterations.
+		 */
+		strcpy(xattr_name, XATTR_SECURITY_PREFIX);
+		strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+		err = __hfsplus_setxattr(inode, xattr_name,
+					xattr->value, xattr->value_len, 0);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+/*
+ * Thin wrapper: calls security_inode_init_security() with
+ * hfsplus_initxattrs() as the callback and no fs_info.
+ */
+int hfsplus_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+						hfsplus_initxattrs, NULL);
+}
+
+const struct xattr_handler hfsplus_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX, /* same prefix the get/set hooks prepend */
+ .list = hfsplus_security_listxattr, /* stub: always returns -EOPNOTSUPP */
+ .get = hfsplus_security_getxattr,
+ .set = hfsplus_security_setxattr,
+};
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
new file mode 100644
index 000000000000..426cee277542
--- /dev/null
+++ b/fs/hfsplus/xattr_trusted.c
@@ -0,0 +1,63 @@
+/*
+ * linux/fs/hfsplus/xattr_trusted.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for trusted extended attributes.
+ */
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+/* .get hook: forward to hfsplus_getxattr() as XATTR_TRUSTED_PREFIX + name. */
+static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	size_t name_len = strlen(name);
+	char full_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+
+	if (name_len == 0)
+		return -EINVAL;
+	/* Prefix and suffix together must fit in HFSPLUS_ATTR_MAX_STRLEN. */
+	if (XATTR_TRUSTED_PREFIX_LEN + name_len > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(full_name, XATTR_TRUSTED_PREFIX);
+	strcpy(&full_name[XATTR_TRUSTED_PREFIX_LEN], name);
+	return hfsplus_getxattr(dentry, full_name, buffer, size);
+}
+
+/* .set hook: forward to hfsplus_setxattr() as XATTR_TRUSTED_PREFIX + name. */
+static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	size_t name_len = strlen(name);
+	char full_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+
+	if (name_len == 0)
+		return -EINVAL;
+	/* Prefix and suffix together must fit in HFSPLUS_ATTR_MAX_STRLEN. */
+	if (XATTR_TRUSTED_PREFIX_LEN + name_len > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(full_name, XATTR_TRUSTED_PREFIX);
+	strcpy(&full_name[XATTR_TRUSTED_PREFIX_LEN], name);
+	return hfsplus_setxattr(dentry, full_name, buffer, size, flags);
+}
+
+static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ /*
+ * This method is not used: hfsplus_listxattr() is used instead of
+ * generic_listxattr(). NOTE(review): the errno is returned as size_t.
+ */
+ return -EOPNOTSUPP;
+}
+
+const struct xattr_handler hfsplus_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX, /* same prefix the get/set hooks prepend */
+ .list = hfsplus_trusted_listxattr, /* stub: always returns -EOPNOTSUPP */
+ .get = hfsplus_trusted_getxattr,
+ .set = hfsplus_trusted_setxattr,
+};
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
new file mode 100644
index 000000000000..e34016561ae0
--- /dev/null
+++ b/fs/hfsplus/xattr_user.c
@@ -0,0 +1,63 @@
+/*
+ * linux/fs/hfsplus/xattr_user.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for user extended attributes.
+ */
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+/* .get hook: forward to hfsplus_getxattr() as XATTR_USER_PREFIX + name. */
+static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	size_t name_len = strlen(name);
+	char full_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+
+	if (name_len == 0)
+		return -EINVAL;
+	/* Prefix and suffix together must fit in HFSPLUS_ATTR_MAX_STRLEN. */
+	if (XATTR_USER_PREFIX_LEN + name_len > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(full_name, XATTR_USER_PREFIX);
+	strcpy(&full_name[XATTR_USER_PREFIX_LEN], name);
+	return hfsplus_getxattr(dentry, full_name, buffer, size);
+}
+
+/* .set hook: forward to hfsplus_setxattr() as XATTR_USER_PREFIX + name. */
+static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	size_t name_len = strlen(name);
+	char full_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+
+	if (name_len == 0)
+		return -EINVAL;
+	/* Prefix and suffix together must fit in HFSPLUS_ATTR_MAX_STRLEN. */
+	if (XATTR_USER_PREFIX_LEN + name_len > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(full_name, XATTR_USER_PREFIX);
+	strcpy(&full_name[XATTR_USER_PREFIX_LEN], name);
+	return hfsplus_setxattr(dentry, full_name, buffer, size, flags);
+}
+
+static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ /*
+ * This method is not used: hfsplus_listxattr() is used instead of
+ * generic_listxattr(). NOTE(review): the errno is returned as size_t.
+ */
+ return -EOPNOTSUPP;
+}
+
+const struct xattr_handler hfsplus_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX, /* same prefix the get/set hooks prepend */
+ .list = hfsplus_user_listxattr, /* stub: always returns -EOPNOTSUPP */
+ .get = hfsplus_user_getxattr,
+ .set = hfsplus_user_setxattr,
+};
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 457addc5c91f..fbabb906066f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -30,7 +30,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
return list_entry(inode, struct hostfs_inode_info, vfs_inode);
}
-#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
+#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
static int hostfs_d_delete(const struct dentry *dentry)
{
@@ -861,14 +861,6 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
}
static const struct inode_operations hostfs_iops = {
- .create = hostfs_create,
- .link = hostfs_link,
- .unlink = hostfs_unlink,
- .symlink = hostfs_symlink,
- .mkdir = hostfs_mkdir,
- .rmdir = hostfs_rmdir,
- .mknod = hostfs_mknod,
- .rename = hostfs_rename,
.permission = hostfs_permission,
.setattr = hostfs_setattr,
};
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 78e12b2e0ea2..546f6d39713a 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -25,7 +25,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
loff_t new_off = off + (whence == 1 ? filp->f_pos : 0);
loff_t pos;
struct quad_buffer_head qbh;
- struct inode *i = filp->f_path.dentry->d_inode;
+ struct inode *i = file_inode(filp);
struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
struct super_block *s = i->i_sb;
@@ -57,7 +57,7 @@ fail:
static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
struct quad_buffer_head qbh;
struct hpfs_dirent *de;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 89d2a5803ae3..9f9dbeceeee7 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -50,7 +50,7 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
return disk_secno;
}
-static void hpfs_truncate(struct inode *i)
+void hpfs_truncate(struct inode *i)
{
if (IS_IMMUTABLE(i)) return /*-EPERM*/;
hpfs_lock_assert(i->i_sb);
@@ -105,6 +105,16 @@ static int hpfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page,hpfs_get_block);
}
+/*
+ * Clean up after a failed write that would have ended at @to: when @to lies
+ * beyond i_size, call truncate_pagecache() and then hpfs_truncate().
+ */
+static void hpfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *host = mapping->host;
+
+	if (to <= host->i_size)
+		return;
+
+	truncate_pagecache(host, to, host->i_size);
+	hpfs_truncate(host);
+}
+
static int hpfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -115,11 +125,8 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
hpfs_get_block,
&hpfs_i(mapping->host)->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ hpfs_write_failed(mapping, pos + len);
return ret;
}
@@ -145,7 +152,7 @@ static ssize_t hpfs_file_write(struct file *file, const char __user *buf,
retval = do_sync_write(file, buf, count, ppos);
if (retval > 0) {
hpfs_lock(file->f_path.dentry->d_sb);
- hpfs_i(file->f_path.dentry->d_inode)->i_dirty = 1;
+ hpfs_i(file_inode(file))->i_dirty = 1;
hpfs_unlock(file->f_path.dentry->d_sb);
}
return retval;
@@ -166,6 +173,5 @@ const struct file_operations hpfs_file_ops =
const struct inode_operations hpfs_file_iops =
{
- .truncate = hpfs_truncate,
.setattr = hpfs_setattr,
};
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 7102aaecc244..b7ae286646b5 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -252,6 +252,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
/* file.c */
int hpfs_file_fsync(struct file *, loff_t, loff_t, int);
+void hpfs_truncate(struct inode *);
extern const struct file_operations hpfs_file_ops;
extern const struct inode_operations hpfs_file_iops;
extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 804a9a842cbc..9edeeb0ea97e 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -147,7 +147,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
/*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) {
Some unknown structures like ACL may be in fnode,
we'd better not overwrite them
- hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino);
+ hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 structures", i->i_ino);
} else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) {
__le32 ea;
if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) {
@@ -277,9 +277,12 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
goto out_unlock;
+
+ truncate_setsize(inode, attr->ia_size);
+ hpfs_truncate(inode);
}
setattr_copy(inode, attr);
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 43b315f2002b..74f55703be49 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -180,7 +180,7 @@ static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count,
ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
ssize_t n;
- read = file->f_path.dentry->d_inode->i_fop->read;
+ read = file_inode(file)->i_fop->read;
if (!is_user)
set_fs(KERNEL_DS);
@@ -288,7 +288,7 @@ static ssize_t hppfs_write(struct file *file, const char __user *buf,
struct file *proc_file = data->proc_file;
ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
- write = proc_file->f_path.dentry->d_inode->i_fop->write;
+ write = file_inode(proc_file)->i_fop->write;
return (*write)(proc_file, buf, len, ppos);
}
@@ -513,7 +513,7 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
loff_t (*llseek)(struct file *, loff_t, int);
loff_t ret;
- llseek = proc_file->f_path.dentry->d_inode->i_fop->llseek;
+ llseek = file_inode(proc_file)->i_fop->llseek;
if (llseek != NULL) {
ret = (*llseek)(proc_file, off, where);
if (ret < 0)
@@ -561,7 +561,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
});
int err;
- readdir = proc_file->f_path.dentry->d_inode->i_fop->readdir;
+ readdir = file_inode(proc_file)->i_fop->readdir;
proc_file->f_pos = file->f_pos;
err = (*readdir)(proc_file, &dirent, hppfs_filldir);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 78bde32ea951..7f94e0cbc69c 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -97,7 +97,7 @@ static void huge_pagevec_release(struct pagevec *pvec)
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
loff_t len, vma_len;
int ret;
struct hstate *h = hstate_file(file);
@@ -918,16 +918,25 @@ static int get_hstate_idx(int page_size_log)
return h - hstates;
}
+/* d_dname hook: format the name as "/<d_name> (deleted)" via dynamic_dname(). */
+static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	return dynamic_dname(dentry, buffer, buflen,
+				"/%s (deleted)", dentry->d_name.name);
+}
+
+/*
+ * Dentry operations that hugetlb_file_setup() installs with d_set_d_op().
+ * The table is never modified, so it is declared const.
+ */
+static const struct dentry_operations anon_ops = {
+	.d_dname = hugetlb_dname,
+};
+
struct file *hugetlb_file_setup(const char *name, unsigned long addr,
size_t size, vm_flags_t acctflag,
struct user_struct **user,
int creat_flags, int page_size_log)
{
- int error = -ENOMEM;
- struct file *file;
+ struct file *file = ERR_PTR(-ENOMEM);
struct inode *inode;
struct path path;
- struct dentry *root;
+ struct super_block *sb;
struct qstr quick_string;
struct hstate *hstate;
unsigned long num_pages;
@@ -955,17 +964,18 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
}
}
- root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
+ sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
quick_string.name = name;
quick_string.len = strlen(quick_string.name);
quick_string.hash = 0;
- path.dentry = d_alloc(root, &quick_string);
+ path.dentry = d_alloc_pseudo(sb, &quick_string);
if (!path.dentry)
goto out_shm_unlock;
+ d_set_d_op(path.dentry, &anon_ops);
path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
- error = -ENOSPC;
- inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
+ file = ERR_PTR(-ENOSPC);
+ inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto out_dentry;
@@ -973,7 +983,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
size += addr & ~huge_page_mask(hstate);
num_pages = ALIGN(size, huge_page_size(hstate)) >>
huge_page_shift(hstate);
- error = -ENOMEM;
+ file = ERR_PTR(-ENOMEM);
if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
goto out_inode;
@@ -981,10 +991,9 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
inode->i_size = size;
clear_nlink(inode);
- error = -ENFILE;
file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
&hugetlbfs_file_operations);
- if (!file)
+ if (IS_ERR(file))
goto out_dentry; /* inode is already attached */
return file;
@@ -998,7 +1007,7 @@ out_shm_unlock:
user_shm_unlock(size, *user);
*user = NULL;
}
- return ERR_PTR(error);
+ return file;
}
static int __init init_hugetlbfs_fs(void)
diff --git a/fs/inode.c b/fs/inode.c
index 14084b72b259..f5f7c06c36fb 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -798,11 +798,10 @@ static struct inode *find_inode(struct super_block *sb,
int (*test)(struct inode *, void *),
void *data)
{
- struct hlist_node *node;
struct inode *inode = NULL;
repeat:
- hlist_for_each_entry(inode, node, head, i_hash) {
+ hlist_for_each_entry(inode, head, i_hash) {
spin_lock(&inode->i_lock);
if (inode->i_sb != sb) {
spin_unlock(&inode->i_lock);
@@ -830,11 +829,10 @@ repeat:
static struct inode *find_inode_fast(struct super_block *sb,
struct hlist_head *head, unsigned long ino)
{
- struct hlist_node *node;
struct inode *inode = NULL;
repeat:
- hlist_for_each_entry(inode, node, head, i_hash) {
+ hlist_for_each_entry(inode, head, i_hash) {
spin_lock(&inode->i_lock);
if (inode->i_ino != ino) {
spin_unlock(&inode->i_lock);
@@ -1132,11 +1130,10 @@ EXPORT_SYMBOL(iget_locked);
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
struct hlist_head *b = inode_hashtable + hash(sb, ino);
- struct hlist_node *node;
struct inode *inode;
spin_lock(&inode_hash_lock);
- hlist_for_each_entry(inode, node, b, i_hash) {
+ hlist_for_each_entry(inode, b, i_hash) {
if (inode->i_ino == ino && inode->i_sb == sb) {
spin_unlock(&inode_hash_lock);
return 0;
@@ -1291,10 +1288,9 @@ int insert_inode_locked(struct inode *inode)
struct hlist_head *head = inode_hashtable + hash(sb, ino);
while (1) {
- struct hlist_node *node;
struct inode *old = NULL;
spin_lock(&inode_hash_lock);
- hlist_for_each_entry(old, node, head, i_hash) {
+ hlist_for_each_entry(old, head, i_hash) {
if (old->i_ino != ino)
continue;
if (old->i_sb != sb)
@@ -1306,7 +1302,7 @@ int insert_inode_locked(struct inode *inode)
}
break;
}
- if (likely(!node)) {
+ if (likely(!old)) {
spin_lock(&inode->i_lock);
inode->i_state |= I_NEW;
hlist_add_head(&inode->i_hash, head);
@@ -1334,11 +1330,10 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
while (1) {
- struct hlist_node *node;
struct inode *old = NULL;
spin_lock(&inode_hash_lock);
- hlist_for_each_entry(old, node, head, i_hash) {
+ hlist_for_each_entry(old, head, i_hash) {
if (old->i_sb != sb)
continue;
if (!test(old, data))
@@ -1350,7 +1345,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
}
break;
}
- if (likely(!node)) {
+ if (likely(!old)) {
spin_lock(&inode->i_lock);
inode->i_state |= I_NEW;
hlist_add_head(&inode->i_hash, head);
@@ -1655,7 +1650,7 @@ EXPORT_SYMBOL(file_remove_suid);
int file_update_time(struct file *file)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct timespec now;
int sync_it = 0;
int ret;
diff --git a/fs/internal.h b/fs/internal.h
index 2f6af7f645eb..507141fceb99 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -69,7 +69,7 @@ extern void __mnt_drop_write_file(struct file *);
/*
* fs_struct.c
*/
-extern void chroot_fs_refs(struct path *, struct path *);
+extern void chroot_fs_refs(const struct path *, const struct path *);
/*
* file_table.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 3bdad6d1f268..fd507fb460f8 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -175,7 +175,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
struct fiemap fiemap;
struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
struct fiemap_extent_info fieinfo = { 0, };
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
u64 len;
int error;
@@ -424,7 +424,7 @@ EXPORT_SYMBOL(generic_block_fiemap);
*/
int ioctl_preallocate(struct file *filp, void __user *argp)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct space_resv sr;
if (copy_from_user(&sr, argp, sizeof(sr)))
@@ -449,7 +449,7 @@ int ioctl_preallocate(struct file *filp, void __user *argp)
static int file_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int __user *p = (int __user *)arg;
switch (cmd) {
@@ -512,7 +512,7 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
static int ioctl_fsfreeze(struct file *filp)
{
- struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+ struct super_block *sb = file_inode(filp)->i_sb;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -527,7 +527,7 @@ static int ioctl_fsfreeze(struct file *filp)
static int ioctl_fsthaw(struct file *filp)
{
- struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+ struct super_block *sb = file_inode(filp)->i_sb;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -548,7 +548,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
{
int error = 0;
int __user *argp = (int __user *)arg;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
switch (cmd) {
case FIOCLEX:
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 0b3fa7974fa8..592e5115a561 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -296,7 +296,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
*/
static int zisofs_readpage(struct file *file, struct page *page)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
int err;
int i, pcount, full_page;
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index f20437c068a0..a7d5c3c3d4e6 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -253,7 +253,7 @@ static int isofs_readdir(struct file *filp,
int result;
char *tmpname;
struct iso_directory_record *tmpde;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
tmpname = (char *)__get_free_page(GFP_KERNEL);
if (tmpname == NULL)
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 2b4f2358eadb..12088d8de3fa 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -125,10 +125,10 @@ isofs_export_encode_fh(struct inode *inode,
*/
if (parent && (len < 5)) {
*max_len = 5;
- return 255;
+ return FILEID_INVALID;
} else if (len < 3) {
*max_len = 3;
- return 255;
+ return FILEID_INVALID;
}
len = 3;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index a2862339323b..81cc7eaff863 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -446,7 +446,8 @@ int __log_start_commit(journal_t *journal, tid_t target)
* currently running transaction (if it exists). Otherwise,
* the target tid must be an old one.
*/
- if (journal->j_running_transaction &&
+ if (journal->j_commit_request != target &&
+ journal->j_running_transaction &&
journal->j_running_transaction->t_tid == target) {
/*
* We want a new commit: OK, mark the request and wakeup the
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3091d42992f0..750c70148eff 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -435,7 +435,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
trace_jbd2_commit_locking(journal, commit_transaction);
stats.run.rs_wait = commit_transaction->t_max_wait;
+ stats.run.rs_request_delay = 0;
stats.run.rs_locked = jiffies;
+ if (commit_transaction->t_requested)
+ stats.run.rs_request_delay =
+ jbd2_time_diff(commit_transaction->t_requested,
+ stats.run.rs_locked);
stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
stats.run.rs_locked);
@@ -1116,7 +1121,10 @@ restart_loop:
*/
spin_lock(&journal->j_history_lock);
journal->j_stats.ts_tid++;
+ if (commit_transaction->t_requested)
+ journal->j_stats.ts_requested++;
journal->j_stats.run.rs_wait += stats.run.rs_wait;
+ journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
journal->j_stats.run.rs_running += stats.run.rs_running;
journal->j_stats.run.rs_locked += stats.run.rs_locked;
journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index dbf41f9452db..ed10991ab006 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -35,7 +35,6 @@
#include <linux/kthread.h>
#include <linux/poison.h>
#include <linux/proc_fs.h>
-#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/hash.h>
@@ -51,6 +50,14 @@
#include <asm/uaccess.h>
#include <asm/page.h>
+#ifdef CONFIG_JBD2_DEBUG
+ushort jbd2_journal_enable_debug __read_mostly;
+EXPORT_SYMBOL(jbd2_journal_enable_debug);
+
+module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
+MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
+#endif /* CONFIG_JBD2_DEBUG */
+
EXPORT_SYMBOL(jbd2_journal_extend);
EXPORT_SYMBOL(jbd2_journal_stop);
EXPORT_SYMBOL(jbd2_journal_lock_updates);
@@ -513,6 +520,10 @@ int __jbd2_log_space_left(journal_t *journal)
*/
int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
+ /* Return if the txn has already requested to be committed */
+ if (journal->j_commit_request == target)
+ return 0;
+
/*
* The only transaction we can possibly wait upon is the
* currently running transaction (if it exists). Otherwise,
@@ -529,6 +540,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
jbd_debug(1, "JBD2: requesting commit %d/%d\n",
journal->j_commit_request,
journal->j_commit_sequence);
+ journal->j_running_transaction->t_requested = jiffies;
wake_up(&journal->j_wait_commit);
return 1;
} else if (!tid_geq(journal->j_commit_request, target))
@@ -894,13 +906,18 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
if (v != SEQ_START_TOKEN)
return 0;
- seq_printf(seq, "%lu transaction, each up to %u blocks\n",
- s->stats->ts_tid,
- s->journal->j_max_transaction_buffers);
+ seq_printf(seq, "%lu transactions (%lu requested), "
+ "each up to %u blocks\n",
+ s->stats->ts_tid, s->stats->ts_requested,
+ s->journal->j_max_transaction_buffers);
if (s->stats->ts_tid == 0)
return 0;
seq_printf(seq, "average: \n %ums waiting for transaction\n",
jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
+ seq_printf(seq, " %ums request delay\n",
+ (s->stats->ts_requested == 0) ? 0 :
+ jiffies_to_msecs(s->stats->run.rs_request_delay /
+ s->stats->ts_requested));
seq_printf(seq, " %ums running transaction\n",
jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
seq_printf(seq, " %ums transaction was being locked\n",
@@ -2485,45 +2502,6 @@ restart:
spin_unlock(&journal->j_list_lock);
}
-/*
- * debugfs tunables
- */
-#ifdef CONFIG_JBD2_DEBUG
-u8 jbd2_journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(jbd2_journal_enable_debug);
-
-#define JBD2_DEBUG_NAME "jbd2-debug"
-
-static struct dentry *jbd2_debugfs_dir;
-static struct dentry *jbd2_debug;
-
-static void __init jbd2_create_debugfs_entry(void)
-{
- jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL);
- if (jbd2_debugfs_dir)
- jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME,
- S_IRUGO | S_IWUSR,
- jbd2_debugfs_dir,
- &jbd2_journal_enable_debug);
-}
-
-static void __exit jbd2_remove_debugfs_entry(void)
-{
- debugfs_remove(jbd2_debug);
- debugfs_remove(jbd2_debugfs_dir);
-}
-
-#else
-
-static void __init jbd2_create_debugfs_entry(void)
-{
-}
-
-static void __exit jbd2_remove_debugfs_entry(void)
-{
-}
-
-#endif
#ifdef CONFIG_PROC_FS
@@ -2609,7 +2587,6 @@ static int __init journal_init(void)
ret = journal_init_caches();
if (ret == 0) {
- jbd2_create_debugfs_entry();
jbd2_create_jbd_stats_proc_entry();
} else {
jbd2_journal_destroy_caches();
@@ -2624,7 +2601,6 @@ static void __exit journal_exit(void)
if (n)
printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);
#endif
- jbd2_remove_debugfs_entry();
jbd2_remove_jbd_stats_proc_entry();
jbd2_journal_destroy_caches();
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 42f6615af0ac..d6ee5aed56b1 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -30,6 +30,8 @@
#include <linux/bug.h>
#include <linux/module.h>
+#include <trace/events/jbd2.h>
+
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
@@ -100,6 +102,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
journal->j_running_transaction = transaction;
transaction->t_max_wait = 0;
transaction->t_start = jiffies;
+ transaction->t_requested = 0;
return transaction;
}
@@ -209,7 +212,8 @@ repeat:
if (!new_transaction)
goto alloc_transaction;
write_lock(&journal->j_state_lock);
- if (!journal->j_running_transaction) {
+ if (!journal->j_running_transaction &&
+ !journal->j_barrier_count) {
jbd2_get_transaction(journal, new_transaction);
new_transaction = NULL;
}
@@ -305,6 +309,8 @@ repeat:
*/
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
+ handle->h_requested_credits = nblocks;
+ handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
@@ -351,7 +357,8 @@ static handle_t *new_handle(int nblocks)
* Return a pointer to a newly allocated handle, or an ERR_PTR() value
* on failure.
*/
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
+ unsigned int type, unsigned int line_no)
{
handle_t *handle = journal_current_handle();
int err;
@@ -375,8 +382,13 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
if (err < 0) {
jbd2_free_handle(handle);
current->journal_info = NULL;
- handle = ERR_PTR(err);
+ return ERR_PTR(err);
}
+ handle->h_type = type;
+ handle->h_line_no = line_no;
+ trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
+ handle->h_transaction->t_tid, type,
+ line_no, nblocks);
return handle;
}
EXPORT_SYMBOL(jbd2__journal_start);
@@ -384,7 +396,7 @@ EXPORT_SYMBOL(jbd2__journal_start);
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
- return jbd2__journal_start(journal, nblocks, GFP_NOFS);
+ return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);
@@ -446,7 +458,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
goto unlock;
}
+ trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
+ handle->h_transaction->t_tid,
+ handle->h_type, handle->h_line_no,
+ handle->h_buffer_credits,
+ nblocks);
+
handle->h_buffer_credits += nblocks;
+ handle->h_requested_credits += nblocks;
atomic_add(nblocks, &transaction->t_outstanding_credits);
result = 0;
@@ -1375,6 +1394,13 @@ int jbd2_journal_stop(handle_t *handle)
}
jbd_debug(4, "Handle %p going down\n", handle);
+ trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
+ handle->h_transaction->t_tid,
+ handle->h_type, handle->h_line_no,
+ jiffies - handle->h_start_jiffies,
+ handle->h_sync, handle->h_requested_credits,
+ (handle->h_requested_credits -
+ handle->h_buffer_credits));
/*
* Implement synchronous transaction batching. If the handle
@@ -1839,7 +1865,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
BUFFER_TRACE(bh, "entry");
-retry:
/*
* It is safe to proceed here without the j_list_lock because the
* buffers cannot be stolen by try_to_free_buffers as long as we are
@@ -1934,14 +1959,11 @@ retry:
* for commit and try again.
*/
if (partial_page) {
- tid_t tid = journal->j_committing_transaction->t_tid;
-
jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
write_unlock(&journal->j_state_lock);
- jbd2_log_wait_commit(journal, tid);
- goto retry;
+ return -EBUSY;
}
/*
* OK, buffer won't be reachable after truncate. We just set
@@ -2002,21 +2024,23 @@ zap_buffer_unlocked:
* @page: page to flush
* @offset: length of page to invalidate.
*
- * Reap page buffers containing data after offset in page.
- *
+ * Reap page buffers containing data after offset in page. Can return -EBUSY
+ * if buffers are part of the committing transaction and the page is straddling
+ * i_size. Caller then has to wait for current commit and try again.
*/
-void jbd2_journal_invalidatepage(journal_t *journal,
- struct page *page,
- unsigned long offset)
+int jbd2_journal_invalidatepage(journal_t *journal,
+ struct page *page,
+ unsigned long offset)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
int may_free = 1;
+ int ret = 0;
if (!PageLocked(page))
BUG();
if (!page_has_buffers(page))
- return;
+ return 0;
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
@@ -2030,9 +2054,11 @@ void jbd2_journal_invalidatepage(journal_t *journal,
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
- may_free &= journal_unmap_buffer(journal, bh,
- offset > 0);
+ ret = journal_unmap_buffer(journal, bh, offset > 0);
unlock_buffer(bh);
+ if (ret < 0)
+ return ret;
+ may_free &= ret;
}
curr_off = next_off;
bh = next;
@@ -2043,6 +2069,7 @@ void jbd2_journal_invalidatepage(journal_t *journal,
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
+ return 0;
}
/*
diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig
index 6ae169cd8faa..d8bb6c411e96 100644
--- a/fs/jffs2/Kconfig
+++ b/fs/jffs2/Kconfig
@@ -50,8 +50,8 @@ config JFFS2_FS_WBUF_VERIFY
write-buffer, and check for errors.
config JFFS2_SUMMARY
- bool "JFFS2 summary support (EXPERIMENTAL)"
- depends on JFFS2_FS && EXPERIMENTAL
+ bool "JFFS2 summary support"
+ depends on JFFS2_FS
default n
help
This feature makes it possible to use summary information
@@ -63,8 +63,8 @@ config JFFS2_SUMMARY
If unsure, say 'N'.
config JFFS2_FS_XATTR
- bool "JFFS2 XATTR support (EXPERIMENTAL)"
- depends on JFFS2_FS && EXPERIMENTAL
+ bool "JFFS2 XATTR support"
+ depends on JFFS2_FS
default n
help
Extended attributes are name:value pairs associated with inodes by
@@ -173,7 +173,7 @@ config JFFS2_CMODE_PRIORITY
successful one.
config JFFS2_CMODE_SIZE
- bool "size (EXPERIMENTAL)"
+ bool "size"
help
Tries all compressors and chooses the one which has the smallest
result.
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ad7774d32095..acd46a4160cb 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -117,12 +117,12 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
struct jffs2_inode_info *f;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct jffs2_full_dirent *fd;
unsigned long offset, curofs;
jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n",
- filp->f_path.dentry->d_inode->i_ino);
+ file_inode(filp)->i_ino);
f = JFFS2_INODE_INFO(inode);
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 9d3afd157f99..dd7442c58358 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -119,9 +119,12 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
iattr->ia_size != i_size_read(inode)) {
inode_dio_wait(inode);
- rc = vmtruncate(inode, iattr->ia_size);
+ rc = inode_newsize_ok(inode, iattr->ia_size);
if (rc)
return rc;
+
+ truncate_setsize(inode, iattr->ia_size);
+ jfs_truncate(inode);
}
setattr_copy(inode, iattr);
@@ -133,7 +136,6 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
}
const struct inode_operations jfs_file_inode_operations = {
- .truncate = jfs_truncate,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
.listxattr = jfs_listxattr,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 4692bf3ca8cb..b7dc47ba675e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -300,6 +300,16 @@ static int jfs_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
}
+static void jfs_write_failed(struct address_space *mapping, loff_t to)
+{ /* Cleanup helper for a failed write that ended at file offset 'to'; the write paths in this patch call it on error. */
+ struct inode *inode = mapping->host; /* inode that owns this address space */
+
+ if (to > inode->i_size) { /* nothing to do unless 'to' lies beyond the inode's current i_size */
+ truncate_pagecache(inode, to, inode->i_size); /* NOTE(review): presumably drops page cache between i_size and 'to' - confirm against truncate_pagecache() */
+ jfs_truncate(inode); /* NOTE(review): presumably releases anything instantiated past i_size - confirm against jfs_truncate() */
+ }
+}
+
static int jfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -308,11 +318,8 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
jfs_get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ jfs_write_failed(mapping, pos + len);
return ret;
}
@@ -326,6 +333,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
@@ -341,7 +349,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
- vmtruncate(inode, isize);
+ jfs_write_failed(mapping, end);
}
return ret;
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index bc555ff417e9..93a1232894f6 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -58,7 +58,7 @@ static long jfs_map_ext2(unsigned long flags, int from)
long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct jfs_inode_info *jfs_inode = JFS_IP(inode);
unsigned int flags;
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 9197a1b0d02d..0ddbeceafc62 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -3004,7 +3004,7 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
*/
int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *ip = filp->f_path.dentry->d_inode;
+ struct inode *ip = file_inode(filp);
struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
int rc = 0;
loff_t dtpos; /* legacy OS/2 style position */
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 1a543be09c79..060ba638becb 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -154,7 +154,7 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
/*
* If we really return the number of allocated & free inodes, some
* applications will fail because they won't see enough free inodes.
- * We'll try to calculate some guess as to how may inodes we can
+ * We'll try to calculate some guess as to how many inodes we can
* really allocate
*
* buf->f_files = atomic_read(&imap->im_numinos);
diff --git a/fs/libfs.c b/fs/libfs.c
index 35fc6e74cd88..916da8c4158b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -369,8 +369,6 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)
struct inode *inode = dentry->d_inode;
int error;
- WARN_ON_ONCE(inode->i_op->truncate);
-
error = inode_change_ok(inode, iattr);
if (error)
return error;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index ca0a08001449..0796c45d0d4d 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -11,7 +11,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/nfs_fs.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
#include <linux/kthread.h>
@@ -178,7 +178,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
continue;
if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
continue;
- if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
+ if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)) ,fh) != 0)
continue;
/* Alright, we found a lock. Set the return status
* and wake up the caller
@@ -220,10 +220,19 @@ reclaimer(void *ptr)
{
struct nlm_host *host = (struct nlm_host *) ptr;
struct nlm_wait *block;
+ struct nlm_rqst *req;
struct file_lock *fl, *next;
u32 nsmstate;
struct net *net = host->net;
+ req = kmalloc(sizeof(*req), GFP_KERNEL);
+ if (!req) {
+ printk(KERN_ERR "lockd: reclaimer unable to alloc memory."
+ " Locks for %s won't be reclaimed!\n",
+ host->h_name);
+ return 0;
+ }
+
allow_signal(SIGKILL);
down_write(&host->h_rwsem);
@@ -253,7 +262,7 @@ restart:
*/
if (signalled())
continue;
- if (nlmclnt_reclaim(host, fl) != 0)
+ if (nlmclnt_reclaim(host, fl, req) != 0)
continue;
list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
if (host->h_nsmstate != nsmstate) {
@@ -279,5 +288,6 @@ restart:
/* Release host handle after use */
nlmclnt_release_host(host);
lockd_down(net);
+ kfree(req);
return 0;
}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 54f9e6ce0430..7e529c3c45c0 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -127,7 +127,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
struct nlm_lock *lock = &argp->lock;
nlmclnt_next_cookie(&argp->cookie);
- memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh));
+ memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
lock->caller = utsname()->nodename;
lock->oh.data = req->a_owner;
lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
@@ -550,6 +550,9 @@ again:
status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT);
if (status < 0)
break;
+ /* Resend the blocking lock request after a server reboot */
+ if (resp->status == nlm_lck_denied_grace_period)
+ continue;
if (resp->status != nlm_lck_blocked)
break;
}
@@ -615,17 +618,15 @@ out_unlock:
* RECLAIM: Try to reclaim a lock
*/
int
-nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
+nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl,
+ struct nlm_rqst *req)
{
- struct nlm_rqst reqst, *req;
int status;
- req = &reqst;
memset(req, 0, sizeof(*req));
locks_init_lock(&req->a_args.lock.fl);
locks_init_lock(&req->a_res.lock.fl);
req->a_host = host;
- req->a_flags = 0;
/* Set up the argument struct */
nlmclnt_setlockargs(req, fl);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 0e17090c310f..969d589c848d 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -13,6 +13,7 @@
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
#include <linux/mutex.h>
@@ -32,15 +33,15 @@
static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH];
static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH];
-#define for_each_host(host, pos, chain, table) \
+#define for_each_host(host, chain, table) \
for ((chain) = (table); \
(chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
- hlist_for_each_entry((host), (pos), (chain), h_hash)
+ hlist_for_each_entry((host), (chain), h_hash)
-#define for_each_host_safe(host, pos, next, chain, table) \
+#define for_each_host_safe(host, next, chain, table) \
for ((chain) = (table); \
(chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
- hlist_for_each_entry_safe((host), (pos), (next), \
+ hlist_for_each_entry_safe((host), (next), \
(chain), h_hash)
static unsigned long nrhosts;
@@ -225,7 +226,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
.net = net,
};
struct hlist_head *chain;
- struct hlist_node *pos;
struct nlm_host *host;
struct nsm_handle *nsm = NULL;
struct lockd_net *ln = net_generic(net, lockd_net_id);
@@ -237,7 +237,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
mutex_lock(&nlm_host_mutex);
chain = &nlm_client_hosts[nlm_hash_address(sap)];
- hlist_for_each_entry(host, pos, chain, h_hash) {
+ hlist_for_each_entry(host, chain, h_hash) {
if (host->net != net)
continue;
if (!rpc_cmp_addr(nlm_addr(host), sap))
@@ -322,7 +322,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
const size_t hostname_len)
{
struct hlist_head *chain;
- struct hlist_node *pos;
struct nlm_host *host = NULL;
struct nsm_handle *nsm = NULL;
struct sockaddr *src_sap = svc_daddr(rqstp);
@@ -350,7 +349,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
nlm_gc_hosts(net);
chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
- hlist_for_each_entry(host, pos, chain, h_hash) {
+ hlist_for_each_entry(host, chain, h_hash) {
if (host->net != net)
continue;
if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
@@ -515,10 +514,9 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
{
struct nlm_host *host;
struct hlist_head *chain;
- struct hlist_node *pos;
mutex_lock(&nlm_host_mutex);
- for_each_host(host, pos, chain, cache) {
+ for_each_host(host, chain, cache) {
if (host->h_nsmhandle == nsm
&& host->h_nsmstate != info->state) {
host->h_nsmstate = info->state;
@@ -570,7 +568,6 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
static void nlm_complain_hosts(struct net *net)
{
struct hlist_head *chain;
- struct hlist_node *pos;
struct nlm_host *host;
if (net) {
@@ -587,7 +584,7 @@ static void nlm_complain_hosts(struct net *net)
dprintk("lockd: %lu hosts left:\n", nrhosts);
}
- for_each_host(host, pos, chain, nlm_server_hosts) {
+ for_each_host(host, chain, nlm_server_hosts) {
if (net && host->net != net)
continue;
dprintk(" %s (cnt %d use %d exp %ld net %p)\n",
@@ -600,14 +597,13 @@ void
nlm_shutdown_hosts_net(struct net *net)
{
struct hlist_head *chain;
- struct hlist_node *pos;
struct nlm_host *host;
mutex_lock(&nlm_host_mutex);
/* First, make all hosts eligible for gc */
dprintk("lockd: nuking all hosts in net %p...\n", net);
- for_each_host(host, pos, chain, nlm_server_hosts) {
+ for_each_host(host, chain, nlm_server_hosts) {
if (net && host->net != net)
continue;
host->h_expires = jiffies - 1;
@@ -644,11 +640,11 @@ static void
nlm_gc_hosts(struct net *net)
{
struct hlist_head *chain;
- struct hlist_node *pos, *next;
+ struct hlist_node *next;
struct nlm_host *host;
dprintk("lockd: host garbage collection for net %p\n", net);
- for_each_host(host, pos, chain, nlm_server_hosts) {
+ for_each_host(host, chain, nlm_server_hosts) {
if (net && host->net != net)
continue;
host->h_inuse = 0;
@@ -657,7 +653,7 @@ nlm_gc_hosts(struct net *net)
/* Mark all hosts that hold locks, blocks or shares */
nlmsvc_mark_resources(net);
- for_each_host_safe(host, pos, next, chain, nlm_server_hosts) {
+ for_each_host_safe(host, next, chain, nlm_server_hosts) {
if (net && host->net != net)
continue;
if (atomic_read(&host->h_count) || host->h_inuse
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3c2cfc683631..1812f026960c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/xprtsock.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 8d80c990dffd..e703318c41df 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -406,8 +406,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
__be32 ret;
dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
- file->f_file->f_path.dentry->d_inode->i_sb->s_id,
- file->f_file->f_path.dentry->d_inode->i_ino,
+ file_inode(file->f_file)->i_sb->s_id,
+ file_inode(file->f_file)->i_ino,
lock->fl.fl_type, lock->fl.fl_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end,
@@ -513,8 +513,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
__be32 ret;
dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
- file->f_file->f_path.dentry->d_inode->i_sb->s_id,
- file->f_file->f_path.dentry->d_inode->i_ino,
+ file_inode(file->f_file)->i_sb->s_id,
+ file_inode(file->f_file)->i_ino,
lock->fl.fl_type,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
@@ -606,8 +606,8 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
int error;
dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n",
- file->f_file->f_path.dentry->d_inode->i_sb->s_id,
- file->f_file->f_path.dentry->d_inode->i_ino,
+ file_inode(file->f_file)->i_sb->s_id,
+ file_inode(file->f_file)->i_ino,
lock->fl.fl_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
@@ -635,8 +635,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
int status = 0;
dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n",
- file->f_file->f_path.dentry->d_inode->i_sb->s_id,
- file->f_file->f_path.dentry->d_inode->i_ino,
+ file_inode(file->f_file)->i_sb->s_id,
+ file_inode(file->f_file)->i_ino,
lock->fl.fl_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 0deb5f6c9dd4..97e87415b145 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -13,7 +13,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/sunrpc/svc.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/nfsd/nfsfh.h>
#include <linux/nfsd/export.h>
#include <linux/lockd/lockd.h>
@@ -45,7 +45,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f)
static inline void nlm_debug_print_file(char *msg, struct nlm_file *file)
{
- struct inode *inode = file->f_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file->f_file);
dprintk("lockd: %s %s/%ld\n",
msg, inode->i_sb->s_id, inode->i_ino);
@@ -84,7 +84,6 @@ __be32
nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
struct nfs_fh *f)
{
- struct hlist_node *pos;
struct nlm_file *file;
unsigned int hash;
__be32 nfserr;
@@ -96,7 +95,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
/* Lock file table */
mutex_lock(&nlm_file_mutex);
- hlist_for_each_entry(file, pos, &nlm_files[hash], f_list)
+ hlist_for_each_entry(file, &nlm_files[hash], f_list)
if (!nfs_compare_fh(&file->f_handle, f))
goto found;
@@ -248,13 +247,13 @@ static int
nlm_traverse_files(void *data, nlm_host_match_fn_t match,
int (*is_failover_file)(void *data, struct nlm_file *file))
{
- struct hlist_node *pos, *next;
+ struct hlist_node *next;
struct nlm_file *file;
int i, ret = 0;
mutex_lock(&nlm_file_mutex);
for (i = 0; i < FILE_NRHASH; i++) {
- hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) {
+ hlist_for_each_entry_safe(file, next, &nlm_files[i], f_list) {
if (is_failover_file && !is_failover_file(data, file))
continue;
file->f_count++;
diff --git a/fs/locks.c b/fs/locks.c
index a94e331a52a2..cb424a4fed71 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -334,7 +334,7 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
start = filp->f_pos;
break;
case SEEK_END:
- start = i_size_read(filp->f_path.dentry->d_inode);
+ start = i_size_read(file_inode(filp));
break;
default:
return -EINVAL;
@@ -384,7 +384,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
start = filp->f_pos;
break;
case SEEK_END:
- start = i_size_read(filp->f_path.dentry->d_inode);
+ start = i_size_read(file_inode(filp));
break;
default:
return -EINVAL;
@@ -627,7 +627,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
struct file_lock *cfl;
lock_flocks();
- for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
+ for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {
if (!IS_POSIX(cfl))
continue;
if (posix_locks_conflict(fl, cfl))
@@ -708,7 +708,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
{
struct file_lock *new_fl = NULL;
struct file_lock **before;
- struct inode * inode = filp->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(filp);
int error = 0;
int found = 0;
@@ -1002,7 +1002,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
int posix_lock_file(struct file *filp, struct file_lock *fl,
struct file_lock *conflock)
{
- return __posix_lock_file(filp->f_path.dentry->d_inode, fl, conflock);
+ return __posix_lock_file(file_inode(filp), fl, conflock); /* delegate to the inode-based helper; the inode is now obtained via file_inode(filp) instead of the open-coded dentry walk */
}
EXPORT_SYMBOL(posix_lock_file);
@@ -1326,8 +1326,8 @@ int fcntl_getlease(struct file *filp)
int type = F_UNLCK;
lock_flocks();
- time_out_leases(filp->f_path.dentry->d_inode);
- for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
+ time_out_leases(file_inode(filp));
+ for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
fl = fl->fl_next) {
if (fl->fl_file == filp) {
type = target_leasetype(fl);
@@ -1843,7 +1843,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (copy_from_user(&flock, l, sizeof(flock)))
goto out;
- inode = filp->f_path.dentry->d_inode;
+ inode = file_inode(filp);
/* Don't allow mandatory locks on files that may be memory mapped
* and shared.
@@ -1961,7 +1961,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
if (copy_from_user(&flock, l, sizeof(flock)))
goto out;
- inode = filp->f_path.dentry->d_inode;
+ inode = file_inode(filp);
/* Don't allow mandatory locks on files that may be memory mapped
* and shared.
@@ -2030,7 +2030,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
* posix_lock_file(). Another process could be setting a lock on this
* file at the same time, but we wouldn't remove that lock anyway.
*/
- if (!filp->f_path.dentry->d_inode->i_flock)
+ if (!file_inode(filp)->i_flock)
return;
lock.fl_type = F_UNLCK;
@@ -2056,7 +2056,7 @@ EXPORT_SYMBOL(locks_remove_posix);
*/
void locks_remove_flock(struct file *filp)
{
- struct inode * inode = filp->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(filp);
struct file_lock *fl;
struct file_lock **before;
@@ -2152,7 +2152,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
fl_pid = fl->fl_pid;
if (fl->fl_file != NULL)
- inode = fl->fl_file->f_path.dentry->d_inode;
+ inode = file_inode(fl->fl_file);
seq_printf(f, "%lld:%s ", id, pfx);
if (IS_POSIX(fl)) {
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
index daf9a9b32dd3..09ed066c0221 100644
--- a/fs/logfs/Kconfig
+++ b/fs/logfs/Kconfig
@@ -1,6 +1,6 @@
config LOGFS
- tristate "LogFS file system (EXPERIMENTAL)"
- depends on (MTD || BLOCK) && EXPERIMENTAL
+ tristate "LogFS file system"
+ depends on (MTD || BLOCK)
select ZLIB_INFLATE
select ZLIB_DEFLATE
select CRC32
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 26e4a941532f..b82751082112 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -284,7 +284,7 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
#define IMPLICIT_NODES 2
static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
{
- struct inode *dir = file->f_dentry->d_inode;
+ struct inode *dir = file_inode(file);
loff_t pos = file->f_pos - IMPLICIT_NODES;
struct page *page;
struct logfs_disk_dentry *dd;
@@ -320,7 +320,7 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
ino_t pino = parent_ino(file->f_dentry);
int err;
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 3886cded283c..c2219a6dd3c8 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -183,7 +183,7 @@ static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct logfs_inode *li = logfs_inode(inode);
unsigned int oldflags, flags;
int err;
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index e1a3b6bf6324..9a59cbade2fb 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1887,9 +1887,15 @@ int logfs_truncate(struct inode *inode, u64 target)
logfs_put_wblocks(sb, NULL, 1);
}
- if (!err)
- err = vmtruncate(inode, target);
+ if (!err) {
+ err = inode_newsize_ok(inode, target);
+ if (err)
+ goto out;
+
+ truncate_setsize(inode, target);
+ }
+ out:
/* I don't trust error recovery yet. */
WARN_ON(err);
return err;
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 685b2d981b87..a9ed6f36e6ea 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -85,7 +85,7 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
unsigned long pos = filp->f_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
unsigned offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 4493ce695ab8..adc6f5494231 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -34,9 +34,12 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+
+ truncate_setsize(inode, attr->ia_size);
+ minix_truncate(inode);
}
setattr_copy(inode, attr);
@@ -45,7 +48,6 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
}
const struct inode_operations minix_file_inode_operations = {
- .truncate = minix_truncate,
.setattr = minix_setattr,
.getattr = minix_getattr,
};
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 4fc5f8ab1c44..99541cceb584 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -395,6 +395,16 @@ int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, minix_get_block);
}
+static void minix_write_failed(struct address_space *mapping, loff_t to)
+{ /* Cleanup helper for a failed write that ended at file offset 'to'; minix_write_begin() in this patch calls it on error. */
+ struct inode *inode = mapping->host; /* inode that owns this address space */
+
+ if (to > inode->i_size) { /* nothing to do unless 'to' lies beyond the inode's current i_size */
+ truncate_pagecache(inode, to, inode->i_size); /* NOTE(review): presumably drops page cache between i_size and 'to' - confirm against truncate_pagecache() */
+ minix_truncate(inode); /* NOTE(review): presumably releases anything instantiated past i_size - confirm against minix_truncate() */
+ }
+}
+
static int minix_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -403,11 +413,8 @@ static int minix_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep,
minix_get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ minix_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/namei.c b/fs/namei.c
index 5f4cdf3ad913..961bc1268366 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -451,7 +451,7 @@ int inode_permission(struct inode *inode, int mask)
*
* Given a path increment the reference count to the dentry and the vfsmount.
*/
-void path_get(struct path *path)
+void path_get(const struct path *path)
{
mntget(path->mnt);
dget(path->dentry);
@@ -464,7 +464,7 @@ EXPORT_SYMBOL(path_get);
*
* Given a path decrement the reference count to the dentry and the vfsmount.
*/
-void path_put(struct path *path)
+void path_put(const struct path *path)
{
dput(path->dentry);
mntput(path->mnt);
@@ -600,14 +600,10 @@ static int complete_walk(struct nameidata *nd)
if (likely(!(nd->flags & LOOKUP_JUMPED)))
return 0;
- if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
+ if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
return 0;
- if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
- return 0;
-
- /* Note: we do not d_invalidate() */
- status = d_revalidate(dentry, nd->flags);
+ status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
if (status > 0)
return 0;
@@ -1275,9 +1271,7 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
*need_lookup = false;
dentry = d_lookup(dir, name);
if (dentry) {
- if (d_need_lookup(dentry)) {
- *need_lookup = true;
- } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
+ if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
error = d_revalidate(dentry, flags);
if (unlikely(error <= 0)) {
if (error < 0) {
@@ -1344,7 +1338,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
* small and for now I'd prefer to have fast path as straight as possible.
* It _is_ time-critical.
*/
-static int lookup_fast(struct nameidata *nd, struct qstr *name,
+static int lookup_fast(struct nameidata *nd,
struct path *path, struct inode **inode)
{
struct vfsmount *mnt = nd->path.mnt;
@@ -1360,7 +1354,7 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name,
*/
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
- dentry = __d_lookup_rcu(parent, name, &seq, nd->inode);
+ dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode);
if (!dentry)
goto unlazy;
@@ -1383,8 +1377,6 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name,
return -ECHILD;
nd->seq = seq;
- if (unlikely(d_need_lookup(dentry)))
- goto unlazy;
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
status = d_revalidate(dentry, nd->flags);
if (unlikely(status <= 0)) {
@@ -1404,17 +1396,12 @@ unlazy:
if (unlazy_walk(nd, dentry))
return -ECHILD;
} else {
- dentry = __d_lookup(parent, name);
+ dentry = __d_lookup(parent, &nd->last);
}
if (unlikely(!dentry))
goto need_lookup;
- if (unlikely(d_need_lookup(dentry))) {
- dput(dentry);
- goto need_lookup;
- }
-
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
status = d_revalidate(dentry, nd->flags);
if (unlikely(status <= 0)) {
@@ -1445,8 +1432,7 @@ need_lookup:
}
/* Fast lookup failed, do it the slow way */
-static int lookup_slow(struct nameidata *nd, struct qstr *name,
- struct path *path)
+static int lookup_slow(struct nameidata *nd, struct path *path)
{
struct dentry *dentry, *parent;
int err;
@@ -1455,7 +1441,7 @@ static int lookup_slow(struct nameidata *nd, struct qstr *name,
BUG_ON(nd->inode != parent->d_inode);
mutex_lock(&parent->d_inode->i_mutex);
- dentry = __lookup_hash(name, parent, nd->flags);
+ dentry = __lookup_hash(&nd->last, parent, nd->flags);
mutex_unlock(&parent->d_inode->i_mutex);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -1528,7 +1514,7 @@ static inline int should_follow_link(struct inode *inode, int follow)
}
static inline int walk_component(struct nameidata *nd, struct path *path,
- struct qstr *name, int type, int follow)
+ int follow)
{
struct inode *inode;
int err;
@@ -1537,14 +1523,14 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
* to be able to know about the current root directory and
* parent relationships.
*/
- if (unlikely(type != LAST_NORM))
- return handle_dots(nd, type);
- err = lookup_fast(nd, name, path, &inode);
+ if (unlikely(nd->last_type != LAST_NORM))
+ return handle_dots(nd, nd->last_type);
+ err = lookup_fast(nd, path, &inode);
if (unlikely(err)) {
if (err < 0)
goto out_err;
- err = lookup_slow(nd, name, path);
+ err = lookup_slow(nd, path);
if (err < 0)
goto out_err;
@@ -1603,8 +1589,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
res = follow_link(&link, nd, &cookie);
if (res)
break;
- res = walk_component(nd, path, &nd->last,
- nd->last_type, LOOKUP_FOLLOW);
+ res = walk_component(nd, path, LOOKUP_FOLLOW);
put_link(nd, &link, cookie);
} while (res > 0);
@@ -1811,8 +1796,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
}
}
+ nd->last = this;
+ nd->last_type = type;
+
if (!name[len])
- goto last_component;
+ return 0;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
@@ -1821,10 +1809,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
len++;
} while (unlikely(name[len] == '/'));
if (!name[len])
- goto last_component;
+ return 0;
+
name += len;
- err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
+ err = walk_component(nd, &next, LOOKUP_FOLLOW);
if (err < 0)
return err;
@@ -1833,16 +1822,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
if (err)
return err;
}
- if (can_lookup(nd->inode))
- continue;
- err = -ENOTDIR;
- break;
- /* here ends the main loop */
-
-last_component:
- nd->last = this;
- nd->last_type = type;
- return 0;
+ if (!can_lookup(nd->inode)) {
+ err = -ENOTDIR;
+ break;
+ }
}
terminate_walk(nd);
return err;
@@ -1859,7 +1842,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
if (flags & LOOKUP_ROOT) {
struct inode *inode = nd->root.dentry->d_inode;
if (*name) {
- if (!inode->i_op->lookup)
+ if (!can_lookup(inode))
return -ENOTDIR;
retval = inode_permission(inode, MAY_EXEC);
if (retval)
@@ -1903,6 +1886,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
get_fs_pwd(current->fs, &nd->path);
}
} else {
+ /* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(dfd);
struct dentry *dentry;
@@ -1912,16 +1896,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,
dentry = f.file->f_path.dentry;
if (*name) {
- if (!S_ISDIR(dentry->d_inode->i_mode)) {
+ if (!can_lookup(dentry->d_inode)) {
fdput(f);
return -ENOTDIR;
}
-
- retval = inode_permission(dentry->d_inode, MAY_EXEC);
- if (retval) {
- fdput(f);
- return retval;
- }
}
nd->path = f.file->f_path;
@@ -1946,8 +1924,7 @@ static inline int lookup_last(struct nameidata *nd, struct path *path)
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
nd->flags &= ~LOOKUP_PARENT;
- return walk_component(nd, path, &nd->last, nd->last_type,
- nd->flags & LOOKUP_FOLLOW);
+ return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW);
}
/* Returns 0 and nd will be valid on success; returns error otherwise. */
@@ -2189,15 +2166,19 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
* path-walking is complete.
*/
static struct filename *
-user_path_parent(int dfd, const char __user *path, struct nameidata *nd)
+user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
+ unsigned int flags)
{
struct filename *s = getname(path);
int error;
+ /* only LOOKUP_REVAL is allowed in extra flags */
+ flags &= LOOKUP_REVAL;
+
if (IS_ERR(s))
return s;
- error = filename_lookup(dfd, s, LOOKUP_PARENT, nd);
+ error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
if (error) {
putname(s);
return ERR_PTR(error);
@@ -2742,7 +2723,7 @@ static int do_last(struct nameidata *nd, struct path *path,
if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
symlink_ok = true;
/* we _can_ be in RCU mode here */
- error = lookup_fast(nd, &nd->last, path, &inode);
+ error = lookup_fast(nd, path, &inode);
if (likely(!error))
goto finish_lookup;
@@ -2788,7 +2769,7 @@ retry_lookup:
goto out;
if ((*opened & FILE_CREATED) ||
- !S_ISREG(file->f_path.dentry->d_inode->i_mode))
+ !S_ISREG(file_inode(file)->i_mode))
will_truncate = false;
audit_inode(name, file->f_path.dentry, 0);
@@ -2951,8 +2932,8 @@ static struct file *path_openat(int dfd, struct filename *pathname,
int error;
file = get_empty_filp();
- if (!file)
- return ERR_PTR(-ENFILE);
+ if (IS_ERR(file))
+ return file;
file->f_flags = op->open_flag;
@@ -3044,12 +3025,22 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
return file;
}
-struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
+struct dentry *kern_path_create(int dfd, const char *pathname,
+ struct path *path, unsigned int lookup_flags)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
struct nameidata nd;
int err2;
- int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
+ int error;
+ bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
+
+ /*
+ * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
+ * other flags passed in are ignored!
+ */
+ lookup_flags &= LOOKUP_REVAL;
+
+ error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);
if (error)
return ERR_PTR(error);
@@ -3113,13 +3104,14 @@ void done_path_create(struct path *path, struct dentry *dentry)
}
EXPORT_SYMBOL(done_path_create);
-struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
+struct dentry *user_path_create(int dfd, const char __user *pathname,
+ struct path *path, unsigned int lookup_flags)
{
struct filename *tmp = getname(pathname);
struct dentry *res;
if (IS_ERR(tmp))
return ERR_CAST(tmp);
- res = kern_path_create(dfd, tmp->name, path, is_dir);
+ res = kern_path_create(dfd, tmp->name, path, lookup_flags);
putname(tmp);
return res;
}
@@ -3175,12 +3167,13 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
struct dentry *dentry;
struct path path;
int error;
+ unsigned int lookup_flags = 0;
error = may_mknod(mode);
if (error)
return error;
-
- dentry = user_path_create(dfd, filename, &path, 0);
+retry:
+ dentry = user_path_create(dfd, filename, &path, lookup_flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -3203,6 +3196,10 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
}
out:
done_path_create(&path, dentry);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -3241,8 +3238,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
struct dentry *dentry;
struct path path;
int error;
+ unsigned int lookup_flags = LOOKUP_DIRECTORY;
- dentry = user_path_create(dfd, pathname, &path, 1);
+retry:
+ dentry = user_path_create(dfd, pathname, &path, lookup_flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -3252,6 +3251,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
if (!error)
error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
done_path_create(&path, dentry);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -3327,8 +3330,9 @@ static long do_rmdir(int dfd, const char __user *pathname)
struct filename *name;
struct dentry *dentry;
struct nameidata nd;
-
- name = user_path_parent(dfd, pathname, &nd);
+ unsigned int lookup_flags = 0;
+retry:
+ name = user_path_parent(dfd, pathname, &nd, lookup_flags);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -3370,6 +3374,10 @@ exit2:
exit1:
path_put(&nd.path);
putname(name);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -3423,8 +3431,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
struct dentry *dentry;
struct nameidata nd;
struct inode *inode = NULL;
-
- name = user_path_parent(dfd, pathname, &nd);
+ unsigned int lookup_flags = 0;
+retry:
+ name = user_path_parent(dfd, pathname, &nd, lookup_flags);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -3462,6 +3471,11 @@ exit2:
exit1:
path_put(&nd.path);
putname(name);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ inode = NULL;
+ goto retry;
+ }
return error;
slashes:
@@ -3513,12 +3527,13 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
struct filename *from;
struct dentry *dentry;
struct path path;
+ unsigned int lookup_flags = 0;
from = getname(oldname);
if (IS_ERR(from))
return PTR_ERR(from);
-
- dentry = user_path_create(newdfd, newname, &path, 0);
+retry:
+ dentry = user_path_create(newdfd, newname, &path, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_putname;
@@ -3527,6 +3542,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
if (!error)
error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
done_path_create(&path, dentry);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out_putname:
putname(from);
return error;
@@ -3613,12 +3632,13 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
-
+retry:
error = user_path_at(olddfd, oldname, how, &old_path);
if (error)
return error;
- new_dentry = user_path_create(newdfd, newname, &new_path, 0);
+ new_dentry = user_path_create(newdfd, newname, &new_path,
+ (how & LOOKUP_REVAL));
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto out;
@@ -3635,6 +3655,10 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
out_dput:
done_path_create(&new_path, new_dentry);
+ if (retry_estale(error, how)) {
+ how |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
path_put(&old_path);
@@ -3807,15 +3831,17 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
struct nameidata oldnd, newnd;
struct filename *from;
struct filename *to;
+ unsigned int lookup_flags = 0;
+ bool should_retry = false;
int error;
-
- from = user_path_parent(olddfd, oldname, &oldnd);
+retry:
+ from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
if (IS_ERR(from)) {
error = PTR_ERR(from);
goto exit;
}
- to = user_path_parent(newdfd, newname, &newnd);
+ to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
if (IS_ERR(to)) {
error = PTR_ERR(to);
goto exit1;
@@ -3887,11 +3913,18 @@ exit3:
unlock_rename(new_dir, old_dir);
mnt_drop_write(oldnd.path.mnt);
exit2:
+ if (retry_estale(error, lookup_flags))
+ should_retry = true;
path_put(&newnd.path);
putname(to);
exit1:
path_put(&oldnd.path);
putname(from);
+ if (should_retry) {
+ should_retry = false;
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
exit:
return error;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 398a50ff2438..50ca17d3cb45 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -313,7 +313,7 @@ int __mnt_want_write(struct vfsmount *m)
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
- while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
+ while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
cpu_relax();
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
@@ -384,7 +384,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
*/
int __mnt_want_write_file(struct file *file)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
return __mnt_want_write(file->f_path.mnt);
@@ -1237,6 +1237,14 @@ static int do_umount(struct mount *mnt, int flags)
return retval;
}
+/*
+ * Is the caller allowed to modify his namespace?
+ */
+static inline bool may_mount(void)
+{
+ return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
+}
+
/*
* Now umount can handle mount points as well as block devices.
* This is important for filesystems which use unnamed block devices.
@@ -1255,6 +1263,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
return -EINVAL;
+ if (!may_mount())
+ return -EPERM;
+
if (!(flags & UMOUNT_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
@@ -1268,10 +1279,6 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
if (!check_mnt(mnt))
goto dput_and_out;
- retval = -EPERM;
- if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
- goto dput_and_out;
-
retval = do_umount(mnt, flags);
dput_and_out:
/* we mustn't call path_put() as that would clear mnt_expiry_mark */
@@ -1293,24 +1300,6 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
#endif
-static int mount_is_safe(struct path *path)
-{
- if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
- return 0;
- return -EPERM;
-#ifdef notyet
- if (S_ISLNK(path->dentry->d_inode->i_mode))
- return -EPERM;
- if (path->dentry->d_inode->i_mode & S_ISVTX) {
- if (current_uid() != path->dentry->d_inode->i_uid)
- return -EPERM;
- }
- if (inode_permission(path->dentry->d_inode, MAY_WRITE))
- return -EPERM;
- return 0;
-#endif
-}
-
static bool mnt_ns_loop(struct path *path)
{
/* Could bind mounting the mount namespace inode cause a
@@ -1633,9 +1622,6 @@ static int do_change_type(struct path *path, int flag)
int type;
int err = 0;
- if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
- return -EPERM;
-
if (path->dentry != path->mnt->mnt_root)
return -EINVAL;
@@ -1669,9 +1655,7 @@ static int do_loopback(struct path *path, const char *old_name,
LIST_HEAD(umount_list);
struct path old_path;
struct mount *mnt = NULL, *old;
- int err = mount_is_safe(path);
- if (err)
- return err;
+ int err;
if (!old_name || !*old_name)
return -EINVAL;
err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
@@ -1748,9 +1732,6 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
struct super_block *sb = path->mnt->mnt_sb;
struct mount *mnt = real_mount(path->mnt);
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
if (!check_mnt(mnt))
return -EINVAL;
@@ -1764,6 +1745,8 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
down_write(&sb->s_umount);
if (flags & MS_BIND)
err = change_mount_flags(path->mnt, flags);
+ else if (!capable(CAP_SYS_ADMIN))
+ err = -EPERM;
else
err = do_remount_sb(sb, flags, data, 0);
if (!err) {
@@ -1796,9 +1779,7 @@ static int do_move_mount(struct path *path, const char *old_name)
struct path old_path, parent_path;
struct mount *p;
struct mount *old;
- int err = 0;
- if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
- return -EPERM;
+ int err;
if (!old_name || !*old_name)
return -EINVAL;
err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
@@ -1933,18 +1914,13 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
- struct user_namespace *user_ns;
+ struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
struct vfsmount *mnt;
int err;
if (!fstype)
return -EINVAL;
- /* we need capabilities... */
- user_ns = real_mount(path->mnt)->mnt_ns->user_ns;
- if (!ns_capable(user_ns, CAP_SYS_ADMIN))
- return -EPERM;
-
type = get_fs_type(fstype);
if (!type)
return -ENODEV;
@@ -2258,6 +2234,9 @@ long do_mount(const char *dev_name, const char *dir_name,
if (retval)
goto dput_out;
+ if (!may_mount())
+ return -EPERM;
+
/* Default to relatime unless overridden */
if (!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;
@@ -2567,7 +2546,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
struct mount *new_mnt, *root_mnt;
int error;
- if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ if (!may_mount())
return -EPERM;
error = user_path_dir(new_root, &new);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 4117e7b377bb..816326093656 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -593,14 +593,10 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
return 1; /* I'm not sure */
qname.name = __name;
- qname.hash = full_name_hash(qname.name, qname.len);
-
- if (dentry->d_op && dentry->d_op->d_hash)
- if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0)
- goto end_advance;
-
- newdent = d_lookup(dentry, &qname);
+ newdent = d_hash_and_lookup(dentry, &qname);
+ if (unlikely(IS_ERR(newdent)))
+ goto end_advance;
if (!newdent) {
newdent = d_alloc(dentry, &qname);
if (!newdent)
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d7e9fe77188a..7dafd6899a62 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -331,12 +331,15 @@ static int ncp_show_options(struct seq_file *seq, struct dentry *root)
struct ncp_server *server = NCP_SBP(root->d_sb);
unsigned int tmp;
- if (server->m.uid != 0)
- seq_printf(seq, ",uid=%u", server->m.uid);
- if (server->m.gid != 0)
- seq_printf(seq, ",gid=%u", server->m.gid);
- if (server->m.mounted_uid != 0)
- seq_printf(seq, ",owner=%u", server->m.mounted_uid);
+ if (!uid_eq(server->m.uid, GLOBAL_ROOT_UID))
+ seq_printf(seq, ",uid=%u",
+ from_kuid_munged(&init_user_ns, server->m.uid));
+ if (!gid_eq(server->m.gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u",
+ from_kgid_munged(&init_user_ns, server->m.gid));
+ if (!uid_eq(server->m.mounted_uid, GLOBAL_ROOT_UID))
+ seq_printf(seq, ",owner=%u",
+ from_kuid_munged(&init_user_ns, server->m.mounted_uid));
tmp = server->m.file_mode & S_IALLUGO;
if (tmp != NCP_DEFAULT_FILE_MODE)
seq_printf(seq, ",mode=0%o", tmp);
@@ -381,13 +384,13 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
data->flags = 0;
data->int_flags = 0;
- data->mounted_uid = 0;
+ data->mounted_uid = GLOBAL_ROOT_UID;
data->wdog_pid = NULL;
data->ncp_fd = ~0;
data->time_out = NCP_DEFAULT_TIME_OUT;
data->retry_count = NCP_DEFAULT_RETRY_COUNT;
- data->uid = 0;
- data->gid = 0;
+ data->uid = GLOBAL_ROOT_UID;
+ data->gid = GLOBAL_ROOT_GID;
data->file_mode = NCP_DEFAULT_FILE_MODE;
data->dir_mode = NCP_DEFAULT_DIR_MODE;
data->info_fd = -1;
@@ -399,13 +402,19 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
goto err;
switch (optval) {
case 'u':
- data->uid = optint;
+ data->uid = make_kuid(current_user_ns(), optint);
+ if (!uid_valid(data->uid))
+ goto err;
break;
case 'g':
- data->gid = optint;
+ data->gid = make_kgid(current_user_ns(), optint);
+ if (!gid_valid(data->gid))
+ goto err;
break;
case 'o':
- data->mounted_uid = optint;
+ data->mounted_uid = make_kuid(current_user_ns(), optint);
+ if (!uid_valid(data->mounted_uid))
+ goto err;
break;
case 'm':
data->file_mode = optint;
@@ -480,13 +489,13 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
data.flags = md->flags;
data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE;
- data.mounted_uid = md->mounted_uid;
+ data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid);
data.wdog_pid = find_get_pid(md->wdog_pid);
data.ncp_fd = md->ncp_fd;
data.time_out = md->time_out;
data.retry_count = md->retry_count;
- data.uid = md->uid;
- data.gid = md->gid;
+ data.uid = make_kuid(current_user_ns(), md->uid);
+ data.gid = make_kgid(current_user_ns(), md->gid);
data.file_mode = md->file_mode;
data.dir_mode = md->dir_mode;
data.info_fd = -1;
@@ -499,13 +508,13 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;
data.flags = md->flags;
- data.mounted_uid = md->mounted_uid;
+ data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid);
data.wdog_pid = find_get_pid(md->wdog_pid);
data.ncp_fd = md->ncp_fd;
data.time_out = md->time_out;
data.retry_count = md->retry_count;
- data.uid = md->uid;
- data.gid = md->gid;
+ data.uid = make_kuid(current_user_ns(), md->uid);
+ data.gid = make_kgid(current_user_ns(), md->gid);
data.file_mode = md->file_mode;
data.dir_mode = md->dir_mode;
data.info_fd = -1;
@@ -520,12 +529,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
goto out;
break;
}
+ error = -EINVAL;
+ if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) ||
+ !gid_valid(data.gid))
+ goto out;
error = -EBADF;
ncp_filp = fget(data.ncp_fd);
if (!ncp_filp)
goto out;
error = -ENOTSOCK;
- sock_inode = ncp_filp->f_path.dentry->d_inode;
+ sock_inode = file_inode(ncp_filp);
if (!S_ISSOCK(sock_inode->i_mode))
goto out_fput;
sock = SOCKET_I(sock_inode);
@@ -564,7 +577,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (!server->info_filp)
goto out_bdi;
error = -ENOTSOCK;
- sock_inode = server->info_filp->f_path.dentry->d_inode;
+ sock_inode = file_inode(server->info_filp);
if (!S_ISSOCK(sock_inode->i_mode))
goto out_fput2;
info_sock = SOCKET_I(sock_inode);
@@ -886,12 +899,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
goto out;
result = -EPERM;
- if (((attr->ia_valid & ATTR_UID) &&
- (attr->ia_uid != server->m.uid)))
+ if ((attr->ia_valid & ATTR_UID) && !uid_eq(attr->ia_uid, server->m.uid))
goto out;
- if (((attr->ia_valid & ATTR_GID) &&
- (attr->ia_gid != server->m.gid)))
+ if ((attr->ia_valid & ATTR_GID) && !gid_eq(attr->ia_gid, server->m.gid))
goto out;
if (((attr->ia_valid & ATTR_MODE) &&
@@ -976,9 +987,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
goto out;
if (attr->ia_size != i_size_read(inode)) {
- result = vmtruncate(inode, attr->ia_size);
- if (result)
- goto out;
+ truncate_setsize(inode, attr->ia_size);
mark_inode_dirty(inode);
}
}
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 6958adfaff08..60426ccb3b65 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -45,7 +45,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
return -EINVAL;
}
/* TODO: info.addr = server->m.serv_addr; */
- SET_UID(info.mounted_uid, server->m.mounted_uid);
+ SET_UID(info.mounted_uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid));
info.connection = server->connection;
info.buffer_size = server->buffer_size;
info.volume_number = NCP_FINFO(inode)->volNumber;
@@ -69,7 +69,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
DPRINTK("info.version invalid: %d\n", info2.version);
return -EINVAL;
}
- info2.mounted_uid = server->m.mounted_uid;
+ info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
info2.connection = server->connection;
info2.buffer_size = server->buffer_size;
info2.volume_number = NCP_FINFO(inode)->volNumber;
@@ -135,7 +135,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
DPRINTK("info.version invalid: %d\n", info2.version);
return -EINVAL;
}
- info2.mounted_uid = server->m.mounted_uid;
+ info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
info2.connection = server->connection;
info2.buffer_size = server->buffer_size;
info2.volume_number = NCP_FINFO(inode)->volNumber;
@@ -348,22 +348,25 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
{
u16 uid;
- SET_UID(uid, server->m.mounted_uid);
+ SET_UID(uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid));
if (put_user(uid, (u16 __user *)argp))
return -EFAULT;
return 0;
}
case NCP_IOC_GETMOUNTUID32:
- if (put_user(server->m.mounted_uid,
- (u32 __user *)argp))
+ {
+ uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
+ if (put_user(uid, (u32 __user *)argp))
return -EFAULT;
return 0;
+ }
case NCP_IOC_GETMOUNTUID64:
- if (put_user(server->m.mounted_uid,
- (u64 __user *)argp))
+ {
+ uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
+ if (put_user(uid, (u64 __user *)argp))
return -EFAULT;
return 0;
-
+ }
case NCP_IOC_GETROOT:
{
struct ncp_setroot_ioctl sr;
@@ -808,9 +811,9 @@ outrel:
long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ncp_server *server = NCP_SERVER(inode);
- uid_t uid = current_uid();
+ kuid_t uid = current_uid();
int need_drop_write = 0;
long ret;
@@ -819,12 +822,12 @@ long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case NCP_IOC_CONN_LOGGED_IN:
case NCP_IOC_SETROOT:
if (!capable(CAP_SYS_ADMIN)) {
- ret = -EACCES;
+ ret = -EPERM;
goto out;
}
break;
}
- if (server->m.mounted_uid != uid) {
+ if (!uid_eq(server->m.mounted_uid, uid)) {
switch (cmd) {
/*
* Only mount owner can issue these ioctls. Information
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 63d14a99483d..ee24df5af1f9 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -105,7 +105,7 @@ static const struct vm_operations_struct ncp_file_mmap =
/* This is used for a general mmap of a ncp file */
int ncp_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
DPRINTK("ncp_mmap: called\n");
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index 54cc0cdb3dcb..c51b2c543539 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -23,15 +23,15 @@ struct ncp_mount_data_kernel {
unsigned long flags; /* NCP_MOUNT_* flags */
unsigned int int_flags; /* internal flags */
#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001
- uid_t mounted_uid; /* Who may umount() this filesystem? */
+ kuid_t mounted_uid; /* Who may umount() this filesystem? */
struct pid *wdog_pid; /* Who cares for our watchdog packets? */
unsigned int ncp_fd; /* The socket to the ncp port */
unsigned int time_out; /* How long should I wait after
sending a NCP request? */
unsigned int retry_count; /* And how often should I retry? */
unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
umode_t file_mode;
umode_t dir_mode;
int info_fd;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 4fa788c93f46..434b93ec0970 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1273,6 +1273,7 @@ static const struct nfs_pageio_ops bl_pg_write_ops = {
static struct pnfs_layoutdriver_type blocklayout_type = {
.id = LAYOUT_BLOCK_VOLUME,
.name = "LAYOUT_BLOCK_VOLUME",
+ .owner = THIS_MODULE,
.read_pagelist = bl_read_pagelist,
.write_pagelist = bl_write_pagelist,
.alloc_layout_hdr = bl_alloc_layout_hdr,
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index 862a2f16db64..5f7b053720ee 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -128,10 +128,13 @@ int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
struct super_block *pipefs_sb;
int ret = 0;
+ sunrpc_init_cache_detail(cd);
pipefs_sb = rpc_get_sb_net(net);
if (pipefs_sb) {
ret = nfs_cache_register_sb(pipefs_sb, cd);
rpc_put_sb_net(net);
+ if (ret)
+ sunrpc_destroy_cache_detail(cd);
}
return ret;
}
@@ -151,14 +154,5 @@ void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
nfs_cache_unregister_sb(pipefs_sb, cd);
rpc_put_sb_net(net);
}
-}
-
-void nfs_cache_init(struct cache_detail *cd)
-{
- sunrpc_init_cache_detail(cd);
-}
-
-void nfs_cache_destroy(struct cache_detail *cd)
-{
sunrpc_destroy_cache_detail(cd);
}
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 317db95e37f8..4116d2c3f52f 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -23,8 +23,6 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
-extern void nfs_cache_init(struct cache_detail *cd);
-extern void nfs_cache_destroy(struct cache_detail *cd);
extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd);
extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd);
extern int nfs_cache_register_sb(struct super_block *sb,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c89b26bc9759..2960512792c2 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -183,60 +183,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,
static u32 initiate_bulk_draining(struct nfs_client *clp,
struct cb_layoutrecallargs *args)
{
- struct nfs_server *server;
- struct pnfs_layout_hdr *lo;
- struct inode *ino;
- u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
- struct pnfs_layout_hdr *tmp;
- LIST_HEAD(recall_list);
- LIST_HEAD(free_me_list);
- struct pnfs_layout_range range = {
- .iomode = IOMODE_ANY,
- .offset = 0,
- .length = NFS4_MAX_UINT64,
- };
-
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- if ((args->cbl_recall_type == RETURN_FSID) &&
- memcmp(&server->fsid, &args->cbl_fsid,
- sizeof(struct nfs_fsid)))
- continue;
-
- list_for_each_entry(lo, &server->layouts, plh_layouts) {
- ino = igrab(lo->plh_inode);
- if (ino)
- continue;
- spin_lock(&ino->i_lock);
- /* Is this layout in the process of being freed? */
- if (NFS_I(ino)->layout != lo) {
- spin_unlock(&ino->i_lock);
- iput(ino);
- continue;
- }
- pnfs_get_layout_hdr(lo);
- spin_unlock(&ino->i_lock);
- list_add(&lo->plh_bulk_recall, &recall_list);
- }
- }
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
+ int stat;
- list_for_each_entry_safe(lo, tmp,
- &recall_list, plh_bulk_recall) {
- ino = lo->plh_inode;
- spin_lock(&ino->i_lock);
- set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range))
- rv = NFS4ERR_DELAY;
- list_del_init(&lo->plh_bulk_recall);
- spin_unlock(&ino->i_lock);
- pnfs_free_lseg_list(&free_me_list);
- pnfs_put_layout_hdr(lo);
- iput(ino);
- }
- return rv;
+ if (args->cbl_recall_type == RETURN_FSID)
+ stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true);
+ else
+ stat = pnfs_destroy_layouts_byclid(clp, true);
+ if (stat != 0)
+ return NFS4ERR_DELAY;
+ return NFS4ERR_NOMATCHING_LAYOUT;
}
static u32 do_callback_layoutrecall(struct nfs_client *clp,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9f3c66438d0e..84d8eae203a7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -197,7 +197,6 @@ error_0:
EXPORT_SYMBOL_GPL(nfs_alloc_client);
#if IS_ENABLED(CONFIG_NFS_V4)
-/* idr_remove_all is not needed as all id's are removed by nfs_put_client */
void nfs_cleanup_cb_ident_idr(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 81c5eec3cf38..6390a4b5fee7 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -55,7 +55,8 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
flags &= FMODE_READ|FMODE_WRITE;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
- if (delegation != NULL && (delegation->type & flags) == flags) {
+ if (delegation != NULL && (delegation->type & flags) == flags &&
+ !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
nfs_mark_delegation_referenced(delegation);
ret = 1;
}
@@ -70,8 +71,10 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
int status = 0;
if (inode->i_flock == NULL)
- goto out;
+ return 0;
+ if (inode->i_flock == NULL)
+ goto out;
/* Protect inode->i_flock using the file locks lock */
lock_flocks();
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
@@ -94,7 +97,9 @@ static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *s
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_context *ctx;
+ struct nfs4_state_owner *sp;
struct nfs4_state *state;
+ unsigned int seq;
int err;
again:
@@ -109,9 +114,16 @@ again:
continue;
get_nfs_open_context(ctx);
spin_unlock(&inode->i_lock);
+ sp = state->owner;
+ /* Block nfs4_proc_unlck */
+ mutex_lock(&sp->so_delegreturn_mutex);
+ seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
err = nfs4_open_delegation_recall(ctx, state, stateid);
- if (err >= 0)
+ if (!err)
err = nfs_delegation_claim_locks(ctx, state);
+ if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ err = -EAGAIN;
+ mutex_unlock(&sp->so_delegreturn_mutex);
put_nfs_open_context(ctx);
if (err != 0)
return err;
@@ -182,39 +194,91 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
}
static struct nfs_delegation *
+nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
+{
+ struct nfs_delegation *ret = NULL;
+ struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+
+ if (delegation == NULL)
+ goto out;
+ spin_lock(&delegation->lock);
+ if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+ ret = delegation;
+ spin_unlock(&delegation->lock);
+out:
+ return ret;
+}
+
+static struct nfs_delegation *
+nfs_start_delegation_return(struct nfs_inode *nfsi)
+{
+ struct nfs_delegation *delegation;
+
+ rcu_read_lock();
+ delegation = nfs_start_delegation_return_locked(nfsi);
+ rcu_read_unlock();
+ return delegation;
+}
+
+static void
+nfs_abort_delegation_return(struct nfs_delegation *delegation,
+ struct nfs_client *clp)
+{
+
+ spin_lock(&delegation->lock);
+ clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
+ set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ spin_unlock(&delegation->lock);
+ set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+}
+
+static struct nfs_delegation *
nfs_detach_delegation_locked(struct nfs_inode *nfsi,
- struct nfs_server *server)
+ struct nfs_delegation *delegation,
+ struct nfs_client *clp)
{
- struct nfs_delegation *delegation =
+ struct nfs_delegation *deleg_cur =
rcu_dereference_protected(nfsi->delegation,
- lockdep_is_held(&server->nfs_client->cl_lock));
+ lockdep_is_held(&clp->cl_lock));
- if (delegation == NULL)
- goto nomatch;
+ if (deleg_cur == NULL || delegation != deleg_cur)
+ return NULL;
spin_lock(&delegation->lock);
+ set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
list_del_rcu(&delegation->super_list);
delegation->inode = NULL;
nfsi->delegation_state = 0;
rcu_assign_pointer(nfsi->delegation, NULL);
spin_unlock(&delegation->lock);
return delegation;
-nomatch:
- return NULL;
}
static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
- struct nfs_server *server)
+ struct nfs_delegation *delegation,
+ struct nfs_server *server)
{
struct nfs_client *clp = server->nfs_client;
- struct nfs_delegation *delegation;
spin_lock(&clp->cl_lock);
- delegation = nfs_detach_delegation_locked(nfsi, server);
+ delegation = nfs_detach_delegation_locked(nfsi, delegation, clp);
spin_unlock(&clp->cl_lock);
return delegation;
}
+static struct nfs_delegation *
+nfs_inode_detach_delegation(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_delegation *delegation;
+
+ delegation = nfs_start_delegation_return(nfsi);
+ if (delegation == NULL)
+ return NULL;
+ return nfs_detach_delegation(nfsi, delegation, server);
+}
+
/**
* nfs_inode_set_delegation - set up a delegation on an inode
* @inode: inode to which delegation applies
@@ -268,7 +332,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
delegation = NULL;
goto out;
}
- freeme = nfs_detach_delegation_locked(nfsi, server);
+ freeme = nfs_detach_delegation_locked(nfsi,
+ old_delegation, clp);
+ if (freeme == NULL)
+ goto out;
}
list_add_rcu(&delegation->super_list, &server->delegations);
nfsi->delegation_state = delegation->type;
@@ -292,19 +359,29 @@ out:
/*
* Basic procedure for returning a delegation to the server
*/
-static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)
{
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
struct nfs_inode *nfsi = NFS_I(inode);
int err;
- /*
- * Guard against new delegated open/lock/unlock calls and against
- * state recovery
- */
- down_write(&nfsi->rwsem);
- err = nfs_delegation_claim_opens(inode, &delegation->stateid);
- up_write(&nfsi->rwsem);
- if (err)
+ if (delegation == NULL)
+ return 0;
+ do {
+ err = nfs_delegation_claim_opens(inode, &delegation->stateid);
+ if (!issync || err != -EAGAIN)
+ break;
+ /*
+ * Guard against state recovery
+ */
+ err = nfs4_wait_clnt_recover(clp);
+ } while (err == 0);
+
+ if (err) {
+ nfs_abort_delegation_return(delegation, clp);
+ goto out;
+ }
+ if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode)))
goto out;
err = nfs_do_return_delegation(inode, delegation, issync);
@@ -340,13 +417,10 @@ restart:
inode = nfs_delegation_grab_inode(delegation);
if (inode == NULL)
continue;
- delegation = nfs_detach_delegation(NFS_I(inode),
- server);
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
rcu_read_unlock();
- if (delegation != NULL)
- err = __nfs_inode_return_delegation(inode,
- delegation, 0);
+ err = nfs_end_delegation_return(inode, delegation, 0);
iput(inode);
if (!err)
goto restart;
@@ -367,15 +441,11 @@ restart:
*/
void nfs_inode_return_delegation_noreclaim(struct inode *inode)
{
- struct nfs_server *server = NFS_SERVER(inode);
- struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_delegation *delegation;
- if (rcu_access_pointer(nfsi->delegation) != NULL) {
- delegation = nfs_detach_delegation(nfsi, server);
- if (delegation != NULL)
- nfs_do_return_delegation(inode, delegation, 0);
- }
+ delegation = nfs_inode_detach_delegation(inode);
+ if (delegation != NULL)
+ nfs_do_return_delegation(inode, delegation, 0);
}
/**
@@ -390,18 +460,14 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
*/
int nfs4_inode_return_delegation(struct inode *inode)
{
- struct nfs_server *server = NFS_SERVER(inode);
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_delegation *delegation;
int err = 0;
nfs_wb_all(inode);
- if (rcu_access_pointer(nfsi->delegation) != NULL) {
- delegation = nfs_detach_delegation(nfsi, server);
- if (delegation != NULL) {
- err = __nfs_inode_return_delegation(inode, delegation, 1);
- }
- }
+ delegation = nfs_start_delegation_return(nfsi);
+ if (delegation != NULL)
+ err = nfs_end_delegation_return(inode, delegation, 1);
return err;
}
@@ -471,7 +537,7 @@ void nfs_remove_bad_delegation(struct inode *inode)
{
struct nfs_delegation *delegation;
- delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode));
+ delegation = nfs_inode_detach_delegation(inode);
if (delegation) {
nfs_inode_find_state_and_recover(inode, &delegation->stateid);
nfs_free_delegation(delegation);
@@ -649,7 +715,7 @@ restart:
if (inode == NULL)
continue;
delegation = nfs_detach_delegation(NFS_I(inode),
- server);
+ delegation, server);
rcu_read_unlock();
if (delegation != NULL)
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index bbc6a4dba0d8..d54d4fca6793 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -29,6 +29,7 @@ enum {
NFS_DELEGATION_NEED_RECLAIM = 0,
NFS_DELEGATION_RETURN,
NFS_DELEGATION_REFERENCED,
+ NFS_DELEGATION_RETURNING,
};
int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 32e6c53520e2..f23f455be42b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -281,7 +281,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
for (i = 0; i < array->size; i++) {
if (array->array[i].cookie == *desc->dir_cookie) {
- struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
+ struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
struct nfs_open_dir_context *ctx = desc->file->private_data;
new_pos = desc->current_index + i;
@@ -629,7 +629,7 @@ out:
static
int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
{
- struct inode *inode = desc->file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(desc->file);
int ret;
ret = nfs_readdir_xdr_to_array(desc, page, inode);
@@ -660,7 +660,7 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
static
struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
{
- return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+ return read_cache_page(file_inode(desc->file)->i_mapping,
desc->page_index, (filler_t *)nfs_readdir_filler, desc);
}
@@ -764,7 +764,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
{
struct page *page = NULL;
int status;
- struct inode *inode = desc->file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(desc->file);
struct nfs_open_dir_context *ctx = desc->file->private_data;
dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
@@ -1136,6 +1136,45 @@ out_error:
}
/*
+ * A weaker form of d_revalidate for revalidating just the dentry->d_inode
+ * when we don't really care about the dentry name. This is called when a
+ * pathwalk ends on a dentry that was not found via a normal lookup in the
+ * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals).
+ *
+ * In this situation, we just want to verify that the inode itself is OK
+ * since the dentry might have changed on the server.
+ */
+static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ int error;
+ struct inode *inode = dentry->d_inode;
+
+ /*
+ * I believe we can only get a negative dentry here in the case of a
+ * procfs-style symlink. Just assume it's correct for now, but we may
+ * eventually need to do something more here.
+ */
+ if (!inode) {
+ dfprintk(LOOKUPCACHE, "%s: %s/%s has negative inode\n",
+ __func__, dentry->d_parent->d_name.name,
+ dentry->d_name.name);
+ return 1;
+ }
+
+ if (is_bad_inode(inode)) {
+ dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n",
+ __func__, dentry->d_parent->d_name.name,
+ dentry->d_name.name);
+ return 0;
+ }
+
+ error = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n",
+ __func__, inode->i_ino, error ? "invalid" : "valid");
+ return !error;
+}
+
+/*
* This is called from dput() when d_count is going to 0.
*/
static int nfs_dentry_delete(const struct dentry *dentry)
@@ -1202,6 +1241,7 @@ static void nfs_d_release(struct dentry *dentry)
const struct dentry_operations nfs_dentry_operations = {
.d_revalidate = nfs_lookup_revalidate,
+ .d_weak_revalidate = nfs_weak_revalidate,
.d_delete = nfs_dentry_delete,
.d_iput = nfs_dentry_iput,
.d_automount = nfs_d_automount,
@@ -2153,12 +2193,16 @@ static int nfs_open_permission_mask(int openflags)
{
int mask = 0;
- if ((openflags & O_ACCMODE) != O_WRONLY)
- mask |= MAY_READ;
- if ((openflags & O_ACCMODE) != O_RDONLY)
- mask |= MAY_WRITE;
- if (openflags & __FMODE_EXEC)
- mask |= MAY_EXEC;
+ if (openflags & __FMODE_EXEC) {
+ /* ONLY check exec rights */
+ mask = MAY_EXEC;
+ } else {
+ if ((openflags & O_ACCMODE) != O_WRONLY)
+ mask |= MAY_READ;
+ if ((openflags & O_ACCMODE) != O_RDONLY)
+ mask |= MAY_WRITE;
+ }
+
return mask;
}
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index ca4b11ec87a2..945527092295 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/dns_resolver.h>
#include "dns_resolve.h"
@@ -42,6 +43,7 @@ EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
#include <linux/seq_file.h>
#include <linux/inet.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/svcauth.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
@@ -142,7 +144,7 @@ static int nfs_dns_upcall(struct cache_detail *cd,
ret = nfs_cache_upcall(cd, key->hostname);
if (ret)
- ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
+ ret = sunrpc_cache_pipe_upcall(cd, ch);
return ret;
}
@@ -351,60 +353,47 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
}
EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
+static struct cache_detail nfs_dns_resolve_template = {
+ .owner = THIS_MODULE,
+ .hash_size = NFS_DNS_HASHTBL_SIZE,
+ .name = "dns_resolve",
+ .cache_put = nfs_dns_ent_put,
+ .cache_upcall = nfs_dns_upcall,
+ .cache_request = nfs_dns_request,
+ .cache_parse = nfs_dns_parse,
+ .cache_show = nfs_dns_show,
+ .match = nfs_dns_match,
+ .init = nfs_dns_ent_init,
+ .update = nfs_dns_ent_update,
+ .alloc = nfs_dns_ent_alloc,
+};
+
+
int nfs_dns_resolver_cache_init(struct net *net)
{
- int err = -ENOMEM;
+ int err;
struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct cache_detail *cd;
- struct cache_head **tbl;
- cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL);
- if (cd == NULL)
- goto err_cd;
-
- tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *),
- GFP_KERNEL);
- if (tbl == NULL)
- goto err_tbl;
-
- cd->owner = THIS_MODULE,
- cd->hash_size = NFS_DNS_HASHTBL_SIZE,
- cd->hash_table = tbl,
- cd->name = "dns_resolve",
- cd->cache_put = nfs_dns_ent_put,
- cd->cache_upcall = nfs_dns_upcall,
- cd->cache_parse = nfs_dns_parse,
- cd->cache_show = nfs_dns_show,
- cd->match = nfs_dns_match,
- cd->init = nfs_dns_ent_init,
- cd->update = nfs_dns_ent_update,
- cd->alloc = nfs_dns_ent_alloc,
-
- nfs_cache_init(cd);
- err = nfs_cache_register_net(net, cd);
+ nn->nfs_dns_resolve = cache_create_net(&nfs_dns_resolve_template, net);
+ if (IS_ERR(nn->nfs_dns_resolve))
+ return PTR_ERR(nn->nfs_dns_resolve);
+
+ err = nfs_cache_register_net(net, nn->nfs_dns_resolve);
if (err)
goto err_reg;
- nn->nfs_dns_resolve = cd;
return 0;
err_reg:
- nfs_cache_destroy(cd);
- kfree(cd->hash_table);
-err_tbl:
- kfree(cd);
-err_cd:
+ cache_destroy_net(nn->nfs_dns_resolve, net);
return err;
}
void nfs_dns_resolver_cache_destroy(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct cache_detail *cd = nn->nfs_dns_resolve;
- nfs_cache_unregister_net(net, cd);
- nfs_cache_destroy(cd);
- kfree(cd->hash_table);
- kfree(cd);
+ nfs_cache_unregister_net(net, nn->nfs_dns_resolve);
+ cache_destroy_net(nn->nfs_dns_resolve, net);
}
static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3c2b893665ba..29f4a48a0ee6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -292,7 +292,7 @@ static int
nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
int ret;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index c817787fbdb4..24d1d1c5fcaf 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -307,6 +307,7 @@ void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
nfs_fscache_inode_unlock(inode);
}
}
+EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie);
/*
* Replace a per-inode cookie due to revalidation detecting a file having
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index c5b11b53ff33..4ecb76652eba 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -153,6 +153,22 @@ static inline void nfs_readpage_to_fscache(struct inode *inode,
}
/*
+ * Invalidate the contents of fscache for this inode. This will not sleep.
+ */
+static inline void nfs_fscache_invalidate(struct inode *inode)
+{
+ fscache_invalidate(NFS_I(inode)->fscache);
+}
+
+/*
+ * Wait for an object to finish being invalidated.
+ */
+static inline void nfs_fscache_wait_on_invalidate(struct inode *inode)
+{
+ fscache_wait_on_invalidate(NFS_I(inode)->fscache);
+}
+
+/*
* indicate the client caching state as readable text
*/
static inline const char *nfs_server_fscache_state(struct nfs_server *server)
@@ -162,7 +178,6 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)
return "no ";
}
-
#else /* CONFIG_NFS_FSCACHE */
static inline int nfs_fscache_register(void) { return 0; }
static inline void nfs_fscache_unregister(void) {}
@@ -205,6 +220,10 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
static inline void nfs_readpage_to_fscache(struct inode *inode,
struct page *page, int sync) {}
+
+static inline void nfs_fscache_invalidate(struct inode *inode) {}
+static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {}
+
static inline const char *nfs_server_fscache_state(struct nfs_server *server)
{
return "no ";
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 033803c36644..44efaa8c5f78 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -126,8 +126,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
}
spin_unlock(&ret->d_lock);
out:
- if (name)
- kfree(name);
+ kfree(name);
nfs_free_fattr(fsinfo.fattr);
return ret;
}
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index bc3968fa81e5..dc0f98dfa717 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -97,7 +97,7 @@ static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
{
struct nfs4_string *owner = fattr->owner_name;
- __u32 uid;
+ kuid_t uid;
if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))
return false;
@@ -111,7 +111,7 @@ static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr
static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
{
struct nfs4_string *group = fattr->group_name;
- __u32 gid;
+ kgid_t gid;
if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))
return false;
@@ -193,7 +193,8 @@ static int nfs_idmap_init_keyring(void)
if (!cred)
return -ENOMEM;
- keyring = keyring_alloc(".id_resolver", 0, 0, cred,
+ keyring = keyring_alloc(".id_resolver",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
(KEY_POS_ALL & ~KEY_POS_SETATTR) |
KEY_USR_VIEW | KEY_USR_READ,
KEY_ALLOC_NOT_IN_QUOTA, NULL);
@@ -764,7 +765,7 @@ out:
static ssize_t
idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
{
- struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
+ struct rpc_inode *rpci = RPC_I(file_inode(filp));
struct idmap *idmap = (struct idmap *)rpci->private;
struct key_construction *cons;
struct idmap_msg im;
@@ -836,43 +837,61 @@ idmap_release_pipe(struct inode *inode)
nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);
}
-int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)
{
struct idmap *idmap = server->nfs_client->cl_idmap;
+ __u32 id = -1;
+ int ret = 0;
- if (nfs_map_string_to_numeric(name, namelen, uid))
- return 0;
- return nfs_idmap_lookup_id(name, namelen, "uid", uid, idmap);
+ if (!nfs_map_string_to_numeric(name, namelen, &id))
+ ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap);
+ if (ret == 0) {
+ *uid = make_kuid(&init_user_ns, id);
+ if (!uid_valid(*uid))
+ ret = -ERANGE;
+ }
+ return ret;
}
-int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid)
{
struct idmap *idmap = server->nfs_client->cl_idmap;
+ __u32 id = -1;
+ int ret = 0;
- if (nfs_map_string_to_numeric(name, namelen, gid))
- return 0;
- return nfs_idmap_lookup_id(name, namelen, "gid", gid, idmap);
+ if (!nfs_map_string_to_numeric(name, namelen, &id))
+ ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap);
+ if (ret == 0) {
+ *gid = make_kgid(&init_user_ns, id);
+ if (!gid_valid(*gid))
+ ret = -ERANGE;
+ }
+ return ret;
}
-int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
+int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen)
{
struct idmap *idmap = server->nfs_client->cl_idmap;
int ret = -EINVAL;
+ __u32 id;
+ id = from_kuid(&init_user_ns, uid);
if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
- ret = nfs_idmap_lookup_name(uid, "user", buf, buflen, idmap);
+ ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap);
if (ret < 0)
- ret = nfs_map_numeric_to_string(uid, buf, buflen);
+ ret = nfs_map_numeric_to_string(id, buf, buflen);
return ret;
}
-int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
+int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen)
{
struct idmap *idmap = server->nfs_client->cl_idmap;
int ret = -EINVAL;
+ __u32 id;
+ id = from_kgid(&init_user_ns, gid);
if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
- ret = nfs_idmap_lookup_name(gid, "group", buf, buflen, idmap);
+ ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap);
if (ret < 0)
- ret = nfs_map_numeric_to_string(gid, buf, buflen);
+ ret = nfs_map_numeric_to_string(id, buf, buflen);
return ret;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2faae14d89f4..1f941674b089 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -161,10 +161,12 @@ static void nfs_zap_caches_locked(struct inode *inode)
nfsi->attrtimeo_timestamp = jiffies;
memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
- if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
+ if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
- else
+ nfs_fscache_invalidate(inode);
+ } else {
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
+ }
}
void nfs_zap_caches(struct inode *inode)
@@ -179,6 +181,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
if (mapping->nrpages != 0) {
spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+ nfs_fscache_invalidate(inode);
spin_unlock(&inode->i_lock);
}
}
@@ -234,6 +237,8 @@ nfs_find_actor(struct inode *inode, void *opaque)
if (NFS_FILEID(inode) != fattr->fileid)
return 0;
+ if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode))
+ return 0;
if (nfs_compare_fh(NFS_FH(inode), fh))
return 0;
if (is_bad_inode(inode) || NFS_STALE(inode))
@@ -329,8 +334,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
inode->i_version = 0;
inode->i_size = 0;
clear_nlink(inode);
- inode->i_uid = -2;
- inode->i_gid = -2;
+ inode->i_uid = make_kuid(&init_user_ns, -2);
+ inode->i_gid = make_kgid(&init_user_ns, -2);
inode->i_blocks = 0;
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
nfsi->write_io = 0;
@@ -691,10 +696,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
if (ctx->cred != NULL)
put_rpccred(ctx->cred);
dput(ctx->dentry);
- if (is_sync)
- nfs_sb_deactive(sb);
- else
- nfs_sb_deactive_async(sb);
+ nfs_sb_deactive(sb);
kfree(ctx->mdsthreshold);
kfree(ctx);
}
@@ -711,7 +713,7 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
*/
void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct nfs_inode *nfsi = NFS_I(inode);
filp->private_data = get_nfs_open_context(ctx);
@@ -744,7 +746,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
static void nfs_file_clear_open_context(struct file *filp)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct nfs_open_context *ctx = nfs_file_open_context(filp);
if (ctx) {
@@ -881,7 +883,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
spin_unlock(&inode->i_lock);
nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
- nfs_fscache_reset_inode_cookie(inode);
+ nfs_fscache_wait_on_invalidate(inode);
dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
inode->i_sb->s_id, (long long)NFS_FILEID(inode));
return 0;
@@ -957,6 +959,10 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
i_size_write(inode, nfs_size_to_loff_t(fattr->size));
ret |= NFS_INO_INVALID_ATTR;
}
+
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ nfs_fscache_invalidate(inode);
+
return ret;
}
@@ -1002,9 +1008,9 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
/* Have any file permissions changed? */
if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
- if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
+ if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))
invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
- if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
+ if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid))
invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
/* Has the link count changed? */
@@ -1205,8 +1211,10 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
struct nfs_inode *nfsi = NFS_I(inode);
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode)) {
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ nfs_fscache_invalidate(inode);
+ }
if ((fattr->valid & NFS_ATTR_FATTR) == 0)
return 0;
return nfs_refresh_inode_locked(inode, fattr);
@@ -1431,7 +1439,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
| NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
- if (inode->i_uid != fattr->uid) {
+ if (!uid_eq(inode->i_uid, fattr->uid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_uid = fattr->uid;
}
@@ -1442,7 +1450,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
| NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
- if (inode->i_gid != fattr->gid) {
+ if (!gid_eq(inode->i_gid, fattr->gid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_gid = fattr->gid;
}
@@ -1494,6 +1502,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
(save_cache_validity & NFS_INO_REVAL_FORCED))
nfsi->cache_validity |= invalid;
+ if (invalid & NFS_INO_INVALID_DATA)
+ nfs_fscache_invalidate(inode);
+
return 0;
out_err:
/*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f0e6c7df1a07..541c9ebdbc5a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -329,7 +329,6 @@ extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
extern void nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
-extern void nfs_sb_deactive_async(struct super_block *sb);
/* namespace.c */
#define NFS_PATH_CANONICAL 1
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index dd057bc6b65b..fc8dc20fdeb9 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -177,11 +177,31 @@ out_nofree:
return mnt;
}
+static int
+nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+ if (NFS_FH(dentry->d_inode)->size != 0)
+ return nfs_getattr(mnt, dentry, stat);
+ generic_fillattr(dentry->d_inode, stat);
+ return 0;
+}
+
+static int
+nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ if (NFS_FH(dentry->d_inode)->size != 0)
+ return nfs_setattr(dentry, attr);
+ return -EACCES;
+}
+
const struct inode_operations nfs_mountpoint_inode_operations = {
.getattr = nfs_getattr,
+ .setattr = nfs_setattr,
};
const struct inode_operations nfs_referral_inode_operations = {
+ .getattr = nfs_namespace_getattr,
+ .setattr = nfs_namespace_setattr,
};
static void nfs_expire_automounts(struct work_struct *work)
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 06b9df49f7f7..62db136339ea 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -290,8 +290,13 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
fattr->mode = be32_to_cpup(p++);
fattr->nlink = be32_to_cpup(p++);
- fattr->uid = be32_to_cpup(p++);
- fattr->gid = be32_to_cpup(p++);
+ fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++));
+ if (!uid_valid(fattr->uid))
+ goto out_uid;
+ fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++));
+ if (!gid_valid(fattr->gid))
+ goto out_gid;
+
fattr->size = be32_to_cpup(p++);
fattr->du.nfs2.blocksize = be32_to_cpup(p++);
@@ -313,6 +318,12 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
return 0;
+out_uid:
+ dprintk("NFS: returned invalid uid\n");
+ return -EINVAL;
+out_gid:
+ dprintk("NFS: returned invalid gid\n");
+ return -EINVAL;
out_overflow:
print_overflow_msg(__func__, xdr);
return -EIO;
@@ -351,11 +362,11 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
else
*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
if (attr->ia_valid & ATTR_UID)
- *p++ = cpu_to_be32(attr->ia_uid);
+ *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid));
else
*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
if (attr->ia_valid & ATTR_GID)
- *p++ = cpu_to_be32(attr->ia_gid);
+ *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid));
else
*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
if (attr->ia_valid & ATTR_SIZE)
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 70efb63b1e42..43ea96ced28c 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -872,7 +872,7 @@ static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
static int
nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
}
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index bffc32406fbf..fa6d72131c19 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -592,13 +592,13 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
if (attr->ia_valid & ATTR_UID) {
*p++ = xdr_one;
- *p++ = cpu_to_be32(attr->ia_uid);
+ *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid));
} else
*p++ = xdr_zero;
if (attr->ia_valid & ATTR_GID) {
*p++ = xdr_one;
- *p++ = cpu_to_be32(attr->ia_gid);
+ *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid));
} else
*p++ = xdr_zero;
@@ -657,8 +657,12 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
fattr->nlink = be32_to_cpup(p++);
- fattr->uid = be32_to_cpup(p++);
- fattr->gid = be32_to_cpup(p++);
+ fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++));
+ if (!uid_valid(fattr->uid))
+ goto out_uid;
+ fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++));
+ if (!gid_valid(fattr->gid))
+ goto out_gid;
p = xdr_decode_size3(p, &fattr->size);
p = xdr_decode_size3(p, &fattr->du.nfs3.used);
@@ -675,6 +679,12 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
fattr->valid |= NFS_ATTR_FATTR_V3;
return 0;
+out_uid:
+ dprintk("NFS: returned invalid uid\n");
+ return -EINVAL;
+out_gid:
+ dprintk("NFS: returned invalid gid\n");
+ return -EINVAL;
out_overflow:
print_overflow_msg(__func__, xdr);
return -EIO;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a3f488b074a2..944c9a5c1039 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -13,6 +13,8 @@
#define NFS4_MAX_LOOP_ON_RECOVER (10)
+#include <linux/seqlock.h>
+
struct idmap;
enum nfs4_client_state {
@@ -90,6 +92,8 @@ struct nfs4_state_owner {
unsigned long so_flags;
struct list_head so_states;
struct nfs_seqid_counter so_seqid;
+ seqcount_t so_reclaim_seqcount;
+ struct mutex so_delegreturn_mutex;
};
enum {
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index acc347268124..ac4fc9a8fdbc 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -6,6 +6,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_idmap.h>
#include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/auth.h>
#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/bc_xprt.h>
@@ -29,15 +30,14 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
if (clp->rpc_ops->version != 4 || minorversion != 0)
return ret;
-retry:
- if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL))
- return -ENOMEM;
+ idr_preload(GFP_KERNEL);
spin_lock(&nn->nfs_client_lock);
- ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident);
+ ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT);
+ if (ret >= 0)
+ clp->cl_cb_ident = ret;
spin_unlock(&nn->nfs_client_lock);
- if (ret == -EAGAIN)
- goto retry;
- return ret;
+ idr_preload_end();
+ return ret < 0 ? ret : 0;
}
#ifdef CONFIG_NFS_V4_1
@@ -236,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
error = nfs4_discover_server_trunking(clp, &old);
if (error < 0)
goto error;
+ nfs_put_client(clp);
if (clp != old) {
clp->cl_preserve_clid = true;
- nfs_put_client(clp);
clp = old;
- atomic_inc(&clp->cl_count);
}
return clp;
@@ -306,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
.clientid = new->cl_clientid,
.confirm = new->cl_confirm,
};
- int status;
+ int status = -NFS4ERR_STALE_CLIENTID;
spin_lock(&nn->nfs_client_lock);
list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
@@ -332,40 +331,33 @@ int nfs40_walk_client_list(struct nfs_client *new,
if (prev)
nfs_put_client(prev);
+ prev = pos;
status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
- if (status == 0) {
+ switch (status) {
+ case -NFS4ERR_STALE_CLIENTID:
+ break;
+ case 0:
nfs4_swap_callback_idents(pos, new);
- nfs_put_client(pos);
+ prev = NULL;
*result = pos;
dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
__func__, pos, atomic_read(&pos->cl_count));
- return 0;
- }
- if (status != -NFS4ERR_STALE_CLIENTID) {
- nfs_put_client(pos);
- dprintk("NFS: <-- %s status = %d, no result\n",
- __func__, status);
- return status;
+ default:
+ goto out;
}
spin_lock(&nn->nfs_client_lock);
- prev = pos;
}
+ spin_unlock(&nn->nfs_client_lock);
- /*
- * No matching nfs_client found. This should be impossible,
- * because the new nfs_client has already been added to
- * nfs_client_list by nfs_get_client().
- *
- * Don't BUG(), since the caller is holding a mutex.
- */
+ /* No match found. The server lost our clientid */
+out:
if (prev)
nfs_put_client(prev);
- spin_unlock(&nn->nfs_client_lock);
- pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
- return -NFS4ERR_STALE_CLIENTID;
+ dprintk("NFS: <-- %s status = %d\n", __func__, status);
+ return status;
}
#ifdef CONFIG_NFS_V4_1
@@ -432,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
{
struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
struct nfs_client *pos, *n, *prev = NULL;
- int error;
+ int status = -NFS4ERR_STALE_CLIENTID;
spin_lock(&nn->nfs_client_lock);
list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
@@ -448,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new,
nfs_put_client(prev);
prev = pos;
- error = nfs_wait_client_init_complete(pos);
- if (error < 0) {
+ nfs4_schedule_lease_recovery(pos);
+ status = nfs_wait_client_init_complete(pos);
+ if (status < 0) {
nfs_put_client(pos);
spin_lock(&nn->nfs_client_lock);
continue;
}
-
+ status = pos->cl_cons_state;
spin_lock(&nn->nfs_client_lock);
+ if (status < 0)
+ continue;
}
if (pos->rpc_ops != new->rpc_ops)
@@ -473,6 +468,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (!nfs4_match_serverowners(pos, new))
continue;
+ atomic_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
__func__, pos, atomic_read(&pos->cl_count));
@@ -481,16 +477,10 @@ int nfs41_walk_client_list(struct nfs_client *new,
return 0;
}
- /*
- * No matching nfs_client found. This should be impossible,
- * because the new nfs_client has already been added to
- * nfs_client_list by nfs_get_client().
- *
- * Don't BUG(), since the caller is holding a mutex.
- */
+ /* No matching nfs_client found. */
spin_unlock(&nn->nfs_client_lock);
- pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
- return -NFS4ERR_STALE_CLIENTID;
+ dprintk("NFS: <-- %s status = %d\n", __func__, status);
+ return status;
}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e7699308364a..13e6bb3e3fe5 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -5,6 +5,7 @@
*/
#include <linux/nfs_fs.h>
#include "internal.h"
+#include "fscache.h"
#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_FILE
@@ -74,6 +75,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
nfs_file_set_open_context(filp, ctx);
+ nfs_fscache_set_inode_cookie(inode, filp);
err = 0;
out_put_ctx:
@@ -92,7 +94,7 @@ static int
nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
int ret;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 194c48410336..49eeb044c109 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -99,7 +99,8 @@ static void filelayout_reset_write(struct nfs_write_data *data)
task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
&hdr->pages,
- hdr->completion_ops);
+ hdr->completion_ops,
+ hdr->dreq);
}
}
@@ -119,7 +120,8 @@ static void filelayout_reset_read(struct nfs_read_data *data)
task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
&hdr->pages,
- hdr->completion_ops);
+ hdr->completion_ops,
+ hdr->dreq);
}
}
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 8c07241fe52b..b8da95548d3d 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -36,7 +36,7 @@
* Default data server connection timeout and retrans vaules.
* Set by module paramters dataserver_timeo and dataserver_retrans.
*/
-#define NFS4_DEF_DS_TIMEO 60
+#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
#define NFS4_DEF_DS_RETRANS 5
/*
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index b720064bcd7f..1fe284f01f8b 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -31,6 +31,7 @@
#include <linux/nfs_fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
+#include <linux/sunrpc/addr.h>
#include "internal.h"
#include "nfs4session.h"
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 1e09eb78543b..0dd766079e1c 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/vfs.h>
#include <linux/inet.h>
#include "internal.h"
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 493f0f41c554..b2671cb0f901 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -64,7 +64,7 @@
#include "pnfs.h"
#include "netns.h"
#include "nfs4session.h"
-
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_PROC
@@ -93,6 +93,8 @@ static int nfs4_map_errors(int err)
return err;
switch (err) {
case -NFS4ERR_RESOURCE:
+ case -NFS4ERR_LAYOUTTRYLATER:
+ case -NFS4ERR_RECALLCONFLICT:
return -EREMOTEIO;
case -NFS4ERR_WRONGSEC:
return -EPERM;
@@ -734,6 +736,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
if (!cinfo->atomic || cinfo->before != dir->i_version)
nfs_force_lookup_revalidate(dir);
dir->i_version = cinfo->after;
+ nfs_fscache_invalidate(dir);
spin_unlock(&dir->i_lock);
}
@@ -895,6 +898,8 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
return 0;
if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
return 0;
+ if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+ return 0;
nfs_mark_delegation_referenced(delegation);
return 1;
}
@@ -972,6 +977,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
spin_lock(&deleg_cur->lock);
if (nfsi->delegation != deleg_cur ||
+ test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) ||
(deleg_cur->type & fmode) != fmode)
goto no_delegation_unlock;
@@ -1154,6 +1160,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
data->o_arg.fmode);
iput(inode);
out:
+ nfs_release_seqid(data->o_arg.seqid);
return state;
err_put_inode:
iput(inode);
@@ -1351,19 +1358,18 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
case -NFS4ERR_BAD_HIGH_SLOT:
case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
case -NFS4ERR_DEADSESSION:
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
+ err = -EAGAIN;
goto out;
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_STALE_STATEID:
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
case -NFS4ERR_EXPIRED:
/* Don't recall a delegation if it was lost */
nfs4_schedule_lease_recovery(server->nfs_client);
+ err = -EAGAIN;
goto out;
- case -ERESTARTSYS:
- /*
- * The show must go on: exit, but mark the
- * stateid as needing recovery.
- */
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
@@ -1374,6 +1380,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
err = 0;
goto out;
}
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
err = nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
out:
@@ -1462,7 +1469,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
struct nfs4_state_owner *sp = data->owner;
if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
- return;
+ goto out_wait;
/*
* Check if we still need to send an OPEN call, or if we can use
* a delegation instead.
@@ -1497,6 +1504,7 @@ unlock_no_action:
rcu_read_unlock();
out_no_action:
task->tk_action = NULL;
+out_wait:
nfs4_sequence_done(task, &data->o_res.seq_res);
}
@@ -1625,7 +1633,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
static int nfs4_opendata_access(struct rpc_cred *cred,
struct nfs4_opendata *opendata,
- struct nfs4_state *state, fmode_t fmode)
+ struct nfs4_state *state, fmode_t fmode,
+ int openflags)
{
struct nfs_access_entry cache;
u32 mask;
@@ -1637,11 +1646,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
mask = 0;
/* don't check MAY_WRITE - a newly created file may not have
- * write mode bits, but POSIX allows the creating process to write */
- if (fmode & FMODE_READ)
- mask |= MAY_READ;
- if (fmode & FMODE_EXEC)
- mask |= MAY_EXEC;
+ * write mode bits, but POSIX allows the creating process to write.
+ * use openflags to check for exec, because fmode won't
+ * always have FMODE_EXEC set when file open for exec. */
+ if (openflags & __FMODE_EXEC) {
+ /* ONLY check for exec rights */
+ mask = MAY_EXEC;
+ } else if (fmode & FMODE_READ)
+ mask = MAY_READ;
cache.cred = cred;
cache.jiffies = jiffies;
@@ -1840,6 +1852,43 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
sattr->ia_valid |= ATTR_MTIME;
}
+static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
+ fmode_t fmode,
+ int flags,
+ struct nfs4_state **res)
+{
+ struct nfs4_state_owner *sp = opendata->owner;
+ struct nfs_server *server = sp->so_server;
+ struct nfs4_state *state;
+ unsigned int seq;
+ int ret;
+
+ seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+
+ ret = _nfs4_proc_open(opendata);
+ if (ret != 0)
+ goto out;
+
+ state = nfs4_opendata_to_nfs4_state(opendata);
+ ret = PTR_ERR(state);
+ if (IS_ERR(state))
+ goto out;
+ if (server->caps & NFS_CAP_POSIX_LOCK)
+ set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+
+ ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
+ if (ret != 0)
+ goto out;
+
+ if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
+ nfs4_schedule_stateid_recovery(server, state);
+ nfs4_wait_clnt_recover(server->nfs_client);
+ }
+ *res = state;
+out:
+ return ret;
+}
+
/*
* Returns a referenced nfs4_state
*/
@@ -1884,18 +1933,7 @@ static int _nfs4_do_open(struct inode *dir,
if (dentry->d_inode != NULL)
opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
- status = _nfs4_proc_open(opendata);
- if (status != 0)
- goto err_opendata_put;
-
- state = nfs4_opendata_to_nfs4_state(opendata);
- status = PTR_ERR(state);
- if (IS_ERR(state))
- goto err_opendata_put;
- if (server->caps & NFS_CAP_POSIX_LOCK)
- set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
-
- status = nfs4_opendata_access(cred, opendata, state, fmode);
+ status = _nfs4_open_and_get_state(opendata, fmode, flags, &state);
if (status != 0)
goto err_opendata_put;
@@ -2083,7 +2121,7 @@ static void nfs4_free_closedata(void *data)
nfs4_put_open_state(calldata->state);
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_state_owner(sp);
- nfs_sb_deactive_async(sb);
+ nfs_sb_deactive(sb);
kfree(calldata);
}
@@ -2145,7 +2183,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
dprintk("%s: begin!\n", __func__);
if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
- return;
+ goto out_wait;
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
@@ -2167,16 +2205,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
if (!call_close) {
/* Note: exit _without_ calling nfs4_close_done */
- task->tk_action = NULL;
- nfs4_sequence_done(task, &calldata->res.seq_res);
- goto out;
+ goto out_no_action;
}
if (calldata->arg.fmode == 0) {
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
if (calldata->roc &&
pnfs_roc_drain(inode, &calldata->roc_barrier, task))
- goto out;
+ goto out_wait;
}
nfs_fattr_init(calldata->res.fattr);
@@ -2186,8 +2222,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
&calldata->res.seq_res,
task) != 0)
nfs_release_seqid(calldata->arg.seqid);
-out:
dprintk("%s: done!\n", __func__);
+ return;
+out_no_action:
+ task->tk_action = NULL;
+out_wait:
+ nfs4_sequence_done(task, &calldata->res.seq_res);
}
static const struct rpc_call_ops nfs4_close_ops = {
@@ -4418,12 +4458,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
struct nfs4_unlockdata *calldata = data;
if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
- return;
+ goto out_wait;
if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
/* Note: exit _without_ running nfs4_locku_done */
- task->tk_action = NULL;
- nfs4_sequence_done(task, &calldata->res.seq_res);
- return;
+ goto out_no_action;
}
calldata->timestamp = jiffies;
if (nfs4_setup_sequence(calldata->server,
@@ -4431,6 +4469,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
&calldata->res.seq_res,
task) != 0)
nfs_release_seqid(calldata->arg.seqid);
+ return;
+out_no_action:
+ task->tk_action = NULL;
+out_wait:
+ nfs4_sequence_done(task, &calldata->res.seq_res);
}
static const struct rpc_call_ops nfs4_locku_ops = {
@@ -4477,7 +4520,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
{
- struct nfs_inode *nfsi = NFS_I(state->inode);
+ struct inode *inode = state->inode;
+ struct nfs4_state_owner *sp = state->owner;
+ struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_seqid *seqid;
struct nfs4_lock_state *lsp;
struct rpc_task *task;
@@ -4487,12 +4532,17 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
status = nfs4_set_lock_state(state, request);
/* Unlock _before_ we do the RPC call */
request->fl_flags |= FL_EXISTS;
+ /* Exclude nfs_delegation_claim_locks() */
+ mutex_lock(&sp->so_delegreturn_mutex);
+ /* Exclude nfs4_reclaim_open_stateid() - note nesting! */
down_read(&nfsi->rwsem);
if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
up_read(&nfsi->rwsem);
+ mutex_unlock(&sp->so_delegreturn_mutex);
goto out;
}
up_read(&nfsi->rwsem);
+ mutex_unlock(&sp->so_delegreturn_mutex);
if (status != 0)
goto out;
/* Is this a delegated lock? */
@@ -4571,7 +4621,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
dprintk("%s: begin!\n", __func__);
if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
- return;
+ goto out_wait;
/* Do we need to do an open_to_lock_owner? */
if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
@@ -4591,6 +4641,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
nfs_release_seqid(data->arg.open_seqid);
out_release_lock_seqid:
nfs_release_seqid(data->arg.lock_seqid);
+out_wait:
+ nfs4_sequence_done(task, &data->res.seq_res);
dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
}
@@ -4808,8 +4860,10 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
{
+ struct nfs4_state_owner *sp = state->owner;
struct nfs_inode *nfsi = NFS_I(state->inode);
unsigned char fl_flags = request->fl_flags;
+ unsigned int seq;
int status = -ENOLCK;
if ((fl_flags & FL_POSIX) &&
@@ -4831,9 +4885,16 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
status = do_vfs_lock(request->fl_file, request);
goto out_unlock;
}
+ seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ up_read(&nfsi->rwsem);
status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
if (status != 0)
+ goto out;
+ down_read(&nfsi->rwsem);
+ if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
+ status = -NFS4ERR_DELAY;
goto out_unlock;
+ }
/* Note: we always want to sleep here! */
request->fl_flags = fl_flags | FL_SLEEP;
if (do_vfs_lock(request->fl_file, request) < 0)
@@ -4940,24 +5001,22 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
case 0:
case -ESTALE:
goto out;
- case -NFS4ERR_EXPIRED:
- nfs4_schedule_stateid_recovery(server, state);
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_STALE_STATEID:
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
+ case -NFS4ERR_EXPIRED:
nfs4_schedule_lease_recovery(server->nfs_client);
+ err = -EAGAIN;
goto out;
case -NFS4ERR_BADSESSION:
case -NFS4ERR_BADSLOT:
case -NFS4ERR_BAD_HIGH_SLOT:
case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
case -NFS4ERR_DEADSESSION:
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
+ err = -EAGAIN;
goto out;
- case -ERESTARTSYS:
- /*
- * The show must go on: exit, but mark the
- * stateid as needing recovery.
- */
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
@@ -4970,9 +5029,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
/* kill_proc(fl->fl_pid, SIGLOST, 1); */
err = 0;
goto out;
- case -NFS4ERR_DELAY:
- break;
}
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
err = nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
out:
@@ -5990,6 +6048,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
struct nfs_server *server = NFS_SERVER(inode);
struct pnfs_layout_hdr *lo;
struct nfs4_state *state = NULL;
+ unsigned long timeo, giveup;
dprintk("--> %s\n", __func__);
@@ -6001,7 +6060,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
goto out;
case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
- task->tk_status = -NFS4ERR_DELAY;
+ timeo = rpc_get_timeout(task->tk_client);
+ giveup = lgp->args.timestamp + timeo;
+ if (time_after(giveup, jiffies))
+ task->tk_status = -NFS4ERR_DELAY;
break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
@@ -6074,11 +6136,13 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
static void nfs4_layoutget_release(void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
- struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+ struct inode *inode = lgp->args.inode;
+ struct nfs_server *server = NFS_SERVER(inode);
size_t max_pages = max_response_pages(server);
dprintk("--> %s\n", __func__);
nfs4_free_pages(lgp->args.layout.pages, max_pages);
+ pnfs_put_layout_hdr(NFS_I(inode)->layout);
put_nfs_open_context(lgp->args.ctx);
kfree(calldata);
dprintk("<-- %s\n", __func__);
@@ -6093,7 +6157,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
struct pnfs_layout_segment *
nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
{
- struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+ struct inode *inode = lgp->args.inode;
+ struct nfs_server *server = NFS_SERVER(inode);
size_t max_pages = max_response_pages(server);
struct rpc_task *task;
struct rpc_message msg = {
@@ -6119,17 +6184,23 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
return ERR_PTR(-ENOMEM);
}
lgp->args.layout.pglen = max_pages * PAGE_SIZE;
+ lgp->args.timestamp = jiffies;
lgp->res.layoutp = &lgp->args.layout;
lgp->res.seq_res.sr_slot = NULL;
nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
+
+ /* nfs4_layoutget_release calls pnfs_put_layout_hdr */
+ pnfs_get_layout_hdr(NFS_I(inode)->layout);
+
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return ERR_CAST(task);
status = nfs4_wait_for_completion_rpc_task(task);
if (status == 0)
status = task->tk_status;
- if (status == 0)
+ /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
+ if (status == 0 && lgp->res.layoutp->len)
lseg = pnfs_layout_process(lgp);
rpc_put_task(task);
dprintk("<-- %s status=%d\n", __func__, status);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9448c579d41a..6ace365c6334 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
clp->cl_confirm = clid.confirm;
status = nfs40_walk_client_list(clp, result, cred);
- switch (status) {
- case -NFS4ERR_STALE_CLIENTID:
- set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
- case 0:
+ if (status == 0) {
/* Sustain the lease, even if it's empty. If the clientid4
* goes stale it's of no use for trunking discovery. */
nfs4_schedule_state_renewal(*result);
- break;
}
-
out:
return status;
}
@@ -523,6 +518,8 @@ nfs4_alloc_state_owner(struct nfs_server *server,
nfs4_init_seqid_counter(&sp->so_seqid);
atomic_set(&sp->so_count, 1);
INIT_LIST_HEAD(&sp->so_lru);
+ seqcount_init(&sp->so_reclaim_seqcount);
+ mutex_init(&sp->so_delegreturn_mutex);
return sp;
}
@@ -1395,8 +1392,9 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
* recovering after a network partition or a reboot from a
* server that doesn't support a grace period.
*/
-restart:
spin_lock(&sp->so_lock);
+ write_seqcount_begin(&sp->so_reclaim_seqcount);
+restart:
list_for_each_entry(state, &sp->so_states, open_states) {
if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
continue;
@@ -1417,6 +1415,7 @@ restart:
}
spin_unlock(&state->state_lock);
nfs4_put_open_state(state);
+ spin_lock(&sp->so_lock);
goto restart;
}
}
@@ -1454,12 +1453,17 @@ restart:
goto out_err;
}
nfs4_put_open_state(state);
+ spin_lock(&sp->so_lock);
goto restart;
}
+ write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
return 0;
out_err:
nfs4_put_open_state(state);
+ spin_lock(&sp->so_lock);
+ write_seqcount_end(&sp->so_reclaim_seqcount);
+ spin_unlock(&sp->so_lock);
return status;
}
@@ -1863,6 +1867,7 @@ again:
case -ETIMEDOUT:
case -EAGAIN:
ssleep(1);
+ case -NFS4ERR_STALE_CLIENTID:
dprintk("NFS: %s after status %d, retrying\n",
__func__, status);
goto again;
@@ -2022,8 +2027,18 @@ static int nfs4_reset_session(struct nfs_client *clp)
nfs4_begin_drain_session(clp);
cred = nfs4_get_exchange_id_cred(clp);
status = nfs4_proc_destroy_session(clp->cl_session, cred);
- if (status && status != -NFS4ERR_BADSESSION &&
- status != -NFS4ERR_DEADSESSION) {
+ switch (status) {
+ case 0:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ break;
+ case -NFS4ERR_BACK_CHAN_BUSY:
+ case -NFS4ERR_DELAY:
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ status = 0;
+ ssleep(1);
+ goto out;
+ default:
status = nfs4_recovery_handle_error(clp, status);
goto out;
}
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 84d2e9e2f313..569b166cc050 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -28,7 +28,7 @@ static struct file_system_type nfs4_remote_fs_type = {
.name = "nfs4",
.mount = nfs4_remote_mount,
.kill_sb = nfs_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
};
static struct file_system_type nfs4_remote_referral_fs_type = {
@@ -36,7 +36,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = {
.name = "nfs4",
.mount = nfs4_remote_referral_mount,
.kill_sb = nfs_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
};
struct file_system_type nfs4_referral_fs_type = {
@@ -44,7 +44,7 @@ struct file_system_type nfs4_referral_fs_type = {
.name = "nfs4",
.mount = nfs4_referral_mount,
.kill_sb = nfs_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
};
static const struct super_operations nfs4_sops = {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 26b143920433..e3edda554ac7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1002,7 +1002,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
if (owner_namelen < 0) {
dprintk("nfs: couldn't resolve uid %d to string\n",
- iap->ia_uid);
+ from_kuid(&init_user_ns, iap->ia_uid));
/* XXX */
strcpy(owner_name, "nobody");
owner_namelen = sizeof("nobody") - 1;
@@ -1014,7 +1014,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
if (owner_grouplen < 0) {
dprintk("nfs: couldn't resolve gid %d to string\n",
- iap->ia_gid);
+ from_kgid(&init_user_ns, iap->ia_gid));
strcpy(owner_group, "nobody");
owner_grouplen = sizeof("nobody") - 1;
/* goto out; */
@@ -3778,14 +3778,14 @@ out_overflow:
}
static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
- const struct nfs_server *server, uint32_t *uid,
+ const struct nfs_server *server, kuid_t *uid,
struct nfs4_string *owner_name)
{
uint32_t len;
__be32 *p;
int ret = 0;
- *uid = -2;
+ *uid = make_kuid(&init_user_ns, -2);
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
@@ -3813,7 +3813,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
__func__, len);
bitmap[1] &= ~FATTR4_WORD1_OWNER;
}
- dprintk("%s: uid=%d\n", __func__, (int)*uid);
+ dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid));
return ret;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -3821,14 +3821,14 @@ out_overflow:
}
static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
- const struct nfs_server *server, uint32_t *gid,
+ const struct nfs_server *server, kgid_t *gid,
struct nfs4_string *group_name)
{
uint32_t len;
__be32 *p;
int ret = 0;
- *gid = -2;
+ *gid = make_kgid(&init_user_ns, -2);
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
@@ -3856,7 +3856,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
__func__, len);
bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
}
- dprintk("%s: gid=%d\n", __func__, (int)*gid);
+ dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid));
return ret;
out_overflow:
print_overflow_msg(__func__, xdr);
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index c6f990656f89..88f9611a945c 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -647,6 +647,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
.flags = PNFS_LAYOUTRET_ON_SETATTR |
PNFS_LAYOUTRET_ON_ERROR,
+ .owner = THIS_MODULE,
.alloc_layout_hdr = objlayout_alloc_layout_hdr,
.free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e7165d915362..48ac5aad6258 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -254,7 +254,7 @@ static void
pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
lo->plh_retry_timestamp = jiffies;
- if (test_and_set_bit(fail_bit, &lo->plh_flags))
+ if (!test_and_set_bit(fail_bit, &lo->plh_flags))
atomic_inc(&lo->plh_refcount);
}
@@ -505,37 +505,147 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
-/*
- * Called by the state manger to remove all layouts established under an
- * expired lease.
- */
-void
-pnfs_destroy_all_layouts(struct nfs_client *clp)
+static bool
+pnfs_layout_add_bulk_destroy_list(struct inode *inode,
+ struct list_head *layout_list)
{
- struct nfs_server *server;
struct pnfs_layout_hdr *lo;
- LIST_HEAD(tmp_list);
+ bool ret = false;
- nfs4_deviceid_mark_client_invalid(clp);
- nfs4_deviceid_purge_client(clp);
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
+ pnfs_get_layout_hdr(lo);
+ list_add(&lo->plh_bulk_destroy, layout_list);
+ ret = true;
+ }
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+/* Caller must hold rcu_read_lock and clp->cl_lock */
+static int
+pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
+ struct nfs_server *server,
+ struct list_head *layout_list)
+{
+ struct pnfs_layout_hdr *lo, *next;
+ struct inode *inode;
+
+ list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
+ inode = igrab(lo->plh_inode);
+ if (inode == NULL)
+ continue;
+ list_del_init(&lo->plh_layouts);
+ if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
+ continue;
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ iput(inode);
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static int
+pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
+ bool is_bulk_recall)
+{
+ struct pnfs_layout_hdr *lo;
+ struct inode *inode;
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ LIST_HEAD(lseg_list);
+ int ret = 0;
+
+ while (!list_empty(layout_list)) {
+ lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
+ plh_bulk_destroy);
+ dprintk("%s freeing layout for inode %lu\n", __func__,
+ lo->plh_inode->i_ino);
+ inode = lo->plh_inode;
+ spin_lock(&inode->i_lock);
+ list_del_init(&lo->plh_bulk_destroy);
+ lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
+ if (is_bulk_recall)
+ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
+ ret = -EAGAIN;
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&lseg_list);
+ pnfs_put_layout_hdr(lo);
+ iput(inode);
+ }
+ return ret;
+}
+
+int
+pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+ struct nfs_fsid *fsid,
+ bool is_recall)
+{
+ struct nfs_server *server;
+ LIST_HEAD(layout_list);
spin_lock(&clp->cl_lock);
rcu_read_lock();
+restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- if (!list_empty(&server->layouts))
- list_splice_init(&server->layouts, &tmp_list);
+ if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
+ continue;
+ if (pnfs_layout_bulk_destroy_byserver_locked(clp,
+ server,
+ &layout_list) != 0)
+ goto restart;
}
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
- while (!list_empty(&tmp_list)) {
- lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
- plh_layouts);
- dprintk("%s freeing layout for inode %lu\n", __func__,
- lo->plh_inode->i_ino);
- list_del_init(&lo->plh_layouts);
- pnfs_destroy_layout(NFS_I(lo->plh_inode));
+ if (list_empty(&layout_list))
+ return 0;
+ return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+}
+
+int
+pnfs_destroy_layouts_byclid(struct nfs_client *clp,
+ bool is_recall)
+{
+ struct nfs_server *server;
+ LIST_HEAD(layout_list);
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+restart:
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ if (pnfs_layout_bulk_destroy_byserver_locked(clp,
+ server,
+ &layout_list) != 0)
+ goto restart;
}
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+
+ if (list_empty(&layout_list))
+ return 0;
+ return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+}
+
+/*
+ * Called by the state manger to remove all layouts established under an
+ * expired lease.
+ */
+void
+pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+ nfs4_deviceid_mark_client_invalid(clp);
+ nfs4_deviceid_purge_client(clp);
+
+ pnfs_destroy_layouts_byclid(clp, false);
}
/*
@@ -888,7 +998,7 @@ alloc_init_layout_hdr(struct inode *ino,
atomic_set(&lo->plh_refcount, 1);
INIT_LIST_HEAD(&lo->plh_layouts);
INIT_LIST_HEAD(&lo->plh_segs);
- INIT_LIST_HEAD(&lo->plh_bulk_recall);
+ INIT_LIST_HEAD(&lo->plh_bulk_destroy);
lo->plh_inode = ino;
lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
return lo;
@@ -1071,7 +1181,7 @@ pnfs_update_layout(struct inode *ino,
struct nfs_client *clp = server->nfs_client;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg = NULL;
- bool first = false;
+ bool first;
if (!pnfs_enabled_sb(NFS_SERVER(ino)))
goto out;
@@ -1105,10 +1215,9 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
- if (list_empty(&lo->plh_segs))
- first = true;
-
+ first = list_empty(&lo->plh_layouts) ? true : false;
spin_unlock(&ino->i_lock);
+
if (first) {
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
@@ -1312,13 +1421,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
int pnfs_write_done_resend_to_mds(struct inode *inode,
struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops)
+ const struct nfs_pgio_completion_ops *compl_ops,
+ struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor pgio;
LIST_HEAD(failed);
/* Resend all requests through the MDS */
nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops);
+ pgio.pg_dreq = dreq;
while (!list_empty(head)) {
struct nfs_page *req = nfs_list_entry(head->next);
@@ -1353,7 +1464,8 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
&hdr->pages,
- hdr->completion_ops);
+ hdr->completion_ops,
+ hdr->dreq);
}
/*
@@ -1468,13 +1580,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
int pnfs_read_done_resend_to_mds(struct inode *inode,
struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops)
+ const struct nfs_pgio_completion_ops *compl_ops,
+ struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor pgio;
LIST_HEAD(failed);
/* Resend all requests through the MDS */
nfs_pageio_init_read(&pgio, inode, compl_ops);
+ pgio.pg_dreq = dreq;
while (!list_empty(head)) {
struct nfs_page *req = nfs_list_entry(head->next);
@@ -1505,7 +1619,8 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
&hdr->pages,
- hdr->completion_ops);
+ hdr->completion_ops,
+ hdr->dreq);
}
/*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index dbf7bba52da0..94ba80417748 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -132,7 +132,7 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_hdr {
atomic_t plh_refcount;
struct list_head plh_layouts; /* other client layouts */
- struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
+ struct list_head plh_bulk_destroy;
struct list_head plh_segs; /* layout segments list */
nfs4_stateid plh_stateid;
atomic_t plh_outstanding; /* number of RPCs out */
@@ -196,6 +196,11 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
+int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+ struct nfs_fsid *fsid,
+ bool is_recall);
+int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
+ bool is_recall);
void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new,
@@ -225,9 +230,11 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops);
+ const struct nfs_pgio_completion_ops *compl_ops,
+ struct nfs_direct_req *dreq);
int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops);
+ const struct nfs_pgio_completion_ops *compl_ops,
+ struct nfs_direct_req *dreq);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
/* nfs4_deviceid_flags */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index d35b62e83ea6..6da209bd9408 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -77,9 +77,8 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
long hash)
{
struct nfs4_deviceid_node *d;
- struct hlist_node *n;
- hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+ hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)
if (d->ld == ld && d->nfs_client == clp &&
!memcmp(&d->deviceid, id, sizeof(*id))) {
if (atomic_read(&d->ref))
@@ -248,12 +247,11 @@ static void
_deviceid_purge_client(const struct nfs_client *clp, long hash)
{
struct nfs4_deviceid_node *d;
- struct hlist_node *n;
HLIST_HEAD(tmp);
spin_lock(&nfs4_deviceid_lock);
rcu_read_lock();
- hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+ hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)
if (d->nfs_client == clp && atomic_read(&d->ref)) {
hlist_del_init_rcu(&d->node);
hlist_add_head(&d->tmpnode, &tmp);
@@ -291,12 +289,11 @@ void
nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
{
struct nfs4_deviceid_node *d;
- struct hlist_node *n;
int i;
rcu_read_lock();
for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){
- hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node)
+ hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[i], node)
if (d->nfs_client == clp)
set_bit(NFS_DEVICEID_INVALID, &d->flags);
}
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index f084dac948e1..fc8de9016acf 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -662,7 +662,7 @@ nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
static int
nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index b6bdb18e892c..a5e5d9899d56 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -91,12 +91,16 @@ void nfs_readdata_release(struct nfs_read_data *rdata)
put_nfs_open_context(rdata->args.context);
if (rdata->pages.pagevec != rdata->pages.page_array)
kfree(rdata->pages.pagevec);
- if (rdata != &read_header->rpc_data)
- kfree(rdata);
- else
+ if (rdata == &read_header->rpc_data) {
rdata->header = NULL;
+ rdata = NULL;
+ }
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
+ /* Note: we only free the rpc_task after callbacks are done.
+ * See the comment in rpc_free_task() for why
+ */
+ kfree(rdata);
}
EXPORT_SYMBOL_GPL(nfs_readdata_release);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index aa5315bb3666..17b32b722457 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -31,6 +31,7 @@
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/metrics.h>
#include <linux/sunrpc/xprtsock.h>
@@ -54,7 +55,6 @@
#include <linux/parser.h>
#include <linux/nsproxy.h>
#include <linux/rcupdate.h>
-#include <linux/kthread.h>
#include <asm/uaccess.h>
@@ -292,7 +292,7 @@ struct file_system_type nfs_fs_type = {
.name = "nfs",
.mount = nfs_fs_mount,
.kill_sb = nfs_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
};
EXPORT_SYMBOL_GPL(nfs_fs_type);
@@ -301,7 +301,7 @@ struct file_system_type nfs_xdev_fs_type = {
.name = "nfs",
.mount = nfs_xdev_mount,
.kill_sb = nfs_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
};
const struct super_operations nfs_sops = {
@@ -331,7 +331,7 @@ struct file_system_type nfs4_fs_type = {
.name = "nfs4",
.mount = nfs_fs_mount,
.kill_sb = nfs_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
};
EXPORT_SYMBOL_GPL(nfs4_fs_type);
@@ -418,54 +418,6 @@ void nfs_sb_deactive(struct super_block *sb)
}
EXPORT_SYMBOL_GPL(nfs_sb_deactive);
-static int nfs_deactivate_super_async_work(void *ptr)
-{
- struct super_block *sb = ptr;
-
- deactivate_super(sb);
- module_put_and_exit(0);
- return 0;
-}
-
-/*
- * same effect as deactivate_super, but will do final unmount in kthread
- * context
- */
-static void nfs_deactivate_super_async(struct super_block *sb)
-{
- struct task_struct *task;
- char buf[INET6_ADDRSTRLEN + 1];
- struct nfs_server *server = NFS_SB(sb);
- struct nfs_client *clp = server->nfs_client;
-
- if (!atomic_add_unless(&sb->s_active, -1, 1)) {
- rcu_read_lock();
- snprintf(buf, sizeof(buf),
- rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
- rcu_read_unlock();
-
- __module_get(THIS_MODULE);
- task = kthread_run(nfs_deactivate_super_async_work, sb,
- "%s-deactivate-super", buf);
- if (IS_ERR(task)) {
- pr_err("%s: kthread_run: %ld\n",
- __func__, PTR_ERR(task));
- /* make synchronous call and hope for the best */
- deactivate_super(sb);
- module_put(THIS_MODULE);
- }
- }
-}
-
-void nfs_sb_deactive_async(struct super_block *sb)
-{
- struct nfs_server *server = NFS_SB(sb);
-
- if (atomic_dec_and_test(&server->active))
- nfs_deactivate_super_async(sb);
-}
-EXPORT_SYMBOL_GPL(nfs_sb_deactive_async);
-
/*
* Deliver file system statistics to userspace
*/
@@ -1152,7 +1104,7 @@ static int nfs_get_option_str(substring_t args[], char **option)
{
kfree(*option);
*option = match_strdup(args);
- return !option;
+ return !*option;
}
static int nfs_get_option_ul(substring_t args[], unsigned long *option)
@@ -2375,19 +2327,30 @@ static void nfs_get_cache_cookie(struct super_block *sb,
struct nfs_parsed_mount_data *parsed,
struct nfs_clone_mount *cloned)
{
+ struct nfs_server *nfss = NFS_SB(sb);
char *uniq = NULL;
int ulen = 0;
- if (parsed && parsed->fscache_uniq) {
- uniq = parsed->fscache_uniq;
- ulen = strlen(parsed->fscache_uniq);
+ nfss->fscache_key = NULL;
+ nfss->fscache = NULL;
+
+ if (parsed) {
+ if (!(parsed->options & NFS_OPTION_FSCACHE))
+ return;
+ if (parsed->fscache_uniq) {
+ uniq = parsed->fscache_uniq;
+ ulen = strlen(parsed->fscache_uniq);
+ }
} else if (cloned) {
struct nfs_server *mnt_s = NFS_SB(cloned->sb);
+ if (!(mnt_s->options & NFS_OPTION_FSCACHE))
+ return;
if (mnt_s->fscache_key) {
uniq = mnt_s->fscache_key->key.uniquifier;
ulen = mnt_s->fscache_key->key.uniq_len;
};
- }
+ } else
+ return;
nfs_fscache_get_super_cookie(sb, uniq, ulen);
}
@@ -2578,27 +2541,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
struct nfs_server *server;
struct dentry *mntroot = ERR_PTR(-ENOMEM);
struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod;
- int error;
- dprintk("--> nfs_xdev_mount_common()\n");
+ dprintk("--> nfs_xdev_mount()\n");
mount_info.mntfh = mount_info.cloned->fh;
/* create a new volume representation */
server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
- if (IS_ERR(server)) {
- error = PTR_ERR(server);
- goto out_err;
- }
- mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod);
- dprintk("<-- nfs_xdev_mount_common() = 0\n");
-out:
- return mntroot;
+ if (IS_ERR(server))
+ mntroot = ERR_CAST(server);
+ else
+ mntroot = nfs_fs_mount_common(server, flags,
+ dev_name, &mount_info, nfs_mod);
-out_err:
- dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error);
- goto out;
+ dprintk("<-- nfs_xdev_mount() = %ld\n",
+ IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L);
+ return mntroot;
}
#if IS_ENABLED(CONFIG_NFS_V4)
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 3f79c77153b8..1f1f38f0c5d5 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata)
nfs_dec_sillycount(data->dir);
nfs_free_unlinkdata(data);
- nfs_sb_deactive_async(sb);
+ nfs_sb_deactive(sb);
}
static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
@@ -268,8 +268,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
* point dentry is definitely not a root, so we won't need
* that anymore.
*/
- if (devname_garbage)
- kfree(devname_garbage);
+ kfree(devname_garbage);
return 0;
out_unlock:
spin_unlock(&dentry->d_lock);
@@ -336,20 +335,14 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
struct inode *old_dir = data->old_dir;
struct inode *new_dir = data->new_dir;
struct dentry *old_dentry = data->old_dentry;
- struct dentry *new_dentry = data->new_dentry;
if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
rpc_restart_call_prepare(task);
return;
}
- if (task->tk_status != 0) {
+ if (task->tk_status != 0)
nfs_cancel_async_unlink(old_dentry);
- return;
- }
-
- d_drop(old_dentry);
- d_drop(new_dentry);
}
/**
@@ -550,6 +543,18 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
error = rpc_wait_for_completion_task(task);
if (error == 0)
error = task->tk_status;
+ switch (error) {
+ case 0:
+ /* The rename succeeded */
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ d_move(dentry, sdentry);
+ break;
+ case -ERESTARTSYS:
+ /* The result of the rename is unknown. Play it safe by
+ * forcing a new lookup */
+ d_drop(dentry);
+ d_drop(sdentry);
+ }
rpc_put_task(task);
out_dput:
dput(sdentry);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5209916e1222..c483cc50b82e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -126,12 +126,16 @@ void nfs_writedata_release(struct nfs_write_data *wdata)
put_nfs_open_context(wdata->args.context);
if (wdata->pages.pagevec != wdata->pages.page_array)
kfree(wdata->pages.pagevec);
- if (wdata != &write_header->rpc_data)
- kfree(wdata);
- else
+ if (wdata == &write_header->rpc_data) {
wdata->header = NULL;
+ wdata = NULL;
+ }
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
+ /* Note: we only free the rpc_task after callbacks are done.
+ * See the comment in rpc_free_task() for why
+ */
+ kfree(wdata);
}
EXPORT_SYMBOL_GPL(nfs_writedata_release);
@@ -1794,7 +1798,8 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
if (PagePrivate(page))
return -EBUSY;
- nfs_fscache_release_page(page, GFP_KERNEL);
+ if (!nfs_fscache_release_page(page, GFP_KERNEL))
+ return -EBUSY;
return migrate_page(mapping, newpage, page, mode);
}
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 6940439bd609..ed628f71274c 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -38,8 +38,8 @@ struct nfsacl_encode_desc {
unsigned int count;
struct posix_acl *acl;
int typeflag;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
};
struct nfsacl_simple_acl {
@@ -60,14 +60,16 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
*p++ = htonl(entry->e_tag | nfsacl_desc->typeflag);
switch(entry->e_tag) {
case ACL_USER_OBJ:
- *p++ = htonl(nfsacl_desc->uid);
+ *p++ = htonl(from_kuid(&init_user_ns, nfsacl_desc->uid));
break;
case ACL_GROUP_OBJ:
- *p++ = htonl(nfsacl_desc->gid);
+ *p++ = htonl(from_kgid(&init_user_ns, nfsacl_desc->gid));
break;
case ACL_USER:
+ *p++ = htonl(from_kuid(&init_user_ns, entry->e_uid));
+ break;
case ACL_GROUP:
- *p++ = htonl(entry->e_id);
+ *p++ = htonl(from_kgid(&init_user_ns, entry->e_gid));
break;
default: /* Solaris depends on that! */
*p++ = 0;
@@ -148,6 +150,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
(struct nfsacl_decode_desc *) desc;
__be32 *p = elem;
struct posix_acl_entry *entry;
+ unsigned int id;
if (!nfsacl_desc->acl) {
if (desc->array_len > NFS_ACL_MAX_ENTRIES)
@@ -160,14 +163,22 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT;
- entry->e_id = ntohl(*p++);
+ id = ntohl(*p++);
entry->e_perm = ntohl(*p++);
switch(entry->e_tag) {
- case ACL_USER_OBJ:
case ACL_USER:
- case ACL_GROUP_OBJ:
+ entry->e_uid = make_kuid(&init_user_ns, id);
+ if (!uid_valid(entry->e_uid))
+ return -EINVAL;
+ break;
case ACL_GROUP:
+ entry->e_gid = make_kgid(&init_user_ns, id);
+ if (!gid_valid(entry->e_gid))
+ return -EINVAL;
+ break;
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
case ACL_OTHER:
if (entry->e_perm & ~S_IRWXO)
return -EINVAL;
@@ -190,9 +201,13 @@ cmp_acl_entry(const void *x, const void *y)
if (a->e_tag != b->e_tag)
return a->e_tag - b->e_tag;
- else if (a->e_id > b->e_id)
+ else if ((a->e_tag == ACL_USER) && uid_gt(a->e_uid, b->e_uid))
+ return 1;
+ else if ((a->e_tag == ACL_USER) && uid_lt(a->e_uid, b->e_uid))
+ return -1;
+ else if ((a->e_tag == ACL_GROUP) && gid_gt(a->e_gid, b->e_gid))
return 1;
- else if (a->e_id < b->e_id)
+ else if ((a->e_tag == ACL_GROUP) && gid_lt(a->e_gid, b->e_gid))
return -1;
else
return 0;
@@ -213,22 +228,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry),
cmp_acl_entry, NULL);
- /* Clear undefined identifier fields and find the ACL_GROUP_OBJ
- and ACL_MASK entries. */
+ /* Find the ACL_GROUP_OBJ and ACL_MASK entries. */
FOREACH_ACL_ENTRY(pa, acl, pe) {
switch(pa->e_tag) {
case ACL_USER_OBJ:
- pa->e_id = ACL_UNDEFINED_ID;
break;
case ACL_GROUP_OBJ:
- pa->e_id = ACL_UNDEFINED_ID;
group_obj = pa;
break;
case ACL_MASK:
mask = pa;
/* fall through */
case ACL_OTHER:
- pa->e_id = ACL_UNDEFINED_ID;
break;
}
}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 8df1ea4a6ff9..430b6872806f 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -65,8 +65,8 @@ config NFSD_V3_ACL
If unsure, say N.
config NFSD_V4
- bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
- depends on NFSD && PROC_FS && EXPERIMENTAL
+ bool "NFS server support for NFS version 4"
+ depends on NFSD && PROC_FS
select NFSD_V3
select FS_POSIX_ACL
select SUNRPC_GSS
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index 34e5c40af5ef..8b186a4955cc 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -44,8 +44,6 @@
struct nfs4_acl *nfs4_acl_new(int);
int nfs4_acl_get_whotype(char *, u32);
int nfs4_acl_write_who(int who, char *p);
-int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
- uid_t who, u32 mask);
#define NFS4_ACL_TYPE_DEFAULT 0x01
#define NFS4_ACL_DIR 0x02
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 34a10d78b839..06cddd572264 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -47,9 +47,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
if (!gi)
goto oom;
} else if (flags & NFSEXP_ROOTSQUASH) {
- if (!new->fsuid)
+ if (uid_eq(new->fsuid, GLOBAL_ROOT_UID))
new->fsuid = exp->ex_anon_uid;
- if (!new->fsgid)
+ if (gid_eq(new->fsgid, GLOBAL_ROOT_GID))
new->fsgid = exp->ex_anon_gid;
gi = groups_alloc(rqgi->ngroups);
@@ -58,7 +58,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
for (i = 0; i < rqgi->ngroups; i++) {
if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i)))
- GROUP_AT(gi, i) = make_kgid(&init_user_ns, exp->ex_anon_gid);
+ GROUP_AT(gi, i) = exp->ex_anon_gid;
else
GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
}
@@ -66,9 +66,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
gi = get_group_info(rqgi);
}
- if (new->fsuid == (uid_t) -1)
+ if (uid_eq(new->fsuid, INVALID_UID))
new->fsuid = exp->ex_anon_uid;
- if (new->fsgid == (gid_t) -1)
+ if (gid_eq(new->fsgid, INVALID_GID))
new->fsgid = exp->ex_anon_gid;
ret = set_groups(new, gi);
@@ -76,7 +76,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
if (ret < 0)
goto error;
- if (new->fsuid)
+ if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID))
new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
else
new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h
index 78b3c0e93822..53325a12ba62 100644
--- a/fs/nfsd/auth.h
+++ b/fs/nfsd/auth.h
@@ -1,6 +1,5 @@
/*
* nfsd-specific authentication stuff.
- * uid/gid mapping not yet implemented.
*
* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
*/
@@ -8,11 +7,6 @@
#ifndef LINUX_NFSD_AUTH_H
#define LINUX_NFSD_AUTH_H
-#define nfsd_luid(rq, uid) ((u32)(uid))
-#define nfsd_lgid(rq, gid) ((u32)(gid))
-#define nfsd_ruid(rq, uid) ((u32)(uid))
-#define nfsd_rgid(rq, gid) ((u32)(gid))
-
/*
* Set the current process's fsuid/fsgid etc to those of the NFS
* client user
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 93cc9d34c459..87fd1410b737 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -12,6 +12,10 @@
/*
* Representation of a reply cache entry.
+ *
+ * Note that we use a sockaddr_in6 to hold the address instead of the more
+ * typical sockaddr_storage. This is for space reasons, since sockaddr_storage
+ * is much larger than a sockaddr_in6.
*/
struct svc_cacherep {
struct hlist_node c_hash;
@@ -20,11 +24,13 @@ struct svc_cacherep {
unsigned char c_state, /* unused, inprog, done */
c_type, /* status, buffer */
c_secure : 1; /* req came from port < 1024 */
- struct sockaddr_in c_addr;
+ struct sockaddr_in6 c_addr;
__be32 c_xid;
u32 c_prot;
u32 c_proc;
u32 c_vers;
+ unsigned int c_len;
+ __wsum c_csum;
unsigned long c_timestamp;
union {
struct kvec u_vec;
@@ -46,8 +52,7 @@ enum {
enum {
RC_DROPIT,
RC_REPLY,
- RC_DOIT,
- RC_INTR
+ RC_DOIT
};
/*
@@ -67,6 +72,12 @@ enum {
*/
#define RC_DELAY (HZ/5)
+/* Cache entries expire after this time period */
+#define RC_EXPIRE (120 * HZ)
+
+/* Checksum this amount of the request */
+#define RC_CSUMLEN (256U)
+
int nfsd_reply_cache_init(void);
void nfsd_reply_cache_shutdown(void);
int nfsd_cache_lookup(struct svc_rqst *);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a3946cf13fc8..5f38ea36e266 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -67,11 +67,6 @@ static void expkey_request(struct cache_detail *cd,
(*bpp)[-1] = '\n';
}
-static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
-{
- return sunrpc_cache_pipe_upcall(cd, h, expkey_request);
-}
-
static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new,
struct svc_expkey *old);
static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *);
@@ -245,7 +240,7 @@ static struct cache_detail svc_expkey_cache_template = {
.hash_size = EXPKEY_HASHMAX,
.name = "nfsd.fh",
.cache_put = expkey_put,
- .cache_upcall = expkey_upcall,
+ .cache_request = expkey_request,
.cache_parse = expkey_parse,
.cache_show = expkey_show,
.match = expkey_match,
@@ -315,6 +310,7 @@ static void svc_export_put(struct kref *ref)
path_put(&exp->ex_path);
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
+ kfree(exp->ex_uuid);
kfree(exp);
}
@@ -337,11 +333,6 @@ static void svc_export_request(struct cache_detail *cd,
(*bpp)[-1] = '\n';
}
-static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
-{
- return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
-}
-
static struct svc_export *svc_export_update(struct svc_export *new,
struct svc_export *old);
static struct svc_export *svc_export_lookup(struct svc_export *);
@@ -544,13 +535,17 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
err = get_int(&mesg, &an_int);
if (err)
goto out3;
- exp.ex_anon_uid= an_int;
+ exp.ex_anon_uid= make_kuid(&init_user_ns, an_int);
+ if (!uid_valid(exp.ex_anon_uid))
+ goto out3;
/* anon gid */
err = get_int(&mesg, &an_int);
if (err)
goto out3;
- exp.ex_anon_gid= an_int;
+ exp.ex_anon_gid= make_kgid(&init_user_ns, an_int);
+ if (!gid_valid(exp.ex_anon_gid))
+ goto out3;
/* fsid */
err = get_int(&mesg, &an_int);
@@ -613,7 +608,7 @@ out:
}
static void exp_flags(struct seq_file *m, int flag, int fsid,
- uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs);
+ kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs);
static void show_secinfo(struct seq_file *m, struct svc_export *exp);
static int svc_export_show(struct seq_file *m,
@@ -670,6 +665,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
+ new->ex_uuid = NULL;
new->cd = item->cd;
}
@@ -711,7 +707,7 @@ static struct cache_detail svc_export_cache_template = {
.hash_size = EXPORT_HASHMAX,
.name = "nfsd.export",
.cache_put = svc_export_put,
- .cache_upcall = svc_export_upcall,
+ .cache_request = svc_export_request,
.cache_parse = svc_export_parse,
.cache_show = svc_export_show,
.match = svc_export_match,
@@ -1179,15 +1175,17 @@ static void show_secinfo(struct seq_file *m, struct svc_export *exp)
}
static void exp_flags(struct seq_file *m, int flag, int fsid,
- uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
+ kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc)
{
show_expflags(m, flag, NFSEXP_ALLFLAGS);
if (flag & NFSEXP_FSID)
seq_printf(m, ",fsid=%d", fsid);
- if (anonu != (uid_t)-2 && anonu != (0x10000-2))
- seq_printf(m, ",anonuid=%u", anonu);
- if (anong != (gid_t)-2 && anong != (0x10000-2))
- seq_printf(m, ",anongid=%u", anong);
+ if (!uid_eq(anonu, make_kuid(&init_user_ns, (uid_t)-2)) &&
+ !uid_eq(anonu, make_kuid(&init_user_ns, 0x10000-2)))
+ seq_printf(m, ",anonuid=%u", from_kuid(&init_user_ns, anonu));
+ if (!gid_eq(anong, make_kgid(&init_user_ns, (gid_t)-2)) &&
+ !gid_eq(anong, make_kgid(&init_user_ns, 0x10000-2)))
+ seq_printf(m, ",anongid=%u", from_kgid(&init_user_ns, anong));
if (fsloc && fsloc->locations_count > 0) {
char *loctype = (fsloc->migrated) ? "refer" : "replicas";
int i;
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index e6c38159622f..d620e7f81429 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -8,61 +8,144 @@
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/sunrpc/addr.h>
+#include <asm/uaccess.h>
#include "state.h"
-#include "fault_inject.h"
+#include "netns.h"
struct nfsd_fault_inject_op {
char *file;
- void (*func)(u64);
+ u64 (*forget)(struct nfs4_client *, u64);
+ u64 (*print)(struct nfs4_client *, u64);
};
static struct nfsd_fault_inject_op inject_ops[] = {
{
.file = "forget_clients",
- .func = nfsd_forget_clients,
+ .forget = nfsd_forget_client,
+ .print = nfsd_print_client,
},
{
.file = "forget_locks",
- .func = nfsd_forget_locks,
+ .forget = nfsd_forget_client_locks,
+ .print = nfsd_print_client_locks,
},
{
.file = "forget_openowners",
- .func = nfsd_forget_openowners,
+ .forget = nfsd_forget_client_openowners,
+ .print = nfsd_print_client_openowners,
},
{
.file = "forget_delegations",
- .func = nfsd_forget_delegations,
+ .forget = nfsd_forget_client_delegations,
+ .print = nfsd_print_client_delegations,
},
{
.file = "recall_delegations",
- .func = nfsd_recall_delegations,
+ .forget = nfsd_recall_client_delegations,
+ .print = nfsd_print_client_delegations,
},
};
static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
static struct dentry *debug_dir;
-static int nfsd_inject_set(void *op_ptr, u64 val)
+static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val)
{
- struct nfsd_fault_inject_op *op = op_ptr;
+ u64 count = 0;
if (val == 0)
printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
else
printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
- op->func(val);
- return 0;
+ nfs4_lock_state();
+ count = nfsd_for_n_state(val, op->forget);
+ nfs4_unlock_state();
+ printk(KERN_INFO "NFSD: %s: found %llu", op->file, count);
}
-static int nfsd_inject_get(void *data, u64 *val)
+static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op,
+ struct sockaddr_storage *addr,
+ size_t addr_size)
{
- *val = 0;
- return 0;
+ char buf[INET6_ADDRSTRLEN];
+ struct nfs4_client *clp;
+ u64 count;
+
+ nfs4_lock_state();
+ clp = nfsd_find_client(addr, addr_size);
+ if (clp) {
+ count = op->forget(clp, 0);
+ rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+ printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count);
+ }
+ nfs4_unlock_state();
+}
+
+static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val)
+{
+ nfs4_lock_state();
+ *val = nfsd_for_n_state(0, op->print);
+ nfs4_unlock_state();
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n");
+/*
+ * Read handler for the fault-injection files.
+ *
+ * A read starting at offset zero calls nfsd_inject_get() to refresh the
+ * cached value; a read at any other offset re-formats the value cached by
+ * the previous offset-zero read, so one value can be returned across
+ * several short reads.  The value is rendered as "%llu\n" and handed to
+ * simple_read_from_buffer(), which performs the offset/length clamping,
+ * the copy to user space and the *ppos update that used to be open-coded
+ * here.
+ *
+ * Returns the number of bytes copied, 0 at end of data, -EINVAL for a
+ * negative offset, or -EFAULT if nothing could be copied to user space.
+ *
+ * NOTE(review): the cache is a function-level static, so it is shared by
+ * every file and every reader that goes through this handler; concurrent
+ * readers can observe each other's value.
+ */
+static ssize_t fault_inject_read(struct file *file, char __user *buf,
+				 size_t len, loff_t *ppos)
+{
+	static u64 val;
+	char read_buf[25];
+	size_t size;
+
+	if (!*ppos)
+		nfsd_inject_get(file_inode(file)->i_private, &val);
+	size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
+
+	return simple_read_from_buffer(buf, len, ppos, read_buf, size);
+}
+
+/*
+ * Write handler for the fault-injection files.
+ *
+ * At most sizeof(write_buf) - 1 bytes of the user's input are copied and
+ * NUL-terminated.  If rpc_pton() returns a non-zero length for that text,
+ * the resulting socket address is passed to nfsd_inject_set_client();
+ * otherwise the text is converted with simple_strtoll() and passed to
+ * nfsd_inject_set().
+ *
+ * Returns -EFAULT if the copy from user space fails, otherwise len.
+ */
+static ssize_t fault_inject_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ char write_buf[INET6_ADDRSTRLEN];
+ size_t size = min(sizeof(write_buf) - 1, len);
+ struct net *net = current->nsproxy->net_ns;
+ struct sockaddr_storage sa;
+ u64 val;
+
+ if (copy_from_user(write_buf, buf, size))
+ return -EFAULT;
+ write_buf[size] = '\0';
+
+ /* 'size' is reused for the rpc_pton() result; > 0 selects the address path */
+ size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
+ if (size > 0)
+ nfsd_inject_set_client(file_inode(file)->i_private, &sa, size);
+ else {
+ /*
+ * NOTE(review): no end pointer is passed, so malformed numeric
+ * input is not detected here; nfsd_inject_set() logs a value of
+ * zero as "(all)".
+ */
+ val = simple_strtoll(write_buf, NULL, 0);
+ nfsd_inject_set(file_inode(file)->i_private, val);
+ }
+ return len; /* on success, claim we got the whole input */
+}
+
+/* File operations for the fault-injection files: only read and write are set. */
+static const struct file_operations fops_nfsd = {
+ .owner = THIS_MODULE,
+ .read = fault_inject_read,
+ .write = fault_inject_write,
+};
void nfsd_fault_inject_cleanup(void)
{
diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h
deleted file mode 100644
index 90bd0570956c..000000000000
--- a/fs/nfsd/fault_inject.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
- *
- * Function definitions for fault injection
- */
-
-#ifndef LINUX_NFSD_FAULT_INJECT_H
-#define LINUX_NFSD_FAULT_INJECT_H
-
-#ifdef CONFIG_NFSD_FAULT_INJECTION
-int nfsd_fault_inject_init(void);
-void nfsd_fault_inject_cleanup(void);
-void nfsd_forget_clients(u64);
-void nfsd_forget_locks(u64);
-void nfsd_forget_openowners(u64);
-void nfsd_forget_delegations(u64);
-void nfsd_recall_delegations(u64);
-#else /* CONFIG_NFSD_FAULT_INJECTION */
-static inline int nfsd_fault_inject_init(void) { return 0; }
-static inline void nfsd_fault_inject_cleanup(void) {}
-static inline void nfsd_forget_clients(u64 num) {}
-static inline void nfsd_forget_locks(u64 num) {}
-static inline void nfsd_forget_openowners(u64 num) {}
-static inline void nfsd_forget_delegations(u64 num) {}
-static inline void nfsd_recall_delegations(u64 num) {}
-#endif /* CONFIG_NFSD_FAULT_INJECTION */
-
-#endif /* LINUX_NFSD_FAULT_INJECT_H */
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
index 9d513efc01ba..bf95f6b817a4 100644
--- a/fs/nfsd/idmap.h
+++ b/fs/nfsd/idmap.h
@@ -54,9 +54,9 @@ static inline void nfsd_idmap_shutdown(struct net *net)
}
#endif
-__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
-__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
-int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
-int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
+__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *);
+__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *);
+int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *);
+int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *);
#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 65c2431ea32f..1051bebff1b0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -24,7 +24,18 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+/* Hash tables for nfs4_clientid state */
+#define CLIENT_HASH_BITS 4
+#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
+#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
+
+#define LOCKOWNER_INO_HASH_BITS 8
+#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
+
+#define SESSION_HASH_SIZE 512
+
struct cld_net;
+struct nfsd4_client_tracking_ops;
struct nfsd_net {
struct cld_net *cld_net;
@@ -38,7 +49,62 @@ struct nfsd_net {
struct lock_manager nfsd4_manager;
bool grace_ended;
time_t boot_time;
+
+ /*
+ * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
+ * used in reboot/reset lease grace period processing
+ *
+ * conf_id_hashtbl[], and conf_name_tree hold confirmed
+ * setclientid_confirmed info.
+ *
+ * unconf_id_hashtbl[] and unconf_name_tree hold unconfirmed
+ * setclientid info.
+ */
+ struct list_head *reclaim_str_hashtbl;
+ int reclaim_str_hashtbl_size;
+ struct list_head *conf_id_hashtbl;
+ struct rb_root conf_name_tree;
+ struct list_head *unconf_id_hashtbl;
+ struct rb_root unconf_name_tree;
+ struct list_head *ownerstr_hashtbl;
+ struct list_head *lockowner_ino_hashtbl;
+ struct list_head *sessionid_hashtbl;
+ /*
+ * client_lru holds client queue ordered by nfs4_client.cl_time
+ * for lease renewal.
+ *
+ * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
+ * for last close replay.
+ *
+ * All of the above fields are protected by the client_mutex.
+ */
+ struct list_head client_lru;
+ struct list_head close_lru;
+
+ struct delayed_work laundromat_work;
+
+ /* client_lock protects the client lru list and session hash table */
+ spinlock_t client_lock;
+
+ struct file *rec_file;
+ bool in_grace;
+ struct nfsd4_client_tracking_ops *client_tracking_ops;
+
+ time_t nfsd4_lease;
+ time_t nfsd4_grace;
+
+ bool nfsd_net_up;
+
+ /*
+ * Time of server startup
+ */
+ struct timeval nfssvc_boot;
+
+ struct svc_serv *nfsd_serv;
};
+/* Simple check to find out if a given net was properly initialized */
+#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
+
extern int nfsd_net_id;
#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b314888825d5..95d76dc6c5da 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -45,6 +45,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
RETURN_STATUS(nfserr_inval);
resp->mask = argp->mask;
+ nfserr = fh_getattr(fh, &resp->stat);
+ if (nfserr)
+ goto fail;
+
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);
if (IS_ERR(acl)) {
@@ -115,6 +119,9 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
nfserr = nfserrno( nfsd_set_posix_acl(
fh, ACL_TYPE_DEFAULT, argp->acl_default) );
}
+ if (!nfserr) {
+ nfserr = fh_getattr(fh, &resp->stat);
+ }
/* argp->acl_{access,default} may have been allocated in
nfssvc_decode_setaclargs. */
@@ -129,10 +136,15 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,
struct nfsd_fhandle *argp, struct nfsd_attrstat *resp)
{
+ __be32 nfserr;
dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ if (nfserr)
+ return nfserr;
+ nfserr = fh_getattr(&resp->fh, &resp->stat);
+ return nfserr;
}
/*
@@ -150,6 +162,9 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessarg
fh_copy(&resp->fh, &argp->fh);
resp->access = argp->access;
nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+ if (nfserr)
+ return nfserr;
+ nfserr = fh_getattr(&resp->fh, &resp->stat);
return nfserr;
}
@@ -243,7 +258,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
return 0;
inode = dentry->d_inode;
- p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
+ p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
*p++ = htonl(resp->mask);
if (!xdr_ressize_check(rqstp, p))
return 0;
@@ -253,7 +268,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
(resp->mask & NFS_ACL) ? resp->acl_access : NULL,
(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
while (w > 0) {
- if (!rqstp->rq_respages[rqstp->rq_resused++])
+ if (!*(rqstp->rq_next_page++))
return 0;
w -= PAGE_SIZE;
}
@@ -274,7 +289,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_attrstat *resp)
{
- p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
+ p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
return xdr_ressize_check(rqstp, p);
}
@@ -282,7 +297,7 @@ static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_accessres *resp)
{
- p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
+ p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
*p++ = htonl(resp->access);
return xdr_ressize_check(rqstp, p);
}
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index a596e9d987e4..9cbc1a841f87 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -184,7 +184,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
(resp->mask & NFS_ACL) ? resp->acl_access : NULL,
(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
while (w > 0) {
- if (!rqstp->rq_respages[rqstp->rq_resused++])
+ if (!*(rqstp->rq_next_page++))
return 0;
w -= PAGE_SIZE;
}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 97d90d1c8608..401289913130 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -43,7 +43,6 @@ static __be32
nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
struct nfsd3_attrstat *resp)
{
- int err;
__be32 nfserr;
dprintk("nfsd: GETATTR(3) %s\n",
@@ -55,9 +54,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
if (nfserr)
RETURN_STATUS(nfserr);
- err = vfs_getattr(resp->fh.fh_export->ex_path.mnt,
- resp->fh.fh_dentry, &resp->stat);
- nfserr = nfserrno(err);
+ nfserr = fh_getattr(&resp->fh, &resp->stat);
RETURN_STATUS(nfserr);
}
@@ -460,7 +457,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
__be32 nfserr;
int count = 0;
loff_t offset;
- int i;
+ struct page **p;
caddr_t page_addr = NULL;
dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
@@ -484,8 +481,8 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
&resp->common,
nfs3svc_encode_entry_plus);
memcpy(resp->verf, argp->verf, 8);
- for (i=1; i<rqstp->rq_resused ; i++) {
- page_addr = page_address(rqstp->rq_respages[i]);
+ for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
+ page_addr = page_address(*p);
if (((caddr_t)resp->buffer >= page_addr) &&
((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 43f46cd9edea..14d9ecb96cff 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -7,8 +7,11 @@
*/
#include <linux/namei.h>
+#include <linux/sunrpc/svc_xprt.h>
#include "xdr3.h"
#include "auth.h"
+#include "netns.h"
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_XDR
@@ -103,12 +106,14 @@ decode_sattr3(__be32 *p, struct iattr *iap)
iap->ia_mode = ntohl(*p++);
}
if (*p++) {
- iap->ia_valid |= ATTR_UID;
- iap->ia_uid = ntohl(*p++);
+ iap->ia_uid = make_kuid(&init_user_ns, ntohl(*p++));
+ if (uid_valid(iap->ia_uid))
+ iap->ia_valid |= ATTR_UID;
}
if (*p++) {
- iap->ia_valid |= ATTR_GID;
- iap->ia_gid = ntohl(*p++);
+ iap->ia_gid = make_kgid(&init_user_ns, ntohl(*p++));
+ if (gid_valid(iap->ia_gid))
+ iap->ia_valid |= ATTR_GID;
}
if (*p++) {
u64 newsize;
@@ -165,8 +170,8 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
*p++ = htonl((u32) stat->mode);
*p++ = htonl((u32) stat->nlink);
- *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
- *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
+ *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
+ *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
} else {
@@ -202,10 +207,10 @@ encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
{
struct dentry *dentry = fhp->fh_dentry;
if (dentry && dentry->d_inode) {
- int err;
+ __be32 err;
struct kstat stat;
- err = vfs_getattr(fhp->fh_export->ex_path.mnt, dentry, &stat);
+ err = fh_getattr(fhp, &stat);
if (!err) {
*p++ = xdr_one; /* attributes follow */
lease_get_mtime(dentry->d_inode, &stat.mtime);
@@ -252,13 +257,12 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
*/
void fill_post_wcc(struct svc_fh *fhp)
{
- int err;
+ __be32 err;
if (fhp->fh_post_saved)
printk("nfsd: inode locked twice during operation.\n");
- err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
- &fhp->fh_post_attr);
+ err = fh_getattr(fhp, &fhp->fh_post_attr);
fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
if (err) {
fhp->fh_post_saved = 0;
@@ -323,7 +327,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readargs *args)
{
unsigned int len;
- int v,pn;
+ int v;
u32 max_blocksize = svc_max_payload(rqstp);
if (!(p = decode_fh(p, &args->fh)))
@@ -338,8 +342,9 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
/* set up the kvec */
v=0;
while (len > 0) {
- pn = rqstp->rq_resused++;
- rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+ struct page *p = *(rqstp->rq_next_page++);
+
+ rqstp->rq_vec[v].iov_base = page_address(p);
rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
len -= rqstp->rq_vec[v].iov_len;
v++;
@@ -461,8 +466,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
len = ntohl(*p++);
if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
return 0;
- args->tname = new =
- page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+ args->tname = new = page_address(*(rqstp->rq_next_page++));
args->tlen = len;
/* first copy and check from the first page */
old = (char*)p;
@@ -533,8 +537,7 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
{
if (!(p = decode_fh(p, &args->fh)))
return 0;
- args->buffer =
- page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+ args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
}
@@ -565,8 +568,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
if (args->count > PAGE_SIZE)
args->count = PAGE_SIZE;
- args->buffer =
- page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+ args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
}
@@ -575,7 +577,7 @@ int
nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readdirargs *args)
{
- int len, pn;
+ int len;
u32 max_blocksize = svc_max_payload(rqstp);
if (!(p = decode_fh(p, &args->fh)))
@@ -590,9 +592,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
args->count = len;
while (len > 0) {
- pn = rqstp->rq_resused++;
+ struct page *p = *(rqstp->rq_next_page++);
if (!args->buffer)
- args->buffer = page_address(rqstp->rq_respages[pn]);
+ args->buffer = page_address(p);
len -= PAGE_SIZE;
}
@@ -720,12 +722,14 @@ int
nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_writeres *resp)
{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
p = encode_wcc_data(rqstp, p, &resp->fh);
if (resp->status == 0) {
*p++ = htonl(resp->count);
*p++ = htonl(resp->committed);
- *p++ = htonl(nfssvc_boot.tv_sec);
- *p++ = htonl(nfssvc_boot.tv_usec);
+ *p++ = htonl(nn->nfssvc_boot.tv_sec);
+ *p++ = htonl(nn->nfssvc_boot.tv_usec);
}
return xdr_ressize_check(rqstp, p);
}
@@ -876,7 +880,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
common);
__be32 *p = cd->buffer;
caddr_t curr_page_addr = NULL;
- int pn; /* current page number */
+ struct page ** page;
int slen; /* string (name) length */
int elen; /* estimated entry length in words */
int num_entry_words = 0; /* actual number of words */
@@ -913,8 +917,9 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
}
/* determine which page in rq_respages[] we are currently filling */
- for (pn=1; pn < cd->rqstp->rq_resused; pn++) {
- curr_page_addr = page_address(cd->rqstp->rq_respages[pn]);
+ for (page = cd->rqstp->rq_respages + 1;
+ page < cd->rqstp->rq_next_page; page++) {
+ curr_page_addr = page_address(*page);
if (((caddr_t)cd->buffer >= curr_page_addr) &&
((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE))
@@ -929,14 +934,14 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
if (plus)
p = encode_entryplus_baggage(cd, p, name, namlen);
num_entry_words = p - cd->buffer;
- } else if (cd->rqstp->rq_respages[pn+1] != NULL) {
+ } else if (*(page+1) != NULL) {
/* temporarily encode entry into next page, then move back to
* current and next page in rq_respages[] */
__be32 *p1, *tmp;
int len1, len2;
/* grab next page for temporary storage of entry */
- p1 = tmp = page_address(cd->rqstp->rq_respages[pn+1]);
+ p1 = tmp = page_address(*(page+1));
p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
@@ -1082,11 +1087,13 @@ int
nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_commitres *resp)
{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
p = encode_wcc_data(rqstp, p, &resp->fh);
/* Write verifier */
if (resp->status == 0) {
- *p++ = htonl(nfssvc_boot.tv_sec);
- *p++ = htonl(nfssvc_boot.tv_usec);
+ *p++ = htonl(nn->nfssvc_boot.tv_sec);
+ *p++ = htonl(nn->nfssvc_boot.tv_usec);
}
return xdr_ressize_check(rqstp, p);
}
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 9c51aff02ae2..8a50b3c18093 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -264,7 +264,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
ace->flag = eflag;
ace->access_mask = deny_mask_from_posix(deny, flags);
ace->whotype = NFS4_ACL_WHO_NAMED;
- ace->who = pa->e_id;
+ ace->who_uid = pa->e_uid;
ace++;
acl->naces++;
}
@@ -273,7 +273,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,
flags);
ace->whotype = NFS4_ACL_WHO_NAMED;
- ace->who = pa->e_id;
+ ace->who_uid = pa->e_uid;
ace++;
acl->naces++;
pa++;
@@ -300,7 +300,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,
flags);
ace->whotype = NFS4_ACL_WHO_NAMED;
- ace->who = pa->e_id;
+ ace->who_gid = pa->e_gid;
ace++;
acl->naces++;
pa++;
@@ -329,7 +329,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
ace->access_mask = deny_mask_from_posix(deny, flags);
ace->whotype = NFS4_ACL_WHO_NAMED;
- ace->who = pa->e_id;
+ ace->who_gid = pa->e_gid;
ace++;
acl->naces++;
}
@@ -345,6 +345,18 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
acl->naces++;
}
+static bool
+pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2)
+{
+ if (pace1->e_tag != pace2->e_tag)
+ return pace1->e_tag > pace2->e_tag;
+ if (pace1->e_tag == ACL_USER)
+ return uid_gt(pace1->e_uid, pace2->e_uid);
+ if (pace1->e_tag == ACL_GROUP)
+ return gid_gt(pace1->e_gid, pace2->e_gid);
+ return false;
+}
+
static void
sort_pacl_range(struct posix_acl *pacl, int start, int end) {
int sorted = 0, i;
@@ -355,8 +367,8 @@ sort_pacl_range(struct posix_acl *pacl, int start, int end) {
while (!sorted) {
sorted = 1;
for (i = start; i < end; i++) {
- if (pacl->a_entries[i].e_id
- > pacl->a_entries[i+1].e_id) {
+ if (pace_gt(&pacl->a_entries[i],
+ &pacl->a_entries[i+1])) {
sorted = 0;
tmp = pacl->a_entries[i];
pacl->a_entries[i] = pacl->a_entries[i+1];
@@ -398,7 +410,10 @@ struct posix_ace_state {
};
struct posix_user_ace_state {
- uid_t uid;
+ union {
+ kuid_t uid;
+ kgid_t gid;
+ };
struct posix_ace_state perms;
};
@@ -521,7 +536,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
if (error)
goto out_err;
low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
- pace->e_id = ACL_UNDEFINED_ID;
for (i=0; i < state->users->n; i++) {
pace++;
@@ -531,7 +545,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
goto out_err;
low_mode_from_nfs4(state->users->aces[i].perms.allow,
&pace->e_perm, flags);
- pace->e_id = state->users->aces[i].uid;
+ pace->e_uid = state->users->aces[i].uid;
add_to_mask(state, &state->users->aces[i].perms);
}
@@ -541,7 +555,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
if (error)
goto out_err;
low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
- pace->e_id = ACL_UNDEFINED_ID;
add_to_mask(state, &state->group);
for (i=0; i < state->groups->n; i++) {
@@ -552,14 +565,13 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
goto out_err;
low_mode_from_nfs4(state->groups->aces[i].perms.allow,
&pace->e_perm, flags);
- pace->e_id = state->groups->aces[i].uid;
+ pace->e_gid = state->groups->aces[i].gid;
add_to_mask(state, &state->groups->aces[i].perms);
}
pace++;
pace->e_tag = ACL_MASK;
low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
- pace->e_id = ACL_UNDEFINED_ID;
pace++;
pace->e_tag = ACL_OTHER;
@@ -567,7 +579,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
if (error)
goto out_err;
low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
- pace->e_id = ACL_UNDEFINED_ID;
return pacl;
out_err:
@@ -587,12 +598,13 @@ static inline void deny_bits(struct posix_ace_state *astate, u32 mask)
astate->deny |= mask & ~astate->allow;
}
-static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid)
+static int find_uid(struct posix_acl_state *state, kuid_t uid)
{
+ struct posix_ace_state_array *a = state->users;
int i;
for (i = 0; i < a->n; i++)
- if (a->aces[i].uid == uid)
+ if (uid_eq(a->aces[i].uid, uid))
return i;
/* Not found: */
a->n++;
@@ -603,6 +615,23 @@ static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array
return i;
}
+static int find_gid(struct posix_acl_state *state, kgid_t gid)
+{
+ struct posix_ace_state_array *a = state->groups;
+ int i;
+
+ for (i = 0; i < a->n; i++)
+ if (gid_eq(a->aces[i].gid, gid))
+ return i;
+ /* Not found: */
+ a->n++;
+ a->aces[i].gid = gid;
+ a->aces[i].perms.allow = state->everyone.allow;
+ a->aces[i].perms.deny = state->everyone.deny;
+
+ return i;
+}
+
static void deny_bits_array(struct posix_ace_state_array *a, u32 mask)
{
int i;
@@ -636,7 +665,7 @@ static void process_one_v4_ace(struct posix_acl_state *state,
}
break;
case ACL_USER:
- i = find_uid(state, state->users, ace->who);
+ i = find_uid(state, ace->who_uid);
if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
allow_bits(&state->users->aces[i].perms, mask);
} else {
@@ -658,7 +687,7 @@ static void process_one_v4_ace(struct posix_acl_state *state,
}
break;
case ACL_GROUP:
- i = find_uid(state, state->groups, ace->who);
+ i = find_gid(state, ace->who_gid);
if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
allow_bits(&state->groups->aces[i].perms, mask);
} else {
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index bdf29c96e4cd..99bc85ff0217 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include "nfsd.h"
#include "state.h"
+#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -625,20 +626,46 @@ static const struct rpc_program cb_program = {
.pipe_dir_name = "nfsd4_cb",
};
-static int max_cb_time(void)
+static int max_cb_time(struct net *net)
{
- return max(nfsd4_lease/10, (time_t)1) * HZ;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ return max(nn->nfsd4_lease/10, (time_t)1) * HZ;
}
+static struct rpc_cred *callback_cred;
+
+int set_callback_cred(void)
+{
+ if (callback_cred)
+ return 0;
+ callback_cred = rpc_lookup_machine_cred("nfs");
+ if (!callback_cred)
+ return -ENOMEM;
+ return 0;
+}
+
+static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
+{
+ if (clp->cl_minorversion == 0) {
+ return get_rpccred(callback_cred);
+ } else {
+ struct rpc_auth *auth = client->cl_auth;
+ struct auth_cred acred = {};
+
+ acred.uid = ses->se_cb_sec.uid;
+ acred.gid = ses->se_cb_sec.gid;
+ return auth->au_ops->lookup_cred(client->cl_auth, &acred, 0);
+ }
+}
static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
{
struct rpc_timeout timeparms = {
- .to_initval = max_cb_time(),
+ .to_initval = max_cb_time(clp->net),
.to_retries = 0,
};
struct rpc_create_args args = {
- .net = &init_net,
+ .net = clp->net,
.address = (struct sockaddr *) &conn->cb_addr,
.addrsize = conn->cb_addrlen,
.saddress = (struct sockaddr *) &conn->cb_saddr,
@@ -648,6 +675,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
};
struct rpc_clnt *client;
+ struct rpc_cred *cred;
if (clp->cl_minorversion == 0) {
if (!clp->cl_cred.cr_principal &&
@@ -666,7 +694,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
args.bc_xprt = conn->cb_xprt;
args.prognumber = clp->cl_cb_session->se_cb_prog;
args.protocol = XPRT_TRANSPORT_BC_TCP;
- args.authflavor = RPC_AUTH_UNIX;
+ args.authflavor = ses->se_cb_sec.flavor;
}
/* Create RPC client */
client = rpc_create(&args);
@@ -675,9 +703,14 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
PTR_ERR(client));
return PTR_ERR(client);
}
+ cred = get_backchannel_cred(clp, client, ses);
+ if (IS_ERR(cred)) {
+ rpc_shutdown_client(client);
+ return PTR_ERR(cred);
+ }
clp->cl_cb_client = client;
+ clp->cl_cb_cred = cred;
return 0;
-
}
static void warn_no_callback_path(struct nfs4_client *clp, int reason)
@@ -714,18 +747,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
.rpc_call_done = nfsd4_cb_probe_done,
};
-static struct rpc_cred *callback_cred;
-
-int set_callback_cred(void)
-{
- if (callback_cred)
- return 0;
- callback_cred = rpc_lookup_machine_cred("nfs");
- if (!callback_cred)
- return -ENOMEM;
- return 0;
-}
-
static struct workqueue_struct *callback_wq;
static void run_nfsd4_cb(struct nfsd4_callback *cb)
@@ -743,7 +764,6 @@ static void do_probe_callback(struct nfs4_client *clp)
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
cb->cb_msg.rpc_argp = NULL;
cb->cb_msg.rpc_resp = NULL;
- cb->cb_msg.rpc_cred = callback_cred;
cb->cb_ops = &nfsd4_cb_probe_ops;
@@ -962,6 +982,8 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
if (clp->cl_cb_client) {
rpc_shutdown_client(clp->cl_cb_client);
clp->cl_cb_client = NULL;
+ put_rpccred(clp->cl_cb_cred);
+ clp->cl_cb_cred = NULL;
}
if (clp->cl_cb_conn.cb_xprt) {
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -995,7 +1017,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
run_nfsd4_cb(cb);
}
-void nfsd4_do_callback_rpc(struct work_struct *w)
+static void nfsd4_do_callback_rpc(struct work_struct *w)
{
struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
struct nfs4_client *clp = cb->cb_clp;
@@ -1010,10 +1032,16 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
nfsd4_release_cb(cb);
return;
}
+ cb->cb_msg.rpc_cred = clp->cl_cb_cred;
rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
cb->cb_ops, cb);
}
+void nfsd4_init_callback(struct nfsd4_callback *cb)
+{
+ INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
+}
+
void nfsd4_cb_recall(struct nfs4_delegation *dp)
{
struct nfsd4_callback *cb = &dp->dl_recall;
@@ -1025,7 +1053,6 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
cb->cb_msg.rpc_argp = cb;
cb->cb_msg.rpc_resp = cb;
- cb->cb_msg.rpc_cred = callback_cred;
cb->cb_ops = &nfsd4_cb_recall_ops;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a1f10c0a6255..4832fd819f88 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -65,7 +65,7 @@ MODULE_PARM_DESC(nfs4_disable_idmapping,
struct ent {
struct cache_head h;
int type; /* User / Group */
- uid_t id;
+ u32 id;
char name[IDMAP_NAMESZ];
char authname[IDMAP_NAMESZ];
};
@@ -140,12 +140,6 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
}
static int
-idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
-{
- return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request);
-}
-
-static int
idtoname_match(struct cache_head *ca, struct cache_head *cb)
{
struct ent *a = container_of(ca, struct ent, h);
@@ -192,7 +186,7 @@ static struct cache_detail idtoname_cache_template = {
.hash_size = ENT_HASHMAX,
.name = "nfs4.idtoname",
.cache_put = ent_put,
- .cache_upcall = idtoname_upcall,
+ .cache_request = idtoname_request,
.cache_parse = idtoname_parse,
.cache_show = idtoname_show,
.warn_no_listener = warn_no_idmapd,
@@ -321,12 +315,6 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
}
static int
-nametoid_upcall(struct cache_detail *cd, struct cache_head *ch)
-{
- return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request);
-}
-
-static int
nametoid_match(struct cache_head *ca, struct cache_head *cb)
{
struct ent *a = container_of(ca, struct ent, h);
@@ -365,7 +353,7 @@ static struct cache_detail nametoid_cache_template = {
.hash_size = ENT_HASHMAX,
.name = "nfs4.nametoid",
.cache_put = ent_put,
- .cache_upcall = nametoid_upcall,
+ .cache_request = nametoid_request,
.cache_parse = nametoid_parse,
.cache_show = nametoid_show,
.warn_no_listener = warn_no_idmapd,
@@ -540,7 +528,7 @@ rqst_authname(struct svc_rqst *rqstp)
static __be32
idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
- uid_t *id)
+ u32 *id)
{
struct ent *item, key = {
.type = type,
@@ -564,7 +552,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
}
static int
-idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
+idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)
{
struct ent *item, key = {
.id = id,
@@ -587,7 +575,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
}
static bool
-numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id)
+numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id)
{
int ret;
char buf[11];
@@ -603,7 +591,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel
}
static __be32
-do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id)
+do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id)
{
if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
if (numeric_name_to_id(rqstp, type, name, namelen, id))
@@ -616,7 +604,7 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u
}
static int
-do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
+do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)
{
if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
return sprintf(name, "%u", id);
@@ -625,26 +613,40 @@ do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
__be32
nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
- __u32 *id)
+ kuid_t *uid)
{
- return do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
+ __be32 status;
+ u32 id = -1;
+ status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id);
+ *uid = make_kuid(&init_user_ns, id);
+ if (!uid_valid(*uid))
+ status = nfserr_badowner;
+ return status;
}
__be32
nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
- __u32 *id)
+ kgid_t *gid)
{
- return do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id);
+ __be32 status;
+ u32 id = -1;
+ status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id);
+ *gid = make_kgid(&init_user_ns, id);
+ if (!gid_valid(*gid))
+ status = nfserr_badowner;
+ return status;
}
int
-nfsd_map_uid_to_name(struct svc_rqst *rqstp, __u32 id, char *name)
+nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name)
{
+ u32 id = from_kuid(&init_user_ns, uid);
return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name);
}
int
-nfsd_map_gid_to_name(struct svc_rqst *rqstp, __u32 id, char *name)
+nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name)
{
+ u32 id = from_kgid(&init_user_ns, gid);
return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6c9a4b291dba..ae73175e6e68 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -40,6 +40,7 @@
#include "xdr4.h"
#include "vfs.h"
#include "current_stateid.h"
+#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -194,6 +195,7 @@ static __be32
do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
struct svc_fh *resfh;
+ int accmode;
__be32 status;
resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
@@ -253,9 +255,10 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
/* set reply cache */
fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
&resfh->fh_handle);
- if (!open->op_created)
- status = do_open_permission(rqstp, resfh, open,
- NFSD_MAY_NOP);
+ accmode = NFSD_MAY_NOP;
+ if (open->op_created)
+ accmode |= NFSD_MAY_OWNER_OVERRIDE;
+ status = do_open_permission(rqstp, resfh, open, accmode);
set_change_info(&open->op_cinfo, current_fh);
fh_dup2(current_fh, resfh);
out:
@@ -304,6 +307,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
struct nfsd4_compoundres *resp;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
(int)open->op_fname.len, open->op_fname.data,
@@ -331,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* check seqid for replay. set nfs4_owner */
resp = rqstp->rq_resp;
- status = nfsd4_process_open1(&resp->cstate, open);
+ status = nfsd4_process_open1(&resp->cstate, open, nn);
if (status == nfserr_replay_me) {
struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
fh_put(&cstate->current_fh);
@@ -354,10 +359,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Openowner is now set, so sequence id will get bumped. Now we need
* these checks before we do any creates: */
status = nfserr_grace;
- if (locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
status = nfserr_no_grace;
- if (!locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+ if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
switch (open->op_claim_type) {
@@ -370,7 +375,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
- status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion);
+ status = nfs4_check_open_reclaim(&open->op_clientid,
+ cstate->minorversion,
+ nn);
if (status)
goto out;
case NFS4_OPEN_CLAIM_FH:
@@ -490,12 +497,13 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&access->ac_supported);
}
-static void gen_boot_verifier(nfs4_verifier *verifier)
+static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
{
__be32 verf[2];
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- verf[0] = (__be32)nfssvc_boot.tv_sec;
- verf[1] = (__be32)nfssvc_boot.tv_usec;
+ verf[0] = (__be32)nn->nfssvc_boot.tv_sec;
+ verf[1] = (__be32)nn->nfssvc_boot.tv_usec;
memcpy(verifier->data, verf, sizeof(verifier->data));
}
@@ -503,7 +511,7 @@ static __be32
nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_commit *commit)
{
- gen_boot_verifier(&commit->co_verf);
+ gen_boot_verifier(&commit->co_verf, SVC_NET(rqstp));
return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
commit->co_count);
}
@@ -684,6 +692,17 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (read->rd_offset >= OFFSET_MAX)
return nfserr_inval;
+ /*
+ * If we do a zero copy read, then a client will see read data
+ * that reflects the state of the file *after* performing the
+ * following compound.
+ *
+ * To ensure proper ordering, we therefore turn off zero copy if
+ * the client wants us to do more in this compound:
+ */
+ if (!nfsd4_last_compound_op(rqstp))
+ rqstp->rq_splice_ok = false;
+
nfs4_lock_state();
/* check stateid */
if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
@@ -876,6 +895,24 @@ out:
return status;
}
+static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write)
+{ /* Describe the write payload (head buffer, then pages) in vec[]; returns entries used. */
+ int i = 1; /* next free slot; vec[0] is always the head buffer */
+ int buflen = write->wr_buflen; /* payload bytes not yet described by vec[] */
+
+ vec[0].iov_base = write->wr_head.iov_base;
+ vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len);
+ buflen -= vec[0].iov_len;
+
+ while (buflen) { /* the remainder comes from wr_pagelist, at most PAGE_SIZE per entry */
+ vec[i].iov_base = page_address(write->wr_pagelist[i - 1]);
+ vec[i].iov_len = min_t(int, PAGE_SIZE, buflen);
+ buflen -= vec[i].iov_len;
+ i++;
+ }
+ return i; /* not bounded here; nfsd4_write() only WARNs afterwards if it exceeds rq_vec */
+}
+
static __be32
nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_write *write)
@@ -884,6 +921,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct file *filp = NULL;
__be32 status = nfs_ok;
unsigned long cnt;
+ int nvecs;
/* no need to check permission - this will be done in nfsd_write() */
@@ -904,10 +942,13 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
cnt = write->wr_buflen;
write->wr_how_written = write->wr_stable_how;
- gen_boot_verifier(&write->wr_verifier);
+ gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp));
+
+ nvecs = fill_in_write_vector(rqstp->rq_vec, write);
+ WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
status = nfsd_write(rqstp, &cstate->current_fh, filp,
- write->wr_offset, rqstp->rq_vec, write->wr_vlen,
+ write->wr_offset, rqstp->rq_vec, nvecs,
&cnt, &write->wr_how_written);
if (filp)
fput(filp);
@@ -952,14 +993,15 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!buf)
return nfserr_jukebox;
+ p = buf;
status = nfsd4_encode_fattr(&cstate->current_fh,
cstate->current_fh.fh_export,
- cstate->current_fh.fh_dentry, buf,
- &count, verify->ve_bmval,
+ cstate->current_fh.fh_dentry, &p,
+ count, verify->ve_bmval,
rqstp, 0);
/* this means that nfsd4_encode_fattr() ran out of space */
- if (status == nfserr_resource && count == 0)
+ if (status == nfserr_resource)
status = nfserr_not_same;
if (status)
goto out_kfree;
@@ -1666,6 +1708,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_EXCHANGE_ID",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
},
+ [OP_BACKCHANNEL_CTL] = {
+ .op_func = (nfsd4op_func)nfsd4_backchannel_ctl,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_BACKCHANNEL_CTL",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+ },
[OP_BIND_CONN_TO_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
@@ -1719,6 +1767,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_func = (nfsd4op_func)nfsd4_free_stateid,
.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
.op_name = "OP_FREE_STATEID",
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
};
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 43295d45cc2b..899ca26dd194 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,13 +58,11 @@ struct nfsd4_client_tracking_ops {
void (*create)(struct nfs4_client *);
void (*remove)(struct nfs4_client *);
int (*check)(struct nfs4_client *);
- void (*grace_done)(struct net *, time_t);
+ void (*grace_done)(struct nfsd_net *, time_t);
};
/* Globals */
-static struct file *rec_file;
static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
-static struct nfsd4_client_tracking_ops *client_tracking_ops;
static int
nfs4_save_creds(const struct cred **original_creds)
@@ -75,8 +73,8 @@ nfs4_save_creds(const struct cred **original_creds)
if (!new)
return -ENOMEM;
- new->fsuid = 0;
- new->fsgid = 0;
+ new->fsuid = GLOBAL_ROOT_UID;
+ new->fsgid = GLOBAL_ROOT_GID;
*original_creds = override_creds(new);
put_cred(new);
return 0;
@@ -102,33 +100,39 @@ md5_to_hex(char *out, char *md5)
*out = '\0';
}
-__be32
-nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
+static int
+nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
{
struct xdr_netobj cksum;
struct hash_desc desc;
struct scatterlist sg;
- __be32 status = nfserr_jukebox;
+ int status;
dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
clname->len, clname->data);
desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(desc.tfm))
+ if (IS_ERR(desc.tfm)) {
+ status = PTR_ERR(desc.tfm);
goto out_no_tfm;
+ }
+
cksum.len = crypto_hash_digestsize(desc.tfm);
cksum.data = kmalloc(cksum.len, GFP_KERNEL);
- if (cksum.data == NULL)
+ if (cksum.data == NULL) {
+ status = -ENOMEM;
goto out;
+ }
sg_init_one(&sg, clname->data, clname->len);
- if (crypto_hash_digest(&desc, &sg, sg.length, cksum.data))
+ status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data);
+ if (status)
goto out;
md5_to_hex(dname, cksum.data);
- status = nfs_ok;
+ status = 0;
out:
kfree(cksum.data);
crypto_free_hash(desc.tfm);
@@ -136,29 +140,61 @@ out_no_tfm:
return status;
}
+/*
+ * Warn the admin that generating the legacy tracker's recdir name failed
+ * with @error (a negative errno). If the error doesn't appear to be
+ * transient (-ENOENT), then disable legacy recovery tracking.
+ */
+static void
+legacy_recdir_name_error(int error)
+{
+ printk(KERN_ERR "NFSD: unable to generate recoverydir "
+ "name (%d).\n", error);
+
+ /*
+ * if the algorithm just doesn't exist, then disable the recovery
+ * tracker altogether. The crypto libs will generally return this if
+ * FIPS is enabled as well.
+ */
+ if (error == -ENOENT) {
+ printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
+ "Reboot recovery will not function correctly!\n");
+
+ /* exit looks up per-net state, so NULL would oops; legacy tracking only runs in init_net */
+ nfsd4_client_tracking_exit(&init_net);
+ }
+}
+
static void
nfsd4_create_clid_dir(struct nfs4_client *clp)
{
const struct cred *original_cred;
- char *dname = clp->cl_recdir;
+ char dname[HEXDIR_LEN];
struct dentry *dir, *dentry;
+ struct nfs4_client_reclaim *crp;
int status;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
return;
- if (!rec_file)
+ if (!nn->rec_file)
return;
+
+ status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+ if (status)
+ return legacy_recdir_name_error(status);
+
status = nfs4_save_creds(&original_cred);
if (status < 0)
return;
- status = mnt_want_write_file(rec_file);
+ status = mnt_want_write_file(nn->rec_file);
if (status)
return;
- dir = rec_file->f_path.dentry;
+ dir = nn->rec_file->f_path.dentry;
/* lock the parent */
mutex_lock(&dir->d_inode->i_mutex);
@@ -182,18 +218,24 @@ out_put:
dput(dentry);
out_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
- if (status == 0)
- vfs_fsync(rec_file, 0);
- else
+ if (status == 0) {
+ if (nn->in_grace) {
+ crp = nfs4_client_to_reclaim(dname, nn);
+ if (crp)
+ crp->cr_clp = clp;
+ }
+ vfs_fsync(nn->rec_file, 0);
+ } else {
printk(KERN_ERR "NFSD: failed to write recovery record"
" (err %d); please check that %s exists"
" and is writeable", status,
user_recovery_dirname);
- mnt_drop_write_file(rec_file);
+ }
+ mnt_drop_write_file(nn->rec_file);
nfs4_reset_creds(original_cred);
}
-typedef int (recdir_func)(struct dentry *, struct dentry *);
+typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *);
struct name_list {
char name[HEXDIR_LEN];
@@ -219,10 +261,10 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
}
static int
-nfsd4_list_rec_dir(recdir_func *f)
+nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
{
const struct cred *original_cred;
- struct dentry *dir = rec_file->f_path.dentry;
+ struct dentry *dir = nn->rec_file->f_path.dentry;
LIST_HEAD(names);
int status;
@@ -230,13 +272,13 @@ nfsd4_list_rec_dir(recdir_func *f)
if (status < 0)
return status;
- status = vfs_llseek(rec_file, 0, SEEK_SET);
+ status = vfs_llseek(nn->rec_file, 0, SEEK_SET);
if (status < 0) {
nfs4_reset_creds(original_cred);
return status;
}
- status = vfs_readdir(rec_file, nfsd4_build_namelist, &names);
+ status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names);
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
while (!list_empty(&names)) {
struct name_list *entry;
@@ -248,7 +290,7 @@ nfsd4_list_rec_dir(recdir_func *f)
status = PTR_ERR(dentry);
break;
}
- status = f(dir, dentry);
+ status = f(dir, dentry, nn);
dput(dentry);
}
list_del(&entry->list);
@@ -260,14 +302,14 @@ nfsd4_list_rec_dir(recdir_func *f)
}
static int
-nfsd4_unlink_clid_dir(char *name, int namlen)
+nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
{
struct dentry *dir, *dentry;
int status;
dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
- dir = rec_file->f_path.dentry;
+ dir = nn->rec_file->f_path.dentry;
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
dentry = lookup_one_len(name, dir, namlen);
if (IS_ERR(dentry)) {
@@ -289,37 +331,52 @@ static void
nfsd4_remove_clid_dir(struct nfs4_client *clp)
{
const struct cred *original_cred;
+ struct nfs4_client_reclaim *crp;
+ char dname[HEXDIR_LEN];
int status;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
return;
- status = mnt_want_write_file(rec_file);
+ status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+ if (status)
+ return legacy_recdir_name_error(status);
+
+ status = mnt_want_write_file(nn->rec_file);
if (status)
goto out;
clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
status = nfs4_save_creds(&original_cred);
if (status < 0)
- goto out;
+ goto out_drop_write;
- status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
+ status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn);
nfs4_reset_creds(original_cred);
- if (status == 0)
- vfs_fsync(rec_file, 0);
- mnt_drop_write_file(rec_file);
+ if (status == 0) {
+ vfs_fsync(nn->rec_file, 0);
+ if (nn->in_grace) {
+ /* remove reclaim record */
+ crp = nfsd4_find_reclaim_client(dname, nn);
+ if (crp)
+ nfs4_remove_reclaim_record(crp, nn);
+ }
+ }
+out_drop_write:
+ mnt_drop_write_file(nn->rec_file);
out:
if (status)
printk("NFSD: Failed to remove expired client state directory"
- " %.*s\n", HEXDIR_LEN, clp->cl_recdir);
+ " %.*s\n", HEXDIR_LEN, dname);
}
static int
-purge_old(struct dentry *parent, struct dentry *child)
+purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
{
int status;
- if (nfs4_has_reclaimed_state(child->d_name.name, false))
+ if (nfs4_has_reclaimed_state(child->d_name.name, nn))
return 0;
status = vfs_rmdir(parent->d_inode, child);
@@ -331,27 +388,29 @@ purge_old(struct dentry *parent, struct dentry *child)
}
static void
-nfsd4_recdir_purge_old(struct net *net, time_t boot_time)
+nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time)
{
int status;
- if (!rec_file)
+ nn->in_grace = false;
+ if (!nn->rec_file)
return;
- status = mnt_want_write_file(rec_file);
+ status = mnt_want_write_file(nn->rec_file);
if (status)
goto out;
- status = nfsd4_list_rec_dir(purge_old);
+ status = nfsd4_list_rec_dir(purge_old, nn);
if (status == 0)
- vfs_fsync(rec_file, 0);
- mnt_drop_write_file(rec_file);
+ vfs_fsync(nn->rec_file, 0);
+ mnt_drop_write_file(nn->rec_file);
out:
+ nfs4_release_reclaim(nn);
if (status)
printk("nfsd4: failed to purge old clients from recovery"
- " directory %s\n", rec_file->f_path.dentry->d_name.name);
+ " directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
}
static int
-load_recdir(struct dentry *parent, struct dentry *child)
+load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
{
if (child->d_name.len != HEXDIR_LEN - 1) {
printk("nfsd4: illegal name %s in recovery directory\n",
@@ -359,21 +418,22 @@ load_recdir(struct dentry *parent, struct dentry *child)
/* Keep trying; maybe the others are OK: */
return 0;
}
- nfs4_client_to_reclaim(child->d_name.name);
+ nfs4_client_to_reclaim(child->d_name.name, nn);
return 0;
}
static int
-nfsd4_recdir_load(void) {
+nfsd4_recdir_load(struct net *net) {
int status;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- if (!rec_file)
+ if (!nn->rec_file)
return 0;
- status = nfsd4_list_rec_dir(load_recdir);
+ status = nfsd4_list_rec_dir(load_recdir, nn);
if (status)
printk("nfsd4: failed loading clients from recovery"
- " directory %s\n", rec_file->f_path.dentry->d_name.name);
+ " directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
return status;
}
@@ -382,15 +442,16 @@ nfsd4_recdir_load(void) {
*/
static int
-nfsd4_init_recdir(void)
+nfsd4_init_recdir(struct net *net)
{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
const struct cred *original_cred;
int status;
printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
user_recovery_dirname);
- BUG_ON(rec_file);
+ BUG_ON(nn->rec_file);
status = nfs4_save_creds(&original_cred);
if (status < 0) {
@@ -400,23 +461,65 @@ nfsd4_init_recdir(void)
return status;
}
- rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
- if (IS_ERR(rec_file)) {
+ nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
+ if (IS_ERR(nn->rec_file)) {
printk("NFSD: unable to find recovery directory %s\n",
user_recovery_dirname);
- status = PTR_ERR(rec_file);
- rec_file = NULL;
+ status = PTR_ERR(nn->rec_file);
+ nn->rec_file = NULL;
}
nfs4_reset_creds(original_cred);
+ if (!status)
+ nn->in_grace = true;
return status;
}
+
+static int
+nfs4_legacy_state_init(struct net *net)
+{ /* Allocate and initialise this net's reclaim hash table; returns 0 or -ENOMEM. */
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int i;
+
+ nn->reclaim_str_hashtbl = kmalloc(sizeof(struct list_head) *
+ CLIENT_HASH_SIZE, GFP_KERNEL);
+ if (!nn->reclaim_str_hashtbl)
+ return -ENOMEM;
+
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) /* every bucket starts as an empty list */
+ INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
+ nn->reclaim_str_hashtbl_size = 0;
+
+ return 0;
+}
+
+static void
+nfs4_legacy_state_shutdown(struct net *net)
+{ /* Free the bucket array set up by nfs4_legacy_state_init(); list entries are not walked here. */
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ kfree(nn->reclaim_str_hashtbl); /* the pointer itself is left as-is, not reset to NULL */
+}
+
static int
nfsd4_load_reboot_recovery_data(struct net *net)
{
int status;
+ status = nfsd4_init_recdir(net);
+ if (!status)
+ status = nfsd4_recdir_load(net);
+ if (status)
+ printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+ return status;
+}
+
+static int
+nfsd4_legacy_tracking_init(struct net *net)
+{
+ int status;
+
/* XXX: The legacy code won't work in a container */
if (net != &init_net) {
WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client "
@@ -424,30 +527,37 @@ nfsd4_load_reboot_recovery_data(struct net *net)
return -EINVAL;
}
- nfs4_lock_state();
- status = nfsd4_init_recdir();
- if (!status)
- status = nfsd4_recdir_load();
- nfs4_unlock_state();
+ status = nfs4_legacy_state_init(net);
if (status)
- printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+ return status;
+
+ status = nfsd4_load_reboot_recovery_data(net);
+ if (status)
+ goto err;
+ return 0;
+
+err:
+ nfs4_legacy_state_shutdown(net);
return status;
}
static void
-nfsd4_shutdown_recdir(void)
+nfsd4_shutdown_recdir(struct nfsd_net *nn)
{
- if (!rec_file)
+ if (!nn->rec_file)
return;
- fput(rec_file);
- rec_file = NULL;
+ fput(nn->rec_file);
+ nn->rec_file = NULL;
}
static void
nfsd4_legacy_tracking_exit(struct net *net)
{
- nfs4_release_reclaim();
- nfsd4_shutdown_recdir();
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nfs4_release_reclaim(nn);
+ nfsd4_shutdown_recdir(nn);
+ nfs4_legacy_state_shutdown(net);
}
/*
@@ -480,13 +590,26 @@ nfs4_recoverydir(void)
static int
nfsd4_check_legacy_client(struct nfs4_client *clp)
{
+ int status;
+ char dname[HEXDIR_LEN];
+ struct nfs4_client_reclaim *crp;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
/* did we already find that this client is stable? */
if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
return 0;
+ status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+ if (status) {
+ legacy_recdir_name_error(status);
+ return status;
+ }
+
/* look for it in the reclaim hashtable otherwise */
- if (nfsd4_find_reclaim_client(clp)) {
+ crp = nfsd4_find_reclaim_client(dname, nn);
+ if (crp) {
set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ crp->cr_clp = clp;
return 0;
}
@@ -494,7 +617,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
}
static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
- .init = nfsd4_load_reboot_recovery_data,
+ .init = nfsd4_legacy_tracking_init,
.exit = nfsd4_legacy_tracking_exit,
.create = nfsd4_create_clid_dir,
.remove = nfsd4_remove_clid_dir,
@@ -785,8 +908,7 @@ nfsd4_cld_create(struct nfs4_client *clp)
{
int ret;
struct cld_upcall *cup;
- /* FIXME: determine net from clp */
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
/* Don't upcall if it's already stored */
@@ -823,8 +945,7 @@ nfsd4_cld_remove(struct nfs4_client *clp)
{
int ret;
struct cld_upcall *cup;
- /* FIXME: determine net from clp */
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
/* Don't upcall if it's already removed */
@@ -861,8 +982,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
{
int ret;
struct cld_upcall *cup;
- /* FIXME: determine net from clp */
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
/* Don't upcall if one was already stored during this grace pd */
@@ -892,11 +1012,10 @@ nfsd4_cld_check(struct nfs4_client *clp)
}
static void
-nfsd4_cld_grace_done(struct net *net, time_t boot_time)
+nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
{
int ret;
struct cld_upcall *cup;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
cup = alloc_cld_upcall(cn);
@@ -926,28 +1045,267 @@ static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
.grace_done = nfsd4_cld_grace_done,
};
+/* upcall via usermodehelper */
+static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack";
+module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog),
+ S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program");
+
+static bool cltrack_legacy_disable;
+module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_legacy_disable,
+ "Disable legacy recoverydir conversion. Default: false");
+
+#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
+#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
+
+static char *
+nfsd4_cltrack_legacy_topdir(void)
+{ /* Returns a kmalloc'd "NFSDCLTRACK_LEGACY_TOPDIR=<recoverydir>" string, or NULL; the caller kfrees it. */
+ int copied;
+ size_t len;
+ char *result;
+
+ if (cltrack_legacy_disable)
+ return NULL;
+
+ len = strlen(LEGACY_TOPDIR_ENV_PREFIX) +
+ strlen(nfs4_recoverydir()) + 1; /* +1 for the terminating NUL */
+
+ result = kmalloc(len, GFP_KERNEL);
+ if (!result)
+ return result;
+
+ copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s",
+ nfs4_recoverydir());
+ if (copied >= len) {
+ /* just return nothing if output was truncated */
+ kfree(result);
+ return NULL;
+ }
+
+ return result;
+}
+
+static char *
+nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
+{ /* Returns a kmalloc'd "NFSDCLTRACK_LEGACY_RECDIR=<recoverydir>/<recdir name for @name>", or NULL; the caller kfrees it. */
+ int copied;
+ size_t len;
+ char *result;
+
+ if (cltrack_legacy_disable)
+ return NULL;
+
+ /* +1 is for '/' between "topdir" and "recdir" */
+ len = strlen(LEGACY_RECDIR_ENV_PREFIX) +
+ strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN; /* HEXDIR_LEN presumably covers the NUL -- confirm */
+
+ result = kmalloc(len, GFP_KERNEL);
+ if (!result)
+ return result;
+
+ copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/",
+ nfs4_recoverydir());
+ if (copied > (len - HEXDIR_LEN)) {
+ /* just return nothing if output will be truncated */
+ kfree(result);
+ return NULL;
+ }
+
+ copied = nfs4_make_rec_clidname(result + copied, name); /* 'copied' is reused as a status: 0 on success */
+ if (copied) {
+ kfree(result);
+ return NULL;
+ }
+
+ return result;
+}
+
+static int
+nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
+{ /* Run cltrack_prog as "<prog> <cmd> [arg]" with optional env string @legacy; returns the helper call's result, or -EACCES when the upcall is disabled. */
+ char *envp[2];
+ char *argv[4];
+ int ret;
+
+ if (unlikely(!cltrack_prog[0])) {
+ dprintk("%s: cltrack_prog is disabled\n", __func__);
+ return -EACCES;
+ }
+
+ dprintk("%s: cmd: %s\n", __func__, cmd);
+ dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
+ dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)");
+
+ envp[0] = legacy; /* a NULL legacy string yields an empty environment */
+ envp[1] = NULL;
+
+ argv[0] = (char *)cltrack_prog;
+ argv[1] = cmd;
+ argv[2] = arg; /* a NULL arg simply ends the argument list early */
+ argv[3] = NULL;
+
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ /*
+ * Disable the upcall mechanism if we're getting an ENOENT or EACCES
+ * error. The admin can re-enable it on the fly by using sysfs
+ * once the problem has been fixed.
+ */
+ if (ret == -ENOENT || ret == -EACCES) {
+ dprintk("NFSD: %s was not found or isn't executable (%d). "
+ "Setting cltrack_prog to blank string!\n",
+ cltrack_prog, ret);
+ cltrack_prog[0] = '\0';
+ }
+ dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret);
+
+ return ret;
+}
+
+static char *
+bin_to_hex_dup(const unsigned char *src, int srclen)
+{ /* Returns an allocated, NUL-terminated lowercase-hex string of src[0..srclen), or NULL on allocation failure; the caller kfrees it. */
+ int i;
+ char *buf, *hex;
+
+ /* +1 for terminating NUL; zeroed so that srclen == 0 still yields "" */
+ buf = kzalloc((srclen * 2) + 1, GFP_KERNEL);
+ if (!buf)
+ return buf;
+
+ hex = buf;
+ for (i = 0; i < srclen; i++) {
+ sprintf(hex, "%2.2x", *src++); /* two digits plus a NUL, overwritten by the next byte */
+ hex += 2;
+ }
+ return buf;
+}
+
+static int
+nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
+{
+ /* XXX: The usermode helper is not working in a container yet. */
+ if (net != &init_net) {
+ WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
+ "tracking in a container!\n");
+ return -EINVAL;
+ }
+ return nfsd4_umh_cltrack_upcall("init", NULL, NULL); /* the "init" upcall's status is the init result */
+}
+
+static void
+nfsd4_umh_cltrack_create(struct nfs4_client *clp)
+{ /* Upcall "create" with the client's name hex-encoded; skipped (with a dprintk) if the encode buffer can't be allocated. */
+ char *hexid;
+
+ hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+ if (!hexid) {
+ dprintk("%s: can't allocate memory for upcall!\n", __func__);
+ return;
+ }
+ nfsd4_umh_cltrack_upcall("create", hexid, NULL); /* result ignored: this op returns void */
+ kfree(hexid);
+}
+
+static void
+nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
+{ /* Upcall "remove" with the client's name hex-encoded; skipped (with a dprintk) if the encode buffer can't be allocated. */
+ char *hexid;
+
+ hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+ if (!hexid) {
+ dprintk("%s: can't allocate memory for upcall!\n", __func__);
+ return;
+ }
+ nfsd4_umh_cltrack_upcall("remove", hexid, NULL); /* result ignored: this op returns void */
+ kfree(hexid);
+}
+
+static int
+nfsd4_umh_cltrack_check(struct nfs4_client *clp)
+{ /* Upcall "check" for this client; returns the upcall's result, or -ENOMEM if the name can't be hex-encoded. */
+ int ret;
+ char *hexid, *legacy;
+
+ hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+ if (!hexid) {
+ dprintk("%s: can't allocate memory for upcall!\n", __func__);
+ return -ENOMEM;
+ }
+ legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); /* may be NULL; the upcall then gets no env string */
+ ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy);
+ kfree(legacy);
+ kfree(hexid);
+ return ret;
+}
+
+static void
+nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn,
+ time_t boot_time)
+{ /* Upcall "gracedone" with boot_time formatted as a decimal string, plus the legacy topdir env string when available. */
+ char *legacy;
+ char timestr[22]; /* FIXME: better way to determine max size? */
+
+ sprintf(timestr, "%ld", boot_time);
+ legacy = nfsd4_cltrack_legacy_topdir(); /* NULL when conversion is disabled or allocation fails */
+ nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy);
+ kfree(legacy);
+}
+
+static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { /* client tracker backed by the cltrack_prog usermode helper */
+ .init = nfsd4_umh_cltrack_init,
+ .exit = NULL, /* no teardown; nfsd4_client_tracking_exit() checks for a NULL exit op */
+ .create = nfsd4_umh_cltrack_create,
+ .remove = nfsd4_umh_cltrack_remove,
+ .check = nfsd4_umh_cltrack_check,
+ .grace_done = nfsd4_umh_cltrack_grace_done,
+};
+
int
nfsd4_client_tracking_init(struct net *net)
{
int status;
struct path path;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- if (!client_tracking_ops) {
- client_tracking_ops = &nfsd4_cld_tracking_ops;
- status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
- if (!status) {
- if (S_ISDIR(path.dentry->d_inode->i_mode))
- client_tracking_ops =
- &nfsd4_legacy_tracking_ops;
- path_put(&path);
- }
+ /* just run the init if the method is already decided */
+ if (nn->client_tracking_ops)
+ goto do_init;
+
+ /*
+ * First, try a UMH upcall. It should succeed or fail quickly, so
+ * there's little harm in trying that first.
+ */
+ nn->client_tracking_ops = &nfsd4_umh_tracking_ops;
+ status = nn->client_tracking_ops->init(net);
+ if (!status)
+ return status;
+
+ /*
+ * See if the recoverydir exists and is a directory. If it is,
+ * then use the legacy ops.
+ */
+ nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
+ status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
+ if (!status) {
+ status = S_ISDIR(path.dentry->d_inode->i_mode);
+ path_put(&path);
+ if (status)
+ goto do_init;
}
- status = client_tracking_ops->init(net);
+ /* Finally, try to use nfsdcld */
+ nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+ printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
+ "removed in 3.10. Please transition to using "
+ "nfsdcltrack.\n");
+do_init:
+ status = nn->client_tracking_ops->init(net);
if (status) {
printk(KERN_WARNING "NFSD: Unable to initialize client "
"recovery tracking! (%d)\n", status);
- client_tracking_ops = NULL;
+ nn->client_tracking_ops = NULL;
}
return status;
}
@@ -955,40 +1313,49 @@ nfsd4_client_tracking_init(struct net *net)
void
nfsd4_client_tracking_exit(struct net *net)
{
- if (client_tracking_ops) {
- client_tracking_ops->exit(net);
- client_tracking_ops = NULL;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (nn->client_tracking_ops) {
+ if (nn->client_tracking_ops->exit)
+ nn->client_tracking_ops->exit(net);
+ nn->client_tracking_ops = NULL;
}
}
void
nfsd4_client_record_create(struct nfs4_client *clp)
{
- if (client_tracking_ops)
- client_tracking_ops->create(clp);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ if (nn->client_tracking_ops)
+ nn->client_tracking_ops->create(clp);
}
void
nfsd4_client_record_remove(struct nfs4_client *clp)
{
- if (client_tracking_ops)
- client_tracking_ops->remove(clp);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ if (nn->client_tracking_ops)
+ nn->client_tracking_ops->remove(clp);
}
int
nfsd4_client_record_check(struct nfs4_client *clp)
{
- if (client_tracking_ops)
- return client_tracking_ops->check(clp);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ if (nn->client_tracking_ops)
+ return nn->client_tracking_ops->check(clp);
return -EOPNOTSUPP;
}
void
-nfsd4_record_grace_done(struct net *net, time_t boot_time)
+nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time)
{
- if (client_tracking_ops)
- client_tracking_ops->grace_done(net, boot_time);
+ if (nn->client_tracking_ops)
+ nn->client_tracking_ops->grace_done(nn, boot_time);
}
static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d0237f872cc4..16d39c6c4fbb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -40,20 +40,15 @@
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sunrpc/svcauth_gss.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include "xdr4.h"
#include "vfs.h"
#include "current_stateid.h"
-#include "fault_inject.h"
#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
-/* Globals */
-time_t nfsd4_lease = 90; /* default lease time */
-time_t nfsd4_grace = 90;
-
#define all_ones {{~0,~0},~0}
static const stateid_t one_stateid = {
.si_generation = ~0,
@@ -156,7 +151,7 @@ get_nfs4_file(struct nfs4_file *fi)
}
static int num_delegations;
-unsigned int max_delegations;
+unsigned long max_delegations;
/*
* Open owner state (share locks)
@@ -176,8 +171,6 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
return ret & OWNER_HASH_MASK;
}
-static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
-
/* hash table for nfs4_file */
#define FILE_HASH_BITS 8
#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
@@ -192,7 +185,7 @@ static struct list_head file_hashtbl[FILE_HASH_SIZE];
static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
{
- BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
+ WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
atomic_inc(&fp->fi_access[oflag]);
}
@@ -251,7 +244,7 @@ static inline int get_new_stid(struct nfs4_stid *stid)
* preallocations that can exist at a time, but the state lock
* prevents anyone from using ours before we get here:
*/
- BUG_ON(error);
+ WARN_ON_ONCE(error);
/*
* It shouldn't be a problem to reuse an opaque stateid value.
* I don't think it is for 4.1. But with 4.0 I worry that, for
@@ -268,33 +261,46 @@ static inline int get_new_stid(struct nfs4_stid *stid)
return new_stid;
}
-static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type)
+static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct
+kmem_cache *slab)
{
- stateid_t *s = &stid->sc_stateid;
+ struct idr *stateids = &cl->cl_stateids;
+ static int min_stateid = 0; /* lowest id to hand out next; shared by all clients */
+ struct nfs4_stid *stid;
int new_id;
- stid->sc_type = type;
+ stid = kmem_cache_alloc(slab, GFP_KERNEL);
+ if (!stid)
+ return NULL;
+
+ if (!idr_pre_get(stateids, GFP_KERNEL))
+ goto out_free;
+ if (idr_get_new_above(stateids, stid, min_stateid, &new_id))
+ goto out_free;
stid->sc_client = cl;
- s->si_opaque.so_clid = cl->cl_clientid;
- new_id = get_new_stid(stid);
- s->si_opaque.so_id = (u32)new_id;
+ stid->sc_type = 0; /* callers set the real type after allocation */
+ stid->sc_stateid.si_opaque.so_id = new_id;
+ stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
/* Will be incremented before return to client: */
- s->si_generation = 0;
-}
+ stid->sc_stateid.si_generation = 0;
-static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab)
-{
- struct idr *stateids = &cl->cl_stateids;
-
- if (!idr_pre_get(stateids, GFP_KERNEL))
- return NULL;
/*
- * Note: if we fail here (or any time between now and the time
- * we actually get the new idr), we won't need to undo the idr
- * preallocation, since the idr code caps the number of
- * preallocated entries.
+ * It shouldn't be a problem to reuse an opaque stateid value.
+ * I don't think it is for 4.1. But with 4.0 I worry that, for
+ * example, a stray write retransmission could be accepted by
+ * the server when it should have been rejected. Therefore,
+ * adopt a trick from the sctp code to attempt to maximize the
+ * amount of time until an id is reused, by ensuring they always
+ * "increase" (mod INT_MAX):
*/
- return kmem_cache_alloc(slab, GFP_KERNEL);
+
+ min_stateid = new_id+1;
+ if (min_stateid == INT_MAX)
+ min_stateid = 0;
+ return stid;
+out_free:
+ kmem_cache_free(slab, stid); /* stid came from @slab, so return it there rather than kfree() */
+ return NULL;
}
static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
@@ -323,7 +329,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
if (dp == NULL)
return dp;
- init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID);
+ dp->dl_stid.sc_type = NFS4_DELEG_STID;
/*
* delegation seqid's are never incremented. The 4.1 special
* meaning of seqid 0 isn't meaningful, really, but let's avoid
@@ -340,17 +346,25 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
dp->dl_time = 0;
atomic_set(&dp->dl_count, 1);
- INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
+ nfsd4_init_callback(&dp->dl_recall);
return dp;
}
+static void free_stid(struct nfs4_stid *s, struct kmem_cache *slab)
+{
+ struct idr *stateids = &s->sc_client->cl_stateids;
+
+ idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+ kmem_cache_free(slab, s);
+}
+
void
nfs4_put_delegation(struct nfs4_delegation *dp)
{
if (atomic_dec_and_test(&dp->dl_count)) {
dprintk("NFSD: freeing dp %p\n",dp);
put_nfs4_file(dp->dl_file);
- kmem_cache_free(deleg_slab, dp);
+ free_stid(&dp->dl_stid, deleg_slab);
num_delegations--;
}
}
@@ -367,9 +381,7 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
static void unhash_stid(struct nfs4_stid *s)
{
- struct idr *stateids = &s->sc_client->cl_stateids;
-
- idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+ s->sc_type = 0;
}
/* Called under the state lock. */
@@ -390,14 +402,6 @@ unhash_delegation(struct nfs4_delegation *dp)
* SETCLIENTID state
*/
-/* client_lock protects the client lru list and session hash table */
-static DEFINE_SPINLOCK(client_lock);
-
-/* Hash tables for nfs4_clientid state */
-#define CLIENT_HASH_BITS 4
-#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
-#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
-
static unsigned int clientid_hashval(u32 id)
{
return id & CLIENT_HASH_MASK;
@@ -409,31 +413,6 @@ static unsigned int clientstr_hashval(const char *name)
}
/*
- * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
- * used in reboot/reset lease grace period processing
- *
- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
- * setclientid_confirmed info.
- *
- * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed
- * setclientid info.
- *
- * client_lru holds client queue ordered by nfs4_client.cl_time
- * for lease renewal.
- *
- * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
- * for last close replay.
- */
-static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE];
-static int reclaim_str_hashtbl_size = 0;
-static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head client_lru;
-static struct list_head close_lru;
-
-/*
* We store the NONE, READ, WRITE, and BOTH bits separately in the
* st_{access,deny}_bmap field of the stateid, in order to track not
* only what share bits are currently in force, but also what
@@ -526,7 +505,8 @@ static int nfs4_access_to_omode(u32 access)
case NFS4_SHARE_ACCESS_BOTH:
return O_RDWR;
}
- BUG();
+ WARN_ON_ONCE(1);
+ return O_RDONLY;
}
/* release all access and file references for a given stateid */
@@ -558,7 +538,7 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp)
static void free_generic_stateid(struct nfs4_ol_stateid *stp)
{
- kmem_cache_free(stateid_slab, stp);
+ free_stid(&stp->st_stid, stateid_slab);
}
static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -652,9 +632,6 @@ static void release_openowner(struct nfs4_openowner *oo)
nfs4_free_openowner(oo);
}
-#define SESSION_HASH_SIZE 512
-static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
-
static inline int
hash_sessionid(struct nfs4_sessionid *sessionid)
{
@@ -742,8 +719,8 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num)
num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
spin_lock(&nfsd_drc_lock);
- avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
- nfsd_drc_max_mem - nfsd_drc_mem_used);
+ avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
+ nfsd_drc_max_mem - nfsd_drc_mem_used);
num = min_t(int, num, avail / slotsize);
nfsd_drc_mem_used += num * slotsize;
spin_unlock(&nfsd_drc_lock);
@@ -785,9 +762,12 @@ out_free:
return NULL;
}
-static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
+static void init_forechannel_attrs(struct nfsd4_channel_attrs *new,
+ struct nfsd4_channel_attrs *req,
+ int numslots, int slotsize,
+ struct nfsd_net *nn)
{
- u32 maxrpc = nfsd_serv->sv_max_mesg;
+ u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
new->maxreqs = numslots;
new->maxresp_cached = min_t(u32, req->maxresp_cached,
@@ -906,21 +886,27 @@ static void __free_session(struct nfsd4_session *ses)
static void free_session(struct kref *kref)
{
struct nfsd4_session *ses;
+ struct nfsd_net *nn;
- lockdep_assert_held(&client_lock);
ses = container_of(kref, struct nfsd4_session, se_ref);
+ nn = net_generic(ses->se_client->net, nfsd_net_id);
+
+ lockdep_assert_held(&nn->client_lock);
nfsd4_del_conns(ses);
__free_session(ses);
}
void nfsd4_put_session(struct nfsd4_session *ses)
{
- spin_lock(&client_lock);
+ struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
nfsd4_put_session_locked(ses);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
}
-static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
+static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan,
+ struct nfsd_net *nn)
{
struct nfsd4_session *new;
int numslots, slotsize;
@@ -938,16 +924,17 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
new = __alloc_session(slotsize, numslots);
if (!new) {
- nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
+ nfsd4_put_drc_mem(slotsize, numslots);
return NULL;
}
- init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
+ init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn);
return new;
}
-static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
{
int idx;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
new->se_client = clp;
gen_sessionid(new);
@@ -957,14 +944,15 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s
new->se_cb_seq_nr = 1;
new->se_flags = cses->flags;
new->se_cb_prog = cses->callback_prog;
+ new->se_cb_sec = cses->cb_sec;
kref_init(&new->se_ref);
idx = hash_sessionid(&new->se_sessionid);
- spin_lock(&client_lock);
- list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+ spin_lock(&nn->client_lock);
+ list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
spin_lock(&clp->cl_lock);
list_add(&new->se_perclnt, &clp->cl_sessions);
spin_unlock(&clp->cl_lock);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
if (cses->flags & SESSION4_BACK_CHAN) {
struct sockaddr *sa = svc_addr(rqstp);
@@ -978,20 +966,20 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s
rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
}
- return new;
}
/* caller must hold client_lock */
static struct nfsd4_session *
-find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
{
struct nfsd4_session *elem;
int idx;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
dump_sessionid(__func__, sessionid);
idx = hash_sessionid(sessionid);
/* Search in the appropriate list */
- list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
+ list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) {
if (!memcmp(elem->se_sessionid.data, sessionid->data,
NFS4_MAX_SESSIONID_LEN)) {
return elem;
@@ -1016,6 +1004,8 @@ unhash_session(struct nfsd4_session *ses)
static inline void
renew_client_locked(struct nfs4_client *clp)
{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
if (is_client_expired(clp)) {
WARN_ON(1);
printk("%s: client (clientid %08x/%08x) already expired\n",
@@ -1028,16 +1018,18 @@ renew_client_locked(struct nfs4_client *clp)
dprintk("renewing client (clientid %08x/%08x)\n",
clp->cl_clientid.cl_boot,
clp->cl_clientid.cl_id);
- list_move_tail(&clp->cl_lru, &client_lru);
+ list_move_tail(&clp->cl_lru, &nn->client_lru);
clp->cl_time = get_seconds();
}
static inline void
renew_client(struct nfs4_client *clp)
{
- spin_lock(&client_lock);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
renew_client_locked(clp);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
}
/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
@@ -1075,7 +1067,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
static inline void
free_client(struct nfs4_client *clp)
{
- lockdep_assert_held(&client_lock);
+ struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id);
+
+ lockdep_assert_held(&nn->client_lock);
while (!list_empty(&clp->cl_sessions)) {
struct nfsd4_session *ses;
ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
@@ -1085,6 +1079,7 @@ free_client(struct nfs4_client *clp)
}
free_svc_cred(&clp->cl_cred);
kfree(clp->cl_name.data);
+ idr_destroy(&clp->cl_stateids);
kfree(clp);
}
@@ -1092,15 +1087,16 @@ void
release_session_client(struct nfsd4_session *session)
{
struct nfs4_client *clp = session->se_client;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
+ if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
return;
if (is_client_expired(clp)) {
free_client(clp);
session->se_client = NULL;
} else
renew_client_locked(clp);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
}
/* must be called under the client_lock */
@@ -1123,6 +1119,7 @@ destroy_client(struct nfs4_client *clp)
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct list_head reaplist;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
INIT_LIST_HEAD(&reaplist);
spin_lock(&recall_lock);
@@ -1144,12 +1141,15 @@ destroy_client(struct nfs4_client *clp)
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
list_del(&clp->cl_idhash);
- list_del(&clp->cl_strhash);
- spin_lock(&client_lock);
+ if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+ rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
+ else
+ rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+ spin_lock(&nn->client_lock);
unhash_client_locked(clp);
if (atomic_read(&clp->cl_refcount) == 0)
free_client(clp);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
}
static void expire_client(struct nfs4_client *clp)
@@ -1187,6 +1187,17 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
return 0;
}
+static long long
+compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2)
+{
+ long long res;
+
+ res = o1->len - o2->len;
+ if (res)
+ return res;
+ return (long long)memcmp(o1->data, o2->data, o1->len);
+}
+
static int same_name(const char *n1, const char *n2)
{
return 0 == memcmp(n1, n2, HEXDIR_LEN);
@@ -1211,7 +1222,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2)
if (g1->ngroups != g2->ngroups)
return false;
for (i=0; i<g1->ngroups; i++)
- if (GROUP_AT(g1, i) != GROUP_AT(g2, i))
+ if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i)))
return false;
return true;
}
@@ -1236,8 +1247,8 @@ static bool
same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
{
if ((is_gss_cred(cr1) != is_gss_cred(cr2))
- || (cr1->cr_uid != cr2->cr_uid)
- || (cr1->cr_gid != cr2->cr_gid)
+ || (!uid_eq(cr1->cr_uid, cr2->cr_uid))
+ || (!gid_eq(cr1->cr_gid, cr2->cr_gid))
|| !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
return false;
if (cr1->cr_principal == cr2->cr_principal)
@@ -1247,10 +1258,9 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
}
-static void gen_clid(struct nfs4_client *clp)
+static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
{
static u32 current_clientid = 1;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
clp->cl_clientid.cl_boot = nn->boot_time;
clp->cl_clientid.cl_id = current_clientid++;
@@ -1268,7 +1278,12 @@ static void gen_confirm(struct nfs4_client *clp)
static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
{
- return idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+ struct nfs4_stid *ret;
+
+ ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+ if (!ret || !ret->sc_type)
+ return NULL;
+ return ret;
}
static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
@@ -1283,12 +1298,14 @@ static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t
return NULL;
}
-static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
+static struct nfs4_client *create_client(struct xdr_netobj name,
struct svc_rqst *rqstp, nfs4_verifier *verf)
{
struct nfs4_client *clp;
struct sockaddr *sa = svc_addr(rqstp);
int ret;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
clp = alloc_client(name);
if (clp == NULL)
@@ -1297,23 +1314,21 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
INIT_LIST_HEAD(&clp->cl_sessions);
ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
if (ret) {
- spin_lock(&client_lock);
+ spin_lock(&nn->client_lock);
free_client(clp);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
return NULL;
}
idr_init(&clp->cl_stateids);
- memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
atomic_set(&clp->cl_refcount, 0);
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
INIT_LIST_HEAD(&clp->cl_idhash);
- INIT_LIST_HEAD(&clp->cl_strhash);
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
INIT_LIST_HEAD(&clp->cl_lru);
INIT_LIST_HEAD(&clp->cl_callbacks);
spin_lock_init(&clp->cl_lock);
- INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
+ nfsd4_init_callback(&clp->cl_cb_null);
clp->cl_time = get_seconds();
clear_bit(0, &clp->cl_cb_slot_busy);
rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -1321,17 +1336,60 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
gen_confirm(clp);
clp->cl_cb_session = NULL;
+ clp->net = net;
return clp;
}
static void
-add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
+add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ struct nfs4_client *clp;
+
+ while (*new) {
+ clp = rb_entry(*new, struct nfs4_client, cl_namenode);
+ parent = *new;
+
+ if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0)
+ new = &((*new)->rb_left);
+ else
+ new = &((*new)->rb_right);
+ }
+
+ rb_link_node(&new_clp->cl_namenode, parent, new);
+ rb_insert_color(&new_clp->cl_namenode, root);
+}
+
+static struct nfs4_client *
+find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
+{
+ long long cmp;
+ struct rb_node *node = root->rb_node;
+ struct nfs4_client *clp;
+
+ while (node) {
+ clp = rb_entry(node, struct nfs4_client, cl_namenode);
+ cmp = compare_blob(&clp->cl_name, name);
+ if (cmp > 0)
+ node = node->rb_left;
+ else if (cmp < 0)
+ node = node->rb_right;
+ else
+ return clp;
+ }
+ return NULL;
+}
+
+static void
+add_to_unconfirmed(struct nfs4_client *clp)
{
unsigned int idhashval;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
+ clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
+ add_clp_to_name_tree(clp, &nn->unconf_name_tree);
idhashval = clientid_hashval(clp->cl_clientid.cl_id);
- list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
+ list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);
renew_client(clp);
}
@@ -1339,22 +1397,23 @@ static void
move_to_confirmed(struct nfs4_client *clp)
{
unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
- unsigned int strhashval;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
- list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
- strhashval = clientstr_hashval(clp->cl_recdir);
- list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
+ list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
+ rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+ add_clp_to_name_tree(clp, &nn->conf_name_tree);
+ set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
renew_client(clp);
}
static struct nfs4_client *
-find_confirmed_client(clientid_t *clid, bool sessions)
+find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
{
struct nfs4_client *clp;
unsigned int idhashval = clientid_hashval(clid->cl_id);
- list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
+ list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) {
if (same_clid(&clp->cl_clientid, clid)) {
if ((bool)clp->cl_minorversion != sessions)
return NULL;
@@ -1366,12 +1425,12 @@ find_confirmed_client(clientid_t *clid, bool sessions)
}
static struct nfs4_client *
-find_unconfirmed_client(clientid_t *clid, bool sessions)
+find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
{
struct nfs4_client *clp;
unsigned int idhashval = clientid_hashval(clid->cl_id);
- list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) {
+ list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) {
if (same_clid(&clp->cl_clientid, clid)) {
if ((bool)clp->cl_minorversion != sessions)
return NULL;
@@ -1387,27 +1446,15 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
}
static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval)
+find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
{
- struct nfs4_client *clp;
-
- list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
- if (same_name(clp->cl_recdir, dname))
- return clp;
- }
- return NULL;
+ return find_clp_in_name_tree(name, &nn->conf_name_tree);
}
static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
+find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
{
- struct nfs4_client *clp;
-
- list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
- if (same_name(clp->cl_recdir, dname))
- return clp;
- }
- return NULL;
+ return find_clp_in_name_tree(name, &nn->unconf_name_tree);
}
static void
@@ -1428,7 +1475,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
else
goto out_err;
- conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val,
+ conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val,
se->se_callback_addr_len,
(struct sockaddr *)&conn->cb_addr,
sizeof(conn->cb_addr));
@@ -1572,12 +1619,11 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
{
struct nfs4_client *unconf, *conf, *new;
__be32 status;
- unsigned int strhashval;
- char dname[HEXDIR_LEN];
char addr_str[INET6_ADDRSTRLEN];
nfs4_verifier verf = exid->verifier;
struct sockaddr *sa = svc_addr(rqstp);
bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
rpc_ntop(sa, addr_str, sizeof(addr_str));
dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
@@ -1592,24 +1638,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
switch (exid->spa_how) {
case SP4_NONE:
break;
+ default: /* checked by xdr code */
+ WARN_ON_ONCE(1);
case SP4_SSV:
- return nfserr_serverfault;
- default:
- BUG(); /* checked by xdr code */
case SP4_MACH_CRED:
return nfserr_serverfault; /* no excuse :-/ */
}
- status = nfs4_make_rec_clidname(dname, &exid->clname);
-
- if (status)
- return status;
-
- strhashval = clientstr_hashval(dname);
-
/* Cases below refer to rfc 5661 section 18.35.4: */
nfs4_lock_state();
- conf = find_confirmed_client_by_str(dname, strhashval);
+ conf = find_confirmed_client_by_name(&exid->clname, nn);
if (conf) {
bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
bool verfs_match = same_verf(&verf, &conf->cl_verifier);
@@ -1654,21 +1692,21 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
goto out;
}
- unconf = find_unconfirmed_client_by_str(dname, strhashval);
+ unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
if (unconf) /* case 4, possible retry or client restart */
expire_client(unconf);
/* case 1 (normal case) */
out_new:
- new = create_client(exid->clname, dname, rqstp, &verf);
+ new = create_client(exid->clname, rqstp, &verf);
if (new == NULL) {
status = nfserr_jukebox;
goto out;
}
new->cl_minorversion = 1;
- gen_clid(new);
- add_to_unconfirmed(new, strhashval);
+ gen_clid(new, nn);
+ add_to_unconfirmed(new);
out_copy:
exid->clientid.cl_boot = new->cl_clientid.cl_boot;
exid->clientid.cl_id = new->cl_clientid.cl_id;
@@ -1761,12 +1799,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
struct nfsd4_conn *conn;
struct nfsd4_clid_slot *cs_slot = NULL;
__be32 status = 0;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
return nfserr_inval;
if (check_forechannel_attrs(cr_ses->fore_channel))
return nfserr_toosmall;
- new = alloc_session(&cr_ses->fore_channel);
+ new = alloc_session(&cr_ses->fore_channel, nn);
if (!new)
return nfserr_jukebox;
status = nfserr_jukebox;
@@ -1775,8 +1814,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_session;
nfs4_lock_state();
- unconf = find_unconfirmed_client(&cr_ses->clientid, true);
- conf = find_confirmed_client(&cr_ses->clientid, true);
+ unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
+ conf = find_confirmed_client(&cr_ses->clientid, true, nn);
if (conf) {
cs_slot = &conf->cl_cs_slot;
@@ -1789,7 +1828,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
}
} else if (unconf) {
- unsigned int hash;
struct nfs4_client *old;
if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
!rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1803,8 +1841,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
status = nfserr_seq_misordered;
goto out_free_conn;
}
- hash = clientstr_hashval(unconf->cl_recdir);
- old = find_confirmed_client_by_str(unconf->cl_recdir, hash);
+ old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old)
expire_client(old);
move_to_confirmed(unconf);
@@ -1832,25 +1869,18 @@ nfsd4_create_session(struct svc_rqst *rqstp,
/* cache solo and embedded create sessions under the state lock */
nfsd4_cache_create_session(cr_ses, cs_slot, status);
-out:
nfs4_unlock_state();
+out:
dprintk("%s returns %d\n", __func__, ntohl(status));
return status;
out_free_conn:
+ nfs4_unlock_state();
free_conn(conn);
out_free_session:
__free_session(new);
goto out;
}
-static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
-{
- struct nfsd4_compoundres *resp = rqstp->rq_resp;
- struct nfsd4_compoundargs *argp = rqstp->rq_argp;
-
- return argp->opcnt == resp->opcnt;
-}
-
static __be32 nfsd4_map_bcts_dir(u32 *dir)
{
switch (*dir) {
@@ -1865,24 +1895,40 @@ static __be32 nfsd4_map_bcts_dir(u32 *dir)
return nfserr_inval;
}
+__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_backchannel_ctl *bc)
+{
+ struct nfsd4_session *session = cstate->session;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
+ session->se_cb_prog = bc->bc_cb_program;
+ session->se_cb_sec = bc->bc_cb_sec;
+ spin_unlock(&nn->client_lock);
+
+ nfsd4_probe_callback(session->se_client);
+
+ return nfs_ok;
+}
+
__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
struct nfsd4_bind_conn_to_session *bcts)
{
__be32 status;
struct nfsd4_conn *conn;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (!nfsd4_last_compound_op(rqstp))
return nfserr_not_only_op;
- spin_lock(&client_lock);
- cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
+ spin_lock(&nn->client_lock);
+ cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
/* Sorta weird: we only need the refcnt'ing because new_conn acquires
* client_lock iself: */
if (cstate->session) {
nfsd4_get_session(cstate->session);
atomic_inc(&cstate->session->se_client->cl_refcount);
}
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
if (!cstate->session)
return nfserr_badsession;
@@ -1910,6 +1956,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
{
struct nfsd4_session *ses;
__be32 status = nfserr_badsession;
+ struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
/* Notes:
* - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
@@ -1923,24 +1970,24 @@ nfsd4_destroy_session(struct svc_rqst *r,
return nfserr_not_only_op;
}
dump_sessionid(__func__, &sessionid->sessionid);
- spin_lock(&client_lock);
- ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
+ spin_lock(&nn->client_lock);
+ ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
if (!ses) {
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
goto out;
}
unhash_session(ses);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
nfs4_lock_state();
nfsd4_probe_callback_sync(ses->se_client);
nfs4_unlock_state();
- spin_lock(&client_lock);
+ spin_lock(&nn->client_lock);
nfsd4_del_conns(ses);
nfsd4_put_session_locked(ses);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
status = nfs_ok;
out:
dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -2006,6 +2053,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
struct nfsd4_slot *slot;
struct nfsd4_conn *conn;
__be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (resp->opcnt != 1)
return nfserr_sequence_pos;
@@ -2018,9 +2066,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (!conn)
return nfserr_jukebox;
- spin_lock(&client_lock);
+ spin_lock(&nn->client_lock);
status = nfserr_badsession;
- session = find_in_sessionid_hashtbl(&seq->sessionid);
+ session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
if (!session)
goto out;
@@ -2094,7 +2142,7 @@ out:
}
}
kfree(conn);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
dprintk("%s: return %d\n", __func__, ntohl(status));
return status;
}
@@ -2104,10 +2152,11 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
{
struct nfs4_client *conf, *unconf, *clp;
__be32 status = 0;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
nfs4_lock_state();
- unconf = find_unconfirmed_client(&dc->clientid, true);
- conf = find_confirmed_client(&dc->clientid, true);
+ unconf = find_unconfirmed_client(&dc->clientid, true, nn);
+ conf = find_confirmed_client(&dc->clientid, true, nn);
if (conf) {
clp = conf;
@@ -2181,20 +2230,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct xdr_netobj clname = setclid->se_name;
nfs4_verifier clverifier = setclid->se_verf;
- unsigned int strhashval;
struct nfs4_client *conf, *unconf, *new;
__be32 status;
- char dname[HEXDIR_LEN];
-
- status = nfs4_make_rec_clidname(dname, &clname);
- if (status)
- return status;
-
- strhashval = clientstr_hashval(dname);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
/* Cases below refer to rfc 3530 section 14.2.33: */
nfs4_lock_state();
- conf = find_confirmed_client_by_str(dname, strhashval);
+ conf = find_confirmed_client_by_name(&clname, nn);
if (conf) {
/* case 0: */
status = nfserr_clid_inuse;
@@ -2209,21 +2251,21 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
}
- unconf = find_unconfirmed_client_by_str(dname, strhashval);
+ unconf = find_unconfirmed_client_by_name(&clname, nn);
if (unconf)
expire_client(unconf);
status = nfserr_jukebox;
- new = create_client(clname, dname, rqstp, &clverifier);
+ new = create_client(clname, rqstp, &clverifier);
if (new == NULL)
goto out;
if (conf && same_verf(&conf->cl_verifier, &clverifier))
/* case 1: probable callback update */
copy_clid(new, conf);
else /* case 4 (new client) or cases 2, 3 (client reboot): */
- gen_clid(new);
+ gen_clid(new, nn);
new->cl_minorversion = 0;
gen_callback(new, setclid, rqstp);
- add_to_unconfirmed(new, strhashval);
+ add_to_unconfirmed(new);
setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
@@ -2243,14 +2285,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
nfs4_verifier confirm = setclientid_confirm->sc_confirm;
clientid_t * clid = &setclientid_confirm->sc_clientid;
__be32 status;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (STALE_CLIENTID(clid, nn))
return nfserr_stale_clientid;
nfs4_lock_state();
- conf = find_confirmed_client(clid, false);
- unconf = find_unconfirmed_client(clid, false);
+ conf = find_confirmed_client(clid, false, nn);
+ unconf = find_unconfirmed_client(clid, false, nn);
/*
* We try hard to give out unique clientid's, so if we get an
* attempt to confirm the same clientid with a different cred,
@@ -2276,9 +2318,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
nfsd4_probe_callback(conf);
expire_client(unconf);
} else { /* case 3: normal case; new or rebooted client */
- unsigned int hash = clientstr_hashval(unconf->cl_recdir);
-
- conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
+ conf = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (conf)
expire_client(conf);
move_to_confirmed(unconf);
@@ -2340,7 +2380,7 @@ nfsd4_init_slabs(void)
if (openowner_slab == NULL)
goto out_nomem;
lockowner_slab = kmem_cache_create("nfsd4_lockowners",
- sizeof(struct nfs4_openowner), 0, 0, NULL);
+ sizeof(struct nfs4_lockowner), 0, 0, NULL);
if (lockowner_slab == NULL)
goto out_nomem;
file_slab = kmem_cache_create("nfsd4_files",
@@ -2404,7 +2444,9 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
{
- list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
list_add(&oo->oo_perclient, &clp->cl_openowners);
}
@@ -2427,9 +2469,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
struct nfs4_openowner *oo = open->op_openowner;
- struct nfs4_client *clp = oo->oo_owner.so_client;
- init_stid(&stp->st_stid, clp, NFS4_OPEN_STID);
+ stp->st_stid.sc_type = NFS4_OPEN_STID;
INIT_LIST_HEAD(&stp->st_lockowners);
list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
list_add(&stp->st_perfile, &fp->fi_stateids);
@@ -2444,11 +2485,13 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
}
static void
-move_to_close_lru(struct nfs4_openowner *oo)
+move_to_close_lru(struct nfs4_openowner *oo, struct net *net)
{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
- list_move_tail(&oo->oo_close_lru, &close_lru);
+ list_move_tail(&oo->oo_close_lru, &nn->close_lru);
oo->oo_time = get_seconds();
}
@@ -2462,13 +2505,14 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
}
static struct nfs4_openowner *
-find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, bool sessions)
+find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
+ bool sessions, struct nfsd_net *nn)
{
struct nfs4_stateowner *so;
struct nfs4_openowner *oo;
struct nfs4_client *clp;
- list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
+ list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) {
if (!so->so_is_open_owner)
continue;
if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
@@ -2555,9 +2599,14 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
struct nfs4_delegation *dp;
- BUG_ON(!fp);
- /* We assume break_lease is only called once per lease: */
- BUG_ON(fp->fi_had_conflict);
+ if (!fp) {
+ WARN(1, "(%p)->fl_owner NULL\n", fl);
+ return;
+ }
+ if (fp->fi_had_conflict) {
+ WARN(1, "duplicate break on %p\n", fp);
+ return;
+ }
/*
* We don't want the locks code to timeout the lease for us;
* we'll remove it ourself if a delegation isn't returned
@@ -2599,14 +2648,13 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4
__be32
nfsd4_process_open1(struct nfsd4_compound_state *cstate,
- struct nfsd4_open *open)
+ struct nfsd4_open *open, struct nfsd_net *nn)
{
clientid_t *clientid = &open->op_clientid;
struct nfs4_client *clp = NULL;
unsigned int strhashval;
struct nfs4_openowner *oo = NULL;
__be32 status;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
if (STALE_CLIENTID(&open->op_clientid, nn))
return nfserr_stale_clientid;
@@ -2619,10 +2667,11 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
return nfserr_jukebox;
strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
- oo = find_openstateowner_str(strhashval, open, cstate->minorversion);
+ oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn);
open->op_openowner = oo;
if (!oo) {
- clp = find_confirmed_client(clientid, cstate->minorversion);
+ clp = find_confirmed_client(clientid, cstate->minorversion,
+ nn);
if (clp == NULL)
return nfserr_expired;
goto new_owner;
@@ -2891,7 +2940,7 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
open->op_why_no_deleg = WND4_CANCELLED;
break;
case NFS4_SHARE_WANT_NO_DELEG:
- BUG(); /* not supposed to get here */
+ WARN_ON_ONCE(1);
}
}
}
@@ -2959,6 +3008,7 @@ out:
}
return;
out_free:
+ unhash_stid(&dp->dl_stid);
nfs4_put_delegation(dp);
out_no_deleg:
flag = NFS4_OPEN_DELEGATE_NONE;
@@ -3104,27 +3154,32 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
free_generic_stateid(open->op_stp);
}
+static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp)
+{
+ struct nfs4_client *found;
+
+ if (STALE_CLIENTID(clid, nn))
+ return nfserr_stale_clientid;
+ found = find_confirmed_client(clid, session, nn);
+ if (clp)
+ *clp = found;
+ return found ? nfs_ok : nfserr_expired;
+}
+
__be32
nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
clientid_t *clid)
{
struct nfs4_client *clp;
__be32 status;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
nfs4_lock_state();
dprintk("process_renew(%08x/%08x): starting\n",
clid->cl_boot, clid->cl_id);
- status = nfserr_stale_clientid;
- if (STALE_CLIENTID(clid, nn))
- goto out;
- clp = find_confirmed_client(clid, cstate->minorversion);
- status = nfserr_expired;
- if (clp == NULL) {
- /* We assume the client took too long to RENEW. */
- dprintk("nfsd4_renew: clientid not found!\n");
+ status = lookup_clientid(clid, cstate->minorversion, nn, &clp);
+ if (status)
goto out;
- }
status = nfserr_cb_path_down;
if (!list_empty(&clp->cl_delegations)
&& clp->cl_cb_state != NFSD4_CB_UP)
@@ -3136,44 +3191,42 @@ out:
}
static void
-nfsd4_end_grace(struct net *net)
+nfsd4_end_grace(struct nfsd_net *nn)
{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
/* do nothing if grace period already ended */
if (nn->grace_ended)
return;
dprintk("NFSD: end of grace period\n");
nn->grace_ended = true;
- nfsd4_record_grace_done(net, nn->boot_time);
+ nfsd4_record_grace_done(nn, nn->boot_time);
locks_end_grace(&nn->nfsd4_manager);
/*
* Now that every NFSv4 client has had the chance to recover and
* to see the (possibly new, possibly shorter) lease time, we
* can safely set the next grace time to the current lease time:
*/
- nfsd4_grace = nfsd4_lease;
+ nn->nfsd4_grace = nn->nfsd4_lease;
}
static time_t
-nfs4_laundromat(void)
+nfs4_laundromat(struct nfsd_net *nn)
{
struct nfs4_client *clp;
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct list_head *pos, *next, reaplist;
- time_t cutoff = get_seconds() - nfsd4_lease;
- time_t t, clientid_val = nfsd4_lease;
- time_t u, test_val = nfsd4_lease;
+ time_t cutoff = get_seconds() - nn->nfsd4_lease;
+ time_t t, clientid_val = nn->nfsd4_lease;
+ time_t u, test_val = nn->nfsd4_lease;
nfs4_lock_state();
dprintk("NFSD: laundromat service - starting\n");
- nfsd4_end_grace(&init_net);
+ nfsd4_end_grace(nn);
INIT_LIST_HEAD(&reaplist);
- spin_lock(&client_lock);
- list_for_each_safe(pos, next, &client_lru) {
+ spin_lock(&nn->client_lock);
+ list_for_each_safe(pos, next, &nn->client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
t = clp->cl_time - cutoff;
@@ -3189,7 +3242,7 @@ nfs4_laundromat(void)
unhash_client_locked(clp);
list_add(&clp->cl_lru, &reaplist);
}
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
list_for_each_safe(pos, next, &reaplist) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
dprintk("NFSD: purging unused client (clientid %08x)\n",
@@ -3199,6 +3252,8 @@ nfs4_laundromat(void)
spin_lock(&recall_lock);
list_for_each_safe(pos, next, &del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+ if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
+ continue;
if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
u = dp->dl_time - cutoff;
if (test_val > u)
@@ -3212,8 +3267,8 @@ nfs4_laundromat(void)
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
unhash_delegation(dp);
}
- test_val = nfsd4_lease;
- list_for_each_safe(pos, next, &close_lru) {
+ test_val = nn->nfsd4_lease;
+ list_for_each_safe(pos, next, &nn->close_lru) {
oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
u = oo->oo_time - cutoff;
@@ -3231,16 +3286,19 @@ nfs4_laundromat(void)
static struct workqueue_struct *laundry_wq;
static void laundromat_main(struct work_struct *);
-static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main);
static void
-laundromat_main(struct work_struct *not_used)
+laundromat_main(struct work_struct *laundry)
{
time_t t;
+ struct delayed_work *dwork = container_of(laundry, struct delayed_work,
+ work);
+ struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
+ laundromat_work);
- t = nfs4_laundromat();
+ t = nfs4_laundromat(nn);
dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t);
- queue_delayed_work(laundry_wq, &laundromat_work, t*HZ);
+ queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
}
static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
@@ -3385,16 +3443,17 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
return nfs_ok;
}
-static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, bool sessions)
+static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
+ struct nfs4_stid **s, bool sessions,
+ struct nfsd_net *nn)
{
struct nfs4_client *cl;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return nfserr_bad_stateid;
if (STALE_STATEID(stateid, nn))
return nfserr_stale_stateid;
- cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions);
+ cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn);
if (!cl)
return nfserr_expired;
*s = find_stateid_by_type(cl, stateid, typemask);
@@ -3416,6 +3475,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
struct nfs4_delegation *dp = NULL;
struct svc_fh *current_fh = &cstate->current_fh;
struct inode *ino = current_fh->fh_dentry->d_inode;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
__be32 status;
if (filpp)
@@ -3427,7 +3487,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return check_special_stateids(net, current_fh, stateid, flags);
- status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion);
+ status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
+ &s, cstate->minorversion, nn);
if (status)
return status;
status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
@@ -3441,7 +3502,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
goto out;
if (filpp) {
*filpp = dp->dl_file->fi_deleg_file;
- BUG_ON(!*filpp);
+ if (!*filpp) {
+ WARN_ON_ONCE(1);
+ status = nfserr_serverfault;
+ goto out;
+ }
}
break;
case NFS4_OPEN_STID:
@@ -3568,7 +3633,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
static __be32
nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
stateid_t *stateid, char typemask,
- struct nfs4_ol_stateid **stpp)
+ struct nfs4_ol_stateid **stpp,
+ struct nfsd_net *nn)
{
__be32 status;
struct nfs4_stid *s;
@@ -3577,7 +3643,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
seqid, STATEID_VAL(stateid));
*stpp = NULL;
- status = nfsd4_lookup_stateid(stateid, typemask, &s, cstate->minorversion);
+ status = nfsd4_lookup_stateid(stateid, typemask, &s,
+ cstate->minorversion, nn);
if (status)
return status;
*stpp = openlockstateid(s);
@@ -3586,13 +3653,14 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
}
-static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp)
+static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+ stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn)
{
__be32 status;
struct nfs4_openowner *oo;
status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
- NFS4_OPEN_STID, stpp);
+ NFS4_OPEN_STID, stpp, nn);
if (status)
return status;
oo = openowner((*stpp)->st_stateowner);
@@ -3608,6 +3676,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfs4_openowner *oo;
struct nfs4_ol_stateid *stp;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3621,7 +3690,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_seqid_op(cstate,
oc->oc_seqid, &oc->oc_req_stateid,
- NFS4_OPEN_STID, &stp);
+ NFS4_OPEN_STID, &stp, nn);
if (status)
goto out;
oo = openowner(stp->st_stateowner);
@@ -3664,7 +3733,7 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
case NFS4_SHARE_ACCESS_BOTH:
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
}
}
@@ -3685,6 +3754,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
{
__be32 status;
struct nfs4_ol_stateid *stp;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3697,7 +3767,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
nfs4_lock_state();
status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
- &od->od_stateid, &stp);
+ &od->od_stateid, &stp, nn);
if (status)
goto out;
status = nfserr_inval;
@@ -3760,6 +3830,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfs4_openowner *oo;
struct nfs4_ol_stateid *stp;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
dprintk("NFSD: nfsd4_close on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3769,7 +3841,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
&close->cl_stateid,
NFS4_OPEN_STID|NFS4_CLOSED_STID,
- &stp);
+ &stp, nn);
if (status)
goto out;
oo = openowner(stp->st_stateowner);
@@ -3791,7 +3863,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* little while to handle CLOSE replay.
*/
if (list_empty(&oo->oo_owner.so_stateids))
- move_to_close_lru(oo);
+ move_to_close_lru(oo, SVC_NET(rqstp));
}
}
out:
@@ -3807,15 +3879,15 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfs4_delegation *dp;
stateid_t *stateid = &dr->dr_stateid;
struct nfs4_stid *s;
- struct inode *inode;
__be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- inode = cstate->current_fh.fh_dentry->d_inode;
nfs4_lock_state();
- status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion);
+ status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s,
+ cstate->minorversion, nn);
if (status)
goto out;
dp = delegstateid(s);
@@ -3833,8 +3905,6 @@ out:
#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
-#define LOCKOWNER_INO_HASH_BITS 8
-#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
static inline u64
@@ -3852,7 +3922,7 @@ last_byte_offset(u64 start, u64 len)
{
u64 end;
- BUG_ON(!len);
+ WARN_ON_ONCE(!len);
end = start + len;
return end > start ? end - 1: NFS4_MAX_UINT64;
}
@@ -3864,8 +3934,6 @@ static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct
& LOCKOWNER_INO_HASH_MASK;
}
-static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE];
-
/*
* TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
* we can't properly handle lock requests that go beyond the (2^63 - 1)-th
@@ -3931,12 +3999,12 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c
static struct nfs4_lockowner *
find_lockowner_str(struct inode *inode, clientid_t *clid,
- struct xdr_netobj *owner)
+ struct xdr_netobj *owner, struct nfsd_net *nn)
{
unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
struct nfs4_lockowner *lo;
- list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
+ list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
if (same_lockowner_ino(lo, inode, clid, owner))
return lo;
}
@@ -3948,9 +4016,10 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
struct inode *inode = open_stp->st_file->fi_inode;
unsigned int inohash = lockowner_ino_hashval(inode,
clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
- list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]);
+ list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
+ list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]);
list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
}
@@ -3987,7 +4056,7 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct
stp = nfs4_alloc_stateid(clp);
if (stp == NULL)
return NULL;
- init_stid(&stp->st_stid, clp, NFS4_LOCK_STID);
+ stp->st_stid.sc_type = NFS4_LOCK_STID;
list_add(&stp->st_perfile, &fp->fi_stateids);
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
stp->st_stateowner = &lo->lo_owner;
@@ -4024,8 +4093,10 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, s
struct nfs4_client *cl = oo->oo_owner.so_client;
struct nfs4_lockowner *lo;
unsigned int strhashval;
+ struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id);
- lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner);
+ lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid,
+ &lock->v.new.owner, nn);
if (lo) {
if (!cstate->minorversion)
return nfserr_bad_seqid;
@@ -4065,7 +4136,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
bool new_state = false;
int lkflg;
int err;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
(long long) lock->lk_offset,
@@ -4099,7 +4171,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_confirmed_seqid_op(cstate,
lock->lk_new_open_seqid,
&lock->lk_new_open_stateid,
- &open_stp);
+ &open_stp, nn);
if (status)
goto out;
open_sop = openowner(open_stp->st_stateowner);
@@ -4113,7 +4185,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
&lock->lk_old_lock_stateid,
- NFS4_LOCK_STID, &lock_stp);
+ NFS4_LOCK_STID, &lock_stp, nn);
if (status)
goto out;
lock_sop = lockowner(lock_stp->st_stateowner);
@@ -4124,10 +4196,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
status = nfserr_grace;
- if (locks_in_grace(SVC_NET(rqstp)) && !lock->lk_reclaim)
+ if (locks_in_grace(net) && !lock->lk_reclaim)
goto out;
status = nfserr_no_grace;
- if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim)
+ if (!locks_in_grace(net) && lock->lk_reclaim)
goto out;
file_lock = locks_alloc_lock();
@@ -4238,7 +4310,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct file_lock *file_lock = NULL;
struct nfs4_lockowner *lo;
__be32 status;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (locks_in_grace(SVC_NET(rqstp)))
return nfserr_grace;
@@ -4248,9 +4320,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
- status = nfserr_stale_clientid;
- if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid, nn))
- goto out;
+ if (!nfsd4_has_session(cstate)) {
+ status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL);
+ if (status)
+ goto out;
+ }
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
goto out;
@@ -4278,7 +4352,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
+ lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn);
if (lo)
file_lock->fl_owner = (fl_owner_t)lo;
file_lock->fl_pid = current->tgid;
@@ -4313,7 +4387,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct file_lock *file_lock = NULL;
__be32 status;
int err;
-
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
(long long) locku->lu_offset,
(long long) locku->lu_length);
@@ -4324,7 +4399,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
- &locku->lu_stateid, NFS4_LOCK_STID, &stp);
+ &locku->lu_stateid, NFS4_LOCK_STID,
+ &stp, nn);
if (status)
goto out;
filp = find_any_file(stp->st_file);
@@ -4414,23 +4490,21 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
struct list_head matches;
unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
__be32 status;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
clid->cl_boot, clid->cl_id);
- /* XXX check for lease expiration */
-
- status = nfserr_stale_clientid;
- if (STALE_CLIENTID(clid, nn))
- return status;
-
nfs4_lock_state();
+ status = lookup_clientid(clid, cstate->minorversion, nn, NULL);
+ if (status)
+ goto out;
+
status = nfserr_locks_held;
INIT_LIST_HEAD(&matches);
- list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) {
+ list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) {
if (sop->so_is_open_owner)
continue;
if (!same_owner_str(sop, owner, clid))
@@ -4466,73 +4540,74 @@ alloc_reclaim(void)
return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
}
-int
-nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
+bool
+nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn)
{
- unsigned int strhashval = clientstr_hashval(name);
- struct nfs4_client *clp;
+ struct nfs4_client_reclaim *crp;
- clp = find_confirmed_client_by_str(name, strhashval);
- if (!clp)
- return 0;
- return test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ crp = nfsd4_find_reclaim_client(name, nn);
+ return (crp && crp->cr_clp);
}
/*
* failure => all reset bets are off, nfserr_no_grace...
*/
-int
-nfs4_client_to_reclaim(const char *name)
+struct nfs4_client_reclaim *
+nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn)
{
unsigned int strhashval;
- struct nfs4_client_reclaim *crp = NULL;
+ struct nfs4_client_reclaim *crp;
dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name);
crp = alloc_reclaim();
- if (!crp)
- return 0;
- strhashval = clientstr_hashval(name);
- INIT_LIST_HEAD(&crp->cr_strhash);
- list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]);
- memcpy(crp->cr_recdir, name, HEXDIR_LEN);
- reclaim_str_hashtbl_size++;
- return 1;
+ if (crp) {
+ strhashval = clientstr_hashval(name);
+ INIT_LIST_HEAD(&crp->cr_strhash);
+ list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
+ memcpy(crp->cr_recdir, name, HEXDIR_LEN);
+ crp->cr_clp = NULL;
+ nn->reclaim_str_hashtbl_size++;
+ }
+ return crp;
+}
+
+void
+nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
+{
+ list_del(&crp->cr_strhash);
+ kfree(crp);
+ nn->reclaim_str_hashtbl_size--;
}
void
-nfs4_release_reclaim(void)
+nfs4_release_reclaim(struct nfsd_net *nn)
{
struct nfs4_client_reclaim *crp = NULL;
int i;
for (i = 0; i < CLIENT_HASH_SIZE; i++) {
- while (!list_empty(&reclaim_str_hashtbl[i])) {
- crp = list_entry(reclaim_str_hashtbl[i].next,
+ while (!list_empty(&nn->reclaim_str_hashtbl[i])) {
+ crp = list_entry(nn->reclaim_str_hashtbl[i].next,
struct nfs4_client_reclaim, cr_strhash);
- list_del(&crp->cr_strhash);
- kfree(crp);
- reclaim_str_hashtbl_size--;
+ nfs4_remove_reclaim_record(crp, nn);
}
}
- BUG_ON(reclaim_str_hashtbl_size);
+ WARN_ON_ONCE(nn->reclaim_str_hashtbl_size);
}
/*
* called from OPEN, CLAIM_PREVIOUS with a new clientid. */
struct nfs4_client_reclaim *
-nfsd4_find_reclaim_client(struct nfs4_client *clp)
+nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
{
unsigned int strhashval;
struct nfs4_client_reclaim *crp = NULL;
- dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n",
- clp->cl_name.len, clp->cl_name.data,
- clp->cl_recdir);
+ dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir);
- /* find clp->cl_name in reclaim_str_hashtbl */
- strhashval = clientstr_hashval(clp->cl_recdir);
- list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) {
- if (same_name(crp->cr_recdir, clp->cl_recdir)) {
+ strhashval = clientstr_hashval(recdir);
+ list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) {
+ if (same_name(crp->cr_recdir, recdir)) {
return crp;
}
}
@@ -4543,12 +4618,12 @@ nfsd4_find_reclaim_client(struct nfs4_client *clp)
* Called from OPEN. Look for clientid in reclaim list.
*/
__be32
-nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
+nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)
{
struct nfs4_client *clp;
/* find clientid in conf_id_hashtbl */
- clp = find_confirmed_client(clid, sessions);
+ clp = find_confirmed_client(clid, sessions, nn);
if (clp == NULL)
return nfserr_reclaim_bad;
@@ -4557,124 +4632,177 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
#ifdef CONFIG_NFSD_FAULT_INJECTION
-void nfsd_forget_clients(u64 num)
+u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)
{
- struct nfs4_client *clp, *next;
- int count = 0;
-
- nfs4_lock_state();
- list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
- expire_client(clp);
- if (++count == num)
- break;
- }
- nfs4_unlock_state();
-
- printk(KERN_INFO "NFSD: Forgot %d clients", count);
+ expire_client(clp);
+ return 1;
}
-static void release_lockowner_sop(struct nfs4_stateowner *sop)
+u64 nfsd_print_client(struct nfs4_client *clp, u64 num)
{
- release_lockowner(lockowner(sop));
+ char buf[INET6_ADDRSTRLEN];
+ rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+ printk(KERN_INFO "NFS Client: %s\n", buf);
+ return 1;
}
-static void release_openowner_sop(struct nfs4_stateowner *sop)
+static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
+ const char *type)
{
- release_openowner(openowner(sop));
+ char buf[INET6_ADDRSTRLEN];
+ rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+ printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);
}
-static int nfsd_release_n_owners(u64 num, bool is_open_owner,
- void (*release_sop)(struct nfs4_stateowner *))
+static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *))
{
- int i, count = 0;
- struct nfs4_stateowner *sop, *next;
+ struct nfs4_openowner *oop;
+ struct nfs4_lockowner *lop, *lo_next;
+ struct nfs4_ol_stateid *stp, *st_next;
+ u64 count = 0;
- for (i = 0; i < OWNER_HASH_SIZE; i++) {
- list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) {
- if (sop->so_is_open_owner != is_open_owner)
- continue;
- release_sop(sop);
- if (++count == num)
- return count;
+ list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) {
+ list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) {
+ list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) {
+ if (func)
+ func(lop);
+ if (++count == max)
+ return count;
+ }
}
}
+
return count;
}
-void nfsd_forget_locks(u64 num)
+u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max)
{
- int count;
-
- nfs4_lock_state();
- count = nfsd_release_n_owners(num, false, release_lockowner_sop);
- nfs4_unlock_state();
+ return nfsd_foreach_client_lock(clp, max, release_lockowner);
+}
- printk(KERN_INFO "NFSD: Forgot %d locks", count);
+u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max)
+{
+ u64 count = nfsd_foreach_client_lock(clp, max, NULL);
+ nfsd_print_count(clp, count, "locked files");
+ return count;
}
-void nfsd_forget_openowners(u64 num)
+static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *))
{
- int count;
+ struct nfs4_openowner *oop, *next;
+ u64 count = 0;
- nfs4_lock_state();
- count = nfsd_release_n_owners(num, true, release_openowner_sop);
- nfs4_unlock_state();
+ list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) {
+ if (func)
+ func(oop);
+ if (++count == max)
+ break;
+ }
- printk(KERN_INFO "NFSD: Forgot %d open owners", count);
+ return count;
}
-static int nfsd_process_n_delegations(u64 num, struct list_head *list)
+u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max)
{
- int i, count = 0;
- struct nfs4_file *fp, *fnext;
- struct nfs4_delegation *dp, *dnext;
+ return nfsd_foreach_client_open(clp, max, release_openowner);
+}
- for (i = 0; i < FILE_HASH_SIZE; i++) {
- list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) {
- list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) {
- list_move(&dp->dl_recall_lru, list);
- if (++count == num)
- return count;
- }
- }
- }
+u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max)
+{
+ u64 count = nfsd_foreach_client_open(clp, max, NULL);
+ nfsd_print_count(clp, count, "open files");
+ return count;
+}
+static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
+ struct list_head *victims)
+{
+ struct nfs4_delegation *dp, *next;
+ u64 count = 0;
+
+ list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
+ if (victims)
+ list_move(&dp->dl_recall_lru, victims);
+ if (++count == max)
+ break;
+ }
return count;
}
-void nfsd_forget_delegations(u64 num)
+u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
{
- unsigned int count;
+ struct nfs4_delegation *dp, *next;
LIST_HEAD(victims);
- struct nfs4_delegation *dp, *dnext;
+ u64 count;
spin_lock(&recall_lock);
- count = nfsd_process_n_delegations(num, &victims);
+ count = nfsd_find_all_delegations(clp, max, &victims);
spin_unlock(&recall_lock);
- nfs4_lock_state();
- list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru)
+ list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
unhash_delegation(dp);
- nfs4_unlock_state();
- printk(KERN_INFO "NFSD: Forgot %d delegations", count);
+ return count;
}
-void nfsd_recall_delegations(u64 num)
+u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max)
{
- unsigned int count;
+ struct nfs4_delegation *dp, *next;
LIST_HEAD(victims);
- struct nfs4_delegation *dp, *dnext;
+ u64 count;
spin_lock(&recall_lock);
- count = nfsd_process_n_delegations(num, &victims);
- list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) {
- list_del(&dp->dl_recall_lru);
+ count = nfsd_find_all_delegations(clp, max, &victims);
+ list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
nfsd_break_one_deleg(dp);
- }
spin_unlock(&recall_lock);
- printk(KERN_INFO "NFSD: Recalled %d delegations", count);
+ return count;
+}
+
+u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max)
+{
+ u64 count = 0;
+
+ spin_lock(&recall_lock);
+ count = nfsd_find_all_delegations(clp, max, NULL);
+ spin_unlock(&recall_lock);
+
+ nfsd_print_count(clp, count, "delegations");
+ return count;
+}
+
+u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64))
+{
+ struct nfs4_client *clp, *next;
+ u64 count = 0;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
+
+ if (!nfsd_netns_ready(nn))
+ return 0;
+
+ list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
+ count += func(clp, max - count);
+ if ((max != 0) && (count >= max))
+ break;
+ }
+
+ return count;
+}
+
+struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
+{
+ struct nfs4_client *clp;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
+
+ if (!nfsd_netns_ready(nn))
+ return NULL;
+
+ list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+ if (memcmp(&clp->cl_addr, addr, addr_size) == 0)
+ return clp;
+ }
+ return NULL;
}
#endif /* CONFIG_NFSD_FAULT_INJECTION */
@@ -4686,27 +4814,10 @@ nfs4_state_init(void)
{
int i;
- for (i = 0; i < CLIENT_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&conf_id_hashtbl[i]);
- INIT_LIST_HEAD(&conf_str_hashtbl[i]);
- INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
- INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
- INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
- }
- for (i = 0; i < SESSION_HASH_SIZE; i++)
- INIT_LIST_HEAD(&sessionid_hashtbl[i]);
for (i = 0; i < FILE_HASH_SIZE; i++) {
INIT_LIST_HEAD(&file_hashtbl[i]);
}
- for (i = 0; i < OWNER_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
- }
- for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
- INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]);
- INIT_LIST_HEAD(&close_lru);
- INIT_LIST_HEAD(&client_lru);
INIT_LIST_HEAD(&del_recall_lru);
- reclaim_str_hashtbl_size = 0;
}
/*
@@ -4730,34 +4841,126 @@ set_max_delegations(void)
max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
}
-/* initialization to perform when the nfsd service is started: */
+static int nfs4_state_create_net(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int i;
+
+ nn->conf_id_hashtbl = kmalloc(sizeof(struct list_head) *
+ CLIENT_HASH_SIZE, GFP_KERNEL);
+ if (!nn->conf_id_hashtbl)
+ goto err;
+ nn->unconf_id_hashtbl = kmalloc(sizeof(struct list_head) *
+ CLIENT_HASH_SIZE, GFP_KERNEL);
+ if (!nn->unconf_id_hashtbl)
+ goto err_unconf_id;
+ nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
+ OWNER_HASH_SIZE, GFP_KERNEL);
+ if (!nn->ownerstr_hashtbl)
+ goto err_ownerstr;
+ nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) *
+ LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL);
+ if (!nn->lockowner_ino_hashtbl)
+ goto err_lockowner_ino;
+ nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) *
+ SESSION_HASH_SIZE, GFP_KERNEL);
+ if (!nn->sessionid_hashtbl)
+ goto err_sessionid;
+
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+ INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]);
+ INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]);
+ }
+ for (i = 0; i < OWNER_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]);
+ for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]);
+ for (i = 0; i < SESSION_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]);
+ nn->conf_name_tree = RB_ROOT;
+ nn->unconf_name_tree = RB_ROOT;
+ INIT_LIST_HEAD(&nn->client_lru);
+ INIT_LIST_HEAD(&nn->close_lru);
+ spin_lock_init(&nn->client_lock);
+
+ INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
+ get_net(net);
+
+ return 0;
+
+err_sessionid:
+ kfree(nn->lockowner_ino_hashtbl);
+err_lockowner_ino:
+ kfree(nn->ownerstr_hashtbl);
+err_ownerstr:
+ kfree(nn->unconf_id_hashtbl);
+err_unconf_id:
+ kfree(nn->conf_id_hashtbl);
+err:
+ return -ENOMEM;
+}
+
+static void
+nfs4_state_destroy_net(struct net *net)
+{
+ int i;
+ struct nfs4_client *clp = NULL;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct rb_node *node, *tmp;
+
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+ while (!list_empty(&nn->conf_id_hashtbl[i])) {
+ clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
+ destroy_client(clp);
+ }
+ }
+
+ node = rb_first(&nn->unconf_name_tree);
+ while (node != NULL) {
+ tmp = node;
+ node = rb_next(tmp);
+ clp = rb_entry(tmp, struct nfs4_client, cl_namenode);
+ rb_erase(tmp, &nn->unconf_name_tree);
+ destroy_client(clp);
+ }
+
+ kfree(nn->sessionid_hashtbl);
+ kfree(nn->lockowner_ino_hashtbl);
+ kfree(nn->ownerstr_hashtbl);
+ kfree(nn->unconf_id_hashtbl);
+ kfree(nn->conf_id_hashtbl);
+ put_net(net);
+}
int
-nfs4_state_start(void)
+nfs4_state_start_net(struct net *net)
{
- struct net *net = &init_net;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int ret;
- /*
- * FIXME: For now, we hang most of the pernet global stuff off of
- * init_net until nfsd is fully containerized. Eventually, we'll
- * need to pass a net pointer into this function, take a reference
- * to that instead and then do most of the rest of this on a per-net
- * basis.
- */
- get_net(net);
+ ret = nfs4_state_create_net(net);
+ if (ret)
+ return ret;
nfsd4_client_tracking_init(net);
nn->boot_time = get_seconds();
locks_start_grace(net, &nn->nfsd4_manager);
nn->grace_ended = false;
- printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
- nfsd4_grace);
+ printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
+ nn->nfsd4_grace, net);
+ queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
+ return 0;
+}
+
+/* initialization to perform when the nfsd service is started: */
+
+int
+nfs4_state_start(void)
+{
+ int ret;
+
ret = set_callback_cred();
- if (ret) {
- ret = -ENOMEM;
- goto out_recovery;
- }
+ if (ret)
+ return -ENOMEM;
laundry_wq = create_singlethread_workqueue("nfsd4");
if (laundry_wq == NULL) {
ret = -ENOMEM;
@@ -4766,39 +4969,34 @@ nfs4_state_start(void)
ret = nfsd4_create_callback_queue();
if (ret)
goto out_free_laundry;
- queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
+
set_max_delegations();
+
return 0;
+
out_free_laundry:
destroy_workqueue(laundry_wq);
out_recovery:
- nfsd4_client_tracking_exit(net);
- put_net(net);
return ret;
}
-static void
-__nfs4_state_shutdown(void)
+/* should be called with the state lock held */
+void
+nfs4_state_shutdown_net(struct net *net)
{
- int i;
- struct nfs4_client *clp = NULL;
struct nfs4_delegation *dp = NULL;
struct list_head *pos, *next, reaplist;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ cancel_delayed_work_sync(&nn->laundromat_work);
+ locks_end_grace(&nn->nfsd4_manager);
- for (i = 0; i < CLIENT_HASH_SIZE; i++) {
- while (!list_empty(&conf_id_hashtbl[i])) {
- clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
- destroy_client(clp);
- }
- while (!list_empty(&unconf_str_hashtbl[i])) {
- clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash);
- destroy_client(clp);
- }
- }
INIT_LIST_HEAD(&reaplist);
spin_lock(&recall_lock);
list_for_each_safe(pos, next, &del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+ if (dp->dl_stid.sc_client->net != net)
+ continue;
list_move(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&recall_lock);
@@ -4807,22 +5005,14 @@ __nfs4_state_shutdown(void)
unhash_delegation(dp);
}
- nfsd4_client_tracking_exit(&init_net);
- put_net(&init_net);
+ nfsd4_client_tracking_exit(net);
+ nfs4_state_destroy_net(net);
}
void
nfs4_state_shutdown(void)
{
- struct net *net = &init_net;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
- cancel_delayed_work_sync(&laundromat_work);
destroy_workqueue(laundry_wq);
- locks_end_grace(&nn->nfsd4_manager);
- nfs4_lock_state();
- __nfs4_state_shutdown();
- nfs4_unlock_state();
nfsd4_destroy_callback_queue();
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index fd548d155088..01168865dd37 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -53,6 +53,7 @@
#include "vfs.h"
#include "state.h"
#include "cache.h"
+#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_XDR
@@ -65,17 +66,17 @@
#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL
static __be32
-check_filename(char *str, int len, __be32 err)
+check_filename(char *str, int len)
{
int i;
if (len == 0)
return nfserr_inval;
if (isdotent(str, len))
- return err;
+ return nfserr_badname;
for (i = 0; i < len; i++)
if (str[i] == '/')
- return err;
+ return nfserr_badname;
return 0;
}
@@ -292,13 +293,13 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
status = nfs_ok;
if (ace->whotype != NFS4_ACL_WHO_NAMED)
- ace->who = 0;
+ ;
else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
status = nfsd_map_name_to_gid(argp->rqstp,
- buf, dummy32, &ace->who);
+ buf, dummy32, &ace->who_gid);
else
status = nfsd_map_name_to_uid(argp->rqstp,
- buf, dummy32, &ace->who);
+ buf, dummy32, &ace->who_uid);
if (status)
return status;
}
@@ -422,6 +423,93 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
DECODE_TAIL;
}
+static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
+{
+ DECODE_HEAD;
+ u32 dummy, uid, gid;
+ char *machine_name;
+ int i;
+ int nr_secflavs;
+
+ /* callback_sec_params4 */
+ READ_BUF(4);
+ READ32(nr_secflavs);
+ cbs->flavor = (u32)(-1);
+ for (i = 0; i < nr_secflavs; ++i) {
+ READ_BUF(4);
+ READ32(dummy);
+ switch (dummy) {
+ case RPC_AUTH_NULL:
+ /* Nothing to read */
+ if (cbs->flavor == (u32)(-1))
+ cbs->flavor = RPC_AUTH_NULL;
+ break;
+ case RPC_AUTH_UNIX:
+ READ_BUF(8);
+ /* stamp */
+ READ32(dummy);
+
+ /* machine name */
+ READ32(dummy);
+ READ_BUF(dummy);
+ SAVEMEM(machine_name, dummy);
+
+ /* uid, gid */
+ READ_BUF(8);
+ READ32(uid);
+ READ32(gid);
+
+ /* more gids */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy * 4);
+ if (cbs->flavor == (u32)(-1)) {
+ kuid_t kuid = make_kuid(&init_user_ns, uid);
+ kgid_t kgid = make_kgid(&init_user_ns, gid);
+ if (uid_valid(kuid) && gid_valid(kgid)) {
+ cbs->uid = kuid;
+ cbs->gid = kgid;
+ cbs->flavor = RPC_AUTH_UNIX;
+ } else {
+ dprintk("RPC_AUTH_UNIX with invalid"
+ "uid or gid ignoring!\n");
+ }
+ }
+ break;
+ case RPC_AUTH_GSS:
+ dprintk("RPC_AUTH_GSS callback secflavor "
+ "not supported!\n");
+ READ_BUF(8);
+ /* gcbp_service */
+ READ32(dummy);
+ /* gcbp_handle_from_server */
+ READ32(dummy);
+ READ_BUF(dummy);
+ p += XDR_QUADLEN(dummy);
+ /* gcbp_handle_from_client */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy);
+ break;
+ default:
+ dprintk("Illegal callback secflavor\n");
+ return nfserr_inval;
+ }
+ }
+ DECODE_TAIL;
+}
+
+static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
+{
+ DECODE_HEAD;
+
+ READ_BUF(4);
+ READ32(bc->bc_cb_program);
+ nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
+
+ DECODE_TAIL;
+}
+
static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
{
DECODE_HEAD;
@@ -490,7 +578,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
READ32(create->cr_namelen);
READ_BUF(create->cr_namelen);
SAVEMEM(create->cr_name, create->cr_namelen);
- if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
+ if ((status = check_filename(create->cr_name, create->cr_namelen)))
return status;
status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
@@ -522,7 +610,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
READ32(link->li_namelen);
READ_BUF(link->li_namelen);
SAVEMEM(link->li_name, link->li_namelen);
- if ((status = check_filename(link->li_name, link->li_namelen, nfserr_inval)))
+ if ((status = check_filename(link->li_name, link->li_namelen)))
return status;
DECODE_TAIL;
@@ -616,7 +704,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
READ32(lookup->lo_len);
READ_BUF(lookup->lo_len);
SAVEMEM(lookup->lo_name, lookup->lo_len);
- if ((status = check_filename(lookup->lo_name, lookup->lo_len, nfserr_noent)))
+ if ((status = check_filename(lookup->lo_name, lookup->lo_len)))
return status;
DECODE_TAIL;
@@ -780,7 +868,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ32(open->op_fname.len);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
- if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
+ if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
return status;
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
@@ -795,7 +883,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ32(open->op_fname.len);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
- if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
+ if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
return status;
break;
case NFS4_OPEN_CLAIM_FH:
@@ -907,7 +995,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
READ32(remove->rm_namelen);
READ_BUF(remove->rm_namelen);
SAVEMEM(remove->rm_name, remove->rm_namelen);
- if ((status = check_filename(remove->rm_name, remove->rm_namelen, nfserr_noent)))
+ if ((status = check_filename(remove->rm_name, remove->rm_namelen)))
return status;
DECODE_TAIL;
@@ -925,9 +1013,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
READ32(rename->rn_tnamelen);
READ_BUF(rename->rn_tnamelen);
SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
- if ((status = check_filename(rename->rn_sname, rename->rn_snamelen, nfserr_noent)))
+ if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))
return status;
- if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen, nfserr_inval)))
+ if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen)))
return status;
DECODE_TAIL;
@@ -954,8 +1042,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
READ32(secinfo->si_namelen);
READ_BUF(secinfo->si_namelen);
SAVEMEM(secinfo->si_name, secinfo->si_namelen);
- status = check_filename(secinfo->si_name, secinfo->si_namelen,
- nfserr_noent);
+ status = check_filename(secinfo->si_name, secinfo->si_namelen);
if (status)
return status;
DECODE_TAIL;
@@ -1026,31 +1113,14 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
static __be32
nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
{
-#if 0
- struct nfsd4_compoundargs save = {
- .p = argp->p,
- .end = argp->end,
- .rqstp = argp->rqstp,
- };
- u32 ve_bmval[2];
- struct iattr ve_iattr; /* request */
- struct nfs4_acl *ve_acl; /* request */
-#endif
DECODE_HEAD;
if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval)))
goto out;
/* For convenience's sake, we compare raw xdr'd attributes in
- * nfsd4_proc_verify; however we still decode here just to return
- * correct error in case of bad xdr. */
-#if 0
- status = nfsd4_decode_fattr(ve_bmval, &ve_iattr, &ve_acl);
- if (status == nfserr_inval) {
- status = nfserrno(status);
- goto out;
- }
-#endif
+ * nfsd4_proc_verify */
+
READ_BUF(4);
READ32(verify->ve_attrlen);
READ_BUF(verify->ve_attrlen);
@@ -1063,7 +1133,6 @@ static __be32
nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
{
int avail;
- int v;
int len;
DECODE_HEAD;
@@ -1087,27 +1156,26 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
__FILE__, __LINE__);
goto xdr_error;
}
- argp->rqstp->rq_vec[0].iov_base = p;
- argp->rqstp->rq_vec[0].iov_len = avail;
- v = 0;
- len = write->wr_buflen;
- while (len > argp->rqstp->rq_vec[v].iov_len) {
- len -= argp->rqstp->rq_vec[v].iov_len;
- v++;
- argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]);
- argp->pagelist++;
- if (argp->pagelen >= PAGE_SIZE) {
- argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE;
- argp->pagelen -= PAGE_SIZE;
- } else {
- argp->rqstp->rq_vec[v].iov_len = argp->pagelen;
- argp->pagelen -= len;
- }
+ write->wr_head.iov_base = p;
+ write->wr_head.iov_len = avail;
+ WARN_ON(avail != (XDR_QUADLEN(avail) << 2));
+ write->wr_pagelist = argp->pagelist;
+
+ len = XDR_QUADLEN(write->wr_buflen) << 2;
+ if (len >= avail) {
+ int pages;
+
+ len -= avail;
+
+ pages = len >> PAGE_SHIFT;
+ argp->pagelist += pages;
+ argp->pagelen -= pages * PAGE_SIZE;
+ len -= pages * PAGE_SIZE;
+
+ argp->p = (__be32 *)page_address(argp->pagelist[0]);
+ argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);
}
- argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len);
- argp->p = (__be32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
- argp->rqstp->rq_vec[v].iov_len = len;
- write->wr_vlen = v+1;
+ argp->p += XDR_QUADLEN(len);
DECODE_TAIL;
}
@@ -1237,11 +1305,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
struct nfsd4_create_session *sess)
{
DECODE_HEAD;
-
u32 dummy;
- char *machine_name;
- int i;
- int nr_secflavs;
READ_BUF(16);
COPYMEM(&sess->clientid, 8);
@@ -1282,58 +1346,9 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
goto xdr_error;
}
- READ_BUF(8);
+ READ_BUF(4);
READ32(sess->callback_prog);
-
- /* callback_sec_params4 */
- READ32(nr_secflavs);
- for (i = 0; i < nr_secflavs; ++i) {
- READ_BUF(4);
- READ32(dummy);
- switch (dummy) {
- case RPC_AUTH_NULL:
- /* Nothing to read */
- break;
- case RPC_AUTH_UNIX:
- READ_BUF(8);
- /* stamp */
- READ32(dummy);
-
- /* machine name */
- READ32(dummy);
- READ_BUF(dummy);
- SAVEMEM(machine_name, dummy);
-
- /* uid, gid */
- READ_BUF(8);
- READ32(sess->uid);
- READ32(sess->gid);
-
- /* more gids */
- READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy * 4);
- break;
- case RPC_AUTH_GSS:
- dprintk("RPC_AUTH_GSS callback secflavor "
- "not supported!\n");
- READ_BUF(8);
- /* gcbp_service */
- READ32(dummy);
- /* gcbp_handle_from_server */
- READ32(dummy);
- READ_BUF(dummy);
- p += XDR_QUADLEN(dummy);
- /* gcbp_handle_from_client */
- READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy);
- break;
- default:
- dprintk("Illegal callback secflavor\n");
- return nfserr_inval;
- }
- }
+ nfsd4_decode_cb_sec(argp, &sess->cb_sec);
DECODE_TAIL;
}
@@ -1528,7 +1543,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
[OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp,
/* new operations for NFSv4.1 */
- [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl,
[OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
[OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
[OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
@@ -1568,12 +1583,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
bool cachethis = false;
int i;
- /*
- * XXX: According to spec, we should check the tag
- * for UTF-8 compliance. I'm postponing this for
- * now because it seems that some clients do use
- * binary tags.
- */
READ_BUF(4);
READ32(argp->taglen);
READ_BUF(argp->taglen + 8);
@@ -1603,38 +1612,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
op = &argp->ops[i];
op->replay = NULL;
- /*
- * We can't use READ_BUF() here because we need to handle
- * a missing opcode as an OP_WRITE + 1. So we need to check
- * to see if we're truly at the end of our buffer or if there
- * is another page we need to flip to.
- */
-
- if (argp->p == argp->end) {
- if (argp->pagelen < 4) {
- /* There isn't an opcode still on the wire */
- op->opnum = OP_WRITE + 1;
- op->status = nfserr_bad_xdr;
- argp->opcnt = i+1;
- break;
- }
-
- /*
- * False alarm. We just hit a page boundary, but there
- * is still data available. Move pointer across page
- * boundary. *snip from READ_BUF*
- */
- argp->p = page_address(argp->pagelist[0]);
- argp->pagelist++;
- if (argp->pagelen < PAGE_SIZE) {
- argp->end = argp->p + (argp->pagelen>>2);
- argp->pagelen = 0;
- } else {
- argp->end = argp->p + (PAGE_SIZE>>2);
- argp->pagelen -= PAGE_SIZE;
- }
- }
- op->opnum = ntohl(*argp->p++);
+ READ_BUF(4);
+ READ32(op->opnum);
if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
op->status = ops->decoders[op->opnum](argp, &op->u);
@@ -1954,7 +1933,7 @@ static u32 nfs4_file_type(umode_t mode)
}
static __be32
-nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
+nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid,
__be32 **p, int *buflen)
{
int status;
@@ -1963,10 +1942,10 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
return nfserr_resource;
if (whotype != NFS4_ACL_WHO_NAMED)
status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1));
- else if (group)
- status = nfsd_map_gid_to_name(rqstp, id, (u8 *)(*p + 1));
+ else if (gid_valid(gid))
+ status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1));
else
- status = nfsd_map_uid_to_name(rqstp, id, (u8 *)(*p + 1));
+ status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1));
if (status < 0)
return nfserrno(status);
*p = xdr_encode_opaque(*p, NULL, status);
@@ -1976,22 +1955,33 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
}
static inline __be32
-nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, __be32 **p, int *buflen)
+nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen)
{
- return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen);
+ return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID,
+ p, buflen);
}
static inline __be32
-nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen)
+nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen)
{
- return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen);
+ return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group,
+ p, buflen);
}
static inline __be32
-nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
+nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
__be32 **p, int *buflen)
{
- return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen);
+ kuid_t uid = INVALID_UID;
+ kgid_t gid = INVALID_GID;
+
+ if (ace->whotype == NFS4_ACL_WHO_NAMED) {
+ if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+ gid = ace->who_gid;
+ else
+ uid = ace->who_uid;
+ }
+ return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen);
}
#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
@@ -2014,16 +2004,31 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
return 0;
}
+
+static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
+{
+ struct path path = exp->ex_path;
+ int err;
+
+ path_get(&path);
+ while (follow_up(&path)) {
+ if (path.dentry != path.mnt->mnt_root)
+ break;
+ }
+ err = vfs_getattr(&path, stat);
+ path_put(&path);
+ return err;
+}
+
/*
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
* ourselves.
*
- * @countp is the buffer size in _words_; upon successful return this becomes
- * replaced with the number of words written.
+ * countp is the buffer size in _words_
*/
__be32
nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
+ struct dentry *dentry, __be32 **buffer, int count, u32 *bmval,
struct svc_rqst *rqstp, int ignore_crossmnt)
{
u32 bmval0 = bmval[0];
@@ -2032,12 +2037,12 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
struct kstat stat;
struct svc_fh tempfh;
struct kstatfs statfs;
- int buflen = *countp << 2;
+ int buflen = count << 2;
__be32 *attrlenp;
u32 dummy;
u64 dummy64;
u32 rdattr_err = 0;
- __be32 *p = buffer;
+ __be32 *p = *buffer;
__be32 status;
int err;
int aclsupport = 0;
@@ -2048,6 +2053,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
.mnt = exp->ex_path.mnt,
.dentry = dentry,
};
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
@@ -2061,7 +2067,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
goto out;
}
- err = vfs_getattr(exp->ex_path.mnt, dentry, &stat);
+ err = vfs_getattr(&path, &stat);
if (err)
goto out_nfserr;
if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL |
@@ -2208,7 +2214,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
if ((buflen -= 4) < 0)
goto out_resource;
- WRITE32(nfsd4_lease);
+ WRITE32(nn->nfsd4_lease);
}
if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
if ((buflen -= 4) < 0)
@@ -2235,9 +2241,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
WRITE32(ace->type);
WRITE32(ace->flag);
WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL);
- status = nfsd4_encode_aclname(rqstp, ace->whotype,
- ace->who, ace->flag & NFS4_ACE_IDENTIFIER_GROUP,
- &p, &buflen);
+ status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen);
if (status == nfserr_resource)
goto out_resource;
if (status)
@@ -2430,18 +2434,8 @@ out_acl:
* and this is the root of a cross-mounted filesystem.
*/
if (ignore_crossmnt == 0 &&
- dentry == exp->ex_path.mnt->mnt_root) {
- struct path path = exp->ex_path;
- path_get(&path);
- while (follow_up(&path)) {
- if (path.dentry != path.mnt->mnt_root)
- break;
- }
- err = vfs_getattr(path.mnt, path.dentry, &stat);
- path_put(&path);
- if (err)
- goto out_nfserr;
- }
+ dentry == exp->ex_path.mnt->mnt_root)
+ get_parent_attributes(exp, &stat);
WRITE64(stat.ino);
}
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
@@ -2452,7 +2446,7 @@ out_acl:
}
*attrlenp = htonl((char *)p - (char *)attrlenp - 4);
- *countp = p - buffer;
+ *buffer = p;
status = nfs_ok;
out:
@@ -2464,7 +2458,6 @@ out_nfserr:
status = nfserrno(err);
goto out;
out_resource:
- *countp = 0;
status = nfserr_resource;
goto out;
out_serverfault:
@@ -2483,7 +2476,7 @@ static inline int attributes_need_mount(u32 *bmval)
static __be32
nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
- const char *name, int namlen, __be32 *p, int *buflen)
+ const char *name, int namlen, __be32 **p, int buflen)
{
struct svc_export *exp = cd->rd_fhp->fh_export;
struct dentry *dentry;
@@ -2589,10 +2582,9 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
p = xdr_encode_array(p, name, namlen); /* name length & name */
- nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen);
+ nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen);
switch (nfserr) {
case nfs_ok:
- p += buflen;
break;
case nfserr_resource:
nfserr = nfserr_toosmall;
@@ -2719,10 +2711,8 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);
nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
- resp->p, &buflen, getattr->ga_bmval,
+ &resp->p, buflen, getattr->ga_bmval,
resp->rqstp, 0);
- if (!nfserr)
- resp->p += buflen;
return nfserr;
}
@@ -2927,7 +2917,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_read *read)
{
u32 eof;
- int v, pn;
+ int v;
+ struct page *page;
unsigned long maxcount;
long len;
__be32 *p;
@@ -2946,11 +2937,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
len = maxcount;
v = 0;
while (len > 0) {
- pn = resp->rqstp->rq_resused++;
- resp->rqstp->rq_vec[v].iov_base =
- page_address(resp->rqstp->rq_respages[pn]);
+ page = *(resp->rqstp->rq_next_page);
+ if (!page) { /* ran out of pages */
+ maxcount -= len;
+ break;
+ }
+ resp->rqstp->rq_vec[v].iov_base = page_address(page);
resp->rqstp->rq_vec[v].iov_len =
len < PAGE_SIZE ? len : PAGE_SIZE;
+ resp->rqstp->rq_next_page++;
v++;
len -= PAGE_SIZE;
}
@@ -2996,8 +2991,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
return nfserr;
if (resp->xbuf->page_len)
return nfserr_resource;
+ if (!*resp->rqstp->rq_next_page)
+ return nfserr_resource;
- page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
+ page = page_address(*(resp->rqstp->rq_next_page++));
maxcount = PAGE_SIZE;
RESERVE_SPACE(4);
@@ -3045,6 +3042,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
return nfserr;
if (resp->xbuf->page_len)
return nfserr_resource;
+ if (!*resp->rqstp->rq_next_page)
+ return nfserr_resource;
RESERVE_SPACE(NFS4_VERIFIER_SIZE);
savep = p;
@@ -3071,7 +3070,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
goto err_no_verf;
}
- page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
+ page = page_address(*(resp->rqstp->rq_next_page++));
readdir->common.err = 0;
readdir->buflen = maxcount;
readdir->buffer = page;
@@ -3094,8 +3093,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
p = readdir->buffer;
*p++ = 0; /* no more entries */
*p++ = htonl(readdir->common.err == nfserr_eof);
- resp->xbuf->page_len = ((char*)p) - (char*)page_address(
- resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+ resp->xbuf->page_len = ((char*)p) -
+ (char*)page_address(*(resp->rqstp->rq_next_page-1));
/* Use rest of head for padding and remaining ops: */
resp->xbuf->tail[0].iov_base = tailbase;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 2cbac34a55da..62c1ee128aeb 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -9,22 +9,22 @@
*/
#include <linux/slab.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/highmem.h>
+#include <net/checksum.h>
#include "nfsd.h"
#include "cache.h"
-/* Size of reply cache. Common values are:
- * 4.3BSD: 128
- * 4.4BSD: 256
- * Solaris2: 1024
- * DEC Unix: 512-4096
- */
-#define CACHESIZE 1024
+#define NFSDDBG_FACILITY NFSDDBG_REPCACHE
+
#define HASHSIZE 64
static struct hlist_head * cache_hash;
static struct list_head lru_head;
-static int cache_disabled = 1;
+static struct kmem_cache *drc_slab;
+static unsigned int num_drc_entries;
+static unsigned int max_drc_entries;
/*
* Calculate the hash index from an XID.
@@ -37,6 +37,14 @@ static inline u32 request_hash(u32 xid)
}
static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
+static void cache_cleaner_func(struct work_struct *unused);
+static int nfsd_reply_cache_shrink(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+struct shrinker nfsd_reply_cache_shrinker = {
+ .shrink = nfsd_reply_cache_shrink,
+ .seeks = 1,
+};
/*
* locking for the reply cache:
@@ -44,30 +52,86 @@ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
* Otherwise, it when accessing _prev or _next, the lock must be held.
*/
static DEFINE_SPINLOCK(cache_lock);
+static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
-int nfsd_reply_cache_init(void)
+/*
+ * Put a cap on the size of the DRC based on the amount of available
+ * low memory in the machine.
+ *
+ * 64MB: 8192
+ * 128MB: 11585
+ * 256MB: 16384
+ * 512MB: 23170
+ * 1GB: 32768
+ * 2GB: 46340
+ * 4GB: 65536
+ * 8GB: 92681
+ * 16GB: 131072
+ *
+ * ...with a hard cap of 256k entries. In the worst case, each entry will be
+ * ~1k, so the above numbers should give a rough max of the amount of memory
+ * used in k.
+ */
+static unsigned int
+nfsd_cache_size_limit(void)
+{
+ unsigned int limit;
+ unsigned long low_pages = totalram_pages - totalhigh_pages;
+
+ limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
+ return min_t(unsigned int, limit, 256*1024);
+}
+
+static struct svc_cacherep *
+nfsd_reply_cache_alloc(void)
{
struct svc_cacherep *rp;
- int i;
- INIT_LIST_HEAD(&lru_head);
- i = CACHESIZE;
- while (i) {
- rp = kmalloc(sizeof(*rp), GFP_KERNEL);
- if (!rp)
- goto out_nomem;
- list_add(&rp->c_lru, &lru_head);
+ rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
+ if (rp) {
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
+ INIT_LIST_HEAD(&rp->c_lru);
INIT_HLIST_NODE(&rp->c_hash);
- i--;
}
+ return rp;
+}
+
+static void
+nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
+{
+ if (rp->c_type == RC_REPLBUFF)
+ kfree(rp->c_replvec.iov_base);
+ hlist_del(&rp->c_hash);
+ list_del(&rp->c_lru);
+ --num_drc_entries;
+ kmem_cache_free(drc_slab, rp);
+}
+
+static void
+nfsd_reply_cache_free(struct svc_cacherep *rp)
+{
+ spin_lock(&cache_lock);
+ nfsd_reply_cache_free_locked(rp);
+ spin_unlock(&cache_lock);
+}
+
+int nfsd_reply_cache_init(void)
+{
+ register_shrinker(&nfsd_reply_cache_shrinker);
+ drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
+ 0, 0, NULL);
+ if (!drc_slab)
+ goto out_nomem;
- cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
+ cache_hash = kcalloc(HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
if (!cache_hash)
goto out_nomem;
- cache_disabled = 0;
+ INIT_LIST_HEAD(&lru_head);
+ max_drc_entries = nfsd_cache_size_limit();
+ num_drc_entries = 0;
+
return 0;
out_nomem:
printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
@@ -79,27 +143,33 @@ void nfsd_reply_cache_shutdown(void)
{
struct svc_cacherep *rp;
+ unregister_shrinker(&nfsd_reply_cache_shrinker);
+ cancel_delayed_work_sync(&cache_cleaner);
+
while (!list_empty(&lru_head)) {
rp = list_entry(lru_head.next, struct svc_cacherep, c_lru);
- if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF)
- kfree(rp->c_replvec.iov_base);
- list_del(&rp->c_lru);
- kfree(rp);
+ nfsd_reply_cache_free_locked(rp);
}
- cache_disabled = 1;
-
kfree (cache_hash);
cache_hash = NULL;
+
+ if (drc_slab) {
+ kmem_cache_destroy(drc_slab);
+ drc_slab = NULL;
+ }
}
/*
- * Move cache entry to end of LRU list
+ * Move cache entry to end of LRU list, and queue the cleaner to run if it's
+ * not already scheduled.
*/
static void
lru_put_end(struct svc_cacherep *rp)
{
+ rp->c_timestamp = jiffies;
list_move_tail(&rp->c_lru, &lru_head);
+ schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}
/*
@@ -112,83 +182,214 @@ hash_refile(struct svc_cacherep *rp)
hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
}
+static inline bool
+nfsd_cache_entry_expired(struct svc_cacherep *rp)
+{
+ return rp->c_state != RC_INPROG &&
+ time_after(jiffies, rp->c_timestamp + RC_EXPIRE);
+}
+
+/*
+ * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
+ * Also prune the oldest ones when the total exceeds the max number of entries.
+ */
+static void
+prune_cache_entries(void)
+{
+ struct svc_cacherep *rp, *tmp;
+
+ list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
+ if (!nfsd_cache_entry_expired(rp) &&
+ num_drc_entries <= max_drc_entries)
+ break;
+ nfsd_reply_cache_free_locked(rp);
+ }
+
+ /*
+ * Conditionally rearm the job. If we cleaned out the list, then
+ * cancel any pending run (since there won't be any work to do).
+ * Otherwise, we rearm the job or modify the existing one to run in
+ * RC_EXPIRE since we just ran the pruner.
+ */
+ if (list_empty(&lru_head))
+ cancel_delayed_work(&cache_cleaner);
+ else
+ mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
+}
+
+static void
+cache_cleaner_func(struct work_struct *unused)
+{
+ spin_lock(&cache_lock);
+ prune_cache_entries();
+ spin_unlock(&cache_lock);
+}
+
+static int
+nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned int num;
+
+ spin_lock(&cache_lock);
+ if (sc->nr_to_scan)
+ prune_cache_entries();
+ num = num_drc_entries;
+ spin_unlock(&cache_lock);
+
+ return num;
+}
+
+/*
+ * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
+ */
+static __wsum
+nfsd_cache_csum(struct svc_rqst *rqstp)
+{
+ int idx;
+ unsigned int base;
+ __wsum csum;
+ struct xdr_buf *buf = &rqstp->rq_arg;
+ const unsigned char *p = buf->head[0].iov_base;
+ size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
+ RC_CSUMLEN);
+ size_t len = min(buf->head[0].iov_len, csum_len);
+
+ /* rq_arg.head first */
+ csum = csum_partial(p, len, 0);
+ csum_len -= len;
+
+ /* Continue into page array */
+ idx = buf->page_base / PAGE_SIZE;
+ base = buf->page_base & ~PAGE_MASK;
+ while (csum_len) {
+ p = page_address(buf->pages[idx]) + base;
+ len = min_t(size_t, PAGE_SIZE - base, csum_len);
+ csum = csum_partial(p, len, csum);
+ csum_len -= len;
+ base = 0;
+ ++idx;
+ }
+ return csum;
+}
+
+/*
+ * Search the request hash for an entry that matches the given rqstp.
+ * Must be called with cache_lock held. Returns the found entry or
+ * NULL on failure.
+ */
+static struct svc_cacherep *
+nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
+{
+ struct svc_cacherep *rp;
+ struct hlist_head *rh;
+ __be32 xid = rqstp->rq_xid;
+ u32 proto = rqstp->rq_prot,
+ vers = rqstp->rq_vers,
+ proc = rqstp->rq_proc;
+
+ rh = &cache_hash[request_hash(xid)];
+ hlist_for_each_entry(rp, rh, c_hash) {
+ if (xid == rp->c_xid && proc == rp->c_proc &&
+ proto == rp->c_prot && vers == rp->c_vers &&
+ rqstp->rq_arg.len == rp->c_len && csum == rp->c_csum &&
+ rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) &&
+ rpc_get_port(svc_addr(rqstp)) == rpc_get_port((struct sockaddr *)&rp->c_addr))
+ return rp;
+ }
+ return NULL;
+}
+
/*
* Try to find an entry matching the current call in the cache. When none
- * is found, we grab the oldest unlocked entry off the LRU list.
- * Note that no operation within the loop may sleep.
+ * is found, we try to grab the oldest expired entry off the LRU list. If
+ * a suitable one isn't there, then drop the cache_lock and allocate a
+ * new one, then search again in case one got inserted while this thread
+ * didn't hold the lock.
*/
int
nfsd_cache_lookup(struct svc_rqst *rqstp)
{
- struct hlist_node *hn;
- struct hlist_head *rh;
- struct svc_cacherep *rp;
+ struct svc_cacherep *rp, *found;
__be32 xid = rqstp->rq_xid;
u32 proto = rqstp->rq_prot,
vers = rqstp->rq_vers,
proc = rqstp->rq_proc;
+ __wsum csum;
unsigned long age;
int type = rqstp->rq_cachetype;
int rtn;
rqstp->rq_cacherep = NULL;
- if (cache_disabled || type == RC_NOCACHE) {
+ if (type == RC_NOCACHE) {
nfsdstats.rcnocache++;
return RC_DOIT;
}
+ csum = nfsd_cache_csum(rqstp);
+
spin_lock(&cache_lock);
rtn = RC_DOIT;
- rh = &cache_hash[request_hash(xid)];
- hlist_for_each_entry(rp, hn, rh, c_hash) {
- if (rp->c_state != RC_UNUSED &&
- xid == rp->c_xid && proc == rp->c_proc &&
- proto == rp->c_prot && vers == rp->c_vers &&
- time_before(jiffies, rp->c_timestamp + 120*HZ) &&
- memcmp((char*)&rqstp->rq_addr, (char*)&rp->c_addr, sizeof(rp->c_addr))==0) {
- nfsdstats.rchits++;
- goto found_entry;
+ rp = nfsd_cache_search(rqstp, csum);
+ if (rp)
+ goto found_entry;
+
+ /* Try to use the first entry on the LRU */
+ if (!list_empty(&lru_head)) {
+ rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
+ if (nfsd_cache_entry_expired(rp) ||
+ num_drc_entries >= max_drc_entries) {
+ lru_put_end(rp);
+ prune_cache_entries();
+ goto setup_entry;
}
}
- nfsdstats.rcmisses++;
- /* This loop shouldn't take more than a few iterations normally */
- {
- int safe = 0;
- list_for_each_entry(rp, &lru_head, c_lru) {
- if (rp->c_state != RC_INPROG)
- break;
- if (safe++ > CACHESIZE) {
- printk("nfsd: loop in repcache LRU list\n");
- cache_disabled = 1;
- goto out;
- }
+ /* Drop the lock and allocate a new entry */
+ spin_unlock(&cache_lock);
+ rp = nfsd_reply_cache_alloc();
+ if (!rp) {
+ dprintk("nfsd: unable to allocate DRC entry!\n");
+ return RC_DOIT;
}
+ spin_lock(&cache_lock);
+ ++num_drc_entries;
+
+ /*
+ * Must search again just in case someone inserted one
+ * after we dropped the lock above.
+ */
+ found = nfsd_cache_search(rqstp, csum);
+ if (found) {
+ nfsd_reply_cache_free_locked(rp);
+ rp = found;
+ goto found_entry;
}
- /* All entries on the LRU are in-progress. This should not happen */
- if (&rp->c_lru == &lru_head) {
- static int complaints;
-
- printk(KERN_WARNING "nfsd: all repcache entries locked!\n");
- if (++complaints > 5) {
- printk(KERN_WARNING "nfsd: disabling repcache.\n");
- cache_disabled = 1;
- }
- goto out;
- }
+ /*
+ * We're keeping the one we just allocated. Are we now over the
+ * limit? Prune one off the tip of the LRU in trade for the one we
+ * just allocated if so.
+ */
+ if (num_drc_entries >= max_drc_entries)
+ nfsd_reply_cache_free_locked(list_first_entry(&lru_head,
+ struct svc_cacherep, c_lru));
+setup_entry:
+ nfsdstats.rcmisses++;
rqstp->rq_cacherep = rp;
rp->c_state = RC_INPROG;
rp->c_xid = xid;
rp->c_proc = proc;
- memcpy(&rp->c_addr, svc_addr_in(rqstp), sizeof(rp->c_addr));
+ rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
+ rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
rp->c_prot = proto;
rp->c_vers = vers;
- rp->c_timestamp = jiffies;
+ rp->c_len = rqstp->rq_arg.len;
+ rp->c_csum = csum;
hash_refile(rp);
+ lru_put_end(rp);
/* release any buffer */
if (rp->c_type == RC_REPLBUFF) {
@@ -201,9 +402,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
return rtn;
found_entry:
+ nfsdstats.rchits++;
/* We found a matching entry which is either in progress or done. */
age = jiffies - rp->c_timestamp;
- rp->c_timestamp = jiffies;
lru_put_end(rp);
rtn = RC_DROPIT;
@@ -232,7 +433,7 @@ found_entry:
break;
default:
printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
- rp->c_state = RC_UNUSED;
+ nfsd_reply_cache_free_locked(rp);
}
goto out;
@@ -257,11 +458,11 @@ found_entry:
void
nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
- struct svc_cacherep *rp;
+ struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
int len;
- if (!(rp = rqstp->rq_cacherep) || cache_disabled)
+ if (!rp)
return;
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
@@ -269,7 +470,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
/* Don't cache excessive amounts of data and XDR failures */
if (!statp || len > (256 >> 2)) {
- rp->c_state = RC_UNUSED;
+ nfsd_reply_cache_free(rp);
return;
}
@@ -283,21 +484,21 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
cachv = &rp->c_replvec;
cachv->iov_base = kmalloc(len << 2, GFP_KERNEL);
if (!cachv->iov_base) {
- spin_lock(&cache_lock);
- rp->c_state = RC_UNUSED;
- spin_unlock(&cache_lock);
+ nfsd_reply_cache_free(rp);
return;
}
cachv->iov_len = len << 2;
memcpy(cachv->iov_base, statp, len << 2);
break;
+ case RC_NOCACHE:
+ nfsd_reply_cache_free(rp);
+ return;
}
spin_lock(&cache_lock);
lru_put_end(rp);
rp->c_secure = rqstp->rq_secure;
rp->c_type = cachetype;
rp->c_state = RC_DONE;
- rp->c_timestamp = jiffies;
spin_unlock(&cache_lock);
return;
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index dab350dfc376..13a21c8fca49 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -10,7 +10,7 @@
#include <linux/sunrpc/svcsock.h>
#include <linux/lockd/lockd.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/gss_api.h>
#include <linux/sunrpc/gss_krb5_enctypes.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
@@ -19,7 +19,7 @@
#include "idmap.h"
#include "nfsd.h"
#include "cache.h"
-#include "fault_inject.h"
+#include "state.h"
#include "netns.h"
/*
@@ -85,7 +85,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
{
- ino_t ino = file->f_path.dentry->d_inode->i_ino;
+ ino_t ino = file_inode(file)->i_ino;
char *data;
ssize_t rv;
@@ -125,11 +125,11 @@ static const struct file_operations transaction_ops = {
.llseek = default_llseek,
};
-static int exports_open(struct inode *inode, struct file *file)
+static int exports_net_open(struct net *net, struct file *file)
{
int err;
struct seq_file *seq;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
err = seq_open(file, &nfs_exports_op);
if (err)
@@ -140,8 +140,26 @@ static int exports_open(struct inode *inode, struct file *file)
return 0;
}
-static const struct file_operations exports_operations = {
- .open = exports_open,
+static int exports_proc_open(struct inode *inode, struct file *file)
+{
+ return exports_net_open(current->nsproxy->net_ns, file);
+}
+
+static const struct file_operations exports_proc_operations = {
+ .open = exports_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .owner = THIS_MODULE,
+};
+
+static int exports_nfsd_open(struct inode *inode, struct file *file)
+{
+ return exports_net_open(inode->i_sb->s_fs_info, file);
+}
+
+static const struct file_operations exports_nfsd_operations = {
+ .open = exports_nfsd_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
@@ -186,9 +204,6 @@ static struct file_operations supported_enctypes_ops = {
};
#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
-extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
-extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
-
static const struct file_operations pool_stats_operations = {
.open = nfsd_pool_stats_open,
.read = seq_read,
@@ -223,6 +238,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
struct sockaddr *sap = (struct sockaddr *)&address;
size_t salen = sizeof(address);
char *fo_path;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
/* sanity check */
if (size == 0)
@@ -235,7 +251,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
if (qword_get(&buf, fo_path, size) < 0)
return -EINVAL;
- if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0)
+ if (rpc_pton(net, fo_path, size, sap, salen) == 0)
return -EINVAL;
return nlmsvc_unlock_all_by_ip(sap);
@@ -320,6 +336,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
int len;
struct auth_domain *dom;
struct knfsd_fh fh;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
if (size == 0)
return -EINVAL;
@@ -355,7 +372,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
if (!dom)
return -ENOMEM;
- len = exp_rootfh(&init_net, dom, path, &fh, maxsize);
+ len = exp_rootfh(net, dom, path, &fh, maxsize);
auth_domain_put(dom);
if (len)
return len;
@@ -399,6 +416,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
int rv;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+
if (size > 0) {
int newthreads;
rv = get_int(&mesg, &newthreads);
@@ -406,11 +425,11 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
return rv;
if (newthreads < 0)
return -EINVAL;
- rv = nfsd_svc(newthreads);
+ rv = nfsd_svc(newthreads, net);
if (rv < 0)
return rv;
} else
- rv = nfsd_nrthreads();
+ rv = nfsd_nrthreads(net);
return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);
}
@@ -448,9 +467,10 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
int len;
int npools;
int *nthreads;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
- npools = nfsd_nrpools();
+ npools = nfsd_nrpools(net);
if (npools == 0) {
/*
* NFS is shut down. The admin can start it by
@@ -478,12 +498,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
if (nthreads[i] < 0)
goto out_free;
}
- rv = nfsd_set_nrthreads(i, nthreads);
+ rv = nfsd_set_nrthreads(i, nthreads, net);
if (rv)
goto out_free;
}
- rv = nfsd_get_nrthreads(npools, nthreads);
+ rv = nfsd_get_nrthreads(npools, nthreads, net);
if (rv)
goto out_free;
@@ -510,11 +530,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
unsigned minor;
ssize_t tlen = 0;
char *sep;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (size>0) {
- if (nfsd_serv)
+ if (nn->nfsd_serv)
/* Cannot change versions without updating
- * nfsd_serv->sv_xdrsize, and reallocing
+ * nn->nfsd_serv->sv_xdrsize, and reallocing
* rq_argp and rq_resp
*/
return -EBUSY;
@@ -532,7 +554,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
else
num = simple_strtol(vers, &minorp, 0);
if (*minorp == '.') {
- if (num < 4)
+ if (num != 4)
return -EINVAL;
minor = simple_strtoul(minorp+1, NULL, 0);
if (minor == 0)
@@ -645,11 +667,13 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
* Zero-length write. Return a list of NFSD's current listener
* transports.
*/
-static ssize_t __write_ports_names(char *buf)
+static ssize_t __write_ports_names(char *buf, struct net *net)
{
- if (nfsd_serv == NULL)
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (nn->nfsd_serv == NULL)
return 0;
- return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
+ return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
}
/*
@@ -657,28 +681,28 @@ static ssize_t __write_ports_names(char *buf)
* a socket of a supported family/protocol, and we use it as an
* nfsd listener.
*/
-static ssize_t __write_ports_addfd(char *buf)
+static ssize_t __write_ports_addfd(char *buf, struct net *net)
{
char *mesg = buf;
int fd, err;
- struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
err = get_int(&mesg, &fd);
if (err != 0 || fd < 0)
return -EINVAL;
- err = nfsd_create_serv();
+ err = nfsd_create_serv(net);
if (err != 0)
return err;
- err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
+ err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
if (err < 0) {
nfsd_destroy(net);
return err;
}
/* Decrease the count, but don't shut down the service */
- nfsd_serv->sv_nrthreads--;
+ nn->nfsd_serv->sv_nrthreads--;
return err;
}
@@ -686,12 +710,12 @@ static ssize_t __write_ports_addfd(char *buf)
* A transport listener is added by writing it's transport name and
* a port number.
*/
-static ssize_t __write_ports_addxprt(char *buf)
+static ssize_t __write_ports_addxprt(char *buf, struct net *net)
{
char transport[16];
struct svc_xprt *xprt;
int port, err;
- struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (sscanf(buf, "%15s %5u", transport, &port) != 2)
return -EINVAL;
@@ -699,25 +723,25 @@ static ssize_t __write_ports_addxprt(char *buf)
if (port < 1 || port > USHRT_MAX)
return -EINVAL;
- err = nfsd_create_serv();
+ err = nfsd_create_serv(net);
if (err != 0)
return err;
- err = svc_create_xprt(nfsd_serv, transport, net,
+ err = svc_create_xprt(nn->nfsd_serv, transport, net,
PF_INET, port, SVC_SOCK_ANONYMOUS);
if (err < 0)
goto out_err;
- err = svc_create_xprt(nfsd_serv, transport, net,
+ err = svc_create_xprt(nn->nfsd_serv, transport, net,
PF_INET6, port, SVC_SOCK_ANONYMOUS);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_close;
/* Decrease the count, but don't shut down the service */
- nfsd_serv->sv_nrthreads--;
+ nn->nfsd_serv->sv_nrthreads--;
return 0;
out_close:
- xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port);
+ xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
if (xprt != NULL) {
svc_close_xprt(xprt);
svc_xprt_put(xprt);
@@ -727,16 +751,17 @@ out_err:
return err;
}
-static ssize_t __write_ports(struct file *file, char *buf, size_t size)
+static ssize_t __write_ports(struct file *file, char *buf, size_t size,
+ struct net *net)
{
if (size == 0)
- return __write_ports_names(buf);
+ return __write_ports_names(buf, net);
if (isdigit(buf[0]))
- return __write_ports_addfd(buf);
+ return __write_ports_addfd(buf, net);
if (isalpha(buf[0]))
- return __write_ports_addxprt(buf);
+ return __write_ports_addxprt(buf, net);
return -EINVAL;
}
@@ -787,9 +812,10 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
static ssize_t write_ports(struct file *file, char *buf, size_t size)
{
ssize_t rv;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
- rv = __write_ports(file, buf, size);
+ rv = __write_ports(file, buf, size, net);
mutex_unlock(&nfsd_mutex);
return rv;
}
@@ -821,6 +847,9 @@ int nfsd_max_blksize;
static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
if (size > 0) {
int bsize;
int rv = get_int(&mesg, &bsize);
@@ -835,7 +864,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
bsize = NFSSVC_MAXBLKSIZE;
bsize &= ~(1024-1);
mutex_lock(&nfsd_mutex);
- if (nfsd_serv) {
+ if (nn->nfsd_serv) {
mutex_unlock(&nfsd_mutex);
return -EBUSY;
}
@@ -848,13 +877,14 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
}
#ifdef CONFIG_NFSD_V4
-static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
+static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
+ time_t *time, struct nfsd_net *nn)
{
char *mesg = buf;
int rv, i;
if (size > 0) {
- if (nfsd_serv)
+ if (nn->nfsd_serv)
return -EBUSY;
rv = get_int(&mesg, &i);
if (rv)
@@ -879,12 +909,13 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, tim
return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
}
-static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
+static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
+ time_t *time, struct nfsd_net *nn)
{
ssize_t rv;
mutex_lock(&nfsd_mutex);
- rv = __nfsd4_write_time(file, buf, size, time);
+ rv = __nfsd4_write_time(file, buf, size, time, nn);
mutex_unlock(&nfsd_mutex);
return rv;
}
@@ -912,7 +943,9 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_
*/
static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
{
- return nfsd4_write_time(file, buf, size, &nfsd4_lease);
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
}
/**
@@ -927,17 +960,20 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
*/
static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
{
- return nfsd4_write_time(file, buf, size, &nfsd4_grace);
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
}
-static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
+static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
+ struct nfsd_net *nn)
{
char *mesg = buf;
char *recdir;
int len, status;
if (size > 0) {
- if (nfsd_serv)
+ if (nn->nfsd_serv)
return -EBUSY;
if (size > PATH_MAX || buf[size-1] != '\n')
return -EINVAL;
@@ -981,9 +1017,11 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
{
ssize_t rv;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
mutex_lock(&nfsd_mutex);
- rv = __write_recoverydir(file, buf, size);
+ rv = __write_recoverydir(file, buf, size, nn);
mutex_unlock(&nfsd_mutex);
return rv;
}
@@ -998,7 +1036,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
{
static struct tree_descr nfsd_files[] = {
- [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
+ [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
[NFSD_Export_features] = {"export_features",
&export_features_operations, S_IRUGO},
[NFSD_FO_UnlockIP] = {"unlock_ip",
@@ -1022,20 +1060,35 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
#endif
/* last one */ {""}
};
- return simple_fill_super(sb, 0x6e667364, nfsd_files);
+ struct net *net = data;
+ int ret;
+
+ ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
+ if (ret)
+ return ret;
+ sb->s_fs_info = get_net(net);
+ return 0;
}
static struct dentry *nfsd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_single(fs_type, flags, data, nfsd_fill_super);
+ return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
+}
+
+static void nfsd_umount(struct super_block *sb)
+{
+ struct net *net = sb->s_fs_info;
+
+ kill_litter_super(sb);
+ put_net(net);
}
static struct file_system_type nfsd_fs_type = {
.owner = THIS_MODULE,
.name = "nfsd",
.mount = nfsd_mount,
- .kill_sb = kill_litter_super,
+ .kill_sb = nfsd_umount,
};
#ifdef CONFIG_PROC_FS
@@ -1046,7 +1099,8 @@ static int create_proc_exports_entry(void)
entry = proc_mkdir("fs/nfs", NULL);
if (!entry)
return -ENOMEM;
- entry = proc_create("exports", 0, entry, &exports_operations);
+ entry = proc_create("exports", 0, entry,
+ &exports_proc_operations);
if (!entry)
return -ENOMEM;
return 0;
@@ -1063,6 +1117,7 @@ int nfsd_net_id;
static __net_init int nfsd_init_net(struct net *net)
{
int retval;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
retval = nfsd_export_init(net);
if (retval)
@@ -1070,6 +1125,8 @@ static __net_init int nfsd_init_net(struct net *net)
retval = nfsd_idmap_init(net);
if (retval)
goto out_idmap_error;
+ nn->nfsd4_lease = 90; /* default lease time */
+ nn->nfsd4_grace = 90;
return 0;
out_idmap_error:
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 80d5ce40aadb..07a473fd49bc 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -55,36 +55,26 @@ extern struct svc_version nfsd_version2, nfsd_version3,
nfsd_version4;
extern u32 nfsd_supported_minorversion;
extern struct mutex nfsd_mutex;
-extern struct svc_serv *nfsd_serv;
extern spinlock_t nfsd_drc_lock;
-extern unsigned int nfsd_drc_max_mem;
-extern unsigned int nfsd_drc_mem_used;
+extern unsigned long nfsd_drc_max_mem;
+extern unsigned long nfsd_drc_mem_used;
extern const struct seq_operations nfs_exports_op;
/*
* Function prototypes.
*/
-int nfsd_svc(int nrservs);
+int nfsd_svc(int nrservs, struct net *net);
int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
-int nfsd_nrthreads(void);
-int nfsd_nrpools(void);
-int nfsd_get_nrthreads(int n, int *);
-int nfsd_set_nrthreads(int n, int *);
+int nfsd_nrthreads(struct net *);
+int nfsd_nrpools(struct net *);
+int nfsd_get_nrthreads(int n, int *, struct net *);
+int nfsd_set_nrthreads(int n, int *, struct net *);
int nfsd_pool_stats_open(struct inode *, struct file *);
int nfsd_pool_stats_release(struct inode *, struct file *);
-static inline void nfsd_destroy(struct net *net)
-{
- int destroy = (nfsd_serv->sv_nrthreads == 1);
-
- if (destroy)
- svc_shutdown_net(nfsd_serv, net);
- svc_destroy(nfsd_serv);
- if (destroy)
- nfsd_serv = NULL;
-}
+void nfsd_destroy(struct net *net);
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
#ifdef CONFIG_NFSD_V2_ACL
@@ -103,7 +93,7 @@ enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
int nfsd_vers(int vers, enum vers_op change);
int nfsd_minorversion(u32 minorversion, enum vers_op change);
void nfsd_reset_versions(void);
-int nfsd_create_serv(void);
+int nfsd_create_serv(struct net *net);
extern int nfsd_max_blksize;
@@ -116,12 +106,14 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
* NFSv4 State
*/
#ifdef CONFIG_NFSD_V4
-extern unsigned int max_delegations;
+extern unsigned long max_delegations;
void nfs4_state_init(void);
int nfsd4_init_slabs(void);
void nfsd4_free_slabs(void);
int nfs4_state_start(void);
+int nfs4_state_start_net(struct net *net);
void nfs4_state_shutdown(void);
+void nfs4_state_shutdown_net(struct net *net);
void nfs4_reset_lease(time_t leasetime);
int nfs4_reset_recoverydir(char *recdir);
char * nfs4_recoverydir(void);
@@ -130,7 +122,9 @@ static inline void nfs4_state_init(void) { }
static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
static inline int nfs4_state_start(void) { return 0; }
+static inline int nfs4_state_start_net(struct net *net) { return 0; }
static inline void nfs4_state_shutdown(void) { }
+static inline void nfs4_state_shutdown_net(struct net *net) { }
static inline void nfs4_reset_lease(time_t leasetime) { }
static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
static inline char * nfs4_recoverydir(void) {return NULL; }
@@ -265,16 +259,8 @@ void nfsd_lockd_shutdown(void);
/* Check for dir entries '.' and '..' */
#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
-/*
- * Time of server startup
- */
-extern struct timeval nfssvc_boot;
-
#ifdef CONFIG_NFSD_V4
-extern time_t nfsd4_lease;
-extern time_t nfsd4_grace;
-
/* before processing a COMPOUND operation, we have to check that there
* is enough space in the buffer for XDR encode to succeed. otherwise,
* we might process an operation with side effects, and be unable to
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 032af381b3aa..814afaa4458a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -572,7 +572,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
if (inode)
_fh_update(fhp, exp, dentry);
- if (fhp->fh_handle.fh_fileid_type == 255) {
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
fh_put(fhp);
return nfserr_opnotsupp;
}
@@ -603,7 +603,7 @@ fh_update(struct svc_fh *fhp)
goto out;
_fh_update(fhp, fhp->fh_export, dentry);
- if (fhp->fh_handle.fh_fileid_type == 255)
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
return nfserr_opnotsupp;
}
out:
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index aad6d457b9e8..54c6b3d3cc79 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -26,17 +26,13 @@ static __be32
nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp)
{
if (err) return err;
- return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt,
- resp->fh.fh_dentry,
- &resp->stat));
+ return fh_getattr(&resp->fh, &resp->stat);
}
static __be32
nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp)
{
if (err) return err;
- return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt,
- resp->fh.fh_dentry,
- &resp->stat));
+ return fh_getattr(&resp->fh, &resp->stat);
}
/*
* Get a file's attributes
@@ -150,9 +146,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
&resp->count);
if (nfserr) return nfserr;
- return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt,
- resp->fh.fh_dentry,
- &resp->stat));
+ return fh_getattr(&resp->fh, &resp->stat);
}
/*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2013aa001dab..262df5ccbf59 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -11,7 +11,6 @@
#include <linux/module.h>
#include <linux/fs_struct.h>
#include <linux/swap.h>
-#include <linux/nsproxy.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svcsock.h>
@@ -22,19 +21,19 @@
#include "nfsd.h"
#include "cache.h"
#include "vfs.h"
+#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_SVC
extern struct svc_program nfsd_program;
static int nfsd(void *vrqstp);
-struct timeval nfssvc_boot;
/*
- * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
+ * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members
* of the svc_serv struct. In particular, ->sv_nrthreads but also to some
* extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
*
- * If (out side the lock) nfsd_serv is non-NULL, then it must point to a
+ * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a
* properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
* of nfsd threads must exist and each must listed in ->sp_all_threads in each
* entry of ->sv_pools[].
@@ -52,7 +51,6 @@ struct timeval nfssvc_boot;
* nfsd_versions
*/
DEFINE_MUTEX(nfsd_mutex);
-struct svc_serv *nfsd_serv;
/*
* nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
@@ -61,8 +59,8 @@ struct svc_serv *nfsd_serv;
* nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
*/
spinlock_t nfsd_drc_lock;
-unsigned int nfsd_drc_max_mem;
-unsigned int nfsd_drc_mem_used;
+unsigned long nfsd_drc_max_mem;
+unsigned long nfsd_drc_mem_used;
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
static struct svc_stat nfsd_acl_svcstats;
@@ -173,28 +171,32 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
*/
#define NFSD_MAXSERVS 8192
-int nfsd_nrthreads(void)
+int nfsd_nrthreads(struct net *net)
{
int rv = 0;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
mutex_lock(&nfsd_mutex);
- if (nfsd_serv)
- rv = nfsd_serv->sv_nrthreads;
+ if (nn->nfsd_serv)
+ rv = nn->nfsd_serv->sv_nrthreads;
mutex_unlock(&nfsd_mutex);
return rv;
}
-static int nfsd_init_socks(void)
+static int nfsd_init_socks(struct net *net)
{
int error;
- if (!list_empty(&nfsd_serv->sv_permsocks))
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (!list_empty(&nn->nfsd_serv->sv_permsocks))
return 0;
- error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, NFS_PORT,
+ error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
SVC_SOCK_DEFAULTS);
if (error < 0)
return error;
- error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, NFS_PORT,
+ error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
SVC_SOCK_DEFAULTS);
if (error < 0)
return error;
@@ -202,14 +204,15 @@ static int nfsd_init_socks(void)
return 0;
}
-static bool nfsd_up = false;
+static int nfsd_users = 0;
-static int nfsd_startup(int nrservs)
+static int nfsd_startup_generic(int nrservs)
{
int ret;
- if (nfsd_up)
+ if (nfsd_users++)
return 0;
+
/*
* Readahead param cache - will no-op if it already exists.
* (Note therefore results will be suboptimal if number of
@@ -218,43 +221,79 @@ static int nfsd_startup(int nrservs)
ret = nfsd_racache_init(2*nrservs);
if (ret)
return ret;
- ret = nfsd_init_socks();
+ ret = nfs4_state_start();
if (ret)
goto out_racache;
- ret = lockd_up(&init_net);
+ return 0;
+
+out_racache:
+ nfsd_racache_shutdown();
+ return ret;
+}
+
+static void nfsd_shutdown_generic(void)
+{
+ if (--nfsd_users)
+ return;
+
+ nfs4_state_shutdown();
+ nfsd_racache_shutdown();
+}
+
+static int nfsd_startup_net(int nrservs, struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int ret;
+
+ if (nn->nfsd_net_up)
+ return 0;
+
+ ret = nfsd_startup_generic(nrservs);
if (ret)
- goto out_racache;
- ret = nfs4_state_start();
+ return ret;
+ ret = nfsd_init_socks(net);
+ if (ret)
+ goto out_socks;
+ ret = lockd_up(net);
+ if (ret)
+ goto out_socks;
+ ret = nfs4_state_start_net(net);
if (ret)
goto out_lockd;
- nfsd_up = true;
+
+ nn->nfsd_net_up = true;
return 0;
+
out_lockd:
- lockd_down(&init_net);
-out_racache:
- nfsd_racache_shutdown();
+ lockd_down(net);
+out_socks:
+ nfsd_shutdown_generic();
return ret;
}
-static void nfsd_shutdown(void)
+static void nfsd_shutdown_net(struct net *net)
{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nfs4_state_shutdown_net(net);
+ lockd_down(net);
+ nn->nfsd_net_up = false;
+ nfsd_shutdown_generic();
+}
+
+static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
/*
* write_ports can create the server without actually starting
* any threads--if we get shut down before any threads are
* started, then nfsd_last_thread will be run before any of this
* other initialization has been done.
*/
- if (!nfsd_up)
+ if (!nn->nfsd_net_up)
return;
- nfs4_state_shutdown();
- lockd_down(&init_net);
- nfsd_racache_shutdown();
- nfsd_up = false;
-}
-
-static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
-{
- nfsd_shutdown();
+ nfsd_shutdown_net(net);
svc_rpcb_cleanup(serv, net);
@@ -303,7 +342,7 @@ static void set_max_drc(void)
>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
nfsd_drc_mem_used = 0;
spin_lock_init(&nfsd_drc_lock);
- dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
+ dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);
}
static int nfsd_get_default_max_blksize(void)
@@ -327,69 +366,84 @@ static int nfsd_get_default_max_blksize(void)
return ret;
}
-int nfsd_create_serv(void)
+int nfsd_create_serv(struct net *net)
{
int error;
- struct net *net = current->nsproxy->net_ns;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
WARN_ON(!mutex_is_locked(&nfsd_mutex));
- if (nfsd_serv) {
- svc_get(nfsd_serv);
+ if (nn->nfsd_serv) {
+ svc_get(nn->nfsd_serv);
return 0;
}
if (nfsd_max_blksize == 0)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions();
- nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+ nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
nfsd_last_thread, nfsd, THIS_MODULE);
- if (nfsd_serv == NULL)
+ if (nn->nfsd_serv == NULL)
return -ENOMEM;
- error = svc_bind(nfsd_serv, net);
+ error = svc_bind(nn->nfsd_serv, net);
if (error < 0) {
- svc_destroy(nfsd_serv);
+ svc_destroy(nn->nfsd_serv);
return error;
}
set_max_drc();
- do_gettimeofday(&nfssvc_boot); /* record boot time */
+ do_gettimeofday(&nn->nfssvc_boot); /* record boot time */
return 0;
}
-int nfsd_nrpools(void)
+int nfsd_nrpools(struct net *net)
{
- if (nfsd_serv == NULL)
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (nn->nfsd_serv == NULL)
return 0;
else
- return nfsd_serv->sv_nrpools;
+ return nn->nfsd_serv->sv_nrpools;
}
-int nfsd_get_nrthreads(int n, int *nthreads)
+int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
{
int i = 0;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- if (nfsd_serv != NULL) {
- for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++)
- nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads;
+ if (nn->nfsd_serv != NULL) {
+ for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
+ nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
}
return 0;
}
-int nfsd_set_nrthreads(int n, int *nthreads)
+void nfsd_destroy(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int destroy = (nn->nfsd_serv->sv_nrthreads == 1);
+
+ if (destroy)
+ svc_shutdown_net(nn->nfsd_serv, net);
+ svc_destroy(nn->nfsd_serv);
+ if (destroy)
+ nn->nfsd_serv = NULL;
+}
+
+int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
{
int i = 0;
int tot = 0;
int err = 0;
- struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
WARN_ON(!mutex_is_locked(&nfsd_mutex));
- if (nfsd_serv == NULL || n <= 0)
+ if (nn->nfsd_serv == NULL || n <= 0)
return 0;
- if (n > nfsd_serv->sv_nrpools)
- n = nfsd_serv->sv_nrpools;
+ if (n > nn->nfsd_serv->sv_nrpools)
+ n = nn->nfsd_serv->sv_nrpools;
/* enforce a global maximum number of threads */
tot = 0;
@@ -419,9 +473,9 @@ int nfsd_set_nrthreads(int n, int *nthreads)
nthreads[0] = 1;
/* apply the new numbers */
- svc_get(nfsd_serv);
+ svc_get(nn->nfsd_serv);
for (i = 0; i < n; i++) {
- err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i],
+ err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i],
nthreads[i]);
if (err)
break;
@@ -436,11 +490,11 @@ int nfsd_set_nrthreads(int n, int *nthreads)
* this is the first time nrservs is nonzero.
*/
int
-nfsd_svc(int nrservs)
+nfsd_svc(int nrservs, struct net *net)
{
int error;
bool nfsd_up_before;
- struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
mutex_lock(&nfsd_mutex);
dprintk("nfsd: creating service\n");
@@ -449,29 +503,29 @@ nfsd_svc(int nrservs)
if (nrservs > NFSD_MAXSERVS)
nrservs = NFSD_MAXSERVS;
error = 0;
- if (nrservs == 0 && nfsd_serv == NULL)
+ if (nrservs == 0 && nn->nfsd_serv == NULL)
goto out;
- error = nfsd_create_serv();
+ error = nfsd_create_serv(net);
if (error)
goto out;
- nfsd_up_before = nfsd_up;
+ nfsd_up_before = nn->nfsd_net_up;
- error = nfsd_startup(nrservs);
+ error = nfsd_startup_net(nrservs, net);
if (error)
goto out_destroy;
- error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
+ error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
if (error)
goto out_shutdown;
- /* We are holding a reference to nfsd_serv which
+ /* We are holding a reference to nn->nfsd_serv which
* we don't want to count in the return value,
* so subtract 1
*/
- error = nfsd_serv->sv_nrthreads - 1;
+ error = nn->nfsd_serv->sv_nrthreads - 1;
out_shutdown:
if (error < 0 && !nfsd_up_before)
- nfsd_shutdown();
+ nfsd_shutdown_net(net);
out_destroy:
nfsd_destroy(net); /* Release server */
out:
@@ -487,6 +541,8 @@ static int
nfsd(void *vrqstp)
{
struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
+ struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
+ struct net *net = perm_sock->xpt_net;
int err;
/* Lock module and set up kernel thread */
@@ -551,7 +607,7 @@ out:
/* Release the thread */
svc_exit_thread(rqstp);
- nfsd_destroy(&init_net);
+ nfsd_destroy(net);
/* Release module */
mutex_unlock(&nfsd_mutex);
@@ -596,7 +652,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
/* Check whether we have this call in the cache. */
switch (nfsd_cache_lookup(rqstp)) {
- case RC_INTR:
case RC_DROPIT:
return 0;
case RC_REPLY:
@@ -640,21 +695,23 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
}
/* Store reply in cache. */
- nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
+ nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
return 1;
}
int nfsd_pool_stats_open(struct inode *inode, struct file *file)
{
int ret;
+ struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id);
+
mutex_lock(&nfsd_mutex);
- if (nfsd_serv == NULL) {
+ if (nn->nfsd_serv == NULL) {
mutex_unlock(&nfsd_mutex);
return -ENODEV;
}
/* bump up the psudo refcount while traversing */
- svc_get(nfsd_serv);
- ret = svc_pool_stats_open(nfsd_serv, file);
+ svc_get(nn->nfsd_serv);
+ ret = svc_pool_stats_open(nn->nfsd_serv, file);
mutex_unlock(&nfsd_mutex);
return ret;
}
@@ -662,7 +719,7 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
int nfsd_pool_stats_release(struct inode *inode, struct file *file)
{
int ret = seq_release(inode, file);
- struct net *net = &init_net;
+ struct net *net = inode->i_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
/* this function really, really should have been called svc_put() */
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 65ec595e2226..9c769a47ac5a 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -4,6 +4,7 @@
* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
*/
+#include "vfs.h"
#include "xdr.h"
#include "auth.h"
@@ -100,12 +101,14 @@ decode_sattr(__be32 *p, struct iattr *iap)
iap->ia_mode = tmp;
}
if ((tmp = ntohl(*p++)) != (u32)-1) {
- iap->ia_valid |= ATTR_UID;
- iap->ia_uid = tmp;
+ iap->ia_uid = make_kuid(&init_user_ns, tmp);
+ if (uid_valid(iap->ia_uid))
+ iap->ia_valid |= ATTR_UID;
}
if ((tmp = ntohl(*p++)) != (u32)-1) {
- iap->ia_valid |= ATTR_GID;
- iap->ia_gid = tmp;
+ iap->ia_gid = make_kgid(&init_user_ns, tmp);
+ if (gid_valid(iap->ia_gid))
+ iap->ia_valid |= ATTR_GID;
}
if ((tmp = ntohl(*p++)) != (u32)-1) {
iap->ia_valid |= ATTR_SIZE;
@@ -151,8 +154,8 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
*p++ = htonl(nfs_ftypes[type >> 12]);
*p++ = htonl((u32) stat->mode);
*p++ = htonl((u32) stat->nlink);
- *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
- *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
+ *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
+ *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
*p++ = htonl(NFS_MAXPATHLEN);
@@ -194,11 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
}
/* Helper function for NFSv2 ACL code */
-__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat)
{
- struct kstat stat;
- vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &stat);
- return encode_fattr(rqstp, p, fhp, &stat);
+ return encode_fattr(rqstp, p, fhp, stat);
}
/*
@@ -246,7 +247,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_readargs *args)
{
unsigned int len;
- int v,pn;
+ int v;
if (!(p = decode_fh(p, &args->fh)))
return 0;
@@ -262,8 +263,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
*/
v=0;
while (len > 0) {
- pn = rqstp->rq_resused++;
- rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+ struct page *p = *(rqstp->rq_next_page++);
+
+ rqstp->rq_vec[v].iov_base = page_address(p);
rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
len -= rqstp->rq_vec[v].iov_len;
v++;
@@ -355,7 +357,7 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli
{
if (!(p = decode_fh(p, &args->fh)))
return 0;
- args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+ args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
}
@@ -396,7 +398,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
if (args->count > PAGE_SIZE)
args->count = PAGE_SIZE;
- args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+ args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e036894bce57..1a8c7391f7ae 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -150,6 +150,12 @@ struct nfsd4_channel_attrs {
u32 rdma_attrs;
};
+struct nfsd4_cb_sec {
+ u32 flavor; /* (u32)(-1) used to mean "no valid flavor" */
+ kuid_t uid;
+ kgid_t gid;
+};
+
struct nfsd4_create_session {
clientid_t clientid;
struct nfs4_sessionid sessionid;
@@ -158,8 +164,12 @@ struct nfsd4_create_session {
struct nfsd4_channel_attrs fore_channel;
struct nfsd4_channel_attrs back_channel;
u32 callback_prog;
- u32 uid;
- u32 gid;
+ struct nfsd4_cb_sec cb_sec;
+};
+
+struct nfsd4_backchannel_ctl {
+ u32 bc_cb_program;
+ struct nfsd4_cb_sec bc_cb_sec;
};
struct nfsd4_bind_conn_to_session {
@@ -192,6 +202,7 @@ struct nfsd4_session {
struct nfs4_sessionid se_sessionid;
struct nfsd4_channel_attrs se_fchannel;
struct nfsd4_channel_attrs se_bchannel;
+ struct nfsd4_cb_sec se_cb_sec;
struct list_head se_conns;
u32 se_cb_prog;
u32 se_cb_seq_nr;
@@ -221,13 +232,12 @@ struct nfsd4_sessionid {
*/
struct nfs4_client {
struct list_head cl_idhash; /* hash by cl_clientid.id */
- struct list_head cl_strhash; /* hash by cl_name */
+ struct rb_node cl_namenode; /* link into by-name trees */
struct list_head cl_openowners;
struct idr cl_stateids; /* stateid lookup */
struct list_head cl_delegations;
struct list_head cl_lru; /* tail queue */
struct xdr_netobj cl_name; /* id generated by client */
- char cl_recdir[HEXDIR_LEN]; /* recovery dir */
nfs4_verifier cl_verifier; /* generated by client */
time_t cl_time; /* time of last lease renewal */
struct sockaddr_storage cl_addr; /* client ipaddress */
@@ -242,9 +252,11 @@ struct nfs4_client {
#define NFSD4_CLIENT_CB_KILL (1)
#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */
#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */
+#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */
#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
1 << NFSD4_CLIENT_CB_KILL)
unsigned long cl_flags;
+ struct rpc_cred *cl_cb_cred;
struct rpc_clnt *cl_cb_client;
u32 cl_cb_ident;
#define NFSD4_CB_UP 0
@@ -271,6 +283,7 @@ struct nfs4_client {
unsigned long cl_cb_slot_busy;
struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
/* wait here for slots */
+ struct net *net;
};
static inline void
@@ -292,6 +305,7 @@ is_client_expired(struct nfs4_client *clp)
*/
struct nfs4_client_reclaim {
struct list_head cr_strhash; /* hash by cr_name */
+ struct nfs4_client *cr_clp; /* pointer to associated clp */
char cr_recdir[HEXDIR_LEN]; /* recover dir */
};
@@ -452,25 +466,26 @@ extern __be32 nfs4_preprocess_stateid_op(struct net *net,
stateid_t *stateid, int flags, struct file **filp);
extern void nfs4_lock_state(void);
extern void nfs4_unlock_state(void);
-extern int nfs4_in_grace(void);
-extern void nfs4_release_reclaim(void);
-extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp);
-extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions);
+void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
+extern void nfs4_release_reclaim(struct nfsd_net *);
+extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
+ struct nfsd_net *nn);
+extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn);
extern void nfs4_free_openowner(struct nfs4_openowner *);
extern void nfs4_free_lockowner(struct nfs4_lockowner *);
extern int set_callback_cred(void);
+extern void nfsd4_init_callback(struct nfsd4_callback *);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
-extern void nfsd4_do_callback_rpc(struct work_struct *);
extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfs4_put_delegation(struct nfs4_delegation *dp);
-extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
-extern int nfs4_client_to_reclaim(const char *name);
-extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
+extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
+ struct nfsd_net *nn);
+extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
extern void release_session_client(struct nfsd4_session *);
extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
@@ -480,5 +495,28 @@ extern void nfsd4_client_tracking_exit(struct net *net);
extern void nfsd4_client_record_create(struct nfs4_client *clp);
extern void nfsd4_client_record_remove(struct nfs4_client *clp);
extern int nfsd4_client_record_check(struct nfs4_client *clp);
-extern void nfsd4_record_grace_done(struct net *net, time_t boot_time);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
+
+/* nfs fault injection functions */
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+int nfsd_fault_inject_init(void);
+void nfsd_fault_inject_cleanup(void);
+u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64));
+struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t);
+
+u64 nfsd_forget_client(struct nfs4_client *, u64);
+u64 nfsd_forget_client_locks(struct nfs4_client*, u64);
+u64 nfsd_forget_client_openowners(struct nfs4_client *, u64);
+u64 nfsd_forget_client_delegations(struct nfs4_client *, u64);
+u64 nfsd_recall_client_delegations(struct nfs4_client *, u64);
+
+u64 nfsd_print_client(struct nfs4_client *, u64);
+u64 nfsd_print_client_locks(struct nfs4_client *, u64);
+u64 nfsd_print_client_openowners(struct nfs4_client *, u64);
+u64 nfsd_print_client_delegations(struct nfs4_client *, u64);
+#else /* CONFIG_NFSD_FAULT_INJECTION */
+static inline int nfsd_fault_inject_init(void) { return 0; }
+static inline void nfsd_fault_inject_cleanup(void) {}
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
+
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c120b48ec305..2a7eb536de0b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -401,8 +401,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
/* Revoke setuid/setgid on chown */
if (!S_ISDIR(inode->i_mode) &&
- (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
- ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
+ (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) ||
+ ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) {
iap->ia_valid |= ATTR_KILL_PRIV;
if (iap->ia_valid & ATTR_MODE) {
/* we're setting mode too, just clear the s*id bits */
@@ -886,7 +886,7 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
{
struct svc_rqst *rqstp = sd->u.data;
- struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
+ struct page **pp = rqstp->rq_next_page;
struct page *page = buf->page;
size_t size;
@@ -894,17 +894,15 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
if (rqstp->rq_res.page_len == 0) {
get_page(page);
- put_page(*pp);
- *pp = page;
- rqstp->rq_resused++;
+ put_page(*rqstp->rq_next_page);
+ *(rqstp->rq_next_page++) = page;
rqstp->rq_res.page_base = buf->offset;
rqstp->rq_res.page_len = size;
} else if (page != pp[-1]) {
get_page(page);
- if (*pp)
- put_page(*pp);
- *pp = page;
- rqstp->rq_resused++;
+ if (*rqstp->rq_next_page)
+ put_page(*rqstp->rq_next_page);
+ *(rqstp->rq_next_page++) = page;
rqstp->rq_res.page_len += size;
} else
rqstp->rq_res.page_len += size;
@@ -936,7 +934,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
.u.data = rqstp,
};
- rqstp->rq_resused = 1;
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
} else {
oldfs = get_fs();
@@ -981,7 +979,7 @@ static void kill_suid(struct dentry *dentry)
*/
static int wait_for_concurrent_writes(struct file *file)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
static ino_t last_ino;
static dev_t last_dev;
int err = 0;
@@ -1020,28 +1018,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
inode = dentry->d_inode;
exp = fhp->fh_export;
- /*
- * Request sync writes if
- * - the sync export option has been set, or
- * - the client requested O_SYNC behavior (NFSv3 feature).
- * - The file system doesn't support fsync().
- * When NFSv2 gathered writes have been configured for this volume,
- * flushing the data to disk is handled separately below.
- */
use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
- if (!file->f_op->fsync) {/* COMMIT3 cannot work */
- stable = 2;
- *stablep = 2; /* FILE_SYNC */
- }
-
if (!EX_ISSYNC(exp))
stable = 0;
- if (stable && !use_wgather) {
- spin_lock(&file->f_lock);
- file->f_flags |= O_SYNC;
- spin_unlock(&file->f_lock);
- }
/* Write the data. */
oldfs = get_fs(); set_fs(KERNEL_DS);
@@ -1057,8 +1037,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (inode->i_mode & (S_ISUID | S_ISGID))
kill_suid(dentry);
- if (stable && use_wgather)
- host_err = wait_for_concurrent_writes(file);
+ if (stable) {
+ if (use_wgather)
+ host_err = wait_for_concurrent_writes(file);
+ else
+ host_err = vfs_fsync_range(file, offset, offset+*cnt, 0);
+ }
out_nfserr:
dprintk("nfsd: write complete host_err=%d\n", host_err);
@@ -1086,7 +1070,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (err)
return err;
- inode = file->f_path.dentry->d_inode;
+ inode = file_inode(file);
/* Get readahead parameters */
ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
@@ -1221,7 +1205,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
* send along the gid on create when it tries to implement
* setgid directories via NFS:
*/
- if (current_fsuid() != 0)
+ if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
if (iap->ia_valid)
return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
@@ -1485,13 +1469,19 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
case NFS3_CREATE_EXCLUSIVE:
if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
&& dchild->d_inode->i_atime.tv_sec == v_atime
- && dchild->d_inode->i_size == 0 )
+ && dchild->d_inode->i_size == 0 ) {
+ if (created)
+ *created = 1;
break;
+ }
case NFS4_CREATE_EXCLUSIVE4_1:
if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
&& dchild->d_inode->i_atime.tv_sec == v_atime
- && dchild->d_inode->i_size == 0 )
+ && dchild->d_inode->i_size == 0 ) {
+ if (created)
+ *created = 1;
goto set_attr;
+ }
/* fallthru */
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
@@ -1967,7 +1957,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
offset = *offsetp;
while (1) {
- struct inode *dir_inode = file->f_path.dentry->d_inode;
+ struct inode *dir_inode = file_inode(file);
unsigned int reclen;
cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -2160,7 +2150,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
* with NFSv3.
*/
if ((acc & NFSD_MAY_OWNER_OVERRIDE) &&
- inode->i_uid == current_fsuid())
+ uid_eq(inode->i_uid, current_fsuid()))
return 0;
/* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 359594c393d2..5b5894159f22 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -6,6 +6,7 @@
#define LINUX_NFSD_VFS_H
#include "nfsfh.h"
+#include "nfsd.h"
/*
* Flags for nfsd_permission
@@ -125,4 +126,11 @@ static inline void fh_drop_write(struct svc_fh *fh)
}
}
+static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat)
+{
+ struct path p = {.mnt = fh->fh_export->ex_path.mnt,
+ .dentry = fh->fh_dentry};
+ return nfserrno(vfs_getattr(&p, stat));
+}
+
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 53b1863dd8f6..4f0481d63804 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -167,7 +167,7 @@ int nfssvc_encode_entry(void *, const char *name,
int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
/* Helper functions for NFSv2 ACL code */
-__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp);
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat);
__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp);
#endif /* LINUX_NFSD_H */
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 7df980eb0562..b6d5542a4ac8 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -136,6 +136,7 @@ struct nfsd3_accessres {
__be32 status;
struct svc_fh fh;
__u32 access;
+ struct kstat stat;
};
struct nfsd3_readlinkres {
@@ -225,6 +226,7 @@ struct nfsd3_getaclres {
int mask;
struct posix_acl *acl_access;
struct posix_acl *acl_default;
+ struct kstat stat;
};
/* dummy type for release */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index acd127d4ee82..546f8983ecf1 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -385,7 +385,8 @@ struct nfsd4_write {
u64 wr_offset; /* request */
u32 wr_stable_how; /* request */
u32 wr_buflen; /* request */
- int wr_vlen;
+ struct kvec wr_head;
+ struct page ** wr_pagelist; /* request */
u32 wr_bytes_written; /* response */
u32 wr_how_written; /* response */
@@ -462,6 +463,7 @@ struct nfsd4_op {
/* NFSv4.1 */
struct nfsd4_exchange_id exchange_id;
+ struct nfsd4_backchannel_ctl backchannel_ctl;
struct nfsd4_bind_conn_to_session bind_conn_to_session;
struct nfsd4_create_session create_session;
struct nfsd4_destroy_session destroy_session;
@@ -526,6 +528,14 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
|| nfsd4_is_solo_sequence(resp);
}
+static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+ return argp->opcnt == resp->opcnt;
+}
+
#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
static inline void
@@ -553,7 +563,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- struct dentry *dentry, __be32 *buffer, int *countp,
+ struct dentry *dentry, __be32 **buffer, int countp,
u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *,
@@ -566,6 +576,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
struct nfsd4_sequence *seq);
extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
extern __be32 nfsd4_create_session(struct svc_rqst *,
struct nfsd4_compound_state *,
@@ -579,7 +590,7 @@ extern __be32 nfsd4_destroy_session(struct svc_rqst *,
extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
- struct nfsd4_open *open);
+ struct nfsd4_open *open, struct nfsd_net *nn);
extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
struct svc_fh *current_fh, struct nfsd4_open *open);
extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 251da07b2a1d..80da8eb27393 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,5 @@
config NILFS2_FS
- tristate "NILFS2 file system support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ tristate "NILFS2 file system support"
select CRC32
help
NILFS2 is a log-structured file system (LFS) supporting continuous
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index df1a7fb238d1..f30b017740a7 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -259,7 +259,7 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
loff_t pos = filp->f_pos;
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 16f35f7423c5..08fdb77852ac 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -67,7 +67,7 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
struct nilfs_transaction_info ti;
int ret = 0;
@@ -126,7 +126,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
nilfs_transaction_commit(inode->i_sb);
mapped:
- wait_on_page_writeback(page);
+ wait_for_stable_page(page);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
@@ -167,7 +167,6 @@ const struct file_operations nilfs_file_operations = {
};
const struct inode_operations nilfs_file_inode_operations = {
- .truncate = nilfs_truncate,
.setattr = nilfs_setattr,
.permission = nilfs_permission,
.fiemap = nilfs_fiemap,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 4d31d2cca7fd..6b49f14eac8c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -213,6 +213,16 @@ static int nilfs_set_page_dirty(struct page *page)
return ret;
}
+void nilfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ nilfs_truncate(inode);
+ }
+}
+
static int nilfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -227,10 +237,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
err = block_write_begin(mapping, pos, len, flags, pagep,
nilfs_get_block);
if (unlikely(err)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
-
+ nilfs_write_failed(mapping, pos + len);
nilfs_transaction_abort(inode->i_sb);
}
return err;
@@ -259,6 +266,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_mapping->host;
ssize_t size;
@@ -278,7 +286,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
- vmtruncate(inode, isize);
+ nilfs_write_failed(mapping, end);
}
return size;
@@ -786,10 +794,8 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
inode_dio_wait(inode);
-
- err = vmtruncate(inode, iattr->ia_size);
- if (unlikely(err))
- goto out_err;
+ truncate_setsize(inode, iattr->ia_size);
+ nilfs_truncate(inode);
}
setattr_copy(inode, iattr);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index fdb180769485..b44bdb291b84 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -664,8 +664,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
if (ret < 0)
printk(KERN_ERR "NILFS: GC failed during preparation: "
"cannot read source blocks: err=%d\n", ret);
- else
+ else {
+ if (nilfs_sb_need_update(nilfs))
+ set_nilfs_discontinued(nilfs);
ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+ }
nilfs_remove_all_gcinodes(nilfs);
clear_nilfs_gc_running(nilfs);
@@ -793,7 +796,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
void __user *argp = (void __user *)arg;
switch (cmd) {
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1d0c0b84c5a3..9de78f08989e 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -517,11 +517,11 @@ static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) {
*lenp = NILFS_FID_SIZE_CONNECTABLE;
- return 255;
+ return FILEID_INVALID;
}
if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) {
*lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
- return 255;
+ return FILEID_INVALID;
}
fid->cno = root->cno;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 74cece80e9a3..9bc72dec3fa6 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -277,6 +277,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
extern void nilfs_truncate(struct inode *);
extern void nilfs_evict_inode(struct inode *);
extern int nilfs_setattr(struct dentry *, struct iattr *);
+extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
int nilfs_permission(struct inode *inode, int mask);
int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
extern int nilfs_inode_dirty(struct inode *);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index f1626f5011c5..ff00a0b7acb9 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -527,7 +527,8 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
if (unlikely(err)) {
loff_t isize = inode->i_size;
if (pos + blocksize > isize)
- vmtruncate(inode, isize);
+ nilfs_write_failed(inode->i_mapping,
+ pos + blocksize);
goto failed_inode;
}
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 3344bdd5506e..2bfe6dc413a0 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -174,7 +174,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
struct dnotify_struct **prev;
struct inode *inode;
- inode = filp->f_path.dentry->d_inode;
+ inode = file_inode(filp);
if (!S_ISDIR(inode->i_mode))
return;
@@ -201,7 +201,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
/* nothing else could have found us thanks to the dnotify_mark_mutex */
if (dn_mark->dn == NULL)
- fsnotify_destroy_mark(fsn_mark);
+ fsnotify_destroy_mark(fsn_mark, dnotify_group);
mutex_unlock(&dnotify_mark_mutex);
@@ -296,7 +296,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
}
/* dnotify only works on directories */
- inode = filp->f_path.dentry->d_inode;
+ inode = file_inode(filp);
if (!S_ISDIR(inode->i_mode)) {
error = -ENOTDIR;
goto out_err;
@@ -385,7 +385,7 @@ out:
spin_unlock(&fsn_mark->lock);
if (destroy)
- fsnotify_destroy_mark(fsn_mark);
+ fsnotify_destroy_mark(fsn_mark, dnotify_group);
mutex_unlock(&dnotify_mark_mutex);
fsnotify_put_mark(fsn_mark);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index a50636025364..0c2f9122b262 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -18,6 +18,12 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
old->tgid == new->tgid) {
switch (old->data_type) {
case (FSNOTIFY_EVENT_PATH):
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ /* dont merge two permission events */
+ if ((old->mask & FAN_ALL_PERM_EVENTS) &&
+ (new->mask & FAN_ALL_PERM_EVENTS))
+ return false;
+#endif
if ((old->path.mnt == new->path.mnt) &&
(old->path.dentry == new->path.dentry))
return true;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index a5cd9bba022f..5d8444268a16 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -397,8 +397,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
wake_up(&group->fanotify_data.access_waitq);
#endif
+
+ if (file->f_flags & FASYNC)
+ fsnotify_fasync(-1, file, 0);
+
/* matches the fanotify_init->fsnotify_alloc_group */
- fsnotify_put_group(group);
+ fsnotify_destroy_group(group);
return 0;
}
@@ -462,7 +466,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
ret = -ENOTDIR;
if ((flags & FAN_MARK_ONLYDIR) &&
- !(S_ISDIR(f.file->f_path.dentry->d_inode->i_mode))) {
+ !(S_ISDIR(file_inode(f.file)->i_mode))) {
fdput(f);
goto out;
}
@@ -493,7 +497,8 @@ out:
static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
__u32 mask,
- unsigned int flags)
+ unsigned int flags,
+ int *destroy)
{
__u32 oldmask;
@@ -507,8 +512,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
}
spin_unlock(&fsn_mark->lock);
- if (!(oldmask & ~mask))
- fsnotify_destroy_mark(fsn_mark);
+ *destroy = !(oldmask & ~mask);
return mask & oldmask;
}
@@ -519,12 +523,17 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
{
struct fsnotify_mark *fsn_mark = NULL;
__u32 removed;
+ int destroy_mark;
fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
if (!fsn_mark)
return -ENOENT;
- removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
+ removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
+ &destroy_mark);
+ if (destroy_mark)
+ fsnotify_destroy_mark(fsn_mark, group);
+
fsnotify_put_mark(fsn_mark);
if (removed & real_mount(mnt)->mnt_fsnotify_mask)
fsnotify_recalc_vfsmount_mask(mnt);
@@ -538,12 +547,16 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
{
struct fsnotify_mark *fsn_mark = NULL;
__u32 removed;
+ int destroy_mark;
fsn_mark = fsnotify_find_inode_mark(group, inode);
if (!fsn_mark)
return -ENOENT;
- removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
+ removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
+ &destroy_mark);
+ if (destroy_mark)
+ fsnotify_destroy_mark(fsn_mark, group);
/* matches the fsnotify_find_inode_mark() */
fsnotify_put_mark(fsn_mark);
if (removed & inode->i_fsnotify_mask)
@@ -710,13 +723,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
break;
default:
fd = -EINVAL;
- goto out_put_group;
+ goto out_destroy_group;
}
if (flags & FAN_UNLIMITED_QUEUE) {
fd = -EPERM;
if (!capable(CAP_SYS_ADMIN))
- goto out_put_group;
+ goto out_destroy_group;
group->max_events = UINT_MAX;
} else {
group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
@@ -725,7 +738,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
if (flags & FAN_UNLIMITED_MARKS) {
fd = -EPERM;
if (!capable(CAP_SYS_ADMIN))
- goto out_put_group;
+ goto out_destroy_group;
group->fanotify_data.max_marks = UINT_MAX;
} else {
group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
@@ -733,12 +746,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
if (fd < 0)
- goto out_put_group;
+ goto out_destroy_group;
return fd;
-out_put_group:
- fsnotify_put_group(group);
+out_destroy_group:
+ fsnotify_destroy_group(group);
return fd;
}
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 514c4b81483d..238a5930cb3c 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -27,13 +27,13 @@ static int show_fdinfo(struct seq_file *m, struct file *f,
struct fsnotify_mark *mark;
int ret = 0;
- spin_lock(&group->mark_lock);
+ mutex_lock(&group->mark_mutex);
list_for_each_entry(mark, &group->marks_list, g_list) {
ret = show(m, mark);
if (ret)
break;
}
- spin_unlock(&group->mark_lock);
+ mutex_unlock(&group->mark_mutex);
return ret;
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 6baadb5a8430..4bb21d67d9b1 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -52,7 +52,6 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
void __fsnotify_update_child_dentry_flags(struct inode *inode)
{
struct dentry *alias;
- struct hlist_node *p;
int watched;
if (!S_ISDIR(inode->i_mode))
@@ -64,7 +63,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
spin_lock(&inode->i_lock);
/* run all of the dentries associated with this inode. Since this is a
* directory, there damn well better only be one item on this list */
- hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {
struct dentry *child;
/* run all of the children of the original inode and fix their
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 63fc294a4692..bd2625bd88b4 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -33,9 +33,6 @@
*/
void fsnotify_final_destroy_group(struct fsnotify_group *group)
{
- /* clear the notification queue of all events */
- fsnotify_flush_notify(group);
-
if (group->ops->free_group_priv)
group->ops->free_group_priv(group);
@@ -43,23 +40,30 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group)
}
/*
- * Trying to get rid of a group. We need to first get rid of any outstanding
- * allocations and then free the group. Remember that fsnotify_clear_marks_by_group
- * could miss marks that are being freed by inode and those marks could still
- * hold a reference to this group (via group->num_marks) If we get into that
- * situtation, the fsnotify_final_destroy_group will get called when that final
- * mark is freed.
+ * Trying to get rid of a group. Remove all marks, flush all events and release
+ * the group reference.
+ * Note that another thread calling fsnotify_clear_marks_by_group() may still
+ * hold a ref to the group.
*/
-static void fsnotify_destroy_group(struct fsnotify_group *group)
+void fsnotify_destroy_group(struct fsnotify_group *group)
{
/* clear all inode marks for this group */
fsnotify_clear_marks_by_group(group);
synchronize_srcu(&fsnotify_mark_srcu);
- /* past the point of no return, matches the initial value of 1 */
- if (atomic_dec_and_test(&group->num_marks))
- fsnotify_final_destroy_group(group);
+ /* clear the notification queue of all events */
+ fsnotify_flush_notify(group);
+
+ fsnotify_put_group(group);
+}
+
+/*
+ * Get reference to a group.
+ */
+void fsnotify_get_group(struct fsnotify_group *group)
+{
+ atomic_inc(&group->refcnt);
}
/*
@@ -68,7 +72,7 @@ static void fsnotify_destroy_group(struct fsnotify_group *group)
void fsnotify_put_group(struct fsnotify_group *group)
{
if (atomic_dec_and_test(&group->refcnt))
- fsnotify_destroy_group(group);
+ fsnotify_final_destroy_group(group);
}
/*
@@ -84,21 +88,24 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
/* set to 0 when there a no external references to this group */
atomic_set(&group->refcnt, 1);
- /*
- * hits 0 when there are no external references AND no marks for
- * this group
- */
- atomic_set(&group->num_marks, 1);
+ atomic_set(&group->num_marks, 0);
mutex_init(&group->notification_mutex);
INIT_LIST_HEAD(&group->notification_list);
init_waitqueue_head(&group->notification_waitq);
group->max_events = UINT_MAX;
- spin_lock_init(&group->mark_lock);
+ mutex_init(&group->mark_mutex);
INIT_LIST_HEAD(&group->marks_list);
group->ops = ops;
return group;
}
+
+int fsnotify_fasync(int fd, struct file *file, int on)
+{
+ struct fsnotify_group *group = file->private_data;
+
+ return fasync_helper(fd, file, on, &group->fsn_fa) >= 0 ? 0 : -EIO;
+}
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index f3035691f528..74825be65b7b 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -36,12 +36,11 @@
static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
{
struct fsnotify_mark *mark;
- struct hlist_node *pos;
__u32 new_mask = 0;
assert_spin_locked(&inode->i_lock);
- hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list)
+ hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list)
new_mask |= mark->mask;
inode->i_fsnotify_mask = new_mask;
}
@@ -63,8 +62,8 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
{
struct inode *inode = mark->i.inode;
+ BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
assert_spin_locked(&mark->lock);
- assert_spin_locked(&mark->group->mark_lock);
spin_lock(&inode->i_lock);
@@ -87,11 +86,11 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
void fsnotify_clear_marks_by_inode(struct inode *inode)
{
struct fsnotify_mark *mark, *lmark;
- struct hlist_node *pos, *n;
+ struct hlist_node *n;
LIST_HEAD(free_list);
spin_lock(&inode->i_lock);
- hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) {
+ hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) {
list_add(&mark->i.free_i_list, &free_list);
hlist_del_init_rcu(&mark->i.i_list);
fsnotify_get_mark(mark);
@@ -99,8 +98,16 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)
spin_unlock(&inode->i_lock);
list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
- fsnotify_destroy_mark(mark);
+ struct fsnotify_group *group;
+
+ spin_lock(&mark->lock);
+ fsnotify_get_group(mark->group);
+ group = mark->group;
+ spin_unlock(&mark->lock);
+
+ fsnotify_destroy_mark(mark, group);
fsnotify_put_mark(mark);
+ fsnotify_put_group(group);
}
}
@@ -121,11 +128,10 @@ static struct fsnotify_mark *fsnotify_find_inode_mark_locked(
struct inode *inode)
{
struct fsnotify_mark *mark;
- struct hlist_node *pos;
assert_spin_locked(&inode->i_lock);
- hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) {
+ hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) {
if (mark->group == group) {
fsnotify_get_mark(mark);
return mark;
@@ -186,14 +192,13 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group, struct inode *inode,
int allow_dups)
{
- struct fsnotify_mark *lmark;
- struct hlist_node *node, *last = NULL;
+ struct fsnotify_mark *lmark, *last = NULL;
int ret = 0;
mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
+ BUG_ON(!mutex_is_locked(&group->mark_mutex));
assert_spin_locked(&mark->lock);
- assert_spin_locked(&group->mark_lock);
spin_lock(&inode->i_lock);
@@ -206,8 +211,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
}
/* should mark be in the middle of the current list? */
- hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) {
- last = node;
+ hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) {
+ last = lmark;
if ((lmark->group == group) && !allow_dups) {
ret = -EEXIST;
@@ -227,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
BUG_ON(last == NULL);
/* mark should be the last entry. last is the current last entry */
- hlist_add_after_rcu(last, &mark->i.i_list);
+ hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list);
out:
fsnotify_recalc_inode_mask_locked(inode);
spin_unlock(&inode->i_lock);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e3cbd746f64a..4216308b81b4 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -118,6 +118,7 @@ static int inotify_handle_event(struct fsnotify_group *group,
fsn_event_priv = &event_priv->fsnotify_event_priv_data;
+ fsnotify_get_group(group);
fsn_event_priv->group = group;
event_priv->wd = wd;
@@ -131,7 +132,7 @@ static int inotify_handle_event(struct fsnotify_group *group,
}
if (inode_mark->mask & IN_ONESHOT)
- fsnotify_destroy_mark(inode_mark);
+ fsnotify_destroy_mark(inode_mark, group);
return ret;
}
@@ -196,7 +197,6 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
{
/* ideally the idr is empty and we won't hit the BUG in the callback */
idr_for_each(&group->inotify_data.idr, idr_callback, group);
- idr_remove_all(&group->inotify_data.idr);
idr_destroy(&group->inotify_data.idr);
atomic_dec(&group->inotify_data.user->inotify_devs);
free_uid(group->inotify_data.user);
@@ -210,6 +210,7 @@ void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
fsnotify_event_priv_data);
+ fsnotify_put_group(fsn_event_priv->group);
kmem_cache_free(event_priv_cachep, event_priv);
}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 36cb013c7c13..e0f7c1241a6a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -265,7 +265,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
ret = -EAGAIN;
if (file->f_flags & O_NONBLOCK)
break;
- ret = -EINTR;
+ ret = -ERESTARTSYS;
if (signal_pending(current))
break;
@@ -281,23 +281,17 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
return ret;
}
-static int inotify_fasync(int fd, struct file *file, int on)
-{
- struct fsnotify_group *group = file->private_data;
-
- return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO;
-}
-
static int inotify_release(struct inode *ignored, struct file *file)
{
struct fsnotify_group *group = file->private_data;
pr_debug("%s: group=%p\n", __func__, group);
- fsnotify_clear_marks_by_group(group);
+ if (file->f_flags & FASYNC)
+ fsnotify_fasync(-1, file, 0);
/* free this group, matching get was inotify_init->fsnotify_obtain_group */
- fsnotify_put_group(group);
+ fsnotify_destroy_group(group);
return 0;
}
@@ -339,7 +333,7 @@ static const struct file_operations inotify_fops = {
.show_fdinfo = inotify_show_fdinfo,
.poll = inotify_poll,
.read = inotify_read,
- .fasync = inotify_fasync,
+ .fasync = fsnotify_fasync,
.release = inotify_release,
.unlocked_ioctl = inotify_ioctl,
.compat_ioctl = inotify_ioctl,
@@ -370,22 +364,20 @@ static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
{
int ret;
- do {
- if (unlikely(!idr_pre_get(idr, GFP_KERNEL)))
- return -ENOMEM;
+ idr_preload(GFP_KERNEL);
+ spin_lock(idr_lock);
- spin_lock(idr_lock);
- ret = idr_get_new_above(idr, i_mark, *last_wd + 1,
- &i_mark->wd);
+ ret = idr_alloc(idr, i_mark, *last_wd + 1, 0, GFP_NOWAIT);
+ if (ret >= 0) {
/* we added the mark to the idr, take a reference */
- if (!ret) {
- *last_wd = i_mark->wd;
- fsnotify_get_mark(&i_mark->fsn_mark);
- }
- spin_unlock(idr_lock);
- } while (ret == -EAGAIN);
+ i_mark->wd = ret;
+ *last_wd = i_mark->wd;
+ fsnotify_get_mark(&i_mark->fsn_mark);
+ }
- return ret;
+ spin_unlock(idr_lock);
+ idr_preload_end();
+ return ret < 0 ? ret : 0;
}
static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group,
@@ -521,13 +513,13 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_event_private_data *fsn_event_priv;
int ret;
+ i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
+
ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
FSNOTIFY_EVENT_NONE, NULL, 0,
GFP_NOFS);
if (!ignored_event)
- return;
-
- i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
+ goto skip_send_ignore;
event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
if (unlikely(!event_priv))
@@ -535,6 +527,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
fsn_event_priv = &event_priv->fsnotify_event_priv_data;
+ fsnotify_get_group(group);
fsn_event_priv->group = group;
event_priv->wd = i_mark->wd;
@@ -548,9 +541,9 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
}
skip_send_ignore:
-
/* matches the reference taken when the event was created */
- fsnotify_put_event(ignored_event);
+ if (ignored_event)
+ fsnotify_put_event(ignored_event);
/* remove this mark from the idr */
inotify_remove_from_idr(group, i_mark);
@@ -581,8 +574,6 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
/* don't allow invalid bits: we don't want flags set */
mask = inotify_arg_to_mask(arg);
- if (unlikely(!(mask & IN_ALL_EVENTS)))
- return -EINVAL;
fsn_mark = fsnotify_find_inode_mark(group, inode);
if (!fsn_mark)
@@ -634,8 +625,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
/* don't allow invalid bits: we don't want flags set */
mask = inotify_arg_to_mask(arg);
- if (unlikely(!(mask & IN_ALL_EVENTS)))
- return -EINVAL;
tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
if (unlikely(!tmp_i_mark))
@@ -709,12 +698,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
group->inotify_data.last_wd = 0;
- group->inotify_data.fa = NULL;
group->inotify_data.user = get_current_user();
if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
inotify_max_user_instances) {
- fsnotify_put_group(group);
+ fsnotify_destroy_group(group);
return ERR_PTR(-EMFILE);
}
@@ -743,7 +731,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
ret = anon_inode_getfd("inotify", &inotify_fops, group,
O_RDONLY | flags);
if (ret < 0)
- fsnotify_put_group(group);
+ fsnotify_destroy_group(group);
return ret;
}
@@ -819,7 +807,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
ret = 0;
- fsnotify_destroy_mark(&i_mark->fsn_mark);
+ fsnotify_destroy_mark(&i_mark->fsn_mark, group);
/* match ref taken by inotify_idr_find */
fsnotify_put_mark(&i_mark->fsn_mark);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index f104d565b682..fc6b49bf7360 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -109,8 +109,11 @@ void fsnotify_get_mark(struct fsnotify_mark *mark)
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
- if (atomic_dec_and_test(&mark->refcnt))
+ if (atomic_dec_and_test(&mark->refcnt)) {
+ if (mark->group)
+ fsnotify_put_group(mark->group);
mark->free_mark(mark);
+ }
}
/*
@@ -118,14 +121,14 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
* The caller had better be holding a reference to this mark so we don't actually
* do the final put under the mark->lock
*/
-void fsnotify_destroy_mark(struct fsnotify_mark *mark)
+void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
+ struct fsnotify_group *group)
{
- struct fsnotify_group *group;
struct inode *inode = NULL;
- spin_lock(&mark->lock);
+ BUG_ON(!mutex_is_locked(&group->mark_mutex));
- group = mark->group;
+ spin_lock(&mark->lock);
/* something else already called this function on this mark */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
@@ -135,8 +138,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
- spin_lock(&group->mark_lock);
-
if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
inode = mark->i.inode;
fsnotify_destroy_inode_mark(mark);
@@ -147,13 +148,22 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
list_del_init(&mark->g_list);
- spin_unlock(&group->mark_lock);
spin_unlock(&mark->lock);
+ if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
+ iput(inode);
+ /* release lock temporarily */
+ mutex_unlock(&group->mark_mutex);
+
spin_lock(&destroy_lock);
list_add(&mark->destroy_list, &destroy_list);
spin_unlock(&destroy_lock);
wake_up(&destroy_waitq);
+ /*
+ * We don't necessarily have a ref on mark from caller so the above destroy
+ * may have actually freed it, unless this group provides a 'freeing_mark'
+ * function which must be holding a reference.
+ */
/*
* Some groups like to know that marks are being freed. This is a
@@ -175,21 +185,17 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
* is just a lazy update (and could be a perf win...)
*/
- if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
- iput(inode);
+ atomic_dec(&group->num_marks);
- /*
- * We don't necessarily have a ref on mark from caller so the above iput
- * may have already destroyed it. Don't touch from now on.
- */
+ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+}
- /*
- * it's possible that this group tried to destroy itself, but this
- * this mark was simultaneously being freed by inode. If that's the
- * case, we finish freeing the group here.
- */
- if (unlikely(atomic_dec_and_test(&group->num_marks)))
- fsnotify_final_destroy_group(group);
+void fsnotify_destroy_mark(struct fsnotify_mark *mark,
+ struct fsnotify_group *group)
+{
+ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ fsnotify_destroy_mark_locked(mark, group);
+ mutex_unlock(&group->mark_mutex);
}
void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
@@ -214,26 +220,26 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas
* These marks may be used for the fsnotify backend to determine which
* event types should be delivered to which group.
*/
-int fsnotify_add_mark(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct inode *inode,
- struct vfsmount *mnt, int allow_dups)
+int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
+ struct fsnotify_group *group, struct inode *inode,
+ struct vfsmount *mnt, int allow_dups)
{
int ret = 0;
BUG_ON(inode && mnt);
BUG_ON(!inode && !mnt);
+ BUG_ON(!mutex_is_locked(&group->mark_mutex));
/*
* LOCKING ORDER!!!!
+ * group->mark_mutex
* mark->lock
- * group->mark_lock
* inode->i_lock
*/
spin_lock(&mark->lock);
- spin_lock(&group->mark_lock);
-
mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
+ fsnotify_get_group(group);
mark->group = group;
list_add(&mark->g_list, &group->marks_list);
atomic_inc(&group->num_marks);
@@ -251,11 +257,8 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
BUG();
}
- spin_unlock(&group->mark_lock);
-
/* this will pin the object if appropriate */
fsnotify_set_mark_mask_locked(mark, mark->mask);
-
spin_unlock(&mark->lock);
if (inode)
@@ -265,10 +268,10 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
err:
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
list_del_init(&mark->g_list);
+ fsnotify_put_group(group);
mark->group = NULL;
atomic_dec(&group->num_marks);
- spin_unlock(&group->mark_lock);
spin_unlock(&mark->lock);
spin_lock(&destroy_lock);
@@ -279,6 +282,16 @@ err:
return ret;
}
+int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
+ struct inode *inode, struct vfsmount *mnt, int allow_dups)
+{
+ int ret;
+ mutex_lock(&group->mark_mutex);
+ ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups);
+ mutex_unlock(&group->mark_mutex);
+ return ret;
+}
+
/*
* clear any marks in a group in which mark->flags & flags is true
*/
@@ -286,22 +299,16 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
unsigned int flags)
{
struct fsnotify_mark *lmark, *mark;
- LIST_HEAD(free_list);
- spin_lock(&group->mark_lock);
+ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
if (mark->flags & flags) {
- list_add(&mark->free_g_list, &free_list);
- list_del_init(&mark->g_list);
fsnotify_get_mark(mark);
+ fsnotify_destroy_mark_locked(mark, group);
+ fsnotify_put_mark(mark);
}
}
- spin_unlock(&group->mark_lock);
-
- list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
- fsnotify_destroy_mark(mark);
- fsnotify_put_mark(mark);
- }
+ mutex_unlock(&group->mark_mutex);
}
/*
@@ -317,6 +324,8 @@ void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *ol
assert_spin_locked(&old->lock);
new->i.inode = old->i.inode;
new->m.mnt = old->m.mnt;
+ if (old->group)
+ fsnotify_get_group(old->group);
new->group = old->group;
new->mask = old->mask;
new->free_mark = old->free_mark;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 48cb994e4922..7b51b05f160c 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -225,6 +225,7 @@ alloc_holder:
mutex_unlock(&group->notification_mutex);
wake_up(&group->notification_waitq);
+ kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
return return_event;
}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index b7b4b0e8554f..68ca5a8704b5 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -33,12 +33,12 @@
void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
struct fsnotify_mark *mark, *lmark;
- struct hlist_node *pos, *n;
+ struct hlist_node *n;
struct mount *m = real_mount(mnt);
LIST_HEAD(free_list);
spin_lock(&mnt->mnt_root->d_lock);
- hlist_for_each_entry_safe(mark, pos, n, &m->mnt_fsnotify_marks, m.m_list) {
+ hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) {
list_add(&mark->m.free_m_list, &free_list);
hlist_del_init_rcu(&mark->m.m_list);
fsnotify_get_mark(mark);
@@ -46,8 +46,16 @@ void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
spin_unlock(&mnt->mnt_root->d_lock);
list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) {
- fsnotify_destroy_mark(mark);
+ struct fsnotify_group *group;
+
+ spin_lock(&mark->lock);
+ fsnotify_get_group(mark->group);
+ group = mark->group;
+ spin_unlock(&mark->lock);
+
+ fsnotify_destroy_mark(mark, group);
fsnotify_put_mark(mark);
+ fsnotify_put_group(group);
}
}
@@ -63,12 +71,11 @@ static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
{
struct mount *m = real_mount(mnt);
struct fsnotify_mark *mark;
- struct hlist_node *pos;
__u32 new_mask = 0;
assert_spin_locked(&mnt->mnt_root->d_lock);
- hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list)
+ hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list)
new_mask |= mark->mask;
m->mnt_fsnotify_mask = new_mask;
}
@@ -88,8 +95,8 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
{
struct vfsmount *mnt = mark->m.mnt;
+ BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
assert_spin_locked(&mark->lock);
- assert_spin_locked(&mark->group->mark_lock);
spin_lock(&mnt->mnt_root->d_lock);
@@ -106,11 +113,10 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_
{
struct mount *m = real_mount(mnt);
struct fsnotify_mark *mark;
- struct hlist_node *pos;
assert_spin_locked(&mnt->mnt_root->d_lock);
- hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) {
+ hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) {
if (mark->group == group) {
fsnotify_get_mark(mark);
return mark;
@@ -145,14 +151,13 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
int allow_dups)
{
struct mount *m = real_mount(mnt);
- struct fsnotify_mark *lmark;
- struct hlist_node *node, *last = NULL;
+ struct fsnotify_mark *lmark, *last = NULL;
int ret = 0;
mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
+ BUG_ON(!mutex_is_locked(&group->mark_mutex));
assert_spin_locked(&mark->lock);
- assert_spin_locked(&group->mark_lock);
spin_lock(&mnt->mnt_root->d_lock);
@@ -165,8 +170,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
}
/* should mark be in the middle of the current list? */
- hlist_for_each_entry(lmark, node, &m->mnt_fsnotify_marks, m.m_list) {
- last = node;
+ hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) {
+ last = lmark;
if ((lmark->group == group) && !allow_dups) {
ret = -EEXIST;
@@ -186,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
BUG_ON(last == NULL);
/* mark should be the last entry. last is the current last entry */
- hlist_add_after_rcu(last, &mark->m.m_list);
+ hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list);
out:
fsnotify_recalc_vfsmount_mask_locked(mnt);
spin_unlock(&mnt->mnt_root->d_lock);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 99e36107ff60..aa411c3f20e9 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1101,7 +1101,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
loff_t fpos, i_size;
- struct inode *bmp_vi, *vdir = filp->f_path.dentry->d_inode;
+ struct inode *bmp_vi, *vdir = file_inode(filp);
struct super_block *sb = vdir->i_sb;
ntfs_inode *ndir = NTFS_I(vdir);
ntfs_volume *vol = NTFS_SB(sb);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1ecf46448f85..5b2d4f0853ac 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1762,6 +1762,16 @@ err_out:
return err;
}
+static void ntfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ ntfs_truncate_vfs(inode);
+ }
+}
+
/**
* ntfs_file_buffered_write -
*
@@ -2022,8 +2032,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
* allocated space, which is not a disaster.
*/
i_size = i_size_read(vi);
- if (pos + bytes > i_size)
- vmtruncate(vi, i_size);
+ if (pos + bytes > i_size) {
+ ntfs_write_failed(mapping, pos + bytes);
+ }
break;
}
}
@@ -2227,7 +2238,6 @@ const struct file_operations ntfs_file_ops = {
const struct inode_operations ntfs_file_inode_ops = {
#ifdef NTFS_RW
- .truncate = ntfs_truncate_vfs,
.setattr = ntfs_setattr,
#endif /* NTFS_RW */
};
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 1d27331e6fc9..d3e118cc6ffa 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2866,9 +2866,11 @@ conv_err_out:
*
* See ntfs_truncate() description above for details.
*/
+#ifdef NTFS_RW
void ntfs_truncate_vfs(struct inode *vi) {
ntfs_truncate(vi);
}
+#endif
/**
* ntfs_setattr - called from notify_change() when an attribute is being changed
@@ -2914,8 +2916,10 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
NInoCompressed(ni) ?
"compressed" : "encrypted");
err = -EOPNOTSUPP;
- } else
- err = vmtruncate(vi, attr->ia_size);
+ } else {
+ truncate_setsize(vi, attr->ia_size);
+ ntfs_truncate_vfs(vi);
+ }
if (err || ia_valid == ATTR_SIZE)
goto out;
} else {
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index db29695f845c..76b6cfb579d7 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -316,6 +316,10 @@ static inline void ntfs_commit_inode(struct inode *vi)
return;
}
+#else
+
+static inline void ntfs_truncate_vfs(struct inode *vi) {}
+
#endif /* NTFS_RW */
#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 260b16281fc3..8a404576fb26 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -65,7 +65,20 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
- acl->a_entries[n].e_id = le32_to_cpu(entry->e_id);
+ switch(acl->a_entries[n].e_tag) {
+ case ACL_USER:
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ case ACL_GROUP:
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ default:
+ break;
+ }
value += sizeof(struct posix_acl_entry);
}
@@ -91,7 +104,21 @@ static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
for (n = 0; n < acl->a_count; n++, entry++) {
entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
- entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
+ switch(acl->a_entries[n].e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns,
+ acl->a_entries[n].e_uid));
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns,
+ acl->a_entries[n].e_gid));
+ break;
+ default:
+ entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+ break;
+ }
}
return ocfs2_acl;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 31b9463fba1f..b8a9d87231b1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6751,8 +6751,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
mlog_errno(ret);
out:
- if (pages)
- kfree(pages);
+ kfree(pages);
return ret;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 657743254eb9..20dfec72e903 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -569,7 +569,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
int ret,
bool is_async)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
int level;
wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
@@ -593,9 +593,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
level = ocfs2_iocb_rw_locked_level(iocb);
ocfs2_rw_unlock(inode, level);
+ inode_dio_done(inode);
if (is_async)
aio_complete(iocb, ret, 0);
- inode_dio_done(inode);
}
/*
@@ -626,7 +626,7 @@ static ssize_t ocfs2_direct_IO(int rw,
unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+ struct inode *inode = file_inode(file)->i_mapping->host;
/*
* Fallback to buffered I/O if we see an inode without
@@ -1194,6 +1194,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
goto out;
}
}
+ wait_for_stable_page(wc->w_pages[i]);
if (index == target_index)
wc->w_target_page = wc->w_pages[i];
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f7c648d7d6bf..42252bf64b51 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1471,8 +1471,7 @@ static void o2hb_region_release(struct config_item *item)
mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
- if (reg->hr_tmp_block)
- kfree(reg->hr_tmp_block);
+ kfree(reg->hr_tmp_block);
if (reg->hr_slot_data) {
for (i = 0; i < reg->hr_num_pages; i++) {
@@ -1486,8 +1485,7 @@ static void o2hb_region_release(struct config_item *item)
if (reg->hr_bdev)
blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
- if (reg->hr_slots)
- kfree(reg->hr_slots);
+ kfree(reg->hr_slots);
kfree(reg->hr_db_regnum);
kfree(reg->hr_db_livenodes);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1bfe8802cc1e..aa88bd8bcedc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -304,28 +304,22 @@ static u8 o2net_num_from_nn(struct o2net_node *nn)
static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
{
- int ret = 0;
-
- do {
- if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
- ret = -EAGAIN;
- break;
- }
- spin_lock(&nn->nn_lock);
- ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
- if (ret == 0)
- list_add_tail(&nsw->ns_node_item,
- &nn->nn_status_list);
- spin_unlock(&nn->nn_lock);
- } while (ret == -EAGAIN);
+ int ret;
- if (ret == 0) {
- init_waitqueue_head(&nsw->ns_wq);
- nsw->ns_sys_status = O2NET_ERR_NONE;
- nsw->ns_status = 0;
+ spin_lock(&nn->nn_lock);
+ ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC);
+ if (ret >= 0) {
+ nsw->ns_id = ret;
+ list_add_tail(&nsw->ns_node_item, &nn->nn_status_list);
}
+ spin_unlock(&nn->nn_lock);
+ if (ret < 0)
+ return ret;
- return ret;
+ init_waitqueue_head(&nsw->ns_wq);
+ nsw->ns_sys_status = O2NET_ERR_NONE;
+ nsw->ns_status = 0;
+ return 0;
}
static void o2net_complete_nsw_locked(struct o2net_node *nn,
@@ -870,7 +864,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
/* we've had some trouble with handlers seemingly vanishing. */
mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
&parent) == NULL,
- "couldn't find handler we *just* registerd "
+ "couldn't find handler we *just* registered "
"for type %u key %08x\n", msg_type, key);
}
write_unlock(&o2net_handler_lock);
@@ -1165,10 +1159,8 @@ out:
o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
if (sc)
sc_put(sc);
- if (vec)
- kfree(vec);
- if (msg)
- kfree(msg);
+ kfree(vec);
+ kfree(msg);
o2net_complete_nsw(nn, &nsw, 0, 0, 0);
return ret;
}
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 8db4b58b2e4b..ef999729e274 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -169,11 +169,10 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
u64 parent_blkno,
int skip_unhashed)
{
- struct hlist_node *p;
struct dentry *dentry;
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
spin_lock(&dentry->d_lock);
if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
trace_ocfs2_find_local_alias(dentry->d_name.len,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8fe4e2892ab9..f1e1aed8f638 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -67,7 +67,6 @@
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static unsigned char ocfs2_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
@@ -2015,12 +2014,12 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
int error = 0;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int lock_level = 0;
trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
- error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+ error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
if (lock_level && error >= 0) {
/* We release EX lock which used to update atime
* and get PR lock again to reduce contention
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 9e89d70df337..dbb17c07656a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -319,9 +319,7 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
if (dlm->master_hash)
dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
- if (dlm->name)
- kfree(dlm->name);
-
+ kfree(dlm->name);
kfree(dlm);
}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 005261c333b0..33ecbe0e6734 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2020,7 +2020,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
int ignore_higher, u8 request_from, u32 flags)
{
struct dlm_work_item *item;
- item = kzalloc(sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_ATOMIC);
if (!item)
return -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 01ebfd0bdad7..eeac97bb3bfa 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2083,7 +2083,6 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
u8 dead_node, u8 new_master)
{
int i;
- struct hlist_node *hash_iter;
struct hlist_head *bucket;
struct dlm_lock_resource *res, *next;
@@ -2114,7 +2113,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
* if necessary */
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
+ hlist_for_each_entry(res, bucket, hash_node) {
if (!(res->state & DLM_LOCK_RES_RECOVERING))
continue;
@@ -2273,7 +2272,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
{
- struct hlist_node *iter;
struct dlm_lock_resource *res;
int i;
struct hlist_head *bucket;
@@ -2299,7 +2297,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
*/
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, iter, bucket, hash_node) {
+ hlist_for_each_entry(res, bucket, hash_node) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 16b712d260d4..4c5fc8d77dc2 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -224,7 +224,7 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
{
int event = 0;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct dlmfs_inode_private *ip = DLMFS_I(inode);
poll_wait(file, &ip->ip_lockres.l_event, wait);
@@ -245,7 +245,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
int bytes_left;
ssize_t readlen, got;
char *lvb_buf;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
@@ -293,7 +293,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
int bytes_left;
ssize_t writelen;
char *lvb_buf;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4f7795fb5fc0..12ae194ac943 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2045,8 +2045,8 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_version = OCFS2_LVB_VERSION;
lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
- lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
- lvb->lvb_igid = cpu_to_be32(inode->i_gid);
+ lvb->lvb_iuid = cpu_to_be32(i_uid_read(inode));
+ lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
lvb->lvb_iatime_packed =
@@ -2095,8 +2095,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
else
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
- inode->i_gid = be32_to_cpu(lvb->lvb_igid);
+ i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid));
+ i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
ocfs2_unpack_timespec(&inode->i_atime,
@@ -2545,6 +2545,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
* everything is up to the caller :) */
status = ocfs2_should_refresh_lock_res(lockres);
if (status < 0) {
+ ocfs2_cluster_unlock(osb, lockres, level);
mlog_errno(status);
goto bail;
}
@@ -2553,8 +2554,10 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
ocfs2_complete_lock_res_refresh(lockres, status);
- if (status < 0)
+ if (status < 0) {
+ ocfs2_cluster_unlock(osb, lockres, level);
mlog_errno(status);
+ }
ocfs2_track_lock_refresh(lockres);
}
bail:
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 322216a5f0dd..29651167190d 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -195,11 +195,11 @@ static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
if (parent && (len < 6)) {
*max_len = 6;
- type = 255;
+ type = FILEID_INVALID;
goto bail;
} else if (len < 3) {
*max_len = 3;
- type = 255;
+ type = FILEID_INVALID;
goto bail;
}
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f487aa343442..1c39efb71bab 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -282,8 +282,7 @@ search:
spin_unlock(&oi->ip_lock);
out:
- if (new_emi)
- kfree(new_emi);
+ kfree(new_emi);
}
static int ocfs2_last_eb_is_empty(struct inode *inode,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index fe492e1a3cfc..6474cb44004d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1116,7 +1116,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
(unsigned long long)OCFS2_I(inode)->ip_blkno,
dentry->d_name.len, dentry->d_name.name,
attr->ia_valid, attr->ia_mode,
- attr->ia_uid, attr->ia_gid);
+ from_kuid(&init_user_ns, attr->ia_uid),
+ from_kgid(&init_user_ns, attr->ia_gid));
/* ensuring we don't even attempt to truncate a symlink */
if (S_ISLNK(inode->i_mode))
@@ -1174,14 +1175,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
/*
* Gather pointers to quota structures so that allocation /
* freeing of quota structures happens here and not inside
* dquot_transfer() where we have problems with lock ordering
*/
- if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
+ if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
@@ -1190,7 +1191,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto bail_unlock;
}
}
- if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+ if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
@@ -1218,24 +1219,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- /*
- * This will intentionally not wind up calling truncate_setsize(),
- * since all the work for a size change has been done above.
- * Otherwise, we could get into problems with truncate as
- * ip_alloc_sem is used there to protect against i_size
- * changes.
- *
- * XXX: this means the conditional below can probably be removed.
- */
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- status = vmtruncate(inode, attr->ia_size);
- if (status) {
- mlog_errno(status);
- goto bail_commit;
- }
- }
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
@@ -1967,7 +1950,7 @@ out:
int ocfs2_change_file_space(struct file *file, unsigned int cmd,
struct ocfs2_space_resv *sr)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int ret;
@@ -1995,7 +1978,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_space_resv sr;
int change_size = 1;
@@ -2250,7 +2233,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
loff_t old_size, *ppos = &iocb->ki_pos;
u32 old_clusters;
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
@@ -2534,7 +2517,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
unsigned int flags)
{
int ret = 0, lock_level = 0;
- struct inode *inode = in->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(in);
trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2544,7 +2527,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
/*
* See the comment in ocfs2_file_aio_read()
*/
- ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto bail;
@@ -2564,7 +2547,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
{
int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
struct file *filp = iocb->ki_filp;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2607,7 +2590,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
* like i_size. This allows the checks down below
* generic_file_aio_read() a chance of actually working.
*/
- ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto bail;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index d89e08a81eda..f87f9bd1edff 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -269,8 +269,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_generation = le32_to_cpu(fe->i_generation);
inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
inode->i_mode = le16_to_cpu(fe->i_mode);
- inode->i_uid = le32_to_cpu(fe->i_uid);
- inode->i_gid = le32_to_cpu(fe->i_gid);
+ i_uid_write(inode, le32_to_cpu(fe->i_uid));
+ i_gid_write(inode, le32_to_cpu(fe->i_gid));
/* Fast symlinks will have i_size but no allocated clusters. */
if (S_ISLNK(inode->i_mode) && !fe->i_clusters) {
@@ -1259,8 +1259,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_size = cpu_to_le64(i_size_read(inode));
ocfs2_set_links_count(fe, inode->i_nlink);
- fe->i_uid = cpu_to_le32(inode->i_uid);
- fe->i_gid = cpu_to_le32(inode->i_gid);
+ fe->i_uid = cpu_to_le32(i_uid_read(inode));
+ fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
@@ -1290,8 +1290,8 @@ void ocfs2_refresh_inode(struct inode *inode,
ocfs2_set_inode_flags(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
set_nlink(inode, ocfs2_read_links_count(fe));
- inode->i_uid = le32_to_cpu(fe->i_uid);
- inode->i_gid = le32_to_cpu(fe->i_gid);
+ i_uid_write(inode, le32_to_cpu(fe->i_uid));
+ i_gid_write(inode, le32_to_cpu(fe->i_gid));
inode->i_mode = le16_to_cpu(fe->i_mode);
if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
inode->i_blocks = 0;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f20edcbfe700..752f0b26221d 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -881,7 +881,7 @@ bail:
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
unsigned int flags;
int new_clusters;
int status;
@@ -994,7 +994,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
bool preserve;
struct reflink_arguments args;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ocfs2_info info;
void __user *argp = (void __user *)arg;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 2dd36af79e26..8eccfabcd12e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1234,11 +1234,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
/* Though we wish to avoid it, we are in fact safe in
* skipping local alloc cleanup as fsck.ocfs2 is more
* than capable of reclaiming unused space. */
- if (la_dinode)
- kfree(la_dinode);
-
- if (tl_dinode)
- kfree(tl_dinode);
+ kfree(la_dinode);
+ kfree(tl_dinode);
if (qrec)
ocfs2_free_quota_recovery(qrec);
@@ -1408,8 +1405,7 @@ bail:
mutex_unlock(&osb->recovery_lock);
- if (rm_quota)
- kfree(rm_quota);
+ kfree(rm_quota);
/* no one is callint kthread_stop() for us so the kthread() api
* requires that we call do_exit(). And it isn't exported, but
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index a9f78c74d687..aebeacd807c3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -476,8 +476,7 @@ out:
if (local_alloc_inode)
iput(local_alloc_inode);
- if (alloc_copy)
- kfree(alloc_copy);
+ kfree(alloc_copy);
}
/*
@@ -534,7 +533,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
mlog_errno(status);
bail:
- if ((status < 0) && (*alloc_copy)) {
+ if (status < 0) {
kfree(*alloc_copy);
*alloc_copy = NULL;
}
@@ -1290,8 +1289,7 @@ bail:
if (main_bm_inode)
iput(main_bm_inode);
- if (alloc_copy)
- kfree(alloc_copy);
+ kfree(alloc_copy);
if (ac)
ocfs2_free_alloc_context(ac);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 47a87dda54ce..10d66c75cecb 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -62,7 +62,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct page *page)
{
int ret = VM_FAULT_NOPAGE;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
unsigned int len = PAGE_CACHE_SIZE;
@@ -131,7 +131,7 @@ out:
static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
struct buffer_head *di_bh = NULL;
sigset_t oldset;
int ret;
@@ -180,13 +180,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret = 0, lock_level = 0;
- ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
- file->f_vfsmnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(file_inode(file),
+ file->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
+ ocfs2_inode_unlock(file_inode(file), lock_level);
out:
vma->vm_ops = &ocfs2_file_vm_ops;
return 0;
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 6083432f667e..9f8dcadd9a50 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1055,7 +1055,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
{
int status;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ocfs2_move_extents range;
struct ocfs2_move_extents_context *context = NULL;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f1fd0741162b..04ee1b57c243 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -512,8 +512,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
- fe->i_uid = cpu_to_le32(inode->i_uid);
- fe->i_gid = cpu_to_le32(inode->i_gid);
+ fe->i_uid = cpu_to_le32(i_uid_read(inode));
+ fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 30a055049e16..998b17eda09d 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2927,7 +2927,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
u32 new_cluster, u32 new_len)
{
int ret = 0, partial;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ocfs2_caching_info *ci = INODE_CACHE(inode);
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3020,7 +3020,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
u32 new_cluster, u32 new_len)
{
int ret = 0;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct ocfs2_caching_info *ci = INODE_CACHE(inode);
int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
@@ -4407,7 +4407,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
* rights to do so.
*/
if (preserve) {
- if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
+ if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))
return -EPERM;
if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
return -EPERM;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 94368017edb3..bf1f8930456f 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -376,7 +376,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
out_free:
- if (rc && conn->cc_private)
+ if (rc)
kfree(conn->cc_private);
out:
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f169da4624fd..b7e74b580c0f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -642,7 +642,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,
* cluster groups will be staying in cache for the duration of
* this operation.
*/
- ac->ac_allow_chain_relink = 0;
+ ac->ac_disable_chain_relink = 1;
/* Claim the first region */
status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
@@ -1823,7 +1823,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
* Do this *after* figuring out how many bits we're taking out
* of our target group.
*/
- if (ac->ac_allow_chain_relink &&
+ if (!ac->ac_disable_chain_relink &&
(prev_group_bh) &&
(ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
status = ocfs2_relink_block_group(handle, alloc_inode,
@@ -1928,7 +1928,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
victim = ocfs2_find_victim_chain(cl);
ac->ac_chain = victim;
- ac->ac_allow_chain_relink = 1;
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
@@ -1947,7 +1946,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
* searching each chain in order. Don't allow chain relinking
* because we only calculate enough journal credits for one
* relink per alloc. */
- ac->ac_allow_chain_relink = 0;
+ ac->ac_disable_chain_relink = 1;
for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
if (i == victim)
continue;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index b8afabfeede4..a36d0aa50911 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -49,7 +49,7 @@ struct ocfs2_alloc_context {
/* these are used by the chain search */
u16 ac_chain;
- int ac_allow_chain_relink;
+ int ac_disable_chain_relink;
group_search_t *ac_group_search;
u64 ac_last_group;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 0e91ec22a940..9b6910dec4ba 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2525,8 +2525,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
mlog_errno(status);
finally:
- if (local_alloc)
- kfree(local_alloc);
+ kfree(local_alloc);
if (status)
mlog_errno(status);
@@ -2553,8 +2552,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
* we free it here.
*/
kfree(osb->journal);
- if (osb->local_alloc_copy)
- kfree(osb->local_alloc_copy);
+ kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
ocfs2_put_dlm_debug(osb->osb_dlm_debug);
memset(osb, 0, sizeof(struct ocfs2_super));
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f1fbb4b552ad..66edce7ecfd7 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -57,7 +57,7 @@
static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
{
struct inode *inode = page->mapping->host;
- struct buffer_head *bh;
+ struct buffer_head *bh = NULL;
int status = ocfs2_read_inode_block(inode, &bh);
struct ocfs2_dinode *fe;
const char *link;
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 3d635f4bbb20..f053688d22a3 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -91,8 +91,7 @@ static struct inode **get_local_system_inode(struct ocfs2_super *osb,
} else
osb->local_system_inodes = local_system_inodes;
spin_unlock(&osb->osb_lock);
- if (unlikely(free))
- kfree(free);
+ kfree(free);
}
index = (slot * NUM_LOCAL_SYSTEM_INODES) +
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 0ba9ea1e7961..2e3ea308c144 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7189,7 +7189,7 @@ int ocfs2_init_security_and_acl(struct inode *dir,
struct buffer_head *dir_bh = NULL;
ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
- if (!ret) {
+ if (ret) {
mlog_errno(ret);
goto leave;
}
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index fb5b3ff79dc6..acbaebcad3a8 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -330,7 +330,7 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
u64 fsblock, int hindex)
{
- struct inode *dir = filp->f_dentry->d_inode;
+ struct inode *dir = file_inode(filp);
struct buffer_head *bh;
struct omfs_inode *oi;
u64 self;
@@ -405,7 +405,7 @@ out:
static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *dir = filp->f_dentry->d_inode;
+ struct inode *dir = file_inode(filp);
struct buffer_head *bh;
loff_t offset, res;
unsigned int hchain, hindex;
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 77e3cb2962b4..e0d9b3e722bd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -306,6 +306,16 @@ omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
return mpage_writepages(mapping, wbc, omfs_get_block);
}
+static void omfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ omfs_truncate(inode);
+ }
+}
+
static int omfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -314,11 +324,8 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep,
omfs_get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ omfs_write_failed(mapping, pos + len);
return ret;
}
@@ -350,9 +357,11 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+ truncate_setsize(inode, attr->ia_size);
+ omfs_truncate(inode);
}
setattr_copy(inode, attr);
@@ -362,7 +371,6 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
const struct inode_operations omfs_file_inops = {
.setattr = omfs_setattr,
- .truncate = omfs_truncate
};
const struct address_space_operations omfs_aops = {
diff --git a/fs/open.c b/fs/open.c
index 182d8667b7bd..68354466879f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -30,6 +30,7 @@
#include <linux/fs_struct.h>
#include <linux/ima.h>
#include <linux/dnotify.h>
+#include <linux/compat.h>
#include "internal.h"
@@ -61,33 +62,22 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
return ret;
}
-static long do_sys_truncate(const char __user *pathname, loff_t length)
+long vfs_truncate(struct path *path, loff_t length)
{
- struct path path;
struct inode *inode;
- int error;
+ long error;
- error = -EINVAL;
- if (length < 0) /* sorry, but loff_t says... */
- goto out;
-
- error = user_path(pathname, &path);
- if (error)
- goto out;
- inode = path.dentry->d_inode;
+ inode = path->dentry->d_inode;
/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
- error = -EISDIR;
if (S_ISDIR(inode->i_mode))
- goto dput_and_out;
-
- error = -EINVAL;
+ return -EISDIR;
if (!S_ISREG(inode->i_mode))
- goto dput_and_out;
+ return -EINVAL;
- error = mnt_want_write(path.mnt);
+ error = mnt_want_write(path->mnt);
if (error)
- goto dput_and_out;
+ goto out;
error = inode_permission(inode, MAY_WRITE);
if (error)
@@ -111,25 +101,53 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
error = locks_verify_truncate(inode, NULL, length);
if (!error)
- error = security_path_truncate(&path);
+ error = security_path_truncate(path);
if (!error)
- error = do_truncate(path.dentry, length, 0, NULL);
+ error = do_truncate(path->dentry, length, 0, NULL);
put_write_and_out:
put_write_access(inode);
mnt_drop_write_and_out:
- mnt_drop_write(path.mnt);
-dput_and_out:
- path_put(&path);
+ mnt_drop_write(path->mnt);
out:
return error;
}
+EXPORT_SYMBOL_GPL(vfs_truncate);
+
+static long do_sys_truncate(const char __user *pathname, loff_t length)
+{
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+ struct path path;
+ int error;
+
+ if (length < 0) /* sorry, but loff_t says... */
+ return -EINVAL;
+
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ if (!error) {
+ error = vfs_truncate(&path, length);
+ path_put(&path);
+ }
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+ return error;
+}
SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
return do_sys_truncate(path, length);
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
+{
+ return do_sys_truncate(path, length);
+}
+#endif
+
static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
struct inode *inode;
@@ -185,6 +203,13 @@ SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
return ret;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
+{
+ return do_sys_ftruncate(fd, length, 1);
+}
+#endif
+
/* LFS versions of truncate are only needed on 32 bit machines */
#if BITS_PER_LONG == 32
SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length)
@@ -218,7 +243,7 @@ SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
long ret;
if (offset < 0 || len <= 0)
@@ -306,6 +331,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
struct path path;
struct inode *inode;
int res;
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
return -EINVAL;
@@ -328,8 +354,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
}
old_cred = override_creds(override_cred);
-
- res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+retry:
+ res = user_path_at(dfd, filename, lookup_flags, &path);
if (res)
goto out;
@@ -364,6 +390,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
out_path_release:
path_put(&path);
+ if (retry_estale(res, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
revert_creds(old_cred);
put_cred(override_cred);
@@ -379,8 +409,9 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
struct path path;
int error;
-
- error = user_path_dir(filename, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+retry:
+ error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
if (error)
goto out;
@@ -392,6 +423,10 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
dput_and_out:
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
@@ -406,7 +441,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
if (!f.file)
goto out;
- inode = f.file->f_path.dentry->d_inode;
+ inode = file_inode(f.file);
error = -ENOTDIR;
if (!S_ISDIR(inode->i_mode))
@@ -425,8 +460,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
struct path path;
int error;
-
- error = user_path_dir(filename, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+retry:
+ error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
if (error)
goto out;
@@ -445,6 +481,10 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
error = 0;
dput_and_out:
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
@@ -489,11 +529,16 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode
{
struct path path;
int error;
-
- error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+ error = user_path_at(dfd, filename, lookup_flags, &path);
if (!error) {
error = chmod_common(&path, mode);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
}
return error;
}
@@ -552,6 +597,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
+retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;
@@ -562,6 +608,10 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
mnt_drop_write(path.mnt);
out_release:
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
@@ -654,7 +704,7 @@ static int do_dentry_open(struct file *f,
f->f_mode = FMODE_PATH;
path_get(&f->f_path);
- inode = f->f_path.dentry->d_inode;
+ inode = f->f_inode = f->f_path.dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
error = __get_file_write_access(inode, f->f_path.mnt);
if (error)
@@ -664,7 +714,6 @@ static int do_dentry_open(struct file *f,
}
f->f_mapping = inode->i_mapping;
- f->f_pos = 0;
file_sb_list_add(f, inode->i_sb);
if (unlikely(f->f_mode & FMODE_PATH)) {
@@ -718,6 +767,7 @@ cleanup_file:
path_put(&f->f_path);
f->f_path.mnt = NULL;
f->f_path.dentry = NULL;
+ f->f_inode = NULL;
return error;
}
@@ -775,23 +825,22 @@ struct file *dentry_open(const struct path *path, int flags,
/* We must always pass in a valid mount pointer. */
BUG_ON(!path->mnt);
- error = -ENFILE;
f = get_empty_filp();
- if (f == NULL)
- return ERR_PTR(error);
-
- f->f_flags = flags;
- f->f_path = *path;
- error = do_dentry_open(f, NULL, cred);
- if (!error) {
- error = open_check_o_direct(f);
- if (error) {
- fput(f);
+ if (!IS_ERR(f)) {
+ f->f_flags = flags;
+ f->f_path = *path;
+ error = do_dentry_open(f, NULL, cred);
+ if (!error) {
+ /* from now on we need fput() to dispose of f */
+ error = open_check_o_direct(f);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
+ }
+ } else {
+ put_filp(f);
f = ERR_PTR(error);
}
- } else {
- put_filp(f);
- f = ERR_PTR(error);
}
return f;
}
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 2ad080faca34..ae47fa7efb9d 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -262,7 +262,7 @@ found:
static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct op_inode_info *oi = OP_I(inode);
struct device_node *dp = oi->u.node;
struct device_node *child;
diff --git a/fs/pipe.c b/fs/pipe.c
index bd3479db4b62..64a494cef0a0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -361,7 +361,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
unsigned long nr_segs, loff_t pos)
{
struct file *filp = iocb->ki_filp;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct pipe_inode_info *pipe;
int do_wakeup;
ssize_t ret;
@@ -486,7 +486,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
unsigned long nr_segs, loff_t ppos)
{
struct file *filp = iocb->ki_filp;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct pipe_inode_info *pipe;
ssize_t ret;
int do_wakeup;
@@ -677,7 +677,7 @@ bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct pipe_inode_info *pipe;
int count, buf, nrbufs;
@@ -705,7 +705,7 @@ static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
unsigned int mask;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct pipe_inode_info *pipe = inode->i_pipe;
int nrbufs;
@@ -758,7 +758,7 @@ pipe_release(struct inode *inode, int decr, int decw)
static int
pipe_read_fasync(int fd, struct file *filp, int on)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int retval;
mutex_lock(&inode->i_mutex);
@@ -772,7 +772,7 @@ pipe_read_fasync(int fd, struct file *filp, int on)
static int
pipe_write_fasync(int fd, struct file *filp, int on)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int retval;
mutex_lock(&inode->i_mutex);
@@ -786,7 +786,7 @@ pipe_write_fasync(int fd, struct file *filp, int on)
static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct pipe_inode_info *pipe = inode->i_pipe;
int retval;
@@ -1037,13 +1037,13 @@ int create_pipe_files(struct file **res, int flags)
err = -ENFILE;
f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
- if (!f)
+ if (IS_ERR(f))
goto err_dentry;
f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops);
- if (!res[0])
+ if (IS_ERR(res[0]))
goto err_file;
path_get(&path);
@@ -1226,7 +1226,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
*/
struct pipe_inode_info *get_pipe_info(struct file *file)
{
- struct inode *i = file->f_path.dentry->d_inode;
+ struct inode *i = file_inode(file);
return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
}
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 981b05601931..712f24db9600 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -8,7 +8,8 @@ proc-y := nommu.o task_nommu.o
proc-$(CONFIG_MMU) := mmu.o task_mmu.o
proc-y += inode.o root.o base.o generic.o array.o \
- proc_tty.o fd.o
+ fd.o
+proc-$(CONFIG_TTY) += proc_tty.o
proc-y += cmdline.o
proc-y += consoles.o
proc-y += cpuinfo.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 6a91e6ffbcbd..f7ed9ee46eb9 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -449,7 +449,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
- gtime += t->gtime;
+ gtime += task_gtime(t);
t = next_thread(t);
} while (t != task);
@@ -472,7 +472,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
min_flt = task->min_flt;
maj_flt = task->maj_flt;
task_cputime_adjusted(task, &utime, &stime);
- gtime = task->gtime;
+ gtime = task_gtime(task);
}
/* scale priority and nice values from timeslices to -20..20 */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5a5a0be40e40..69078c7cef1f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -73,6 +73,7 @@
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
+#include <linux/printk.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
@@ -383,7 +384,7 @@ static int lstats_open(struct inode *inode, struct file *file)
static ssize_t lstats_write(struct file *file, const char __user *buf,
size_t count, loff_t *offs)
{
- struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
@@ -542,13 +543,6 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
- if (error)
- return error;
- }
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
return 0;
@@ -609,7 +603,7 @@ static const struct inode_operations proc_def_inode_operations = {
static ssize_t proc_info_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
unsigned long page;
ssize_t length;
struct task_struct *task = get_proc_task(inode);
@@ -675,7 +669,7 @@ static const struct file_operations proc_single_file_operations = {
static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
struct mm_struct *mm;
if (!task)
@@ -876,7 +870,7 @@ static const struct file_operations proc_environ_operations = {
static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
char buffer[PROC_NUMBUF];
int oom_adj = OOM_ADJUST_MIN;
size_t len;
@@ -923,7 +917,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
goto out;
}
- task = get_proc_task(file->f_path.dentry->d_inode);
+ task = get_proc_task(file_inode(file));
if (!task) {
err = -ESRCH;
goto out;
@@ -959,7 +953,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
* /proc/pid/oom_adj is provided for legacy purposes, ask users to use
* /proc/pid/oom_score_adj instead.
*/
- printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+ pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
current->comm, task_pid_nr(current), task_pid_nr(task),
task_pid_nr(task));
@@ -983,7 +977,7 @@ static const struct file_operations proc_oom_adj_operations = {
static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
char buffer[PROC_NUMBUF];
short oom_score_adj = OOM_SCORE_ADJ_MIN;
unsigned long flags;
@@ -1026,7 +1020,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
goto out;
}
- task = get_proc_task(file->f_path.dentry->d_inode);
+ task = get_proc_task(file_inode(file));
if (!task) {
err = -ESRCH;
goto out;
@@ -1074,7 +1068,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
struct task_struct *task = get_proc_task(inode);
ssize_t length;
char tmpbuf[TMPBUFLEN];
@@ -1091,7 +1085,7 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
char *page, *tmp;
ssize_t length;
uid_t loginuid;
@@ -1149,7 +1143,7 @@ static const struct file_operations proc_loginuid_operations = {
static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
struct task_struct *task = get_proc_task(inode);
ssize_t length;
char tmpbuf[TMPBUFLEN];
@@ -1172,7 +1166,7 @@ static const struct file_operations proc_sessionid_operations = {
static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
char buffer[PROC_NUMBUF];
size_t len;
int make_it_fail;
@@ -1204,7 +1198,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
if (*end)
return -EINVAL;
- task = get_proc_task(file->f_dentry->d_inode);
+ task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
task->make_it_fail = make_it_fail;
@@ -1244,7 +1238,7 @@ static ssize_t
sched_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct task_struct *p;
p = get_proc_task(inode);
@@ -1295,7 +1289,7 @@ static ssize_t
sched_autogroup_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct task_struct *p;
char buffer[PROC_NUMBUF];
int nice;
@@ -1350,7 +1344,7 @@ static const struct file_operations proc_pid_sched_autogroup_operations = {
static ssize_t comm_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct task_struct *p;
char buffer[TASK_COMM_LEN];
@@ -1718,7 +1712,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
if (!capable(CAP_SYS_ADMIN)) {
- status = -EACCES;
+ status = -EPERM;
goto out_notask;
}
@@ -1851,7 +1845,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
struct dentry *result;
struct mm_struct *mm;
- result = ERR_PTR(-EACCES);
+ result = ERR_PTR(-EPERM);
if (!capable(CAP_SYS_ADMIN))
goto out;
@@ -1907,7 +1901,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
ino_t ino;
int ret;
- ret = -EACCES;
+ ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
goto out;
@@ -2153,7 +2147,7 @@ out_no_task:
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
char *p = NULL;
ssize_t length;
struct task_struct *task = get_proc_task(inode);
@@ -2174,7 +2168,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
char *page;
ssize_t length;
struct task_struct *task = get_proc_task(inode);
@@ -2263,7 +2257,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = {
static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
struct mm_struct *mm;
char buffer[PROC_NUMBUF];
size_t len;
@@ -2315,7 +2309,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
goto out_no_task;
ret = -ESRCH;
- task = get_proc_task(file->f_dentry->d_inode);
+ task = get_proc_task(file_inode(file));
if (!task)
goto out_no_task;
@@ -2625,6 +2619,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%d", pid);
+ /* no ->d_hash() rejects on procfs */
dentry = d_hash_and_lookup(mnt->mnt_root, &name);
if (dentry) {
shrink_dcache_parent(dentry);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 7b3ae3cc0ef9..4b3b3ffb52f1 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -15,6 +15,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/printk.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/idr.h>
@@ -42,7 +43,7 @@ static ssize_t
__proc_file_read(struct file *file, char __user *buf, size_t nbytes,
loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
char *page;
ssize_t retval=0;
int eof=0;
@@ -132,11 +133,8 @@ __proc_file_read(struct file *file, char __user *buf, size_t nbytes,
}
if (start == NULL) {
- if (n > PAGE_SIZE) {
- printk(KERN_ERR
- "proc_file_read: Apparent buffer overflow!\n");
+ if (n > PAGE_SIZE) /* Apparent buffer overflow */
n = PAGE_SIZE;
- }
n -= *ppos;
if (n <= 0)
break;
@@ -144,26 +142,19 @@ __proc_file_read(struct file *file, char __user *buf, size_t nbytes,
n = count;
start = page + *ppos;
} else if (start < page) {
- if (n > PAGE_SIZE) {
- printk(KERN_ERR
- "proc_file_read: Apparent buffer overflow!\n");
+ if (n > PAGE_SIZE) /* Apparent buffer overflow */
n = PAGE_SIZE;
- }
if (n > count) {
/*
* Don't reduce n because doing so might
* cut off part of a data block.
*/
- printk(KERN_WARNING
- "proc_file_read: Read count exceeded\n");
+ pr_warn("proc_file_read: count exceeded\n");
}
} else /* start >= page */ {
unsigned long startoff = (unsigned long)(start - page);
- if (n > (PAGE_SIZE - startoff)) {
- printk(KERN_ERR
- "proc_file_read: Apparent buffer overflow!\n");
+ if (n > (PAGE_SIZE - startoff)) /* buffer overflow? */
n = PAGE_SIZE - startoff;
- }
if (n > count)
n = count;
}
@@ -188,7 +179,7 @@ static ssize_t
proc_file_read(struct file *file, char __user *buf, size_t nbytes,
loff_t *ppos)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
spin_lock(&pde->pde_unload_lock);
@@ -209,7 +200,7 @@ static ssize_t
proc_file_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
if (pde->write_proc) {
@@ -261,16 +252,9 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
if (error)
return error;
- if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, iattr->ia_size);
- if (error)
- return error;
- }
-
setattr_copy(inode, iattr);
mark_inode_dirty(inode);
-
+
de->uid = inode->i_uid;
de->gid = inode->i_gid;
de->mode = inode->i_mode;
@@ -359,18 +343,18 @@ retry:
if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
return -ENOMEM;
- spin_lock(&proc_inum_lock);
+ spin_lock_irq(&proc_inum_lock);
error = ida_get_new(&proc_inum_ida, &i);
- spin_unlock(&proc_inum_lock);
+ spin_unlock_irq(&proc_inum_lock);
if (error == -EAGAIN)
goto retry;
else if (error)
return error;
if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
- spin_lock(&proc_inum_lock);
+ spin_lock_irq(&proc_inum_lock);
ida_remove(&proc_inum_ida, i);
- spin_unlock(&proc_inum_lock);
+ spin_unlock_irq(&proc_inum_lock);
return -ENOSPC;
}
*inum = PROC_DYNAMIC_FIRST + i;
@@ -379,9 +363,10 @@ retry:
void proc_free_inum(unsigned int inum)
{
- spin_lock(&proc_inum_lock);
+ unsigned long flags;
+ spin_lock_irqsave(&proc_inum_lock, flags);
ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
- spin_unlock(&proc_inum_lock);
+ spin_unlock_irqrestore(&proc_inum_lock, flags);
}
static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -418,8 +403,7 @@ static const struct dentry_operations proc_dentry_operations =
struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode = NULL;
- int error = -ENOENT;
+ struct inode *inode;
spin_lock(&proc_subdir_lock);
for (de = de->subdir; de ; de = de->next) {
@@ -428,22 +412,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
pde_get(de);
spin_unlock(&proc_subdir_lock);
- error = -ENOMEM;
inode = proc_get_inode(dir->i_sb, de);
- goto out_unlock;
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ d_set_d_op(dentry, &proc_dentry_operations);
+ d_add(dentry, inode);
+ return NULL;
}
}
spin_unlock(&proc_subdir_lock);
-out_unlock:
-
- if (inode) {
- d_set_d_op(dentry, &proc_dentry_operations);
- d_add(dentry, inode);
- return NULL;
- }
- if (de)
- pde_put(de);
- return ERR_PTR(error);
+ return ERR_PTR(-ENOENT);
}
struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
@@ -466,7 +444,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
{
unsigned int ino;
int i;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int ret = 0;
ino = inode->i_ino;
@@ -528,7 +506,7 @@ out:
int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
return proc_readdir_de(PDE(inode), filp, dirent, filldir);
}
@@ -582,7 +560,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
for (tmp = dir->subdir; tmp; tmp = tmp->next)
if (strcmp(tmp->name, dp->name) == 0) {
- WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
+ WARN(1, "proc_dir_entry '%s/%s' already registered\n",
dir->name, dp->name);
break;
}
@@ -843,9 +821,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
if (S_ISDIR(de->mode))
parent->nlink--;
de->nlink = 0;
- WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory "
- "'%s/%s', leaking at least '%s'\n", __func__,
- de->parent->name, de->name, de->subdir->name);
+ WARN(de->subdir, "%s: removing non-empty directory "
+ "'%s/%s', leaking at least '%s'\n", __func__,
+ de->parent->name, de->name, de->subdir->name);
pde_put(de);
}
EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 439ae6886507..a86aebc9ba7c 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -13,6 +13,7 @@
#include <linux/stat.h>
#include <linux/completion.h>
#include <linux/poll.h>
+#include <linux/printk.h>
#include <linux/file.h>
#include <linux/limits.h>
#include <linux/init.h>
@@ -144,7 +145,7 @@ void pde_users_dec(struct proc_dir_entry *pde)
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
loff_t rv = -EINVAL;
loff_t (*llseek)(struct file *, loff_t, int);
@@ -179,7 +180,7 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
@@ -201,7 +202,7 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
@@ -223,7 +224,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
unsigned int rv = DEFAULT_POLLMASK;
unsigned int (*poll)(struct file *, struct poll_table_struct *);
@@ -245,7 +246,7 @@ static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *p
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
long (*ioctl)(struct file *, unsigned int, unsigned long);
@@ -268,7 +269,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
#ifdef CONFIG_COMPAT
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
@@ -291,7 +292,7 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
int rv = -EIO;
int (*mmap)(struct file *, struct vm_area_struct *);
@@ -445,12 +446,9 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
- struct inode * inode;
+ struct inode *inode = iget_locked(sb, de->low_ino);
- inode = iget_locked(sb, de->low_ino);
- if (!inode)
- return NULL;
- if (inode->i_state & I_NEW) {
+ if (inode && (inode->i_state & I_NEW)) {
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
PROC_I(inode)->pde = de;
@@ -482,10 +480,12 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
} else
pde_put(de);
return inode;
-}
+}
int proc_fill_super(struct super_block *s)
{
+ struct inode *root_inode;
+
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
@@ -494,11 +494,17 @@ int proc_fill_super(struct super_block *s)
s->s_time_gran = 1;
pde_get(&proc_root);
- s->s_root = d_make_root(proc_get_inode(s, &proc_root));
- if (s->s_root)
- return 0;
+ root_inode = proc_get_inode(s, &proc_root);
+ if (!root_inode) {
+ pr_err("proc_fill_super: get root inode failed\n");
+ return -ENOMEM;
+ }
- printk("proc_read_super: get root inode failed\n");
- pde_put(&proc_root);
- return -ENOMEM;
+ s->s_root = d_make_root(root_inode);
+ if (!s->s_root) {
+ pr_err("proc_fill_super: allocate dentry failed\n");
+ return -ENOMEM;
+ }
+
+ return 0;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 252544c05207..85ff3a4598b3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -11,6 +11,7 @@
#include <linux/sched.h>
#include <linux/proc_fs.h>
+#include <linux/binfmts.h>
struct ctl_table_header;
struct mempolicy;
@@ -108,7 +109,7 @@ static inline int task_dumpable(struct task_struct *task)
if (mm)
dumpable = get_dumpable(mm);
task_unlock(task);
- if (dumpable == SUID_DUMPABLE_ENABLED)
+ if (dumpable == SUID_DUMP_USER)
return 1;
return 0;
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e96d4f18ca3a..eda6f017f272 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -17,6 +17,7 @@
#include <linux/elfcore.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
+#include <linux/printk.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/slab.h>
@@ -619,7 +620,7 @@ static int __init proc_kcore_init(void)
proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
&proc_kcore_operations);
if (!proc_root_kcore) {
- printk(KERN_ERR "couldn't create /proc/kcore\n");
+ pr_err("couldn't create /proc/kcore\n");
return 0; /* Always returns 0. */
}
/* Store text area if it's special */
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 80e4645f7990..1efaaa19c4f3 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
* sysctl_overcommit_ratio / 100) + total_swap_pages;
cached = global_page_state(NR_FILE_PAGES) -
- total_swapcache_pages - i.bufferram;
+ total_swapcache_pages() - i.bufferram;
if (cached < 0)
cached = 0;
@@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(i.freeram),
K(i.bufferram),
K(cached),
- K(total_swapcache_pages),
+ K(total_swapcache_pages()),
K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
K(pages[LRU_ACTIVE_ANON]),
@@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
vmi.used >> 10,
vmi.largest_chunk >> 10
#ifdef CONFIG_MEMORY_FAILURE
- ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+ ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index b1822dde55c2..ccfd99bd1c5a 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -45,7 +45,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
file = region->vm_file;
if (file) {
- struct inode *inode = region->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(region->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
}
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index de20ec480fa0..30b590f5bd35 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -8,6 +8,7 @@
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/printk.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/of.h>
@@ -110,8 +111,8 @@ void proc_device_tree_update_prop(struct proc_dir_entry *pde,
if (ent->data == oldprop)
break;
if (ent == NULL) {
- printk(KERN_WARNING "device-tree: property \"%s\" "
- " does not exist\n", oldprop->name);
+ pr_warn("device-tree: property \"%s\" does not exist\n",
+ oldprop->name);
} else {
ent->data = newprop;
ent->size = newprop->length;
@@ -153,8 +154,8 @@ static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de,
realloc:
fixed_name = kmalloc(fixup_len, GFP_KERNEL);
if (fixed_name == NULL) {
- printk(KERN_ERR "device-tree: Out of memory trying to fixup "
- "name \"%s\"\n", name);
+ pr_err("device-tree: Out of memory trying to fixup "
+ "name \"%s\"\n", name);
return name;
}
@@ -175,8 +176,8 @@ retry:
goto retry;
}
- printk(KERN_WARNING "device-tree: Duplicate name in %s, "
- "renamed to \"%s\"\n", np->full_name, fixed_name);
+ pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n",
+ np->full_name, fixed_name);
return fixed_name;
}
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index fe72cd073dea..b4ac6572474f 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -163,7 +163,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
struct net *net;
ret = -EINVAL;
- net = get_proc_task_net(filp->f_path.dentry->d_inode);
+ net = get_proc_task_net(file_inode(filp));
if (net != NULL) {
ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
put_net(net);
@@ -177,20 +177,6 @@ const struct file_operations proc_net_operations = {
.readdir = proc_tgid_net_readdir,
};
-
-struct proc_dir_entry *proc_net_fops_create(struct net *net,
- const char *name, umode_t mode, const struct file_operations *fops)
-{
- return proc_create(name, mode, net->proc_net, fops);
-}
-EXPORT_SYMBOL_GPL(proc_net_fops_create);
-
-void proc_net_remove(struct net *net, const char *name)
-{
- remove_proc_entry(name, net->proc_net);
-}
-EXPORT_SYMBOL_GPL(proc_net_remove);
-
static __net_init int proc_net_ns_init(struct net *net)
{
struct proc_dir_entry *netd, *net_statd;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 701580ddfcc3..ac05f33a0dde 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -5,6 +5,7 @@
#include <linux/sysctl.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
+#include <linux/printk.h>
#include <linux/security.h>
#include <linux/sched.h>
#include <linux/namei.h>
@@ -57,7 +58,7 @@ static void sysctl_print_dir(struct ctl_dir *dir)
{
if (dir->header.parent)
sysctl_print_dir(dir->header.parent);
- printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname);
+ pr_cont("%s/", dir->header.ctl_table[0].procname);
}
static int namecmp(const char *name1, int len1, const char *name2, int len2)
@@ -134,9 +135,9 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
else if (cmp > 0)
p = &(*p)->rb_right;
else {
- printk(KERN_ERR "sysctl duplicate entry: ");
+ pr_err("sysctl duplicate entry: ");
sysctl_print_dir(head->parent);
- printk(KERN_CONT "/%s\n", entry->procname);
+ pr_cont("/%s\n", entry->procname);
return -EEXIST;
}
}
@@ -478,7 +479,7 @@ out:
static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
size_t count, loff_t *ppos, int write)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
ssize_t error;
@@ -542,7 +543,7 @@ static int proc_sys_open(struct inode *inode, struct file *filp)
static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
unsigned int ret = DEFAULT_POLLMASK;
@@ -736,13 +737,6 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
- if (error)
- return error;
- }
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
return 0;
@@ -934,9 +928,9 @@ found:
subdir->header.nreg++;
failed:
if (unlikely(IS_ERR(subdir))) {
- printk(KERN_ERR "sysctl could not get directory: ");
+ pr_err("sysctl could not get directory: ");
sysctl_print_dir(dir);
- printk(KERN_CONT "/%*.*s %ld\n",
+ pr_cont("/%*.*s %ld\n",
namelen, namelen, name, PTR_ERR(subdir));
}
drop_sysctl_table(&dir->header);
@@ -1002,8 +996,8 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n",
- path, table->procname, &vaf);
+ pr_err("sysctl table check failed: %s/%s %pV\n",
+ path, table->procname, &vaf);
va_end(args);
return -EINVAL;
@@ -1517,9 +1511,9 @@ static void put_links(struct ctl_table_header *header)
drop_sysctl_table(link_head);
}
else {
- printk(KERN_ERR "sysctl link missing during unregister: ");
+ pr_err("sysctl link missing during unregister: ");
sysctl_print_dir(parent);
- printk(KERN_CONT "/%s\n", name);
+ pr_cont("/%s\n", name);
}
}
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 448455b7fd91..3e636d864d56 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -271,7 +271,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
const char *name = NULL;
if (file) {
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -743,7 +743,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return rv;
if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
return -EINVAL;
- task = get_proc_task(file->f_path.dentry->d_inode);
+ task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
mm = get_task_mm(task);
@@ -1015,7 +1015,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
struct mm_struct *mm;
struct pagemapread pm;
int ret = -ESRCH;
@@ -1278,7 +1278,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
walk.mm = mm;
pol = get_vma_policy(task, vma, vma->vm_start);
- mpol_to_str(buffer, sizeof(buffer), pol, 0);
+ mpol_to_str(buffer, sizeof(buffer), pol);
mpol_cond_put(pol);
seq_printf(m, "%08lx %s", vma->vm_start, buffer);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 1ccfa537f5f5..56123a6f462e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -149,7 +149,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
file = vma->vm_file;
if (file) {
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0d5071d29985..b870f740ab5a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -15,6 +15,7 @@
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/printk.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/crash_dump.h>
@@ -175,15 +176,15 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m);
if (!curr_m)
return -EINVAL;
- if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
- tsz = buflen;
-
- /* Calculate left bytes in current memory segment. */
- nr_bytes = (curr_m->size - (start - curr_m->paddr));
- if (tsz > nr_bytes)
- tsz = nr_bytes;
while (buflen) {
+ tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK));
+
+ /* Calculate left bytes in current memory segment. */
+ nr_bytes = (curr_m->size - (start - curr_m->paddr));
+ if (tsz > nr_bytes)
+ tsz = nr_bytes;
+
tmp = read_from_oldmem(buffer, tsz, &start, 1);
if (tmp < 0)
return tmp;
@@ -198,12 +199,6 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
struct vmcore, list);
start = curr_m->paddr;
}
- if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
- tsz = buflen;
- /* Calculate left bytes in current memory segment. */
- nr_bytes = (curr_m->size - (start - curr_m->paddr));
- if (tsz > nr_bytes)
- tsz = nr_bytes;
}
return acc;
}
@@ -553,8 +548,7 @@ static int __init parse_crash_elf64_headers(void)
ehdr.e_ehsize != sizeof(Elf64_Ehdr) ||
ehdr.e_phentsize != sizeof(Elf64_Phdr) ||
ehdr.e_phnum == 0) {
- printk(KERN_WARNING "Warning: Core image elf header is not"
- "sane\n");
+ pr_warn("Warning: Core image elf header is not sane\n");
return -EINVAL;
}
@@ -609,8 +603,7 @@ static int __init parse_crash_elf32_headers(void)
ehdr.e_ehsize != sizeof(Elf32_Ehdr) ||
ehdr.e_phentsize != sizeof(Elf32_Phdr) ||
ehdr.e_phnum == 0) {
- printk(KERN_WARNING "Warning: Core image elf header is not"
- "sane\n");
+ pr_warn("Warning: Core image elf header is not sane\n");
return -EINVAL;
}
@@ -653,8 +646,7 @@ static int __init parse_crash_elf_headers(void)
if (rc < 0)
return rc;
if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
- printk(KERN_WARNING "Warning: Core image elf header"
- " not found\n");
+ pr_warn("Warning: Core image elf header not found\n");
return -EINVAL;
}
@@ -673,8 +665,7 @@ static int __init parse_crash_elf_headers(void)
/* Determine vmcore size. */
vmcore_size = get_vmcore_size_elf32(elfcorebuf);
} else {
- printk(KERN_WARNING "Warning: Core image elf header is not"
- " sane\n");
+ pr_warn("Warning: Core image elf header is not sane\n");
return -EINVAL;
}
return 0;
@@ -690,7 +681,7 @@ static int __init vmcore_init(void)
return rc;
rc = parse_crash_elf_headers();
if (rc) {
- printk(KERN_WARNING "Kdump: vmcore not initialized\n");
+ pr_warn("Kdump: vmcore not initialized\n");
return rc;
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 67de74ca85f4..e4bcb2cf055a 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -418,9 +418,25 @@ static struct file_system_type pstore_fs_type = {
.kill_sb = pstore_kill_sb,
};
+static struct kobject *pstore_kobj;
+
static int __init init_pstore_fs(void)
{
- return register_filesystem(&pstore_fs_type);
+ int err = 0;
+
+ /* Create a convenient mount point for people to access pstore */
+ pstore_kobj = kobject_create_and_add("pstore", fs_kobj);
+ if (!pstore_kobj) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = register_filesystem(&pstore_fs_type);
+ if (err < 0)
+ kobject_put(pstore_kobj);
+
+out:
+ return err;
}
module_init(init_pstore_fs)
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 5ea2e77ff023..86d1038b5a12 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -96,6 +96,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason)
}
}
+bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
+{
+ /*
+ * In case of NMI path, pstore shouldn't be blocked
+ * regardless of reason.
+ */
+ if (in_nmi())
+ return true;
+
+ switch (reason) {
+ /* In panic case, other cpus are stopped by smp_send_stop(). */
+ case KMSG_DUMP_PANIC:
+ /* Emergency restart shouldn't be blocked by spin lock. */
+ case KMSG_DUMP_EMERG:
+ return true;
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
+
/*
* callback from kmsg_dump. (s2,l2) has the most recently
* written bytes, older bytes are in (s1,l1). Save as much
@@ -114,10 +135,12 @@ static void pstore_dump(struct kmsg_dumper *dumper,
why = get_reason_str(reason);
- if (in_nmi()) {
- is_locked = spin_trylock(&psinfo->buf_lock);
- if (!is_locked)
- pr_err("pstore dump routine blocked in NMI, may corrupt error record\n");
+ if (pstore_cannot_block_path(reason)) {
+ is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags);
+ if (!is_locked) {
+ pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
+ , in_nmi() ? "NMI" : why);
+ }
} else
spin_lock_irqsave(&psinfo->buf_lock, flags);
oopscount++;
@@ -143,9 +166,9 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += hsize + len;
part++;
}
- if (in_nmi()) {
+ if (pstore_cannot_block_path(reason)) {
if (is_locked)
- spin_unlock(&psinfo->buf_lock);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
} else
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index f883e7e74305..288f068740f6 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -167,12 +167,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
{
char *hdr;
- struct timeval timestamp;
+ struct timespec timestamp;
size_t len;
- do_gettimeofday(&timestamp);
+ /* Report zeroed timestamp if called before timekeeping has resumed. */
+ if (__getnstimeofday(&timestamp)) {
+ timestamp.tv_sec = 0;
+ timestamp.tv_nsec = 0;
+ }
hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n",
- (long)timestamp.tv_sec, (long)timestamp.tv_usec);
+ (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000));
WARN_ON_ONCE(!hdr);
len = hdr ? strlen(hdr) : 0;
persistent_ram_write(prz, hdr, len);
@@ -291,9 +295,8 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
kfree(cxt->przs);
}
-static int __devinit ramoops_init_przs(struct device *dev,
- struct ramoops_context *cxt,
- phys_addr_t *paddr, size_t dump_mem_sz)
+static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
+ phys_addr_t *paddr, size_t dump_mem_sz)
{
int err = -ENOMEM;
int i;
@@ -336,10 +339,9 @@ fail_prz:
return err;
}
-static int __devinit ramoops_init_prz(struct device *dev,
- struct ramoops_context *cxt,
- struct persistent_ram_zone **prz,
- phys_addr_t *paddr, size_t sz, u32 sig)
+static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
+ struct persistent_ram_zone **prz,
+ phys_addr_t *paddr, size_t sz, u32 sig)
{
if (!sz)
return 0;
@@ -367,7 +369,7 @@ static int __devinit ramoops_init_prz(struct device *dev,
return 0;
}
-static int __devinit ramoops_probe(struct platform_device *pdev)
+static int ramoops_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct ramoops_platform_data *pdata = pdev->dev.platform_data;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index eecd2a8a84dd..0306303be372 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -390,8 +390,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
return 0;
}
-static int __devinit persistent_ram_post_init(struct persistent_ram_zone *prz,
- u32 sig, int ecc_size)
+static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
+ int ecc_size)
{
int ret;
@@ -443,9 +443,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
kfree(prz);
}
-struct persistent_ram_zone * __devinit persistent_ram_new(phys_addr_t start,
- size_t size, u32 sig,
- int ecc_size)
+struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
+ u32 sig, int ecc_size)
{
struct persistent_ram_zone *prz;
int ret = -ENOMEM;
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 7b0329468a5d..28ce014b3cef 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -16,7 +16,7 @@
static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
unsigned int offset;
struct buffer_head *bh;
struct qnx4_inode_entry *de;
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index dc597353db3b..8798d065e400 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -117,7 +117,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *s = inode->i_sb;
struct qnx6_sb_info *sbi = QNX6_SB(s);
loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1);
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index b6addf560483..57199a52a351 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -285,7 +285,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) {
/* we got a big endian fs */
QNX6DEBUG((KERN_INFO "qnx6: fs got different"
- " endianess.\n"));
+ " endianness.\n"));
return bh;
} else
sbi->s_bytesex = BYTESEX_LE;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index d5378d028589..8d5b438cc188 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -202,7 +202,7 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long pgoff, unsigned long flags)
{
unsigned long maxpages, lpages, nr, loop, ret;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct page **pages = NULL, **ptr, *page;
loff_t isize;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index eab8c09d3801..c24f1e10b946 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -260,6 +260,7 @@ static struct file_system_type ramfs_fs_type = {
.name = "ramfs",
.mount = ramfs_mount,
.kill_sb = ramfs_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
static struct file_system_type rootfs_fs_type = {
.name = "rootfs",
diff --git a/fs/read_write.c b/fs/read_write.c
index 1edaf099ddd7..a698eff457fb 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -15,6 +15,7 @@
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
+#include <linux/compat.h>
#include "read_write.h"
#include <asm/uaccess.h>
@@ -163,7 +164,7 @@ EXPORT_SYMBOL(no_llseek);
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
loff_t retval;
mutex_lock(&inode->i_mutex);
@@ -247,6 +248,13 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
return retval;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
+{
+ return sys_lseek(fd, offset, whence);
+}
+#endif
+
#ifdef __ARCH_WANT_SYS_LLSEEK
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
unsigned long, offset_low, loff_t __user *, result,
@@ -278,7 +286,6 @@ out_putf:
}
#endif
-
/*
* rw_verify_area doesn't like huge counts. We limit
* them to something that fits in "int" so that others
@@ -290,7 +297,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
loff_t pos;
int retval = -EINVAL;
- inode = file->f_path.dentry->d_inode;
+ inode = file_inode(file);
if (unlikely((ssize_t) count < 0))
return retval;
pos = *ppos;
@@ -901,8 +908,8 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
if (!(out.file->f_mode & FMODE_WRITE))
goto fput_out;
retval = -EINVAL;
- in_inode = in.file->f_path.dentry->d_inode;
- out_inode = out.file->f_path.dentry->d_inode;
+ in_inode = file_inode(in.file);
+ out_inode = file_inode(out.file);
retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
if (retval < 0)
goto fput_out;
@@ -935,6 +942,8 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
if (retval > 0) {
add_rchar(current, retval);
add_wchar(current, retval);
+ fsnotify_access(in.file);
+ fsnotify_modify(out.file);
}
inc_syscr(current);
diff --git a/fs/readdir.c b/fs/readdir.c
index 5e69ef533b77..fee38e04fae4 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -22,7 +22,7 @@
int vfs_readdir(struct file *file, filldir_t filler, void *buf)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
int res = -ENOTDIR;
if (!file->f_op || !file->f_op->readdir)
goto out;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 8375c922c0d5..6165bd4784f6 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -126,7 +126,7 @@ static int reiserfs_file_open(struct inode *inode, struct file *file)
return err;
}
-static void reiserfs_vfs_truncate_file(struct inode *inode)
+void reiserfs_vfs_truncate_file(struct inode *inode)
{
mutex_lock(&(REISERFS_I(inode)->tailpack));
reiserfs_truncate_file(inode, 1);
@@ -268,7 +268,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
* new current position before returning. */
)
{
- struct inode *inode = file->f_path.dentry->d_inode; // Inode of the file that we are writing to.
+ struct inode *inode = file_inode(file); // Inode of the file that we are writing to.
/* To simplify coding at this time, we store
locked pages in array for now */
struct reiserfs_transaction_handle th;
@@ -312,7 +312,6 @@ const struct file_operations reiserfs_file_operations = {
};
const struct inode_operations reiserfs_file_inode_operations = {
- .truncate = reiserfs_vfs_truncate_file,
.setattr = reiserfs_setattr,
.setxattr = reiserfs_setxattr,
.getxattr = reiserfs_getxattr,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d83736fbc26c..ea5061fd4f3e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1603,10 +1603,10 @@ int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
if (parent && (maxlen < 5)) {
*lenp = 5;
- return 255;
+ return FILEID_INVALID;
} else if (maxlen < 3) {
*lenp = 3;
- return 255;
+ return FILEID_INVALID;
}
data[0] = inode->i_ino;
@@ -3085,8 +3085,10 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
loff_t isize = i_size_read(inode);
loff_t end = offset + iov_length(iov, nr_segs);
- if (end > isize)
- vmtruncate(inode, isize);
+ if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
+ truncate_setsize(inode, isize);
+ reiserfs_vfs_truncate_file(inode);
+ }
}
return ret;
@@ -3200,8 +3202,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
*/
reiserfs_write_unlock_once(inode->i_sb, depth);
if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode))
- error = vmtruncate(inode, attr->ia_size);
+ attr->ia_size != i_size_read(inode)) {
+ error = inode_newsize_ok(inode, attr->ia_size);
+ if (!error) {
+ truncate_setsize(inode, attr->ia_size);
+ reiserfs_vfs_truncate_file(inode);
+ }
+ }
if (!error) {
setattr_copy(inode, attr);
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 0c2185042d5f..15cb5fe6b425 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -21,7 +21,7 @@
*/
long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
unsigned int flags;
int err = 0;
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index e60e87035bb3..9cc0740adffa 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -281,7 +281,7 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb)
}
#if defined( REISERFS_USE_OIDMAPF )
if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) {
- loff_t size = sb_info->oidmap.mapf->f_path.dentry->d_inode->i_size;
+ loff_t size = file_inode(sb_info->oidmap.mapf)->i_size;
total_used += size / sizeof(reiserfs_oidinterval_d_t);
}
#endif
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 33215f57ea06..157e474ab303 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2455,6 +2455,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
*,
int count);
int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
+void reiserfs_vfs_truncate_file(struct inode *inode);
int reiserfs_commit_page(struct inode *inode, struct page *page,
unsigned from, unsigned to);
void reiserfs_flush_old_commits(struct super_block *);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index fd7c5f60b46b..7e8d3a80bdab 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -147,7 +147,7 @@ static const struct address_space_operations romfs_aops = {
*/
static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *i = filp->f_dentry->d_inode;
+ struct inode *i = file_inode(filp);
struct romfs_inode ri;
unsigned long offset, maxoff;
int j, ino, nextfh;
diff --git a/fs/select.c b/fs/select.c
index 2ef72d965036..8c1c96c27062 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -26,6 +26,7 @@
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
+#include <linux/sched/rt.h>
#include <asm/uaccess.h>
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 9d863fb501f9..38bb59f3f2ad 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -296,7 +296,7 @@ EXPORT_SYMBOL(seq_read);
* seq_lseek - ->llseek() method for sequential files.
* @file: the file in question
* @offset: new position
- * @origin: 0 for absolute, 1 for relative position
+ * @whence: 0 for absolute, 1 for relative position
*
* Ready-made ->f_op->llseek()
*/
@@ -308,27 +308,27 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
mutex_lock(&m->lock);
m->version = file->f_version;
switch (whence) {
- case 1:
- offset += file->f_pos;
- case 0:
- if (offset < 0)
- break;
- retval = offset;
- if (offset != m->read_pos) {
- while ((retval=traverse(m, offset)) == -EAGAIN)
- ;
- if (retval) {
- /* with extreme prejudice... */
- file->f_pos = 0;
- m->read_pos = 0;
- m->version = 0;
- m->index = 0;
- m->count = 0;
- } else {
- m->read_pos = offset;
- retval = file->f_pos = offset;
- }
+ case SEEK_CUR:
+ offset += file->f_pos;
+ case SEEK_SET:
+ if (offset < 0)
+ break;
+ retval = offset;
+ if (offset != m->read_pos) {
+ while ((retval = traverse(m, offset)) == -EAGAIN)
+ ;
+ if (retval) {
+ /* with extreme prejudice... */
+ file->f_pos = 0;
+ m->read_pos = 0;
+ m->version = 0;
+ m->index = 0;
+ m->count = 0;
+ } else {
+ m->read_pos = offset;
+ retval = file->f_pos = offset;
}
+ }
}
file->f_version = m->version;
mutex_unlock(&m->lock);
@@ -339,7 +339,7 @@ EXPORT_SYMBOL(seq_lseek);
/**
* seq_release - free the structures associated with sequential file.
* @file: file in question
- * @inode: file->f_path.dentry->d_inode
+ * @inode: its inode
*
* Frees the structures associated with sequential file; can be used
* as ->f_op->release() if you don't have private data to destroy.
diff --git a/fs/splice.c b/fs/splice.c
index 8890604e3fcd..718bd0056384 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -569,7 +569,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
return res;
}
-static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
+ssize_t kernel_write(struct file *file, const char *buf, size_t count,
loff_t pos)
{
mm_segment_t old_fs;
@@ -578,11 +578,12 @@ static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
- res = vfs_write(file, (const char __user *)buf, count, &pos);
+ res = vfs_write(file, (__force const char __user *)buf, count, &pos);
set_fs(old_fs);
return res;
}
+EXPORT_SYMBOL(kernel_write);
ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
@@ -696,8 +697,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
return -EINVAL;
more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
- if (sd->len < sd->total_len)
+
+ if (sd->len < sd->total_len && pipe->nrbufs > 1)
more |= MSG_SENDPAGE_NOTLAST;
+
return file->f_op->sendpage(file, buf->page, buf->offset,
sd->len, &pos, more);
}
@@ -1168,7 +1171,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
* randomly drop data for eg socket -> socket splicing. Use the
* piped splicing for that!
*/
- i_mode = in->f_path.dentry->d_inode->i_mode;
+ i_mode = file_inode(in)->i_mode;
if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
return -EINVAL;
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index b381305c9a47..57dc70ebbb19 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -102,7 +102,7 @@ static int get_dir_index_using_offset(struct super_block *sb,
static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
u64 block = squashfs_i(inode)->start + msblk->directory_table;
int offset = squashfs_i(inode)->offset, length, dir_count, size,
diff --git a/fs/stat.c b/fs/stat.c
index eae494630a36..04ce1ac20d20 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -37,17 +37,17 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
EXPORT_SYMBOL(generic_fillattr);
-int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int vfs_getattr(struct path *path, struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = path->dentry->d_inode;
int retval;
- retval = security_inode_getattr(mnt, dentry);
+ retval = security_inode_getattr(path->mnt, path->dentry);
if (retval)
return retval;
if (inode->i_op->getattr)
- return inode->i_op->getattr(mnt, dentry, stat);
+ return inode->i_op->getattr(path->mnt, path->dentry, stat);
generic_fillattr(inode, stat);
return 0;
@@ -61,8 +61,7 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
int error = -EBADF;
if (f.file) {
- error = vfs_getattr(f.file->f_path.mnt, f.file->f_path.dentry,
- stat);
+ error = vfs_getattr(&f.file->f_path, stat);
fdput(f);
}
return error;
@@ -74,7 +73,7 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
{
struct path path;
int error = -EINVAL;
- int lookup_flags = 0;
+ unsigned int lookup_flags = 0;
if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH)) != 0)
@@ -84,13 +83,17 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
lookup_flags |= LOOKUP_FOLLOW;
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
-
+retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;
- error = vfs_getattr(path.mnt, path.dentry, stat);
+ error = vfs_getattr(&path, stat);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
@@ -296,11 +299,13 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
struct path path;
int error;
int empty = 0;
+ unsigned int lookup_flags = LOOKUP_EMPTY;
if (bufsiz <= 0)
return -EINVAL;
- error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty);
+retry:
+ error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
if (!error) {
struct inode *inode = path.dentry->d_inode;
@@ -314,6 +319,10 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
}
}
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
}
return error;
}
diff --git a/fs/statfs.c b/fs/statfs.c
index f8e832e6f0a2..c219e733f553 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -77,10 +77,17 @@ EXPORT_SYMBOL(vfs_statfs);
int user_statfs(const char __user *pathname, struct kstatfs *st)
{
struct path path;
- int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
+ int error;
+ unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (!error) {
error = vfs_statfs(&path, st);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
}
return error;
}
diff --git a/fs/super.c b/fs/super.c
index 12f123712161..7465d4364208 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -447,14 +447,13 @@ struct super_block *sget(struct file_system_type *type,
void *data)
{
struct super_block *s = NULL;
- struct hlist_node *node;
struct super_block *old;
int err;
retry:
spin_lock(&sb_lock);
if (test) {
- hlist_for_each_entry(old, node, &type->fs_supers, s_instances) {
+ hlist_for_each_entry(old, &type->fs_supers, s_instances) {
if (!test(old, data))
continue;
if (!grab_super(old))
@@ -554,10 +553,9 @@ void iterate_supers_type(struct file_system_type *type,
void (*f)(struct super_block *, void *), void *arg)
{
struct super_block *sb, *p = NULL;
- struct hlist_node *node;
spin_lock(&sb_lock);
- hlist_for_each_entry(sb, node, &type->fs_supers, s_instances) {
+ hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
sb->s_count++;
spin_unlock(&sb_lock);
@@ -842,7 +840,7 @@ int get_anon_bdev(dev_t *p)
else if (error)
return -EAGAIN;
- if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) {
+ if (dev == (1 << MINORBITS)) {
spin_lock(&unnamed_dev_lock);
ida_remove(&unnamed_dev_ida, dev);
if (unnamed_dev_start > dev)
diff --git a/fs/sync.c b/fs/sync.c
index 14eefeb44636..2c5d6639a66a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -332,7 +332,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
if (!f.file)
goto out;
- i_mode = f.file->f_path.dentry->d_inode->i_mode;
+ i_mode = file_inode(f.file)->i_mode;
ret = -ESPIPE;
if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
!S_ISLNK(i_mode))
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 614b2b544880..15c68f9489ae 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -70,7 +70,7 @@ static ssize_t
read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
{
struct bin_buffer *bb = file->private_data;
- int size = file->f_path.dentry->d_inode->i_size;
+ int size = file_inode(file)->i_size;
loff_t offs = *off;
int count = min_t(size_t, bytes, PAGE_SIZE);
char *temp;
@@ -140,7 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
size_t bytes, loff_t *off)
{
struct bin_buffer *bb = file->private_data;
- int size = file->f_path.dentry->d_inode->i_size;
+ int size = file_inode(file)->i_size;
loff_t offs = *off;
int count = min_t(size_t, bytes, PAGE_SIZE);
char *temp;
@@ -461,15 +461,14 @@ const struct file_operations bin_fops = {
void unmap_bin_file(struct sysfs_dirent *attr_sd)
{
struct bin_buffer *bb;
- struct hlist_node *tmp;
if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)
return;
mutex_lock(&sysfs_bin_lock);
- hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) {
- struct inode *inode = bb->file->f_path.dentry->d_inode;
+ hlist_for_each_entry(bb, &attr_sd->s_bin_attr.buffers, list) {
+ struct inode *inode = file_inode(bb->file);
unmap_mapping_range(inode->i_mapping, 0, 0, 1);
}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 2df555c66d57..aec3d5c98c94 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -205,6 +205,48 @@ void sysfs_unmerge_group(struct kobject *kobj,
}
EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
+/**
+ * sysfs_add_link_to_group - add a symlink to an attribute group.
+ * @kobj: The kobject containing the group.
+ * @group_name: The name of the group.
+ * @target: The target kobject of the symlink to create.
+ * @link_name: The name of the symlink to create.
+ */
+int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
+ struct kobject *target, const char *link_name)
+{
+ struct sysfs_dirent *dir_sd;
+ int error = 0;
+
+ dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name);
+ if (!dir_sd)
+ return -ENOENT;
+
+ error = sysfs_create_link_sd(dir_sd, target, link_name);
+ sysfs_put(dir_sd);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
+
+/**
+ * sysfs_remove_link_from_group - remove a symlink from an attribute group.
+ * @kobj: The kobject containing the group.
+ * @group_name: The name of the group.
+ * @link_name: The name of the symlink to remove.
+ */
+void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
+ const char *link_name)
+{
+ struct sysfs_dirent *dir_sd;
+
+ dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name);
+ if (dir_sd) {
+ sysfs_hash_and_remove(dir_sd, NULL, link_name);
+ sysfs_put(dir_sd);
+ }
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
EXPORT_SYMBOL_GPL(sysfs_create_group);
EXPORT_SYMBOL_GPL(sysfs_update_group);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index db940a9be045..8d924b5ec733 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -10,7 +10,7 @@
* Please see Documentation/filesystems/sysfs.txt for more information.
*/
-#define DEBUG
+#define DEBUG
#include <linux/fs.h>
#include <linux/mount.h>
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3c9eb5624f5e..8c940df97a52 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -21,26 +21,17 @@
#include "sysfs.h"
-static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
- const char *name, int warn)
+static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
+ struct kobject *target,
+ const char *name, int warn)
{
- struct sysfs_dirent *parent_sd = NULL;
struct sysfs_dirent *target_sd = NULL;
struct sysfs_dirent *sd = NULL;
struct sysfs_addrm_cxt acxt;
enum kobj_ns_type ns_type;
int error;
- BUG_ON(!name);
-
- if (!kobj)
- parent_sd = &sysfs_root;
- else
- parent_sd = kobj->sd;
-
- error = -EFAULT;
- if (!parent_sd)
- goto out_put;
+ BUG_ON(!name || !parent_sd);
/* target->sd can go away beneath us but is protected with
* sysfs_assoc_lock. Fetch target_sd from it.
@@ -96,6 +87,34 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
}
/**
+ * sysfs_create_link_sd - create symlink to a given object.
+ * @sd: directory we're creating the link in.
+ * @target: object we're pointing to.
+ * @name: name of the symlink.
+ */
+int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+ const char *name)
+{
+ return sysfs_do_create_link_sd(sd, target, name, 1);
+}
+
+static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
+ const char *name, int warn)
+{
+ struct sysfs_dirent *parent_sd = NULL;
+
+ if (!kobj)
+ parent_sd = &sysfs_root;
+ else
+ parent_sd = kobj->sd;
+
+ if (!parent_sd)
+ return -EFAULT;
+
+ return sysfs_do_create_link_sd(parent_sd, target, name, warn);
+}
+
+/**
* sysfs_create_link - create symlink between two objects.
* @kobj: object whose directory we're creating the link in.
* @target: object we're pointing to.
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d73c0932bbd6..d1e4043eb0c3 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -240,3 +240,5 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd);
* symlink.c
*/
extern const struct inode_operations sysfs_symlink_inode_operations;
+int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+ const char *name);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index a77c42157620..3799e8dac3eb 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -68,7 +68,7 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
unsigned long pos = filp->f_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
unsigned offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 0a65939508e9..9d4dc6831792 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -41,9 +41,11 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+ truncate_setsize(inode, attr->ia_size);
+ sysv_truncate(inode);
}
setattr_copy(inode, attr);
@@ -52,7 +54,6 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
}
const struct inode_operations sysv_file_inode_operations = {
- .truncate = sysv_truncate,
.setattr = sysv_setattr,
.getattr = sysv_getattr,
};
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 90b54b438789..c1a591a4725b 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -464,6 +464,16 @@ int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, get_block);
}
+static void sysv_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ sysv_truncate(inode);
+ }
+}
+
static int sysv_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -471,11 +481,8 @@ static int sysv_write_begin(struct file *file, struct address_space *mapping,
int ret;
ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ sysv_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/timerfd.c b/fs/timerfd.c
index d03822bbf190..32b644f03690 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -22,6 +22,7 @@
#include <linux/anon_inodes.h>
#include <linux/timerfd.h>
#include <linux/syscalls.h>
+#include <linux/compat.h>
#include <linux/rcupdate.h>
struct timerfd_ctx {
@@ -278,21 +279,17 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
return ufd;
}
-SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
- const struct itimerspec __user *, utmr,
- struct itimerspec __user *, otmr)
+static int do_timerfd_settime(int ufd, int flags,
+ const struct itimerspec *new,
+ struct itimerspec *old)
{
struct fd f;
struct timerfd_ctx *ctx;
- struct itimerspec ktmr, kotmr;
int ret;
- if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
- return -EFAULT;
-
if ((flags & ~TFD_SETTIME_FLAGS) ||
- !timespec_valid(&ktmr.it_value) ||
- !timespec_valid(&ktmr.it_interval))
+ !timespec_valid(&new->it_value) ||
+ !timespec_valid(&new->it_interval))
return -EINVAL;
ret = timerfd_fget(ufd, &f);
@@ -323,27 +320,23 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
if (ctx->expired && ctx->tintv.tv64)
hrtimer_forward_now(&ctx->tmr, ctx->tintv);
- kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
- kotmr.it_interval = ktime_to_timespec(ctx->tintv);
+ old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
+ old->it_interval = ktime_to_timespec(ctx->tintv);
/*
* Re-program the timer to the new value ...
*/
- ret = timerfd_setup(ctx, flags, &ktmr);
+ ret = timerfd_setup(ctx, flags, new);
spin_unlock_irq(&ctx->wqh.lock);
fdput(f);
- if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
- return -EFAULT;
-
return ret;
}
-SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
+static int do_timerfd_gettime(int ufd, struct itimerspec *t)
{
struct fd f;
struct timerfd_ctx *ctx;
- struct itimerspec kotmr;
int ret = timerfd_fget(ufd, &f);
if (ret)
return ret;
@@ -356,11 +349,65 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1;
hrtimer_restart(&ctx->tmr);
}
- kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
- kotmr.it_interval = ktime_to_timespec(ctx->tintv);
+ t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
+ t->it_interval = ktime_to_timespec(ctx->tintv);
spin_unlock_irq(&ctx->wqh.lock);
fdput(f);
+ return 0;
+}
+SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+ const struct itimerspec __user *, utmr,
+ struct itimerspec __user *, otmr)
+{
+ struct itimerspec new, old;
+ int ret;
+
+ if (copy_from_user(&new, utmr, sizeof(new)))
+ return -EFAULT;
+ ret = do_timerfd_settime(ufd, flags, &new, &old);
+ if (ret)
+ return ret;
+ if (otmr && copy_to_user(otmr, &old, sizeof(old)))
+ return -EFAULT;
+
+ return ret;
+}
+
+SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
+{
+ struct itimerspec kotmr;
+ int ret = do_timerfd_gettime(ufd, &kotmr);
+ if (ret)
+ return ret;
return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+ const struct compat_itimerspec __user *, utmr,
+ struct compat_itimerspec __user *, otmr)
+{
+ struct itimerspec new, old;
+ int ret;
+
+ if (get_compat_itimerspec(&new, utmr))
+ return -EFAULT;
+ ret = do_timerfd_settime(ufd, flags, &new, &old);
+ if (ret)
+ return ret;
+ if (otmr && put_compat_itimerspec(otmr, &old))
+ return -EFAULT;
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
+ struct compat_itimerspec __user *, otmr)
+{
+ struct itimerspec kotmr;
+ int ret = do_timerfd_gettime(ufd, &kotmr);
+ if (ret)
+ return ret;
+ return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0;
+}
+#endif
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 12817ffc7345..7f60e900edff 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2459,7 +2459,7 @@ error_dump:
static inline int chance(unsigned int n, unsigned int out_of)
{
- return !!((random32() % out_of) + 1 <= n);
+ return !!((prandom_u32() % out_of) + 1 <= n);
}
@@ -2477,13 +2477,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
if (chance(1, 2)) {
d->pc_delay = 1;
/* Fail withing 1 minute */
- delay = random32() % 60000;
+ delay = prandom_u32() % 60000;
d->pc_timeout = jiffies;
d->pc_timeout += msecs_to_jiffies(delay);
ubifs_warn("failing after %lums", delay);
} else {
d->pc_delay = 2;
- delay = random32() % 10000;
+ delay = prandom_u32() % 10000;
/* Fail within 10000 operations */
d->pc_cnt_max = delay;
ubifs_warn("failing after %lu calls", delay);
@@ -2563,7 +2563,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
unsigned int from, to, ffs = chance(1, 2);
unsigned char *p = (void *)buf;
- from = random32() % (len + 1);
+ from = prandom_u32() % (len + 1);
/* Corruption may only span one max. write unit */
to = min(len, ALIGN(from, c->max_write_size));
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 8a574776a493..de08c92f2e23 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -352,7 +352,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
struct qstr nm;
union ubifs_key key;
struct ubifs_dent_node *dent;
- struct inode *dir = file->f_path.dentry->d_inode;
+ struct inode *dir = file_inode(file);
struct ubifs_info *c = dir->i_sb->s_fs_info;
dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5bc77817f382..f12189d2db1d 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1444,7 +1444,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct timespec now = ubifs_current_time(inode);
struct ubifs_budget_req req = { .new_page = 1 };
@@ -1522,6 +1522,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
ubifs_release_dirty_inode_budget(c, ui);
}
+ wait_for_stable_page(page);
unlock_page(page);
return 0;
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 1a7e2d8bdbe9..648b143606cc 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -147,7 +147,7 @@ out_unlock:
long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
int flags, err;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
switch (cmd) {
case FS_IOC_GETFLAGS:
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 9daaeef675dd..4b826abb1528 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -2007,28 +2007,28 @@ static int dbg_populate_lsave(struct ubifs_info *c)
if (!dbg_is_chk_gen(c))
return 0;
- if (random32() & 3)
+ if (prandom_u32() & 3)
return 0;
for (i = 0; i < c->lsave_cnt; i++)
c->lsave[i] = c->main_first;
list_for_each_entry(lprops, &c->empty_list, list)
- c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+ c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
list_for_each_entry(lprops, &c->freeable_list, list)
- c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+ c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
list_for_each_entry(lprops, &c->frdi_idx_list, list)
- c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+ c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
heap = &c->lpt_heap[LPROPS_DIRTY - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
heap = &c->lpt_heap[LPROPS_FREE - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
return 1;
}
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 769701ccb5c9..ba32da3fe08a 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -126,13 +126,14 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
else if (inum > o->inum)
p = p->rb_right;
else {
- if (o->dnext) {
+ if (o->del) {
spin_unlock(&c->orphan_lock);
dbg_gen("deleted twice ino %lu",
(unsigned long)inum);
return;
}
- if (o->cnext) {
+ if (o->cmt) {
+ o->del = 1;
o->dnext = c->orph_dnext;
c->orph_dnext = o;
spin_unlock(&c->orphan_lock);
@@ -172,7 +173,9 @@ int ubifs_orphan_start_commit(struct ubifs_info *c)
last = &c->orph_cnext;
list_for_each_entry(orphan, &c->orph_new, new_list) {
ubifs_assert(orphan->new);
+ ubifs_assert(!orphan->cmt);
orphan->new = 0;
+ orphan->cmt = 1;
*last = orphan;
last = &orphan->cnext;
}
@@ -299,7 +302,9 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
cnext = c->orph_cnext;
for (i = 0; i < cnt; i++) {
orphan = cnext;
+ ubifs_assert(orphan->cmt);
orph->inos[i] = cpu_to_le64(orphan->inum);
+ orphan->cmt = 0;
cnext = orphan->cnext;
orphan->cnext = NULL;
}
@@ -378,6 +383,7 @@ static int consolidate(struct ubifs_info *c)
list_for_each_entry(orphan, &c->orph_list, list) {
if (orphan->new)
continue;
+ orphan->cmt = 1;
*last = orphan;
last = &orphan->cnext;
cnt += 1;
@@ -442,6 +448,7 @@ static void erase_deleted(struct ubifs_info *c)
orphan = dnext;
dnext = orphan->dnext;
ubifs_assert(!orphan->new);
+ ubifs_assert(orphan->del);
rb_erase(&orphan->rb, &c->orph_tree);
list_del(&orphan->list);
c->tot_orphans -= 1;
@@ -531,6 +538,7 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
rb_link_node(&orphan->rb, parent, p);
rb_insert_color(&orphan->rb, &c->orph_tree);
list_add_tail(&orphan->list, &c->orph_list);
+ orphan->del = 1;
orphan->dnext = c->orph_dnext;
c->orph_dnext = orphan;
dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum,
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 523bbad69c0c..52a6559275c4 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -683,7 +683,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
c->ilebs[c->ileb_cnt++] = lnum;
dbg_cmt("LEB %d", lnum);
}
- if (dbg_is_chk_index(c) && !(random32() & 7))
+ if (dbg_is_chk_index(c) && !(prandom_u32() & 7))
return -ENOSPC;
return 0;
}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d133c276fe05..b2babce4d70f 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -904,6 +904,8 @@ struct ubifs_budget_req {
* @dnext: next orphan to delete
* @inum: inode number
* @new: %1 => added since the last commit, otherwise %0
+ * @cmt: %1 => commit pending, otherwise %0
+ * @del: %1 => delete pending, otherwise %0
*/
struct ubifs_orphan {
struct rb_node rb;
@@ -912,7 +914,9 @@ struct ubifs_orphan {
struct ubifs_orphan *cnext;
struct ubifs_orphan *dnext;
ino_t inum;
- int new;
+ unsigned new:1;
+ unsigned cmt:1;
+ unsigned del:1;
};
/**
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index eb8bfe2b89a5..b3e93f5e17c3 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -186,7 +186,7 @@ out:
static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct inode *dir = filp->f_path.dentry->d_inode;
+ struct inode *dir = file_inode(filp);
int result;
if (filp->f_pos == 0) {
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 77b5953eaac8..29569dd08168 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -139,7 +139,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
{
ssize_t retval;
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
int err, pos;
size_t count = iocb->ki_left;
struct udf_inode_info *iinfo = UDF_I(inode);
@@ -178,7 +178,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
long old_block, new_block;
int result = -EINVAL;
@@ -204,7 +204,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
goto out;
case UDF_RELOCATE_BLOCKS:
if (!capable(CAP_SYS_ADMIN)) {
- result = -EACCES;
+ result = -EPERM;
goto out;
}
if (get_user(old_block, (long __user *)arg)) {
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index cbae1ed0b7c1..7a12e48ad819 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -67,6 +67,74 @@ static void udf_update_extents(struct inode *,
struct extent_position *);
static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
+static void __udf_clear_extent_cache(struct inode *inode)
+{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+
+ if (iinfo->cached_extent.lstart != -1) {
+ brelse(iinfo->cached_extent.epos.bh);
+ iinfo->cached_extent.lstart = -1;
+ }
+}
+
+/* Invalidate extent cache */
+static void udf_clear_extent_cache(struct inode *inode)
+{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+
+ spin_lock(&iinfo->i_extent_cache_lock);
+ __udf_clear_extent_cache(inode);
+ spin_unlock(&iinfo->i_extent_cache_lock);
+}
+
+/* Return contents of extent cache */
+static int udf_read_extent_cache(struct inode *inode, loff_t bcount,
+ loff_t *lbcount, struct extent_position *pos)
+{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ int ret = 0;
+
+ spin_lock(&iinfo->i_extent_cache_lock);
+ if ((iinfo->cached_extent.lstart <= bcount) &&
+ (iinfo->cached_extent.lstart != -1)) {
+ /* Cache hit */
+ *lbcount = iinfo->cached_extent.lstart;
+ memcpy(pos, &iinfo->cached_extent.epos,
+ sizeof(struct extent_position));
+ if (pos->bh)
+ get_bh(pos->bh);
+ ret = 1;
+ }
+ spin_unlock(&iinfo->i_extent_cache_lock);
+ return ret;
+}
+
+/* Add extent to extent cache */
+static void udf_update_extent_cache(struct inode *inode, loff_t estart,
+ struct extent_position *pos, int next_epos)
+{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+
+ spin_lock(&iinfo->i_extent_cache_lock);
+ /* Invalidate previously cached extent */
+ __udf_clear_extent_cache(inode);
+ if (pos->bh)
+ get_bh(pos->bh);
+ memcpy(&iinfo->cached_extent.epos, pos,
+ sizeof(struct extent_position));
+ iinfo->cached_extent.lstart = estart;
+ if (next_epos)
+ switch (iinfo->i_alloc_type) {
+ case ICBTAG_FLAG_AD_SHORT:
+ iinfo->cached_extent.epos.offset -=
+ sizeof(struct short_ad);
+ break;
+ case ICBTAG_FLAG_AD_LONG:
+ iinfo->cached_extent.epos.offset -=
+ sizeof(struct long_ad);
+ }
+ spin_unlock(&iinfo->i_extent_cache_lock);
+}
void udf_evict_inode(struct inode *inode)
{
@@ -90,6 +158,7 @@ void udf_evict_inode(struct inode *inode)
}
kfree(iinfo->i_ext.i_data);
iinfo->i_ext.i_data = NULL;
+ udf_clear_extent_cache(inode);
if (want_delete) {
udf_free_inode(inode);
}
@@ -105,6 +174,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
truncate_pagecache(inode, to, isize);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
down_write(&iinfo->i_data_sem);
+ udf_clear_extent_cache(inode);
udf_truncate_extents(inode);
up_write(&iinfo->i_data_sem);
}
@@ -372,7 +442,7 @@ static int udf_get_block(struct inode *inode, sector_t block,
iinfo->i_next_alloc_goal++;
}
-
+ udf_clear_extent_cache(inode);
phys = inode_getblk(inode, block, &err, &new);
if (!phys)
goto abort;
@@ -1171,6 +1241,7 @@ set_size:
} else {
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
down_write(&iinfo->i_data_sem);
+ udf_clear_extent_cache(inode);
memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
0x00, bsize - newsize -
udf_file_entry_alloc_offset(inode));
@@ -1184,6 +1255,7 @@ set_size:
if (err)
return err;
down_write(&iinfo->i_data_sem);
+ udf_clear_extent_cache(inode);
truncate_setsize(inode, newsize);
udf_truncate_extents(inode);
up_write(&iinfo->i_data_sem);
@@ -2156,11 +2228,12 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
struct udf_inode_info *iinfo;
iinfo = UDF_I(inode);
- pos->offset = 0;
- pos->block = iinfo->i_location;
- pos->bh = NULL;
+ if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) {
+ pos->offset = 0;
+ pos->block = iinfo->i_location;
+ pos->bh = NULL;
+ }
*elen = 0;
-
do {
etype = udf_next_aext(inode, pos, eloc, elen, 1);
if (etype == -1) {
@@ -2170,7 +2243,8 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
}
lbcount += *elen;
} while (lbcount <= bcount);
-
+ /* update extent cache */
+ udf_update_extent_cache(inode, lbcount - *elen, pos, 1);
*offset = (bcount + *elen - lbcount) >> blocksize_bits;
return etype;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 95fee278ab9d..102c072c6bbf 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1270,10 +1270,10 @@ static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
if (parent && (len < 5)) {
*lenp = 5;
- return 255;
+ return FILEID_INVALID;
} else if (len < 3) {
*lenp = 3;
- return 255;
+ return FILEID_INVALID;
}
*lenp = 3;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index d44fb568abe1..bc5b30a819e8 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -134,6 +134,8 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
ei->i_next_alloc_goal = 0;
ei->i_strat4096 = 0;
init_rwsem(&ei->i_data_sem);
+ ei->cached_extent.lstart = -1;
+ spin_lock_init(&ei->i_extent_cache_lock);
return &ei->vfs_inode;
}
@@ -307,7 +309,8 @@ static void udf_sb_free_partitions(struct super_block *sb)
{
struct udf_sb_info *sbi = UDF_SB(sb);
int i;
-
+ if (sbi->s_partmaps == NULL)
+ return;
for (i = 0; i < sbi->s_partitions; i++)
udf_free_partition(&sbi->s_partmaps[i]);
kfree(sbi->s_partmaps);
@@ -1020,7 +1023,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
if (bitmap == NULL)
return NULL;
- bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
bitmap->s_nr_groups = nr_groups;
return bitmap;
}
@@ -1078,8 +1080,6 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (!bitmap)
return 1;
map->s_uspace.s_bitmap = bitmap;
- bitmap->s_extLength = le32_to_cpu(
- phd->unallocSpaceBitmap.extLength);
bitmap->s_extPosition = le32_to_cpu(
phd->unallocSpaceBitmap.extPosition);
map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
@@ -1114,8 +1114,6 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (!bitmap)
return 1;
map->s_fspace.s_bitmap = bitmap;
- bitmap->s_extLength = le32_to_cpu(
- phd->freedSpaceBitmap.extLength);
bitmap->s_extPosition = le32_to_cpu(
phd->freedSpaceBitmap.extPosition);
map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
@@ -1865,6 +1863,8 @@ static void udf_open_lvid(struct super_block *sb)
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
+ /* Make opening of filesystem visible on the media immediately */
+ sync_dirty_buffer(bh);
}
static void udf_close_lvid(struct super_block *sb)
@@ -1905,6 +1905,8 @@ static void udf_close_lvid(struct super_block *sb)
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
+ /* Make closing of filesystem visible on the media immediately */
+ sync_dirty_buffer(bh);
}
u64 lvid_get_unique_id(struct super_block *sb)
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index bb8309dcd5c1..b5cd8ed2aa12 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,6 +1,19 @@
#ifndef _UDF_I_H
#define _UDF_I_H
+struct extent_position {
+ struct buffer_head *bh;
+ uint32_t offset;
+ struct kernel_lb_addr block;
+};
+
+struct udf_ext_cache {
+ /* Extent position */
+ struct extent_position epos;
+ /* Start logical offset in bytes */
+ loff_t lstart;
+};
+
/*
* The i_data_sem and i_mutex serve for protection of allocation information
* of a regular files and symlinks. This includes all extents belonging to
@@ -35,6 +48,9 @@ struct udf_inode_info {
__u8 *i_data;
} i_ext;
struct rw_semaphore i_data_sem;
+ struct udf_ext_cache cached_extent;
+ /* Spinlock for protecting extent cache */
+ spinlock_t i_extent_cache_lock;
struct inode vfs_inode;
};
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 5f027227f085..ed401e94aa8c 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -80,10 +80,9 @@ struct udf_virtual_data {
};
struct udf_bitmap {
- __u32 s_extLength;
__u32 s_extPosition;
- __u16 s_nr_groups;
- struct buffer_head **s_block_bitmap;
+ int s_nr_groups;
+ struct buffer_head *s_block_bitmap[0];
};
struct udf_part_map {
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index de038da6f6bd..be7dabbbcb49 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -113,11 +113,6 @@ struct ustr {
uint8_t u_len;
};
-struct extent_position {
- struct buffer_head *bh;
- uint32_t offset;
- struct kernel_lb_addr block;
-};
/* super.c */
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768a..0bf6e16f8d79 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -29,7 +29,7 @@ config UFS_FS
config UFS_FS_WRITE
bool "UFS file system write support (DANGEROUS)"
- depends on UFS_FS && EXPERIMENTAL
+ depends on UFS_FS
help
Say Y here if you want to try writing to UFS partitions. This is
experimental, so you should back up your UFS partitions beforehand.
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index dbc90994715a..3a75ca09c506 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -433,7 +433,7 @@ static int
ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
loff_t pos = filp->f_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index eb6d0b7dc879..ff24e4449ece 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -526,6 +526,14 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, ufs_getfrag_block);
}
+static void ufs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size)
+ truncate_pagecache(inode, to, inode->i_size);
+}
+
static int ufs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -534,11 +542,8 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep,
ufs_getfrag_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ ufs_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/utimes.c b/fs/utimes.c
index bb0696a41735..f4fb7eca10e8 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -158,13 +158,17 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
if (!(flags & AT_SYMLINK_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
-
+retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;
error = utimes_common(&path, times);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
}
out:
diff --git a/fs/xattr.c b/fs/xattr.c
index e21c119f4f99..3377dff18404 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -370,8 +370,9 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
{
struct path path;
int error;
-
- error = user_path(pathname, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = mnt_want_write(path.mnt);
@@ -380,6 +381,10 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
mnt_drop_write(path.mnt);
}
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -389,8 +394,9 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
{
struct path path;
int error;
-
- error = user_lpath(pathname, &path);
+ unsigned int lookup_flags = 0;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = mnt_want_write(path.mnt);
@@ -399,6 +405,10 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
mnt_drop_write(path.mnt);
}
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -476,12 +486,17 @@ SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
{
struct path path;
ssize_t error;
-
- error = user_path(pathname, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = getxattr(path.dentry, name, value, size);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -490,12 +505,17 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
{
struct path path;
ssize_t error;
-
- error = user_lpath(pathname, &path);
+ unsigned int lookup_flags = 0;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = getxattr(path.dentry, name, value, size);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -556,12 +576,17 @@ SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
{
struct path path;
ssize_t error;
-
- error = user_path(pathname, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = listxattr(path.dentry, list, size);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -570,12 +595,17 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
{
struct path path;
ssize_t error;
-
- error = user_lpath(pathname, &path);
+ unsigned int lookup_flags = 0;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = listxattr(path.dentry, list, size);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -615,8 +645,9 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
{
struct path path;
int error;
-
- error = user_path(pathname, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = mnt_want_write(path.mnt);
@@ -625,6 +656,10 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
mnt_drop_write(path.mnt);
}
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -633,8 +668,9 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
{
struct path path;
int error;
-
- error = user_lpath(pathname, &path);
+ unsigned int lookup_flags = 0;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = mnt_want_write(path.mnt);
@@ -643,6 +679,10 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
mnt_drop_write(path.mnt);
}
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5a7ffe54f5d5..cc33aaf219f1 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -70,8 +70,8 @@ config XFS_RT
If unsure, say N.
config XFS_DEBUG
- bool "XFS Debugging support (EXPERIMENTAL)"
- depends on XFS_FS && EXPERIMENTAL
+ bool "XFS Debugging support"
+ depends on XFS_FS
help
Say Y here to get an XFS build with many debugging features,
including ASSERT checks, function wrappers around macros,
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 393055fe3aef..0ad23253e8b1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1925,8 +1925,6 @@ xfs_alloc_fix_freelist(
targs.mp = mp;
targs.agbp = agbp;
targs.agno = args->agno;
- targs.mod = targs.minleft = targs.wasdel = targs.userdata =
- targs.minalignslop = 0;
targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4111a40ebe1a..5f707e537171 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -86,11 +86,11 @@ xfs_destroy_ioend(
}
if (ioend->io_iocb) {
+ inode_dio_done(ioend->io_inode);
if (ioend->io_isasync) {
aio_complete(ioend->io_iocb, ioend->io_error ?
ioend->io_error : ioend->io_result, 0);
}
- inode_dio_done(ioend->io_inode);
}
mempool_free(ioend, xfs_ioend_pool);
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index aaf472532b3c..888683844d98 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -300,9 +300,12 @@ xfs_attr_set_int(
if (rsvd)
args.trans->t_flags |= XFS_TRANS_RESERVE;
- if ((error = xfs_trans_reserve(args.trans, args.total,
- XFS_ATTRSET_LOG_RES(mp, args.total), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) {
+ error = xfs_trans_reserve(args.trans, args.total,
+ XFS_ATTRSETM_LOG_RES(mp) +
+ XFS_ATTRSETRT_LOG_RES(mp) * args.total,
+ 0, XFS_TRANS_PERM_LOG_RES,
+ XFS_ATTRSET_LOG_COUNT);
+ if (error) {
xfs_trans_cancel(args.trans, 0);
return(error);
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 0e92d12765d2..b44af9211bd9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -147,7 +147,10 @@ xfs_bmap_local_to_extents(
xfs_fsblock_t *firstblock, /* first block allocated in xaction */
xfs_extlen_t total, /* total blocks needed by transaction */
int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
+ int whichfork, /* data or attr fork */
+ void (*init_fn)(struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp));
/*
* Search the extents list for the inode, for the extent containing bno.
@@ -357,7 +360,42 @@ xfs_bmap_add_attrfork_extents(
}
/*
- * Called from xfs_bmap_add_attrfork to handle local format files.
+ * Block initialisation functions for local to extent format conversion.
+ * As these get more complex, they will be moved to the relevant files,
+ * but for now they are too simple to worry about.
+ */
+STATIC void
+xfs_bmap_local_to_extents_init_fn(
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp)
+{
+ bp->b_ops = &xfs_bmbt_buf_ops;
+ memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+}
+
+STATIC void
+xfs_symlink_local_to_remote(
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp)
+{
+ /* remote symlink blocks are not verifiable until CRCs come along */
+ bp->b_ops = NULL;
+ memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files. Each
+ * different data fork content type needs a different callout to do the
+ * conversion. Some are basic and only require special block initialisation
+ * callouts for the data formatting, others (directories) are so specialised they
+ * handle everything themselves.
+ *
+ * XXX (dgc): investigate whether directory conversion can use the generic
+ * formatting callout. It should be possible - it's just a very complex
+ * formatter. It would also require passing the transaction through to the init
+ * function.
*/
STATIC int /* error */
xfs_bmap_add_attrfork_local(
@@ -368,25 +406,29 @@ xfs_bmap_add_attrfork_local(
int *flags) /* inode logging flags */
{
xfs_da_args_t dargs; /* args for dir/attr code */
- int error; /* error return value */
- xfs_mount_t *mp; /* mount structure pointer */
if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
return 0;
+
if (S_ISDIR(ip->i_d.di_mode)) {
- mp = ip->i_mount;
memset(&dargs, 0, sizeof(dargs));
dargs.dp = ip;
dargs.firstblock = firstblock;
dargs.flist = flist;
- dargs.total = mp->m_dirblkfsbs;
+ dargs.total = ip->i_mount->m_dirblkfsbs;
dargs.whichfork = XFS_DATA_FORK;
dargs.trans = tp;
- error = xfs_dir2_sf_to_block(&dargs);
- } else
- error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
- XFS_DATA_FORK);
- return error;
+ return xfs_dir2_sf_to_block(&dargs);
+ }
+
+ if (S_ISLNK(ip->i_d.di_mode))
+ return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
+ flags, XFS_DATA_FORK,
+ xfs_symlink_local_to_remote);
+
+ return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
+ XFS_DATA_FORK,
+ xfs_bmap_local_to_extents_init_fn);
}
/*
@@ -3099,8 +3141,6 @@ xfs_bmap_extents_to_btree(
args.fsbno = *firstblock;
}
args.minlen = args.maxlen = args.prod = 1;
- args.total = args.minleft = args.alignment = args.mod = args.isfl =
- args.minalignslop = 0;
args.wasdel = wasdel;
*logflagsp = 0;
if ((error = xfs_alloc_vextent(&args))) {
@@ -3221,7 +3261,10 @@ xfs_bmap_local_to_extents(
xfs_fsblock_t *firstblock, /* first block allocated in xaction */
xfs_extlen_t total, /* total blocks needed by transaction */
int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
+ int whichfork,
+ void (*init_fn)(struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp))
{
int error; /* error return value */
int flags; /* logging flags returned */
@@ -3241,12 +3284,12 @@ xfs_bmap_local_to_extents(
xfs_buf_t *bp; /* buffer for extent block */
xfs_bmbt_rec_host_t *ep;/* extent record pointer */
+ ASSERT((ifp->if_flags &
+ (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = ip->i_mount;
args.firstblock = *firstblock;
- ASSERT((ifp->if_flags &
- (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
/*
* Allocate a block. We know we need only one, since the
* file currently fits in an inode.
@@ -3259,20 +3302,21 @@ xfs_bmap_local_to_extents(
args.type = XFS_ALLOCTYPE_NEAR_BNO;
}
args.total = total;
- args.mod = args.minleft = args.alignment = args.wasdel =
- args.isfl = args.minalignslop = 0;
args.minlen = args.maxlen = args.prod = 1;
- if ((error = xfs_alloc_vextent(&args)))
+ error = xfs_alloc_vextent(&args);
+ if (error)
goto done;
- /*
- * Can't fail, the space was reserved.
- */
+
+ /* Can't fail, the space was reserved. */
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
*firstblock = args.fsbno;
bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
- bp->b_ops = &xfs_bmbt_buf_ops;
- memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+
+ /* initialise the block and copy the data */
+ init_fn(bp, ip, ifp);
+
+ /* account for the change in fork size and log everything */
xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
@@ -4680,9 +4724,6 @@ __xfs_bmapi_allocate(
return error;
}
- if (bma->flags & XFS_BMAPI_STACK_SWITCH)
- bma->stack_switch = 1;
-
error = xfs_bmap_alloc(bma);
if (error)
return error;
@@ -4922,8 +4963,32 @@ xfs_bmapi_write(
XFS_STATS_INC(xs_blk_mapw);
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ /*
+ * XXX (dgc): This assumes we are only called for inodes that
+ * contain content neutral data in local format. Anything that
+ * contains caller-specific data in local format that needs
+ * transformation to move to a block format needs to do the
+ * conversion to extent format itself.
+ *
+ * Directory data forks and attribute forks handle this
+ * themselves, but with the addition of metadata verifiers every
+ * data fork in local format now contains caller specific data
+ * and as such conversion through this function is likely to be
+ * broken.
+ *
+ * The only likely user of this branch is for remote symlinks,
+ * but we cannot overwrite the data fork contents of the symlink
+ * (EEXIST occurs higher up the stack) and so it will never go
+ * from local format to extent format here. Hence I don't think
+ * this branch is ever executed intentionally and we should
+ * consider removing it and asserting that xfs_bmapi_write()
+ * cannot be called directly on local format forks. i.e. callers
+ * are completely responsible for local to extent format
+ * conversion, not xfs_bmapi_write().
+ */
error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
- &bma.logflags, whichfork);
+ &bma.logflags, whichfork,
+ xfs_bmap_local_to_extents_init_fn);
if (error)
goto error0;
}
@@ -4956,6 +5021,9 @@ xfs_bmapi_write(
bma.flist = flist;
bma.firstblock = firstblock;
+ if (flags & XFS_BMAPI_STACK_SWITCH)
+ bma.stack_switch = 1;
+
while (bno < end && n < *nmap) {
inhole = eof || bma.got.br_startoff > bno;
wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 26673a0b20e7..4e8f0df82d02 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -175,7 +175,7 @@ xfs_buf_get_maps(
bp->b_map_count = map_count;
if (map_count == 1) {
- bp->b_maps = &bp->b_map;
+ bp->b_maps = &bp->__b_map;
return 0;
}
@@ -193,7 +193,7 @@ static void
xfs_buf_free_maps(
struct xfs_buf *bp)
{
- if (bp->b_maps != &bp->b_map) {
+ if (bp->b_maps != &bp->__b_map) {
kmem_free(bp->b_maps);
bp->b_maps = NULL;
}
@@ -377,8 +377,8 @@ xfs_buf_allocate_memory(
}
use_alloc_page:
- start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT;
- end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1)
+ start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
+ end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
>> PAGE_SHIFT;
page_count = end - start;
error = _xfs_buf_get_pages(bp, page_count, flags);
@@ -487,6 +487,7 @@ _xfs_buf_find(
struct rb_node *parent;
xfs_buf_t *bp;
xfs_daddr_t blkno = map[0].bm_bn;
+ xfs_daddr_t eofs;
int numblks = 0;
int i;
@@ -498,6 +499,23 @@ _xfs_buf_find(
ASSERT(!(numbytes < (1 << btp->bt_sshift)));
ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
+ /*
+ * Corrupted block numbers can get through to here, unfortunately, so we
+ * have to check that the buffer falls within the filesystem bounds.
+ */
+ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
+ if (blkno >= eofs) {
+ /*
+ * XXX (dgc): we should really be returning EFSCORRUPTED here,
+ * but none of the higher level infrastructure supports
+ * returning a specific error on buffer lookup failures.
+ */
+ xfs_alert(btp->bt_mount,
+ "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
+ __func__, blkno, eofs);
+ return NULL;
+ }
+
/* get tree root */
pag = xfs_perag_get(btp->bt_mount,
xfs_daddr_to_agno(btp->bt_mount, blkno));
@@ -640,7 +658,7 @@ _xfs_buf_read(
xfs_buf_flags_t flags)
{
ASSERT(!(flags & XBF_WRITE));
- ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL);
+ ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
@@ -933,8 +951,6 @@ xfs_buf_trylock(
locked = down_trylock(&bp->b_sema) == 0;
if (locked)
XB_SET_OWNER(bp);
- else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
- xfs_log_force(bp->b_target->bt_mount, 0);
trace_xfs_buf_trylock(bp, _RET_IP_);
return locked;
@@ -1487,6 +1503,8 @@ restart:
while (!list_empty(&btp->bt_lru)) {
bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
if (atomic_read(&bp->b_hold) > 1) {
+ trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+ list_move_tail(&bp->b_lru, &btp->bt_lru);
spin_unlock(&btp->bt_lru_lock);
delay(100);
goto restart;
@@ -1709,7 +1727,7 @@ xfs_buf_cmp(
struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
xfs_daddr_t diff;
- diff = ap->b_map.bm_bn - bp->b_map.bm_bn;
+ diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
if (diff < 0)
return -1;
if (diff > 0)
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 23f5642480bb..433a12ed7b17 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -151,7 +151,7 @@ typedef struct xfs_buf {
struct page **b_pages; /* array of page pointers */
struct page *b_page_array[XB_PAGES]; /* inline pages */
struct xfs_buf_map *b_maps; /* compound buffer map */
- struct xfs_buf_map b_map; /* inline compound buffer map */
+ struct xfs_buf_map __b_map; /* inline compound buffer map */
int b_map_count;
int b_io_length; /* IO size in BBs */
atomic_t b_pin_count; /* pin count */
@@ -330,8 +330,8 @@ void xfs_buf_stale(struct xfs_buf *bp);
* In future, uncached buffers will pass the block number directly to the io
* request function and hence these macros will go away at that point.
*/
-#define XFS_BUF_ADDR(bp) ((bp)->b_map.bm_bn)
-#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_map.bm_bn = (xfs_daddr_t)(bno))
+#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn)
+#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))
static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index becf4a97efc6..cf263476d6b4 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -37,109 +37,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_buf_log_item, bli_item);
}
-
-#ifdef XFS_TRANS_DEBUG
-/*
- * This function uses an alternate strategy for tracking the bytes
- * that the user requests to be logged. This can then be used
- * in conjunction with the bli_orig array in the buf log item to
- * catch bugs in our callers' code.
- *
- * We also double check the bits set in xfs_buf_item_log using a
- * simple algorithm to check that every byte is accounted for.
- */
-STATIC void
-xfs_buf_item_log_debug(
- xfs_buf_log_item_t *bip,
- uint first,
- uint last)
-{
- uint x;
- uint byte;
- uint nbytes;
- uint chunk_num;
- uint word_num;
- uint bit_num;
- uint bit_set;
- uint *wordp;
-
- ASSERT(bip->bli_logged != NULL);
- byte = first;
- nbytes = last - first + 1;
- bfset(bip->bli_logged, first, nbytes);
- for (x = 0; x < nbytes; x++) {
- chunk_num = byte >> XFS_BLF_SHIFT;
- word_num = chunk_num >> BIT_TO_WORD_SHIFT;
- bit_num = chunk_num & (NBWORD - 1);
- wordp = &(bip->bli_format.blf_data_map[word_num]);
- bit_set = *wordp & (1 << bit_num);
- ASSERT(bit_set);
- byte++;
- }
-}
-
-/*
- * This function is called when we flush something into a buffer without
- * logging it. This happens for things like inodes which are logged
- * separately from the buffer.
- */
-void
-xfs_buf_item_flush_log_debug(
- xfs_buf_t *bp,
- uint first,
- uint last)
-{
- xfs_buf_log_item_t *bip = bp->b_fspriv;
- uint nbytes;
-
- if (bip == NULL || (bip->bli_item.li_type != XFS_LI_BUF))
- return;
-
- ASSERT(bip->bli_logged != NULL);
- nbytes = last - first + 1;
- bfset(bip->bli_logged, first, nbytes);
-}
-
-/*
- * This function is called to verify that our callers have logged
- * all the bytes that they changed.
- *
- * It does this by comparing the original copy of the buffer stored in
- * the buf log item's bli_orig array to the current copy of the buffer
- * and ensuring that all bytes which mismatch are set in the bli_logged
- * array of the buf log item.
- */
-STATIC void
-xfs_buf_item_log_check(
- xfs_buf_log_item_t *bip)
-{
- char *orig;
- char *buffer;
- int x;
- xfs_buf_t *bp;
-
- ASSERT(bip->bli_orig != NULL);
- ASSERT(bip->bli_logged != NULL);
-
- bp = bip->bli_buf;
- ASSERT(bp->b_length > 0);
- ASSERT(bp->b_addr != NULL);
- orig = bip->bli_orig;
- buffer = bp->b_addr;
- for (x = 0; x < BBTOB(bp->b_length); x++) {
- if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
- xfs_emerg(bp->b_mount,
- "%s: bip %x buffer %x orig %x index %d",
- __func__, bip, bp, orig, x);
- ASSERT(0);
- }
- }
-}
-#else
-#define xfs_buf_item_log_debug(x,y,z)
-#define xfs_buf_item_log_check(x)
-#endif
-
STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
/*
@@ -237,7 +134,7 @@ xfs_buf_item_size(
* cancel flag in it.
*/
trace_xfs_buf_item_size_stale(bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
return bip->bli_format_count;
}
@@ -278,7 +175,7 @@ xfs_buf_item_format_segment(
uint buffer_offset;
/* copy the flags across from the base format item */
- blfp->blf_flags = bip->bli_format.blf_flags;
+ blfp->blf_flags = bip->__bli_format.blf_flags;
/*
* Base size is the actual size of the ondisk structure - it reflects
@@ -287,6 +184,17 @@ xfs_buf_item_format_segment(
*/
base_size = offsetof(struct xfs_buf_log_format, blf_data_map) +
(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
+
+ nvecs = 0;
+ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
+ if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
+ /*
+ * If the map is not dirty in the transaction, mark
+ * the size as zero and do not advance the vector pointer.
+ */
+ goto out;
+ }
+
vecp->i_addr = blfp;
vecp->i_len = base_size;
vecp->i_type = XLOG_REG_TYPE_BFORMAT;
@@ -301,15 +209,13 @@ xfs_buf_item_format_segment(
*/
trace_xfs_buf_item_format_stale(bip);
ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
- blfp->blf_size = nvecs;
- return vecp;
+ goto out;
}
/*
* Fill in an iovec for each set of contiguous chunks.
*/
- first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
- ASSERT(first_bit != -1);
+
last_bit = first_bit;
nbits = 1;
for (;;) {
@@ -371,7 +277,8 @@ xfs_buf_item_format_segment(
nbits++;
}
}
- bip->bli_format.blf_size = nvecs;
+out:
+ blfp->blf_size = nvecs;
return vecp;
}
@@ -405,7 +312,7 @@ xfs_buf_item_format(
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
- bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
+ bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
}
@@ -419,7 +326,6 @@ xfs_buf_item_format(
* Check to make sure everything is consistent.
*/
trace_xfs_buf_item_format(bip);
- xfs_buf_item_log_check(bip);
}
/*
@@ -485,7 +391,7 @@ xfs_buf_item_unpin(
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(xfs_buf_islocked(bp));
ASSERT(XFS_BUF_ISSTALE(bp));
- ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
trace_xfs_buf_item_unpin_stale(bip);
@@ -563,8 +469,18 @@ xfs_buf_item_push(
if (xfs_buf_ispinned(bp))
return XFS_ITEM_PINNED;
- if (!xfs_buf_trylock(bp))
+ if (!xfs_buf_trylock(bp)) {
+ /*
+ * If we have just raced with a buffer being pinned and it has
+ * been marked stale, we could end up stalling until someone else
+ * issues a log force to unpin the stale buffer. Check for the
+ * race condition here so xfsaild recognizes the buffer is pinned
+ * and queues a log force to move it along.
+ */
+ if (xfs_buf_ispinned(bp))
+ return XFS_ITEM_PINNED;
return XFS_ITEM_LOCKED;
+ }
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
@@ -601,7 +517,7 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- int aborted;
+ int aborted, clean, i;
uint hold;
/* Clear the buffer's association with this transaction. */
@@ -631,7 +547,7 @@ xfs_buf_item_unlock(
*/
if (bip->bli_flags & XFS_BLI_STALE) {
trace_xfs_buf_item_unlock_stale(bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
if (!aborted) {
atomic_dec(&bip->bli_refcount);
return;
@@ -642,12 +558,27 @@ xfs_buf_item_unlock(
/*
* If the buf item isn't tracking any data, free it, otherwise drop the
- * reference we hold to it.
+ * reference we hold to it. If we are aborting the transaction, this may
+ * be the only reference to the buf item, so we free it anyway
+ * regardless of whether it is dirty or not. A dirty abort implies a
+ * shutdown, anyway.
*/
- if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size))
+ clean = 1;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+ bip->bli_formats[i].blf_map_size)) {
+ clean = 0;
+ break;
+ }
+ }
+ if (clean)
xfs_buf_item_relse(bp);
- else
+ else if (aborted) {
+ if (atomic_dec_and_test(&bip->bli_refcount)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ xfs_buf_item_relse(bp);
+ }
+ } else
atomic_dec(&bip->bli_refcount);
if (!hold)
@@ -716,7 +647,7 @@ xfs_buf_item_get_format(
bip->bli_format_count = count;
if (count == 1) {
- bip->bli_formats = &bip->bli_format;
+ bip->bli_formats = &bip->__bli_format;
return 0;
}
@@ -731,7 +662,7 @@ STATIC void
xfs_buf_item_free_format(
struct xfs_buf_log_item *bip)
{
- if (bip->bli_formats != &bip->bli_format) {
+ if (bip->bli_formats != &bip->__bli_format) {
kmem_free(bip->bli_formats);
bip->bli_formats = NULL;
}
@@ -898,8 +829,6 @@ xfs_buf_item_log_segment(
mask = (1 << end_bit) - 1;
*wordp |= mask;
}
-
- xfs_buf_item_log_debug(bip, first, last);
}
/*
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 6850f49f4af3..ee36c88ecfde 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -98,13 +98,9 @@ typedef struct xfs_buf_log_item {
unsigned int bli_flags; /* misc flags */
unsigned int bli_recur; /* lock recursion count */
atomic_t bli_refcount; /* cnt of tp refs */
-#ifdef XFS_TRANS_DEBUG
- char *bli_orig; /* original buffer copy */
- char *bli_logged; /* bytes logged (bitmap) */
-#endif
int bli_format_count; /* count of headers */
struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */
- struct xfs_buf_log_format bli_format; /* embedded in-log header */
+ struct xfs_buf_log_format __bli_format; /* embedded in-log header */
} xfs_buf_log_item_t;
void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
@@ -117,16 +113,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *,
void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
-#ifdef XFS_TRANS_DEBUG
-void
-xfs_buf_item_flush_log_debug(
- struct xfs_buf *bp,
- uint first,
- uint last);
-#else
-#define xfs_buf_item_flush_log_debug(bp, first, last)
-#endif
-
#endif /* __KERNEL__ */
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index d0e9c74d3d96..f852b082a084 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -78,14 +78,14 @@ xfs_swapext(
goto out_put_tmp_file;
}
- if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) ||
- IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) {
+ if (IS_SWAPFILE(file_inode(f.file)) ||
+ IS_SWAPFILE(file_inode(tmp.file))) {
error = XFS_ERROR(EINVAL);
goto out_put_tmp_file;
}
- ip = XFS_I(f.file->f_path.dentry->d_inode);
- tip = XFS_I(tmp.file->f_path.dentry->d_inode);
+ ip = XFS_I(file_inode(f.file));
+ tip = XFS_I(file_inode(tmp.file));
if (ip->i_mount != tip->i_mount) {
error = XFS_ERROR(EINVAL);
@@ -246,10 +246,10 @@ xfs_swap_extents(
goto out_unlock;
}
- error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
if (error)
goto out_unlock;
- truncate_pagecache_range(VFS_I(ip), 0, -1);
+ truncate_pagecache_range(VFS_I(tip), 0, -1);
/* Verify O_DIRECT for ftmp */
if (VN_CACHED(VFS_I(tip)) != 0) {
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 7536faaa61e7..12afe07a91d7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -355,10 +355,12 @@ xfs_dir2_block_addname(
/*
* If need to compact the leaf entries, do it now.
*/
- if (compact)
+ if (compact) {
xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
&lfloghigh, &lfloglow);
- else if (btp->stale) {
+ /* recalculate blp post-compaction */
+ blp = xfs_dir2_block_leaf_p(btp);
+ } else if (btp->stale) {
/*
* Set leaf logging boundaries to impossible state.
* For the no-stale case they're set explicitly.
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 9e1bf5294c91..8025eb23ad72 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -612,15 +612,9 @@ xfs_qm_dqread(
if (flags & XFS_QMOPT_DQALLOC) {
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
- XFS_WRITE_LOG_RES(mp) +
- /*
- * Round the chunklen up to the next multiple
- * of 128 (buf log item chunk size)).
- */
- BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128,
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
+ XFS_QM_DQALLOC_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
if (error)
goto error1;
cancelflags = XFS_TRANS_RELEASE_LOG_RES;
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index a83611849cee..c585bc646395 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -48,7 +48,7 @@ static int xfs_fileid_length(int fileid_type)
case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
return 6;
}
- return 255; /* invalid */
+ return FILEID_INVALID;
}
STATIC int
@@ -90,7 +90,7 @@ xfs_fs_encode_fh(
len = xfs_fileid_length(fileid_type);
if (*max_len < len) {
*max_len = len;
- return 255;
+ return FILEID_INVALID;
}
*max_len = len;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 67284edb84d7..f03bf1a456fb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -811,7 +811,7 @@ xfs_file_fallocate(
loff_t offset,
loff_t len)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
long error;
loff_t new_size = 0;
xfs_flock64_t bf;
@@ -912,7 +912,7 @@ xfs_file_readdir(
void *dirent,
filldir_t filldir)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
xfs_inode_t *ip = XFS_I(inode);
int error;
size_t bufsize;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 94eaeedc5498..2866b8c78b7a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -709,8 +709,8 @@ xfs_fs_log_dummy(
int error;
tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
- error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
- XFS_DEFAULT_LOG_COUNT);
+ error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
+ XFS_DEFAULT_LOG_COUNT);
if (error) {
xfs_trans_cancel(tp, 0);
return error;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index a815412eab80..515bf71ce01c 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -279,8 +279,6 @@ xfs_ialloc_ag_alloc(
(args.agbno < be32_to_cpu(agi->agi_length)))) {
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
args.type = XFS_ALLOCTYPE_THIS_BNO;
- args.mod = args.total = args.wasdel = args.isfl =
- args.userdata = args.minalignslop = 0;
args.prod = 1;
/*
@@ -333,8 +331,6 @@ xfs_ialloc_ag_alloc(
* Allocate a fixed-size extent of inodes.
*/
args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.mod = args.total = args.wasdel = args.isfl =
- args.userdata = args.minalignslop = 0;
args.prod = 1;
/*
* Allow space for the inode btree to split.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 66282dcb821b..4f201656d2d9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2379,9 +2379,6 @@ xfs_iflush_fork(
char *cp;
xfs_ifork_t *ifp;
xfs_mount_t *mp;
-#ifdef XFS_TRANS_DEBUG
- int first;
-#endif
static const short brootflag[2] =
{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
static const short dataflag[2] =
@@ -2724,9 +2721,6 @@ xfs_iflush_int(
xfs_inode_log_item_t *iip;
xfs_dinode_t *dip;
xfs_mount_t *mp;
-#ifdef XFS_TRANS_DEBUG
- int first;
-#endif
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(xfs_isiflocked(ip));
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 22baf6ea4fac..237e7f6f2ab3 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -419,6 +419,7 @@ static inline void xfs_iflock(struct xfs_inode *ip)
static inline void xfs_ifunlock(struct xfs_inode *ip)
{
xfs_iflags_clear(ip, XFS_IFLOCK);
+ smp_mb();
wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
}
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d041d47d9d86..f034bd1652f0 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -269,17 +269,6 @@ xfs_inode_item_format(
} else {
ASSERT(!(iip->ili_fields &
XFS_ILOG_DBROOT));
-#ifdef XFS_TRANS_DEBUG
- if (iip->ili_root_size > 0) {
- ASSERT(iip->ili_root_size ==
- ip->i_df.if_broot_bytes);
- ASSERT(memcmp(iip->ili_orig_root,
- ip->i_df.if_broot,
- iip->ili_root_size) == 0);
- } else {
- ASSERT(ip->i_df.if_broot_bytes == 0);
- }
-#endif
iip->ili_fields &= ~XFS_ILOG_DBROOT;
}
break;
@@ -678,11 +667,6 @@ void
xfs_inode_item_destroy(
xfs_inode_t *ip)
{
-#ifdef XFS_TRANS_DEBUG
- if (ip->i_itemp->ili_root_size != 0) {
- kmem_free(ip->i_itemp->ili_orig_root);
- }
-#endif
kmem_zone_free(xfs_ili_zone, ip->i_itemp);
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 376d4d0b2635..779812fb3d80 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -148,10 +148,6 @@ typedef struct xfs_inode_log_item {
data exts */
struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
attr exts */
-#ifdef XFS_TRANS_DEBUG
- int ili_root_size;
- char *ili_orig_root;
-#endif
xfs_inode_log_format_t ili_format; /* logged structure */
} xfs_inode_log_item_t;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c1c3ef88a260..d681e34c2950 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -80,7 +80,7 @@ xfs_find_handle(
f = fdget(hreq->fd);
if (!f.file)
return -EBADF;
- inode = f.file->f_path.dentry->d_inode;
+ inode = file_inode(f.file);
} else {
error = user_lpath((const char __user *)hreq->path, &path);
if (error)
@@ -168,7 +168,7 @@ xfs_handle_to_dentry(
/*
* Only allow handle opens under a directory.
*/
- if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
+ if (!S_ISDIR(file_inode(parfilp)->i_mode))
return ERR_PTR(-ENOTDIR);
if (hlen != sizeof(xfs_handle_t))
@@ -1334,7 +1334,7 @@ xfs_file_ioctl(
unsigned int cmd,
unsigned long p)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
void __user *arg = (void __user *)p;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1244274a5674..63b8fc432151 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -530,7 +530,7 @@ xfs_file_compat_ioctl(
unsigned cmd,
unsigned long p)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
void __user *arg = (void __user *)p;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index add06b4e9a63..912d83d8860a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -311,6 +311,62 @@ xfs_iomap_eof_want_preallocate(
}
/*
+ * Determine the initial size of the preallocation. We are beyond the current
+ * EOF here, but we need to take into account whether this is a sparse write or
+ * an extending write when determining the preallocation size. Hence we need to
+ * look up the extent that ends at the current write offset and use the result
+ * to determine the preallocation size.
+ *
+ * If the extent is a hole, then preallocation is essentially disabled.
+ * Otherwise we take the size of the preceding data extent as the basis for the
+ * preallocation size. If the size of the extent is greater than half the
+ * maximum extent length, then use the current offset as the basis. This ensures
+ * that for large files the preallocation size always extends to MAXEXTLEN
+ * rather than falling short due to things like stripe unit/width alignment of
+ * real extents.
+ */
+STATIC int
+xfs_iomap_eof_prealloc_initial_size(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_bmbt_irec_t *imap,
+ int nimaps)
+{
+ xfs_fileoff_t start_fsb;
+ int imaps = 1;
+ int error;
+
+ ASSERT(nimaps >= imaps);
+
+ /* if we are using a specific prealloc size, return now */
+ if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+ return 0;
+
+ /*
+ * As we write multiple pages, the offset will always align to the
+ * start of a page and hence point to a hole at EOF. i.e. if the size is
+ * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
+ * will return FSB 1. Hence if there are blocks in the file, we want to
+ * point to the block prior to the EOF block and not the hole that maps
+ * directly at @offset.
+ */
+ start_fsb = XFS_B_TO_FSB(mp, offset);
+ if (start_fsb)
+ start_fsb--;
+ error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
+ if (error)
+ return 0;
+
+ ASSERT(imaps == 1);
+ if (imap[0].br_startblock == HOLESTARTBLOCK)
+ return 0;
+ if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
+ return imap[0].br_blockcount;
+ return XFS_B_TO_FSB(mp, offset);
+}
+
+/*
* If we don't have a user specified preallocation size, dynamically increase
* the preallocation size as the size of the file grows. Cap the maximum size
* at a single extent or less if the filesystem is near full. The closer the
@@ -319,20 +375,19 @@ xfs_iomap_eof_want_preallocate(
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
struct xfs_mount *mp,
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ struct xfs_bmbt_irec *imap,
+ int nimaps)
{
xfs_fsblock_t alloc_blocks = 0;
- if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+ alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
+ imap, nimaps);
+ if (alloc_blocks > 0) {
int shift = 0;
int64_t freesp;
- /*
- * rounddown_pow_of_two() returns an undefined result
- * if we pass in alloc_blocks = 0. Hence the "+ 1" to
- * ensure we always pass in a non-zero value.
- */
- alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
rounddown_pow_of_two(alloc_blocks));
@@ -351,6 +406,15 @@ xfs_iomap_prealloc_size(
}
if (shift)
alloc_blocks >>= shift;
+
+ /*
+ * If we are still trying to allocate more space than is
+ * available, squash the prealloc hard. This can happen if we
+ * have a large file on a small filesystem and the above
+ * lowspace thresholds are smaller than MAXEXTLEN.
+ */
+ while (alloc_blocks >= freesp)
+ alloc_blocks >>= 4;
}
if (alloc_blocks < mp->m_writeio_blocks)
@@ -390,7 +454,6 @@ xfs_iomap_write_delay(
extsz = xfs_get_extsz_hint(ip);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
imap, XFS_WRITE_IMAPS, &prealloc);
if (error)
@@ -398,7 +461,10 @@ xfs_iomap_write_delay(
retry:
if (prealloc) {
- xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+ xfs_fsblock_t alloc_blocks;
+
+ alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
+ XFS_WRITE_IMAPS);
aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 46bd9d52ab51..eec226f78a40 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -120,7 +120,7 @@ xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
int count,
- boolean_t syncing);
+ bool syncing);
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
@@ -1737,7 +1737,7 @@ xlog_sync(
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
- xlog_verify_iclog(log, iclog, count, B_TRUE);
+ xlog_verify_iclog(log, iclog, count, true);
/* account for log which doesn't start at block #0 */
XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
@@ -3611,7 +3611,7 @@ xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
int count,
- boolean_t syncing)
+ bool syncing)
{
xlog_op_header_t *ophead;
xlog_in_core_t *icptr;
@@ -3659,7 +3659,7 @@ xlog_verify_iclog(
/* clientid is only 1 byte */
field_offset = (__psint_t)
((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
- if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+ if (!syncing || (field_offset & 0x1ff)) {
clientid = ophead->oh_clientid;
} else {
idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
@@ -3682,7 +3682,7 @@ xlog_verify_iclog(
/* check length */
field_offset = (__psint_t)
((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
- if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+ if (!syncing || (field_offset & 0x1ff)) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
idx = BTOBBT((__psint_t)&ophead->oh_len -
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 96fcbb85ff83..d1dba7ce75ae 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1442,9 +1442,8 @@ xlog_recover_find_tid(
xlog_tid_t tid)
{
xlog_recover_t *trans;
- struct hlist_node *n;
- hlist_for_each_entry(trans, n, head, r_list) {
+ hlist_for_each_entry(trans, head, r_list) {
if (trans->r_log_tid == tid)
return trans;
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index da508463ff10..3806088a8f77 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -658,7 +658,7 @@ xfs_sb_quiet_read_verify(
return;
}
/* quietly fail */
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, EWRONGFS);
}
static void
@@ -1109,8 +1109,8 @@ xfs_mount_reset_sbqflags(
return 0;
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
- error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
- XFS_DEFAULT_LOG_COUNT);
+ error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp),
+ 0, 0, XFS_DEFAULT_LOG_COUNT);
if (error) {
xfs_trans_cancel(tp, 0);
xfs_alert(mp, "%s: Superblock update failed!", __func__);
@@ -1583,8 +1583,8 @@ xfs_log_sbcount(xfs_mount_t *mp)
return 0;
tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
- error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
- XFS_DEFAULT_LOG_COUNT);
+ error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
+ XFS_DEFAULT_LOG_COUNT);
if (error) {
xfs_trans_cancel(tp, 0);
return error;
@@ -1945,8 +1945,8 @@ xfs_mount_log_sb(
XFS_SB_VERSIONNUM));
tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
- error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
- XFS_DEFAULT_LOG_COUNT);
+ error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
+ XFS_DEFAULT_LOG_COUNT);
if (error) {
xfs_trans_cancel(tp, 0);
return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bab8314507e4..bc907061d392 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -34,12 +34,19 @@ typedef struct xfs_trans_reservations {
uint tr_addafork; /* cvt inode to attributed trans */
uint tr_writeid; /* write setuid/setgid file */
uint tr_attrinval; /* attr fork buffer invalidation */
- uint tr_attrset; /* set/create an attribute */
+ uint tr_attrsetm; /* set/create an attribute at mount time */
+ uint tr_attrsetrt; /* set/create an attribute at runtime */
uint tr_attrrm; /* remove an attribute */
uint tr_clearagi; /* clear bad agi unlinked ino bucket */
uint tr_growrtalloc; /* grow realtime allocations */
uint tr_growrtzero; /* grow realtime zeroing */
uint tr_growrtfree; /* grow realtime freeing */
+ uint tr_qm_sbchange; /* change quota flags */
+ uint tr_qm_setqlim; /* adjust quota limits */
+ uint tr_qm_dqalloc; /* allocate quota on disk */
+ uint tr_qm_quotaoff; /* turn quota off */
+ uint tr_qm_equotaoff;/* end of turn quota off */
+ uint tr_sb; /* modify superblock */
} xfs_trans_reservations_t;
#ifndef __KERNEL__
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 60eff4763156..e5b5cf973781 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1584,10 +1584,9 @@ xfs_qm_write_sb_changes(
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
- if ((error = xfs_trans_reserve(tp, 0,
- mp->m_sb.sb_sectsize + 128, 0,
- 0,
- XFS_DEFAULT_LOG_COUNT))) {
+ error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp),
+ 0, 0, XFS_DEFAULT_LOG_COUNT);
+ if (error) {
xfs_trans_cancel(tp, 0);
return error;
}
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 6b39115bf145..2d02eac1c9a8 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -146,7 +146,7 @@ xfs_qm_newmount(
* inode goes inactive and wants to free blocks,
* or via xfs_log_mount_finish.
*/
- *needquotamount = B_TRUE;
+ *needquotamount = true;
*quotaflags = mp->m_qflags;
mp->m_qflags = 0;
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 5f53e75409b8..cf9a34051e07 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -408,10 +408,10 @@ xfs_qm_scall_getqstat(
{
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_inode *uip, *gip;
- boolean_t tempuqip, tempgqip;
+ bool tempuqip, tempgqip;
uip = gip = NULL;
- tempuqip = tempgqip = B_FALSE;
+ tempuqip = tempgqip = false;
memset(out, 0, sizeof(fs_quota_stat_t));
out->qs_version = FS_QSTAT_VERSION;
@@ -434,12 +434,12 @@ xfs_qm_scall_getqstat(
if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
0, 0, &uip) == 0)
- tempuqip = B_TRUE;
+ tempuqip = true;
}
if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
0, 0, &gip) == 0)
- tempgqip = B_TRUE;
+ tempgqip = true;
}
if (uip) {
out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
@@ -490,8 +490,9 @@ xfs_qm_scall_setqlim(
return 0;
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
- if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
- 0, 0, XFS_DEFAULT_LOG_COUNT))) {
+ error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
+ 0, 0, XFS_DEFAULT_LOG_COUNT);
+ if (error) {
xfs_trans_cancel(tp, 0);
return (error);
}
@@ -638,8 +639,9 @@ xfs_qm_log_quotaoff_end(
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
- if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
- 0, 0, XFS_DEFAULT_LOG_COUNT))) {
+ error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp),
+ 0, 0, XFS_DEFAULT_LOG_COUNT);
+ if (error) {
xfs_trans_cancel(tp, 0);
return (error);
}
@@ -671,14 +673,10 @@ xfs_qm_log_quotaoff(
uint oldsbqflag=0;
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
- if ((error = xfs_trans_reserve(tp, 0,
- sizeof(xfs_qoff_logitem_t) * 2 +
- mp->m_sb.sb_sectsize + 128,
- 0,
- 0,
- XFS_DEFAULT_LOG_COUNT))) {
+ error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp),
+ 0, 0, XFS_DEFAULT_LOG_COUNT);
+ if (error)
goto error0;
- }
qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
xfs_trans_log_quotaoff_item(tp, qoffi);
@@ -784,11 +782,11 @@ xfs_qm_scall_getquota(
(XFS_IS_OQUOTA_ENFORCED(mp) &&
(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
dst->d_id != 0) {
- if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) &&
+ if ((dst->d_bcount > dst->d_blk_softlimit) &&
(dst->d_blk_softlimit > 0)) {
ASSERT(dst->d_btimer != 0);
}
- if (((int) dst->d_icount > (int) dst->d_ino_softlimit) &&
+ if ((dst->d_icount > dst->d_ino_softlimit) &&
(dst->d_ino_softlimit > 0)) {
ASSERT(dst->d_itimer != 0);
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ab8839b26272..c407121873b4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -139,9 +139,9 @@ static const match_table_t tokens = {
STATIC unsigned long
-suffix_strtoul(char *s, char **endp, unsigned int base)
+suffix_kstrtoint(char *s, unsigned int base, int *res)
{
- int last, shift_left_factor = 0;
+ int last, shift_left_factor = 0, _res;
char *value = s;
last = strlen(value) - 1;
@@ -158,7 +158,10 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
value[last] = '\0';
}
- return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
+ if (kstrtoint(s, base, &_res))
+ return -EINVAL;
+ *res = _res << shift_left_factor;
+ return 0;
}
/*
@@ -174,7 +177,7 @@ xfs_parseargs(
char *options)
{
struct super_block *sb = mp->m_super;
- char *this_char, *value, *eov;
+ char *this_char, *value;
int dsunit = 0;
int dswidth = 0;
int iosize = 0;
@@ -230,14 +233,16 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- mp->m_logbufs = simple_strtoul(value, &eov, 10);
+ if (kstrtoint(value, 10, &mp->m_logbufs))
+ return EINVAL;
} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
return EINVAL;
}
- mp->m_logbsize = suffix_strtoul(value, &eov, 10);
+ if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
+ return EINVAL;
} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
@@ -266,7 +271,8 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- iosize = simple_strtoul(value, &eov, 10);
+ if (kstrtoint(value, 10, &iosize))
+ return EINVAL;
iosizelog = ffs(iosize) - 1;
} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
if (!value || !*value) {
@@ -274,7 +280,8 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- iosize = suffix_strtoul(value, &eov, 10);
+ if (suffix_kstrtoint(value, 10, &iosize))
+ return EINVAL;
iosizelog = ffs(iosize) - 1;
} else if (!strcmp(this_char, MNTOPT_GRPID) ||
!strcmp(this_char, MNTOPT_BSDGROUPS)) {
@@ -296,14 +303,16 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- dsunit = simple_strtoul(value, &eov, 10);
+ if (kstrtoint(value, 10, &dsunit))
+ return EINVAL;
} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
return EINVAL;
}
- dswidth = simple_strtoul(value, &eov, 10);
+ if (kstrtoint(value, 10, &dswidth))
+ return EINVAL;
} else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2e137d4a85ae..16a812977eab 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -341,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_item_iodone);
DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
+DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 06ed520a767f..2fd7c1ff1d21 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -37,14 +37,45 @@
#include "xfs_extent_busy.h"
#include "xfs_bmap.h"
#include "xfs_quota.h"
+#include "xfs_qm.h"
#include "xfs_trans_priv.h"
#include "xfs_trans_space.h"
#include "xfs_inode_item.h"
+#include "xfs_log_priv.h"
+#include "xfs_buf_item.h"
#include "xfs_trace.h"
kmem_zone_t *xfs_trans_zone;
kmem_zone_t *xfs_log_item_desc_zone;
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer. Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+ return round_up(sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate the transaction log reservation per item in bytes.
+ *
+ * The nbufs argument is used to indicate the number of items that
+ * will be changed in a transaction. size is used to tell how many
+ * bytes should be reserved per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+ uint nbufs,
+ uint size)
+{
+ return nbufs * (size + xfs_buf_log_overhead());
+}
/*
* Various log reservation values.
@@ -85,18 +116,15 @@ xfs_calc_write_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
- 2 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 2) +
- 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
- XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
- (2 * mp->m_sb.sb_sectsize +
- 2 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 2) +
- 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
+ MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ XFS_FSB_TO_B(mp, 1))));
}
/*
@@ -117,18 +145,17 @@ xfs_calc_itruncate_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
- 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
- (4 * mp->m_sb.sb_sectsize +
- 4 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 4) +
- 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
- 128 * 5 +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+ MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(5, 0) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+ mp->m_in_maxlevels, 0)));
}
/*
@@ -148,14 +175,12 @@ xfs_calc_rename_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((4 * mp->m_sb.sb_inodesize +
- 2 * XFS_DIROP_LOG_RES(mp) +
- 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
- (3 * mp->m_sb.sb_sectsize +
- 3 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 3) +
- 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
+ MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+ XFS_FSB_TO_B(mp, 1))));
}
/*
@@ -175,15 +200,12 @@ xfs_calc_link_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- mp->m_sb.sb_inodesize +
- XFS_DIROP_LOG_RES(mp) +
- 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
- (mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+ MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1))));
}
/*
@@ -203,15 +225,12 @@ xfs_calc_remove_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- mp->m_sb.sb_inodesize +
- XFS_DIROP_LOG_RES(mp) +
- 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
- (2 * mp->m_sb.sb_sectsize +
- 2 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 2) +
- 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
+ MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ XFS_FSB_TO_B(mp, 1))));
}
/*
@@ -233,18 +252,18 @@ xfs_calc_symlink_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- mp->m_sb.sb_inodesize +
- XFS_FSB_TO_B(mp, 1) +
- XFS_DIROP_LOG_RES(mp) +
- 1024 +
- 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
- (2 * mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
- XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+ MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(1, 1024)),
+ (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(mp->m_in_maxlevels,
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1))));
}
/*
@@ -267,18 +286,19 @@ xfs_calc_create_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- mp->m_sb.sb_inodesize +
+ MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ (uint)XFS_FSB_TO_B(mp, 1) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, 1) +
- XFS_DIROP_LOG_RES(mp) +
- 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
- (3 * mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
- XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+ xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(mp->m_in_maxlevels,
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1))));
}
/*
@@ -306,16 +326,16 @@ xfs_calc_ifree_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- mp->m_sb.sb_inodesize +
- mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, 1) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
XFS_INODE_CLUSTER_SIZE(mp)) +
- 128 * 5 +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+ xfs_calc_buf_res(1, 0) +
+ xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+ mp->m_in_maxlevels, 0) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
}
/*
@@ -343,9 +363,9 @@ STATIC uint
xfs_calc_growdata_reservation(
struct xfs_mount *mp)
{
- return mp->m_sb.sb_sectsize * 3 +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+ return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
}
/*
@@ -362,12 +382,12 @@ STATIC uint
xfs_calc_growrtalloc_reservation(
struct xfs_mount *mp)
{
- return 2 * mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
- mp->m_sb.sb_inodesize +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
}
/*
@@ -379,7 +399,7 @@ STATIC uint
xfs_calc_growrtzero_reservation(
struct xfs_mount *mp)
{
- return mp->m_sb.sb_blocksize + 128;
+ return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
}
/*
@@ -396,11 +416,10 @@ STATIC uint
xfs_calc_growrtfree_reservation(
struct xfs_mount *mp)
{
- return mp->m_sb.sb_sectsize +
- 2 * mp->m_sb.sb_inodesize +
- mp->m_sb.sb_blocksize +
- mp->m_rsumsize +
- 128 * 5;
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+ xfs_calc_buf_res(1, mp->m_rsumsize);
}
/*
@@ -411,7 +430,7 @@ STATIC uint
xfs_calc_swrite_reservation(
struct xfs_mount *mp)
{
- return mp->m_sb.sb_inodesize + 128;
+ return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
}
/*
@@ -421,7 +440,7 @@ xfs_calc_swrite_reservation(
STATIC uint
xfs_calc_writeid_reservation(xfs_mount_t *mp)
{
- return mp->m_sb.sb_inodesize + 128;
+ return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
}
/*
@@ -437,13 +456,13 @@ xfs_calc_addafork_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- mp->m_sb.sb_inodesize +
- mp->m_sb.sb_sectsize * 2 +
- mp->m_dirblksize +
- XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+ xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(1, mp->m_dirblksize) +
+ xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
}
/*
@@ -461,35 +480,51 @@ STATIC uint
xfs_calc_attrinval_reservation(
struct xfs_mount *mp)
{
- return MAX((mp->m_sb.sb_inodesize +
- XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
- 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
- (4 * mp->m_sb.sb_sectsize +
- 4 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 4) +
- 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
+ return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+ XFS_FSB_TO_B(mp, 1))));
}
/*
- * Setting an attribute.
+ * Setting an attribute at mount time.
* the inode getting the attribute
* the superblock for allocations
* the agfs extents are allocated from
* the attribute btree * max depth
* the inode allocation btree
* Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime.
+ * the calculation is done partially at mount time and partially at runtime (see
+ * below).
*/
STATIC uint
-xfs_calc_attrset_reservation(
+xfs_calc_attrsetm_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- mp->m_sb.sb_inodesize +
- mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
- 128 * (2 + XFS_DA_NODE_MAXDEPTH);
+ xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ * the superblock for allocations: sector size
+ * the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate the space unit for one
+ * block so that the caller can figure out the total space according to the
+ * attribute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp).
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+ XFS_FSB_TO_B(mp, 1));
}
/*
@@ -508,16 +543,15 @@ xfs_calc_attrrm_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
- XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
- 128 * (1 + XFS_DA_NODE_MAXDEPTH +
- XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
- (2 * mp->m_sb.sb_sectsize +
- 2 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 2) +
- 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
+ MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+ XFS_FSB_TO_B(mp, 1)) +
+ (uint)XFS_FSB_TO_B(mp,
+ XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
+ (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ XFS_FSB_TO_B(mp, 1))));
}
/*
@@ -527,7 +561,78 @@ STATIC uint
xfs_calc_clear_agi_bucket_reservation(
struct xfs_mount *mp)
{
- return mp->m_sb.sb_sectsize + 128;
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Clearing the quotaflags in the superblock.
+ * the super block for changing quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_sbchange_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Adjusting quota limits.
+ * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Allocating quota on disk if needed.
+ * the write transaction log space: XFS_WRITE_LOG_RES(mp)
+ * the unit of quota allocation: one system block size
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_WRITE_LOG_RES(mp) +
+ xfs_calc_buf_res(1,
+ XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+}
+
+/*
+ * Turning off quotas.
+ * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ * the superblock for the quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_reservation(
+ struct xfs_mount *mp)
+{
+ return sizeof(struct xfs_qoff_logitem) * 2 +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * End of turning off quotas.
+ * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_end_reservation(
+ struct xfs_mount *mp)
+{
+ return sizeof(struct xfs_qoff_logitem) * 2;
+}
+
+/*
+ * Syncing the incore super block changes to disk.
+ * the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
}
/*
@@ -555,12 +660,19 @@ xfs_trans_init(
resp->tr_writeid = xfs_calc_writeid_reservation(mp);
resp->tr_addafork = xfs_calc_addafork_reservation(mp);
resp->tr_attrinval = xfs_calc_attrinval_reservation(mp);
- resp->tr_attrset = xfs_calc_attrset_reservation(mp);
+ resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp);
+ resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp);
resp->tr_attrrm = xfs_calc_attrrm_reservation(mp);
resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp);
resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp);
resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp);
resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp);
+ resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp);
+ resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp);
+ resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp);
+ resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp);
+ resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp);
+ resp->tr_sb = xfs_calc_sb_reservation(mp);
}
/*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c6c0601abd7a..cd29f6171021 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -252,17 +252,19 @@ struct xfs_log_item_desc {
* as long as SWRITE logs the entire inode core
*/
#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
+#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
-#define XFS_ATTRSET_LOG_RES(mp, ext) \
- ((mp)->m_reservations.tr_attrset + \
- (ext * (mp)->m_sb.sb_sectsize) + \
- (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
- (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
-#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
+#define XFS_ATTRSETM_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetm)
+#define XFS_ATTRSETRT_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetrt)
+#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
-
+#define XFS_QM_SBCHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_qm_sbchange)
+#define XFS_QM_SETQLIM_LOG_RES(mp) ((mp)->m_reservations.tr_qm_setqlim)
+#define XFS_QM_DQALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_qm_dqalloc)
+#define XFS_QM_QUOTAOFF_LOG_RES(mp) ((mp)->m_reservations.tr_qm_quotaoff)
+#define XFS_QM_QUOTAOFF_END_LOG_RES(mp) ((mp)->m_reservations.tr_qm_equotaoff)
+#define XFS_SB_LOG_RES(mp) ((mp)->m_reservations.tr_sb)
/*
* Various log count values.
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 6011ee661339..0eda7254305f 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -55,20 +55,6 @@ xfs_ail_check(
ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
-#ifdef XFS_TRANS_DEBUG
- /*
- * Walk the list checking lsn ordering, and that every entry has the
- * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
- * when specifically debugging the transaction subsystem.
- */
- prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
- list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
- if (&prev_lip->li_ail != &ailp->xa_ail)
- ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
- ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
- prev_lip = lip;
- }
-#endif /* XFS_TRANS_DEBUG */
}
#else /* !DEBUG */
#define xfs_ail_check(a,l)
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 4fc17d479d42..3edf5dbee001 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -93,7 +93,7 @@ _xfs_trans_bjoin(
xfs_buf_item_init(bp, tp->t_mountp);
bip = bp->b_fspriv;
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
if (reset_recur)
bip->bli_recur = 0;
@@ -432,7 +432,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
bip = bp->b_fspriv;
ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
trace_xfs_trans_brelse(bip);
@@ -519,7 +519,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
ASSERT(bp->b_transp == tp);
ASSERT(bip != NULL);
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_HOLD;
@@ -539,7 +539,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
ASSERT(bp->b_transp == tp);
ASSERT(bip != NULL);
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT(bip->bli_flags & XFS_BLI_HOLD);
@@ -598,7 +598,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bip->bli_flags &= ~XFS_BLI_STALE;
ASSERT(XFS_BUF_ISSTALE(bp));
XFS_BUF_UNSTALE(bp);
- bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
+ bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
}
tp->t_flags |= XFS_TRANS_DIRTY;
@@ -643,6 +643,7 @@ xfs_trans_binval(
xfs_buf_t *bp)
{
xfs_buf_log_item_t *bip = bp->b_fspriv;
+ int i;
ASSERT(bp->b_transp == tp);
ASSERT(bip != NULL);
@@ -657,8 +658,8 @@ xfs_trans_binval(
*/
ASSERT(XFS_BUF_ISSTALE(bp));
ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
- ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
return;
@@ -668,10 +669,12 @@ xfs_trans_binval(
bip->bli_flags |= XFS_BLI_STALE;
bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
- bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
- bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
- memset((char *)(bip->bli_format.blf_data_map), 0,
- (bip->bli_format.blf_map_size * sizeof(uint)));
+ bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
+ bip->__bli_format.blf_flags |= XFS_BLF_CANCEL;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ memset(bip->bli_formats[i].blf_data_map, 0,
+ (bip->bli_formats[i].blf_map_size * sizeof(uint)));
+ }
bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
tp->t_flags |= XFS_TRANS_DIRTY;
}
@@ -775,5 +778,5 @@ xfs_trans_dquot_buf(
type == XFS_BLF_GDQUOT_BUF);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
- bip->bli_format.blf_flags |= type;
+ bip->__bli_format.blf_flags |= type;
}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 0c7fa54f309e..642c2d6e1db1 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -516,7 +516,7 @@ xfs_trans_unreserve_and_mod_dquots(
int i, j;
xfs_dquot_t *dqp;
xfs_dqtrx_t *qtrx, *qa;
- boolean_t locked;
+ bool locked;
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
@@ -537,17 +537,17 @@ xfs_trans_unreserve_and_mod_dquots(
* about the number of blocks used field, or deltas.
* Also we don't bother to zero the fields.
*/
- locked = B_FALSE;
+ locked = false;
if (qtrx->qt_blk_res) {
xfs_dqlock(dqp);
- locked = B_TRUE;
+ locked = true;
dqp->q_res_bcount -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
if (qtrx->qt_ino_res) {
if (!locked) {
xfs_dqlock(dqp);
- locked = B_TRUE;
+ locked = true;
}
dqp->q_res_icount -=
(xfs_qcnt_t)qtrx->qt_ino_res;
@@ -556,7 +556,7 @@ xfs_trans_unreserve_and_mod_dquots(
if (qtrx->qt_rtblk_res) {
if (!locked) {
xfs_dqlock(dqp);
- locked = B_TRUE;
+ locked = true;
}
dqp->q_res_rtbcount -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index d2eee20d5f5b..ac6d567704db 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -33,14 +33,6 @@
#include "xfs_inode_item.h"
#include "xfs_trace.h"
-#ifdef XFS_TRANS_DEBUG
-STATIC void
-xfs_trans_inode_broot_debug(
- xfs_inode_t *ip);
-#else
-#define xfs_trans_inode_broot_debug(ip)
-#endif
-
/*
* Add a locked inode to the transaction.
*
@@ -67,8 +59,6 @@ xfs_trans_ijoin(
* Get a log_item_desc to point at the new item.
*/
xfs_trans_add_item(tp, &iip->ili_item);
-
- xfs_trans_inode_broot_debug(ip);
}
/*
@@ -135,34 +125,3 @@ xfs_trans_log_inode(
flags |= ip->i_itemp->ili_last_fields;
ip->i_itemp->ili_fields |= flags;
}
-
-#ifdef XFS_TRANS_DEBUG
-/*
- * Keep track of the state of the inode btree root to make sure we
- * log it properly.
- */
-STATIC void
-xfs_trans_inode_broot_debug(
- xfs_inode_t *ip)
-{
- xfs_inode_log_item_t *iip;
-
- ASSERT(ip->i_itemp != NULL);
- iip = ip->i_itemp;
- if (iip->ili_root_size != 0) {
- ASSERT(iip->ili_orig_root != NULL);
- kmem_free(iip->ili_orig_root);
- iip->ili_root_size = 0;
- iip->ili_orig_root = NULL;
- }
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
- ASSERT((ip->i_df.if_broot != NULL) &&
- (ip->i_df.if_broot_bytes > 0));
- iip->ili_root_size = ip->i_df.if_broot_bytes;
- iip->ili_orig_root =
- (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP);
- memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot),
- iip->ili_root_size);
- }
-}
-#endif
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 7a41874f4c20..61ba1cfa974c 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -32,7 +32,6 @@ typedef unsigned int __uint32_t;
typedef signed long long int __int64_t;
typedef unsigned long long int __uint64_t;
-typedef enum { B_FALSE,B_TRUE } boolean_t;
typedef __uint32_t prid_t; /* project ID */
typedef __uint32_t inst_t; /* an instruction */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index d95f565a390e..77ad74834baa 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -725,7 +725,7 @@ xfs_create(
int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- boolean_t unlock_dp_on_error = B_FALSE;
+ bool unlock_dp_on_error = false;
uint cancel_flags;
int committed;
prid_t prid;
@@ -794,7 +794,7 @@ xfs_create(
}
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
- unlock_dp_on_error = B_TRUE;
+ unlock_dp_on_error = true;
xfs_bmap_init(&free_list, &first_block);
@@ -830,7 +830,7 @@ xfs_create(
* error path.
*/
xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- unlock_dp_on_error = B_FALSE;
+ unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
&first_block, &free_list, resblks ?
@@ -1367,7 +1367,7 @@ xfs_symlink(
int pathlen;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- boolean_t unlock_dp_on_error = B_FALSE;
+ bool unlock_dp_on_error = false;
uint cancel_flags;
int committed;
xfs_fileoff_t first_fsb;
@@ -1438,7 +1438,7 @@ xfs_symlink(
}
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
- unlock_dp_on_error = B_TRUE;
+ unlock_dp_on_error = true;
/*
* Check whether the directory allows new symlinks or not.
@@ -1484,7 +1484,7 @@ xfs_symlink(
* error path.
*/
xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- unlock_dp_on_error = B_FALSE;
+ unlock_dp_on_error = false;
/*
* Also attach the dquot(s) to it, if applicable.