summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c2
-rw-r--r--fs/9p/v9fs.c8
-rw-r--r--fs/9p/v9fs.h23
-rw-r--r--fs/9p/vfs_dir.c13
-rw-r--r--fs/9p/vfs_file.c8
-rw-r--r--fs/9p/vfs_inode.c48
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/adfs.h2
-rw-r--r--fs/adfs/inode.c5
-rw-r--r--fs/affs/affs.h3
-rw-r--r--fs/affs/bitmap.c2
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/super.c1
-rw-r--r--fs/afs/write.c21
-rw-r--r--fs/anon_inodes.c1
-rw-r--r--fs/attr.c13
-rw-r--r--fs/bfs/inode.c5
-rw-r--r--fs/binfmt_aout.c38
-rw-r--r--fs/binfmt_elf.c151
-rw-r--r--fs/binfmt_elf_fdpic.c183
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/bio.c7
-rw-r--r--fs/btrfs/btrfs_inode.h5
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.h15
-rw-r--r--fs/btrfs/disk-io.c19
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c11
-rw-r--r--fs/btrfs/extent_io.c83
-rw-r--r--fs/btrfs/extent_io.h10
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/file.c23
-rw-r--r--fs/btrfs/free-space-cache.c4
-rw-r--r--fs/btrfs/inode.c143
-rw-r--r--fs/btrfs/ioctl.c706
-rw-r--r--fs/btrfs/ioctl.h111
-rw-r--r--fs/btrfs/ordered-data.c41
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/ref-cache.h2
-rw-r--r--fs/btrfs/relocation.c8
-rw-r--r--fs/btrfs/super.c243
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/transaction.c7
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c39
-rw-r--r--fs/buffer.c15
-rw-r--r--fs/cifs/asn1.c2
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifssmb.c2
-rw-r--r--fs/compat.c18
-rw-r--r--fs/compat_binfmt_elf.c2
-rw-r--r--fs/compat_ioctl.c4
-rw-r--r--fs/dlm/lockspace.c2
-rw-r--r--fs/dlm/member.c2
-rw-r--r--fs/exec.c50
-rw-r--r--fs/exofs/exofs.h2
-rw-r--r--fs/exofs/inode.c4
-rw-r--r--fs/ext2/balloc.c12
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/file.c5
-rw-r--r--fs/ext2/ialloc.c14
-rw-r--r--fs/ext2/inode.c18
-rw-r--r--fs/ext2/namei.c51
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/xattr.c10
-rw-r--r--fs/ext3/balloc.c11
-rw-r--r--fs/ext3/file.c7
-rw-r--r--fs/ext3/ialloc.c16
-rw-r--r--fs/ext3/inode.c45
-rw-r--r--fs/ext3/namei.c24
-rw-r--r--fs/ext3/super.c248
-rw-r--r--fs/ext3/xattr.c22
-rw-r--r--fs/ext4/balloc.c35
-rw-r--r--fs/ext4/block_validity.c4
-rw-r--r--fs/ext4/dir.c14
-rw-r--r--fs/ext4/ext4.h108
-rw-r--r--fs/ext4/ext4_jbd2.c4
-rw-r--r--fs/ext4/ext4_jbd2.h24
-rw-r--r--fs/ext4/extents.c260
-rw-r--r--fs/ext4/file.c7
-rw-r--r--fs/ext4/fsync.c2
-rw-r--r--fs/ext4/ialloc.c48
-rw-r--r--fs/ext4/inode.c492
-rw-r--r--fs/ext4/ioctl.c12
-rw-r--r--fs/ext4/mballoc.c81
-rw-r--r--fs/ext4/mballoc.h9
-rw-r--r--fs/ext4/migrate.c35
-rw-r--r--fs/ext4/move_extent.c36
-rw-r--r--fs/ext4/namei.c86
-rw-r--r--fs/ext4/resize.c102
-rw-r--r--fs/ext4/super.c361
-rw-r--r--fs/ext4/xattr.c64
-rw-r--r--fs/fat/inode.c11
-rw-r--r--fs/fat/namei_vfat.c27
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/file.c2
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/fs-writeback.c22
-rw-r--r--fs/fscache/Kconfig1
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/incore.h2
-rw-r--r--fs/gfs2/log.c3
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/gfs2/quota.c9
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/gfs2/super.c7
-rw-r--r--fs/gfs2/sys.c6
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/super.c3
-rw-r--r--fs/inode.c4
-rw-r--r--fs/jbd/commit.c10
-rw-r--r--fs/jbd/transaction.c45
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/jbd2/commit.c13
-rw-r--r--fs/jbd2/journal.c132
-rw-r--r--fs/jbd2/transaction.c43
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jfs/acl.c26
-rw-r--r--fs/jfs/file.c31
-rw-r--r--fs/jfs/inode.c14
-rw-r--r--fs/jfs/jfs_acl.h7
-rw-r--r--fs/jfs/jfs_dtree.c28
-rw-r--r--fs/jfs/jfs_extent.c16
-rw-r--r--fs/jfs/jfs_inode.c8
-rw-r--r--fs/jfs/jfs_inode.h3
-rw-r--r--fs/jfs/jfs_xtree.c21
-rw-r--r--fs/jfs/namei.c23
-rw-r--r--fs/jfs/super.c6
-rw-r--r--fs/jfs/xattr.c17
-rw-r--r--fs/lockd/host.c2
-rw-r--r--fs/lockd/mon.c12
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/locks.c2
-rw-r--r--fs/logfs/Kconfig17
-rw-r--r--fs/logfs/Makefile13
-rw-r--r--fs/logfs/compr.c95
-rw-r--r--fs/logfs/dev_bdev.c327
-rw-r--r--fs/logfs/dev_mtd.c254
-rw-r--r--fs/logfs/dir.c827
-rw-r--r--fs/logfs/file.c263
-rw-r--r--fs/logfs/gc.c730
-rw-r--r--fs/logfs/inode.c417
-rw-r--r--fs/logfs/journal.c883
-rw-r--r--fs/logfs/logfs.h724
-rw-r--r--fs/logfs/logfs_abi.h629
-rw-r--r--fs/logfs/readwrite.c2246
-rw-r--r--fs/logfs/segment.c927
-rw-r--r--fs/logfs/super.c650
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/namei.c533
-rw-r--r--fs/nfs/callback.c2
-rw-r--r--fs/nfs/callback.h8
-rw-r--r--fs/nfs/callback_proc.c165
-rw-r--r--fs/nfs/callback_xdr.c106
-rw-r--r--fs/nfs/client.c48
-rw-r--r--fs/nfs/delegation.h6
-rw-r--r--fs/nfs/dir.c4
-rw-r--r--fs/nfs/dns_resolve.c18
-rw-r--r--fs/nfs/file.c30
-rw-r--r--fs/nfs/inode.c96
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/nfs3proc.c9
-rw-r--r--fs/nfs/nfs4_fs.h2
-rw-r--r--fs/nfs/nfs4proc.c104
-rw-r--r--fs/nfs/nfs4renewd.c24
-rw-r--r--fs/nfs/nfs4state.c118
-rw-r--r--fs/nfs/nfs4xdr.c10
-rw-r--r--fs/nfs/pagelist.c23
-rw-r--r--fs/nfs/proc.c41
-rw-r--r--fs/nfs/super.c25
-rw-r--r--fs/nfs/symlink.c2
-rw-r--r--fs/nfs/write.c247
-rw-r--r--fs/nfsd/nfs4callback.c5
-rw-r--r--fs/nfsd/nfs4recover.c4
-rw-r--r--fs/nfsd/nfs4state.c6
-rw-r--r--fs/nfsd/nfs4xdr.c4
-rw-r--r--fs/nfsd/nfsctl.c24
-rw-r--r--fs/nfsd/vfs.c157
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/gcinode.c4
-rw-r--r--fs/nilfs2/page.c4
-rw-r--r--fs/nilfs2/segbuf.c10
-rw-r--r--fs/nilfs2/segment.c8
-rw-r--r--fs/nilfs2/segment.h4
-rw-r--r--fs/nilfs2/sufile.c2
-rw-r--r--fs/nilfs2/super.c4
-rw-r--r--fs/nilfs2/the_nilfs.c2
-rw-r--r--fs/ntfs/ChangeLog1702
-rw-r--r--fs/ntfs/dir.c2
-rw-r--r--fs/ntfs/file.c2
-rw-r--r--fs/ntfs/inode.c2
-rw-r--r--fs/ntfs/inode.h4
-rw-r--r--fs/ntfs/super.c33
-rw-r--r--fs/ocfs2/alloc.c13
-rw-r--r--fs/ocfs2/aops.c11
-rw-r--r--fs/ocfs2/cluster/masklog.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c4
-rw-r--r--fs/ocfs2/dir.c37
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/extent_map.c2
-rw-r--r--fs/ocfs2/file.c20
-rw-r--r--fs/ocfs2/inode.c6
-rw-r--r--fs/ocfs2/namei.c52
-rw-r--r--fs/ocfs2/quota_global.c7
-rw-r--r--fs/ocfs2/quota_local.c2
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/omfs/inode.c10
-rw-r--r--fs/open.c5
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/generic.c33
-rw-r--r--fs/proc/task_mmu.c13
-rw-r--r--fs/qnx4/inode.c3
-rw-r--r--fs/quota/Kconfig5
-rw-r--r--fs/quota/Makefile2
-rw-r--r--fs/quota/compat.c118
-rw-r--r--fs/quota/dquot.c412
-rw-r--r--fs/quota/netlink.c95
-rw-r--r--fs/quota/quota.c735
-rw-r--r--fs/reiserfs/bitmap.c12
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/inode.c24
-rw-r--r--fs/reiserfs/namei.c23
-rw-r--r--fs/reiserfs/stree.c20
-rw-r--r--fs/reiserfs/super.c15
-rw-r--r--fs/reiserfs/xattr.c4
-rw-r--r--fs/select.c19
-rw-r--r--fs/seq_file.c4
-rw-r--r--fs/squashfs/Makefile2
-rw-r--r--fs/squashfs/block.c76
-rw-r--r--fs/squashfs/cache.c1
-rw-r--r--fs/squashfs/decompressor.c68
-rw-r--r--fs/squashfs/decompressor.h55
-rw-r--r--fs/squashfs/dir.c1
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/squashfs/file.c1
-rw-r--r--fs/squashfs/fragment.c1
-rw-r--r--fs/squashfs/id.c1
-rw-r--r--fs/squashfs/inode.c1
-rw-r--r--fs/squashfs/namei.c1
-rw-r--r--fs/squashfs/squashfs.h8
-rw-r--r--fs/squashfs/squashfs_fs.h6
-rw-r--r--fs/squashfs/squashfs_fs_sb.h40
-rw-r--r--fs/squashfs/super.c49
-rw-r--r--fs/squashfs/symlink.c1
-rw-r--r--fs/squashfs/zlib_wrapper.c150
-rw-r--r--fs/sync.c14
-rw-r--r--fs/sysfs/bin.c50
-rw-r--r--fs/sysfs/dir.c132
-rw-r--r--fs/sysfs/file.c47
-rw-r--r--fs/sysfs/inode.c13
-rw-r--r--fs/sysfs/mount.c4
-rw-r--r--fs/sysfs/symlink.c38
-rw-r--r--fs/sysfs/sysfs.h17
-rw-r--r--fs/sysv/inode.c10
-rw-r--r--fs/sysv/sysv.h2
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c8
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/udf/balloc.c84
-rw-r--r--fs/udf/file.c28
-rw-r--r--fs/udf/ialloc.c14
-rw-r--r--fs/udf/inode.c46
-rw-r--r--fs/udf/namei.c17
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/ufs/balloc.c24
-rw-r--r--fs/ufs/file.c3
-rw-r--r--fs/ufs/ialloc.c11
-rw-r--r--fs/ufs/inode.c9
-rw-r--r--fs/ufs/namei.c18
-rw-r--r--fs/ufs/super.c9
-rw-r--r--fs/ufs/truncate.c10
-rw-r--r--fs/ufs/ufs.h2
-rw-r--r--fs/ufs/ufs_fs.h15
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c228
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c81
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c20
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c854
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c796
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.h29
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c19
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c16
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h22
-rw-r--r--fs/xfs/xfs_bmap.c220
-rw-r--r--fs/xfs/xfs_fs.h3
-rw-r--r--fs/xfs/xfs_iget.c19
-rw-r--r--fs/xfs/xfs_inode.c68
-rw-r--r--fs/xfs/xfs_inode.h3
-rw-r--r--fs/xfs/xfs_inode_item.c18
-rw-r--r--fs/xfs/xfs_itable.c2
-rw-r--r--fs/xfs/xfs_log.c106
-rw-r--r--fs/xfs/xfs_log.h16
-rw-r--r--fs/xfs/xfs_mount.c69
-rw-r--r--fs/xfs/xfs_mount.h2
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_buf.c216
-rw-r--r--fs/xfs/xfs_vnodeops.c107
-rw-r--r--fs/xfs/xfs_vnodeops.h15
311 files changed, 16427 insertions, 6960 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 14d94420457..08b2eb15704 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -151,7 +151,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
if (access == V9FS_ACCESS_SINGLE)
return ERR_PTR(-EPERM);
- if (v9fs_extended(v9ses))
+ if (v9fs_proto_dotu(v9ses))
uname = NULL;
else
uname = v9ses->uname;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 7d6c2139891..6c7f6a25111 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -241,7 +241,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
list_add(&v9ses->slist, &v9fs_sessionlist);
spin_unlock(&v9fs_sessionlist_lock);
- v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER;
+ v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
strcpy(v9ses->uname, V9FS_DEFUSER);
strcpy(v9ses->aname, V9FS_DEFANAME);
v9ses->uid = ~0;
@@ -262,13 +262,13 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
goto error;
}
- if (!v9ses->clnt->dotu)
- v9ses->flags &= ~V9FS_EXTENDED;
+ if (!p9_is_proto_dotu(v9ses->clnt))
+ v9ses->flags &= ~V9FS_PROTO_2000U;
v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
/* for legacy mode, fall back to V9FS_ACCESS_ANY */
- if (!v9fs_extended(v9ses) &&
+ if (!v9fs_proto_dotu(v9ses) &&
((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 019f4ccb70c..6b801d1ddf4 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -23,7 +23,8 @@
/**
* enum p9_session_flags - option flags for each 9P session
- * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions
+ * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
+ * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
* @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
* @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
* @V9FS_ACCESS_ANY: use a single attach for all users
@@ -32,11 +33,12 @@
* Session flags reflect options selected by users at mount time
*/
enum p9_session_flags {
- V9FS_EXTENDED = 0x01,
- V9FS_ACCESS_SINGLE = 0x02,
- V9FS_ACCESS_USER = 0x04,
- V9FS_ACCESS_ANY = 0x06,
- V9FS_ACCESS_MASK = 0x06,
+ V9FS_PROTO_2000U = 0x01,
+ V9FS_PROTO_2000L = 0x02,
+ V9FS_ACCESS_SINGLE = 0x04,
+ V9FS_ACCESS_USER = 0x08,
+ V9FS_ACCESS_ANY = 0x0C,
+ V9FS_ACCESS_MASK = 0x0C,
};
/* possible values of ->cache */
@@ -121,7 +123,12 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
return (inode->i_sb->s_fs_info);
}
-static inline int v9fs_extended(struct v9fs_session_info *v9ses)
+static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
{
- return v9ses->flags & V9FS_EXTENDED;
+ return v9ses->flags & V9FS_PROTO_2000U;
+}
+
+static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
+{
+ return v9ses->flags & V9FS_PROTO_2000L;
}
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 15cce53bf61..d8a3afe4ff7 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -76,6 +76,15 @@ static inline int dt_type(struct p9_wstat *mistat)
return rettype;
}
+static void p9stat_init(struct p9_wstat *stbuf)
+{
+ stbuf->name = NULL;
+ stbuf->uid = NULL;
+ stbuf->gid = NULL;
+ stbuf->muid = NULL;
+ stbuf->extension = NULL;
+}
+
/**
* v9fs_dir_readdir - read a directory
* @filp: opened file structure
@@ -131,11 +140,11 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
rdir->head = 0;
rdir->tail = err;
}
-
while (rdir->head < rdir->tail) {
+ p9stat_init(&st);
err = p9stat_read(rdir->buf + rdir->head,
buflen - rdir->head, &st,
- fid->clnt->dotu);
+ fid->clnt->proto_version);
if (err) {
P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 74a0461a9ac..df52d488d2a 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
v9ses = v9fs_inode2v9ses(inode);
- omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses));
+ omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
fid = file->private_data;
if (!fid) {
fid = v9fs_fid_clone(file->f_path.dentry);
@@ -77,7 +77,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
i_size_write(inode, 0);
inode->i_blocks = 0;
}
- if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
+ if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
generic_file_llseek(file, 0, SEEK_END);
}
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
/* No mandatory locks */
- if (__mandatory_lock(inode))
+ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
@@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
struct p9_fid *fid;
struct p9_client *clnt;
struct inode *inode = filp->f_path.dentry->d_inode;
- int origin = *offset;
+ loff_t origin = *offset;
unsigned long pg_start, pg_end;
P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index a407fa3388c..5fe45d692c9 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -60,7 +60,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
res = mode & 0777;
if (S_ISDIR(mode))
res |= P9_DMDIR;
- if (v9fs_extended(v9ses)) {
+ if (v9fs_proto_dotu(v9ses)) {
if (S_ISLNK(mode))
res |= P9_DMSYMLINK;
if (v9ses->nodev == 0) {
@@ -102,21 +102,21 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
if ((mode & P9_DMDIR) == P9_DMDIR)
res |= S_IFDIR;
- else if ((mode & P9_DMSYMLINK) && (v9fs_extended(v9ses)))
+ else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses)))
res |= S_IFLNK;
- else if ((mode & P9_DMSOCKET) && (v9fs_extended(v9ses))
+ else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses))
&& (v9ses->nodev == 0))
res |= S_IFSOCK;
- else if ((mode & P9_DMNAMEDPIPE) && (v9fs_extended(v9ses))
+ else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses))
&& (v9ses->nodev == 0))
res |= S_IFIFO;
- else if ((mode & P9_DMDEVICE) && (v9fs_extended(v9ses))
+ else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
&& (v9ses->nodev == 0))
res |= S_IFBLK;
else
res |= S_IFREG;
- if (v9fs_extended(v9ses)) {
+ if (v9fs_proto_dotu(v9ses)) {
if ((mode & P9_DMSETUID) == P9_DMSETUID)
res |= S_ISUID;
@@ -265,7 +265,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
case S_IFBLK:
case S_IFCHR:
case S_IFSOCK:
- if (!v9fs_extended(v9ses)) {
+ if (!v9fs_proto_dotu(v9ses)) {
P9_DPRINTK(P9_DEBUG_ERROR,
"special files without extended mode\n");
err = -EINVAL;
@@ -278,7 +278,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
inode->i_fop = &v9fs_file_operations;
break;
case S_IFLNK:
- if (!v9fs_extended(v9ses)) {
+ if (!v9fs_proto_dotu(v9ses)) {
P9_DPRINTK(P9_DEBUG_ERROR,
"extended modes used w/o 9P2000.u\n");
err = -EINVAL;
@@ -288,7 +288,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
break;
case S_IFDIR:
inc_nlink(inode);
- if (v9fs_extended(v9ses))
+ if (v9fs_proto_dotu(v9ses))
inode->i_op = &v9fs_dir_inode_operations_ext;
else
inode->i_op = &v9fs_dir_inode_operations;
@@ -575,7 +575,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
flags = O_RDWR;
fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
- v9fs_uflags2omode(flags, v9fs_extended(v9ses)));
+ v9fs_uflags2omode(flags,
+ v9fs_proto_dotu(v9ses)));
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
fid = NULL;
@@ -858,7 +859,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
if (iattr->ia_valid & ATTR_SIZE)
wstat.length = iattr->ia_size;
- if (v9fs_extended(v9ses)) {
+ if (v9fs_proto_dotu(v9ses)) {
if (iattr->ia_valid & ATTR_UID)
wstat.n_uid = iattr->ia_uid;
@@ -886,6 +887,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct super_block *sb)
{
char ext[32];
+ char tag_name[14];
+ unsigned int i_nlink;
struct v9fs_session_info *v9ses = sb->s_fs_info;
inode->i_nlink = 1;
@@ -897,11 +900,26 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
inode->i_uid = v9ses->dfltuid;
inode->i_gid = v9ses->dfltgid;
- if (v9fs_extended(v9ses)) {
+ if (v9fs_proto_dotu(v9ses)) {
inode->i_uid = stat->n_uid;
inode->i_gid = stat->n_gid;
}
-
+ if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) {
+ if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) {
+ /*
+ * Hadlink support got added later to
+ * to the .u extension. So there can be
+ * server out there that doesn't support
+ * this even with .u extension. So check
+ * for non NULL stat->extension
+ */
+ strncpy(ext, stat->extension, sizeof(ext));
+ /* HARDLINKCOUNT %u */
+ sscanf(ext, "%13s %u", tag_name, &i_nlink);
+ if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
+ inode->i_nlink = i_nlink;
+ }
+ }
inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
char type = 0;
@@ -976,7 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
if (IS_ERR(fid))
return PTR_ERR(fid);
- if (!v9fs_extended(v9ses))
+ if (!v9fs_proto_dotu(v9ses))
return -EBADF;
st = p9_client_stat(fid);
@@ -1066,7 +1084,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
struct p9_fid *fid;
v9ses = v9fs_inode2v9ses(dir);
- if (!v9fs_extended(v9ses)) {
+ if (!v9fs_proto_dotu(v9ses)) {
P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
return -EPERM;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44efad7a..7405f071be6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -177,6 +177,7 @@ source "fs/efs/Kconfig"
source "fs/jffs2/Kconfig"
# UBIFS File system configuration
source "fs/ubifs/Kconfig"
+source "fs/logfs/Kconfig"
source "fs/cramfs/Kconfig"
source "fs/squashfs/Kconfig"
source "fs/freevxfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d04700d9..c3633aa4691 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
obj-$(CONFIG_UFS_FS) += ufs/
obj-$(CONFIG_EFS_FS) += efs/
obj-$(CONFIG_JFFS2_FS) += jffs2/
+obj-$(CONFIG_LOGFS) += logfs/
obj-$(CONFIG_UBIFS_FS) += ubifs/
obj-$(CONFIG_AFFS_FS) += affs/
obj-$(CONFIG_ROMFS_FS) += romfs/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 9cc18775b83..2ff622f6f54 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -121,7 +121,7 @@ struct adfs_discmap {
/* Inode stuff */
struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
-int adfs_write_inode(struct inode *inode,int unused);
+int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
/* map.c */
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 3f57ce4bee5..0f5e3097813 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -9,6 +9,7 @@
*/
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
+#include <linux/writeback.h>
#include "adfs.h"
/*
@@ -360,7 +361,7 @@ out:
* The adfs-specific inode data has already been updated by
* adfs_notify_change()
*/
-int adfs_write_inode(struct inode *inode, int wait)
+int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct super_block *sb = inode->i_sb;
struct object_info obj;
@@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait)
obj.attr = ADFS_I(inode)->attr;
obj.size = inode->i_size;
- ret = adfs_dir_update(sb, &obj, wait);
+ ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
unlock_kernel();
return ret;
}
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 0e40caaba45..861dae68ac1 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -175,7 +175,8 @@ extern void affs_delete_inode(struct inode *inode);
extern void affs_clear_inode(struct inode *inode);
extern struct inode *affs_iget(struct super_block *sb,
unsigned long ino);
-extern int affs_write_inode(struct inode *inode, int);
+extern int affs_write_inode(struct inode *inode,
+ struct writeback_control *wbc);
extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
/* file.c */
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index dc5ef14bdc1..8306d53307e 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -128,7 +128,7 @@ err_range:
/*
* Allocate a block in the given allocation zone.
* Since we have to byte-swap the bitmap on little-endian
- * machines, this is rather expensive. Therefor we will
+ * machines, this is rather expensive. Therefore we will
* preallocate up to 16 blocks from the same word, if
* possible. We are not doing preallocations in the
* header zone, though.
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3c4ec7d864c..c9744d771d9 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -166,7 +166,7 @@ bad_inode:
}
int
-affs_write_inode(struct inode *inode, int unused)
+affs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct super_block *sb = inode->i_sb;
struct buffer_head *bh;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6ece2a13bf7..c54dad4e606 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -733,7 +733,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata);
extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern int afs_write_inode(struct inode *, int);
extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
unsigned long, loff_t);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e1ea1c240b6..14f6431598a 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = {
static const struct super_operations afs_super_ops = {
.statfs = afs_statfs,
.alloc_inode = afs_alloc_inode,
- .write_inode = afs_write_inode,
.destroy_inode = afs_destroy_inode,
.clear_inode = afs_clear_inode,
.put_super = afs_put_super,
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 5e15a21dbf9..3bed54a294d 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -585,27 +585,6 @@ int afs_writepages(struct address_space *mapping,
}
/*
- * write an inode back
- */
-int afs_write_inode(struct inode *inode, int sync)
-{
- struct afs_vnode *vnode = AFS_FS_I(inode);
- int ret;
-
- _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
-
- ret = 0;
- if (sync) {
- ret = filemap_fdatawait(inode->i_mapping);
- if (ret < 0)
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
- }
-
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
* completion of write to server
*/
void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9f0bf13291e..2de009565d8 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -209,6 +209,7 @@ static struct inode *anon_inode_mkinode(void)
inode->i_mode = S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
+ inode->i_flags |= S_PRIVATE;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
return inode;
}
diff --git a/fs/attr.c b/fs/attr.c
index 96d394bdadd..0815e93bb48 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -12,7 +12,6 @@
#include <linux/capability.h>
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
-#include <linux/quotaops.h>
#include <linux/security.h>
/* Taken over from the old code... */
@@ -82,7 +81,7 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset)
if (inode->i_size < offset) {
unsigned long limit;
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ limit = rlimit(RLIMIT_FSIZE);
if (limit != RLIM_INFINITY && offset > limit)
goto out_sig;
if (offset > inode->i_sb->s_maxbytes)
@@ -212,14 +211,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
error = inode->i_op->setattr(dentry, attr);
} else {
error = inode_change_ok(inode, attr);
- if (!error) {
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
- error = vfs_dq_transfer(inode, attr) ?
- -EDQUOT : 0;
- if (!error)
- error = inode_setattr(inode, attr);
- }
+ if (!error)
+ error = inode_setattr(inode, attr);
}
if (ia_valid & ATTR_SIZE)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8f3d9fd8960..f22a7d3dc36 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
+#include <linux/writeback.h>
#include <asm/uaccess.h>
#include "bfs.h"
@@ -98,7 +99,7 @@ error:
return ERR_PTR(-EIO);
}
-static int bfs_write_inode(struct inode *inode, int wait)
+static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct bfs_sb_info *info = BFS_SB(inode->i_sb);
unsigned int ino = (u16)inode->i_ino;
@@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait)
di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
mark_buffer_dirty(bh);
- if (wait) {
+ if (wbc->sync_mode == WB_SYNC_ALL) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh))
err = -EIO;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index fdd39709917..15d80bb35d6 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -24,6 +24,7 @@
#include <linux/binfmts.h>
#include <linux/personality.h>
#include <linux/init.h>
+#include <linux/coredump.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -60,26 +61,6 @@ static int set_brk(unsigned long start, unsigned long end)
}
/*
- * These are the only things you should do on a core-file: use only these
- * macros to write out all the necessary info.
- */
-
-static int dump_write(struct file *file, const void *addr, int nr)
-{
- return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-#define DUMP_WRITE(addr, nr) \
- if (!dump_write(file, (void *)(addr), (nr))) \
- goto end_coredump;
-
-#define DUMP_SEEK(offset) \
-if (file->f_op->llseek) { \
- if (file->f_op->llseek(file,(offset),0) != (offset)) \
- goto end_coredump; \
-} else file->f_pos = (offset)
-
-/*
* Routine writes a core dump image in the current directory.
* Currently only a stub-function.
*
@@ -130,26 +111,31 @@ static int aout_core_dump(struct coredump_params *cprm)
set_fs(KERNEL_DS);
/* struct user */
- DUMP_WRITE(&dump,sizeof(dump));
+ if (!dump_write(file, &dump, sizeof(dump)))
+ goto end_coredump;
/* Now dump all of the user data. Include malloced stuff as well */
- DUMP_SEEK(PAGE_SIZE);
+ if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
+ goto end_coredump;
/* now we start writing out the user space info */
set_fs(USER_DS);
/* Dump the data area */
if (dump.u_dsize != 0) {
dump_start = START_DATA(dump);
dump_size = dump.u_dsize << PAGE_SHIFT;
- DUMP_WRITE(dump_start,dump_size);
+ if (!dump_write(file, dump_start, dump_size))
+ goto end_coredump;
}
/* Now prepare to dump the stack area */
if (dump.u_ssize != 0) {
dump_start = START_STACK(dump);
dump_size = dump.u_ssize << PAGE_SHIFT;
- DUMP_WRITE(dump_start,dump_size);
+ if (!dump_write(file, dump_start, dump_size))
+ goto end_coredump;
}
/* Finally dump the task struct. Not be used by gdb, but could be useful */
set_fs(KERNEL_DS);
- DUMP_WRITE(current,sizeof(*current));
+ if (!dump_write(file, current, sizeof(*current)))
+ goto end_coredump;
end_coredump:
set_fs(fs);
return has_dumped;
@@ -247,7 +233,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
* size limits imposed on them by creating programs with large
* arrays in the data or bss.
*/
- rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+ rlim = rlimit(RLIMIT_DATA);
if (rlim >= RLIM_INFINITY)
rlim = ~0;
if (ex.a_data + ex.a_bss > rlim)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fd5b2ea5d29..535e763ab1a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/utsname.h>
+#include <linux/coredump.h>
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -1085,36 +1086,6 @@ out:
* Modelled on fs/exec.c:aout_core_dump()
* Jeremy Fitzhardinge <jeremy@sw.oz.au>
*/
-/*
- * These are the only things you should do on a core-file: use only these
- * functions to write out all the necessary info.
- */
-static int dump_write(struct file *file, const void *addr, int nr)
-{
- return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-static int dump_seek(struct file *file, loff_t off)
-{
- if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
- if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
- return 0;
- } else {
- char *buf = (char *)get_zeroed_page(GFP_KERNEL);
- if (!buf)
- return 0;
- while (off > 0) {
- unsigned long n = off;
- if (n > PAGE_SIZE)
- n = PAGE_SIZE;
- if (!dump_write(file, buf, n))
- return 0;
- off -= n;
- }
- free_page((unsigned long)buf);
- }
- return 1;
-}
/*
* Decide what to dump of a segment, part, all or none.
@@ -1249,11 +1220,6 @@ static int writenote(struct memelfnote *men, struct file *file,
}
#undef DUMP_WRITE
-#define DUMP_WRITE(addr, nr) \
- if ((size += (nr)) > cprm->limit || \
- !dump_write(cprm->file, (addr), (nr))) \
- goto end_coredump;
-
static void fill_elf_header(struct elfhdr *elf, int segs,
u16 machine, u32 flags, u8 osabi)
{
@@ -1872,6 +1838,34 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
return gate_vma;
}
+static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
+ elf_addr_t e_shoff, int segs)
+{
+ elf->e_shoff = e_shoff;
+ elf->e_shentsize = sizeof(*shdr4extnum);
+ elf->e_shnum = 1;
+ elf->e_shstrndx = SHN_UNDEF;
+
+ memset(shdr4extnum, 0, sizeof(*shdr4extnum));
+
+ shdr4extnum->sh_type = SHT_NULL;
+ shdr4extnum->sh_size = elf->e_shnum;
+ shdr4extnum->sh_link = elf->e_shstrndx;
+ shdr4extnum->sh_info = segs;
+}
+
+static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma,
+ unsigned long mm_flags)
+{
+ struct vm_area_struct *vma;
+ size_t size = 0;
+
+ for (vma = first_vma(current, gate_vma); vma != NULL;
+ vma = next_vma(vma, gate_vma))
+ size += vma_dump_size(vma, mm_flags);
+ return size;
+}
+
/*
* Actual dumper
*
@@ -1888,8 +1882,11 @@ static int elf_core_dump(struct coredump_params *cprm)
struct vm_area_struct *vma, *gate_vma;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff, foffset;
- unsigned long mm_flags;
struct elf_note_info info;
+ struct elf_phdr *phdr4note = NULL;
+ struct elf_shdr *shdr4extnum = NULL;
+ Elf_Half e_phnum;
+ elf_addr_t e_shoff;
/*
* We no longer stop all VM operations.
@@ -1912,20 +1909,25 @@ static int elf_core_dump(struct coredump_params *cprm)
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
segs = current->mm->map_count;
-#ifdef ELF_CORE_EXTRA_PHDRS
- segs += ELF_CORE_EXTRA_PHDRS;
-#endif
+ segs += elf_core_extra_phdrs();
gate_vma = get_gate_vma(current);
if (gate_vma != NULL)
segs++;
+ /* for notes section */
+ segs++;
+
+ /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
+ * this, kernel supports extended numbering. Have a look at
+ * include/linux/elf.h for further information. */
+ e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
+
/*
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(elf, segs + 1, /* including notes section */
- &info, cprm->signr, cprm->regs))
+ if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs))
goto cleanup;
has_dumped = 1;
@@ -1934,31 +1936,47 @@ static int elf_core_dump(struct coredump_params *cprm)
fs = get_fs();
set_fs(KERNEL_DS);
- DUMP_WRITE(elf, sizeof(*elf));
offset += sizeof(*elf); /* Elf header */
- offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */
+ offset += segs * sizeof(struct elf_phdr); /* Program headers */
foffset = offset;
/* Write notes phdr entry */
{
- struct elf_phdr phdr;
size_t sz = get_note_info_size(&info);
sz += elf_coredump_extra_notes_size();
- fill_elf_note_phdr(&phdr, sz, offset);
+ phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
+ if (!phdr4note)
+ goto end_coredump;
+
+ fill_elf_note_phdr(phdr4note, sz, offset);
offset += sz;
- DUMP_WRITE(&phdr, sizeof(phdr));
}
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- /*
- * We must use the same mm->flags while dumping core to avoid
- * inconsistency between the program headers and bodies, otherwise an
- * unusable core file can be generated.
- */
- mm_flags = current->mm->flags;
+ offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags);
+ offset += elf_core_extra_data_size();
+ e_shoff = offset;
+
+ if (e_phnum == PN_XNUM) {
+ shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
+ if (!shdr4extnum)
+ goto end_coredump;
+ fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
+ }
+
+ offset = dataoff;
+
+ size += sizeof(*elf);
+ if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+ goto end_coredump;
+
+ size += sizeof(*phdr4note);
+ if (size > cprm->limit
+ || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+ goto end_coredump;
/* Write program headers for segments dump */
for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -1969,7 +1987,7 @@ static int elf_core_dump(struct coredump_params *cprm)
phdr.p_offset = offset;
phdr.p_vaddr = vma->vm_start;
phdr.p_paddr = 0;
- phdr.p_filesz = vma_dump_size(vma, mm_flags);
+ phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags);
phdr.p_memsz = vma->vm_end - vma->vm_start;
offset += phdr.p_filesz;
phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -1979,12 +1997,14 @@ static int elf_core_dump(struct coredump_params *cprm)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
- DUMP_WRITE(&phdr, sizeof(phdr));
+ size += sizeof(phdr);
+ if (size > cprm->limit
+ || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+ goto end_coredump;
}
-#ifdef ELF_CORE_WRITE_EXTRA_PHDRS
- ELF_CORE_WRITE_EXTRA_PHDRS;
-#endif
+ if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+ goto end_coredump;
/* write out the notes section */
if (!write_note_info(&info, cprm->file, &foffset))
@@ -2002,7 +2022,7 @@ static int elf_core_dump(struct coredump_params *cprm)
unsigned long addr;
unsigned long end;
- end = vma->vm_start + vma_dump_size(vma, mm_flags);
+ end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags);
for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
struct page *page;
@@ -2023,15 +2043,24 @@ static int elf_core_dump(struct coredump_params *cprm)
}
}
-#ifdef ELF_CORE_WRITE_EXTRA_DATA
- ELF_CORE_WRITE_EXTRA_DATA;
-#endif
+ if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+ goto end_coredump;
+
+ if (e_phnum == PN_XNUM) {
+ size += sizeof(*shdr4extnum);
+ if (size > cprm->limit
+ || !dump_write(cprm->file, shdr4extnum,
+ sizeof(*shdr4extnum)))
+ goto end_coredump;
+ }
end_coredump:
set_fs(fs);
cleanup:
free_note_info(&info);
+ kfree(shdr4extnum);
+ kfree(phdr4note);
kfree(elf);
out:
return has_dumped;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 18d77297ccc..2c32d00a669 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -34,6 +34,7 @@
#include <linux/elf.h>
#include <linux/elf-fdpic.h>
#include <linux/elfcore.h>
+#include <linux/coredump.h>
#include <asm/uaccess.h>
#include <asm/param.h>
@@ -1216,26 +1217,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
#ifdef CONFIG_ELF_CORE
/*
- * These are the only things you should do on a core-file: use only these
- * functions to write out all the necessary info.
- */
-static int dump_write(struct file *file, const void *addr, int nr)
-{
- return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-static int dump_seek(struct file *file, loff_t off)
-{
- if (file->f_op->llseek) {
- if (file->f_op->llseek(file, off, SEEK_SET) != off)
- return 0;
- } else {
- file->f_pos = off;
- }
- return 1;
-}
-
-/*
* Decide whether a segment is worth dumping; default is yes to be
* sure (missing info is worse than too much; etc).
* Personally I'd include everything, and use the coredump limit...
@@ -1313,35 +1294,35 @@ static int notesize(struct memelfnote *en)
/* #define DEBUG */
-#define DUMP_WRITE(addr, nr) \
- do { if (!dump_write(file, (addr), (nr))) return 0; } while(0)
-#define DUMP_SEEK(off) \
- do { if (!dump_seek(file, (off))) return 0; } while(0)
+#define DUMP_WRITE(addr, nr, foffset) \
+ do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
-static int writenote(struct memelfnote *men, struct file *file)
+static int alignfile(struct file *file, loff_t *foffset)
{
- struct elf_note en;
+ static const char buf[4] = { 0, };
+ DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
+ return 1;
+}
+static int writenote(struct memelfnote *men, struct file *file,
+ loff_t *foffset)
+{
+ struct elf_note en;
en.n_namesz = strlen(men->name) + 1;
en.n_descsz = men->datasz;
en.n_type = men->type;
- DUMP_WRITE(&en, sizeof(en));
- DUMP_WRITE(men->name, en.n_namesz);
- /* XXX - cast from long long to long to avoid need for libgcc.a */
- DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */
- DUMP_WRITE(men->data, men->datasz);
- DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */
+ DUMP_WRITE(&en, sizeof(en), foffset);
+ DUMP_WRITE(men->name, en.n_namesz, foffset);
+ if (!alignfile(file, foffset))
+ return 0;
+ DUMP_WRITE(men->data, men->datasz, foffset);
+ if (!alignfile(file, foffset))
+ return 0;
return 1;
}
#undef DUMP_WRITE
-#undef DUMP_SEEK
-
-#define DUMP_WRITE(addr, nr) \
- if ((size += (nr)) > cprm->limit || \
- !dump_write(cprm->file, (addr), (nr))) \
- goto end_coredump;
static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
{
@@ -1393,7 +1374,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
/*
* fill up all the fields in prstatus from the given task struct, except
- * registers which need to be filled up seperately.
+ * registers which need to be filled up separately.
*/
static void fill_prstatus(struct elf_prstatus *prstatus,
struct task_struct *p, long signr)
@@ -1524,6 +1505,22 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
return sz;
}
+static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
+ elf_addr_t e_shoff, int segs)
+{
+ elf->e_shoff = e_shoff;
+ elf->e_shentsize = sizeof(*shdr4extnum);
+ elf->e_shnum = 1;
+ elf->e_shstrndx = SHN_UNDEF;
+
+ memset(shdr4extnum, 0, sizeof(*shdr4extnum));
+
+ shdr4extnum->sh_type = SHT_NULL;
+ shdr4extnum->sh_size = elf->e_shnum;
+ shdr4extnum->sh_link = elf->e_shstrndx;
+ shdr4extnum->sh_info = segs;
+}
+
/*
* dump the segments for an MMU process
*/
@@ -1552,7 +1549,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
err = -EIO;
kunmap(page);
page_cache_release(page);
- } else if (!dump_seek(file, file->f_pos + PAGE_SIZE))
+ } else if (!dump_seek(file, PAGE_SIZE))
err = -EFBIG;
if (err)
goto out;
@@ -1588,6 +1585,17 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
}
#endif
+static size_t elf_core_vma_data_size(unsigned long mm_flags)
+{
+ struct vm_area_struct *vma;
+ size_t size = 0;
+
+ for (vma = current->mm->mmap; vma; vma->vm_next)
+ if (maydump(vma, mm_flags))
+ size += vma->vm_end - vma->vm_start;
+ return size;
+}
+
/*
* Actual dumper
*
@@ -1605,7 +1613,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
int i;
struct vm_area_struct *vma;
struct elfhdr *elf = NULL;
- loff_t offset = 0, dataoff;
+ loff_t offset = 0, dataoff, foffset;
int numnote;
struct memelfnote *notes = NULL;
struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */
@@ -1618,7 +1626,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
#endif
int thread_status_size = 0;
elf_addr_t *auxv;
- unsigned long mm_flags;
+ struct elf_phdr *phdr4note = NULL;
+ struct elf_shdr *shdr4extnum = NULL;
+ Elf_Half e_phnum;
+ elf_addr_t e_shoff;
/*
* We no longer stop all VM operations.
@@ -1683,12 +1694,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
segs = current->mm->map_count;
-#ifdef ELF_CORE_EXTRA_PHDRS
- segs += ELF_CORE_EXTRA_PHDRS;
-#endif
+ segs += elf_core_extra_phdrs();
+
+ /* for notes section */
+ segs++;
+
+ /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
+ * this, kernel supports extended numbering. Have a look at
+ * include/linux/elf.h for further information. */
+ e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
/* Set up header */
- fill_elf_fdpic_header(elf, segs + 1); /* including notes section */
+ fill_elf_fdpic_header(elf, e_phnum);
has_dumped = 1;
current->flags |= PF_DUMPCORE;
@@ -1727,13 +1744,12 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
fs = get_fs();
set_fs(KERNEL_DS);
- DUMP_WRITE(elf, sizeof(*elf));
offset += sizeof(*elf); /* Elf header */
- offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */
+ offset += segs * sizeof(struct elf_phdr); /* Program headers */
+ foffset = offset;
/* Write notes phdr entry */
{
- struct elf_phdr phdr;
int sz = 0;
for (i = 0; i < numnote; i++)
@@ -1741,20 +1757,38 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
sz += thread_status_size;
- fill_elf_note_phdr(&phdr, sz, offset);
+ phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
+ if (!phdr4note)
+ goto end_coredump;
+
+ fill_elf_note_phdr(phdr4note, sz, offset);
offset += sz;
- DUMP_WRITE(&phdr, sizeof(phdr));
}
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- /*
- * We must use the same mm->flags while dumping core to avoid
- * inconsistency between the program headers and bodies, otherwise an
- * unusable core file can be generated.
- */
- mm_flags = current->mm->flags;
+ offset += elf_core_vma_data_size(cprm->mm_flags);
+ offset += elf_core_extra_data_size();
+ e_shoff = offset;
+
+ if (e_phnum == PN_XNUM) {
+ shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
+ if (!shdr4extnum)
+ goto end_coredump;
+ fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
+ }
+
+ offset = dataoff;
+
+ size += sizeof(*elf);
+ if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+ goto end_coredump;
+
+ size += sizeof(*phdr4note);
+ if (size > cprm->limit
+ || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+ goto end_coredump;
/* write program headers for segments dump */
for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
@@ -1767,7 +1801,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
phdr.p_offset = offset;
phdr.p_vaddr = vma->vm_start;
phdr.p_paddr = 0;
- phdr.p_filesz = maydump(vma, mm_flags) ? sz : 0;
+ phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0;
phdr.p_memsz = sz;
offset += phdr.p_filesz;
phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -1777,16 +1811,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
- DUMP_WRITE(&phdr, sizeof(phdr));
+ size += sizeof(phdr);
+ if (size > cprm->limit
+ || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+ goto end_coredump;
}
-#ifdef ELF_CORE_WRITE_EXTRA_PHDRS
- ELF_CORE_WRITE_EXTRA_PHDRS;
-#endif
+ if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+ goto end_coredump;
/* write out the notes section */
for (i = 0; i < numnote; i++)
- if (!writenote(notes + i, cprm->file))
+ if (!writenote(notes + i, cprm->file, &foffset))
goto end_coredump;
/* write out the thread status notes section */
@@ -1795,20 +1831,27 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
list_entry(t, struct elf_thread_status, list);
for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], cprm->file))
+ if (!writenote(&tmp->notes[i], cprm->file, &foffset))
goto end_coredump;
}
- if (!dump_seek(cprm->file, dataoff))
+ if (!dump_seek(cprm->file, dataoff - foffset))
goto end_coredump;
if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit,
- mm_flags) < 0)
+ cprm->mm_flags) < 0)
goto end_coredump;
-#ifdef ELF_CORE_WRITE_EXTRA_DATA
- ELF_CORE_WRITE_EXTRA_DATA;
-#endif
+ if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+ goto end_coredump;
+
+ if (e_phnum == PN_XNUM) {
+ size += sizeof(*shdr4extnum);
+ if (size > cprm->limit
+ || !dump_write(cprm->file, shdr4extnum,
+ sizeof(*shdr4extnum)))
+ goto end_coredump;
+ }
if (cprm->file->f_pos != offset) {
/* Sanity check */
@@ -1826,7 +1869,7 @@ cleanup:
list_del(tmp);
kfree(list_entry(tmp, struct elf_thread_status, list));
}
-
+ kfree(phdr4note);
kfree(elf);
kfree(prstatus);
kfree(psinfo);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 42c6b4a5444..e0e769bdca5 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -501,7 +501,7 @@ static int load_flat_file(struct linux_binprm * bprm,
* size limits imposed on them by creating programs with large
* arrays in the data or bss.
*/
- rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+ rlim = rlimit(RLIMIT_DATA);
if (rlim >= RLIM_INFINITY)
rlim = ~0;
if (data_len + bss_len > rlim) {
diff --git a/fs/bio.c b/fs/bio.c
index dc17afd672e..e1f922184b4 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -264,13 +264,12 @@ EXPORT_SYMBOL(bio_init);
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_ mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
- * @bs: the bio_set to allocate from. If %NULL, just use kmalloc
+ * @bs: the bio_set to allocate from.
*
* Description:
- * bio_alloc_bioset will first try its own mempool to satisfy the allocation.
+ * bio_alloc_bioset will try its own mempool to satisfy the allocation.
* If %__GFP_WAIT is set then we will block on the internal pool waiting
- * for a &struct bio to become free. If a %NULL @bs is passed in, we will
- * fall back to just using @kmalloc to allocate the required memory.
+ * for a &struct bio to become free.
*
* Note that the caller must set ->bi_destructor on successful return
* of a bio, to do the appropriate freeing of the bio once the reference
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d91..7a4dee19983 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -153,6 +153,11 @@ struct btrfs_inode {
unsigned ordered_data_close:1;
unsigned dummy_inode:1;
+ /*
+ * always compress this one file
+ */
+ unsigned force_compress:1;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b5..28b92a7218a 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -478,7 +478,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
goto next;
}
- page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+ page = alloc_page(mapping_gfp_mask(mapping) & ~__GFP_FS);
if (!page)
break;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2aa8ec6a098..0af2e386857 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -373,11 +373,13 @@ struct btrfs_super_block {
* ones specified below then we will fail to mount
*/
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP \
- BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
/*
* A leaf is full of items. offset and size tell us where to find
@@ -1182,7 +1184,6 @@ struct btrfs_root {
#define BTRFS_INODE_NOATIME (1 << 9)
#define BTRFS_INODE_DIRSYNC (1 << 10)
-
/* some macros to generate set/get funcs for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple
* one for u8:
@@ -1842,7 +1843,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
compat_flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
- compat_flags, 64);
+ compat_ro_flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
incompat_flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
@@ -2310,7 +2311,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state);
int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2326,7 +2328,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_delete_inode(struct inode *inode);
void btrfs_put_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, int wait);
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
void btrfs_dirty_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
@@ -2335,7 +2337,7 @@ int btrfs_init_cachep(void);
void btrfs_destroy_cachep(void);
long btrfs_ioctl_trans_end(struct file *file);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root);
+ struct btrfs_root *root, int *was_new);
int btrfs_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to);
struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2386,7 +2388,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
-u64 btrfs_parse_size(char *str);
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2b59201b955..11d0ad30e20 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -263,13 +263,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
static int verify_parent_transid(struct extent_io_tree *io_tree,
struct extent_buffer *eb, u64 parent_transid)
{
+ struct extent_state *cached_state = NULL;
int ret;
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
- lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
- if (extent_buffer_uptodate(io_tree, eb) &&
+ lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
+ 0, &cached_state, GFP_NOFS);
+ if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
@@ -282,10 +284,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
(unsigned long long)btrfs_header_generation(eb));
}
ret = 1;
- clear_extent_buffer_uptodate(io_tree, eb);
+ clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
out:
- unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
- GFP_NOFS);
+ unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state, GFP_NOFS);
return ret;
}
@@ -901,7 +903,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->highest_objectid = 0;
root->name = NULL;
root->in_sysfs = 0;
- root->inode_tree.rb_node = NULL;
+ root->inode_tree = RB_ROOT;
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->orphan_list);
@@ -1673,7 +1675,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
insert_inode_hash(fs_info->btree_inode);
spin_lock_init(&fs_info->block_group_cache_lock);
- fs_info->block_group_cache_tree.rb_node = NULL;
+ fs_info->block_group_cache_tree = RB_ROOT;
extent_io_tree_init(&fs_info->freed_extents[0],
fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2497,7 +2499,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
int ret;
struct inode *btree_inode = buf->first_page->mapping->host;
- ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+ ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
+ NULL);
if (!ret)
return ret;
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8..951ef09b82f 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
- inode = btrfs_iget(sb, &key, root);
+ inode = btrfs_iget(sb, &key, root, NULL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail;
@@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+ dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
if (!IS_ERR(dentry))
dentry->d_op = &btrfs_dentry_operations;
return dentry;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 559f72489b3..1727b26fb19 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6561,6 +6561,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
struct btrfs_key key;
struct inode *inode = NULL;
struct btrfs_file_extent_item *fi;
+ struct extent_state *cached_state = NULL;
u64 num_bytes;
u64 skip_objectid = 0;
u32 nritems;
@@ -6589,12 +6590,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
}
num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
- lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
- key.offset + num_bytes - 1, GFP_NOFS);
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
+ key.offset + num_bytes - 1, 0, &cached_state,
+ GFP_NOFS);
btrfs_drop_extent_cache(inode, key.offset,
key.offset + num_bytes - 1, 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
- key.offset + num_bytes - 1, GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
+ key.offset + num_bytes - 1, &cached_state,
+ GFP_NOFS);
cond_resched();
}
iput(inode);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b177ed31961..c99121ac5d6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -104,8 +104,8 @@ void extent_io_exit(void)
void extent_io_tree_init(struct extent_io_tree *tree,
struct address_space *mapping, gfp_t mask)
{
- tree->state.rb_node = NULL;
- tree->buffer.rb_node = NULL;
+ tree->state = RB_ROOT;
+ tree->buffer = RB_ROOT;
tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
@@ -513,7 +513,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_end;
int err;
int set = 0;
+ int clear = 0;
+ if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+ clear = 1;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
@@ -524,14 +527,20 @@ again:
spin_lock(&tree->lock);
if (cached_state) {
cached = *cached_state;
- *cached_state = NULL;
- cached_state = NULL;
+
+ if (clear) {
+ *cached_state = NULL;
+ cached_state = NULL;
+ }
+
if (cached && cached->tree && cached->start == start) {
- atomic_dec(&cached->refs);
+ if (clear)
+ atomic_dec(&cached->refs);
state = cached;
goto hit_next;
}
- free_extent_state(cached);
+ if (clear)
+ free_extent_state(cached);
}
/*
* this search will find the extents that end after
@@ -946,11 +955,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
}
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+ struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
- 0, NULL, NULL, mask);
+ 0, NULL, cached_state, mask);
}
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +993,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
}
static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
+ u64 end, struct extent_state **cached_state,
+ gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- NULL, mask);
+ cached_state, mask);
}
int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1171,7 +1181,8 @@ out:
* 1 is returned if we find something, 0 if nothing was in the tree
*/
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
- u64 *start, u64 *end, u64 max_bytes)
+ u64 *start, u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
{
struct rb_node *node;
struct extent_state *state;
@@ -1203,8 +1214,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
*end = state->end;
goto out;
}
- if (!found)
+ if (!found) {
*start = state->start;
+ *cached_state = state;
+ atomic_inc(&state->refs);
+ }
found++;
*end = state->end;
cur_start = state->end + 1;
@@ -1336,10 +1350,11 @@ again:
delalloc_start = *start;
delalloc_end = 0;
found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
- max_bytes);
+ max_bytes, &cached_state);
if (!found || delalloc_end <= *start) {
*start = delalloc_start;
*end = delalloc_end;
+ free_extent_state(cached_state);
return found;
}
@@ -1722,7 +1737,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
}
if (!uptodate) {
- clear_extent_uptodate(tree, start, end, GFP_NOFS);
+ clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
ClearPageUptodate(page);
SetPageError(page);
}
@@ -1750,7 +1765,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
static void end_bio_extent_readpage(struct bio *bio, int err)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bvec = bio->bi_io_vec;
struct extent_io_tree *tree;
u64 start;
u64 end;
@@ -1773,7 +1789,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
else
whole_page = 0;
- if (--bvec >= bio->bi_io_vec)
+ if (++bvec <= bvec_end)
prefetchw(&bvec->bv_page->flags);
if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1834,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
}
check_page_locked(tree, page);
}
- } while (bvec >= bio->bi_io_vec);
+ } while (bvec <= bvec_end);
bio_put(bio);
}
@@ -2704,6 +2720,7 @@ int extent_readpages(struct extent_io_tree *tree,
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset)
{
+ struct extent_state *cached_state = NULL;
u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
u64 end = start + PAGE_CACHE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2712,12 +2729,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent(tree, start, end, GFP_NOFS);
+ lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING,
- 1, 1, NULL, GFP_NOFS);
+ 1, 1, &cached_state, GFP_NOFS);
return 0;
}
@@ -2920,16 +2937,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
get_extent_t *get_extent)
{
struct inode *inode = mapping->host;
+ struct extent_state *cached_state = NULL;
u64 start = iblock << inode->i_blkbits;
sector_t sector = 0;
size_t blksize = (1 << inode->i_blkbits);
struct extent_map *em;
- lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
- GFP_NOFS);
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+ 0, &cached_state, GFP_NOFS);
em = get_extent(inode, NULL, 0, start, blksize, 0);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
- GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
+ start + blksize - 1, &cached_state, GFP_NOFS);
if (!em || IS_ERR(em))
return 0;
@@ -2951,6 +2969,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u32 flags = 0;
u64 disko = 0;
struct extent_map *em = NULL;
+ struct extent_state *cached_state = NULL;
int end = 0;
u64 em_start = 0, em_len = 0;
unsigned long emflags;
@@ -2959,8 +2978,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (len == 0)
return -EINVAL;
- lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
- GFP_NOFS);
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+ &cached_state, GFP_NOFS);
em = get_extent(inode, NULL, 0, off, max - off, 0);
if (!em)
goto out;
@@ -3023,8 +3042,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
out_free:
free_extent_map(em);
out:
- unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
- GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+ &cached_state, GFP_NOFS);
return ret;
}
@@ -3264,7 +3283,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
}
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+ struct extent_buffer *eb,
+ struct extent_state **cached_state)
{
unsigned long i;
struct page *page;
@@ -3274,7 +3294,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- GFP_NOFS);
+ cached_state, GFP_NOFS);
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if (page)
@@ -3334,7 +3354,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
}
int extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+ struct extent_buffer *eb,
+ struct extent_state *cached_state)
{
int ret = 0;
unsigned long num_pages;
@@ -3346,7 +3367,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
return 1;
ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, NULL);
+ EXTENT_UPTODATE, 1, cached_state);
if (ret)
return ret;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2..bbab4813646 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -163,6 +163,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, struct extent_state **cached, gfp_t mask);
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached, gfp_t mask);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -196,7 +198,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +283,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
int set_extent_buffer_uptodate(struct extent_io_tree *tree,
struct extent_buffer *eb);
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb);
+ struct extent_buffer *eb,
+ struct extent_state **cached_state);
int extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb);
+ struct extent_buffer *eb,
+ struct extent_state *cached_state);
int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **token, char **map,
unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 428fcac45f9..28d87ba60ce 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -35,7 +35,7 @@ void extent_map_exit(void)
*/
void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
{
- tree->map.rb_node = NULL;
+ tree->map = RB_ROOT;
rwlock_init(&tree->lock);
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6ed434ac037..ee3323c7fc1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -123,7 +123,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
end_of_last_block = start_pos + num_bytes - 1;
- err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+ NULL);
if (err)
return err;
@@ -753,6 +754,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
loff_t pos, unsigned long first_index,
unsigned long last_index, size_t write_bytes)
{
+ struct extent_state *cached_state = NULL;
int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT;
struct inode *inode = fdentry(file)->d_inode;
@@ -781,16 +783,18 @@ again:
}
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
- lock_extent(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1, GFP_NOFS);
+ lock_extent_bits(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, 0, &cached_state,
+ GFP_NOFS);
ordered = btrfs_lookup_first_ordered_extent(inode,
last_pos - 1);
if (ordered &&
ordered->file_offset + ordered->len > start_pos &&
ordered->file_offset < last_pos) {
btrfs_put_ordered_extent(ordered);
- unlock_extent(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1, GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1,
+ &cached_state, GFP_NOFS);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
page_cache_release(pages[i]);
@@ -802,12 +806,13 @@ again:
if (ordered)
btrfs_put_ordered_extent(ordered);
- clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING,
+ EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
GFP_NOFS);
- unlock_extent(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1, GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, &cached_state,
+ GFP_NOFS);
}
for (i = 0; i < num_pages; i++) {
clear_page_dirty_for_io(pages[i]);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb2849f0325..dd831ed31ee 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -870,7 +870,7 @@ __btrfs_return_cluster_to_free_space(
tree_insert_offset(&block_group->free_space_offset,
entry->offset, &entry->offset_index, 0);
}
- cluster->root.rb_node = NULL;
+ cluster->root = RB_ROOT;
out:
spin_unlock(&cluster->lock);
@@ -1355,7 +1355,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
spin_lock_init(&cluster->lock);
spin_lock_init(&cluster->refill_lock);
- cluster->root.rb_node = NULL;
+ cluster->root = RB_ROOT;
cluster->max_size = 0;
cluster->points_to_bitmap = false;
INIT_LIST_HEAD(&cluster->block_group_list);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4deb280f896..02bb099845f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -379,7 +379,8 @@ again:
* change at any time if we discover bad compression ratios.
*/
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
- btrfs_test_opt(root, COMPRESS)) {
+ (btrfs_test_opt(root, COMPRESS) ||
+ (BTRFS_I(inode)->force_compress))) {
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
@@ -483,8 +484,10 @@ again:
nr_pages_ret = 0;
/* flag the file so we don't compress in the future */
- if (!btrfs_test_opt(root, FORCE_COMPRESS))
+ if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+ !(BTRFS_I(inode)->force_compress)) {
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ }
}
if (will_compress) {
*num_added += 1;
@@ -570,8 +573,8 @@ retry:
unsigned long nr_written = 0;
lock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1, GFP_NOFS);
+ async_extent->start +
+ async_extent->ram_size - 1, GFP_NOFS);
/* allocate blocks */
ret = cow_file_range(inode, async_cow->locked_page,
@@ -1211,7 +1214,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- else if (!btrfs_test_opt(root, COMPRESS))
+ else if (!btrfs_test_opt(root, COMPRESS) &&
+ !(BTRFS_I(inode)->force_compress))
ret = cow_file_range(inode, locked_page, start, end,
page_started, nr_written, 1);
else
@@ -1508,12 +1512,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state)
{
if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
WARN_ON(1);
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
- GFP_NOFS);
+ cached_state, GFP_NOFS);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -1526,6 +1531,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
struct btrfs_writepage_fixup *fixup;
struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
struct page *page;
struct inode *inode;
u64 page_start;
@@ -1544,7 +1550,8 @@ again:
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
- lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+ &cached_state, GFP_NOFS);
/* already ordered? We're done */
if (PagePrivate2(page))
@@ -1552,17 +1559,18 @@ again:
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
- unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
- page_end, GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
+ page_end, &cached_state, GFP_NOFS);
unlock_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
goto again;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
ClearPageChecked(page);
out:
- unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
out_page:
unlock_page(page);
page_cache_release(page);
@@ -1691,14 +1699,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
struct btrfs_trans_handle *trans;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_state *cached_state = NULL;
int compressed = 0;
int ret;
- ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+ ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+ end - start + 1);
if (!ret)
return 0;
-
- ordered_extent = btrfs_lookup_ordered_extent(inode, start);
BUG_ON(!ordered_extent);
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
@@ -1713,9 +1721,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
goto out;
}
- lock_extent(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset + ordered_extent->len - 1,
- GFP_NOFS);
+ lock_extent_bits(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ 0, &cached_state, GFP_NOFS);
trans = btrfs_join_transaction(root, 1);
@@ -1742,9 +1750,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->len);
BUG_ON(ret);
}
- unlock_extent(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset + ordered_extent->len - 1,
- GFP_NOFS);
+ unlock_extent_cached(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
@@ -2153,7 +2162,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(root->fs_info->sb, &found_key, root);
+ inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
if (IS_ERR(inode))
break;
@@ -3081,6 +3090,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
char *kaddr;
u32 blocksize = root->sectorsize;
pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3127,12 +3137,14 @@ again:
}
wait_on_page_writeback(page);
- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+ GFP_NOFS);
set_page_extent_mapped(page);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_extent_cached(io_tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3140,13 +3152,15 @@ again:
goto again;
}
- clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
- GFP_NOFS);
+ 0, 0, &cached_state, GFP_NOFS);
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+ &cached_state);
if (ret) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_extent_cached(io_tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
goto out_unlock;
}
@@ -3159,7 +3173,8 @@ again:
}
ClearPageChecked(page);
set_page_dirty(page);
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+ GFP_NOFS);
out_unlock:
if (ret)
@@ -3177,6 +3192,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
+ struct extent_state *cached_state = NULL;
u64 mask = root->sectorsize - 1;
u64 hole_start = (inode->i_size + mask) & ~mask;
u64 block_end = (size + mask) & ~mask;
@@ -3192,11 +3208,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
struct btrfs_ordered_extent *ordered;
btrfs_wait_ordered_range(inode, hole_start,
block_end - hole_start);
- lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+ lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+ &cached_state, GFP_NOFS);
ordered = btrfs_lookup_ordered_extent(inode, hole_start);
if (!ordered)
break;
- unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+ unlock_extent_cached(io_tree, hole_start, block_end - 1,
+ &cached_state, GFP_NOFS);
btrfs_put_ordered_extent(ordered);
}
@@ -3241,7 +3259,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
break;
}
- unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+ unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
+ GFP_NOFS);
return err;
}
@@ -3639,6 +3658,7 @@ static noinline void init_btrfs_i(struct inode *inode)
bi->index_cnt = (u64)-1;
bi->last_unlink_trans = 0;
bi->ordered_data_close = 0;
+ bi->force_compress = 0;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
extent_io_tree_init(&BTRFS_I(inode)->io_tree,
inode->i_mapping, GFP_NOFS);
@@ -3687,7 +3707,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
* Returns in *is_new if the inode was read from disk
*/
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root)
+ struct btrfs_root *root, int *new)
{
struct inode *inode;
@@ -3702,6 +3722,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
inode_tree_add(inode);
unlock_new_inode(inode);
+ if (new)
+ *new = 1;
}
return inode;
@@ -3754,7 +3776,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return NULL;
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, &location, root);
+ inode = btrfs_iget(dir->i_sb, &location, root, NULL);
return inode;
}
@@ -3769,7 +3791,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
else
inode = new_simple_dir(dir->i_sb, &location, sub_root);
} else {
- inode = btrfs_iget(dir->i_sb, &location, sub_root);
+ inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
}
srcu_read_unlock(&root->fs_info->subvol_srcu, index);
@@ -3968,7 +3990,7 @@ err:
return ret;
}
-int btrfs_write_inode(struct inode *inode, int wait)
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -3977,7 +3999,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
if (root->fs_info->btree_inode == inode)
return 0;
- if (wait) {
+ if (wbc->sync_mode == WB_SYNC_ALL) {
trans = btrfs_join_transaction(root, 1);
btrfs_set_trans_block_group(trans, inode);
ret = btrfs_commit_transaction(trans, root);
@@ -4501,7 +4523,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
if (err) {
err = -ENOSPC;
- goto out_unlock;
+ goto out_fail;
}
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
@@ -4979,6 +5001,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
{
struct extent_io_tree *tree;
struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -4997,7 +5020,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
btrfs_releasepage(page, GFP_NOFS);
return;
}
- lock_extent(tree, page_start, page_end, GFP_NOFS);
+ lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+ GFP_NOFS);
ordered = btrfs_lookup_ordered_extent(page->mapping->host,
page_offset(page));
if (ordered) {
@@ -5008,7 +5032,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
clear_extent_bit(tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
- NULL, GFP_NOFS);
+ &cached_state, GFP_NOFS);
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
@@ -5018,11 +5042,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
page_start, page_end);
}
btrfs_put_ordered_extent(ordered);
- lock_extent(tree, page_start, page_end, GFP_NOFS);
+ cached_state = NULL;
+ lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+ GFP_NOFS);
}
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
+ EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
@@ -5055,6 +5081,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
char *kaddr;
unsigned long zero_start;
loff_t size;
@@ -5093,7 +5120,8 @@ again:
}
wait_on_page_writeback(page);
- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+ GFP_NOFS);
set_page_extent_mapped(page);
/*
@@ -5102,7 +5130,8 @@ again:
*/
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_extent_cached(io_tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
unlock_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
@@ -5116,13 +5145,15 @@ again:
* is probably a better way to do this, but for now keep consistent with
* prepare_pages in the normal write path.
*/
- clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
- GFP_NOFS);
+ 0, 0, &cached_state, GFP_NOFS);
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+ &cached_state);
if (ret) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_extent_cached(io_tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
ret = VM_FAULT_SIGBUS;
btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
goto out_unlock;
@@ -5148,7 +5179,7 @@ again:
BTRFS_I(inode)->last_trans = root->fs_info->generation;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
out_unlock:
btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5827,6 +5858,7 @@ stop_trans:
static long btrfs_fallocate(struct inode *inode, int mode,
loff_t offset, loff_t len)
{
+ struct extent_state *cached_state = NULL;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
@@ -5865,16 +5897,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
/* the extent lock is ordered inside the running
* transaction
*/
- lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- GFP_NOFS);
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+ locked_end, 0, &cached_state, GFP_NOFS);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
ordered->file_offset + ordered->len > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
- unlock_extent(&BTRFS_I(inode)->io_tree,
- alloc_start, locked_end, GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
/*
* we can't wait on the range with the transaction
* running or with the extent lock held
@@ -5916,8 +5949,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
break;
}
}
- unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
alloc_end - alloc_start);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8..2845c6ceecd 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -48,6 +48,7 @@
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
+#include "ctree.h"
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -474,7 +475,79 @@ out_unlock:
return error;
}
-static int btrfs_defrag_file(struct file *file)
+static int should_defrag_range(struct inode *inode, u64 start, u64 len,
+ int thresh, u64 *last_len, u64 *skip,
+ u64 *defrag_end)
+{
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ int ret = 1;
+
+
+ if (thresh == 0)
+ thresh = 256 * 1024;
+
+ /*
+ * make sure that once we start defragging and extent, we keep on
+ * defragging it
+ */
+ if (start < *defrag_end)
+ return 1;
+
+ *skip = 0;
+
+ /*
+ * hopefully we have this extent in the tree already, try without
+ * the full extent lock
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ /* get the big lock and read metadata off disk */
+ lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+
+ if (!em)
+ return 0;
+ }
+
+ /* this will cover holes, and inline extents */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE)
+ ret = 0;
+
+ /*
+ * we hit a real extent, if it is big don't bother defragging it again
+ */
+ if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
+ ret = 0;
+
+ /*
+ * last_len ends up being a counter of how many bytes we've defragged.
+ * every time we choose not to defrag an extent, we reset *last_len
+ * so that the next tiny extent will force a defrag.
+ *
+ * The end result of this is that tiny extents before a single big
+ * extent will force at least part of that big extent to be defragged.
+ */
+ if (ret) {
+ *last_len += len;
+ *defrag_end = extent_map_end(em);
+ } else {
+ *last_len = 0;
+ *skip = extent_map_end(em);
+ *defrag_end = 0;
+ }
+
+ free_extent_map(em);
+ return ret;
+}
+
+static int btrfs_defrag_file(struct file *file,
+ struct btrfs_ioctl_defrag_range_args *range)
{
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -486,37 +559,96 @@ static int btrfs_defrag_file(struct file *file)
unsigned long total_read = 0;
u64 page_start;
u64 page_end;
+ u64 last_len = 0;
+ u64 skip = 0;
+ u64 defrag_end = 0;
unsigned long i;
int ret;
- ret = btrfs_check_data_free_space(root, inode, inode->i_size);
- if (ret)
- return -ENOSPC;
+ if (inode->i_size == 0)
+ return 0;
+
+ if (range->start + range->len > range->start) {
+ last_index = min_t(u64, inode->i_size - 1,
+ range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
+ } else {
+ last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+ }
+
+ i = range->start >> PAGE_CACHE_SHIFT;
+ while (i <= last_index) {
+ if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE,
+ range->extent_thresh,
+ &last_len, &skip,
+ &defrag_end)) {
+ unsigned long next;
+ /*
+ * the should_defrag function tells us how much to skip
+ * bump our counter by the suggested amount
+ */
+ next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ i = max(i + 1, next);
+ continue;
+ }
- mutex_lock(&inode->i_mutex);
- last_index = inode->i_size >> PAGE_CACHE_SHIFT;
- for (i = 0; i <= last_index; i++) {
if (total_read % ra_pages == 0) {
btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
min(last_index, i + ra_pages - 1));
}
total_read++;
+ mutex_lock(&inode->i_mutex);
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
+ BTRFS_I(inode)->force_compress = 1;
+
+ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+ if (ret) {
+ ret = -ENOSPC;
+ break;
+ }
+
+ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (ret) {
+ btrfs_free_reserved_data_space(root, inode,
+ PAGE_CACHE_SIZE);
+ ret = -ENOSPC;
+ break;
+ }
again:
+ if (inode->i_size == 0 ||
+ i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
+ ret = 0;
+ goto err_reservations;
+ }
+
page = grab_cache_page(inode->i_mapping, i);
if (!page)
- goto out_unlock;
+ goto err_reservations;
+
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
- goto out_unlock;
+ goto err_reservations;
}
}
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+
wait_on_page_writeback(page);
+ if (PageDirty(page)) {
+ btrfs_free_reserved_data_space(root, inode,
+ PAGE_CACHE_SIZE);
+ goto loop_unlock;
+ }
+
page_start = (u64)page->index << PAGE_CACHE_SHIFT;
page_end = page_start + PAGE_CACHE_SIZE - 1;
lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -537,18 +669,54 @@ again:
* page if it is dirtied again later
*/
clear_page_dirty_for_io(page);
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
+ page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, GFP_NOFS);
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
+ ClearPageChecked(page);
set_page_dirty(page);
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+loop_unlock:
unlock_page(page);
page_cache_release(page);
+ mutex_unlock(&inode->i_mutex);
+
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
+ i++;
+ }
+
+ if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+ filemap_flush(inode->i_mapping);
+
+ if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ /* the filemap_flush will queue IO into the worker threads, but
+ * we have to make sure the IO is actually started and that
+ * ordered extents get created before we return
+ */
+ atomic_inc(&root->fs_info->async_submit_draining);
+ while (atomic_read(&root->fs_info->nr_async_submits) ||
+ atomic_read(&root->fs_info->async_delalloc_pages)) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+ atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&root->fs_info->async_submit_draining);
+
+ mutex_lock(&inode->i_mutex);
+ BTRFS_I(inode)->force_compress = 0;
+ mutex_unlock(&inode->i_mutex);
}
-out_unlock:
- mutex_unlock(&inode->i_mutex);
return 0;
+
+err_reservations:
+ mutex_unlock(&inode->i_mutex);
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ return ret;
}
static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
@@ -608,7 +776,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
mod = 1;
sizestr++;
}
- new_size = btrfs_parse_size(sizestr);
+ new_size = memparse(sizestr, NULL);
if (new_size == 0) {
ret = -EINVAL;
goto out_unlock;
@@ -743,6 +911,327 @@ out:
return ret;
}
+static noinline int key_in_sk(struct btrfs_key *key,
+ struct btrfs_ioctl_search_key *sk)
+{
+ struct btrfs_key test;
+ int ret;
+
+ test.objectid = sk->min_objectid;
+ test.type = sk->min_type;
+ test.offset = sk->min_offset;
+
+ ret = btrfs_comp_cpu_keys(key, &test);
+ if (ret < 0)
+ return 0;
+
+ test.objectid = sk->max_objectid;
+ test.type = sk->max_type;
+ test.offset = sk->max_offset;
+
+ ret = btrfs_comp_cpu_keys(key, &test);
+ if (ret > 0)
+ return 0;
+ return 1;
+}
+
+static noinline int copy_to_sk(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct btrfs_ioctl_search_key *sk,
+ char *buf,
+ unsigned long *sk_offset,
+ int *num_found)
+{
+ u64 found_transid;
+ struct extent_buffer *leaf;
+ struct btrfs_ioctl_search_header sh;
+ unsigned long item_off;
+ unsigned long item_len;
+ int nritems;
+ int i;
+ int slot;
+ int found = 0;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ if (btrfs_header_generation(leaf) > sk->max_transid) {
+ i = nritems;
+ goto advance_key;
+ }
+ found_transid = btrfs_header_generation(leaf);
+
+ for (i = slot; i < nritems; i++) {
+ item_off = btrfs_item_ptr_offset(leaf, i);
+ item_len = btrfs_item_size_nr(leaf, i);
+
+ if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+ item_len = 0;
+
+ if (sizeof(sh) + item_len + *sk_offset >
+ BTRFS_SEARCH_ARGS_BUFSIZE) {
+ ret = 1;
+ goto overflow;
+ }
+
+ btrfs_item_key_to_cpu(leaf, key, i);
+ if (!key_in_sk(key, sk))
+ continue;
+
+ sh.objectid = key->objectid;
+ sh.offset = key->offset;
+ sh.type = key->type;
+ sh.len = item_len;
+ sh.transid = found_transid;
+
+ /* copy search result header */
+ memcpy(buf + *sk_offset, &sh, sizeof(sh));
+ *sk_offset += sizeof(sh);
+
+ if (item_len) {
+ char *p = buf + *sk_offset;
+ /* copy the item */
+ read_extent_buffer(leaf, p,
+ item_off, item_len);
+ *sk_offset += item_len;
+ }
+ found++;
+
+ if (*num_found >= sk->nr_items)
+ break;
+ }
+advance_key:
+ ret = 0;
+ if (key->offset < (u64)-1 && key->offset < sk->max_offset)
+ key->offset++;
+ else if (key->type < (u8)-1 && key->type < sk->max_type) {
+ key->offset = 0;
+ key->type++;
+ } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+ key->offset = 0;
+ key->type = 0;
+ key->objectid++;
+ } else
+ ret = 1;
+overflow:
+ *num_found += found;
+ return ret;
+}
+
+static noinline int search_ioctl(struct inode *inode,
+ struct btrfs_ioctl_search_args *args)
+{
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ struct btrfs_key max_key;
+ struct btrfs_path *path;
+ struct btrfs_ioctl_search_key *sk = &args->key;
+ struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
+ int ret;
+ int num_found = 0;
+ unsigned long sk_offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (sk->tree_id == 0) {
+ /* search the root of the inode that was passed */
+ root = BTRFS_I(inode)->root;
+ } else {
+ key.objectid = sk->tree_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(info, &key);
+ if (IS_ERR(root)) {
+ printk(KERN_ERR "could not find root %llu\n",
+ sk->tree_id);
+ btrfs_free_path(path);
+ return -ENOENT;
+ }
+ }
+
+ key.objectid = sk->min_objectid;
+ key.type = sk->min_type;
+ key.offset = sk->min_offset;
+
+ max_key.objectid = sk->max_objectid;
+ max_key.type = sk->max_type;
+ max_key.offset = sk->max_offset;
+
+ path->keep_locks = 1;
+
+ while(1) {
+ ret = btrfs_search_forward(root, &key, &max_key, path, 0,
+ sk->min_transid);
+ if (ret != 0) {
+ if (ret > 0)
+ ret = 0;
+ goto err;
+ }
+ ret = copy_to_sk(root, path, &key, sk, args->buf,
+ &sk_offset, &num_found);
+ btrfs_release_path(root, path);
+ if (ret || num_found >= sk->nr_items)
+ break;
+
+ }
+ ret = 0;
+err:
+ sk->nr_items = num_found;
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search(struct file *file,
+ void __user *argp)
+{
+ struct btrfs_ioctl_search_args *args;
+ struct inode *inode;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ args = kmalloc(sizeof(*args), GFP_KERNEL);
+ if (!args)
+ return -ENOMEM;
+
+ if (copy_from_user(args, argp, sizeof(*args))) {
+ kfree(args);
+ return -EFAULT;
+ }
+ inode = fdentry(file)->d_inode;
+ ret = search_ioctl(inode, args);
+ if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = -EFAULT;
+ kfree(args);
+ return ret;
+}
+
+/*
+ * Search INODE_REFs to identify path name of 'dirid' directory
+ * in a 'tree_id' tree. and sets path name to 'name'.
+ */
+static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
+ u64 tree_id, u64 dirid, char *name)
+{
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ char *ptr;
+ int ret = -1;
+ int slot;
+ int len;
+ int total_len = 0;
+ struct btrfs_inode_ref *iref;
+ struct extent_buffer *l;
+ struct btrfs_path *path;
+
+ if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
+ name[0]='\0';
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
+
+ key.objectid = tree_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(info, &key);
+ if (IS_ERR(root)) {
+ printk(KERN_ERR "could not find root %llu\n", tree_id);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ while(1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (ret > 0 && slot > 0)
+ slot--;
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (ret > 0 && (key.objectid != dirid ||
+ key.type != BTRFS_INODE_REF_KEY)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(l, iref);
+ ptr -= len + 1;
+ total_len += len + 1;
+ if (ptr < name)
+ goto out;
+
+ *(ptr + len) = '/';
+ read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
+
+ if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
+ break;
+
+ btrfs_release_path(root, path);
+ key.objectid = key.offset;
+ key.offset = (u64)-1;
+ dirid = key.objectid;
+
+ }
+ if (ptr < name)
+ goto out;
+ memcpy(name, ptr, total_len);
+ name[total_len]='\0';
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_ino_lookup(struct file *file,
+ void __user *argp)
+{
+ struct btrfs_ioctl_ino_lookup_args *args;
+ struct inode *inode;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ args = kmalloc(sizeof(*args), GFP_KERNEL);
+ if (copy_from_user(args, argp, sizeof(*args))) {
+ kfree(args);
+ return -EFAULT;
+ }
+ inode = fdentry(file)->d_inode;
+
+ if (args->treeid == 0)
+ args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+
+ ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
+ args->treeid, args->objectid,
+ args->name);
+
+ if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = -EFAULT;
+
+ kfree(args);
+ return ret;
+}
+
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
void __user *arg)
{
@@ -849,10 +1338,11 @@ out:
return err;
}
-static int btrfs_ioctl_defrag(struct file *file)
+static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ioctl_defrag_range_args *range;
int ret;
ret = mnt_want_write(file->f_path.mnt);
@@ -873,7 +1363,30 @@ static int btrfs_ioctl_defrag(struct file *file)
ret = -EINVAL;
goto out;
}
- btrfs_defrag_file(file);
+
+ range = kzalloc(sizeof(*range), GFP_KERNEL);
+ if (!range) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (argp) {
+ if (copy_from_user(range, argp,
+ sizeof(*range))) {
+ ret = -EFAULT;
+ kfree(range);
+ }
+ /* compression requires us to start the IO */
+ if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
+ range->extent_thresh = (u32)-1;
+ }
+ } else {
+ /* the rest are all set to zero by kzalloc */
+ range->len = (u64)-1;
+ }
+ btrfs_defrag_file(file, range);
+ kfree(range);
break;
}
out:
@@ -1274,6 +1787,157 @@ out:
return ret;
}
+static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *new_root;
+ struct btrfs_dir_item *di;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_key location;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_super_block *disk_super;
+ u64 features;
+ u64 objectid = 0;
+ u64 dir_id;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&objectid, argp, sizeof(objectid)))
+ return -EFAULT;
+
+ if (!objectid)
+ objectid = root->root_key.objectid;
+
+ location.objectid = objectid;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = (u64)-1;
+
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+ if (IS_ERR(new_root))
+ return PTR_ERR(new_root);
+
+ if (btrfs_root_refs(&new_root->root_item) == 0)
+ return -ENOENT;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->leave_spinning = 1;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+ di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
+ dir_id, "default", 7, 1);
+ if (!di) {
+ btrfs_free_path(path);
+ btrfs_end_transaction(trans, root);
+ printk(KERN_ERR "Umm, you don't have the default dir item, "
+ "this isn't going to work\n");
+ return -ENOENT;
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
+ btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_free_path(path);
+
+ disk_super = &root->fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
+ features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ }
+ btrfs_end_transaction(trans, root);
+
+ return 0;
+}
+
+long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_space_args space_args;
+ struct btrfs_ioctl_space_info space;
+ struct btrfs_ioctl_space_info *dest;
+ struct btrfs_ioctl_space_info *dest_orig;
+ struct btrfs_ioctl_space_info *user_dest;
+ struct btrfs_space_info *info;
+ int alloc_size;
+ int ret = 0;
+ int slot_count = 0;
+
+ if (copy_from_user(&space_args,
+ (struct btrfs_ioctl_space_args __user *)arg,
+ sizeof(space_args)))
+ return -EFAULT;
+
+ /* first we count slots */
+ rcu_read_lock();
+ list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
+ slot_count++;
+ rcu_read_unlock();
+
+ /* space_slots == 0 means they are asking for a count */
+ if (space_args.space_slots == 0) {
+ space_args.total_spaces = slot_count;
+ goto out;
+ }
+ alloc_size = sizeof(*dest) * slot_count;
+ /* we generally have at most 6 or so space infos, one for each raid
+ * level. So, a whole page should be more than enough for everyone
+ */
+ if (alloc_size > PAGE_CACHE_SIZE)
+ return -ENOMEM;
+
+ space_args.total_spaces = 0;
+ dest = kmalloc(alloc_size, GFP_NOFS);
+ if (!dest)
+ return -ENOMEM;
+ dest_orig = dest;
+
+ /* now we have a buffer to copy into */
+ rcu_read_lock();
+ list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
+ /* make sure we don't copy more than we allocated
+ * in our buffer
+ */
+ if (slot_count == 0)
+ break;
+ slot_count--;
+
+ /* make sure userland has enough room in their buffer */
+ if (space_args.total_spaces >= space_args.space_slots)
+ break;
+
+ space.flags = info->flags;
+ space.total_bytes = info->total_bytes;
+ space.used_bytes = info->bytes_used;
+ memcpy(dest, &space, sizeof(space));
+ dest++;
+ space_args.total_spaces++;
+ }
+ rcu_read_unlock();
+
+ user_dest = (struct btrfs_ioctl_space_info *)
+ (arg + sizeof(struct btrfs_ioctl_space_args));
+
+ if (copy_to_user(user_dest, dest_orig, alloc_size))
+ ret = -EFAULT;
+
+ kfree(dest_orig);
+out:
+ if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
/*
* there are many ways the trans_start and trans_end ioctls can lead
* to deadlocks. They should only be used by applications that
@@ -1320,8 +1984,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_snap_create(file, argp, 1);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp);
+ case BTRFS_IOC_DEFAULT_SUBVOL:
+ return btrfs_ioctl_default_subvol(file, argp);
case BTRFS_IOC_DEFRAG:
- return btrfs_ioctl_defrag(file);
+ return btrfs_ioctl_defrag(file, NULL);
+ case BTRFS_IOC_DEFRAG_RANGE:
+ return btrfs_ioctl_defrag(file, argp);
case BTRFS_IOC_RESIZE:
return btrfs_ioctl_resize(root, argp);
case BTRFS_IOC_ADD_DEV:
@@ -1338,6 +2006,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_trans_start(file);
case BTRFS_IOC_TRANS_END:
return btrfs_ioctl_trans_end(file);
+ case BTRFS_IOC_TREE_SEARCH:
+ return btrfs_ioctl_tree_search(file, argp);
+ case BTRFS_IOC_INO_LOOKUP:
+ return btrfs_ioctl_ino_lookup(file, argp);
+ case BTRFS_IOC_SPACE_INFO:
+ return btrfs_ioctl_space_info(root, argp);
case BTRFS_IOC_SYNC:
btrfs_sync_fs(file->f_dentry->d_sb, 1);
return 0;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475e..424694aa517 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,12 +30,114 @@ struct btrfs_ioctl_vol_args {
char name[BTRFS_PATH_NAME_MAX + 1];
};
+#define BTRFS_INO_LOOKUP_PATH_MAX 4080
+struct btrfs_ioctl_ino_lookup_args {
+ __u64 treeid;
+ __u64 objectid;
+ char name[BTRFS_INO_LOOKUP_PATH_MAX];
+};
+
+struct btrfs_ioctl_search_key {
+ /* which root are we searching. 0 is the tree of tree roots */
+ __u64 tree_id;
+
+ /* keys returned will be >= min and <= max */
+ __u64 min_objectid;
+ __u64 max_objectid;
+
+ /* keys returned will be >= min and <= max */
+ __u64 min_offset;
+ __u64 max_offset;
+
+ /* max and min transids to search for */
+ __u64 min_transid;
+ __u64 max_transid;
+
+ /* keys returned will be >= min and <= max */
+ __u32 min_type;
+ __u32 max_type;
+
+ /*
+ * how many items did userland ask for, and how many are we
+ * returning
+ */
+ __u32 nr_items;
+
+ /* align to 64 bits */
+ __u32 unused;
+
+ /* some extra for later */
+ __u64 unused1;
+ __u64 unused2;
+ __u64 unused3;
+ __u64 unused4;
+};
+
+struct btrfs_ioctl_search_header {
+ __u64 transid;
+ __u64 objectid;
+ __u64 offset;
+ __u32 type;
+ __u32 len;
+};
+
+#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
+/*
+ * the buf is an array of search headers where
+ * each header is followed by the actual item
+ * the type field is expanded to 32 bits for alignment
+ */
+struct btrfs_ioctl_search_args {
+ struct btrfs_ioctl_search_key key;
+ char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
+};
+
struct btrfs_ioctl_clone_range_args {
__s64 src_fd;
__u64 src_offset, src_length;
__u64 dest_offset;
};
+/* flags for the defrag range ioctl */
+#define BTRFS_DEFRAG_RANGE_COMPRESS 1
+#define BTRFS_DEFRAG_RANGE_START_IO 2
+
+struct btrfs_ioctl_defrag_range_args {
+ /* start of the defrag operation */
+ __u64 start;
+
+ /* number of bytes to defrag, use (u64)-1 to say all */
+ __u64 len;
+
+ /*
+ * flags for the operation, which can include turning
+ * on compression for this one defrag
+ */
+ __u64 flags;
+
+ /*
+ * any extent bigger than this will be considered
+ * already defragged. Use 0 to take the kernel default
+ * Use 1 to say every single extent must be rewritten
+ */
+ __u32 extent_thresh;
+
+ /* spare for later */
+ __u32 unused[5];
+};
+
+struct btrfs_ioctl_space_info {
+ __u64 flags;
+ __u64 total_bytes;
+ __u64 used_bytes;
+};
+
+struct btrfs_ioctl_space_args {
+ __u64 space_slots;
+ __u64 total_spaces;
+ struct btrfs_ioctl_space_info spaces[0];
+};
+
#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -67,4 +169,13 @@ struct btrfs_ioctl_clone_range_args {
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
+ struct btrfs_ioctl_defrag_range_args)
+#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
+ struct btrfs_ioctl_search_args)
+#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
+ struct btrfs_ioctl_ino_lookup_args)
+#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
+#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
+ struct btrfs_ioctl_space_args)
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c2a9e78a94..a8ffecd0b49 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -174,7 +174,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
if (!entry)
return -ENOMEM;
- mutex_lock(&tree->mutex);
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
@@ -190,16 +189,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
INIT_LIST_HEAD(&entry->list);
INIT_LIST_HEAD(&entry->root_extent_list);
+ spin_lock(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
BUG_ON(node);
+ spin_unlock(&tree->lock);
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
&BTRFS_I(inode)->root->fs_info->ordered_extents);
spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
- mutex_unlock(&tree->mutex);
BUG_ON(node);
return 0;
}
@@ -216,9 +216,9 @@ int btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
+ spin_lock(&tree->lock);
list_add_tail(&sum->list, &entry->list);
- mutex_unlock(&tree->mutex);
+ spin_unlock(&tree->lock);
return 0;
}
@@ -232,15 +232,16 @@ int btrfs_add_ordered_sum(struct inode *inode,
* to make sure this function only returns 1 once for a given ordered extent.
*/
int btrfs_dec_test_ordered_pending(struct inode *inode,
+ struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
- struct btrfs_ordered_extent *entry;
+ struct btrfs_ordered_extent *entry = NULL;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
+ spin_lock(&tree->lock);
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
@@ -264,7 +265,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
else
ret = 1;
out:
- mutex_unlock(&tree->mutex);
+ if (!ret && cached && entry) {
+ *cached = entry;
+ atomic_inc(&entry->refs);
+ }
+ spin_unlock(&tree->lock);
return ret == 0;
}
@@ -291,7 +296,7 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
/*
* remove an ordered extent from the tree. No references are dropped
- * and you must wake_up entry->wait. You must hold the tree mutex
+ * and you must wake_up entry->wait. You must hold the tree lock
* while you call this function.
*/
static int __btrfs_remove_ordered_extent(struct inode *inode,
@@ -340,9 +345,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
+ spin_lock(&tree->lock);
ret = __btrfs_remove_ordered_extent(inode, entry);
- mutex_unlock(&tree->mutex);
+ spin_unlock(&tree->lock);
wake_up(&entry->wait);
return ret;
@@ -567,7 +572,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
+ spin_lock(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -578,7 +583,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
if (entry)
atomic_inc(&entry->refs);
out:
- mutex_unlock(&tree->mutex);
+ spin_unlock(&tree->lock);
return entry;
}
@@ -594,7 +599,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
+ spin_lock(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -602,7 +607,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
atomic_inc(&entry->refs);
out:
- mutex_unlock(&tree->mutex);
+ spin_unlock(&tree->lock);
return entry;
}
@@ -629,7 +634,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
else
offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
- mutex_lock(&tree->mutex);
+ spin_lock(&tree->lock);
disk_i_size = BTRFS_I(inode)->disk_i_size;
/* truncate file */
@@ -735,7 +740,7 @@ out:
*/
if (ordered)
__btrfs_remove_ordered_extent(inode, ordered);
- mutex_unlock(&tree->mutex);
+ spin_unlock(&tree->lock);
if (ordered)
wake_up(&ordered->wait);
return ret;
@@ -762,7 +767,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
if (!ordered)
return 1;
- mutex_lock(&tree->mutex);
+ spin_lock(&tree->lock);
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr) {
num_sectors = ordered_sum->len / sectorsize;
@@ -777,7 +782,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
}
}
out:
- mutex_unlock(&tree->mutex);
+ spin_unlock(&tree->lock);
btrfs_put_ordered_extent(ordered);
return ret;
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1fe1282ef47..c82f76a9f04 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
/* one of these per inode */
struct btrfs_ordered_inode_tree {
- struct mutex mutex;
+ spinlock_t lock;
struct rb_root tree;
struct rb_node *last;
};
@@ -128,8 +128,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
static inline void
btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
{
- mutex_init(&t->mutex);
- t->tree.rb_node = NULL;
+ spin_lock_init(&t->lock);
+ t->tree = RB_ROOT;
t->last = NULL;
}
@@ -137,7 +137,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
int btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry);
int btrfs_dec_test_ordered_pending(struct inode *inode,
- u64 file_offset, u64 io_size);
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int tyep);
int btrfs_add_ordered_sum(struct inode *inode,
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index bc283ad2db7..e2a55cb2072 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -52,7 +52,7 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
{
- tree->root.rb_node = NULL;
+ tree->root = RB_ROOT;
INIT_LIST_HEAD(&tree->list);
spin_lock_init(&tree->lock);
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ab7ab531874..0b23942cbc0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -170,14 +170,14 @@ struct async_merge {
static void mapping_tree_init(struct mapping_tree *tree)
{
- tree->rb_root.rb_node = NULL;
+ tree->rb_root = RB_ROOT;
spin_lock_init(&tree->lock);
}
static void backref_cache_init(struct backref_cache *cache)
{
int i;
- cache->rb_root.rb_node = NULL;
+ cache->rb_root = RB_ROOT;
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
INIT_LIST_HEAD(&cache->pending[i]);
spin_lock_init(&cache->lock);
@@ -2659,7 +2659,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
EXTENT_BOUNDARY, GFP_NOFS);
nr++;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
set_page_dirty(page);
dirty_page++;
@@ -3487,7 +3487,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(root->fs_info->sb, &key, root);
+ inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
BTRFS_I(inode)->index_cnt = group->key.objectid;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8a1ea6e6457..9ac612e6ca6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -63,10 +63,10 @@ static void btrfs_put_super(struct super_block *sb)
}
enum {
- Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
- Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
- Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
- Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio,
+ Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
+ Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start,
+ Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool,
+ Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio,
Opt_flushoncommit,
Opt_discard, Opt_err,
};
@@ -74,6 +74,7 @@ enum {
static match_table_t tokens = {
{Opt_degraded, "degraded"},
{Opt_subvol, "subvol=%s"},
+ {Opt_subvolid, "subvolid=%d"},
{Opt_device, "device=%s"},
{Opt_nodatasum, "nodatasum"},
{Opt_nodatacow, "nodatacow"},
@@ -95,31 +96,6 @@ static match_table_t tokens = {
{Opt_err, NULL},
};
-u64 btrfs_parse_size(char *str)
-{
- u64 res;
- int mult = 1;
- char *end;
- char last;
-
- res = simple_strtoul(str, &end, 10);
-
- last = end[0];
- if (isalpha(last)) {
- last = tolower(last);
- switch (last) {
- case 'g':
- mult *= 1024;
- case 'm':
- mult *= 1024;
- case 'k':
- mult *= 1024;
- }
- res = res * mult;
- }
- return res;
-}
-
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
@@ -128,7 +104,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
{
struct btrfs_fs_info *info = root->fs_info;
substring_t args[MAX_OPT_ARGS];
- char *p, *num;
+ char *p, *num, *orig;
int intarg;
int ret = 0;
@@ -143,6 +119,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
if (!options)
return -ENOMEM;
+ orig = options;
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -156,6 +133,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, DEGRADED);
break;
case Opt_subvol:
+ case Opt_subvolid:
case Opt_device:
/*
* These are parsed by btrfs_parse_early_options
@@ -213,7 +191,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_max_extent:
num = match_strdup(&args[0]);
if (num) {
- info->max_extent = btrfs_parse_size(num);
+ info->max_extent = memparse(num, NULL);
kfree(num);
info->max_extent = max_t(u64,
@@ -225,7 +203,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_max_inline:
num = match_strdup(&args[0]);
if (num) {
- info->max_inline = btrfs_parse_size(num);
+ info->max_inline = memparse(num, NULL);
kfree(num);
if (info->max_inline) {
@@ -240,7 +218,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_alloc_start:
num = match_strdup(&args[0]);
if (num) {
- info->alloc_start = btrfs_parse_size(num);
+ info->alloc_start = memparse(num, NULL);
kfree(num);
printk(KERN_INFO
"btrfs: allocations start at %llu\n",
@@ -280,7 +258,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
}
out:
- kfree(options);
+ kfree(orig);
return ret;
}
@@ -291,12 +269,13 @@ out:
* only when we need to allocate a new super block.
*/
static int btrfs_parse_early_options(const char *options, fmode_t flags,
- void *holder, char **subvol_name,
+ void *holder, char **subvol_name, u64 *subvol_objectid,
struct btrfs_fs_devices **fs_devices)
{
substring_t args[MAX_OPT_ARGS];
char *opts, *p;
int error = 0;
+ int intarg;
if (!options)
goto out;
@@ -319,6 +298,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
case Opt_subvol:
*subvol_name = match_strdup(&args[0]);
break;
+ case Opt_subvolid:
+ intarg = 0;
+ error = match_int(&args[0], &intarg);
+ if (!error) {
+ /* we want the original fs_tree */
+ if (!intarg)
+ *subvol_objectid =
+ BTRFS_FS_TREE_OBJECTID;
+ else
+ *subvol_objectid = intarg;
+ }
+ break;
case Opt_device:
error = btrfs_scan_one_device(match_strdup(&args[0]),
flags, holder, fs_devices);
@@ -346,6 +337,110 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
return error;
}
+static struct dentry *get_default_root(struct super_block *sb,
+ u64 subvol_objectid)
+{
+ struct btrfs_root *root = sb->s_fs_info;
+ struct btrfs_root *new_root;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_key location;
+ struct inode *inode;
+ struct dentry *dentry;
+ u64 dir_id;
+ int new = 0;
+
+ /*
+ * We have a specific subvol we want to mount, just setup location and
+ * go look up the root.
+ */
+ if (subvol_objectid) {
+ location.objectid = subvol_objectid;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = (u64)-1;
+ goto find_root;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ path->leave_spinning = 1;
+
+ /*
+ * Find the "default" dir item which points to the root item that we
+ * will mount by default if we haven't been given a specific subvolume
+ * to mount.
+ */
+ dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+ di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
+ if (!di) {
+ /*
+ * Ok the default dir item isn't there. This is weird since
+ * it's always been there, but don't freak out, just try and
+ * mount to root most subvolume.
+ */
+ btrfs_free_path(path);
+ dir_id = BTRFS_FIRST_FREE_OBJECTID;
+ new_root = root->fs_info->fs_root;
+ goto setup_root;
+ }
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ btrfs_free_path(path);
+
+find_root:
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+ if (IS_ERR(new_root))
+ return ERR_PTR(PTR_ERR(new_root));
+
+ if (btrfs_root_refs(&new_root->root_item) == 0)
+ return ERR_PTR(-ENOENT);
+
+ dir_id = btrfs_root_dirid(&new_root->root_item);
+setup_root:
+ location.objectid = dir_id;
+ location.type = BTRFS_INODE_ITEM_KEY;
+ location.offset = 0;
+
+ inode = btrfs_iget(sb, &location, new_root, &new);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * If we're just mounting the root most subvol put the inode and return
+ * a reference to the dentry. We will have already gotten a reference
+ * to the inode in btrfs_fill_super so we're good to go.
+ */
+ if (!new && sb->s_root->d_inode == inode) {
+ iput(inode);
+ return dget(sb->s_root);
+ }
+
+ if (new) {
+ const struct qstr name = { .name = "/", .len = 1 };
+
+ /*
+ * New inode, we need to make the dentry a sibling of s_root so
+ * everything gets cleaned up properly on unmount.
+ */
+ dentry = d_alloc(sb->s_root, &name);
+ if (!dentry) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ d_splice_alias(inode, dentry);
+ } else {
+ /*
+ * We found the inode in cache, just find a dentry for it and
+ * put the reference to the inode we just got.
+ */
+ dentry = d_find_alias(inode);
+ iput(inode);
+ }
+
+ return dentry;
+}
+
static int btrfs_fill_super(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
void *data, int silent)
@@ -379,7 +474,7 @@ static int btrfs_fill_super(struct super_block *sb,
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root);
+ inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
@@ -391,12 +486,6 @@ static int btrfs_fill_super(struct super_block *sb,
err = -ENOMEM;
goto fail_close;
}
-#if 0
- /* this does the super kobj at the same time */
- err = btrfs_sysfs_add_super(tree_root->fs_info);
- if (err)
- goto fail_close;
-#endif
sb->s_root = root_dentry;
@@ -488,19 +577,22 @@ static int btrfs_test_super(struct super_block *s, void *data)
static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data, struct vfsmount *mnt)
{
- char *subvol_name = NULL;
struct block_device *bdev = NULL;
struct super_block *s;
struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
fmode_t mode = FMODE_READ;
+ char *subvol_name = NULL;
+ u64 subvol_objectid = 0;
int error = 0;
+ int found = 0;
if (!(flags & MS_RDONLY))
mode |= FMODE_WRITE;
error = btrfs_parse_early_options(data, mode, fs_type,
- &subvol_name, &fs_devices);
+ &subvol_name, &subvol_objectid,
+ &fs_devices);
if (error)
return error;
@@ -529,6 +621,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
goto error_close_devices;
}
+ found = 1;
btrfs_close_devices(fs_devices);
} else {
char b[BDEVNAME_SIZE];
@@ -546,25 +639,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
s->s_flags |= MS_ACTIVE;
}
- if (!strcmp(subvol_name, "."))
- root = dget(s->s_root);
- else {
- mutex_lock(&s->s_root->d_inode->i_mutex);
- root = lookup_one_len(subvol_name, s->s_root,
+ root = get_default_root(s, subvol_objectid);
+ if (IS_ERR(root)) {
+ error = PTR_ERR(root);
+ deactivate_locked_super(s);
+ goto error;
+ }
+ /* if they gave us a subvolume name bind mount into that */
+ if (strcmp(subvol_name, ".")) {
+ struct dentry *new_root;
+ mutex_lock(&root->d_inode->i_mutex);
+ new_root = lookup_one_len(subvol_name, root,
strlen(subvol_name));
- mutex_unlock(&s->s_root->d_inode->i_mutex);
+ mutex_unlock(&root->d_inode->i_mutex);
- if (IS_ERR(root)) {
+ if (IS_ERR(new_root)) {
deactivate_locked_super(s);
- error = PTR_ERR(root);
- goto error_free_subvol_name;
+ error = PTR_ERR(new_root);
+ dput(root);
+ goto error_close_devices;
}
- if (!root->d_inode) {
+ if (!new_root->d_inode) {
dput(root);
+ dput(new_root);
deactivate_locked_super(s);
error = -ENXIO;
- goto error_free_subvol_name;
+ goto error_close_devices;
}
+ dput(root);
+ root = new_root;
}
mnt->mnt_sb = s;
@@ -579,6 +682,7 @@ error_close_devices:
btrfs_close_devices(fs_devices);
error_free_subvol_name:
kfree(subvol_name);
+error:
return error;
}
@@ -623,14 +727,37 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_root *root = btrfs_sb(dentry->d_sb);
struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct list_head *head = &root->fs_info->space_info;
+ struct btrfs_space_info *found;
+ u64 total_used = 0;
+ u64 data_used = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)root->fs_info->fsid;
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
+ BTRFS_BLOCK_GROUP_RAID10|
+ BTRFS_BLOCK_GROUP_RAID1)) {
+ total_used += found->bytes_used;
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA)
+ data_used += found->bytes_used;
+ else
+ data_used += found->total_bytes;
+ }
+
+ total_used += found->bytes_used;
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA)
+ data_used += found->bytes_used;
+ else
+ data_used += found->total_bytes;
+ }
+ rcu_read_unlock();
+
buf->f_namelen = BTRFS_NAME_LEN;
buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
- buf->f_bfree = buf->f_blocks -
- (btrfs_super_bytes_used(disk_super) >> bits);
- buf->f_bavail = buf->f_bfree;
+ buf->f_bfree = buf->f_blocks - (total_used >> bits);
+ buf->f_bavail = buf->f_blocks - (data_used >> bits);
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a240b6fa81d..4ce16ef702a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj)
complete(&root->kobj_unregister);
}
-static struct sysfs_ops btrfs_super_attr_ops = {
+static const struct sysfs_ops btrfs_super_attr_ops = {
.show = btrfs_super_attr_show,
.store = btrfs_super_attr_store,
};
-static struct sysfs_ops btrfs_root_attr_ops = {
+static const struct sysfs_ops btrfs_root_attr_ops = {
.show = btrfs_root_attr_show,
.store = btrfs_root_attr_store,
};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b2acc79f1b3..2d654c1c794 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -69,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root)
cur_trans->commit_done = 0;
cur_trans->start_time = get_seconds();
- cur_trans->delayed_refs.root.rb_node = NULL;
+ cur_trans->delayed_refs.root = RB_ROOT;
cur_trans->delayed_refs.num_entries = 0;
cur_trans->delayed_refs.num_heads_ready = 0;
cur_trans->delayed_refs.num_heads = 0;
@@ -997,13 +997,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->trans_mutex);
- if (flush_on_commit) {
+ if (flush_on_commit || snap_pending) {
btrfs_start_delalloc_inodes(root, 1);
ret = btrfs_wait_ordered_extents(root, 0, 1);
BUG_ON(ret);
- } else if (snap_pending) {
- ret = btrfs_wait_ordered_extents(root, 0, 1);
- BUG_ON(ret);
}
/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622e..1255fcc8ade 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -445,7 +445,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(root->fs_info->sb, &key, root);
+ inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
if (IS_ERR(inode)) {
inode = NULL;
} else if (is_bad_inode(inode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 41ecbb2347f..9df8e3f1cca 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -256,13 +256,13 @@ loop_lock:
wake_up(&fs_info->async_submit_wait);
BUG_ON(atomic_read(&cur->bi_cnt) == 0);
- submit_bio(cur->bi_rw, cur);
- num_run++;
- batch_run++;
if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
num_sync_run++;
+ submit_bio(cur->bi_rw, cur);
+ num_run++;
+ batch_run++;
if (need_resched()) {
if (num_sync_run) {
blk_run_backing_dev(bdi, NULL);
@@ -325,16 +325,6 @@ loop_lock:
num_sync_run = 0;
blk_run_backing_dev(bdi, NULL);
}
-
- cond_resched();
- if (again)
- goto loop;
-
- spin_lock(&device->io_lock);
- if (device->pending_bios.head || device->pending_sync_bios.head)
- goto loop_lock;
- spin_unlock(&device->io_lock);
-
/*
* IO has already been through a long path to get here. Checksumming,
* async helper threads, perhaps compression. We've done a pretty
@@ -346,6 +336,16 @@ loop_lock:
* cared about found its way down here.
*/
blk_run_backing_dev(bdi, NULL);
+
+ cond_resched();
+ if (again)
+ goto loop;
+
+ spin_lock(&device->io_lock);
+ if (device->pending_bios.head || device->pending_sync_bios.head)
+ goto loop_lock;
+ spin_unlock(&device->io_lock);
+
done:
return 0;
}
@@ -365,6 +365,7 @@ static noinline int device_list_add(const char *path,
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
u64 found_transid = btrfs_super_generation(disk_super);
+ char *name;
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
@@ -411,6 +412,12 @@ static noinline int device_list_add(const char *path,
device->fs_devices = fs_devices;
fs_devices->num_devices++;
+ } else if (strcmp(device->name, path)) {
+ name = kstrdup(path, GFP_NOFS);
+ if (!name)
+ return -ENOMEM;
+ kfree(device->name);
+ device->name = name;
}
if (found_transid > fs_devices->latest_trans) {
@@ -592,7 +599,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
goto error_close;
disk_super = (struct btrfs_super_block *)bh->b_data;
- devid = le64_to_cpu(disk_super->dev_item.devid);
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
if (devid != device->devid)
goto error_brelse;
@@ -694,7 +701,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
- devid = le64_to_cpu(disk_super->dev_item.devid);
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
transid = btrfs_super_generation(disk_super);
if (disk_super->label[0])
printk(KERN_INFO "device label %s ", disk_super->label);
@@ -1187,7 +1194,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
- devid = le64_to_cpu(disk_super->dev_item.devid);
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
device = btrfs_find_device(root, devid, dev_uuid,
disk_super->fsid);
diff --git a/fs/buffer.c b/fs/buffer.c
index 6fa530256bf..c9c266db062 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
/*
* The page straddles i_size. It must be zeroed out on each and every
- * writepage invokation because it may be mmapped. "A file is mapped
+ * writepage invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
@@ -3265,7 +3265,7 @@ static void recalc_bh_state(void)
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
- struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+ struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
if (ret) {
INIT_LIST_HEAD(&ret->b_assoc_buffers);
get_cpu_var(bh_accounting).nr++;
@@ -3352,15 +3352,6 @@ int bh_submit_read(struct buffer_head *bh)
}
EXPORT_SYMBOL(bh_submit_read);
-static void
-init_buffer_head(void *data)
-{
- struct buffer_head *bh = data;
-
- memset(bh, 0, sizeof(*bh));
- INIT_LIST_HEAD(&bh->b_assoc_buffers);
-}
-
void __init buffer_init(void)
{
int nrpages;
@@ -3369,7 +3360,7 @@ void __init buffer_init(void)
sizeof(struct buffer_head), 0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
SLAB_MEM_SPREAD),
- init_buffer_head);
+ NULL);
/*
* Limit the bh occupancy to 10% of ZONE_NORMAL
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 20692fbfdb2..a20bea59893 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -136,7 +136,7 @@ asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
return 0;
}
- ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to lenght octet */
+ ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */
*val = *(++(ctx->pointer)); /* value has enum value */
else
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b44ce0a0711..b1d61d0bdfc 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -54,7 +54,7 @@ void cifs_dfs_release_automount_timer(void)
* Extracts sharename form full UNC.
* i.e. strips from UNC trailing path that is not part of share
* name and fixup missing '\' in the begining of DFS node refferal
- * if neccessary.
+ * if necessary.
* Returns pointer to share name on success or ERR_PTR on error.
* Caller is responsible for freeing returned string.
*/
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 20959befc70..7cc7f83e931 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4019,7 +4019,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
goto parse_DFS_referrals_exit;
}
- /* collect neccessary data from referrals */
+ /* collect necessary data from referrals */
for (i = 0; i < *num_of_nodes; i++) {
char *temp;
int max_len;
diff --git a/fs/compat.c b/fs/compat.c
index 00d90c2e66f..030602d453b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1795,6 +1795,24 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
return ret;
}
+struct compat_sel_arg_struct {
+ compat_ulong_t n;
+ compat_uptr_t inp;
+ compat_uptr_t outp;
+ compat_uptr_t exp;
+ compat_uptr_t tvp;
+};
+
+asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
+{
+ struct compat_sel_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+ compat_ptr(a.exp), compat_ptr(a.tvp));
+}
+
#ifdef HAVE_SET_RESTORE_SIGMASK
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 0adced2f296..112e45a17e9 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -28,10 +28,12 @@
#undef elfhdr
#undef elf_phdr
+#undef elf_shdr
#undef elf_note
#undef elf_addr_t
#define elfhdr elf32_hdr
#define elf_phdr elf32_phdr
+#define elf_shdr elf32_shdr
#define elf_note elf32_note
#define elf_addr_t Elf32_Addr
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 0ca9ec4a79c..6d55b61bfa7 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -545,7 +545,7 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
kcmd = MTIOCPOS;
karg = &pos;
break;
- case MTIOCGET32:
+ default: /* MTIOCGET32 */
kcmd = MTIOCGET;
karg = &get;
break;
@@ -663,7 +663,7 @@ static int raw_ioctl(unsigned fd, unsigned cmd,
switch (cmd) {
case RAW_SETBIND:
- case RAW_GETBIND: {
+ default: { /* RAW_GETBIND */
struct raw_config_request req;
mm_segment_t oldfs = get_fs();
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 26a8bd40400..f994a7dfda8 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -148,7 +148,7 @@ static void lockspace_kobj_release(struct kobject *k)
kfree(ls);
}
-static struct sysfs_ops dlm_attr_ops = {
+static const struct sysfs_ops dlm_attr_ops = {
.show = dlm_attr_show,
.store = dlm_attr_store,
};
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 84f70bfb0ba..b12532e553f 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -312,7 +312,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
/*
* This in_recovery lock does two things:
* 1) Keeps this function from returning until all threads are out
- * of locking routines and locking is truely stopped.
+ * of locking routines and locking is truly stopped.
* 2) Keeps any new requests from being processed until it's unlocked
* when recovery is complete.
*/
diff --git a/fs/exec.c b/fs/exec.c
index cce6bbdbdbb..49cdaa19e5b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -195,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* to work from.
*/
rlim = current->signal->rlim;
- if (size > rlim[RLIMIT_STACK].rlim_cur / 4) {
+ if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
put_page(page);
return NULL;
}
@@ -246,6 +246,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
vma->vm_start = vma->vm_end - PAGE_SIZE;
vma->vm_flags = VM_STACK_FLAGS;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
err = insert_vm_struct(mm, vma);
if (err)
goto err;
@@ -516,7 +517,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
/*
* cover the whole range: [new_start, old_end)
*/
- vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL);
+ if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
+ return -ENOMEM;
/*
* move the page tables downwards, on failure we rely on
@@ -547,15 +549,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
tlb_finish_mmu(tlb, new_end, old_end);
/*
- * shrink the vma to just the new range.
+ * Shrink the vma to just the new range. Always succeeds.
*/
vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
return 0;
}
-#define EXTRA_STACK_VM_PAGES 20 /* random */
-
/*
* Finalizes the stack vm_area_struct. The flags and permissions are updated,
* the stack is optionally relocated, and some extra space is added.
@@ -577,7 +577,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
#ifdef CONFIG_STACK_GROWSUP
/* Limit stack size to 1GB */
- stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max;
+ stack_base = rlimit_max(RLIMIT_STACK);
if (stack_base > (1 << 30))
stack_base = 1 << 30;
@@ -630,7 +630,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
goto out_unlock;
}
- stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+ stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
stack_size = vma->vm_end - vma->vm_start;
/*
* Align this down to a page boundary as expand_stack
@@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm)
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
+ sync_mm_rss(tsk, old_mm);
mm_release(tsk, old_mm);
if (old_mm) {
@@ -1532,7 +1533,7 @@ static int format_corename(char *corename, long signr)
/* core limit size */
case 'c':
rc = snprintf(out_ptr, out_end - out_ptr,
- "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur);
+ "%lu", rlimit(RLIMIT_CORE));
if (rc > out_end - out_ptr)
goto out;
out_ptr += rc;
@@ -1560,12 +1561,13 @@ out:
return ispipe;
}
-static int zap_process(struct task_struct *start)
+static int zap_process(struct task_struct *start, int exit_code)
{
struct task_struct *t;
int nr = 0;
start->signal->flags = SIGNAL_GROUP_EXIT;
+ start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
t = start;
@@ -1590,8 +1592,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
mm->core_state = core_state;
- tsk->signal->group_exit_code = exit_code;
- nr = zap_process(tsk);
+ nr = zap_process(tsk, exit_code);
}
spin_unlock_irq(&tsk->sighand->siglock);
if (unlikely(nr < 0))
@@ -1640,7 +1641,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
if (p->mm) {
if (unlikely(p->mm == mm)) {
lock_task_sighand(p, &flags);
- nr += zap_process(p);
+ nr += zap_process(p, exit_code);
unlock_task_sighand(p, &flags);
}
break;
@@ -1747,14 +1748,19 @@ void set_dumpable(struct mm_struct *mm, int value)
}
}
-int get_dumpable(struct mm_struct *mm)
+static int __get_dumpable(unsigned long mm_flags)
{
int ret;
- ret = mm->flags & 0x3;
+ ret = mm_flags & MMF_DUMPABLE_MASK;
return (ret >= 2) ? 2 : ret;
}
+int get_dumpable(struct mm_struct *mm)
+{
+ return __get_dumpable(mm->flags);
+}
+
static void wait_for_dump_helpers(struct file *file)
{
struct pipe_inode_info *pipe;
@@ -1797,7 +1803,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
struct coredump_params cprm = {
.signr = signr,
.regs = regs,
- .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur,
+ .limit = rlimit(RLIMIT_CORE),
+ /*
+ * We must use the same mm->flags while dumping core to avoid
+ * inconsistency of bit flags, since this flag is not protected
+ * by any locks.
+ */
+ .mm_flags = mm->flags,
};
audit_core_dumps(signr);
@@ -1816,7 +1828,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
/*
* If another thread got here first, or we are not dumpable, bail out.
*/
- if (mm->core_state || !get_dumpable(mm)) {
+ if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
up_write(&mm->mmap_sem);
put_cred(cred);
goto fail;
@@ -1827,7 +1839,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
* process nor do we know its entire history. We only know it
* was tainted so we dump it as root in mode 2.
*/
- if (get_dumpable(mm) == 2) { /* Setuid core dump mode */
+ if (__get_dumpable(cprm.mm_flags) == 2) {
+ /* Setuid core dump mode */
flag = O_EXCL; /* Stop rewrite attacks */
cred->fsuid = 0; /* Dump root private */
}
@@ -1923,8 +1936,9 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
/*
* Dont allow local users get cute and trick others to coredump
* into their pre-created files:
+ * Note, this is not relevant for pipes
*/
- if (inode->i_uid != current_fsuid())
+ if (!ispipe && (inode->i_uid != current_fsuid()))
goto close_fail;
if (!cprm.file->f_op)
goto close_fail;
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 59b8bf2825c..8442e353309 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -261,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata);
extern struct inode *exofs_iget(struct super_block *, unsigned long);
struct inode *exofs_new_inode(struct inode *, int);
-extern int exofs_write_inode(struct inode *, int);
+extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
extern void exofs_delete_inode(struct inode *);
/* dir.c: */
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 5514f3c2c2f..a17e4b733e3 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1280,9 +1280,9 @@ out:
return ret;
}
-int exofs_write_inode(struct inode *inode, int wait)
+int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- return exofs_update_inode(inode, wait);
+ return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}
/*
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 7f8d2e5a7ea..1d081f0cfec 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -570,7 +570,7 @@ do_more:
error_return:
brelse(bitmap_bh);
release_blocks(sb, freed);
- vfs_dq_free_block(inode, freed);
+ dquot_free_block(inode, freed);
}
/**
@@ -1236,6 +1236,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
unsigned short windowsz = 0;
unsigned long ngroups;
unsigned long num = *count;
+ int ret;
*errp = -ENOSPC;
sb = inode->i_sb;
@@ -1247,8 +1248,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
/*
* Check quota for allocation of this block.
*/
- if (vfs_dq_alloc_block(inode, num)) {
- *errp = -EDQUOT;
+ ret = dquot_alloc_block(inode, num);
+ if (ret) {
+ *errp = ret;
return 0;
}
@@ -1409,7 +1411,7 @@ allocated:
*errp = 0;
brelse(bitmap_bh);
- vfs_dq_free_block(inode, *count-num);
+ dquot_free_block(inode, *count-num);
*count = num;
return ret_block;
@@ -1420,7 +1422,7 @@ out:
* Undo the block allocation
*/
if (!performed_allocation)
- vfs_dq_free_block(inode, *count);
+ dquot_free_block(inode, *count);
brelse(bitmap_bh);
return 0;
}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 061914add3c..0b038e47ad2 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
/* inode.c */
extern struct inode *ext2_iget (struct super_block *, unsigned long);
-extern int ext2_write_inode (struct inode *, int);
+extern int ext2_write_inode (struct inode *, struct writeback_control *);
extern void ext2_delete_inode (struct inode *);
extern int ext2_sync_inode (struct inode *);
extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 586e3589d4c..5d198d0697f 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/pagemap.h>
+#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
@@ -70,7 +71,7 @@ const struct file_operations ext2_file_operations = {
.compat_ioctl = ext2_compat_ioctl,
#endif
.mmap = generic_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
.splice_read = generic_file_splice_read,
@@ -87,7 +88,7 @@ const struct file_operations ext2_xip_file_operations = {
.compat_ioctl = ext2_compat_ioctl,
#endif
.mmap = xip_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
};
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 15387c9c17d..ad7d572ee8d 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode)
if (!is_bad_inode(inode)) {
/* Quota is already initialized in iput() */
ext2_xattr_delete_inode(inode);
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
}
es = EXT2_SB(sb)->s_es;
@@ -586,10 +586,10 @@ got:
goto fail_drop;
}
- if (vfs_dq_alloc_inode(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto fail_drop;
- }
err = ext2_init_acl(inode, dir);
if (err)
@@ -605,10 +605,10 @@ got:
return inode;
fail_free_drop:
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
fail_drop:
- vfs_dq_drop(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
unlock_new_inode(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 71b032c65a0..fc13cc119aa 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others");
MODULE_DESCRIPTION("Second Extended Filesystem");
MODULE_LICENSE("GPL");
+static int __ext2_write_inode(struct inode *inode, int do_sync);
+
/*
* Test whether an inode is a fast symlink.
*/
@@ -58,13 +60,15 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
*/
void ext2_delete_inode (struct inode * inode)
{
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
goto no_delete;
EXT2_I(inode)->i_dtime = get_seconds();
mark_inode_dirty(inode);
- ext2_write_inode(inode, inode_needs_sync(inode));
+ __ext2_write_inode(inode, inode_needs_sync(inode));
inode->i_size = 0;
if (inode->i_blocks)
@@ -1335,7 +1339,7 @@ bad_inode:
return ERR_PTR(ret);
}
-int ext2_write_inode(struct inode *inode, int do_sync)
+static int __ext2_write_inode(struct inode *inode, int do_sync)
{
struct ext2_inode_info *ei = EXT2_I(inode);
struct super_block *sb = inode->i_sb;
@@ -1440,6 +1444,11 @@ int ext2_write_inode(struct inode *inode, int do_sync)
return err;
}
+int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
int ext2_sync_inode(struct inode *inode)
{
struct writeback_control wbc = {
@@ -1457,9 +1466,12 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
error = inode_change_ok(inode, iattr);
if (error)
return error;
+
+ if (iattr->ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
(iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
- error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, iattr);
if (error)
return error;
}
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index dd7175ce560..71efb0e9a3f 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -31,6 +31,7 @@
*/
#include <linux/pagemap.h>
+#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
@@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child)
*/
static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd)
{
- struct inode * inode = ext2_new_inode (dir, mode);
- int err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- inode->i_op = &ext2_file_inode_operations;
- if (ext2_use_xip(inode->i_sb)) {
- inode->i_mapping->a_ops = &ext2_aops_xip;
- inode->i_fop = &ext2_xip_file_operations;
- } else if (test_opt(inode->i_sb, NOBH)) {
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- inode->i_fop = &ext2_file_operations;
- } else {
- inode->i_mapping->a_ops = &ext2_aops;
- inode->i_fop = &ext2_file_operations;
- }
- mark_inode_dirty(inode);
- err = ext2_add_nondir(dentry, inode);
+ struct inode *inode;
+
+ dquot_initialize(dir);
+
+ inode = ext2_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &ext2_file_inode_operations;
+ if (ext2_use_xip(inode->i_sb)) {
+ inode->i_mapping->a_ops = &ext2_aops_xip;
+ inode->i_fop = &ext2_xip_file_operations;
+ } else if (test_opt(inode->i_sb, NOBH)) {
+ inode->i_mapping->a_ops = &ext2_nobh_aops;
+ inode->i_fop = &ext2_file_operations;
+ } else {
+ inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_fop = &ext2_file_operations;
}
- return err;
+ mark_inode_dirty(inode);
+ return ext2_add_nondir(dentry, inode);
}
static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
@@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
if (!new_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
inode = ext2_new_inode (dir, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
@@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
if (l > sb->s_blocksize)
goto out;
+ dquot_initialize(dir);
+
inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO);
err = PTR_ERR(inode);
if (IS_ERR(inode))
@@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
if (inode->i_nlink >= EXT2_LINK_MAX)
return -EMLINK;
+ dquot_initialize(dir);
+
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
atomic_inc(&inode->i_count);
@@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
if (dir->i_nlink >= EXT2_LINK_MAX)
goto out;
+ dquot_initialize(dir);
+
inode_inc_link_count(dir);
inode = ext2_new_inode (dir, S_IFDIR | mode);
@@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
struct page * page;
int err = -ENOENT;
+ dquot_initialize(dir);
+
de = ext2_find_entry (dir, &dentry->d_name, &page);
if (!de)
goto out;
@@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
struct ext2_dir_entry_2 * old_de;
int err = -ENOENT;
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f9cb54a585c..42e4a303b67 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -194,6 +194,8 @@ static void destroy_inodecache(void)
static void ext2_clear_inode(struct inode *inode)
{
struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
+
+ dquot_drop(inode);
ext2_discard_reservation(inode);
EXT2_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 904f00642f8..e44dc92609b 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -644,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
the inode. */
ea_bdebug(new_bh, "reusing block");
- error = -EDQUOT;
- if (vfs_dq_alloc_block(inode, 1)) {
+ error = dquot_alloc_block(inode, 1);
+ if (error) {
unlock_buffer(new_bh);
goto cleanup;
}
@@ -702,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
* as if nothing happened and cleanup the unused block */
if (error && error != -ENOSPC) {
if (new_bh && new_bh != old_bh)
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
goto cleanup;
}
} else
@@ -734,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
if (ce)
mb_cache_entry_release(ce);
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
mark_buffer_dirty(old_bh);
ea_bdebug(old_bh, "refcount now=%d",
le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -797,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode)
mark_buffer_dirty(bh);
if (IS_SYNC(inode))
sync_dirty_buffer(bh);
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
}
EXT2_I(inode)->i_file_acl = 0;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 27967f92e82..161da2d3f89 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
}
ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
if (dquot_freed_blocks)
- vfs_dq_free_block(inode, dquot_freed_blocks);
+ dquot_free_block(inode, dquot_freed_blocks);
return;
}
@@ -1502,8 +1502,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
/*
* Check quota for allocation of this block.
*/
- if (vfs_dq_alloc_block(inode, num)) {
- *errp = -EDQUOT;
+ err = dquot_alloc_block(inode, num);
+ if (err) {
+ *errp = err;
return 0;
}
@@ -1713,7 +1714,7 @@ allocated:
*errp = 0;
brelse(bitmap_bh);
- vfs_dq_free_block(inode, *count-num);
+ dquot_free_block(inode, *count-num);
*count = num;
return ret_block;
@@ -1728,7 +1729,7 @@ out:
* Undo the block allocation
*/
if (!performed_allocation)
- vfs_dq_free_block(inode, *count);
+ dquot_free_block(inode, *count);
brelse(bitmap_bh);
return 0;
}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 388bbdfa0b4..f55df0e61cb 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -21,6 +21,7 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
+#include <linux/quotaops.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
#include "xattr.h"
@@ -33,9 +34,9 @@
*/
static int ext3_release_file (struct inode * inode, struct file * filp)
{
- if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+ if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
filemap_flush(inode->i_mapping);
- EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+ ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
}
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
@@ -62,7 +63,7 @@ const struct file_operations ext3_file_operations = {
.compat_ioctl = ext3_compat_ioctl,
#endif
.mmap = generic_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.release = ext3_release_file,
.fsync = ext3_sync_file,
.splice_read = generic_file_splice_read,
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index b3999128513..ef9008b885b 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
* Note: we must free any quota before locking the superblock,
* as writing the quota to disk may need the lock as well.
*/
- vfs_dq_init(inode);
+ dquot_initialize(inode);
ext3_xattr_delete_inode(handle, inode);
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
is_directory = S_ISDIR(inode->i_mode);
@@ -588,10 +588,10 @@ got:
sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
ret = inode;
- if (vfs_dq_alloc_inode(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto fail_drop;
- }
err = ext3_init_acl(handle, inode, dir);
if (err)
@@ -619,10 +619,10 @@ really_out:
return ret;
fail_free_drop:
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
fail_drop:
- vfs_dq_drop(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
unlock_new_inode(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 455e6e6e5cb..7f920b7263a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode)
{
handle_t *handle;
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -1378,7 +1381,7 @@ static int ext3_journalled_write_end(struct file *file,
*/
if (pos + len > inode->i_size && ext3_can_truncate(inode))
ext3_orphan_add(handle, inode);
- EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
+ ext3_set_inode_state(inode, EXT3_STATE_JDATA);
if (inode->i_size > EXT3_I(inode)->i_disksize) {
EXT3_I(inode)->i_disksize = inode->i_size;
ret2 = ext3_mark_inode_dirty(handle, inode);
@@ -1417,7 +1420,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
- if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
+ if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
/*
* This is a REALLY heavyweight approach, but the use of
* bmap on dirty files is expected to be extremely rare:
@@ -1436,7 +1439,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
* everything they get.
*/
- EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
+ ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
journal = EXT3_JOURNAL(inode);
journal_lock_updates(journal);
err = journal_flush(journal);
@@ -1528,6 +1531,7 @@ static int ext3_ordered_writepage(struct page *page,
int err;
J_ASSERT(PageLocked(page));
+ WARN_ON_ONCE(IS_RDONLY(inode));
/*
* We give up here if we're reentered, because it might be for a
@@ -1600,6 +1604,9 @@ static int ext3_writeback_writepage(struct page *page,
int ret = 0;
int err;
+ J_ASSERT(PageLocked(page));
+ WARN_ON_ONCE(IS_RDONLY(inode));
+
if (ext3_journal_current_handle())
goto out_fail;
@@ -1642,6 +1649,9 @@ static int ext3_journalled_writepage(struct page *page,
int ret = 0;
int err;
+ J_ASSERT(PageLocked(page));
+ WARN_ON_ONCE(IS_RDONLY(inode));
+
if (ext3_journal_current_handle())
goto no_write;
@@ -1670,7 +1680,7 @@ static int ext3_journalled_writepage(struct page *page,
PAGE_CACHE_SIZE, NULL, write_end_fn);
if (ret == 0)
ret = err;
- EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
+ ext3_set_inode_state(inode, EXT3_STATE_JDATA);
unlock_page(page);
} else {
/*
@@ -1785,8 +1795,9 @@ retry:
handle = ext3_journal_start(inode, 2);
if (IS_ERR(handle)) {
/* This is really bad luck. We've written the data
- * but cannot extend i_size. Bail out and pretend
- * the write failed... */
+ * but cannot extend i_size. Truncate allocated blocks
+ * and pretend the write failed... */
+ ext3_truncate(inode);
ret = PTR_ERR(handle);
goto out;
}
@@ -2402,7 +2413,7 @@ void ext3_truncate(struct inode *inode)
goto out_notrans;
if (inode->i_size == 0 && ext3_should_writeback_data(inode))
- ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+ ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
/*
* We have to lock the EOF page here, because lock_page() nests
@@ -2721,7 +2732,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
{
/* We have all inode data except xattrs in memory here. */
return __ext3_get_inode_loc(inode, iloc,
- !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
+ !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
}
void ext3_set_inode_flags(struct inode *inode)
@@ -2893,7 +2904,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
EXT3_GOOD_OLD_INODE_SIZE +
ei->i_extra_isize;
if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
- ei->i_state |= EXT3_STATE_XATTR;
+ ext3_set_inode_state(inode, EXT3_STATE_XATTR);
}
} else
ei->i_extra_isize = 0;
@@ -2955,7 +2966,7 @@ again:
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
- if (ei->i_state & EXT3_STATE_NEW)
+ if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
ext3_get_inode_flags(ei);
@@ -3052,7 +3063,7 @@ again:
rc = ext3_journal_dirty_metadata(handle, bh);
if (!err)
err = rc;
- ei->i_state &= ~EXT3_STATE_NEW;
+ ext3_clear_inode_state(inode, EXT3_STATE_NEW);
atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
out_brelse:
@@ -3096,7 +3107,7 @@ out_brelse:
* `stuff()' is running, and the new i_size will be lost. Plus the inode
* will no longer be on the superblock's dirty inode list.
*/
-int ext3_write_inode(struct inode *inode, int wait)
+int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
{
if (current->flags & PF_MEMALLOC)
return 0;
@@ -3107,7 +3118,7 @@ int ext3_write_inode(struct inode *inode, int wait)
return -EIO;
}
- if (!wait)
+ if (wbc->sync_mode != WB_SYNC_ALL)
return 0;
return ext3_force_commit(inode->i_sb);
@@ -3140,6 +3151,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ if (ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
handle_t *handle;
@@ -3152,7 +3165,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
error = PTR_ERR(handle);
goto err_out;
}
- error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
ext3_journal_stop(handle);
return error;
@@ -3237,7 +3250,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
ret = 2 * (bpp + indirects) + 2;
#ifdef CONFIG_QUOTA
- /* We know that structure was already allocated during vfs_dq_init so
+ /* We know that structure was already allocated during dquot_initialize so
* we will be updating only the data blocks + inodes */
ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
#endif
@@ -3328,7 +3341,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
* i_size has been changed by generic_commit_write() and we thus need
* to include the updated inode in the current transaction.
*
- * Also, vfs_dq_alloc_space() will always dirty the inode when blocks
+ * Also, dquot_alloc_space() will always dirty the inode when blocks
* are allocated to the file.
*
* If the inode is marked synchronous, we don't honour that here - doing
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 7b0e44f7d66..ee184084ca4 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1696,6 +1696,8 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
struct inode * inode;
int err, retries = 0;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1730,6 +1732,8 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1766,6 +1770,8 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
if (dir->i_nlink >= EXT3_LINK_MAX)
return -EMLINK;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -2060,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go in
* separate transaction */
- vfs_dq_init(dentry->d_inode);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
+
handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2119,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- vfs_dq_init(dentry->d_inode);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
+
handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2174,6 +2184,8 @@ static int ext3_symlink (struct inode * dir,
if (l > dir->i_sb->s_blocksize)
return -ENAMETOOLONG;
+ dquot_initialize(dir);
+
retry:
handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
@@ -2228,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry,
if (inode->i_nlink >= EXT3_LINK_MAX)
return -EMLINK;
+
+ dquot_initialize(dir);
+
/*
* Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
* otherwise has the potential to corrupt the orphan inode list.
@@ -2278,12 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
struct ext3_dir_entry_2 * old_de, * new_de;
int retval, flush_file = 0;
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_bh = new_bh = dir_bh = NULL;
/* Initialize quotas before so that eventual writes go
* in separate transaction */
if (new_dentry->d_inode)
- vfs_dq_init(new_dentry->d_inode);
+ dquot_initialize(new_dentry->d_inode);
handle = ext3_journal_start(old_dir, 2 *
EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index afa2b569da1..1bee604cc6c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -164,7 +164,7 @@ void ext3_msg(struct super_block *sb, const char *prefix,
* write out the superblock safely.
*
* We'll just use the journal_abort() error code to record an error in
- * the journal instead. On recovery, the journal will compain about
+ * the journal instead. On recovery, the journal will complain about
* that error until we've noted it down and cleared it.
*/
@@ -181,7 +181,7 @@ static void ext3_handle_error(struct super_block *sb)
if (!test_opt (sb, ERRORS_CONT)) {
journal_t *journal = EXT3_SB(sb)->s_journal;
- EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+ set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
if (journal)
journal_abort(journal, -EIO);
}
@@ -296,7 +296,7 @@ void ext3_abort (struct super_block * sb, const char * function,
"error: remounting filesystem read-only");
EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
sb->s_flags |= MS_RDONLY;
- EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+ set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
if (EXT3_SB(sb)->s_journal)
journal_abort(EXT3_SB(sb)->s_journal, -EIO);
}
@@ -528,6 +528,8 @@ static void destroy_inodecache(void)
static void ext3_clear_inode(struct inode *inode)
{
struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
+
+ dquot_drop(inode);
ext3_discard_reservation(inode);
EXT3_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
@@ -562,10 +564,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
if (sbi->s_qf_names[GRPQUOTA])
seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
- if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA)
+ if (test_opt(sb, USRQUOTA))
seq_puts(seq, ",usrquota");
- if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)
+ if (test_opt(sb, GRPQUOTA))
seq_puts(seq, ",grpquota");
#endif
}
@@ -656,8 +658,7 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
- seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt &
- EXT3_MOUNT_DATA_FLAGS));
+ seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
if (test_opt(sb, DATA_ERR_ABORT))
seq_puts(seq, ",data_err=abort");
@@ -751,13 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
static const struct dquot_operations ext3_quota_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
.write_dquot = ext3_write_dquot,
.acquire_dquot = ext3_acquire_dquot,
.release_dquot = ext3_release_dquot,
@@ -896,6 +890,63 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
return sb_block;
}
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ char *qname;
+
+ if (sb_any_quota_loaded(sb) &&
+ !sbi->s_qf_names[qtype]) {
+ ext3_msg(sb, KERN_ERR,
+ "Cannot change journaled "
+ "quota options when quota turned on");
+ return 0;
+ }
+ qname = match_strdup(args);
+ if (!qname) {
+ ext3_msg(sb, KERN_ERR,
+ "Not enough memory for storing quotafile name");
+ return 0;
+ }
+ if (sbi->s_qf_names[qtype] &&
+ strcmp(sbi->s_qf_names[qtype], qname)) {
+ ext3_msg(sb, KERN_ERR,
+ "%s quota file already specified", QTYPE2NAME(qtype));
+ kfree(qname);
+ return 0;
+ }
+ sbi->s_qf_names[qtype] = qname;
+ if (strchr(sbi->s_qf_names[qtype], '/')) {
+ ext3_msg(sb, KERN_ERR,
+ "quotafile must be on filesystem root");
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ return 0;
+ }
+ set_opt(sbi->s_mount_opt, QUOTA);
+ return 1;
+}
+
+static int clear_qf_name(struct super_block *sb, int qtype) {
+
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+ if (sb_any_quota_loaded(sb) &&
+ sbi->s_qf_names[qtype]) {
+ ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+ " when quota turned on");
+ return 0;
+ }
+ /*
+ * The space will be released later when all options are confirmed
+ * to be correct
+ */
+ sbi->s_qf_names[qtype] = NULL;
+ return 1;
+}
+#endif
+
static int parse_options (char *options, struct super_block *sb,
unsigned int *inum, unsigned long *journal_devnum,
ext3_fsblk_t *n_blocks_count, int is_remount)
@@ -906,8 +957,7 @@ static int parse_options (char *options, struct super_block *sb,
int data_opt = 0;
int option;
#ifdef CONFIG_QUOTA
- int qtype, qfmt;
- char *qname;
+ int qfmt;
#endif
if (!options)
@@ -1065,20 +1115,19 @@ static int parse_options (char *options, struct super_block *sb,
data_opt = EXT3_MOUNT_WRITEBACK_DATA;
datacheck:
if (is_remount) {
- if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
- == data_opt)
+ if (test_opt(sb, DATA_FLAGS) == data_opt)
break;
ext3_msg(sb, KERN_ERR,
"error: cannot change "
"data mode on remount. The filesystem "
"is mounted in data=%s mode and you "
"try to remount it in data=%s mode.",
- data_mode_string(sbi->s_mount_opt &
- EXT3_MOUNT_DATA_FLAGS),
+ data_mode_string(test_opt(sb,
+ DATA_FLAGS)),
data_mode_string(data_opt));
return 0;
} else {
- sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
+ clear_opt(sbi->s_mount_opt, DATA_FLAGS);
sbi->s_mount_opt |= data_opt;
}
break;
@@ -1090,62 +1139,20 @@ static int parse_options (char *options, struct super_block *sb,
break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
- qtype = USRQUOTA;
- goto set_qf_name;
- case Opt_grpjquota:
- qtype = GRPQUOTA;
-set_qf_name:
- if (sb_any_quota_loaded(sb) &&
- !sbi->s_qf_names[qtype]) {
- ext3_msg(sb, KERN_ERR,
- "error: cannot change journaled "
- "quota options when quota turned on.");
- return 0;
- }
- qname = match_strdup(&args[0]);
- if (!qname) {
- ext3_msg(sb, KERN_ERR,
- "error: not enough memory for "
- "storing quotafile name.");
+ if (!set_qf_name(sb, USRQUOTA, &args[0]))
return 0;
- }
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext3_msg(sb, KERN_ERR,
- "error: %s quota file already "
- "specified.", QTYPE2NAME(qtype));
- kfree(qname);
- return 0;
- }
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
- ext3_msg(sb, KERN_ERR,
- "error: quotafile must be on "
- "filesystem root.");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
+ break;
+ case Opt_grpjquota:
+ if (!set_qf_name(sb, GRPQUOTA, &args[0]))
return 0;
- }
- set_opt(sbi->s_mount_opt, QUOTA);
break;
case Opt_offusrjquota:
- qtype = USRQUOTA;
- goto clear_qf_name;
+ if (!clear_qf_name(sb, USRQUOTA))
+ return 0;
+ break;
case Opt_offgrpjquota:
- qtype = GRPQUOTA;
-clear_qf_name:
- if (sb_any_quota_loaded(sb) &&
- sbi->s_qf_names[qtype]) {
- ext3_msg(sb, KERN_ERR, "error: cannot change "
- "journaled quota options when "
- "quota turned on.");
+ if (!clear_qf_name(sb, GRPQUOTA))
return 0;
- }
- /*
- * The space will be released later when all options
- * are confirmed to be correct
- */
- sbi->s_qf_names[qtype] = NULL;
break;
case Opt_jqfmt_vfsold:
qfmt = QFMT_VFS_OLD;
@@ -1244,18 +1251,12 @@ set_qf_format:
}
#ifdef CONFIG_QUOTA
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
- if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) &&
- sbi->s_qf_names[USRQUOTA])
+ if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sbi->s_mount_opt, USRQUOTA);
-
- if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
- sbi->s_qf_names[GRPQUOTA])
+ if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
clear_opt(sbi->s_mount_opt, GRPQUOTA);
- if ((sbi->s_qf_names[USRQUOTA] &&
- (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
- (sbi->s_qf_names[GRPQUOTA] &&
- (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
+ if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
ext3_msg(sb, KERN_ERR, "error: old and new quota "
"format mixing.");
return 0;
@@ -1478,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
}
list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
- vfs_dq_init(inode);
+ dquot_initialize(inode);
if (inode->i_nlink) {
printk(KERN_DEBUG
"%s: truncating inode %lu to %Ld bytes\n",
@@ -1671,11 +1672,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
set_opt(sbi->s_mount_opt, POSIX_ACL);
#endif
if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
- sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA;
+ set_opt(sbi->s_mount_opt, JOURNAL_DATA);
else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
- sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA;
+ set_opt(sbi->s_mount_opt, ORDERED_DATA);
else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
- sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA;
+ set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -1694,7 +1695,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
(EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2561,11 +2562,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
goto restore_opts;
}
- if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+ if (test_opt(sb, ABORT))
ext3_abort(sb, __func__, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
es = sbi->s_es;
@@ -2573,7 +2574,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
- if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
+ if (test_opt(sb, ABORT)) {
err = -EROFS;
goto restore_opts;
}
@@ -2734,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
* Process 1 Process 2
* ext3_create() quota_sync()
* journal_start() write_dquot()
- * vfs_dq_init() down(dqio_mutex)
+ * dquot_initialize() down(dqio_mutex)
* down(dqio_mutex) journal_start()
*
*/
@@ -2942,9 +2943,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
int err = 0;
int offset = off & (sb->s_blocksize - 1);
- int tocopy;
int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
- size_t towrite = len;
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
@@ -2955,53 +2954,54 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
+
+ /*
+ * Since we account only one data block in transaction credits,
+ * then it is impossible to cross a block boundary.
+ */
+ if (sb->s_blocksize - offset < len) {
+ ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+ " cancelled because not block aligned",
+ (unsigned long long)off, (unsigned long long)len);
+ return -EIO;
+ }
mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
- while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
- bh = ext3_bread(handle, inode, blk, 1, &err);
- if (!bh)
+ bh = ext3_bread(handle, inode, blk, 1, &err);
+ if (!bh)
+ goto out;
+ if (journal_quota) {
+ err = ext3_journal_get_write_access(handle, bh);
+ if (err) {
+ brelse(bh);
goto out;
- if (journal_quota) {
- err = ext3_journal_get_write_access(handle, bh);
- if (err) {
- brelse(bh);
- goto out;
- }
- }
- lock_buffer(bh);
- memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
- unlock_buffer(bh);
- if (journal_quota)
- err = ext3_journal_dirty_metadata(handle, bh);
- else {
- /* Always do at least ordered writes for quotas */
- err = ext3_journal_dirty_data(handle, bh);
- mark_buffer_dirty(bh);
}
- brelse(bh);
- if (err)
- goto out;
- offset = 0;
- towrite -= tocopy;
- data += tocopy;
- blk++;
}
+ lock_buffer(bh);
+ memcpy(bh->b_data+offset, data, len);
+ flush_dcache_page(bh->b_page);
+ unlock_buffer(bh);
+ if (journal_quota)
+ err = ext3_journal_dirty_metadata(handle, bh);
+ else {
+ /* Always do at least ordered writes for quotas */
+ err = ext3_journal_dirty_data(handle, bh);
+ mark_buffer_dirty(bh);
+ }
+ brelse(bh);
out:
- if (len == towrite) {
+ if (err) {
mutex_unlock(&inode->i_mutex);
return err;
}
- if (inode->i_size < off+len-towrite) {
- i_size_write(inode, off+len-towrite);
+ if (inode->i_size < off + len) {
+ i_size_write(inode, off + len);
EXT3_I(inode)->i_disksize = inode->i_size;
}
inode->i_version++;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ext3_mark_inode_dirty(handle, inode);
mutex_unlock(&inode->i_mutex);
- return len - towrite;
+ return len;
}
#endif
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 66895ccf76c..534a94c3a93 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *end;
int error;
- if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+ if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
return -ENODATA;
error = ext3_get_inode_loc(inode, &iloc);
if (error)
@@ -403,7 +403,7 @@ ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
void *end;
int error;
- if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+ if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
return 0;
error = ext3_get_inode_loc(inode, &iloc);
if (error)
@@ -500,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
error = ext3_journal_dirty_metadata(handle, bh);
if (IS_SYNC(inode))
handle->h_sync = 1;
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
ea_bdebug(bh, "refcount now=%d; releasing",
le32_to_cpu(BHDR(bh)->h_refcount));
if (ce)
@@ -775,8 +775,8 @@ inserted:
else {
/* The old block is released after updating
the inode. */
- error = -EDQUOT;
- if (vfs_dq_alloc_block(inode, 1))
+ error = dquot_alloc_block(inode, 1);
+ if (error)
goto cleanup;
error = ext3_journal_get_write_access(handle,
new_bh);
@@ -850,7 +850,7 @@ cleanup:
return error;
cleanup_dquot:
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
goto cleanup;
bad_block:
@@ -882,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
- if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) {
+ if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
error = ext3_xattr_check_names(IFIRST(header), is->s.end);
if (error)
return error;
@@ -914,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
header = IHDR(inode, ext3_raw_inode(&is->iloc));
if (!IS_LAST_ENTRY(s->first)) {
header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
- EXT3_I(inode)->i_state |= EXT3_STATE_XATTR;
+ ext3_set_inode_state(inode, EXT3_STATE_XATTR);
} else {
header->h_magic = cpu_to_le32(0);
- EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR;
+ ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
}
return 0;
}
@@ -967,10 +967,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (error)
goto cleanup;
- if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
+ if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
- EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
+ ext3_clear_inode_state(inode, EXT3_STATE_NEW);
}
error = ext3_xattr_ibody_find(inode, &i, &is);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 22bc7435d91..d2f37a5516c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
- ext4_error(sb, __func__,
- "Checksum bad for group %u", block_group);
+ ext4_error(sb, "Checksum bad for group %u",
+ block_group);
ext4_free_blks_set(sb, gdp, 0);
ext4_free_inodes_set(sb, gdp, 0);
ext4_itable_unused_set(sb, gdp, 0);
@@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
* to make sure we calculate the right free blocks
*/
group_blocks = ext4_blocks_count(sbi->s_es) -
- le32_to_cpu(sbi->s_es->s_first_data_block) -
- (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
+ ext4_group_first_block_no(sb, ngroups - 1);
} else {
group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
}
@@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
* when a file system is mounted (see ext4_fill_super).
*/
-
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
/**
* ext4_get_group_desc() -- load group descriptor from disk
* @sb: super block
@@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (block_group >= ngroups) {
- ext4_error(sb, "ext4_get_group_desc",
- "block_group >= groups_count - "
- "block_group = %u, groups_count = %u",
- block_group, ngroups);
+ ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+ " groups_count = %u", block_group, ngroups);
return NULL;
}
@@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
if (!sbi->s_group_desc[group_desc]) {
- ext4_error(sb, "ext4_get_group_desc",
- "Group descriptor not loaded - "
+ ext4_error(sb, "Group descriptor not loaded - "
"block_group = %u, group_desc = %u, desc = %u",
block_group, group_desc, offset);
return NULL;
@@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
return 1;
err_out:
- ext4_error(sb, __func__,
- "Invalid block bitmap - "
- "block_group = %d, block = %llu",
+ ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
block_group, bitmap_blk);
return 0;
}
@@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
bitmap_blk = ext4_block_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
- ext4_error(sb, __func__,
- "Cannot read block bitmap - "
+ ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, bitmap_blk);
return NULL;
@@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
set_bitmap_uptodate(bh);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
- ext4_error(sb, __func__,
- "Cannot read block bitmap - "
+ ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, bitmap_blk);
return NULL;
@@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, desc),
sbi->s_itb_per_group)) {
- ext4_error(sb, __func__,
- "Adding blocks in system zones - "
+ ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
goto error_return;
@@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, __func__,
- "bit already cleared for block %llu",
+ ext4_error(sb, "bit already cleared for block %llu",
(ext4_fsblk_t)(block + i));
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index a60ab9aad57..983f0e12749 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -205,14 +205,14 @@ void ext4_release_system_zone(struct super_block *sb)
entry = rb_entry(n, struct ext4_system_zone, node);
kmem_cache_free(ext4_system_zone_cachep, entry);
if (!parent)
- EXT4_SB(sb)->system_blks.rb_node = NULL;
+ EXT4_SB(sb)->system_blks = RB_ROOT;
else if (parent->rb_left == n)
parent->rb_left = NULL;
else if (parent->rb_right == n)
parent->rb_right = NULL;
n = parent;
}
- EXT4_SB(sb)->system_blks.rb_node = NULL;
+ EXT4_SB(sb)->system_blks = RB_ROOT;
}
/*
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc93168e26..86cb6d86a04 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,10 +83,12 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
error_msg = "inode out of bounds";
if (error_msg != NULL)
- ext4_error(dir->i_sb, function,
- "bad entry in directory #%lu: %s - "
- "offset=%u, inode=%u, rec_len=%d, name_len=%d",
- dir->i_ino, error_msg, offset,
+ __ext4_error(dir->i_sb, function,
+ "bad entry in directory #%lu: %s - block=%llu"
+ "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
+ dir->i_ino, error_msg,
+ (unsigned long long) bh->b_blocknr,
+ (unsigned) (offset%bh->b_size), offset,
le32_to_cpu(de->inode),
rlen, de->name_len);
return error_msg == NULL ? 1 : 0;
@@ -150,7 +152,7 @@ static int ext4_readdir(struct file *filp,
*/
if (!bh) {
if (!dir_has_error) {
- ext4_error(sb, __func__, "directory #%lu "
+ ext4_error(sb, "directory #%lu "
"contains a hole at offset %Lu",
inode->i_ino,
(unsigned long long) filp->f_pos);
@@ -303,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root)
kfree(old);
}
if (!parent)
- root->rb_node = NULL;
+ *root = RB_ROOT;
else if (parent->rb_left == n)
parent->rb_left = NULL;
else if (parent->rb_right == n)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4cedc91ec59..bf938cf7c5f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,6 +53,12 @@
#define ext4_debug(f, a...) do {} while (0)
#endif
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+ ext4_error_inode(__func__, (inode), (fmt), ## a);
+
+#define EXT4_ERROR_FILE(file, fmt, a...) \
+ ext4_error_file(__func__, (file), (fmt), ## a);
+
/* data type for block offset of block group */
typedef int ext4_grpblk_t;
@@ -133,14 +139,14 @@ struct mpage_da_data {
int pages_written;
int retval;
};
-#define DIO_AIO_UNWRITTEN 0x1
+#define EXT4_IO_UNWRITTEN 0x1
typedef struct ext4_io_end {
struct list_head list; /* per-file finished AIO list */
struct inode *inode; /* file being written to */
unsigned int flag; /* unwritten or not */
- int error; /* I/O error code */
- ext4_lblk_t offset; /* offset in the file */
- size_t size; /* size of the extent */
+ struct page *page; /* page struct for buffer write */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
struct work_struct work; /* data work queue */
} ext4_io_end_t;
@@ -284,10 +290,12 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -313,17 +321,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
return flags & EXT4_OTHER_FLMASK;
}
-/*
- * Inode dynamic state flags
- */
-#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */
-#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
-#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
-#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
-#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
-#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
-
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
__u32 group; /* Group number for this data */
@@ -364,19 +361,20 @@ struct ext4_new_group_data {
/* caller is from the direct IO path, request to creation of an
unitialized extents if not allocated, split the uninitialized
extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO 0x0008
+#define EXT4_GET_BLOCKS_PRE_IO 0x0008
#define EXT4_GET_BLOCKS_CONVERT 0x0010
-#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
- /* Convert extent to initialized after direct IO complete */
-#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
- EXT4_GET_BLOCKS_DIO_CREATE_EXT)
/*
* Flags used by ext4_free_blocks
*/
#define EXT4_FREE_BLOCKS_METADATA 0x0001
#define EXT4_FREE_BLOCKS_FORGET 0x0002
+#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
/*
* ioctl commands
@@ -630,7 +628,7 @@ struct ext4_inode_info {
* near to their parent directory's inode.
*/
ext4_group_t i_block_group;
- __u32 i_state; /* Dynamic state flags for ext4 */
+ unsigned long i_state_flags; /* Dynamic state flags */
ext4_lblk_t i_dir_start_lookup;
#ifdef CONFIG_EXT4_FS_XATTR
@@ -708,8 +706,9 @@ struct ext4_inode_info {
qsize_t i_reserved_quota;
#endif
- /* completed async DIOs that might need unwritten extents handling */
- struct list_head i_aio_dio_complete_list;
+ /* completed IOs that might need unwritten extents handling */
+ struct list_head i_completed_io_list;
+ spinlock_t i_completed_io_lock;
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
@@ -760,6 +759,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -1050,6 +1050,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
(ino >= EXT4_FIRST_INO(sb) &&
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+ EXT4_STATE_JDATA, /* journaled data exists */
+ EXT4_STATE_NEW, /* inode is newly created */
+ EXT4_STATE_XATTR, /* has in-inode xattrs */
+ EXT4_STATE_NO_EXPAND, /* No space for expansion */
+ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
+ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
+ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
+};
+
+static inline int ext4_test_inode_state(struct inode *inode, int bit)
+{
+ return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_set_inode_state(struct inode *inode, int bit)
+{
+ set_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_clear_inode_state(struct inode *inode, int bit)
+{
+ clear_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
* a kernel struct super_block. This will allow us to call the feature-test
@@ -1126,6 +1154,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080
#define EXT4_FEATURE_INCOMPAT_MMP 0x0100
#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
+#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
+#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1416,7 +1446,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int ext4_write_inode(struct inode *, int);
+extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct dentry *, struct iattr *);
extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
@@ -1439,7 +1469,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_aio_dio_completed_IO(struct inode *inode);
+extern int flush_completed_IO(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
/* ioctl.c */
@@ -1465,13 +1495,20 @@ extern int ext4_group_extend(struct super_block *sb,
ext4_fsblk_t n_blocks_count);
/* super.c */
-extern void ext4_error(struct super_block *, const char *, const char *, ...)
+extern void __ext4_error(struct super_block *, const char *, const char *, ...)
+ __attribute__ ((format (printf, 3, 4)));
+#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message)
+extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
+ __attribute__ ((format (printf, 3, 4)));
+extern void ext4_error_file(const char *, struct file *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
extern void __ext4_std_error(struct super_block *, const char *, int);
extern void ext4_abort(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
-extern void ext4_warning(struct super_block *, const char *, const char *, ...)
+extern void __ext4_warning(struct super_block *, const char *,
+ const char *, ...)
__attribute__ ((format (printf, 3, 4)));
+#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message)
extern void ext4_msg(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
@@ -1744,7 +1781,7 @@ extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- loff_t len);
+ ssize_t len);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int flags);
@@ -1756,6 +1793,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 len, __u64 *moved_len);
+/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+enum ext4_state_bits {
+ BH_Uninit /* blocks are allocated but uninitialized on disk */
+ = BH_JBDPrivateStart,
+};
+
+BUFFER_FNS(Uninit, uninit)
+TAS_BUFFER_FNS(Uninit, uninit)
+
/*
* Add new method to test wether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
@@ -1773,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b57e5c711b6..53d2764d71c 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -125,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
ext4_journal_abort_handle(where, __func__, bh,
handle, err);
} else {
- if (inode && bh)
+ if (inode)
mark_buffer_dirty_inode(bh, inode);
else
mark_buffer_dirty(bh);
if (inode && inode_needs_sync(inode)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
- ext4_error(inode->i_sb, __func__,
+ ext4_error(inode->i_sb,
"IO error syncing inode, "
"inode=%lu, block=%llu",
inode->i_ino,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 05eca817d70..b79ad512646 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return 0;
}
+/*
+ * This function controls whether or not we should try to go down the
+ * dioread_nolock code paths, which makes it safe to avoid taking
+ * i_mutex for direct I/O reads. This only works for extent-based
+ * files, and it doesn't work for nobh or if data journaling is
+ * enabled, since the dioread_nolock code uses b_private to pass
+ * information back to the I/O completion handler, and this conflicts
+ * with the jbd's use of b_private.
+ */
+static inline int ext4_should_dioread_nolock(struct inode *inode)
+{
+ if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
+ return 0;
+ if (test_opt(inode->i_sb, NOBH))
+ return 0;
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ return 0;
+ if (ext4_should_journal_data(inode))
+ return 0;
+ return 1;
+}
+
#endif /* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 765a4826b11..94c8ee81f5e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
if (S_ISREG(inode->i_mode))
block_group++;
}
- bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
+ bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
/*
@@ -440,7 +439,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
return 0;
corrupted:
- ext4_error(inode->i_sb, function,
+ __ext4_error(inode->i_sb, function,
"bad header/extent in inode #%lu: %s - magic %x, "
"entries %u, max %u(%u), depth %u(%u)",
inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
@@ -703,7 +702,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
eh = ext_block_hdr(bh);
ppos++;
- BUG_ON(ppos > depth);
+ if (unlikely(ppos > depth)) {
+ put_bh(bh);
+ EXT4_ERROR_INODE(inode,
+ "ppos %d > depth %d", ppos, depth);
+ goto err;
+ }
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
i--;
@@ -749,7 +753,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
if (err)
return err;
- BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
+ if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
+ EXT4_ERROR_INODE(inode,
+ "logical %d == ei_block %d!",
+ logical, le32_to_cpu(curp->p_idx->ei_block));
+ return -EIO;
+ }
len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
/* insert after */
@@ -779,9 +788,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
ext4_idx_store_pblock(ix, ptr);
le16_add_cpu(&curp->p_hdr->eh_entries, 1);
- BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
- > le16_to_cpu(curp->p_hdr->eh_max));
- BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
+ if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+ > le16_to_cpu(curp->p_hdr->eh_max))) {
+ EXT4_ERROR_INODE(inode,
+ "logical %d == ei_block %d!",
+ logical, le32_to_cpu(curp->p_idx->ei_block));
+ return -EIO;
+ }
+ if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
+ return -EIO;
+ }
err = ext4_ext_dirty(handle, inode, curp);
ext4_std_error(inode->i_sb, err);
@@ -819,7 +836,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* if current leaf will be split, then we should use
* border from split point */
- BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
+ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
+ EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
+ return -EIO;
+ }
if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
border = path[depth].p_ext[1].ee_block;
ext_debug("leaf will be split."
@@ -860,7 +880,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* initialize new leaf */
newblock = ablocks[--a];
- BUG_ON(newblock == 0);
+ if (unlikely(newblock == 0)) {
+ EXT4_ERROR_INODE(inode, "newblock == 0!");
+ err = -EIO;
+ goto cleanup;
+ }
bh = sb_getblk(inode->i_sb, newblock);
if (!bh) {
err = -EIO;
@@ -880,7 +904,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ex = EXT_FIRST_EXTENT(neh);
/* move remainder of path[depth] to the new leaf */
- BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
+ if (unlikely(path[depth].p_hdr->eh_entries !=
+ path[depth].p_hdr->eh_max)) {
+ EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
+ path[depth].p_hdr->eh_entries,
+ path[depth].p_hdr->eh_max);
+ err = -EIO;
+ goto cleanup;
+ }
/* start copy from next extent */
/* TODO: we could do it by single memmove */
m = 0;
@@ -927,7 +958,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* create intermediate indexes */
k = depth - at - 1;
- BUG_ON(k < 0);
+ if (unlikely(k < 0)) {
+ EXT4_ERROR_INODE(inode, "k %d < 0!", k);
+ err = -EIO;
+ goto cleanup;
+ }
if (k)
ext_debug("create %d intermediate indices\n", k);
/* insert new index into current index block */
@@ -964,8 +999,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
EXT_MAX_INDEX(path[i].p_hdr));
- BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
- EXT_LAST_INDEX(path[i].p_hdr));
+ if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
+ EXT_LAST_INDEX(path[i].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
+ le32_to_cpu(path[i].p_ext->ee_block));
+ err = -EIO;
+ goto cleanup;
+ }
while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
ext_debug("%d: move %d:%llu in new index %llu\n", i,
le32_to_cpu(path[i].p_idx->ei_block),
@@ -1203,7 +1244,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent *ex;
int depth, ee_len;
- BUG_ON(path == NULL);
+ if (unlikely(path == NULL)) {
+ EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+ return -EIO;
+ }
depth = path->p_depth;
*phys = 0;
@@ -1217,15 +1261,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
ex = path[depth].p_ext;
ee_len = ext4_ext_get_actual_len(ex);
if (*logical < le32_to_cpu(ex->ee_block)) {
- BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+ if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+ EXT4_ERROR_INODE(inode,
+ "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
+ *logical, le32_to_cpu(ex->ee_block));
+ return -EIO;
+ }
while (--depth >= 0) {
ix = path[depth].p_idx;
- BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+ if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
+ ix != NULL ? ix->ei_block : 0,
+ EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
+ EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
+ depth);
+ return -EIO;
+ }
}
return 0;
}
- BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+ if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+ EXT4_ERROR_INODE(inode,
+ "logical %d < ee_block %d + ee_len %d!",
+ *logical, le32_to_cpu(ex->ee_block), ee_len);
+ return -EIO;
+ }
*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
*phys = ext_pblock(ex) + ee_len - 1;
@@ -1251,7 +1313,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
int depth; /* Note, NOT eh_depth; depth from top of tree */
int ee_len;
- BUG_ON(path == NULL);
+ if (unlikely(path == NULL)) {
+ EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+ return -EIO;
+ }
depth = path->p_depth;
*phys = 0;
@@ -1265,17 +1330,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
ex = path[depth].p_ext;
ee_len = ext4_ext_get_actual_len(ex);
if (*logical < le32_to_cpu(ex->ee_block)) {
- BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+ if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+ EXT4_ERROR_INODE(inode,
+ "first_extent(path[%d].p_hdr) != ex",
+ depth);
+ return -EIO;
+ }
while (--depth >= 0) {
ix = path[depth].p_idx;
- BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+ if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "ix != EXT_FIRST_INDEX *logical %d!",
+ *logical);
+ return -EIO;
+ }
}
*logical = le32_to_cpu(ex->ee_block);
*phys = ext_pblock(ex);
return 0;
}
- BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+ if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+ EXT4_ERROR_INODE(inode,
+ "logical %d < ee_block %d + ee_len %d!",
+ *logical, le32_to_cpu(ex->ee_block), ee_len);
+ return -EIO;
+ }
if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
/* next allocated block in this leaf */
@@ -1414,8 +1494,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
- BUG_ON(ex == NULL);
- BUG_ON(eh == NULL);
+
+ if (unlikely(ex == NULL || eh == NULL)) {
+ EXT4_ERROR_INODE(inode,
+ "ex %p == NULL or eh %p == NULL", ex, eh);
+ return -EIO;
+ }
if (depth == 0) {
/* there is no tree at all */
@@ -1538,8 +1622,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
merge_done = 1;
WARN_ON(eh->eh_entries == 0);
if (!eh->eh_entries)
- ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
- "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+ ext4_error(inode->i_sb,
+ "inode#%lu, eh->eh_entries = 0!",
+ inode->i_ino);
}
return merge_done;
@@ -1612,13 +1697,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
ext4_lblk_t next;
unsigned uninitialized = 0;
- BUG_ON(ext4_ext_get_actual_len(newext) == 0);
+ if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
+ EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
+ return -EIO;
+ }
depth = ext_depth(inode);
ex = path[depth].p_ext;
- BUG_ON(path[depth].p_hdr == NULL);
+ if (unlikely(path[depth].p_hdr == NULL)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+ return -EIO;
+ }
/* try to insert block into found extent and return */
- if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
&& ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
ext4_ext_is_uninitialized(newext),
@@ -1739,7 +1830,7 @@ has_space:
merge:
/* try to merge extents to the right */
- if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(inode, path, nearex);
/* try to merge extents to the left */
@@ -1787,7 +1878,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
}
depth = ext_depth(inode);
- BUG_ON(path[depth].p_hdr == NULL);
+ if (unlikely(path[depth].p_hdr == NULL)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+ err = -EIO;
+ break;
+ }
ex = path[depth].p_ext;
next = ext4_ext_next_allocated_block(path);
@@ -1838,7 +1933,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
}
- BUG_ON(cbex.ec_len == 0);
+ if (unlikely(cbex.ec_len == 0)) {
+ EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
+ err = -EIO;
+ break;
+ }
err = func(inode, path, &cbex, ex, cbdata);
ext4_ext_drop_refs(path);
@@ -1952,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
cex->ec_type != EXT4_EXT_CACHE_EXTENT);
- if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
+ if (in_range(block, cex->ec_block, cex->ec_len)) {
ex->ee_block = cpu_to_le32(cex->ec_block);
ext4_ext_store_pblock(ex, cex->ec_start);
ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -1981,7 +2080,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
/* free index block */
path--;
leaf = idx_pblock(path->p_idx);
- BUG_ON(path->p_hdr->eh_entries == 0);
+ if (unlikely(path->p_hdr->eh_entries == 0)) {
+ EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+ return -EIO;
+ }
err = ext4_ext_get_access(handle, inode, path);
if (err)
return err;
@@ -2119,8 +2221,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (!path[depth].p_hdr)
path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
eh = path[depth].p_hdr;
- BUG_ON(eh == NULL);
-
+ if (unlikely(path[depth].p_hdr == NULL)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+ return -EIO;
+ }
/* find where to start removing */
ex = EXT_LAST_EXTENT(eh);
@@ -2983,7 +3087,7 @@ fix_extent_len:
ext4_ext_dirty(handle, inode, path + depth);
return err;
}
-static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path)
{
@@ -3063,8 +3167,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
flags, allocated);
ext4_ext_show_leaf(inode, path);
- /* DIO get_block() before submit the IO, split the extent */
- if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ /* get_block() before submit the IO, split the extent */
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
ret = ext4_split_unwritten_extents(handle,
inode, path, iblock,
max_blocks, flags);
@@ -3074,14 +3178,16 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
* completed
*/
if (io)
- io->flag = DIO_AIO_UNWRITTEN;
+ io->flag = EXT4_IO_UNWRITTEN;
else
- EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+ ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ if (ext4_should_dioread_nolock(inode))
+ set_buffer_uninit(bh_result);
goto out;
}
- /* async DIO end_io complete, convert the filled extent to written */
- if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
- ret = ext4_convert_unwritten_extents_dio(handle, inode,
+ /* IO end_io complete, convert the filled extent to written */
+ if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
+ ret = ext4_convert_unwritten_extents_endio(handle, inode,
path);
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3185,7 +3291,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
{
struct ext4_ext_path *path = NULL;
struct ext4_extent_header *eh;
- struct ext4_extent newex, *ex;
+ struct ext4_extent newex, *ex, *last_ex;
ext4_fsblk_t newblock;
int err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
@@ -3237,10 +3343,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* this situation is possible, though, _during_ tree modification;
* this is why assert can't be put in ext4_ext_find_extent()
*/
- if (path[depth].p_ext == NULL && depth != 0) {
- ext4_error(inode->i_sb, __func__, "bad extent address "
- "inode: %lu, iblock: %d, depth: %d",
- inode->i_ino, iblock, depth);
+ if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+ EXT4_ERROR_INODE(inode, "bad extent address "
+ "iblock: %d, depth: %d pblock %lld",
+ iblock, depth, path[depth].p_block);
err = -EIO;
goto out2;
}
@@ -3258,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
ee_len = ext4_ext_get_actual_len(ex);
/* if found extent covers block, simply return it */
- if (iblock >= ee_block && iblock < ee_block + ee_len) {
+ if (in_range(iblock, ee_block, ee_len)) {
newblock = iblock - ee_block + ee_start;
/* number of remaining blocks in the extent */
allocated = ee_len - (iblock - ee_block);
@@ -3350,21 +3456,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
/*
- * io_end structure was created for every async
- * direct IO write to the middle of the file.
- * To avoid unecessary convertion for every aio dio rewrite
- * to the mid of file, here we flag the IO that is really
- * need the convertion.
+ * io_end structure was created for every IO write to an
+ * uninitialized extent. To avoid unecessary conversion,
+ * here we flag the IO that really needs the conversion.
* For non asycn direct IO case, flag the inode state
* that we need to perform convertion when IO is done.
*/
- if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
if (io)
- io->flag = DIO_AIO_UNWRITTEN;
+ io->flag = EXT4_IO_UNWRITTEN;
else
- EXT4_I(inode)->i_state |=
- EXT4_STATE_DIO_UNWRITTEN;;
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
+ }
+ if (ext4_should_dioread_nolock(inode))
+ set_buffer_uninit(bh_result);
+ }
+
+ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+ if (unlikely(!eh->eh_entries)) {
+ EXT4_ERROR_INODE(inode,
+ "eh->eh_entries == 0 ee_block %d",
+ ex->ee_block);
+ err = -EIO;
+ goto out2;
}
+ last_ex = EXT_LAST_EXTENT(eh);
+ if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+ + ext4_ext_get_actual_len(last_ex))
+ EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
}
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
@@ -3499,6 +3619,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
i_size_write(inode, new_size);
if (new_size > EXT4_I(inode)->i_disksize)
ext4_update_i_disksize(inode, new_size);
+ } else {
+ /*
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if (new_size > i_size_read(inode))
+ EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
}
}
@@ -3603,7 +3730,7 @@ retry:
* Returns 0 on success.
*/
int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- loff_t len)
+ ssize_t len)
{
handle_t *handle;
ext4_lblk_t block;
@@ -3635,7 +3762,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
map_bh.b_state = 0;
ret = ext4_get_blocks(handle, inode, block,
max_blocks, &map_bh,
- EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT);
if (ret <= 0) {
WARN_ON(ret <= 0);
printk(KERN_ERR "%s: ext4_ext_get_blocks "
@@ -3739,7 +3866,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
int error = 0;
/* in-inode? */
- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
struct ext4_iloc iloc;
int offset; /* offset of xattr in inode */
@@ -3767,7 +3894,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
ext4_lblk_t start_blk;
- ext4_lblk_t len_blks;
int error = 0;
/* fallback to generic here if not in extents fmt */
@@ -3781,8 +3907,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
error = ext4_xattr_fiemap(inode, fieinfo);
} else {
+ ext4_lblk_t len_blks;
+ __u64 last_blk;
+
start_blk = start >> inode->i_sb->s_blocksize_bits;
- len_blks = len >> inode->i_sb->s_blocksize_bits;
+ last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
+ if (last_blk >= EXT_MAX_BLOCK)
+ last_blk = EXT_MAX_BLOCK-1;
+ len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
/*
* Walk the extent tree gathering extent information.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 56eee3d796c..d0776e410f3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/quotaops.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -35,9 +36,9 @@
*/
static int ext4_release_file(struct inode *inode, struct file *filp)
{
- if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
ext4_alloc_da_blocks(inode);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+ ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
}
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
@@ -125,7 +126,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
sb->s_dirt = 1;
}
}
- return generic_file_open(inode, filp);
+ return dquot_file_open(inode, filp);
}
const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 98bd140aad0..0d0c3239c1c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -63,7 +63,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (inode->i_sb->s_flags & MS_RDONLY)
return 0;
- ret = flush_aio_dio_completed_IO(inode);
+ ret = flush_completed_IO(inode);
if (ret < 0)
return ret;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ead4f6..361c0b9962a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
- ext4_error(sb, __func__, "Checksum bad for group %u",
- block_group);
+ ext4_error(sb, "Checksum bad for group %u", block_group);
ext4_free_blks_set(sb, gdp, 0);
ext4_free_inodes_set(sb, gdp, 0);
ext4_itable_unused_set(sb, gdp, 0);
@@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
bitmap_blk = ext4_inode_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
- ext4_error(sb, __func__,
- "Cannot read inode bitmap - "
+ ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
return NULL;
@@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
set_bitmap_uptodate(bh);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
- ext4_error(sb, __func__,
- "Cannot read inode bitmap - "
+ ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
return NULL;
@@ -217,10 +214,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
* Note: we must free any quota before locking the superblock,
* as writing the quota to disk may need the lock as well.
*/
- vfs_dq_init(inode);
+ dquot_initialize(inode);
ext4_xattr_delete_inode(handle, inode);
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
is_directory = S_ISDIR(inode->i_mode);
@@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
es = EXT4_SB(sb)->s_es;
if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
- ext4_error(sb, "ext4_free_inode",
- "reserved or nonexistent inode %lu", ino);
+ ext4_error(sb, "reserved or nonexistent inode %lu", ino);
goto error_return;
}
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
bit, bitmap_bh->b_data);
if (!cleared)
- ext4_error(sb, "ext4_free_inode",
- "bit already cleared for inode %lu", ino);
+ ext4_error(sb, "bit already cleared for inode %lu", ino);
else {
gdp = ext4_get_group_desc(sb, block_group, &bh2);
@@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb,
if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
ino > EXT4_INODES_PER_GROUP(sb)) {
ext4_unlock_group(sb, group);
- ext4_error(sb, __func__,
- "reserved inode or inode > inodes count - "
+ ext4_error(sb, "reserved inode or inode > inodes count - "
"block_group = %u, inode=%lu", group,
ino + group * EXT4_INODES_PER_GROUP(sb));
return 1;
@@ -904,7 +898,7 @@ repeat_in_this_group:
BUFFER_TRACE(inode_bitmap_bh,
"call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle,
- inode,
+ NULL,
inode_bitmap_bh);
if (err)
goto fail;
@@ -1029,15 +1023,16 @@ got:
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
- ei->i_state = EXT4_STATE_NEW;
+ ei->i_state_flags = 0;
+ ext4_set_inode_state(inode, EXT4_STATE_NEW);
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
ret = inode;
- if (vfs_dq_alloc_inode(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto fail_drop;
- }
err = ext4_init_acl(handle, inode, dir);
if (err)
@@ -1074,10 +1069,10 @@ really_out:
return ret;
fail_free_drop:
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
fail_drop:
- vfs_dq_drop(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
unlock_new_inode(inode);
@@ -1098,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
/* Error cases - e2fsck has already cleaned up for us */
if (ino > max_ino) {
- ext4_warning(sb, __func__,
- "bad orphan ino %lu! e2fsck was run?", ino);
+ ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
goto error;
}
@@ -1107,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (!bitmap_bh) {
- ext4_warning(sb, __func__,
- "inode bitmap error for orphan %lu", ino);
+ ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
goto error;
}
@@ -1140,8 +1133,7 @@ iget_failed:
err = PTR_ERR(inode);
inode = NULL;
bad_orphan:
- ext4_warning(sb, __func__,
- "bad orphan inode %lu! e2fsck was run?", ino);
+ ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
bit, (unsigned long long)bitmap_bh->b_blocknr,
ext4_test_bit(bit, bitmap_bh->b_data));
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e11952404e0..986120f3006 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
+#include <linux/kernel.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -170,6 +171,9 @@ void ext4_delete_inode(struct inode *inode)
handle_t *handle;
int err;
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
@@ -194,7 +198,7 @@ void ext4_delete_inode(struct inode *inode)
inode->i_size = 0;
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
- ext4_warning(inode->i_sb, __func__,
+ ext4_warning(inode->i_sb,
"couldn't mark inode dirty (err %d)", err);
goto stop_handle;
}
@@ -212,7 +216,7 @@ void ext4_delete_inode(struct inode *inode)
if (err > 0)
err = ext4_journal_restart(handle, 3);
if (err != 0) {
- ext4_warning(inode->i_sb, __func__,
+ ext4_warning(inode->i_sb,
"couldn't extend journal (err %d)", err);
stop_handle:
ext4_journal_stop(handle);
@@ -323,8 +327,7 @@ static int ext4_block_to_path(struct inode *inode,
offsets[n++] = i_block & (ptrs - 1);
final = ptrs;
} else {
- ext4_warning(inode->i_sb, "ext4_block_to_path",
- "block %lu > max in inode %lu",
+ ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
i_block + direct_blocks +
indirect_blocks + double_blocks, inode->i_ino);
}
@@ -344,7 +347,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
if (blk &&
unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
blk, 1))) {
- ext4_error(inode->i_sb, function,
+ __ext4_error(inode->i_sb, function,
"invalid block reference %u "
"in inode #%lu", blk, inode->i_ino);
return -EIO;
@@ -607,7 +610,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
if (*err)
goto failed_out;
- BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+ if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
+ EXT4_ERROR_INODE(inode,
+ "current_block %llu + count %lu > %d!",
+ current_block, count,
+ EXT4_MAX_BLOCK_FILE_PHYS);
+ *err = -EIO;
+ goto failed_out;
+ }
target -= count;
/* allocate blocks for indirect blocks */
@@ -643,7 +653,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
ar.flags = EXT4_MB_HINT_DATA;
current_block = ext4_mb_new_blocks(handle, &ar, err);
- BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
+ if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
+ EXT4_ERROR_INODE(inode,
+ "current_block %llu + ar.len %d > %d!",
+ current_block, ar.len,
+ EXT4_MAX_BLOCK_FILE_PHYS);
+ *err = -EIO;
+ goto failed_out;
+ }
if (*err && (target == blks)) {
/*
@@ -1061,6 +1078,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
int mdb_free = 0, allocated_meta_blocks = 0;
spin_lock(&ei->i_block_reservation_lock);
+ trace_ext4_da_update_reserve_space(inode, used);
if (unlikely(used > ei->i_reserved_data_blocks)) {
ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
"with only %d reserved data blocks\n",
@@ -1093,9 +1111,9 @@ void ext4_da_update_reserve_space(struct inode *inode,
/* Update quota subsystem */
if (quota_claim) {
- vfs_dq_claim_block(inode, used);
+ dquot_claim_block(inode, used);
if (mdb_free)
- vfs_dq_release_reservation_block(inode, mdb_free);
+ dquot_release_reservation_block(inode, mdb_free);
} else {
/*
* We did fallocate with an offset that is already delayed
@@ -1106,8 +1124,8 @@ void ext4_da_update_reserve_space(struct inode *inode,
* that
*/
if (allocated_meta_blocks)
- vfs_dq_claim_block(inode, allocated_meta_blocks);
- vfs_dq_release_reservation_block(inode, mdb_free + used);
+ dquot_claim_block(inode, allocated_meta_blocks);
+ dquot_release_reservation_block(inode, mdb_free + used);
}
/*
@@ -1124,7 +1142,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
sector_t logical, sector_t phys, int len)
{
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
- ext4_error(inode->i_sb, msg,
+ __ext4_error(inode->i_sb, msg,
"inode #%lu logical block %llu mapped to %llu "
"(size %d)", inode->i_ino,
(unsigned long long) logical,
@@ -1306,7 +1324,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* i_data's format changing. Force the migrate
* to fail by clearing migrate flags
*/
- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
}
/*
@@ -1534,6 +1552,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
ext4_truncate(inode);
}
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1575,8 +1595,12 @@ retry:
}
*pagep = page;
- ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext4_get_block);
+ if (ext4_should_dioread_nolock(inode))
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+ fsdata, ext4_get_block_write);
+ else
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+ fsdata, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page),
@@ -1793,7 +1817,7 @@ static int ext4_journalled_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
if (new_i_size > EXT4_I(inode)->i_disksize) {
ext4_update_i_disksize(inode, new_i_size);
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1836,6 +1860,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned long md_needed, md_reserved;
+ int ret;
/*
* recalculate the amount of metadata blocks to reserve
@@ -1846,6 +1871,7 @@ repeat:
spin_lock(&ei->i_block_reservation_lock);
md_reserved = ei->i_reserved_meta_blocks;
md_needed = ext4_calc_metadata_amount(inode, lblock);
+ trace_ext4_da_reserve_space(inode, md_needed);
spin_unlock(&ei->i_block_reservation_lock);
/*
@@ -1853,11 +1879,12 @@ repeat:
* later. Real quota accounting is done at pages writeout
* time.
*/
- if (vfs_dq_reserve_block(inode, md_needed + 1))
- return -EDQUOT;
+ ret = dquot_reserve_block(inode, md_needed + 1);
+ if (ret)
+ return ret;
if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
- vfs_dq_release_reservation_block(inode, md_needed + 1);
+ dquot_release_reservation_block(inode, md_needed + 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
@@ -1914,7 +1941,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- vfs_dq_release_reservation_block(inode, to_free);
+ dquot_release_reservation_block(inode, to_free);
}
static void ext4_da_page_release_reservation(struct page *page,
@@ -2091,6 +2118,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
} else if (buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
+ if (buffer_uninit(exbh))
+ set_buffer_uninit(bh);
cur_logical++;
pblock++;
} while ((bh = bh->b_this_page) != head);
@@ -2133,17 +2162,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- index = page->index;
- if (index > end)
+ if (page->index > end)
break;
- index++;
-
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
block_invalidatepage(page, 0);
ClearPageUptodate(page);
unlock_page(page);
}
+ index = pvec.pages[nr_pages - 1]->index + 1;
+ pagevec_release(&pvec);
}
return;
}
@@ -2220,6 +2248,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
*/
new.b_state = 0;
get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+ if (ext4_should_dioread_nolock(mpd->inode))
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
if (mpd->b_state & (1 << BH_Delay))
get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
@@ -2630,11 +2660,14 @@ static int __ext4_journalled_writepage(struct page *page,
ret = err;
walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
return ret;
}
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2682,7 +2715,7 @@ static int ext4_writepage(struct page *page,
int ret = 0;
loff_t size;
unsigned int len;
- struct buffer_head *page_bufs;
+ struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
trace_ext4_writepage(inode, page);
@@ -2758,7 +2791,11 @@ static int ext4_writepage(struct page *page,
if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
ret = nobh_writepage(page, noalloc_get_block_write, wbc);
- else
+ else if (page_bufs && buffer_uninit(page_bufs)) {
+ ext4_set_bh_endio(page_bufs, inode);
+ ret = block_write_full_page_endio(page, noalloc_get_block_write,
+ wbc, ext4_end_io_buffer_write);
+ } else
ret = block_write_full_page(page, noalloc_get_block_write,
wbc);
@@ -3301,7 +3338,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
filemap_write_and_wait(mapping);
}
- if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+ if (EXT4_JOURNAL(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
/*
* This is a REALLY heavyweight approach, but the use of
* bmap on dirty files is expected to be extremely rare:
@@ -3320,7 +3358,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
* everything they get.
*/
- EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+ ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
journal = EXT4_JOURNAL(inode);
jbd2_journal_lock_updates(journal);
err = jbd2_journal_flush(journal);
@@ -3345,11 +3383,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+ BUG_ON(!io);
+ if (io->page)
+ put_page(io->page);
+ iput(io->inode);
+ kfree(io);
+}
+
+static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+{
+ struct buffer_head *head, *bh;
+ unsigned int curr_off = 0;
+
+ if (!page_has_buffers(page))
+ return;
+ head = bh = page_buffers(page);
+ do {
+ if (offset <= curr_off && test_clear_buffer_uninit(bh)
+ && bh->b_private) {
+ ext4_free_io_end(bh->b_private);
+ bh->b_private = NULL;
+ bh->b_end_io = NULL;
+ }
+ curr_off = curr_off + bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+
static void ext4_invalidatepage(struct page *page, unsigned long offset)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
/*
+ * free any io_end structure allocated for buffers to be discarded
+ */
+ if (ext4_should_dioread_nolock(page->mapping->host))
+ ext4_invalidatepage_free_endio(page, offset);
+ /*
* If it's a full truncate we just forget about the pending dirtying
*/
if (offset == 0)
@@ -3420,7 +3492,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
}
retry:
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+ if (rw == READ && ext4_should_dioread_nolock(inode))
+ ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block, NULL);
+ else
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL);
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3436,6 +3515,9 @@ retry:
* but cannot extend i_size. Bail out and pretend
* the write failed... */
ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
goto out;
}
if (inode->i_nlink)
@@ -3463,75 +3545,63 @@ out:
return ret;
}
-static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- handle_t *handle = NULL;
+ handle_t *handle = ext4_journal_current_handle();
int ret = 0;
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
int dio_credits;
+ int started = 0;
- ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+ ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
inode->i_ino, create);
/*
- * DIO VFS code passes create = 0 flag for write to
- * the middle of file. It does this to avoid block
- * allocation for holes, to prevent expose stale data
- * out when there is parallel buffered read (which does
- * not hold the i_mutex lock) while direct IO write has
- * not completed. DIO request on holes finally falls back
- * to buffered IO for this reason.
- *
- * For ext4 extent based file, since we support fallocate,
- * new allocated extent as uninitialized, for holes, we
- * could fallocate blocks for holes, thus parallel
- * buffered IO read will zero out the page when read on
- * a hole while parallel DIO write to the hole has not completed.
- *
- * when we come here, we know it's a direct IO write to
- * to the middle of file (<i_size)
- * so it's safe to override the create flag from VFS.
+ * ext4_get_block in prepare for a DIO write or buffer write.
+ * We allocate an uinitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after IO complete.
*/
- create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+ create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (max_blocks > DIO_MAX_BLOCKS)
- max_blocks = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
- handle = ext4_journal_start(inode, dio_credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
+ if (!handle) {
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ started = 1;
}
+
ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
create);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
}
- ext4_journal_stop(handle);
+ if (started)
+ ext4_journal_stop(handle);
out:
return ret;
}
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
- BUG_ON(!io);
- iput(io->inode);
- kfree(io);
-}
-static void dump_aio_dio_list(struct inode * inode)
+static void dump_completed_IO(struct inode * inode)
{
#ifdef EXT4_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
+ unsigned long flags;
- if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
- ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+ if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+ ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
return;
}
- ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
- list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+ ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+ list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
@@ -3541,32 +3611,31 @@ static void dump_aio_dio_list(struct inode * inode)
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
io, inode->i_ino, io0, io1);
}
+ spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
#endif
}
/*
* check a range of space and convert unwritten extents to written.
*/
-static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+static int ext4_end_io_nolock(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
- size_t size = io->size;
+ ssize_t size = io->size;
int ret = 0;
- ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+ ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
if (list_empty(&io->list))
return ret;
- if (io->flag != DIO_AIO_UNWRITTEN)
+ if (io->flag != EXT4_IO_UNWRITTEN)
return ret;
- if (offset + size <= i_size_read(inode))
- ret = ext4_convert_unwritten_extents(inode, offset, size);
-
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
if (ret < 0) {
printk(KERN_EMERG "%s: failed to convert unwritten"
"extents to written extents, error is %d"
@@ -3579,50 +3648,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
io->flag = 0;
return ret;
}
+
/*
* work on completed aio dio IO, to convert unwritten extents to extents
*/
-static void ext4_end_aio_dio_work(struct work_struct *work)
+static void ext4_end_io_work(struct work_struct *work)
{
- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- struct inode *inode = io->inode;
- int ret = 0;
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long flags;
+ int ret;
mutex_lock(&inode->i_mutex);
- ret = ext4_end_aio_dio_nolock(io);
- if (ret >= 0) {
- if (!list_empty(&io->list))
- list_del_init(&io->list);
- ext4_free_io_end(io);
+ ret = ext4_end_io_nolock(io);
+ if (ret < 0) {
+ mutex_unlock(&inode->i_mutex);
+ return;
}
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (!list_empty(&io->list))
+ list_del_init(&io->list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
mutex_unlock(&inode->i_mutex);
+ ext4_free_io_end(io);
}
+
/*
* This function is called from ext4_sync_file().
*
- * When AIO DIO IO is completed, the work to convert unwritten
- * extents to written is queued on workqueue but may not get immediately
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
* scheduled. When fsync is called, we need to ensure the
* conversion is complete before fsync returns.
- * The inode keeps track of a list of completed AIO from DIO path
- * that might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents to written.
+ * The inode keeps track of a list of pending/completed IO that
+ * might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents for completed IO
+ * to written.
+ * The function return the number of pending IOs on success.
*/
-int flush_aio_dio_completed_IO(struct inode *inode)
+int flush_completed_IO(struct inode *inode)
{
ext4_io_end_t *io;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long flags;
int ret = 0;
int ret2 = 0;
- if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+ if (list_empty(&ei->i_completed_io_list))
return ret;
- dump_aio_dio_list(inode);
- while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
- io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+ dump_completed_IO(inode);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ while (!list_empty(&ei->i_completed_io_list)){
+ io = list_entry(ei->i_completed_io_list.next,
ext4_io_end_t, list);
/*
- * Calling ext4_end_aio_dio_nolock() to convert completed
+ * Calling ext4_end_io_nolock() to convert completed
* IO to written.
*
* When ext4_sync_file() is called, run_queue() may already
@@ -3635,20 +3718,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)
* avoid double converting from both fsync and background work
* queue work.
*/
- ret = ext4_end_aio_dio_nolock(io);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ ret = ext4_end_io_nolock(io);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
if (ret < 0)
ret2 = ret;
else
list_del_init(&io->list);
}
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
return (ret2 < 0) ? ret2 : 0;
}
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
{
ext4_io_end_t *io = NULL;
- io = kmalloc(sizeof(*io), GFP_NOFS);
+ io = kmalloc(sizeof(*io), flags);
if (io) {
igrab(inode);
@@ -3656,8 +3742,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
io->flag = 0;
io->offset = 0;
io->size = 0;
- io->error = 0;
- INIT_WORK(&io->work, ext4_end_aio_dio_work);
+ io->page = NULL;
+ INIT_WORK(&io->work, ext4_end_io_work);
INIT_LIST_HEAD(&io->list);
}
@@ -3669,6 +3755,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
{
ext4_io_end_t *io_end = iocb->private;
struct workqueue_struct *wq;
+ unsigned long flags;
+ struct ext4_inode_info *ei;
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
@@ -3680,7 +3768,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
size);
/* if not aio dio with unwritten extents, just free io and return */
- if (io_end->flag != DIO_AIO_UNWRITTEN){
+ if (io_end->flag != EXT4_IO_UNWRITTEN){
ext4_free_io_end(io_end);
iocb->private = NULL;
return;
@@ -3688,16 +3776,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
io_end->offset = offset;
io_end->size = size;
+ io_end->flag = EXT4_IO_UNWRITTEN;
wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
/* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);
/* Add the io_end to per-inode completed aio dio list*/
- list_add_tail(&io_end->list,
- &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+ ei = EXT4_I(io_end->inode);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
iocb->private = NULL;
}
+
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
+{
+ ext4_io_end_t *io_end = bh->b_private;
+ struct workqueue_struct *wq;
+ struct inode *inode;
+ unsigned long flags;
+
+ if (!test_clear_buffer_uninit(bh) || !io_end)
+ goto out;
+
+ if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
+ printk("sb umounted, discard end_io request for inode %lu\n",
+ io_end->inode->i_ino);
+ ext4_free_io_end(io_end);
+ goto out;
+ }
+
+ io_end->flag = EXT4_IO_UNWRITTEN;
+ inode = io_end->inode;
+
+ /* Add the io_end to per-inode completed io list*/
+ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+ list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+ spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+ wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+out:
+ bh->b_private = NULL;
+ bh->b_end_io = NULL;
+ clear_buffer_uninit(bh);
+ end_buffer_async_write(bh, uptodate);
+}
+
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+ ext4_io_end_t *io_end;
+ struct page *page = bh->b_page;
+ loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+ size_t size = bh->b_size;
+
+retry:
+ io_end = ext4_init_io_end(inode, GFP_ATOMIC);
+ if (!io_end) {
+ if (printk_ratelimit())
+ printk(KERN_WARNING "%s: allocation fail\n", __func__);
+ schedule();
+ goto retry;
+ }
+ io_end->offset = offset;
+ io_end->size = size;
+ /*
+ * We need to hold a reference to the page to make sure it
+ * doesn't get evicted before ext4_end_io_work() has a chance
+ * to convert the extent from written to unwritten.
+ */
+ io_end->page = page;
+ get_page(io_end->page);
+
+ bh->b_private = io_end;
+ bh->b_end_io = ext4_end_io_buffer_write;
+ return 0;
+}
+
/*
* For ext4 extent files, ext4 will do direct-io write to holes,
* preallocated extents, and those write extend the file, no need to
@@ -3751,7 +3908,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
iocb->private = NULL;
EXT4_I(inode)->cur_aio_dio = NULL;
if (!is_sync_kiocb(iocb)) {
- iocb->private = ext4_init_io_end(inode);
+ iocb->private = ext4_init_io_end(inode, GFP_NOFS);
if (!iocb->private)
return -ENOMEM;
/*
@@ -3767,7 +3924,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
ret = blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
- ext4_get_block_dio_write,
+ ext4_get_block_write,
ext4_end_io_dio);
if (iocb->private)
EXT4_I(inode)->cur_aio_dio = NULL;
@@ -3788,8 +3945,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
ext4_free_io_end(iocb->private);
iocb->private = NULL;
- } else if (ret > 0 && (EXT4_I(inode)->i_state &
- EXT4_STATE_DIO_UNWRITTEN)) {
+ } else if (ret > 0 && ext4_test_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN)) {
int err;
/*
* for non AIO case, since the IO is already
@@ -3799,7 +3956,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
offset, ret);
if (err < 0)
ret = err;
- EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
return ret;
}
@@ -4130,18 +4287,27 @@ no_top:
* We release `count' blocks on disk, but (last - first) may be greater
* than `count' because there can be holes in there.
*/
-static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh,
- ext4_fsblk_t block_to_free,
- unsigned long count, __le32 *first,
- __le32 *last)
+static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh,
+ ext4_fsblk_t block_to_free,
+ unsigned long count, __le32 *first,
+ __le32 *last)
{
__le32 *p;
- int flags = EXT4_FREE_BLOCKS_FORGET;
+ int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
flags |= EXT4_FREE_BLOCKS_METADATA;
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
+ count)) {
+ ext4_error(inode->i_sb, "inode #%lu: "
+ "attempt to clear blocks %llu len %lu, invalid",
+ inode->i_ino, (unsigned long long) block_to_free,
+ count);
+ return 1;
+ }
+
if (try_to_extend_transaction(handle, inode)) {
if (bh) {
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4160,6 +4326,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
*p = 0;
ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
+ return 0;
}
/**
@@ -4215,9 +4382,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
} else if (nr == block_to_free + count) {
count++;
} else {
- ext4_clear_blocks(handle, inode, this_bh,
- block_to_free,
- count, block_to_free_p, p);
+ if (ext4_clear_blocks(handle, inode, this_bh,
+ block_to_free, count,
+ block_to_free_p, p))
+ break;
block_to_free = nr;
block_to_free_p = p;
count = 1;
@@ -4241,7 +4409,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
ext4_handle_dirty_metadata(handle, inode, this_bh);
else
- ext4_error(inode->i_sb, __func__,
+ ext4_error(inode->i_sb,
"circular indirect block detected, "
"inode=%lu, block=%llu",
inode->i_ino,
@@ -4281,6 +4449,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
if (!nr)
continue; /* A hole */
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+ nr, 1)) {
+ ext4_error(inode->i_sb,
+ "indirect mapped block in inode "
+ "#%lu invalid (level %d, blk #%lu)",
+ inode->i_ino, depth,
+ (unsigned long) nr);
+ break;
+ }
+
/* Go read the buffer for the next level down */
bh = sb_bread(inode->i_sb, nr);
@@ -4289,7 +4467,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
* (should be rare).
*/
if (!bh) {
- ext4_error(inode->i_sb, "ext4_free_branches",
+ ext4_error(inode->i_sb,
"Read failure, inode=%lu, block=%llu",
inode->i_ino, nr);
continue;
@@ -4433,8 +4611,10 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
+ EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
- ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+ ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
ext4_ext_truncate(inode);
@@ -4604,9 +4784,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
bh = sb_getblk(sb, block);
if (!bh) {
- ext4_error(sb, "ext4_get_inode_loc", "unable to read "
- "inode block - inode=%lu, block=%llu",
- inode->i_ino, block);
+ ext4_error(sb, "unable to read inode block - "
+ "inode=%lu, block=%llu", inode->i_ino, block);
return -EIO;
}
if (!buffer_uptodate(bh)) {
@@ -4704,9 +4883,8 @@ make_io:
submit_bh(READ_META, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- ext4_error(sb, __func__,
- "unable to read inode block - inode=%lu, "
- "block=%llu", inode->i_ino, block);
+ ext4_error(sb, "unable to read inode block - inode=%lu,"
+ " block=%llu", inode->i_ino, block);
brelse(bh);
return -EIO;
}
@@ -4720,7 +4898,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
/* We have all inode data except xattrs in memory here. */
return __ext4_get_inode_loc(inode, iloc,
- !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+ !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}
void ext4_set_inode_flags(struct inode *inode)
@@ -4814,7 +4992,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
}
inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
- ei->i_state = 0;
+ ei->i_state_flags = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -4897,7 +5075,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_GOOD_OLD_INODE_SIZE +
ei->i_extra_isize;
if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ei->i_state |= EXT4_STATE_XATTR;
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
}
} else
ei->i_extra_isize = 0;
@@ -4917,8 +5095,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ret = 0;
if (ei->i_file_acl &&
!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
- ext4_error(sb, __func__,
- "bad extended attribute block %llu in inode #%lu",
+ ext4_error(sb, "bad extended attribute block %llu inode #%lu",
ei->i_file_acl, inode->i_ino);
ret = -EIO;
goto bad_inode;
@@ -4964,8 +5141,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
} else {
ret = -EIO;
- ext4_error(inode->i_sb, __func__,
- "bogus i_mode (%o) for inode=%lu",
+ ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
inode->i_mode, inode->i_ino);
goto bad_inode;
}
@@ -5037,7 +5213,7 @@ static int ext4_do_update_inode(handle_t *handle,
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
- if (ei->i_state & EXT4_STATE_NEW)
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
ext4_get_inode_flags(ei);
@@ -5101,7 +5277,7 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
sb->s_dirt = 1;
ext4_handle_sync(handle);
- err = ext4_handle_dirty_metadata(handle, inode,
+ err = ext4_handle_dirty_metadata(handle, NULL,
EXT4_SB(sb)->s_sbh);
}
}
@@ -5130,10 +5306,10 @@ static int ext4_do_update_inode(handle_t *handle,
}
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, inode, bh);
+ rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
err = rc;
- ei->i_state &= ~EXT4_STATE_NEW;
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
ext4_update_inode_fsync_trans(handle, inode, 0);
out_brelse:
@@ -5177,7 +5353,7 @@ out_brelse:
* `stuff()' is running, and the new i_size will be lost. Plus the inode
* will no longer be on the superblock's dirty inode list.
*/
-int ext4_write_inode(struct inode *inode, int wait)
+int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
@@ -5191,7 +5367,7 @@ int ext4_write_inode(struct inode *inode, int wait)
return -EIO;
}
- if (!wait)
+ if (wbc->sync_mode != WB_SYNC_ALL)
return 0;
err = ext4_force_commit(inode->i_sb);
@@ -5201,13 +5377,11 @@ int ext4_write_inode(struct inode *inode, int wait)
err = ext4_get_inode_loc(inode, &iloc);
if (err)
return err;
- if (wait)
+ if (wbc->sync_mode == WB_SYNC_ALL)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
- ext4_error(inode->i_sb, __func__,
- "IO error syncing inode, "
- "inode=%lu, block=%llu",
- inode->i_ino,
+ ext4_error(inode->i_sb, "IO error syncing inode, "
+ "inode=%lu, block=%llu", inode->i_ino,
(unsigned long long)iloc.bh->b_blocknr);
err = -EIO;
}
@@ -5249,6 +5423,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ if (ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
handle_t *handle;
@@ -5261,7 +5437,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
error = PTR_ERR(handle);
goto err_out;
}
- error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
ext4_journal_stop(handle);
return error;
@@ -5288,7 +5464,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+ attr->ia_valid & ATTR_SIZE &&
+ (attr->ia_size < inode->i_size ||
+ (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
handle_t *handle;
handle = ext4_journal_start(inode, 3);
@@ -5319,6 +5497,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
goto err_out;
}
}
+ /* ext4_truncate will clear the flag */
+ if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+ ext4_truncate(inode);
}
rc = inode_setattr(inode, attr);
@@ -5557,8 +5738,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
entry = IFIRST(header);
/* No extended attributes present */
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
- header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
new_extra_isize);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5602,7 +5783,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (ext4_handle_valid(handle) &&
EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
- !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+ !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
/*
* We need extra buffer credits since we may write into EA block
* with this same handle. If journal_extend fails, then it will
@@ -5616,10 +5797,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
sbi->s_want_extra_isize,
iloc, handle);
if (ret) {
- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+ ext4_set_inode_state(inode,
+ EXT4_STATE_NO_EXPAND);
if (mnt_count !=
le16_to_cpu(sbi->s_es->s_mnt_count)) {
- ext4_warning(inode->i_sb, __func__,
+ ext4_warning(inode->i_sb,
"Unable to expand inode %lu. Delete"
" some EAs or run e2fsck.",
inode->i_ino);
@@ -5641,7 +5823,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* i_size has been changed by generic_commit_write() and we thus need
* to include the updated inode in the current transaction.
*
- * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
+ * Also, dquot_alloc_block() will always dirty the inode when blocks
* are allocated to the file.
*
* If the inode is marked synchronous, we don't honour that here - doing
@@ -5683,7 +5865,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
err = jbd2_journal_get_write_access(handle, iloc.bh);
if (!err)
err = ext4_handle_dirty_metadata(handle,
- inode,
+ NULL,
iloc.bh);
brelse(iloc.bh);
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b63d193126d..016d0249294 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags &= ~EXT4_EXTENTS_FL;
}
+ if (flags & EXT4_EOFBLOCKS_FL) {
+ /* we don't support adding EOFBLOCKS flag */
+ if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+ } else if (oldflags & EXT4_EOFBLOCKS_FL)
+ ext4_truncate(inode);
+
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
@@ -249,7 +258,8 @@ setversion_out:
if (me.moved_len > 0)
file_remove_suid(donor_filp);
- if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+ if (copy_to_user((struct move_extent __user *)arg,
+ &me, sizeof(me)))
err = -EFAULT;
mext_out:
fput(donor_filp);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d34afad3e13..54df209d2ee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -69,7 +69,7 @@
*
* pa_lstart -> the logical start block for this prealloc space
* pa_pstart -> the physical start block for this prealloc space
- * pa_len -> lenght for this prealloc space
+ * pa_len -> length for this prealloc space
* pa_free -> free space available in this prealloc space
*
* The inode preallocation space is used looking at the _logical_ start
@@ -441,10 +441,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
for (i = 0; i < count; i++) {
if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
ext4_fsblk_t blocknr;
- blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+
+ blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += first + i;
- blocknr +=
- le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
ext4_grp_locked_error(sb, e4b->bd_group,
__func__, "double-free of inode"
" %lu's block %llu(bit %u in group %u)",
@@ -1255,10 +1254,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
ext4_fsblk_t blocknr;
- blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+
+ blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += block;
- blocknr +=
- le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
ext4_grp_locked_error(sb, e4b->bd_group,
__func__, "double-free of inode"
" %lu's block %llu(bit %u in group %u)",
@@ -1631,7 +1629,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
int max;
int err;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_super_block *es = sbi->s_es;
struct ext4_free_extent ex;
if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
@@ -1648,8 +1645,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
ext4_fsblk_t start;
- start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) +
- ex.fe_start + le32_to_cpu(es->s_first_data_block);
+ start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
+ ex.fe_start;
/* use do_div to get remainder (would be 64-bit modulo) */
if (do_div(start, sbi->s_stripe) == 0) {
ac->ac_found++;
@@ -1803,8 +1800,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
BUG_ON(sbi->s_stripe == 0);
/* find first stripe-aligned block in group */
- first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb)
- + le32_to_cpu(sbi->s_es->s_first_data_block);
+ first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
+
a = first_group_block + sbi->s_stripe - 1;
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
@@ -2256,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
- meta_group_info[i]->bb_free_root.rb_node = NULL;
+ meta_group_info[i]->bb_free_root = RB_ROOT;
#ifdef DOUBLE_CHECK
{
@@ -2560,12 +2557,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
ext4_unlock_group(sb, entry->group);
if (test_opt(sb, DISCARD)) {
ext4_fsblk_t discard_block;
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- discard_block = (ext4_fsblk_t)entry->group *
- EXT4_BLOCKS_PER_GROUP(sb)
- + entry->start_blk
- + le32_to_cpu(es->s_first_data_block);
+ discard_block = entry->start_blk +
+ ext4_group_first_block_no(sb, entry->group);
trace_ext4_discard_blocks(sb,
(unsigned long long)discard_block,
entry->count);
@@ -2703,14 +2697,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (err)
goto out_err;
- block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
- + ac->ac_b_ex.fe_start
- + le32_to_cpu(es->s_first_data_block);
+ block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
len = ac->ac_b_ex.fe_len;
if (!ext4_data_block_valid(sbi, block, len)) {
- ext4_error(sb, __func__,
- "Allocating blocks %llu-%llu which overlap "
+ ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
"fs metadata\n", block, block+len);
/* File system mounted not to panic on error
* Fix the bitmap and repeat the block allocation
@@ -3161,9 +3152,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;
- goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
- ac->ac_g_ex.fe_start +
- le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
+ goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
/*
* search for the prealloc space that is having
* minimal distance from the goal block.
@@ -3526,8 +3515,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
- le32_to_cpu(sbi->s_es->s_first_data_block);
+ start = ext4_group_first_block_no(sb, group) + bit;
mb_debug(1, " free preallocated %u/%u in group %u\n",
(unsigned) start, (unsigned) next - bit,
(unsigned) group);
@@ -3623,15 +3611,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
- ext4_error(sb, __func__, "Error in reading block "
- "bitmap for %u", group);
+ ext4_error(sb, "Error reading block bitmap for %u", group);
return 0;
}
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
- ext4_error(sb, __func__, "Error in loading buddy "
- "information for %u", group);
+ ext4_error(sb, "Error loading buddy information for %u", group);
put_bh(bitmap_bh);
return 0;
}
@@ -3804,15 +3790,15 @@ repeat:
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
- ext4_error(sb, __func__, "Error in loading buddy "
- "information for %u", group);
+ ext4_error(sb, "Error loading buddy information for %u",
+ group);
continue;
}
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
- ext4_error(sb, __func__, "Error in reading block "
- "bitmap for %u", group);
+ ext4_error(sb, "Error reading block bitmap for %u",
+ group);
ext4_mb_release_desc(&e4b);
continue;
}
@@ -3938,7 +3924,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
/* don't use group allocation for large files */
size = max(size, isize);
- if (size >= sbi->s_mb_stream_request) {
+ if (size > sbi->s_mb_stream_request) {
ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
}
@@ -4077,8 +4063,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
- ext4_error(sb, __func__, "Error in loading buddy "
- "information for %u", group);
+ ext4_error(sb, "Error loading buddy information for %u",
+ group);
continue;
}
ext4_lock_group(sb, group);
@@ -4254,7 +4240,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
return 0;
}
reserv_blks = ar->len;
- while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) {
+ while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
ar->len--;
}
@@ -4331,7 +4317,7 @@ out2:
kmem_cache_free(ext4_ac_cachep, ac);
out1:
if (inquota && ar->len < inquota)
- vfs_dq_free_block(ar->inode, inquota - ar->len);
+ dquot_free_block(ar->inode, inquota - ar->len);
out3:
if (!ar->len) {
if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
@@ -4476,10 +4462,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
es = EXT4_SB(sb)->s_es;
- if (!ext4_data_block_valid(sbi, block, count)) {
- ext4_error(sb, __func__,
- "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_data_block_valid(sbi, block, count)) {
+ ext4_error(sb, "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
goto error_return;
}
@@ -4547,8 +4533,7 @@ do_more:
in_range(block + count - 1, ext4_inode_table(sb, gdp),
EXT4_SB(sb)->s_itb_per_group)) {
- ext4_error(sb, __func__,
- "Freeing blocks in system zone - "
+ ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
goto error_return;
@@ -4646,7 +4631,7 @@ do_more:
sb->s_dirt = 1;
error_return:
if (freed)
- vfs_dq_free_block(inode, freed);
+ dquot_free_block(inode, freed);
brelse(bitmap_bh);
ext4_std_error(sb, err);
if (ac)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 436521cae45..b619322c76f 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -220,16 +220,9 @@ struct ext4_buddy {
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
struct ext4_free_extent *fex)
{
- ext4_fsblk_t block;
-
- block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
- + fex->fe_start
- + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
- return block;
+ return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
}
#endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 81415814b00..8b87bd0eac9 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -365,12 +365,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* happened after we started the migrate. We need to
* fail the migrate
*/
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
+ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
retval = -EAGAIN;
up_write(&EXT4_I(inode)->i_data_sem);
goto err_out;
} else
- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
/*
* We have the extent map build with the tmp inode.
* Now copy the i_data across
@@ -503,14 +503,10 @@ int ext4_ext_migrate(struct inode *inode)
}
i_size_write(tmp_inode, i_size_read(inode));
/*
- * We don't want the inode to be reclaimed
- * if we got interrupted in between. We have
- * this tmp inode carrying reference to the
- * data blocks of the original file. We set
- * the i_nlink to zero at the last stage after
- * switching the original file to extent format
+ * Set the i_nlink to zero so it will be deleted later
+ * when we drop inode reference.
*/
- tmp_inode->i_nlink = 1;
+ tmp_inode->i_nlink = 0;
ext4_ext_tree_init(handle, tmp_inode);
ext4_orphan_add(handle, tmp_inode);
@@ -533,10 +529,20 @@ int ext4_ext_migrate(struct inode *inode)
* allocation.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
up_read((&EXT4_I(inode)->i_data_sem));
handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ /*
+ * It is impossible to update on-disk structures without
+ * a handle, so just rollback in-core changes and live other
+ * work to orphan_list_cleanup()
+ */
+ ext4_orphan_del(NULL, tmp_inode);
+ retval = PTR_ERR(handle);
+ goto out;
+ }
ei = EXT4_I(inode);
i_data = ei->i_data;
@@ -618,15 +624,8 @@ err_out:
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
-
- /*
- * Set the i_nlink to zero so that
- * generic_drop_inode really deletes the
- * inode
- */
- tmp_inode->i_nlink = 0;
-
ext4_journal_stop(handle);
+out:
unlock_new_inode(tmp_inode);
iput(tmp_inode);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 82c415be87a..aa5fe28d180 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -152,12 +152,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
int ret = 0;
if (inode1 == NULL) {
- ext4_error(inode2->i_sb, function,
+ __ext4_error(inode2->i_sb, function,
"Both inodes should not be NULL: "
"inode1 NULL inode2 %lu", inode2->i_ino);
ret = -EIO;
} else if (inode2 == NULL) {
- ext4_error(inode1->i_sb, function,
+ __ext4_error(inode1->i_sb, function,
"Both inodes should not be NULL: "
"inode1 %lu inode2 NULL", inode1->i_ino);
ret = -EIO;
@@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}
o_start->ee_len = start_ext->ee_len;
+ eblock = le32_to_cpu(start_ext->ee_block);
new_flag = 1;
} else if (start_ext->ee_len && new_ext->ee_len &&
@@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
* orig |------------------------------|
*/
o_start->ee_len = start_ext->ee_len;
+ eblock = le32_to_cpu(start_ext->ee_block);
new_flag = 1;
} else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -475,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
struct ext4_extent new_ext, start_ext, end_ext;
ext4_lblk_t new_ext_end;
- ext4_fsblk_t new_phys_end;
int oext_alen, new_ext_alen, end_ext_alen;
int depth = ext_depth(orig_inode);
int ret;
@@ -489,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
new_ext.ee_len = dext->ee_len;
new_ext_alen = ext4_ext_get_actual_len(&new_ext);
new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
- new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
/*
* Case: original extent is first
@@ -502,6 +502,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
le32_to_cpu(oext->ee_block) + oext_alen) {
start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
le32_to_cpu(oext->ee_block));
+ start_ext.ee_block = oext->ee_block;
copy_extent_status(oext, &start_ext);
} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
prev_ext = oext - 1;
@@ -515,6 +516,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
start_ext.ee_len = cpu_to_le16(
ext4_ext_get_actual_len(prev_ext) +
new_ext_alen);
+ start_ext.ee_block = oext->ee_block;
copy_extent_status(prev_ext, &start_ext);
new_ext.ee_len = 0;
}
@@ -526,7 +528,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* new_ext |-------|
*/
if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
- ext4_error(orig_inode->i_sb, __func__,
+ ext4_error(orig_inode->i_sb,
"new_ext_end(%u) should be less than or equal to "
"oext->ee_block(%u) + oext_alen(%d) - 1",
new_ext_end, le32_to_cpu(oext->ee_block),
@@ -689,12 +691,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
while (1) {
/* The extent for donor must be found. */
if (!dext) {
- ext4_error(donor_inode->i_sb, __func__,
+ ext4_error(donor_inode->i_sb,
"The extent for donor must be found");
*err = -EIO;
goto out;
} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
- ext4_error(donor_inode->i_sb, __func__,
+ ext4_error(donor_inode->i_sb,
"Donor offset(%u) and the first block of donor "
"extent(%u) should be equal",
donor_off,
@@ -928,7 +930,7 @@ out2:
}
/**
- * mext_check_argumants - Check whether move extent can be done
+ * mext_check_arguments - Check whether move extent can be done
*
* @orig_inode: original inode
* @donor_inode: donor inode
@@ -949,14 +951,6 @@ mext_check_arguments(struct inode *orig_inode,
unsigned int blkbits = orig_inode->i_blkbits;
unsigned int blocksize = 1 << blkbits;
- /* Regular file check */
- if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
- ext4_debug("ext4 move extent: The argument files should be "
- "regular file [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
ext4_debug("ext4 move extent: suid or sgid is set"
" to donor file [ino:orig %lu, donor %lu]\n",
@@ -1204,6 +1198,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
return -EINVAL;
}
+ /* Regular file check */
+ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+ ext4_debug("ext4 move extent: The argument files should be "
+ "regular file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
/* Protect orig and donor inodes against a truncate */
ret1 = mext_inode_double_lock(orig_inode, donor_inode);
if (ret1 < 0)
@@ -1351,7 +1353,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (ret1 < 0)
break;
if (*moved_len > len) {
- ext4_error(orig_inode->i_sb, __func__,
+ ext4_error(orig_inode->i_sb,
"We replaced blocks too much! "
"sum of replaced: %llu requested: %llu",
*moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 17a17e10dd6..0c070fabd10 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
root->info.hash_version != DX_HASH_LEGACY) {
- ext4_warning(dir->i_sb, __func__,
- "Unrecognised inode hash code %d",
+ ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
root->info.hash_version);
brelse(bh);
*err = ERR_BAD_DX_DIR;
@@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
hash = hinfo->hash;
if (root->info.unused_flags & 1) {
- ext4_warning(dir->i_sb, __func__,
- "Unimplemented inode hash flags: %#06x",
+ ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
root->info.unused_flags);
brelse(bh);
*err = ERR_BAD_DX_DIR;
@@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
}
if ((indirect = root->info.indirect_levels) > 1) {
- ext4_warning(dir->i_sb, __func__,
- "Unimplemented inode hash depth: %#06x",
+ ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
root->info.indirect_levels);
brelse(bh);
*err = ERR_BAD_DX_DIR;
@@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
if (dx_get_limit(entries) != dx_root_limit(dir,
root->info.info_length)) {
- ext4_warning(dir->i_sb, __func__,
- "dx entry: limit != root limit");
+ ext4_warning(dir->i_sb, "dx entry: limit != root limit");
brelse(bh);
*err = ERR_BAD_DX_DIR;
goto fail;
@@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
{
count = dx_get_count(entries);
if (!count || count > dx_get_limit(entries)) {
- ext4_warning(dir->i_sb, __func__,
+ ext4_warning(dir->i_sb,
"dx entry: no count or count > limit");
brelse(bh);
*err = ERR_BAD_DX_DIR;
@@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
goto fail2;
at = entries = ((struct dx_node *) bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit (dir)) {
- ext4_warning(dir->i_sb, __func__,
+ ext4_warning(dir->i_sb,
"dx entry: limit != node limit");
brelse(bh);
*err = ERR_BAD_DX_DIR;
@@ -494,7 +490,7 @@ fail2:
}
fail:
if (*err == ERR_BAD_DX_DIR)
- ext4_warning(dir->i_sb, __func__,
+ ext4_warning(dir->i_sb,
"Corrupt dir inode %ld, running e2fsck is "
"recommended.", dir->i_ino);
return NULL;
@@ -947,9 +943,8 @@ restart:
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* read error, skip block & hope for the best */
- ext4_error(sb, __func__, "reading directory #%lu "
- "offset %lu", dir->i_ino,
- (unsigned long)block);
+ ext4_error(sb, "reading directory #%lu offset %lu",
+ dir->i_ino, (unsigned long)block);
brelse(bh);
goto next;
}
@@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
retval = ext4_htree_next_block(dir, hash, frame,
frames, NULL);
if (retval < 0) {
- ext4_warning(sb, __func__,
+ ext4_warning(sb,
"error reading index page in directory #%lu",
dir->i_ino);
*err = retval;
@@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
__u32 ino = le32_to_cpu(de->inode);
brelse(bh);
if (!ext4_valid_inum(dir->i_sb, ino)) {
- ext4_error(dir->i_sb, "ext4_lookup",
- "bad inode number: %u", ino);
+ ext4_error(dir->i_sb, "bad inode number: %u", ino);
return ERR_PTR(-EIO);
}
inode = ext4_iget(dir->i_sb, ino);
if (unlikely(IS_ERR(inode))) {
if (PTR_ERR(inode) == -ESTALE) {
- ext4_error(dir->i_sb, __func__,
+ ext4_error(dir->i_sb,
"deleted inode referenced: %u",
ino);
return ERR_PTR(-EIO);
@@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
brelse(bh);
if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
- ext4_error(child->d_inode->i_sb, "ext4_get_parent",
+ ext4_error(child->d_inode->i_sb,
"bad inode number: %u", ino);
return ERR_PTR(-EIO);
}
@@ -1410,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
de = (struct ext4_dir_entry_2 *)((char *)fde +
ext4_rec_len_from_disk(fde->rec_len, blocksize));
if ((char *) de >= (((char *) root) + blocksize)) {
- ext4_error(dir->i_sb, __func__,
+ ext4_error(dir->i_sb,
"invalid rec_len for '..' in inode %lu",
dir->i_ino);
brelse(bh);
@@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (levels && (dx_get_count(frames->entries) ==
dx_get_limit(frames->entries))) {
- ext4_warning(sb, __func__,
- "Directory index full!");
+ ext4_warning(sb, "Directory index full!");
err = -ENOSPC;
goto cleanup;
}
@@ -1766,6 +1759,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
struct inode *inode;
int err, retries = 0;
+ dquot_initialize(dir);
+
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1800,6 +1795,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1837,6 +1834,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if (EXT4_DIR_LINK_MAX(dir))
return -EMLINK;
+ dquot_initialize(dir);
+
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1916,11 +1915,11 @@ static int empty_dir(struct inode *inode)
if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
!(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
if (err)
- ext4_error(inode->i_sb, __func__,
+ ext4_error(inode->i_sb,
"error %d reading directory #%lu offset 0",
err, inode->i_ino);
else
- ext4_warning(inode->i_sb, __func__,
+ ext4_warning(inode->i_sb,
"bad directory (dir #%lu) - no data block",
inode->i_ino);
return 1;
@@ -1931,7 +1930,7 @@ static int empty_dir(struct inode *inode)
!le32_to_cpu(de1->inode) ||
strcmp(".", de->name) ||
strcmp("..", de1->name)) {
- ext4_warning(inode->i_sb, "empty_dir",
+ ext4_warning(inode->i_sb,
"bad directory (dir #%lu) - no `.' or `..'",
inode->i_ino);
brelse(bh);
@@ -1949,7 +1948,7 @@ static int empty_dir(struct inode *inode)
offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
if (!bh) {
if (err)
- ext4_error(sb, __func__,
+ ext4_error(sb,
"error %d reading directory"
" #%lu offset %u",
err, inode->i_ino, offset);
@@ -2020,11 +2019,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
goto out_unlock;
+ /*
+ * Due to previous errors inode may be already a part of on-disk
+ * orphan list. If so skip on-disk list modification.
+ */
+ if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
+ goto mem_insert;
/* Insert this inode at the head of the on-disk orphan list... */
NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
err = rc;
@@ -2037,6 +2043,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
*
* This is safe: on error we're going to ignore the orphan list
* anyway on the next recovery. */
+mem_insert:
if (!err)
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
@@ -2096,7 +2103,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
if (err)
goto out_brelse;
sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
} else {
struct ext4_iloc iloc2;
struct inode *i_prev =
@@ -2136,7 +2143,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go in
* separate transaction */
- vfs_dq_init(dentry->d_inode);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
+
handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2163,7 +2172,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
if (retval)
goto end_rmdir;
if (!EXT4_DIR_LINK_EMPTY(inode))
- ext4_warning(inode->i_sb, "ext4_rmdir",
+ ext4_warning(inode->i_sb,
"empty directory has too many links (%d)",
inode->i_nlink);
inode->i_version++;
@@ -2195,7 +2204,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- vfs_dq_init(dentry->d_inode);
+ dquot_initialize(dir);
+ dquot_initialize(dentry->d_inode);
+
handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2215,7 +2226,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
goto end_unlink;
if (!inode->i_nlink) {
- ext4_warning(inode->i_sb, "ext4_unlink",
+ ext4_warning(inode->i_sb,
"Deleting nonexistent file (%lu), %d",
inode->i_ino, inode->i_nlink);
inode->i_nlink = 1;
@@ -2250,6 +2261,8 @@ static int ext4_symlink(struct inode *dir,
if (l > dir->i_sb->s_blocksize)
return -ENAMETOOLONG;
+ dquot_initialize(dir);
+
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
@@ -2308,6 +2321,8 @@ static int ext4_link(struct dentry *old_dentry,
if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
+ dquot_initialize(dir);
+
/*
* Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
* otherwise has the potential to corrupt the orphan inode list.
@@ -2358,12 +2373,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ext4_dir_entry_2 *old_de, *new_de;
int retval, force_da_alloc = 0;
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_bh = new_bh = dir_bh = NULL;
/* Initialize quotas before so that eventual writes go
* in separate transaction */
if (new_dentry->d_inode)
- vfs_dq_init(new_dentry->d_inode);
+ dquot_initialize(new_dentry->d_inode);
handle = ext4_journal_start(old_dir, 2 *
EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2462,7 +2480,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
if (retval) {
- ext4_warning(old_dir->i_sb, "ext4_rename",
+ ext4_warning(old_dir->i_sb,
"Deleting old file (%lu), %d, error=%d",
old_dir->i_ino, old_dir->i_nlink, retval);
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3b2c5541d8a..5692c48754a 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb,
ext4_get_group_no_and_offset(sb, start, NULL, &offset);
if (group != sbi->s_groups_count)
- ext4_warning(sb, __func__,
- "Cannot add at group %u (only %u groups)",
+ ext4_warning(sb, "Cannot add at group %u (only %u groups)",
input->group, sbi->s_groups_count);
else if (offset != 0)
- ext4_warning(sb, __func__, "Last group not full");
+ ext4_warning(sb, "Last group not full");
else if (input->reserved_blocks > input->blocks_count / 5)
- ext4_warning(sb, __func__, "Reserved blocks too high (%u)",
+ ext4_warning(sb, "Reserved blocks too high (%u)",
input->reserved_blocks);
else if (free_blocks_count < 0)
- ext4_warning(sb, __func__, "Bad blocks count %u",
+ ext4_warning(sb, "Bad blocks count %u",
input->blocks_count);
else if (!(bh = sb_bread(sb, end - 1)))
- ext4_warning(sb, __func__,
- "Cannot read last block (%llu)",
+ ext4_warning(sb, "Cannot read last block (%llu)",
end - 1);
else if (outside(input->block_bitmap, start, end))
- ext4_warning(sb, __func__,
- "Block bitmap not in group (block %llu)",
+ ext4_warning(sb, "Block bitmap not in group (block %llu)",
(unsigned long long)input->block_bitmap);
else if (outside(input->inode_bitmap, start, end))
- ext4_warning(sb, __func__,
- "Inode bitmap not in group (block %llu)",
+ ext4_warning(sb, "Inode bitmap not in group (block %llu)",
(unsigned long long)input->inode_bitmap);
else if (outside(input->inode_table, start, end) ||
outside(itend - 1, start, end))
- ext4_warning(sb, __func__,
- "Inode table not in group (blocks %llu-%llu)",
+ ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
(unsigned long long)input->inode_table, itend - 1);
else if (input->inode_bitmap == input->block_bitmap)
- ext4_warning(sb, __func__,
- "Block bitmap same as inode bitmap (%llu)",
+ ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
(unsigned long long)input->block_bitmap);
else if (inside(input->block_bitmap, input->inode_table, itend))
- ext4_warning(sb, __func__,
- "Block bitmap (%llu) in inode table (%llu-%llu)",
+ ext4_warning(sb, "Block bitmap (%llu) in inode table "
+ "(%llu-%llu)",
(unsigned long long)input->block_bitmap,
(unsigned long long)input->inode_table, itend - 1);
else if (inside(input->inode_bitmap, input->inode_table, itend))
- ext4_warning(sb, __func__,
- "Inode bitmap (%llu) in inode table (%llu-%llu)",
+ ext4_warning(sb, "Inode bitmap (%llu) in inode table "
+ "(%llu-%llu)",
(unsigned long long)input->inode_bitmap,
(unsigned long long)input->inode_table, itend - 1);
else if (inside(input->block_bitmap, start, metaend))
- ext4_warning(sb, __func__,
- "Block bitmap (%llu) in GDT table"
- " (%llu-%llu)",
+ ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
(unsigned long long)input->block_bitmap,
start, metaend - 1);
else if (inside(input->inode_bitmap, start, metaend))
- ext4_warning(sb, __func__,
- "Inode bitmap (%llu) in GDT table"
- " (%llu-%llu)",
+ ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
(unsigned long long)input->inode_bitmap,
start, metaend - 1);
else if (inside(input->inode_table, start, metaend) ||
inside(itend - 1, start, metaend))
- ext4_warning(sb, __func__,
- "Inode table (%llu-%llu) overlaps"
- "GDT table (%llu-%llu)",
+ ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
+ "(%llu-%llu)",
(unsigned long long)input->inode_table,
itend - 1, start, metaend - 1);
else
@@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb,
while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
if (le32_to_cpu(*p++) !=
grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
- ext4_warning(sb, __func__,
- "reserved GDT %llu"
+ ext4_warning(sb, "reserved GDT %llu"
" missing grp %d (%llu)",
blk, grp,
grp *
@@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
*/
if (EXT4_SB(sb)->s_sbh->b_blocknr !=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
- ext4_warning(sb, __func__,
- "won't resize using backup superblock at %llu",
+ ext4_warning(sb, "won't resize using backup superblock at %llu",
(unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
return -EPERM;
}
@@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
data = (__le32 *)dind->b_data;
if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
- ext4_warning(sb, __func__,
- "new group %u GDT block %llu not reserved",
+ ext4_warning(sb, "new group %u GDT block %llu not reserved",
input->group, gdblock);
err = -EINVAL;
goto exit_dind;
@@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
GFP_NOFS);
if (!n_group_desc) {
err = -ENOMEM;
- ext4_warning(sb, __func__,
+ ext4_warning(sb,
"not enough memory for %lu groups", gdb_num + 1);
goto exit_inode;
}
@@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
/* Get each reserved primary GDT block and verify it holds backups */
for (res = 0; res < reserved_gdb; res++, blk++) {
if (le32_to_cpu(*data) != blk) {
- ext4_warning(sb, __func__,
- "reserved block %llu"
+ ext4_warning(sb, "reserved block %llu"
" not at offset %ld",
blk,
(long)(data - (__le32 *)dind->b_data));
@@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb,
*/
exit_err:
if (err) {
- ext4_warning(sb, __func__,
- "can't update backup for group %u (err %d), "
+ ext4_warning(sb, "can't update backup for group %u (err %d), "
"forcing fsck on next reboot", group, err);
sbi->s_mount_state &= ~EXT4_VALID_FS;
sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
- ext4_warning(sb, __func__,
- "Can't resize non-sparse filesystem further");
+ ext4_warning(sb, "Can't resize non-sparse filesystem further");
return -EPERM;
}
if (ext4_blocks_count(es) + input->blocks_count <
ext4_blocks_count(es)) {
- ext4_warning(sb, __func__, "blocks_count overflow");
+ ext4_warning(sb, "blocks_count overflow");
return -EINVAL;
}
if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
le32_to_cpu(es->s_inodes_count)) {
- ext4_warning(sb, __func__, "inodes_count overflow");
+ ext4_warning(sb, "inodes_count overflow");
return -EINVAL;
}
@@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
if (!EXT4_HAS_COMPAT_FEATURE(sb,
EXT4_FEATURE_COMPAT_RESIZE_INODE)
|| !le16_to_cpu(es->s_reserved_gdt_blocks)) {
- ext4_warning(sb, __func__,
+ ext4_warning(sb,
"No reserved GDT blocks, can't resize");
return -EPERM;
}
inode = ext4_iget(sb, EXT4_RESIZE_INO);
if (IS_ERR(inode)) {
- ext4_warning(sb, __func__,
- "Error opening resize inode");
+ ext4_warning(sb, "Error opening resize inode");
return PTR_ERR(inode);
}
}
@@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
mutex_lock(&sbi->s_resize_lock);
if (input->group != sbi->s_groups_count) {
- ext4_warning(sb, __func__,
- "multiple resizers run on filesystem!");
+ ext4_warning(sb, "multiple resizers run on filesystem!");
err = -EBUSY;
goto exit_journal;
}
@@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
" too large to resize to %llu blocks safely\n",
sb->s_id, n_blocks_count);
if (sizeof(sector_t) < 8)
- ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled");
+ ext4_warning(sb, "CONFIG_LBDAF not enabled");
return -EINVAL;
}
if (n_blocks_count < o_blocks_count) {
- ext4_warning(sb, __func__,
- "can't shrink FS - resize aborted");
+ ext4_warning(sb, "can't shrink FS - resize aborted");
return -EBUSY;
}
@@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
if (last == 0) {
- ext4_warning(sb, __func__,
- "need to use ext2online to resize further");
+ ext4_warning(sb, "need to use ext2online to resize further");
return -EPERM;
}
add = EXT4_BLOCKS_PER_GROUP(sb) - last;
if (o_blocks_count + add < o_blocks_count) {
- ext4_warning(sb, __func__, "blocks_count overflow");
+ ext4_warning(sb, "blocks_count overflow");
return -EINVAL;
}
@@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
add = n_blocks_count - o_blocks_count;
if (o_blocks_count + add < n_blocks_count)
- ext4_warning(sb, __func__,
- "will only finish group (%llu"
- " blocks, %u new)",
+ ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
o_blocks_count + add, add);
/* See if the device is actually as big as what was requested */
bh = sb_bread(sb, o_blocks_count + add - 1);
if (!bh) {
- ext4_warning(sb, __func__,
- "can't read last block, resize aborted");
+ ext4_warning(sb, "can't read last block, resize aborted");
return -ENOSPC;
}
brelse(bh);
@@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
handle = ext4_journal_start_sb(sb, 3);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
- ext4_warning(sb, __func__, "error %d on journal start", err);
+ ext4_warning(sb, "error %d on journal start", err);
goto exit_put;
}
mutex_lock(&EXT4_SB(sb)->s_resize_lock);
if (o_blocks_count != ext4_blocks_count(es)) {
- ext4_warning(sb, __func__,
- "multiple resizers run on filesystem!");
+ ext4_warning(sb, "multiple resizers run on filesystem!");
mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
ext4_journal_stop(handle);
err = -EBUSY;
@@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
if ((err = ext4_journal_get_write_access(handle,
EXT4_SB(sb)->s_sbh))) {
- ext4_warning(sb, __func__,
- "error %d on journal write access", err);
+ ext4_warning(sb, "error %d on journal write access", err);
mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
ext4_journal_stop(handle);
goto exit_put;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 735c20d5fd5..ba191dae873 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -302,7 +302,7 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
* write out the superblock safely.
*
* We'll just use the jbd2_journal_abort() error code to record an error in
- * the journal instead. On recovery, the journal will compain about
+ * the journal instead. On recovery, the journal will complain about
* that error until we've noted it down and cleared it.
*/
@@ -333,7 +333,7 @@ static void ext4_handle_error(struct super_block *sb)
sb->s_id);
}
-void ext4_error(struct super_block *sb, const char *function,
+void __ext4_error(struct super_block *sb, const char *function,
const char *fmt, ...)
{
va_list args;
@@ -347,6 +347,42 @@ void ext4_error(struct super_block *sb, const char *function,
ext4_handle_error(sb);
}
+void ext4_error_inode(const char *function, struct inode *inode,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
+ inode->i_sb->s_id, function, inode->i_ino, current->comm);
+ vprintk(fmt, args);
+ printk("\n");
+ va_end(args);
+
+ ext4_handle_error(inode->i_sb);
+}
+
+void ext4_error_file(const char *function, struct file *file,
+ const char *fmt, ...)
+{
+ va_list args;
+ struct inode *inode = file->f_dentry->d_inode;
+ char pathname[80], *path;
+
+ va_start(args, fmt);
+ path = d_path(&(file->f_path), pathname, sizeof(pathname));
+ if (!path)
+ path = "(unknown)";
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
+ inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
+ vprintk(fmt, args);
+ printk("\n");
+ va_end(args);
+
+ ext4_handle_error(inode->i_sb);
+}
+
static const char *ext4_decode_error(struct super_block *sb, int errno,
char nbuf[16])
{
@@ -450,7 +486,7 @@ void ext4_msg (struct super_block * sb, const char *prefix,
va_end(args);
}
-void ext4_warning(struct super_block *sb, const char *function,
+void __ext4_warning(struct super_block *sb, const char *function,
const char *fmt, ...)
{
va_list args;
@@ -507,7 +543,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
return;
- ext4_warning(sb, __func__,
+ ext4_warning(sb,
"updating to rev %d because of new feature flag, "
"running e2fsck is recommended",
EXT4_DYNAMIC_REV);
@@ -708,7 +744,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
#endif
- INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+ INIT_LIST_HEAD(&ei->i_completed_io_list);
+ spin_lock_init(&ei->i_completed_io_lock);
ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
@@ -761,6 +798,7 @@ static void destroy_inodecache(void)
static void ext4_clear_inode(struct inode *inode)
{
+ dquot_drop(inode);
ext4_discard_preallocations(inode);
if (EXT4_JOURNAL(inode))
jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
@@ -796,10 +834,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
if (sbi->s_qf_names[GRPQUOTA])
seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
- if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
+ if (test_opt(sb, USRQUOTA))
seq_puts(seq, ",usrquota");
- if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
+ if (test_opt(sb, GRPQUOTA))
seq_puts(seq, ",grpquota");
#endif
}
@@ -926,6 +964,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, NOLOAD))
seq_puts(seq, ",norecovery");
+ if (test_opt(sb, DIOREAD_NOLOCK))
+ seq_puts(seq, ",dioread_nolock");
+
ext4_show_quota_options(seq, sb);
return 0;
@@ -1012,19 +1053,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
static const struct dquot_operations ext4_quota_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .reserve_space = dquot_reserve_space,
- .claim_space = dquot_claim_space,
- .release_rsv = dquot_release_reserved_space,
#ifdef CONFIG_QUOTA
.get_reserved_space = ext4_get_reserved_space,
#endif
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
.write_dquot = ext4_write_dquot,
.acquire_dquot = ext4_acquire_dquot,
.release_dquot = ext4_release_dquot,
@@ -1109,6 +1140,7 @@ enum {
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard,
};
@@ -1176,6 +1208,8 @@ static const match_table_t tokens = {
{Opt_auto_da_alloc, "auto_da_alloc=%u"},
{Opt_auto_da_alloc, "auto_da_alloc"},
{Opt_noauto_da_alloc, "noauto_da_alloc"},
+ {Opt_dioread_nolock, "dioread_nolock"},
+ {Opt_dioread_lock, "dioread_lock"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_err, NULL},
@@ -1205,6 +1239,66 @@ static ext4_fsblk_t get_sb_block(void **data)
}
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+ "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
+
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char *qname;
+
+ if (sb_any_quota_loaded(sb) &&
+ !sbi->s_qf_names[qtype]) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot change journaled "
+ "quota options when quota turned on");
+ return 0;
+ }
+ qname = match_strdup(args);
+ if (!qname) {
+ ext4_msg(sb, KERN_ERR,
+ "Not enough memory for storing quotafile name");
+ return 0;
+ }
+ if (sbi->s_qf_names[qtype] &&
+ strcmp(sbi->s_qf_names[qtype], qname)) {
+ ext4_msg(sb, KERN_ERR,
+ "%s quota file already specified", QTYPE2NAME(qtype));
+ kfree(qname);
+ return 0;
+ }
+ sbi->s_qf_names[qtype] = qname;
+ if (strchr(sbi->s_qf_names[qtype], '/')) {
+ ext4_msg(sb, KERN_ERR,
+ "quotafile must be on filesystem root");
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ return 0;
+ }
+ set_opt(sbi->s_mount_opt, QUOTA);
+ return 1;
+}
+
+static int clear_qf_name(struct super_block *sb, int qtype)
+{
+
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sb_any_quota_loaded(sb) &&
+ sbi->s_qf_names[qtype]) {
+ ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+ " when quota turned on");
+ return 0;
+ }
+ /*
+ * The space will be released later when all options are confirmed
+ * to be correct
+ */
+ sbi->s_qf_names[qtype] = NULL;
+ return 1;
+}
+#endif
static int parse_options(char *options, struct super_block *sb,
unsigned long *journal_devnum,
@@ -1217,8 +1311,7 @@ static int parse_options(char *options, struct super_block *sb,
int data_opt = 0;
int option;
#ifdef CONFIG_QUOTA
- int qtype, qfmt;
- char *qname;
+ int qfmt;
#endif
if (!options)
@@ -1229,19 +1322,31 @@ static int parse_options(char *options, struct super_block *sb,
if (!*p)
continue;
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = 0;
token = match_token(p, tokens, args);
switch (token) {
case Opt_bsd_df:
+ ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
clear_opt(sbi->s_mount_opt, MINIX_DF);
break;
case Opt_minix_df:
+ ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
set_opt(sbi->s_mount_opt, MINIX_DF);
+
break;
case Opt_grpid:
+ ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
set_opt(sbi->s_mount_opt, GRPID);
+
break;
case Opt_nogrpid:
+ ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
clear_opt(sbi->s_mount_opt, GRPID);
+
break;
case Opt_resuid:
if (match_int(&args[0], &option))
@@ -1378,14 +1483,13 @@ static int parse_options(char *options, struct super_block *sb,
data_opt = EXT4_MOUNT_WRITEBACK_DATA;
datacheck:
if (is_remount) {
- if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
- != data_opt) {
+ if (test_opt(sb, DATA_FLAGS) != data_opt) {
ext4_msg(sb, KERN_ERR,
"Cannot change data mode on remount");
return 0;
}
} else {
- sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
+ clear_opt(sbi->s_mount_opt, DATA_FLAGS);
sbi->s_mount_opt |= data_opt;
}
break;
@@ -1397,63 +1501,22 @@ static int parse_options(char *options, struct super_block *sb,
break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
- qtype = USRQUOTA;
- goto set_qf_name;
- case Opt_grpjquota:
- qtype = GRPQUOTA;
-set_qf_name:
- if (sb_any_quota_loaded(sb) &&
- !sbi->s_qf_names[qtype]) {
- ext4_msg(sb, KERN_ERR,
- "Cannot change journaled "
- "quota options when quota turned on");
+ if (!set_qf_name(sb, USRQUOTA, &args[0]))
return 0;
- }
- qname = match_strdup(&args[0]);
- if (!qname) {
- ext4_msg(sb, KERN_ERR,
- "Not enough memory for "
- "storing quotafile name");
- return 0;
- }
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext4_msg(sb, KERN_ERR,
- "%s quota file already "
- "specified", QTYPE2NAME(qtype));
- kfree(qname);
- return 0;
- }
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
- ext4_msg(sb, KERN_ERR,
- "quotafile must be on "
- "filesystem root");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
+ break;
+ case Opt_grpjquota:
+ if (!set_qf_name(sb, GRPQUOTA, &args[0]))
return 0;
- }
- set_opt(sbi->s_mount_opt, QUOTA);
break;
case Opt_offusrjquota:
- qtype = USRQUOTA;
- goto clear_qf_name;
+ if (!clear_qf_name(sb, USRQUOTA))
+ return 0;
+ break;
case Opt_offgrpjquota:
- qtype = GRPQUOTA;
-clear_qf_name:
- if (sb_any_quota_loaded(sb) &&
- sbi->s_qf_names[qtype]) {
- ext4_msg(sb, KERN_ERR, "Cannot change "
- "journaled quota options when "
- "quota turned on");
+ if (!clear_qf_name(sb, GRPQUOTA))
return 0;
- }
- /*
- * The space will be released later when all options
- * are confirmed to be correct
- */
- sbi->s_qf_names[qtype] = NULL;
break;
+
case Opt_jqfmt_vfsold:
qfmt = QFMT_VFS_OLD;
goto set_qf_format;
@@ -1518,10 +1581,11 @@ set_qf_format:
clear_opt(sbi->s_mount_opt, BARRIER);
break;
case Opt_barrier:
- if (match_int(&args[0], &option)) {
- set_opt(sbi->s_mount_opt, BARRIER);
- break;
- }
+ if (args[0].from) {
+ if (match_int(&args[0], &option))
+ return 0;
+ } else
+ option = 1; /* No argument, default to 1 */
if (option)
set_opt(sbi->s_mount_opt, BARRIER);
else
@@ -1594,10 +1658,11 @@ set_qf_format:
set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
break;
case Opt_auto_da_alloc:
- if (match_int(&args[0], &option)) {
- clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
- break;
- }
+ if (args[0].from) {
+ if (match_int(&args[0], &option))
+ return 0;
+ } else
+ option = 1; /* No argument, default to 1 */
if (option)
clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
else
@@ -1609,6 +1674,12 @@ set_qf_format:
case Opt_nodiscard:
clear_opt(sbi->s_mount_opt, DISCARD);
break;
+ case Opt_dioread_nolock:
+ set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+ break;
+ case Opt_dioread_lock:
+ clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+ break;
default:
ext4_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" "
@@ -1618,18 +1689,13 @@ set_qf_format:
}
#ifdef CONFIG_QUOTA
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
- if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
- sbi->s_qf_names[USRQUOTA])
+ if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sbi->s_mount_opt, USRQUOTA);
- if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
- sbi->s_qf_names[GRPQUOTA])
+ if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
clear_opt(sbi->s_mount_opt, GRPQUOTA);
- if ((sbi->s_qf_names[USRQUOTA] &&
- (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
- (sbi->s_qf_names[GRPQUOTA] &&
- (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
+ if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
ext4_msg(sb, KERN_ERR, "old and new quota "
"format mixing");
return 0;
@@ -1939,7 +2005,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
}
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
- vfs_dq_init(inode);
+ dquot_initialize(inode);
if (inode->i_nlink) {
ext4_msg(sb, KERN_DEBUG,
"%s: truncating inode %lu to %lld bytes",
@@ -2292,7 +2358,7 @@ static void ext4_sb_release(struct kobject *kobj)
}
-static struct sysfs_ops ext4_attr_ops = {
+static const struct sysfs_ops ext4_attr_ops = {
.show = ext4_attr_show,
.store = ext4_attr_store,
};
@@ -2432,8 +2498,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
if (def_mount_opts & EXT4_DEFM_DEBUG)
set_opt(sbi->s_mount_opt, DEBUG);
- if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
+ if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
+ ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
+ "2.6.38");
set_opt(sbi->s_mount_opt, GRPID);
+ }
if (def_mount_opts & EXT4_DEFM_UID16)
set_opt(sbi->s_mount_opt, NO_UID32);
#ifdef CONFIG_EXT4_FS_XATTR
@@ -2445,11 +2514,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sbi->s_mount_opt, POSIX_ACL);
#endif
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
- sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+ set_opt(sbi->s_mount_opt, JOURNAL_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
- sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+ set_opt(sbi->s_mount_opt, ORDERED_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
- sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
+ set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -2477,7 +2546,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
(EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2766,7 +2835,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
- goto failed_mount4;
+ goto failed_mount_wq;
} else {
clear_opt(sbi->s_mount_opt, DATA_FLAGS);
set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2779,7 +2848,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_64BIT)) {
ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
- goto failed_mount4;
+ goto failed_mount_wq;
}
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2818,7 +2887,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
ext4_msg(sb, KERN_ERR, "Journal does not support "
"requested data journaling mode");
- goto failed_mount4;
+ goto failed_mount_wq;
}
default:
break;
@@ -2826,13 +2895,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
no_journal:
-
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
"its supported only with writeback mode");
clear_opt(sbi->s_mount_opt, NOBH);
}
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
+ "not supported with nobh mode");
+ goto failed_mount_wq;
+ }
}
EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2897,6 +2970,18 @@ no_journal:
"requested data journaling mode");
clear_opt(sbi->s_mount_opt, DELALLOC);
}
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+ "option - requested data journaling mode");
+ clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+ }
+ if (sb->s_blocksize < PAGE_SIZE) {
+ ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+ "option - block size is too small");
+ clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+ }
+ }
err = ext4_setup_system_zone(sb);
if (err) {
@@ -3360,10 +3445,9 @@ static void ext4_clear_journal_err(struct super_block *sb,
char nbuf[16];
errstr = ext4_decode_error(sb, j_errno, nbuf);
- ext4_warning(sb, __func__, "Filesystem error recorded "
+ ext4_warning(sb, "Filesystem error recorded "
"from previous mount: %s", errstr);
- ext4_warning(sb, __func__, "Marking fs in need of "
- "filesystem check.");
+ ext4_warning(sb, "Marking fs in need of filesystem check.");
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -3514,7 +3598,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_abort(sb, __func__, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
es = sbi->s_es;
@@ -3708,7 +3792,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
* Process 1 Process 2
* ext4_create() quota_sync()
* jbd2_journal_start() write_dquot()
- * vfs_dq_init() down(dqio_mutex)
+ * dquot_initialize() down(dqio_mutex)
* down(dqio_mutex) jbd2_journal_start()
*
*/
@@ -3917,9 +4001,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
int err = 0;
int offset = off & (sb->s_blocksize - 1);
- int tocopy;
int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
- size_t towrite = len;
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
@@ -3929,52 +4011,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
+ /*
+ * Since we account only one data block in transaction credits,
+ * then it is impossible to cross a block boundary.
+ */
+ if (sb->s_blocksize - offset < len) {
+ ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+ " cancelled because not block aligned",
+ (unsigned long long)off, (unsigned long long)len);
+ return -EIO;
+ }
+
mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
- while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
- bh = ext4_bread(handle, inode, blk, 1, &err);
- if (!bh)
+ bh = ext4_bread(handle, inode, blk, 1, &err);
+ if (!bh)
+ goto out;
+ if (journal_quota) {
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ brelse(bh);
goto out;
- if (journal_quota) {
- err = ext4_journal_get_write_access(handle, bh);
- if (err) {
- brelse(bh);
- goto out;
- }
}
- lock_buffer(bh);
- memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
- unlock_buffer(bh);
- if (journal_quota)
- err = ext4_handle_dirty_metadata(handle, NULL, bh);
- else {
- /* Always do at least ordered writes for quotas */
- err = ext4_jbd2_file_inode(handle, inode);
- mark_buffer_dirty(bh);
- }
- brelse(bh);
- if (err)
- goto out;
- offset = 0;
- towrite -= tocopy;
- data += tocopy;
- blk++;
}
+ lock_buffer(bh);
+ memcpy(bh->b_data+offset, data, len);
+ flush_dcache_page(bh->b_page);
+ unlock_buffer(bh);
+ if (journal_quota)
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
+ else {
+ /* Always do at least ordered writes for quotas */
+ err = ext4_jbd2_file_inode(handle, inode);
+ mark_buffer_dirty(bh);
+ }
+ brelse(bh);
out:
- if (len == towrite) {
+ if (err) {
mutex_unlock(&inode->i_mutex);
return err;
}
- if (inode->i_size < off+len-towrite) {
- i_size_write(inode, off+len-towrite);
+ if (inode->i_size < off + len) {
+ i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
}
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ext4_mark_inode_dirty(handle, inode);
mutex_unlock(&inode->i_mutex);
- return len - towrite;
+ return len;
}
#endif
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index f3a2f7ed45a..b4c5aa8489d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
if (ext4_xattr_check_block(bh)) {
-bad_block: ext4_error(inode->i_sb, __func__,
+bad_block:
+ ext4_error(inode->i_sb,
"inode %lu: bad block %llu", inode->i_ino,
EXT4_I(inode)->i_file_acl);
error = -EIO;
@@ -267,7 +268,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *end;
int error;
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
return -ENODATA;
error = ext4_get_inode_loc(inode, &iloc);
if (error)
@@ -371,7 +372,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
if (ext4_xattr_check_block(bh)) {
- ext4_error(inode->i_sb, __func__,
+ ext4_error(inode->i_sb,
"inode %lu: bad block %llu", inode->i_ino,
EXT4_I(inode)->i_file_acl);
error = -EIO;
@@ -396,7 +397,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
void *end;
int error;
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
return 0;
error = ext4_get_inode_loc(inode, &iloc);
if (error)
@@ -494,7 +495,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
error = ext4_handle_dirty_metadata(handle, inode, bh);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
ea_bdebug(bh, "refcount now=%d; releasing",
le32_to_cpu(BHDR(bh)->h_refcount));
if (ce)
@@ -665,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
if (ext4_xattr_check_block(bs->bh)) {
- ext4_error(sb, __func__,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ ext4_error(sb, "inode %lu: bad block %llu",
+ inode->i_ino, EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -787,8 +787,8 @@ inserted:
else {
/* The old block is released after updating
the inode. */
- error = -EDQUOT;
- if (vfs_dq_alloc_block(inode, 1))
+ error = dquot_alloc_block(inode, 1);
+ if (error)
goto cleanup;
error = ext4_journal_get_write_access(handle,
new_bh);
@@ -876,13 +876,12 @@ cleanup:
return error;
cleanup_dquot:
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
goto cleanup;
bad_block:
- ext4_error(inode->i_sb, __func__,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+ inode->i_ino, EXT4_I(inode)->i_file_acl);
goto cleanup;
#undef header
@@ -908,7 +907,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
error = ext4_xattr_check_names(IFIRST(header), is->s.end);
if (error)
return error;
@@ -940,10 +939,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
header = IHDR(inode, ext4_raw_inode(&is->iloc));
if (!IS_LAST_ENTRY(s->first)) {
header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
- EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
} else {
header->h_magic = cpu_to_le32(0);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
+ ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
}
return 0;
}
@@ -986,8 +985,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (strlen(name) > 255)
return -ERANGE;
down_write(&EXT4_I(inode)->xattr_sem);
- no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+ no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+ ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
error = ext4_get_inode_loc(inode, &is.iloc);
if (error)
@@ -997,10 +996,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (error)
goto cleanup;
- if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
}
error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1052,7 +1051,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
ext4_xattr_update_super_block(handle, inode->i_sb);
inode->i_ctime = ext4_current_time(inode);
if (!value)
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
/*
* The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1067,7 +1066,7 @@ cleanup:
brelse(is.iloc.bh);
brelse(bs.bh);
if (no_expand == 0)
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
up_write(&EXT4_I(inode)->xattr_sem);
return error;
}
@@ -1195,9 +1194,8 @@ retry:
if (!bh)
goto cleanup;
if (ext4_xattr_check_block(bh)) {
- ext4_error(inode->i_sb, __func__,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+ inode->i_ino, EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -1302,6 +1300,8 @@ retry:
/* Remove the chosen entry from the inode */
error = ext4_xattr_ibody_set(handle, inode, &i, is);
+ if (error)
+ goto cleanup;
entry = IFIRST(header);
if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
@@ -1372,16 +1372,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
goto cleanup;
bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
if (!bh) {
- ext4_error(inode->i_sb, __func__,
- "inode %lu: block %llu read error", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ ext4_error(inode->i_sb, "inode %lu: block %llu read error",
+ inode->i_ino, EXT4_I(inode)->i_file_acl);
goto cleanup;
}
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1)) {
- ext4_error(inode->i_sb, __func__,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+ inode->i_ino, EXT4_I(inode)->i_file_acl);
goto cleanup;
}
ext4_xattr_release_block(handle, inode, bh);
@@ -1506,7 +1504,7 @@ again:
}
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
- ext4_error(inode->i_sb, __func__,
+ ext4_error(inode->i_sb,
"inode %lu: block %lu read error",
inode->i_ino, (unsigned long) ce->e_block);
} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 14da530b05c..0ce143bd7d5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -558,7 +558,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = sbi->free_clusters;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
- buf->f_namelen = sbi->options.isvfat ? 260 : 12;
+ buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
return 0;
}
@@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
return i_pos;
}
-static int fat_write_inode(struct inode *inode, int wait)
+static int __fat_write_inode(struct inode *inode, int wait)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -634,9 +634,14 @@ retry:
return err;
}
+static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
int fat_sync_inode(struct inode *inode)
{
- return fat_write_inode(inode, 1);
+ return __fat_write_inode(inode, 1);
}
EXPORT_SYMBOL_GPL(fat_sync_inode);
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f565f24019b..c1ef5015486 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -502,14 +502,14 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
*outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
if (*outlen < 0)
return *outlen;
- else if (*outlen > 255)
+ else if (*outlen > FAT_LFN_LEN)
return -ENAMETOOLONG;
op = &outname[*outlen * sizeof(wchar_t)];
} else {
if (nls) {
for (i = 0, ip = name, op = outname, *outlen = 0;
- i < len && *outlen <= 255;
+ i < len && *outlen <= FAT_LFN_LEN;
*outlen += 1)
{
if (escape && (*ip == ':')) {
@@ -549,7 +549,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
return -ENAMETOOLONG;
} else {
for (i = 0, ip = name, op = outname, *outlen = 0;
- i < len && *outlen <= 255;
+ i < len && *outlen <= FAT_LFN_LEN;
i++, *outlen += 1)
{
*op++ = *ip++;
@@ -701,6 +701,15 @@ static int vfat_find(struct inode *dir, struct qstr *qname,
return fat_search_long(dir, qname->name, len, sinfo);
}
+/*
+ * (nfsd's) anonymous disconnected dentry?
+ * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job).
+ */
+static int vfat_d_anon_disconn(struct dentry *dentry)
+{
+ return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
+}
+
static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
@@ -729,11 +738,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
}
alias = d_find_alias(inode);
- if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+ if (alias && !vfat_d_anon_disconn(alias)) {
/*
- * This inode has non DCACHE_DISCONNECTED dentry. This
- * means, the user did ->lookup() by an another name
- * (longname vs 8.3 alias of it) in past.
+ * This inode has non anonymous-DCACHE_DISCONNECTED
+ * dentry. This means, the user did ->lookup() by an
+ * another name (longname vs 8.3 alias of it) in past.
*
* Switch to new one for reason of locality if possible.
*/
@@ -743,7 +752,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
iput(inode);
unlock_super(sb);
return alias;
- }
+ } else
+ dput(alias);
+
out:
unlock_super(sb);
dentry->d_op = sb->s_root->d_op;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 97e01dc0d95..452d02f9075 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -344,7 +344,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
switch (cmd) {
case F_DUPFD:
case F_DUPFD_CLOEXEC:
- if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ if (arg >= rlimit(RLIMIT_NOFILE))
break;
err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
if (err >= 0) {
diff --git a/fs/file.c b/fs/file.c
index 38039af6766..34bb7f71d99 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -257,7 +257,7 @@ int expand_files(struct files_struct *files, int nr)
* N.B. For clone tasks sharing a files structure, this test
* will limit the total number of files that can be opened.
*/
- if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ if (nr >= rlimit(RLIMIT_NOFILE))
return -EMFILE;
/* Do we need to expand? */
diff --git a/fs/file_table.c b/fs/file_table.c
index b98404b5438..32d12b78bac 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -393,7 +393,9 @@ retry:
continue;
if (!(f->f_mode & FMODE_WRITE))
continue;
+ spin_lock(&f->f_lock);
f->f_mode &= ~FMODE_WRITE;
+ spin_unlock(&f->f_lock);
if (file_check_writeable(f) != 0)
continue;
file_release_write(f);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1a7c42c64ff..76fc4d594ac 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -381,10 +381,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}
-static int write_inode(struct inode *inode, int sync)
+static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
- return inode->i_sb->s_op->write_inode(inode, sync);
+ return inode->i_sb->s_op->write_inode(inode, wbc);
return 0;
}
@@ -421,7 +421,6 @@ static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
struct address_space *mapping = inode->i_mapping;
- int wait = wbc->sync_mode == WB_SYNC_ALL;
unsigned dirty;
int ret;
@@ -439,7 +438,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* We'll have another go at writing back this inode when we
* completed a full scan of b_io.
*/
- if (!wait) {
+ if (wbc->sync_mode != WB_SYNC_ALL) {
requeue_io(inode);
return 0;
}
@@ -461,15 +460,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
ret = do_writepages(mapping, wbc);
- /* Don't write the inode if only I_DIRTY_PAGES was set */
- if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
- int err = write_inode(inode, wait);
+ /*
+ * Make sure to wait on the data before writing out the metadata.
+ * This is important for filesystems that modify metadata on data
+ * I/O completion.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
}
- if (wait) {
- int err = filemap_fdatawait(mapping);
+ /* Don't write the inode if only I_DIRTY_PAGES was set */
+ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 864dac20a24..cc94bb9563f 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
config FSCACHE
tristate "General filesystem local caching manager"
- depends on EXPERIMENTAL
select SLOW_WORK
help
This option enables a generic filesystem caching manager that can be
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1a822ce2b24..ec14d19ce50 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -850,7 +850,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
req->in.args[0].size = sizeof(*arg);
req->in.args[0].value = arg;
req->out.numargs = 1;
- /* Variable length arguement used for backward compatibility
+ /* Variable length argument used for backward compatibility
with interface version < 7.5. Rest of init_out is zeroed
by do_get_request(), so a short reply is not a problem */
req->out.argvar = 1;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 4dcddf83326..a47b4310711 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,7 +8,6 @@ config GFS2_FS
select FS_POSIX_ACL
select CRC32
select SLOW_WORK
- select QUOTA
select QUOTACTL
help
A cluster filesystem.
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a6abbae8a27..e6dd2aec6f8 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -640,7 +640,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (__mandatory_lock(&ip->i_inode))
+ if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
if (cmd == F_CANCELLK) {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b8025e51cab..3aac46f6853 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -616,7 +616,7 @@ struct gfs2_sbd {
unsigned int sd_log_blks_reserved;
unsigned int sd_log_commited_buf;
unsigned int sd_log_commited_databuf;
- unsigned int sd_log_commited_revoke;
+ int sd_log_commited_revoke;
unsigned int sd_log_num_buf;
unsigned int sd_log_num_revoke;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4511b08fc45..e5bf4b59d46 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -417,7 +417,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
databufhdrs_needed = (sdp->sd_log_commited_databuf +
(dbuf_limit - 1)) / dbuf_limit;
- if (sdp->sd_log_commited_revoke)
+ if (sdp->sd_log_commited_revoke > 0)
revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
sizeof(u64));
@@ -790,7 +790,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
(((int)sdp->sd_log_commited_databuf) >= 0));
sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
- gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
reserved = calc_reserved(sdp);
gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a054b526dc0..c1309ed1c49 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1001,7 +1001,7 @@ static const struct lm_lockops nolock_ops = {
/**
* gfs2_lm_mount - mount a locking protocol
* @sdp: the filesystem
- * @args: mount arguements
+ * @args: mount arguments
* @silent: if 1, don't complain if the FS isn't a GFS2 fs
*
* Returns: errno
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e3bf6eab875..6dbcbad6ab1 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1083,7 +1083,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
}
}
-int gfs2_quota_sync(struct super_block *sb, int type)
+int gfs2_quota_sync(struct super_block *sb, int type, int wait)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_quota_data **qda;
@@ -1127,6 +1127,11 @@ int gfs2_quota_sync(struct super_block *sb, int type)
return error;
}
+static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
+{
+ return gfs2_quota_sync(sb, type, 0);
+}
+
int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
{
struct gfs2_quota_data *qd;
@@ -1382,7 +1387,7 @@ int gfs2_quotad(void *data)
&tune->gt_statfs_quantum);
/* Update quota file */
- quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
+ quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
&quotad_timeo, &tune->gt_quota_quantum);
/* Check for & recover partially truncated inodes */
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e271fa07ad0..195f60c8bd1 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,7 +25,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
u32 uid, u32 gid);
-extern int gfs2_quota_sync(struct super_block *sb, int type);
+extern int gfs2_quota_sync(struct super_block *sb, int type, int wait);
extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
extern int gfs2_quota_init(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e5e22629da6..50aac606b99 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -22,6 +22,7 @@
#include <linux/crc32.h>
#include <linux/time.h>
#include <linux/wait.h>
+#include <linux/writeback.h>
#include "gfs2.h"
#include "incore.h"
@@ -711,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
* Returns: errno
*/
-static int gfs2_write_inode(struct inode *inode, int sync)
+static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -745,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
do_unlock:
gfs2_glock_dq_uninit(&gh);
do_flush:
- if (sync != 0)
+ if (wbc->sync_mode == WB_SYNC_ALL)
gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
return ret;
}
@@ -763,7 +764,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
int error;
flush_workqueue(gfs2_delete_workqueue);
- gfs2_quota_sync(sdp->sd_vfs, 0);
+ gfs2_quota_sync(sdp->sd_vfs, 0, 1);
gfs2_statfs_sync(sdp->sd_vfs, 0);
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a0db1c94317..419042f7f0b 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -49,7 +49,7 @@ static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
return a->store ? a->store(sdp, buf, len) : len;
}
-static struct sysfs_ops gfs2_attr_ops = {
+static const struct sysfs_ops gfs2_attr_ops = {
.show = gfs2_attr_show,
.store = gfs2_attr_store,
};
@@ -167,7 +167,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
if (simple_strtol(buf, NULL, 0) != 1)
return -EINVAL;
- gfs2_quota_sync(sdp->sd_vfs, 0);
+ gfs2_quota_sync(sdp->sd_vfs, 0, 1);
return len;
}
@@ -574,7 +574,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
return 0;
}
-static struct kset_uevent_ops gfs2_uevent_ops = {
+static const struct kset_uevent_ops gfs2_uevent_ops = {
.uevent = gfs2_uevent,
};
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 052387e1167..fe35e3b626c 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops;
extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
-extern int hfs_write_inode(struct inode *, int);
+extern int hfs_write_inode(struct inode *, struct writeback_control *);
extern int hfs_inode_setattr(struct dentry *, struct iattr *);
extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
__be32 log_size, __be32 phys_size, u32 clump_size);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a1cbff2b4d9..14f5cb1b9fd 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
HFS_SB(inode->i_sb)->alloc_blksz);
}
-int hfs_write_inode(struct inode *inode, int unused)
+int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct inode *main_inode = inode;
struct hfs_find_data fd;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 43022f3d514..74b473a8ef9 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -87,7 +87,8 @@ bad_inode:
return ERR_PTR(err);
}
-static int hfsplus_write_inode(struct inode *inode, int unused)
+static int hfsplus_write_inode(struct inode *inode,
+ struct writeback_control *wbc)
{
struct hfsplus_vh *vhdr;
int ret = 0;
diff --git a/fs/inode.c b/fs/inode.c
index 03dfeb2e392..407bf392e20 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -8,7 +8,6 @@
#include <linux/mm.h>
#include <linux/dcache.h>
#include <linux/init.h>
-#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/writeback.h>
#include <linux/module.h>
@@ -314,7 +313,6 @@ void clear_inode(struct inode *inode)
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(inode->i_state & I_CLEAR);
inode_sync_wait(inode);
- vfs_dq_drop(inode);
if (inode->i_sb->s_op->clear_inode)
inode->i_sb->s_op->clear_inode(inode);
if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -1211,8 +1209,6 @@ void generic_delete_inode(struct inode *inode)
if (op->delete_inode) {
void (*delete)(struct inode *) = op->delete_inode;
- if (!is_bad_inode(inode))
- vfs_dq_init(inode);
/* Filesystems implementing their own
* s_op->delete_inode are required to call
* truncate_inode_pages and clear_inode()
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 4bd882548c4..2c90e3ef625 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -862,12 +862,12 @@ restart_loop:
/* A buffer which has been freed while still being
* journaled by a previous transaction may end up still
* being dirty here, but we want to avoid writing back
- * that buffer in the future now that the last use has
- * been committed. That's not only a performance gain,
- * it also stops aliasing problems if the buffer is left
- * behind for writeback and gets reallocated for another
+ * that buffer in the future after the "add to orphan"
+ * operation been committed, That's not only a performance
+ * gain, it also stops aliasing problems if the buffer is
+ * left behind for writeback and gets reallocated for another
* use in a different page. */
- if (buffer_freed(bh)) {
+ if (buffer_freed(bh) && !jh->b_next_transaction) {
clear_buffer_freed(bh);
clear_buffer_jbddirty(bh);
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 006f9ad838a..5ae71e75a49 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1398,7 +1398,7 @@ int journal_stop(handle_t *handle)
* the case where our storage is so fast that it is more optimal to go
* ahead and force a flush and wait for the transaction to be committed
* than it is to wait for an arbitrary amount of time for new writers to
- * join the transaction. We acheive this by measuring how long it takes
+ * join the transaction. We achieve this by measuring how long it takes
* to commit a transaction, and compare it with how long this
* transaction has been running, and if run time < commit time then we
* sleep for the delta and commit. This greatly helps super fast disks
@@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!jh)
goto zap_buffer_no_jh;
+ /*
+ * We cannot remove the buffer from checkpoint lists until the
+ * transaction adding inode to orphan list (let's call it T)
+ * is committed. Otherwise if the transaction changing the
+ * buffer would be cleaned from the journal before T is
+ * committed, a crash will cause that the correct contents of
+ * the buffer will be lost. On the other hand we have to
+ * clear the buffer dirty bit at latest at the moment when the
+ * transaction marking the buffer as freed in the filesystem
+ * structures is committed because from that moment on the
+ * buffer can be reallocated and used by a different page.
+ * Since the block hasn't been freed yet but the inode has
+ * already been added to orphan list, it is safe for us to add
+ * the buffer to BJ_Forget list of the newest transaction.
+ */
transaction = jh->b_transaction;
if (transaction == NULL) {
/* First case: not on any transaction. If it
@@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
goto zap_buffer;
}
/*
- * If it is committing, we simply cannot touch it. We
- * can remove it's next_transaction pointer from the
- * running transaction if that is set, but nothing
- * else. */
+ * The buffer is committing, we simply cannot touch
+ * it. So we just set j_next_transaction to the
+ * running transaction (if there is one) and mark
+ * buffer as freed so that commit code knows it should
+ * clear dirty bits when it is done with the buffer.
+ */
set_buffer_freed(bh);
- if (jh->b_next_transaction) {
- J_ASSERT(jh->b_next_transaction ==
- journal->j_running_transaction);
- jh->b_next_transaction = NULL;
- }
+ if (journal->j_running_transaction && buffer_jbddirty(bh))
+ jh->b_next_transaction = journal->j_running_transaction;
journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
@@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh,
*/
void __journal_refile_buffer(struct journal_head *jh)
{
- int was_dirty;
+ int was_dirty, jlist;
struct buffer_head *bh = jh2bh(jh);
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh)
__journal_temp_unlink_buffer(jh);
jh->b_transaction = jh->b_next_transaction;
jh->b_next_transaction = NULL;
- __journal_file_buffer(jh, jh->b_transaction,
- jh->b_modified ? BJ_Metadata : BJ_Reserved);
+ if (buffer_freed(bh))
+ jlist = BJ_Forget;
+ else if (jh->b_modified)
+ jlist = BJ_Metadata;
+ else
+ jlist = BJ_Reserved;
+ __journal_file_buffer(jh, jh->b_transaction, jlist);
J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
if (was_dirty)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 88684937095..30beb11ef92 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -507,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
if (blocknr < journal->j_tail)
freed = freed + journal->j_last - journal->j_first;
+ trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
jbd_debug(1,
"Cleaning journal tail from %d to %d (offset %lu), "
"freeing %lu\n",
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1bc74b6f26d..671da7fb7ff 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -883,8 +883,7 @@ restart_loop:
spin_unlock(&journal->j_list_lock);
bh = jh2bh(jh);
jbd_lock_bh_state(bh);
- J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
- jh->b_transaction == journal->j_running_transaction);
+ J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
/*
* If there is undo-protected committed data against
@@ -930,12 +929,12 @@ restart_loop:
/* A buffer which has been freed while still being
* journaled by a previous transaction may end up still
* being dirty here, but we want to avoid writing back
- * that buffer in the future now that the last use has
- * been committed. That's not only a performance gain,
- * it also stops aliasing problems if the buffer is left
- * behind for writeback and gets reallocated for another
+ * that buffer in the future after the "add to orphan"
+ * operation been committed, That's not only a performance
+ * gain, it also stops aliasing problems if the buffer is
+ * left behind for writeback and gets reallocated for another
* use in a different page. */
- if (buffer_freed(bh)) {
+ if (buffer_freed(bh) && !jh->b_next_transaction) {
clear_buffer_freed(bh);
clear_buffer_jbddirty(bh);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ac0d027595d..c03d4dce4d7 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/hash.h>
+#include <linux/log2.h>
+#include <linux/vmalloc.h>
#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>
@@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_slab(size_t slab_size);
/*
* Helper function used to manage commit timeouts
@@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal)
}
}
+ /*
+ * Create a slab for this blocksize
+ */
+ err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
+ if (err)
+ return err;
+
/* Let the recovery code check whether it needs to recover any
* data from the journal. */
if (jbd2_journal_recover(journal))
@@ -1807,6 +1817,127 @@ size_t journal_tag_bytes(journal_t *journal)
}
/*
+ * JBD memory management
+ *
+ * These functions are used to allocate block-sized chunks of memory
+ * used for making copies of buffer_head data. Very often it will be
+ * page-sized chunks of data, but sometimes it will be in
+ * sub-page-size chunks. (For example, 16k pages on Power systems
+ * with a 4k block file system.) For blocks smaller than a page, we
+ * use a SLAB allocator. There are slab caches for each block size,
+ * which are allocated at mount time, if necessary, and we only free
+ * (all of) the slab caches when/if the jbd2 module is unloaded. For
+ * this reason we don't need to a mutex to protect access to
+ * jbd2_slab[] allocating or releasing memory; only in
+ * jbd2_journal_create_slab().
+ */
+#define JBD2_MAX_SLABS 8
+static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
+static DECLARE_MUTEX(jbd2_slab_create_sem);
+
+static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
+ "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
+ "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
+};
+
+
+static void jbd2_journal_destroy_slabs(void)
+{
+ int i;
+
+ for (i = 0; i < JBD2_MAX_SLABS; i++) {
+ if (jbd2_slab[i])
+ kmem_cache_destroy(jbd2_slab[i]);
+ jbd2_slab[i] = NULL;
+ }
+}
+
+static int jbd2_journal_create_slab(size_t size)
+{
+ int i = order_base_2(size) - 10;
+ size_t slab_size;
+
+ if (size == PAGE_SIZE)
+ return 0;
+
+ if (i >= JBD2_MAX_SLABS)
+ return -EINVAL;
+
+ if (unlikely(i < 0))
+ i = 0;
+ down(&jbd2_slab_create_sem);
+ if (jbd2_slab[i]) {
+ up(&jbd2_slab_create_sem);
+ return 0; /* Already created */
+ }
+
+ slab_size = 1 << (i+10);
+ jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
+ slab_size, 0, NULL);
+ up(&jbd2_slab_create_sem);
+ if (!jbd2_slab[i]) {
+ printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static struct kmem_cache *get_slab(size_t size)
+{
+ int i = order_base_2(size) - 10;
+
+ BUG_ON(i >= JBD2_MAX_SLABS);
+ if (unlikely(i < 0))
+ i = 0;
+ BUG_ON(jbd2_slab[i] == 0);
+ return jbd2_slab[i];
+}
+
+void *jbd2_alloc(size_t size, gfp_t flags)
+{
+ void *ptr;
+
+ BUG_ON(size & (size-1)); /* Must be a power of 2 */
+
+ flags |= __GFP_REPEAT;
+ if (size == PAGE_SIZE)
+ ptr = (void *)__get_free_pages(flags, 0);
+ else if (size > PAGE_SIZE) {
+ int order = get_order(size);
+
+ if (order < 3)
+ ptr = (void *)__get_free_pages(flags, order);
+ else
+ ptr = vmalloc(size);
+ } else
+ ptr = kmem_cache_alloc(get_slab(size), flags);
+
+ /* Check alignment; SLUB has gotten this wrong in the past,
+ * and this can lead to user data corruption! */
+ BUG_ON(((unsigned long) ptr) & (size-1));
+
+ return ptr;
+}
+
+void jbd2_free(void *ptr, size_t size)
+{
+ if (size == PAGE_SIZE) {
+ free_pages((unsigned long)ptr, 0);
+ return;
+ }
+ if (size > PAGE_SIZE) {
+ int order = get_order(size);
+
+ if (order < 3)
+ free_pages((unsigned long)ptr, order);
+ else
+ vfree(ptr);
+ return;
+ }
+ kmem_cache_free(get_slab(size), ptr);
+};
+
+/*
* Journal_head storage management
*/
static struct kmem_cache *jbd2_journal_head_cache;
@@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void)
jbd2_journal_destroy_revoke_caches();
jbd2_journal_destroy_jbd2_journal_head_cache();
jbd2_journal_destroy_handle_cache();
+ jbd2_journal_destroy_slabs();
}
static int __init journal_init(void)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0512700542..bfc70f57900 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!jh)
goto zap_buffer_no_jh;
+ /*
+ * We cannot remove the buffer from checkpoint lists until the
+ * transaction adding inode to orphan list (let's call it T)
+ * is committed. Otherwise if the transaction changing the
+ * buffer would be cleaned from the journal before T is
+ * committed, a crash will cause that the correct contents of
+ * the buffer will be lost. On the other hand we have to
+ * clear the buffer dirty bit at latest at the moment when the
+ * transaction marking the buffer as freed in the filesystem
+ * structures is committed because from that moment on the
+ * buffer can be reallocated and used by a different page.
+ * Since the block hasn't been freed yet but the inode has
+ * already been added to orphan list, it is safe for us to add
+ * the buffer to BJ_Forget list of the newest transaction.
+ */
transaction = jh->b_transaction;
if (transaction == NULL) {
/* First case: not on any transaction. If it
@@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
} else if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "on committing transaction");
/*
- * If it is committing, we simply cannot touch it. We
- * can remove it's next_transaction pointer from the
- * running transaction if that is set, but nothing
- * else. */
+ * The buffer is committing, we simply cannot touch
+ * it. So we just set j_next_transaction to the
+ * running transaction (if there is one) and mark
+ * buffer as freed so that commit code knows it should
+ * clear dirty bits when it is done with the buffer.
+ */
set_buffer_freed(bh);
- if (jh->b_next_transaction) {
- J_ASSERT(jh->b_next_transaction ==
- journal->j_running_transaction);
- jh->b_next_transaction = NULL;
- }
+ if (journal->j_running_transaction && buffer_jbddirty(bh))
+ jh->b_next_transaction = journal->j_running_transaction;
jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
@@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
*/
void __jbd2_journal_refile_buffer(struct journal_head *jh)
{
- int was_dirty;
+ int was_dirty, jlist;
struct buffer_head *bh = jh2bh(jh);
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
__jbd2_journal_temp_unlink_buffer(jh);
jh->b_transaction = jh->b_next_transaction;
jh->b_next_transaction = NULL;
- __jbd2_journal_file_buffer(jh, jh->b_transaction,
- jh->b_modified ? BJ_Metadata : BJ_Reserved);
+ if (buffer_freed(bh))
+ jlist = BJ_Forget;
+ else if (jh->b_modified)
+ jlist = BJ_Metadata;
+ else
+ jlist = BJ_Reserved;
+ __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
if (was_dirty)
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e22de8397b7..d32ee9412cb 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
else BUG();
}
}
- list->rb_node = NULL;
+ *list = RB_ROOT;
}
static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d66477c3430..213169780b6 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -20,7 +20,6 @@
#include <linux/sched.h>
#include <linux/fs.h>
-#include <linux/quotaops.h>
#include <linux/posix_acl_xattr.h>
#include "jfs_incore.h"
#include "jfs_txnmgr.h"
@@ -174,7 +173,7 @@ cleanup:
return rc;
}
-static int jfs_acl_chmod(struct inode *inode)
+int jfs_acl_chmod(struct inode *inode)
{
struct posix_acl *acl, *clone;
int rc;
@@ -205,26 +204,3 @@ static int jfs_acl_chmod(struct inode *inode)
posix_acl_release(clone);
return rc;
}
-
-int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
- struct inode *inode = dentry->d_inode;
- int rc;
-
- rc = inode_change_ok(inode, iattr);
- if (rc)
- return rc;
-
- if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
- (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
- if (vfs_dq_transfer(inode, iattr))
- return -EDQUOT;
- }
-
- rc = inode_setattr(inode, iattr);
-
- if (!rc && (iattr->ia_valid & ATTR_MODE))
- rc = jfs_acl_chmod(inode);
-
- return rc;
-}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 2b70fa78e4a..14ba982b3f2 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -18,6 +18,7 @@
*/
#include <linux/fs.h>
+#include <linux/quotaops.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_dmap.h"
@@ -47,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file)
{
int rc;
- if ((rc = generic_file_open(inode, file)))
+ if ((rc = dquot_file_open(inode, file)))
return rc;
/*
@@ -88,14 +89,40 @@ static int jfs_release(struct inode *inode, struct file *file)
return 0;
}
+int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = dentry->d_inode;
+ int rc;
+
+ rc = inode_change_ok(inode, iattr);
+ if (rc)
+ return rc;
+
+ if (iattr->ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
+ if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
+ (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
+ rc = dquot_transfer(inode, iattr);
+ if (rc)
+ return rc;
+ }
+
+ rc = inode_setattr(inode, iattr);
+
+ if (!rc && (iattr->ia_valid & ATTR_MODE))
+ rc = jfs_acl_chmod(inode);
+
+ return rc;
+}
+
const struct inode_operations jfs_file_inode_operations = {
.truncate = jfs_truncate,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
.listxattr = jfs_listxattr,
.removexattr = jfs_removexattr,
-#ifdef CONFIG_JFS_POSIX_ACL
.setattr = jfs_setattr,
+#ifdef CONFIG_JFS_POSIX_ACL
.check_acl = jfs_check_acl,
#endif
};
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b2ae190a77b..9dd126276c9 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,6 +22,7 @@
#include <linux/buffer_head.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
+#include <linux/writeback.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
@@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait)
return rc;
}
-int jfs_write_inode(struct inode *inode, int wait)
+int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
+ int wait = wbc->sync_mode == WB_SYNC_ALL;
+
if (test_cflag(COMMIT_Nolink, inode))
return 0;
/*
@@ -146,6 +149,9 @@ void jfs_delete_inode(struct inode *inode)
{
jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
if (!is_bad_inode(inode) &&
(JFS_IP(inode)->fileset == FILESYSTEM_I)) {
truncate_inode_pages(&inode->i_data, 0);
@@ -158,9 +164,9 @@ void jfs_delete_inode(struct inode *inode)
/*
* Free the inode from the quota allocation.
*/
- vfs_dq_init(inode);
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_initialize(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
}
clear_inode(inode);
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index b07bd417ef8..54e07559878 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -22,7 +22,7 @@
int jfs_check_acl(struct inode *, int);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
-int jfs_setattr(struct dentry *, struct iattr *);
+int jfs_acl_chmod(struct inode *inode);
#else
@@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode,
return 0;
}
+static inline int jfs_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+
#endif
#endif /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 925871e9887..0e4623be70c 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
* It's time to move the inline table to an external
* page and begin to build the xtree
*/
- if (vfs_dq_alloc_block(ip, sbi->nbperpage))
+ if (dquot_alloc_block(ip, sbi->nbperpage))
goto clean_up;
if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
- vfs_dq_free_block(ip, sbi->nbperpage);
+ dquot_free_block(ip, sbi->nbperpage);
goto clean_up;
}
@@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
memcpy(&jfs_ip->i_dirtable, temp_table,
sizeof (temp_table));
dbFree(ip, xaddr, sbi->nbperpage);
- vfs_dq_free_block(ip, sbi->nbperpage);
+ dquot_free_block(ip, sbi->nbperpage);
goto clean_up;
}
ip->i_size = PSIZE;
@@ -1027,10 +1027,9 @@ static int dtSplitUp(tid_t tid,
n = xlen;
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, n)) {
- rc = -EDQUOT;
+ rc = dquot_alloc_block(ip, n);
+ if (rc)
goto extendOut;
- }
quota_allocation += n;
if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
@@ -1308,7 +1307,7 @@ static int dtSplitUp(tid_t tid,
/* Rollback quota allocation */
if (rc && quota_allocation)
- vfs_dq_free_block(ip, quota_allocation);
+ dquot_free_block(ip, quota_allocation);
dtSplitUp_Exit:
@@ -1369,9 +1368,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
return -EIO;
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
+ rc = dquot_alloc_block(ip, lengthPXD(pxd));
+ if (rc) {
release_metapage(rmp);
- return -EDQUOT;
+ return rc;
}
jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
@@ -1892,6 +1892,7 @@ static int dtSplitRoot(tid_t tid,
struct dt_lock *dtlck;
struct tlock *tlck;
struct lv *lv;
+ int rc;
/* get split root page */
smp = split->mp;
@@ -1916,9 +1917,10 @@ static int dtSplitRoot(tid_t tid,
rp = rmp->data;
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
+ rc = dquot_alloc_block(ip, lengthPXD(pxd));
+ if (rc) {
release_metapage(rmp);
- return -EDQUOT;
+ return rc;
}
BT_MARK_DIRTY(rmp, ip);
@@ -2287,7 +2289,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
xlen = lengthPXD(&fp->header.self);
/* Free quota allocation. */
- vfs_dq_free_block(ip, xlen);
+ dquot_free_block(ip, xlen);
/* free/invalidate its buffer page */
discard_metapage(fmp);
@@ -2363,7 +2365,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
xlen = lengthPXD(&p->header.self);
/* Free quota allocation */
- vfs_dq_free_block(ip, xlen);
+ dquot_free_block(ip, xlen);
/* free/invalidate its buffer page */
discard_metapage(mp);
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 41d6045dbeb..5d3bbd10f8d 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
}
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, nxlen)) {
+ rc = dquot_alloc_block(ip, nxlen);
+ if (rc) {
dbFree(ip, nxaddr, (s64) nxlen);
mutex_unlock(&JFS_IP(ip)->commit_mutex);
- return -EDQUOT;
+ return rc;
}
/* determine the value of the extent flag */
@@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
*/
if (rc) {
dbFree(ip, nxaddr, nxlen);
- vfs_dq_free_block(ip, nxlen);
+ dquot_free_block(ip, nxlen);
mutex_unlock(&JFS_IP(ip)->commit_mutex);
return (rc);
}
@@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
goto exit;
/* Allocat blocks to quota. */
- if (vfs_dq_alloc_block(ip, nxlen)) {
+ rc = dquot_alloc_block(ip, nxlen);
+ if (rc) {
dbFree(ip, nxaddr, (s64) nxlen);
mutex_unlock(&JFS_IP(ip)->commit_mutex);
- return -EDQUOT;
+ return rc;
}
delta = nxlen - xlen;
@@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
/* extend the extent */
if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
dbFree(ip, xaddr + xlen, delta);
- vfs_dq_free_block(ip, nxlen);
+ dquot_free_block(ip, nxlen);
goto exit;
}
} else {
@@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
*/
if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
dbFree(ip, nxaddr, nxlen);
- vfs_dq_free_block(ip, nxlen);
+ dquot_free_block(ip, nxlen);
goto exit;
}
}
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index dc0e02159ac..829921b6776 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
/*
* Allocate inode to quota.
*/
- if (vfs_dq_alloc_inode(inode)) {
- rc = -EDQUOT;
+ dquot_initialize(inode);
+ rc = dquot_alloc_inode(inode);
+ if (rc)
goto fail_drop;
- }
inode->i_mode = mode;
/* inherit flags from parent */
@@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
return inode;
fail_drop:
- vfs_dq_drop(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
fail_unlock:
inode->i_nlink = 0;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 1eff7db34d6..79e2c79661d 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
extern struct inode *jfs_iget(struct super_block *, unsigned long);
extern int jfs_commit_inode(struct inode *, int);
-extern int jfs_write_inode(struct inode*, int);
+extern int jfs_write_inode(struct inode *, struct writeback_control *);
extern void jfs_delete_inode(struct inode *);
extern void jfs_dirty_inode(struct inode *);
extern void jfs_truncate(struct inode *);
@@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type);
extern void jfs_set_inode_flags(struct inode *);
extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern int jfs_setattr(struct dentry *, struct iattr *);
extern const struct address_space_operations jfs_aops;
extern const struct inode_operations jfs_dir_inode_operations;
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index d654a645864..6c50871e622 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */
hint = addressXAD(xad) + lengthXAD(xad) - 1;
} else
hint = 0;
- if ((rc = vfs_dq_alloc_block(ip, xlen)))
+ if ((rc = dquot_alloc_block(ip, xlen)))
goto out;
if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
- vfs_dq_free_block(ip, xlen);
+ dquot_free_block(ip, xlen);
goto out;
}
}
@@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */
/* undo data extent allocation */
if (*xaddrp == 0) {
dbFree(ip, xaddr, (s64) xlen);
- vfs_dq_free_block(ip, xlen);
+ dquot_free_block(ip, xlen);
}
return rc;
}
@@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip,
rbn = addressPXD(pxd);
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
- rc = -EDQUOT;
+ rc = dquot_alloc_block(ip, lengthPXD(pxd));
+ if (rc)
goto clean_up;
- }
quota_allocation += lengthPXD(pxd);
@@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
/* Rollback quota allocation. */
if (quota_allocation)
- vfs_dq_free_block(ip, quota_allocation);
+ dquot_free_block(ip, quota_allocation);
return (rc);
}
@@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid,
struct pxdlist *pxdlist;
struct tlock *tlck;
struct xtlock *xtlck;
+ int rc;
sp = &JFS_IP(ip)->i_xtroot;
@@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid,
return -EIO;
/* Allocate blocks to quota. */
- if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
+ rc = dquot_alloc_block(ip, lengthPXD(pxd));
+ if (rc) {
release_metapage(rmp);
- return -EDQUOT;
+ return rc;
}
jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
@@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
ip->i_size = newsize;
/* update quota allocation to reflect freed blocks */
- vfs_dq_free_block(ip, nfreed);
+ dquot_free_block(ip, nfreed);
/*
* free tlock of invalidated pages
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index c79a4270f08..4a3e9f39c21 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name);
+ dquot_initialize(dip);
+
/*
* search parent directory for entry/freespace
* (dtSearch() returns parent directory page pinned)
@@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name);
+ dquot_initialize(dip);
+
/* link count overflow on parent directory ? */
if (dip->i_nlink == JFS_LINK_MAX) {
rc = -EMLINK;
@@ -356,7 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
/* Init inode for quota operations. */
- vfs_dq_init(ip);
+ dquot_initialize(dip);
+ dquot_initialize(ip);
/* directory must be empty to be removed */
if (!dtEmpty(ip)) {
@@ -483,7 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
/* Init inode for quota operations. */
- vfs_dq_init(ip);
+ dquot_initialize(dip);
+ dquot_initialize(ip);
if ((rc = get_UCSname(&dname, dentry)))
goto out;
@@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry,
if (ip->i_nlink == 0)
return -ENOENT;
+ dquot_initialize(dir);
+
tid = txBegin(ip->i_sb, 0);
mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
@@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
+ dquot_initialize(dip);
+
ssize = strlen(name) + 1;
/*
@@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
new_dentry->d_name.name);
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_ip = old_dentry->d_inode;
new_ip = new_dentry->d_inode;
@@ -1136,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
} else if (new_ip) {
IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
/* Init inode for quota operations. */
- vfs_dq_init(new_ip);
+ dquot_initialize(new_ip);
}
/*
@@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
jfs_info("jfs_mknod: %s", dentry->d_name.name);
+ dquot_initialize(dir);
+
if ((rc = get_UCSname(&dname, dentry)))
goto out;
@@ -1541,8 +1556,8 @@ const struct inode_operations jfs_dir_inode_operations = {
.getxattr = jfs_getxattr,
.listxattr = jfs_listxattr,
.removexattr = jfs_removexattr,
-#ifdef CONFIG_JFS_POSIX_ACL
.setattr = jfs_setattr,
+#ifdef CONFIG_JFS_POSIX_ACL
.check_acl = jfs_check_acl,
#endif
};
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index d929a822a74..266699deb1c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -131,6 +131,11 @@ static void jfs_destroy_inode(struct inode *inode)
kmem_cache_free(jfs_inode_cachep, ji);
}
+static void jfs_clear_inode(struct inode *inode)
+{
+ dquot_drop(inode);
+}
+
static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -745,6 +750,7 @@ static const struct super_operations jfs_super_operations = {
.dirty_inode = jfs_dirty_inode,
.write_inode = jfs_write_inode,
.delete_inode = jfs_delete_inode,
+ .clear_inode = jfs_clear_inode,
.put_super = jfs_put_super,
.sync_fs = jfs_sync_fs,
.freeze_fs = jfs_freeze,
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fad364548bc..1f594ab2189 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
/* Allocate new blocks to quota. */
- if (vfs_dq_alloc_block(ip, nblocks)) {
- return -EDQUOT;
- }
+ rc = dquot_alloc_block(ip, nblocks);
+ if (rc)
+ return rc;
rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
if (rc) {
/*Rollback quota allocation. */
- vfs_dq_free_block(ip, nblocks);
+ dquot_free_block(ip, nblocks);
return rc;
}
@@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
failed:
/* Rollback quota allocation. */
- vfs_dq_free_block(ip, nblocks);
+ dquot_free_block(ip, nblocks);
dbFree(ip, blkno, nblocks);
return rc;
@@ -538,7 +538,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
if (blocks_needed > current_blocks) {
/* Allocate new blocks to quota. */
- if (vfs_dq_alloc_block(inode, blocks_needed))
+ rc = dquot_alloc_block(inode, blocks_needed);
+ if (rc)
return -EDQUOT;
quota_allocation = blocks_needed;
@@ -602,7 +603,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
clean_up:
/* Rollback quota allocation */
if (quota_allocation)
- vfs_dq_free_block(inode, quota_allocation);
+ dquot_free_block(inode, quota_allocation);
return (rc);
}
@@ -677,7 +678,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
/* If old blocks exist, they must be removed from quota allocation. */
if (old_blocks)
- vfs_dq_free_block(inode, old_blocks);
+ dquot_free_block(inode, old_blocks);
inode->i_ctime = CURRENT_TIME;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 4600c2037b8..bb464d12104 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -479,8 +479,8 @@ again: mutex_lock(&nlm_host_mutex);
}
}
}
-
mutex_unlock(&nlm_host_mutex);
+ nsm_release(nsm);
}
/*
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index f956651d0f6..fefa4df3f00 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -349,9 +349,9 @@ retry:
* nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
* @info: pointer to NLMPROC_SM_NOTIFY arguments
*
- * Returns a matching nsm_handle if found in the nsm cache; the returned
- * nsm_handle's reference count is bumped and sm_monitored is cleared.
- * Otherwise returns NULL if some error occurred.
+ * Returns a matching nsm_handle if found in the nsm cache. The returned
+ * nsm_handle's reference count is bumped. Otherwise returns NULL if some
+ * error occurred.
*/
struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
{
@@ -370,12 +370,6 @@ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
atomic_inc(&cached->sm_count);
spin_unlock(&nsm_lock);
- /*
- * During subsequent lock activity, force a fresh
- * notification to be set up for this host.
- */
- cached->sm_monitored = 0;
-
dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
cached->sm_name, cached->sm_addrbuf,
atomic_read(&cached->sm_count));
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e50cfa3d965..7d150517ddf 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -243,11 +243,9 @@ static int make_socks(struct svc_serv *serv)
if (err < 0)
goto out_err;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
err = create_lockd_family(serv, PF_INET6);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_err;
-#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
warned = 0;
return 0;
diff --git a/fs/locks.c b/fs/locks.c
index ae9ded026b7..ab24d49fc04 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1455,7 +1455,7 @@ EXPORT_SYMBOL(generic_setlease);
* leases held by processes on this node.
*
* There is also no break_lease method; filesystems that
- * handle their own leases shoud break leases themselves from the
+ * handle their own leases should break leases themselves from the
* filesystem's open, create, and (on truncate) setattr methods.
*
* Warning: the only current setlease methods exist only to disable
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
new file mode 100644
index 00000000000..daf9a9b32dd
--- /dev/null
+++ b/fs/logfs/Kconfig
@@ -0,0 +1,17 @@
+config LOGFS
+ tristate "LogFS file system (EXPERIMENTAL)"
+ depends on (MTD || BLOCK) && EXPERIMENTAL
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
+ select CRC32
+ select BTREE
+ help
+ Flash filesystem aimed to scale efficiently to large devices.
+ In comparison to JFFS2 it offers significantly faster mount
+ times and potentially less RAM usage, although the latter has
+ not been measured yet.
+
+ In its current state it is still very experimental and should
+ not be used for other than testing purposes.
+
+ If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
new file mode 100644
index 00000000000..4820027787e
--- /dev/null
+++ b/fs/logfs/Makefile
@@ -0,0 +1,13 @@
+obj-$(CONFIG_LOGFS) += logfs.o
+
+logfs-y += compr.o
+logfs-y += dir.o
+logfs-y += file.o
+logfs-y += gc.o
+logfs-y += inode.o
+logfs-y += journal.o
+logfs-y += readwrite.o
+logfs-y += segment.o
+logfs-y += super.o
+logfs-$(CONFIG_BLOCK) += dev_bdev.o
+logfs-$(CONFIG_MTD) += dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
new file mode 100644
index 00000000000..44bbfd249ab
--- /dev/null
+++ b/fs/logfs/compr.c
@@ -0,0 +1,95 @@
+/*
+ * fs/logfs/compr.c - compression routines
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/vmalloc.h>
+#include <linux/zlib.h>
+
+#define COMPR_LEVEL 3
+
+static DEFINE_MUTEX(compr_mutex);
+static struct z_stream_s stream;
+
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
+{
+ int err, ret;
+
+ ret = -EIO;
+ mutex_lock(&compr_mutex);
+ err = zlib_deflateInit(&stream, COMPR_LEVEL);
+ if (err != Z_OK)
+ goto error;
+
+ stream.next_in = in;
+ stream.avail_in = inlen;
+ stream.total_in = 0;
+ stream.next_out = out;
+ stream.avail_out = outlen;
+ stream.total_out = 0;
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END)
+ goto error;
+
+ err = zlib_deflateEnd(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ if (stream.total_out >= stream.total_in)
+ goto error;
+
+ ret = stream.total_out;
+error:
+ mutex_unlock(&compr_mutex);
+ return ret;
+}
+
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
+{
+ int err, ret;
+
+ ret = -EIO;
+ mutex_lock(&compr_mutex);
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ stream.next_in = in;
+ stream.avail_in = inlen;
+ stream.total_in = 0;
+ stream.next_out = out;
+ stream.avail_out = outlen;
+ stream.total_out = 0;
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END)
+ goto error;
+
+ err = zlib_inflateEnd(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ ret = 0;
+error:
+ mutex_unlock(&compr_mutex);
+ return ret;
+}
+
+int __init logfs_compr_init(void)
+{
+ size_t size = max(zlib_deflate_workspacesize(),
+ zlib_inflate_workspacesize());
+ stream.workspace = vmalloc(size);
+ if (!stream.workspace)
+ return -ENOMEM;
+ return 0;
+}
+
+void logfs_compr_exit(void)
+{
+ vfree(stream.workspace);
+}
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
new file mode 100644
index 00000000000..9718c22f186
--- /dev/null
+++ b/fs/logfs/dev_bdev.c
@@ -0,0 +1,327 @@
+/*
+ * fs/logfs/dev_bdev.c - Device access methods for block devices
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+
+#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
+
+static void request_complete(struct bio *bio, int err)
+{
+ complete((struct completion *)bio->bi_private);
+}
+
+static int sync_request(struct page *page, struct block_device *bdev, int rw)
+{
+ struct bio bio;
+ struct bio_vec bio_vec;
+ struct completion complete;
+
+ bio_init(&bio);
+ bio.bi_io_vec = &bio_vec;
+ bio_vec.bv_page = page;
+ bio_vec.bv_len = PAGE_SIZE;
+ bio_vec.bv_offset = 0;
+ bio.bi_vcnt = 1;
+ bio.bi_idx = 0;
+ bio.bi_size = PAGE_SIZE;
+ bio.bi_bdev = bdev;
+ bio.bi_sector = page->index * (PAGE_SIZE >> 9);
+ init_completion(&complete);
+ bio.bi_private = &complete;
+ bio.bi_end_io = request_complete;
+
+ submit_bio(rw, &bio);
+ generic_unplug_device(bdev_get_queue(bdev));
+ wait_for_completion(&complete);
+ return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
+}
+
+static int bdev_readpage(void *_sb, struct page *page)
+{
+ struct super_block *sb = _sb;
+ struct block_device *bdev = logfs_super(sb)->s_bdev;
+ int err;
+
+ err = sync_request(page, bdev, READ);
+ if (err) {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ } else {
+ SetPageUptodate(page);
+ ClearPageError(page);
+ }
+ unlock_page(page);
+ return err;
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(wq);
+
+static void writeseg_end_io(struct bio *bio, int err)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct super_block *sb = bio->bi_private;
+ struct logfs_super *super = logfs_super(sb);
+ struct page *page;
+
+ BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
+ BUG_ON(err);
+ BUG_ON(bio->bi_vcnt == 0);
+ do {
+ page = bvec->bv_page;
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+
+ end_page_writeback(page);
+ } while (bvec >= bio->bi_io_vec);
+ bio_put(bio);
+ if (atomic_dec_and_test(&super->s_pending_writes))
+ wake_up(&wq);
+}
+
+static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
+ size_t nr_pages)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping = super->s_mapping_inode->i_mapping;
+ struct bio *bio;
+ struct page *page;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
+ int i;
+
+ bio = bio_alloc(GFP_NOFS, max_pages);
+ BUG_ON(!bio); /* FIXME: handle this */
+
+ for (i = 0; i < nr_pages; i++) {
+ if (i >= max_pages) {
+ /* Block layer cannot split bios :( */
+ bio->bi_vcnt = i;
+ bio->bi_idx = 0;
+ bio->bi_size = i * PAGE_SIZE;
+ bio->bi_bdev = super->s_bdev;
+ bio->bi_sector = ofs >> 9;
+ bio->bi_private = sb;
+ bio->bi_end_io = writeseg_end_io;
+ atomic_inc(&super->s_pending_writes);
+ submit_bio(WRITE, bio);
+
+ ofs += i * PAGE_SIZE;
+ index += i;
+ nr_pages -= i;
+ i = 0;
+
+ bio = bio_alloc(GFP_NOFS, max_pages);
+ BUG_ON(!bio);
+ }
+ page = find_lock_page(mapping, index + i);
+ BUG_ON(!page);
+ bio->bi_io_vec[i].bv_page = page;
+ bio->bi_io_vec[i].bv_len = PAGE_SIZE;
+ bio->bi_io_vec[i].bv_offset = 0;
+
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+ }
+ bio->bi_vcnt = nr_pages;
+ bio->bi_idx = 0;
+ bio->bi_size = nr_pages * PAGE_SIZE;
+ bio->bi_bdev = super->s_bdev;
+ bio->bi_sector = ofs >> 9;
+ bio->bi_private = sb;
+ bio->bi_end_io = writeseg_end_io;
+ atomic_inc(&super->s_pending_writes);
+ submit_bio(WRITE, bio);
+ return 0;
+}
+
+static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int head;
+
+ BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
+
+ if (len == 0) {
+ /* This can happen when the object fit perfectly into a
+ * segment, the segment gets written per sync and subsequently
+ * closed.
+ */
+ return;
+ }
+ head = ofs & (PAGE_SIZE - 1);
+ if (head) {
+ ofs -= head;
+ len += head;
+ }
+ len = PAGE_ALIGN(len);
+ __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
+ generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
+}
+
+
+static void erase_end_io(struct bio *bio, int err)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct super_block *sb = bio->bi_private;
+ struct logfs_super *super = logfs_super(sb);
+
+ BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
+ BUG_ON(err);
+ BUG_ON(bio->bi_vcnt == 0);
+ bio_put(bio);
+ if (atomic_dec_and_test(&super->s_pending_writes))
+ wake_up(&wq);
+}
+
+static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
+ size_t nr_pages)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct bio *bio;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
+ int i;
+
+ bio = bio_alloc(GFP_NOFS, max_pages);
+ BUG_ON(!bio); /* FIXME: handle this */
+
+ for (i = 0; i < nr_pages; i++) {
+ if (i >= max_pages) {
+ /* Block layer cannot split bios :( */
+ bio->bi_vcnt = i;
+ bio->bi_idx = 0;
+ bio->bi_size = i * PAGE_SIZE;
+ bio->bi_bdev = super->s_bdev;
+ bio->bi_sector = ofs >> 9;
+ bio->bi_private = sb;
+ bio->bi_end_io = erase_end_io;
+ atomic_inc(&super->s_pending_writes);
+ submit_bio(WRITE, bio);
+
+ ofs += i * PAGE_SIZE;
+ index += i;
+ nr_pages -= i;
+ i = 0;
+
+ bio = bio_alloc(GFP_NOFS, max_pages);
+ BUG_ON(!bio);
+ }
+ bio->bi_io_vec[i].bv_page = super->s_erase_page;
+ bio->bi_io_vec[i].bv_len = PAGE_SIZE;
+ bio->bi_io_vec[i].bv_offset = 0;
+ }
+ bio->bi_vcnt = nr_pages;
+ bio->bi_idx = 0;
+ bio->bi_size = nr_pages * PAGE_SIZE;
+ bio->bi_bdev = super->s_bdev;
+ bio->bi_sector = ofs >> 9;
+ bio->bi_private = sb;
+ bio->bi_end_io = erase_end_io;
+ atomic_inc(&super->s_pending_writes);
+ submit_bio(WRITE, bio);
+ return 0;
+}
+
+static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
+ int ensure_write)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ BUG_ON(to & (PAGE_SIZE - 1));
+ BUG_ON(len & (PAGE_SIZE - 1));
+
+ if (super->s_flags & LOGFS_SB_FLAG_RO)
+ return -EROFS;
+
+ if (ensure_write) {
+ /*
+ * Object store doesn't care whether erases happen or not.
+ * But for the journal they are required. Otherwise a scan
+ * can find an old commit entry and assume it is the current
+ * one, travelling back in time.
+ */
+ do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
+ }
+
+ return 0;
+}
+
+static void bdev_sync(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
+}
+
+static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping = super->s_mapping_inode->i_mapping;
+ filler_t *filler = bdev_readpage;
+
+ *ofs = 0;
+ return read_cache_page(mapping, 0, filler, sb);
+}
+
+static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping = super->s_mapping_inode->i_mapping;
+ filler_t *filler = bdev_readpage;
+ u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
+ pgoff_t index = pos >> PAGE_SHIFT;
+
+ *ofs = pos;
+ return read_cache_page(mapping, index, filler, sb);
+}
+
+static int bdev_write_sb(struct super_block *sb, struct page *page)
+{
+ struct block_device *bdev = logfs_super(sb)->s_bdev;
+
+ /* Nothing special to do for block devices. */
+ return sync_request(page, bdev, WRITE);
+}
+
+static void bdev_put_device(struct super_block *sb)
+{
+ close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
+}
+
+static const struct logfs_device_ops bd_devops = {
+ .find_first_sb = bdev_find_first_sb,
+ .find_last_sb = bdev_find_last_sb,
+ .write_sb = bdev_write_sb,
+ .readpage = bdev_readpage,
+ .writeseg = bdev_writeseg,
+ .erase = bdev_erase,
+ .sync = bdev_sync,
+ .put_device = bdev_put_device,
+};
+
+int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+ const char *devname, struct vfsmount *mnt)
+{
+ struct block_device *bdev;
+
+ bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
+ int mtdnr = MINOR(bdev->bd_dev);
+ close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
+ return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+ }
+
+ return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
+}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
new file mode 100644
index 00000000000..cafb6ef2e05
--- /dev/null
+++ b/fs/logfs/dev_mtd.c
@@ -0,0 +1,254 @@
+/*
+ * fs/logfs/dev_mtd.c - Device access methods for MTD
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/completion.h>
+#include <linux/mount.h>
+#include <linux/sched.h>
+
+#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
+
+static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+{
+ struct mtd_info *mtd = logfs_super(sb)->s_mtd;
+ size_t retlen;
+ int ret;
+
+ ret = mtd->read(mtd, ofs, len, &retlen, buf);
+ BUG_ON(ret == -EINVAL);
+ if (ret)
+ return ret;
+
+ /* Not sure if we should loop instead. */
+ if (retlen != len)
+ return -EIO;
+
+ return 0;
+}
+
+static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct mtd_info *mtd = super->s_mtd;
+ size_t retlen;
+ loff_t page_start, page_end;
+ int ret;
+
+ if (super->s_flags & LOGFS_SB_FLAG_RO)
+ return -EROFS;
+
+ BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
+ BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
+ BUG_ON(len > PAGE_CACHE_SIZE);
+ page_start = ofs & PAGE_CACHE_MASK;
+ page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
+ ret = mtd->write(mtd, ofs, len, &retlen, buf);
+ if (ret || (retlen != len))
+ return -EIO;
+
+ return 0;
+}
+
+/*
+ * For as long as I can remember (since about 2001) mtd->erase has been an
+ * asynchronous interface lacking the first driver to actually use the
+ * asynchronous properties. So just to prevent the first implementor of such
+ * a thing from breaking logfs in 2350, we do the usual pointless dance to
+ * declare a completion variable and wait for completion before returning
+ * from mtd_erase(). What an excercise in futility!
+ */
+static void logfs_erase_callback(struct erase_info *ei)
+{
+ complete((struct completion *)ei->priv);
+}
+
+static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping = super->s_mapping_inode->i_mapping;
+ struct page *page;
+ pgoff_t index = ofs >> PAGE_SHIFT;
+
+ for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
+ page = find_get_page(mapping, index);
+ if (!page)
+ continue;
+ memset(page_address(page), 0xFF, PAGE_SIZE);
+ page_cache_release(page);
+ }
+ return 0;
+}
+
+static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
+ int ensure_write)
+{
+ struct mtd_info *mtd = logfs_super(sb)->s_mtd;
+ struct erase_info ei;
+ DECLARE_COMPLETION_ONSTACK(complete);
+ int ret;
+
+ BUG_ON(len % mtd->erasesize);
+ if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
+ return -EROFS;
+
+ memset(&ei, 0, sizeof(ei));
+ ei.mtd = mtd;
+ ei.addr = ofs;
+ ei.len = len;
+ ei.callback = logfs_erase_callback;
+ ei.priv = (long)&complete;
+ ret = mtd->erase(mtd, &ei);
+ if (ret)
+ return -EIO;
+
+ wait_for_completion(&complete);
+ if (ei.state != MTD_ERASE_DONE)
+ return -EIO;
+ return mtd_erase_mapping(sb, ofs, len);
+}
+
+static void mtd_sync(struct super_block *sb)
+{
+ struct mtd_info *mtd = logfs_super(sb)->s_mtd;
+
+ if (mtd->sync)
+ mtd->sync(mtd);
+}
+
+static int mtd_readpage(void *_sb, struct page *page)
+{
+ struct super_block *sb = _sb;
+ int err;
+
+ err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
+ page_address(page));
+ if (err == -EUCLEAN) {
+ err = 0;
+ /* FIXME: force GC this segment */
+ }
+ if (err) {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ } else {
+ SetPageUptodate(page);
+ ClearPageError(page);
+ }
+ unlock_page(page);
+ return err;
+}
+
+static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping = super->s_mapping_inode->i_mapping;
+ filler_t *filler = mtd_readpage;
+ struct mtd_info *mtd = super->s_mtd;
+
+ if (!mtd->block_isbad)
+ return NULL;
+
+ *ofs = 0;
+ while (mtd->block_isbad(mtd, *ofs)) {
+ *ofs += mtd->erasesize;
+ if (*ofs >= mtd->size)
+ return NULL;
+ }
+ BUG_ON(*ofs & ~PAGE_MASK);
+ return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
+}
+
+static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping = super->s_mapping_inode->i_mapping;
+ filler_t *filler = mtd_readpage;
+ struct mtd_info *mtd = super->s_mtd;
+
+ if (!mtd->block_isbad)
+ return NULL;
+
+ *ofs = mtd->size - mtd->erasesize;
+ while (mtd->block_isbad(mtd, *ofs)) {
+ *ofs -= mtd->erasesize;
+ if (*ofs <= 0)
+ return NULL;
+ }
+ *ofs = *ofs + mtd->erasesize - 0x1000;
+ BUG_ON(*ofs & ~PAGE_MASK);
+ return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
+}
+
+static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
+ size_t nr_pages)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping = super->s_mapping_inode->i_mapping;
+ struct page *page;
+ int i, err;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = find_lock_page(mapping, index + i);
+ BUG_ON(!page);
+
+ err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
+ page_address(page));
+ unlock_page(page);
+ page_cache_release(page);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int head;
+
+ if (super->s_flags & LOGFS_SB_FLAG_RO)
+ return;
+
+ if (len == 0) {
+ /* This can happen when the object fit perfectly into a
+ * segment, the segment gets written per sync and subsequently
+ * closed.
+ */
+ return;
+ }
+ head = ofs & (PAGE_SIZE - 1);
+ if (head) {
+ ofs -= head;
+ len += head;
+ }
+ len = PAGE_ALIGN(len);
+ __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
+}
+
+static void mtd_put_device(struct super_block *sb)
+{
+ put_mtd_device(logfs_super(sb)->s_mtd);
+}
+
+static const struct logfs_device_ops mtd_devops = {
+ .find_first_sb = mtd_find_first_sb,
+ .find_last_sb = mtd_find_last_sb,
+ .readpage = mtd_readpage,
+ .writeseg = mtd_writeseg,
+ .erase = mtd_erase,
+ .sync = mtd_sync,
+ .put_device = mtd_put_device,
+};
+
+int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+ int mtdnr, struct vfsmount *mnt)
+{
+ struct mtd_info *mtd;
+ const struct logfs_device_ops *devops = &mtd_devops;
+
+ mtd = get_mtd_device(NULL, mtdnr);
+ return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
+}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
new file mode 100644
index 00000000000..56a8bfbb012
--- /dev/null
+++ b/fs/logfs/dir.c
@@ -0,0 +1,827 @@
+/*
+ * fs/logfs/dir.c - directory-related code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+
+
+/*
+ * Atomic dir operations
+ *
+ * Directory operations are by default not atomic. Dentries and Inodes are
+ * created/removed/altered in seperate operations. Therefore we need to do
+ * a small amount of journaling.
+ *
+ * Create, link, mkdir, mknod and symlink all share the same function to do
+ * the work: __logfs_create. This function works in two atomic steps:
+ * 1. allocate inode (remember in journal)
+ * 2. allocate dentry (clear journal)
+ *
+ * As we can only get interrupted between the two, when the inode we just
+ * created is simply stored in the anchor. On next mount, if we were
+ * interrupted, we delete the inode. From a users point of view the
+ * operation never happened.
+ *
+ * Unlink and rmdir also share the same function: unlink. Again, this
+ * function works in two atomic steps
+ * 1. remove dentry (remember inode in journal)
+ * 2. unlink inode (clear journal)
+ *
+ * And again, on the next mount, if we were interrupted, we delete the inode.
+ * From a users point of view the operation succeeded.
+ *
+ * Rename is the real pain to deal with, harder than all the other methods
+ * combined. Depending on the circumstances we can run into three cases.
+ * A "target rename" where the target dentry already existed, a "local
+ * rename" where both parent directories are identical or a "cross-directory
+ * rename" in the remaining case.
+ *
+ * Local rename is atomic, as the old dentry is simply rewritten with a new
+ * name.
+ *
+ * Cross-directory rename works in two steps, similar to __logfs_create and
+ * logfs_unlink:
+ * 1. Write new dentry (remember old dentry in journal)
+ * 2. Remove old dentry (clear journal)
+ *
+ * Here we remember a dentry instead of an inode. On next mount, if we were
+ * interrupted, we delete the dentry. From a users point of view, the
+ * operation succeeded.
+ *
+ * Target rename works in three atomic steps:
+ * 1. Attach old inode to new dentry (remember old dentry and new inode)
+ * 2. Remove old dentry (still remember the new inode)
+ * 3. Remove victim inode
+ *
+ * Here we remember both an inode an a dentry. If we get interrupted
+ * between steps 1 and 2, we delete both the dentry and the inode. If
+ * we get interrupted between steps 2 and 3, we delete just the inode.
+ * In either case, the remaining objects are deleted on next mount. From
+ * a users point of view, the operation succeeded.
+ */
+
+static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
+ loff_t pos)
+{
+ return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
+}
+
+static int write_inode(struct inode *inode)
+{
+ return __logfs_write_inode(inode, WF_LOCK);
+}
+
+static s64 dir_seek_data(struct inode *inode, s64 pos)
+{
+ s64 new_pos = logfs_seek_data(inode, pos);
+
+ return max(pos, new_pos - 1);
+}
+
+static int beyond_eof(struct inode *inode, loff_t bix)
+{
+ loff_t pos = bix << inode->i_sb->s_blocksize_bits;
+ return pos >= i_size_read(inode);
+}
+
+/*
+ * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11,
+ * so short names (len <= 9) don't even occupy the complete 32bit name
+ * space. A prime >256 ensures short names quickly spread the 32bit
+ * name space. Add about 26 for the estimated amount of information
+ * of each character and pick a prime nearby, preferrably a bit-sparse
+ * one.
+ */
+static u32 hash_32(const char *s, int len, u32 seed)
+{
+ u32 hash = seed;
+ int i;
+
+ for (i = 0; i < len; i++)
+ hash = hash * 293 + s[i];
+ return hash;
+}
+
+/*
+ * We have to satisfy several conflicting requirements here. Small
+ * directories should stay fairly compact and not require too many
+ * indirect blocks. The number of possible locations for a given hash
+ * should be small to make lookup() fast. And we should try hard not
+ * to overflow the 32bit name space or nfs and 32bit host systems will
+ * be unhappy.
+ *
+ * So we use the following scheme. First we reduce the hash to 0..15
+ * and try a direct block. If that is occupied we reduce the hash to
+ * 16..255 and try an indirect block. Same for 2x and 3x indirect
+ * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
+ * but use buckets containing eight entries instead of a single one.
+ *
+ * Using 16 entries should allow for a reasonable amount of hash
+ * collisions, so the 32bit name space can be packed fairly tight
+ * before overflowing. Oh and currently we don't overflow but return
+ * and error.
+ *
+ * How likely are collisions? Doing the appropriate math is beyond me
+ * and the Bronstein textbook. But running a test program to brute
+ * force collisions for a couple of days showed that on average the
+ * first collision occurs after 598M entries, with 290M being the
+ * smallest result. Obviously 21 entries could already cause a
+ * collision if all entries are carefully chosen.
+ */
+static pgoff_t hash_index(u32 hash, int round)
+{
+ u32 i0_blocks = I0_BLOCKS;
+ u32 i1_blocks = I1_BLOCKS;
+ u32 i2_blocks = I2_BLOCKS;
+ u32 i3_blocks = I3_BLOCKS;
+
+ switch (round) {
+ case 0:
+ return hash % i0_blocks;
+ case 1:
+ return i0_blocks + hash % (i1_blocks - i0_blocks);
+ case 2:
+ return i1_blocks + hash % (i2_blocks - i1_blocks);
+ case 3:
+ return i2_blocks + hash % (i3_blocks - i2_blocks);
+ case 4 ... 19:
+ return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
+ + round - 4;
+ }
+ BUG();
+}
+
+static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
+{
+ struct qstr *name = &dentry->d_name;
+ struct page *page;
+ struct logfs_disk_dentry *dd;
+ u32 hash = hash_32(name->name, name->len, 0);
+ pgoff_t index;
+ int round;
+
+ if (name->len > LOGFS_MAX_NAMELEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ for (round = 0; round < 20; round++) {
+ index = hash_index(hash, round);
+
+ if (beyond_eof(dir, index))
+ return NULL;
+ if (!logfs_exist_block(dir, index))
+ continue;
+ page = read_cache_page(dir->i_mapping, index,
+ (filler_t *)logfs_readpage, NULL);
+ if (IS_ERR(page))
+ return page;
+ dd = kmap_atomic(page, KM_USER0);
+ BUG_ON(dd->namelen == 0);
+
+ if (name->len != be16_to_cpu(dd->namelen) ||
+ memcmp(name->name, dd->name, name->len)) {
+ kunmap_atomic(dd, KM_USER0);
+ page_cache_release(page);
+ continue;
+ }
+
+ kunmap_atomic(dd, KM_USER0);
+ return page;
+ }
+ return NULL;
+}
+
+static int logfs_remove_inode(struct inode *inode)
+{
+ int ret;
+
+ inode->i_nlink--;
+ ret = write_inode(inode);
+ LOGFS_BUG_ON(ret, inode->i_sb);
+ return ret;
+}
+
+static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+ if (logfs_inode(inode)->li_block)
+ logfs_inode(inode)->li_block->ta = NULL;
+ kfree(ta);
+}
+
+static int logfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct logfs_super *super = logfs_super(dir->i_sb);
+ struct inode *inode = dentry->d_inode;
+ struct logfs_transaction *ta;
+ struct page *page;
+ pgoff_t index;
+ int ret;
+
+ ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+ if (!ta)
+ return -ENOMEM;
+
+ ta->state = UNLINK_1;
+ ta->ino = inode->i_ino;
+
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+ page = logfs_get_dd_page(dir, dentry);
+ if (!page) {
+ kfree(ta);
+ return -ENOENT;
+ }
+ if (IS_ERR(page)) {
+ kfree(ta);
+ return PTR_ERR(page);
+ }
+ index = page->index;
+ page_cache_release(page);
+
+ mutex_lock(&super->s_dirop_mutex);
+ logfs_add_transaction(dir, ta);
+
+ ret = logfs_delete(dir, index, NULL);
+ if (!ret)
+ ret = write_inode(dir);
+
+ if (ret) {
+ abort_transaction(dir, ta);
+ printk(KERN_ERR"LOGFS: unable to delete inode\n");
+ goto out;
+ }
+
+ ta->state = UNLINK_2;
+ logfs_add_transaction(inode, ta);
+ ret = logfs_remove_inode(inode);
+out:
+ mutex_unlock(&super->s_dirop_mutex);
+ return ret;
+}
+
+static inline int logfs_empty_dir(struct inode *dir)
+{
+ u64 data;
+
+ data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
+ return data >= i_size_read(dir);
+}
+
+static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ if (!logfs_empty_dir(inode))
+ return -ENOTEMPTY;
+
+ return logfs_unlink(dir, dentry);
+}
+
+/* FIXME: readdir currently has it's own dir_walk code. I don't see a good
+ * way to combine the two copies */
+#define IMPLICIT_NODES 2
+static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+{
+ struct inode *dir = file->f_dentry->d_inode;
+ loff_t pos = file->f_pos - IMPLICIT_NODES;
+ struct page *page;
+ struct logfs_disk_dentry *dd;
+ int full;
+
+ BUG_ON(pos < 0);
+ for (;; pos++) {
+ if (beyond_eof(dir, pos))
+ break;
+ if (!logfs_exist_block(dir, pos)) {
+ /* deleted dentry */
+ pos = dir_seek_data(dir, pos);
+ continue;
+ }
+ page = read_cache_page(dir->i_mapping, pos,
+ (filler_t *)logfs_readpage, NULL);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ dd = kmap_atomic(page, KM_USER0);
+ BUG_ON(dd->namelen == 0);
+
+ full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
+ pos, be64_to_cpu(dd->ino), dd->type);
+ kunmap_atomic(dd, KM_USER0);
+ page_cache_release(page);
+ if (full)
+ break;
+ }
+
+ file->f_pos = pos + IMPLICIT_NODES;
+ return 0;
+}
+
+static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ ino_t pino = parent_ino(file->f_dentry);
+ int err;
+
+ if (file->f_pos < 0)
+ return -EINVAL;
+
+ if (file->f_pos == 0) {
+ if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
+ return 0;
+ file->f_pos++;
+ }
+ if (file->f_pos == 1) {
+ if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
+ return 0;
+ file->f_pos++;
+ }
+
+ err = __logfs_readdir(file, buf, filldir);
+ return err;
+}
+
+static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
+{
+ dd->namelen = cpu_to_be16(name->len);
+ memcpy(dd->name, name->name, name->len);
+}
+
+static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct page *page;
+ struct logfs_disk_dentry *dd;
+ pgoff_t index;
+ u64 ino = 0;
+ struct inode *inode;
+
+ page = logfs_get_dd_page(dir, dentry);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+ if (!page) {
+ d_add(dentry, NULL);
+ return NULL;
+ }
+ index = page->index;
+ dd = kmap_atomic(page, KM_USER0);
+ ino = be64_to_cpu(dd->ino);
+ kunmap_atomic(dd, KM_USER0);
+ page_cache_release(page);
+
+ inode = logfs_iget(dir->i_sb, ino);
+ if (IS_ERR(inode)) {
+ printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n",
+ ino, dir->i_ino, index);
+ return ERR_CAST(inode);
+ }
+ return d_splice_alias(inode, dentry);
+}
+
+static void grow_dir(struct inode *dir, loff_t index)
+{
+ index = (index + 1) << dir->i_sb->s_blocksize_bits;
+ if (i_size_read(dir) < index)
+ i_size_write(dir, index);
+}
+
+static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
+{
+ struct page *page;
+ struct logfs_disk_dentry *dd;
+ u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
+ pgoff_t index;
+ int round, err;
+
+ for (round = 0; round < 20; round++) {
+ index = hash_index(hash, round);
+
+ if (logfs_exist_block(dir, index))
+ continue;
+ page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ dd = kmap_atomic(page, KM_USER0);
+ memset(dd, 0, sizeof(*dd));
+ dd->ino = cpu_to_be64(inode->i_ino);
+ dd->type = logfs_type(inode);
+ logfs_set_name(dd, &dentry->d_name);
+ kunmap_atomic(dd, KM_USER0);
+
+ err = logfs_write_buf(dir, page, WF_LOCK);
+ unlock_page(page);
+ page_cache_release(page);
+ if (!err)
+ grow_dir(dir, index);
+ return err;
+ }
+ /* FIXME: Is there a better return value? In most cases neither
+ * the filesystem nor the directory are full. But we have had
+ * too many collisions for this particular hash and no fallback.
+ */
+ return -ENOSPC;
+}
+
+static int __logfs_create(struct inode *dir, struct dentry *dentry,
+ struct inode *inode, const char *dest, long destlen)
+{
+ struct logfs_super *super = logfs_super(dir->i_sb);
+ struct logfs_inode *li = logfs_inode(inode);
+ struct logfs_transaction *ta;
+ int ret;
+
+ ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+ if (!ta)
+ return -ENOMEM;
+
+ ta->state = CREATE_1;
+ ta->ino = inode->i_ino;
+ mutex_lock(&super->s_dirop_mutex);
+ logfs_add_transaction(inode, ta);
+
+ if (dest) {
+ /* symlink */
+ ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
+ if (!ret)
+ ret = write_inode(inode);
+ } else {
+ /* creat/mkdir/mknod */
+ ret = write_inode(inode);
+ }
+ if (ret) {
+ abort_transaction(inode, ta);
+ li->li_flags |= LOGFS_IF_STILLBORN;
+ /* FIXME: truncate symlink */
+ inode->i_nlink--;
+ iput(inode);
+ goto out;
+ }
+
+ ta->state = CREATE_2;
+ logfs_add_transaction(dir, ta);
+ ret = logfs_write_dir(dir, dentry, inode);
+ /* sync directory */
+ if (!ret)
+ ret = write_inode(dir);
+
+ if (ret) {
+ logfs_del_transaction(dir, ta);
+ ta->state = CREATE_2;
+ logfs_add_transaction(inode, ta);
+ logfs_remove_inode(inode);
+ iput(inode);
+ goto out;
+ }
+ d_instantiate(dentry, inode);
+out:
+ mutex_unlock(&super->s_dirop_mutex);
+ return ret;
+}
+
+static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct inode *inode;
+
+ /*
+ * FIXME: why do we have to fill in S_IFDIR, while the mode is
+ * correct for mknod, creat, etc.? Smells like the vfs *should*
+ * do it for us but for some reason fails to do so.
+ */
+ inode = logfs_new_inode(dir, S_IFDIR | mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &logfs_dir_iops;
+ inode->i_fop = &logfs_dir_fops;
+
+ return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ struct inode *inode;
+
+ inode = logfs_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &logfs_reg_iops;
+ inode->i_fop = &logfs_reg_fops;
+ inode->i_mapping->a_ops = &logfs_reg_aops;
+
+ return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+ dev_t rdev)
+{
+ struct inode *inode;
+
+ if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
+ return -ENAMETOOLONG;
+
+ inode = logfs_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ init_special_inode(inode, mode, rdev);
+
+ return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *target)
+{
+ struct inode *inode;
+ size_t destlen = strlen(target) + 1;
+
+ if (destlen > dir->i_sb->s_blocksize)
+ return -ENAMETOOLONG;
+
+ inode = logfs_new_inode(dir, S_IFLNK | 0777);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &logfs_symlink_iops;
+ inode->i_mapping->a_ops = &logfs_reg_aops;
+
+ return __logfs_create(dir, dentry, inode, target, destlen);
+}
+
+static int logfs_permission(struct inode *inode, int mask)
+{
+ return generic_permission(inode, mask, NULL);
+}
+
+static int logfs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct inode *inode = old_dentry->d_inode;
+
+ if (inode->i_nlink >= LOGFS_LINK_MAX)
+ return -EMLINK;
+
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ atomic_inc(&inode->i_count);
+ inode->i_nlink++;
+ mark_inode_dirty_sync(inode);
+
+ return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
+ struct logfs_disk_dentry *dd, loff_t *pos)
+{
+ struct page *page;
+ void *map;
+
+ page = logfs_get_dd_page(dir, dentry);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ *pos = page->index;
+ map = kmap_atomic(page, KM_USER0);
+ memcpy(dd, map, sizeof(*dd));
+ kunmap_atomic(map, KM_USER0);
+ page_cache_release(page);
+ return 0;
+}
+
+static int logfs_delete_dd(struct inode *dir, loff_t pos)
+{
+ /*
+ * Getting called with pos somewhere beyond eof is either a goofup
+ * within this file or means someone maliciously edited the
+ * (crc-protected) journal.
+ */
+ BUG_ON(beyond_eof(dir, pos));
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
+ return logfs_delete(dir, pos, NULL);
+}
+
+/*
+ * Cross-directory rename, target does not exist. Just a little nasty.
+ * Create a new dentry in the target dir, then remove the old dentry,
+ * all the while taking care to remember our operation in the journal.
+ */
+static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct logfs_super *super = logfs_super(old_dir->i_sb);
+ struct logfs_disk_dentry dd;
+ struct logfs_transaction *ta;
+ loff_t pos;
+ int err;
+
+ /* 1. locate source dd */
+ err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
+ if (err)
+ return err;
+
+ ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+ if (!ta)
+ return -ENOMEM;
+
+ ta->state = CROSS_RENAME_1;
+ ta->dir = old_dir->i_ino;
+ ta->pos = pos;
+
+ /* 2. write target dd */
+ mutex_lock(&super->s_dirop_mutex);
+ logfs_add_transaction(new_dir, ta);
+ err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode);
+ if (!err)
+ err = write_inode(new_dir);
+
+ if (err) {
+ super->s_rename_dir = 0;
+ super->s_rename_pos = 0;
+ abort_transaction(new_dir, ta);
+ goto out;
+ }
+
+ /* 3. remove source dd */
+ ta->state = CROSS_RENAME_2;
+ logfs_add_transaction(old_dir, ta);
+ err = logfs_delete_dd(old_dir, pos);
+ if (!err)
+ err = write_inode(old_dir);
+ LOGFS_BUG_ON(err, old_dir->i_sb);
+out:
+ mutex_unlock(&super->s_dirop_mutex);
+ return err;
+}
+
+static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
+ struct logfs_disk_dentry *dd, struct inode *inode)
+{
+ loff_t pos;
+ int err;
+
+ err = logfs_get_dd(dir, dentry, dd, &pos);
+ if (err)
+ return err;
+ dd->ino = cpu_to_be64(inode->i_ino);
+ dd->type = logfs_type(inode);
+
+ err = write_dir(dir, dd, pos);
+ if (err)
+ return err;
+ log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
+ dd->name, be64_to_cpu(dd->ino));
+ return write_inode(dir);
+}
+
+/* Target dentry exists - the worst case. We need to attach the source
+ * inode to the target dentry, then remove the orphaned target inode and
+ * source dentry.
+ */
+static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct logfs_super *super = logfs_super(old_dir->i_sb);
+ struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = new_dentry->d_inode;
+ int isdir = S_ISDIR(old_inode->i_mode);
+ struct logfs_disk_dentry dd;
+ struct logfs_transaction *ta;
+ loff_t pos;
+ int err;
+
+ BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
+ if (isdir) {
+ if (!logfs_empty_dir(new_inode))
+ return -ENOTEMPTY;
+ }
+
+ /* 1. locate source dd */
+ err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
+ if (err)
+ return err;
+
+ ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+ if (!ta)
+ return -ENOMEM;
+
+ ta->state = TARGET_RENAME_1;
+ ta->dir = old_dir->i_ino;
+ ta->pos = pos;
+ ta->ino = new_inode->i_ino;
+
+ /* 2. attach source inode to target dd */
+ mutex_lock(&super->s_dirop_mutex);
+ logfs_add_transaction(new_dir, ta);
+ err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
+ if (err) {
+ super->s_rename_dir = 0;
+ super->s_rename_pos = 0;
+ super->s_victim_ino = 0;
+ abort_transaction(new_dir, ta);
+ goto out;
+ }
+
+ /* 3. remove source dd */
+ ta->state = TARGET_RENAME_2;
+ logfs_add_transaction(old_dir, ta);
+ err = logfs_delete_dd(old_dir, pos);
+ if (!err)
+ err = write_inode(old_dir);
+ LOGFS_BUG_ON(err, old_dir->i_sb);
+
+ /* 4. remove target inode */
+ ta->state = TARGET_RENAME_3;
+ logfs_add_transaction(new_inode, ta);
+ err = logfs_remove_inode(new_inode);
+
+out:
+ mutex_unlock(&super->s_dirop_mutex);
+ return err;
+}
+
+static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ if (new_dentry->d_inode)
+ return logfs_rename_target(old_dir, old_dentry,
+ new_dir, new_dentry);
+ return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+/* No locking done here, as this is called before .get_sb() returns. */
+int logfs_replay_journal(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct inode *inode;
+ u64 ino, pos;
+ int err;
+
+ if (super->s_victim_ino) {
+ /* delete victim inode */
+ ino = super->s_victim_ino;
+ printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
+ inode = logfs_iget(sb, ino);
+ if (IS_ERR(inode))
+ goto fail;
+
+ LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
+ super->s_victim_ino = 0;
+ err = logfs_remove_inode(inode);
+ iput(inode);
+ if (err) {
+ super->s_victim_ino = ino;
+ goto fail;
+ }
+ }
+ if (super->s_rename_dir) {
+ /* delete old dd from rename */
+ ino = super->s_rename_dir;
+ pos = super->s_rename_pos;
+ printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
+ ino, pos);
+ inode = logfs_iget(sb, ino);
+ if (IS_ERR(inode))
+ goto fail;
+
+ super->s_rename_dir = 0;
+ super->s_rename_pos = 0;
+ err = logfs_delete_dd(inode, pos);
+ iput(inode);
+ if (err) {
+ super->s_rename_dir = ino;
+ super->s_rename_pos = pos;
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ LOGFS_BUG(sb);
+ return -EIO;
+}
+
+const struct inode_operations logfs_symlink_iops = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+};
+
+const struct inode_operations logfs_dir_iops = {
+ .create = logfs_create,
+ .link = logfs_link,
+ .lookup = logfs_lookup,
+ .mkdir = logfs_mkdir,
+ .mknod = logfs_mknod,
+ .rename = logfs_rename,
+ .rmdir = logfs_rmdir,
+ .permission = logfs_permission,
+ .symlink = logfs_symlink,
+ .unlink = logfs_unlink,
+};
+const struct file_operations logfs_dir_fops = {
+ .fsync = logfs_fsync,
+ .ioctl = logfs_ioctl,
+ .readdir = logfs_readdir,
+ .read = generic_read_dir,
+};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
new file mode 100644
index 00000000000..370f367a933
--- /dev/null
+++ b/fs/logfs/file.c
@@ -0,0 +1,263 @@
+/*
+ * fs/logfs/file.c - prepare_write, commit_write and friends
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+#include <linux/writeback.h>
+
+static int logfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct page *page;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+ return 0;
+ if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+ unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned end = start + len;
+
+ /* Reading beyond i_size is simple: memset to zero */
+ zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+ return 0;
+ }
+ return logfs_readpage_nolock(page);
+}
+
+static int logfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied, struct page *page,
+ void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ pgoff_t index = page->index;
+ unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned end = start + copied;
+ int ret = 0;
+
+ BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
+ BUG_ON(page->index > I3_BLOCKS);
+
+ if (copied < len) {
+ /*
+ * Short write of a non-initialized paged. Just tell userspace
+ * to retry the entire page.
+ */
+ if (!PageUptodate(page)) {
+ copied = 0;
+ goto out;
+ }
+ }
+ if (copied == 0)
+ goto out; /* FIXME: do we need to update inode? */
+
+ if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
+ i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
+ mark_inode_dirty_sync(inode);
+ }
+
+ SetPageUptodate(page);
+ if (!PageDirty(page)) {
+ if (!get_page_reserve(inode, page))
+ __set_page_dirty_nobuffers(page);
+ else
+ ret = logfs_write_buf(inode, page, WF_LOCK);
+ }
+out:
+ unlock_page(page);
+ page_cache_release(page);
+ return ret ? ret : copied;
+}
+
+int logfs_readpage(struct file *file, struct page *page)
+{
+ int ret;
+
+ ret = logfs_readpage_nolock(page);
+ unlock_page(page);
+ return ret;
+}
+
+/* Clear the page's dirty flag in the radix tree. */
+/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
+ * the dirty bit from the radix tree for filesystems that don't have to wait
+ * for page writeback to finish (i.e. any compressing filesystem).
+ */
+static void clear_radix_tree_dirty(struct page *page)
+{
+ BUG_ON(PagePrivate(page) || page->private);
+ set_page_writeback(page);
+ end_page_writeback(page);
+}
+
+static int __logfs_writepage(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ int err;
+
+ err = logfs_write_buf(inode, page, WF_LOCK);
+ if (err)
+ set_page_dirty(page);
+ else
+ clear_radix_tree_dirty(page);
+ unlock_page(page);
+ return err;
+}
+
+static int logfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+ u64 bix;
+ level_t level;
+
+ log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
+ page);
+
+ logfs_unpack_index(page->index, &bix, &level);
+
+ /* Indirect blocks are never truncated */
+ if (level != 0)
+ return __logfs_writepage(page);
+
+ /*
+ * TODO: everything below is a near-verbatim copy of nobh_writepage().
+ * The relevant bits should be factored out after logfs is merged.
+ */
+
+ /* Is the page fully inside i_size? */
+ if (bix < end_index)
+ return __logfs_writepage(page);
+
+ /* Is the page fully outside i_size? (truncate in progress) */
+ offset = i_size & (PAGE_CACHE_SIZE-1);
+ if (bix > end_index || offset == 0) {
+ unlock_page(page);
+ return 0; /* don't care */
+ }
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invokation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ return __logfs_writepage(page);
+}
+
+static void logfs_invalidatepage(struct page *page, unsigned long offset)
+{
+ move_page_to_btree(page);
+ BUG_ON(PagePrivate(page) || page->private);
+}
+
+static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
+{
+ return 0; /* None of these are easy to release */
+}
+
+
+int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ unsigned int oldflags, flags;
+ int err;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
+ return put_user(flags, (int __user *)arg);
+ case FS_IOC_SETFLAGS:
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if (!is_owner_or_cap(inode))
+ return -EACCES;
+
+ err = get_user(flags, (int __user *)arg);
+ if (err)
+ return err;
+
+ mutex_lock(&inode->i_mutex);
+ oldflags = li->li_flags;
+ flags &= LOGFS_FL_USER_MODIFIABLE;
+ flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
+ li->li_flags = flags;
+ mutex_unlock(&inode->i_mutex);
+
+ inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty_sync(inode);
+ return 0;
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+ struct super_block *sb = dentry->d_inode->i_sb;
+ struct logfs_super *super = logfs_super(sb);
+
+ /* FIXME: write anchor */
+ super->s_devops->sync(sb);
+ return 0;
+}
+
+static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ int err = 0;
+
+ if (attr->ia_valid & ATTR_SIZE)
+ err = logfs_truncate(inode, attr->ia_size);
+ attr->ia_valid &= ~ATTR_SIZE;
+
+ if (!err)
+ err = inode_change_ok(inode, attr);
+ if (!err)
+ err = inode_setattr(inode, attr);
+ return err;
+}
+
+const struct inode_operations logfs_reg_iops = {
+ .setattr = logfs_setattr,
+};
+
+const struct file_operations logfs_reg_fops = {
+ .aio_read = generic_file_aio_read,
+ .aio_write = generic_file_aio_write,
+ .fsync = logfs_fsync,
+ .ioctl = logfs_ioctl,
+ .llseek = generic_file_llseek,
+ .mmap = generic_file_readonly_mmap,
+ .open = generic_file_open,
+ .read = do_sync_read,
+ .write = do_sync_write,
+};
+
+const struct address_space_operations logfs_reg_aops = {
+ .invalidatepage = logfs_invalidatepage,
+ .readpage = logfs_readpage,
+ .releasepage = logfs_releasepage,
+ .set_page_dirty = __set_page_dirty_nobuffers,
+ .writepage = logfs_writepage,
+ .writepages = generic_writepages,
+ .write_begin = logfs_write_begin,
+ .write_end = logfs_write_end,
+};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 00000000000..92949f95a90
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,730 @@
+/*
+ * fs/logfs/gc.c - garbage collection code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+
+/*
+ * Wear leveling needs to kick in when the difference between low erase
+ * counts and high erase counts gets too big. A good value for "too big"
+ * may be somewhat below 10% of maximum erase count for the device.
+ * Why not 397, to pick a nice round number with no specific meaning? :)
+ *
+ * WL_RATELIMIT is the minimum time between two wear level events. A huge
+ * number of segments may fulfil the requirements for wear leveling at the
+ * same time. If that happens we don't want to cause a latency from hell,
+ * but just gently pick one segment every so often and minimize overhead.
+ */
+#define WL_DELTA 397
+#define WL_RATELIMIT 100
+#define MAX_OBJ_ALIASES 2600
+#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */
+#define LIST_SIZE 64 /* base size of candidate lists */
+#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */
+#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
+
+static int no_free_segments(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ return super->s_free_list.count;
+}
+
+/* journal has distance -1, top-most ifile layer distance 0 */
+static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
+{
+ struct logfs_super *super = logfs_super(sb);
+ u8 gc_level = (__force u8)__gc_level;
+
+ switch (gc_level) {
+ case 0: /* fall through */
+ case 1: /* fall through */
+ case 2: /* fall through */
+ case 3:
+ /* file data or indirect blocks */
+ return super->s_ifile_levels + super->s_iblock_levels - gc_level;
+ case 6: /* fall through */
+ case 7: /* fall through */
+ case 8: /* fall through */
+ case 9:
+ /* inode file data or indirect blocks */
+ return super->s_ifile_levels - (gc_level - 6);
+ default:
+ printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
+ gc_level);
+ WARN_ON(1);
+ return super->s_ifile_levels + super->s_iblock_levels;
+ }
+}
+
+static int segment_is_reserved(struct super_block *sb, u32 segno)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_area *area;
+ void *reserved;
+ int i;
+
+ /* Some segments are reserved. Just pretend they were all valid */
+ reserved = btree_lookup32(&super->s_reserved_segments, segno);
+ if (reserved)
+ return 1;
+
+ /* Currently open segments */
+ for_each_area(i) {
+ area = super->s_area[i];
+ if (area->a_is_open && area->a_segno == segno)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+ BUG();
+}
+
+/*
+ * Returns the bytes consumed by valid objects in this segment. Object headers
+ * are counted, the segment header is not.
+ */
+static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
+ gc_level_t *gc_level)
+{
+ struct logfs_segment_entry se;
+ u32 ec_level;
+
+ logfs_get_segment_entry(sb, segno, &se);
+ if (se.ec_level == cpu_to_be32(BADSEG) ||
+ se.valid == cpu_to_be32(RESERVED))
+ return RESERVED;
+
+ ec_level = be32_to_cpu(se.ec_level);
+ *ec = ec_level >> 4;
+ *gc_level = GC_LEVEL(ec_level & 0xf);
+ return be32_to_cpu(se.valid);
+}
+
+static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
+ u64 bix, gc_level_t gc_level)
+{
+ struct inode *inode;
+ int err, cookie;
+
+ inode = logfs_safe_iget(sb, ino, &cookie);
+ err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
+ BUG_ON(err);
+ logfs_safe_iput(inode, cookie);
+}
+
+static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_segment_header sh;
+ struct logfs_object_header oh;
+ u64 ofs, ino, bix;
+ u32 seg_ofs, logical_segno, cleaned = 0;
+ int err, len, valid;
+ gc_level_t gc_level;
+
+ LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
+
+ btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
+ err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
+ BUG_ON(err);
+ gc_level = GC_LEVEL(sh.level);
+ logical_segno = be32_to_cpu(sh.segno);
+ if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
+ logfs_mark_segment_bad(sb, segno);
+ cleaned = -1;
+ goto out;
+ }
+
+ for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
+ seg_ofs + sizeof(oh) < super->s_segsize; ) {
+ ofs = dev_ofs(sb, logical_segno, seg_ofs);
+ err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
+ &oh);
+ BUG_ON(err);
+
+ if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+ break;
+
+ if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
+ logfs_mark_segment_bad(sb, segno);
+ cleaned = super->s_segsize - 1;
+ goto out;
+ }
+
+ ino = be64_to_cpu(oh.ino);
+ bix = be64_to_cpu(oh.bix);
+ len = sizeof(oh) + be16_to_cpu(oh.len);
+ valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
+ if (valid == 1) {
+ logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
+ cleaned += len;
+ } else if (valid == 2) {
+ /* Will be invalid upon journal commit */
+ cleaned += len;
+ }
+ seg_ofs += len;
+ }
+out:
+ btree_remove32(&super->s_reserved_segments, segno);
+ return cleaned;
+}
+
+static struct gc_candidate *add_list(struct gc_candidate *cand,
+ struct candidate_list *list)
+{
+ struct rb_node **p = &list->rb_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct gc_candidate *cur;
+ int comp;
+
+ cand->list = list;
+ while (*p) {
+ parent = *p;
+ cur = rb_entry(parent, struct gc_candidate, rb_node);
+
+ if (list->sort_by_ec)
+ comp = cand->erase_count < cur->erase_count;
+ else
+ comp = cand->valid < cur->valid;
+
+ if (comp)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+ rb_link_node(&cand->rb_node, parent, p);
+ rb_insert_color(&cand->rb_node, &list->rb_tree);
+
+ if (list->count <= list->maxcount) {
+ list->count++;
+ return NULL;
+ }
+ cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
+ rb_erase(&cand->rb_node, &list->rb_tree);
+ cand->list = NULL;
+ return cand;
+}
+
+static void remove_from_list(struct gc_candidate *cand)
+{
+ struct candidate_list *list = cand->list;
+
+ rb_erase(&cand->rb_node, &list->rb_tree);
+ list->count--;
+}
+
+static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ btree_remove32(&super->s_cand_tree, cand->segno);
+ kfree(cand);
+}
+
+u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
+{
+ struct gc_candidate *cand;
+ u32 segno;
+
+ BUG_ON(list->count == 0);
+
+ cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
+ remove_from_list(cand);
+ segno = cand->segno;
+ if (ec)
+ *ec = cand->erase_count;
+ free_candidate(sb, cand);
+ return segno;
+}
+
+/*
+ * We have several lists to manage segments with. The reserve_list is used to
+ * deal with bad blocks. We try to keep the best (lowest ec) segments on this
+ * list.
+ * The free_list contains free segments for normal usage. It usually gets the
+ * second pick after the reserve_list. But when the free_list is running short
+ * it is more important to keep the free_list full than to keep a reserve.
+ *
+ * Segments that are not free are put onto a per-level low_list. If we have
+ * to run garbage collection, we pick a candidate from there. All segments on
+ * those lists should have at least some free space so GC will make progress.
+ *
+ * And last we have the ec_list, which is used to pick segments for wear
+ * leveling.
+ *
+ * If all appropriate lists are full, we simply free the candidate and forget
+ * about that segment for a while. We have better candidates for each purpose.
+ */
+static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
+{
+ struct logfs_super *super = logfs_super(sb);
+ u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
+
+ if (cand->valid == 0) {
+ /* 100% free segments */
+ log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
+ cand->segno, cand->erase_count,
+ dev_ofs(sb, cand->segno, 0));
+ cand = add_list(cand, &super->s_reserve_list);
+ if (cand) {
+ log_gc_noisy("add free segment %x (ec %x) at %llx\n",
+ cand->segno, cand->erase_count,
+ dev_ofs(sb, cand->segno, 0));
+ cand = add_list(cand, &super->s_free_list);
+ }
+ } else {
+ /* good candidates for Garbage Collection */
+ if (cand->valid < full)
+ cand = add_list(cand, &super->s_low_list[cand->dist]);
+ /* good candidates for wear leveling,
+ * segments that were recently written get ignored */
+ if (cand)
+ cand = add_list(cand, &super->s_ec_list);
+ }
+ if (cand)
+ free_candidate(sb, cand);
+}
+
+static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
+ u8 dist)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct gc_candidate *cand;
+
+ cand = kmalloc(sizeof(*cand), GFP_NOFS);
+ if (!cand)
+ return -ENOMEM;
+
+ cand->segno = segno;
+ cand->valid = valid;
+ cand->erase_count = ec;
+ cand->dist = dist;
+
+ btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
+ __add_candidate(sb, cand);
+ return 0;
+}
+
+static void remove_segment_from_lists(struct super_block *sb, u32 segno)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct gc_candidate *cand;
+
+ cand = btree_lookup32(&super->s_cand_tree, segno);
+ if (cand) {
+ remove_from_list(cand);
+ free_candidate(sb, cand);
+ }
+}
+
+static void scan_segment(struct super_block *sb, u32 segno)
+{
+ u32 valid, ec = 0;
+ gc_level_t gc_level = 0;
+ u8 dist;
+
+ if (segment_is_reserved(sb, segno))
+ return;
+
+ remove_segment_from_lists(sb, segno);
+ valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+ if (valid == RESERVED)
+ return;
+
+ dist = root_distance(sb, gc_level);
+ add_candidate(sb, segno, valid, ec, dist);
+}
+
+static struct gc_candidate *first_in_list(struct candidate_list *list)
+{
+ if (list->count == 0)
+ return NULL;
+ return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
+}
+
+/*
+ * Find the best segment for garbage collection. Main criterion is
+ * the segment requiring the least effort to clean. Secondary
+ * criterion is to GC on the lowest level available.
+ *
+ * So we search the least effort segment on the lowest level first,
+ * then move up and pick another segment iff is requires significantly
+ * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
+ */
+static struct gc_candidate *get_candidate(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i, max_dist;
+ struct gc_candidate *cand = NULL, *this;
+
+ max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
+
+ for (i = max_dist; i >= 0; i--) {
+ this = first_in_list(&super->s_low_list[i]);
+ if (!this)
+ continue;
+ if (!cand)
+ cand = this;
+ if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
+ cand = this;
+ }
+ return cand;
+}
+
+static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
+{
+ struct logfs_super *super = logfs_super(sb);
+ gc_level_t gc_level;
+ u32 cleaned, valid, segno, ec;
+ u8 dist;
+
+ if (!cand) {
+ log_gc("GC attempted, but no candidate found\n");
+ return 0;
+ }
+
+ segno = cand->segno;
+ dist = cand->dist;
+ valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+ free_candidate(sb, cand);
+ log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
+ segno, (u64)segno << super->s_segshift,
+ dist, no_free_segments(sb), valid,
+ super->s_free_bytes);
+ cleaned = logfs_gc_segment(sb, segno, dist);
+ log_gc("GC segment #%02x complete - now %x valid\n", segno,
+ valid - cleaned);
+ BUG_ON(cleaned != valid);
+ return 1;
+}
+
+static int logfs_gc_once(struct super_block *sb)
+{
+ struct gc_candidate *cand;
+
+ cand = get_candidate(sb);
+ if (cand)
+ remove_from_list(cand);
+ return __logfs_gc_once(sb, cand);
+}
+
+/* returns 1 if a wrap occurs, 0 otherwise */
+static int logfs_scan_some(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ u32 segno;
+ int i, ret = 0;
+
+ segno = super->s_sweeper;
+ for (i = SCAN_RATIO; i > 0; i--) {
+ segno++;
+ if (segno >= super->s_no_segs) {
+ segno = 0;
+ ret = 1;
+ /* Break out of the loop. We want to read a single
+ * block from the segment size on next invocation if
+ * SCAN_RATIO is set to match block size
+ */
+ break;
+ }
+
+ scan_segment(sb, segno);
+ }
+ super->s_sweeper = segno;
+ return ret;
+}
+
+/*
+ * In principle, this function should loop forever, looking for GC candidates
+ * and moving data. LogFS is designed in such a way that this loop is
+ * guaranteed to terminate.
+ *
+ * Limiting the loop to some iterations serves purely to catch cases when
+ * these guarantees have failed. An actual endless loop is an obvious bug
+ * and should be reported as such.
+ */
+static void __logfs_gc_pass(struct super_block *sb, int target)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_block *block;
+ int round, progress, last_progress = 0;
+
+ if (no_free_segments(sb) >= target &&
+ super->s_no_object_aliases < MAX_OBJ_ALIASES)
+ return;
+
+ log_gc("__logfs_gc_pass(%x)\n", target);
+ for (round = 0; round < SCAN_ROUNDS; ) {
+ if (no_free_segments(sb) >= target)
+ goto write_alias;
+
+ /* Sync in-memory state with on-medium state in case they
+ * diverged */
+ logfs_write_anchor(sb);
+ round += logfs_scan_some(sb);
+ if (no_free_segments(sb) >= target)
+ goto write_alias;
+ progress = logfs_gc_once(sb);
+ if (progress)
+ last_progress = round;
+ else if (round - last_progress > 2)
+ break;
+ continue;
+
+ /*
+ * The goto logic is nasty, I just don't know a better way to
+ * code it. GC is supposed to ensure two things:
+ * 1. Enough free segments are available.
+ * 2. The number of aliases is bounded.
+ * When 1. is achieved, we take a look at 2. and write back
+ * some alias-containing blocks, if necessary. However, after
+ * each such write we need to go back to 1., as writes can
+ * consume free segments.
+ */
+write_alias:
+ if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
+ return;
+ if (list_empty(&super->s_object_alias)) {
+ /* All aliases are still in btree */
+ return;
+ }
+ log_gc("Write back one alias\n");
+ block = list_entry(super->s_object_alias.next,
+ struct logfs_block, alias_list);
+ block->ops->write_block(block);
+ /*
+ * To round off the nasty goto logic, we reset round here. It
+ * is a safety-net for GC not making any progress and limited
+ * to something reasonably small. If incremented it for every
+ * single alias, the loop could terminate rather quickly.
+ */
+ round = 0;
+ }
+ LOGFS_BUG(sb);
+}
+
+static int wl_ratelimit(struct super_block *sb, u64 *next_event)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ if (*next_event < super->s_gec) {
+ *next_event = super->s_gec + WL_RATELIMIT;
+ return 0;
+ }
+ return 1;
+}
+
+static void logfs_wl_pass(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct gc_candidate *wl_cand, *free_cand;
+
+ if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
+ return;
+
+ wl_cand = first_in_list(&super->s_ec_list);
+ if (!wl_cand)
+ return;
+ free_cand = first_in_list(&super->s_free_list);
+ if (!free_cand)
+ return;
+
+ if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
+ remove_from_list(wl_cand);
+ __logfs_gc_once(sb, wl_cand);
+ }
+}
+
+/*
+ * The journal needs wear leveling as well. But moving the journal is an
+ * expensive operation so we try to avoid it as much as possible. And if we
+ * have to do it, we move the whole journal, not individual segments.
+ *
+ * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
+ * calculations. First we check whether moving the journal would be a
+ * significant improvement. That means that a) the current journal segments
+ * have more wear than the future journal segments and b) the current journal
+ * segments have more wear than normal ostore segments.
+ * Rationale for b) is that we don't have to move the journal if it is aging
+ * less than the ostore, even if the reserve segments age even less (they are
+ * excluded from wear leveling, after all).
+ * Next we check that the superblocks have less wear than the journal. Since
+ * moving the journal requires writing the superblocks, we have to protect the
+ * superblocks even more than the journal.
+ *
+ * Also we double the acceptable wear difference, compared to ostore wear
+ * leveling. Journal data is read and rewritten rapidly, comparatively. So
+ * soft errors have much less time to accumulate and we allow the journal to
+ * be a bit worse than the ostore.
+ */
+static void logfs_journal_wl_pass(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct gc_candidate *cand;
+ u32 min_journal_ec = -1, max_reserve_ec = 0;
+ int i;
+
+ if (wl_ratelimit(sb, &super->s_wl_gec_journal))
+ return;
+
+ if (super->s_reserve_list.count < super->s_no_journal_segs) {
+ /* Reserve is not full enough to move complete journal */
+ return;
+ }
+
+ journal_for_each(i)
+ if (super->s_journal_seg[i])
+ min_journal_ec = min(min_journal_ec,
+ super->s_journal_ec[i]);
+ cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
+ struct gc_candidate, rb_node);
+ max_reserve_ec = cand->erase_count;
+ for (i = 0; i < 2; i++) {
+ struct logfs_segment_entry se;
+ u32 segno = seg_no(sb, super->s_sb_ofs[i]);
+ u32 ec;
+
+ logfs_get_segment_entry(sb, segno, &se);
+ ec = be32_to_cpu(se.ec_level) >> 4;
+ max_reserve_ec = max(max_reserve_ec, ec);
+ }
+
+ if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
+ do_logfs_journal_wl_pass(sb);
+ }
+}
+
+void logfs_gc_pass(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
+ /* Write journal before free space is getting saturated with dirty
+ * objects.
+ */
+ if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
+ + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
+ logfs_write_anchor(sb);
+ __logfs_gc_pass(sb, super->s_total_levels);
+ logfs_wl_pass(sb);
+ logfs_journal_wl_pass(sb);
+}
+
+static int check_area(struct super_block *sb, int i)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_area *area = super->s_area[i];
+ struct logfs_object_header oh;
+ u32 segno = area->a_segno;
+ u32 ofs = area->a_used_bytes;
+ __be32 crc;
+ int err;
+
+ if (!area->a_is_open)
+ return 0;
+
+ for (ofs = area->a_used_bytes;
+ ofs <= super->s_segsize - sizeof(oh);
+ ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
+ err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
+ if (err)
+ return err;
+
+ if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+ break;
+
+ crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
+ if (crc != oh.crc) {
+ printk(KERN_INFO "interrupted header at %llx\n",
+ dev_ofs(sb, segno, ofs));
+ return 0;
+ }
+ }
+ if (ofs != area->a_used_bytes) {
+ printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
+ ofs - area->a_used_bytes,
+ dev_ofs(sb, segno, area->a_used_bytes));
+ area->a_used_bytes = ofs;
+ }
+ return 0;
+}
+
+int logfs_check_areas(struct super_block *sb)
+{
+ int i, err;
+
+ for_each_area(i) {
+ err = check_area(sb, i);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static void logfs_init_candlist(struct candidate_list *list, int maxcount,
+ int sort_by_ec)
+{
+ list->count = 0;
+ list->maxcount = maxcount;
+ list->sort_by_ec = sort_by_ec;
+ list->rb_tree = RB_ROOT;
+}
+
+int logfs_init_gc(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i;
+
+ btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
+ logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
+ logfs_init_candlist(&super->s_reserve_list,
+ super->s_bad_seg_reserve, 1);
+ for_each_area(i)
+ logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
+ logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
+ return 0;
+}
+
+static void logfs_cleanup_list(struct super_block *sb,
+ struct candidate_list *list)
+{
+ struct gc_candidate *cand;
+
+ while (list->count) {
+ cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
+ rb_node);
+ remove_from_list(cand);
+ free_candidate(sb, cand);
+ }
+ BUG_ON(list->rb_tree.rb_node);
+}
+
+void logfs_cleanup_gc(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i;
+
+ if (!super->s_free_list.count)
+ return;
+
+ /*
+ * FIXME: The btree may still contain a single empty node. So we
+ * call the grim visitor to clean up that mess. Btree code should
+ * do it for us, really.
+ */
+ btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
+ logfs_cleanup_list(sb, &super->s_free_list);
+ logfs_cleanup_list(sb, &super->s_reserve_list);
+ for_each_area(i)
+ logfs_cleanup_list(sb, &super->s_low_list[i]);
+ logfs_cleanup_list(sb, &super->s_ec_list);
+}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 00000000000..33ec1aeaeec
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,417 @@
+/*
+ * fs/logfs/inode.c - inode handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+
+/*
+ * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
+ * on the medium. It therefore also lacks a method to store the previous
+ * generation number for deleted inodes. Instead a single generation number
+ * is stored which will be used for new inodes. Being just a 32bit counter,
+ * this can obvious wrap relatively quickly. So we only reuse inodes if we
+ * know that a fair number of inodes can be created before we have to increment
+ * the generation again - effectively adding some bits to the counter.
+ * But being too aggressive here means we keep a very large and very sparse
+ * inode file, wasting space on indirect blocks.
+ * So what is a good value? Beats me. 64k seems moderately bad on both
+ * fronts, so let's use that for now...
+ *
+ * NFS sucks, as everyone already knows.
+ */
+#define INOS_PER_WRAP (0x10000)
+
+/*
+ * Logfs' requirement to read inodes for garbage collection makes life a bit
+ * harder. GC may have to read inodes that are in I_FREEING state, when they
+ * are being written out - and waiting for GC to make progress, naturally.
+ *
+ * So we cannot just call iget() or some variant of it, but first have to check
+ * wether the inode in question might be in I_FREEING state. Therefore we
+ * maintain our own per-sb list of "almost deleted" inodes and check against
+ * that list first. Normally this should be at most 1-2 entries long.
+ *
+ * Also, inodes have logfs-specific reference counting on top of what the vfs
+ * does. When .destroy_inode is called, normally the reference count will drop
+ * to zero and the inode gets deleted. But if GC accessed the inode, its
+ * refcount will remain nonzero and final deletion will have to wait.
+ *
+ * As a result we have two sets of functions to get/put inodes:
+ * logfs_safe_iget/logfs_safe_iput - safe to call from GC context
+ * logfs_iget/iput - normal version
+ */
+static struct kmem_cache *logfs_inode_cache;
+
+static DEFINE_SPINLOCK(logfs_inode_lock);
+
+static void logfs_inode_setops(struct inode *inode)
+{
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFDIR:
+ inode->i_op = &logfs_dir_iops;
+ inode->i_fop = &logfs_dir_fops;
+ inode->i_mapping->a_ops = &logfs_reg_aops;
+ break;
+ case S_IFREG:
+ inode->i_op = &logfs_reg_iops;
+ inode->i_fop = &logfs_reg_fops;
+ inode->i_mapping->a_ops = &logfs_reg_aops;
+ break;
+ case S_IFLNK:
+ inode->i_op = &logfs_symlink_iops;
+ inode->i_mapping->a_ops = &logfs_reg_aops;
+ break;
+ case S_IFSOCK: /* fall through */
+ case S_IFBLK: /* fall through */
+ case S_IFCHR: /* fall through */
+ case S_IFIFO:
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
+{
+ struct inode *inode = iget_locked(sb, ino);
+ int err;
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ err = logfs_read_inode(inode);
+ if (err || inode->i_nlink == 0) {
+ /* inode->i_nlink == 0 can be true when called from
+ * block validator */
+ /* set i_nlink to 0 to prevent caching */
+ inode->i_nlink = 0;
+ logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
+ iget_failed(inode);
+ if (!err)
+ err = -ENOENT;
+ return ERR_PTR(err);
+ }
+
+ logfs_inode_setops(inode);
+ unlock_new_inode(inode);
+ return inode;
+}
+
+struct inode *logfs_iget(struct super_block *sb, ino_t ino)
+{
+ BUG_ON(ino == LOGFS_INO_MASTER);
+ BUG_ON(ino == LOGFS_INO_SEGFILE);
+ return __logfs_iget(sb, ino);
+}
+
+/*
+ * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
+ * this allows logfs_iput to do the right thing later
+ */
+struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_inode *li;
+
+ if (ino == LOGFS_INO_MASTER)
+ return super->s_master_inode;
+ if (ino == LOGFS_INO_SEGFILE)
+ return super->s_segfile_inode;
+
+ spin_lock(&logfs_inode_lock);
+ list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
+ if (li->vfs_inode.i_ino == ino) {
+ li->li_refcount++;
+ spin_unlock(&logfs_inode_lock);
+ *is_cached = 1;
+ return &li->vfs_inode;
+ }
+ spin_unlock(&logfs_inode_lock);
+
+ *is_cached = 0;
+ return __logfs_iget(sb, ino);
+}
+
+static void __logfs_destroy_inode(struct inode *inode)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+
+ BUG_ON(li->li_block);
+ list_del(&li->li_freeing_list);
+ kmem_cache_free(logfs_inode_cache, li);
+}
+
+static void logfs_destroy_inode(struct inode *inode)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+
+ BUG_ON(list_empty(&li->li_freeing_list));
+ spin_lock(&logfs_inode_lock);
+ li->li_refcount--;
+ if (li->li_refcount == 0)
+ __logfs_destroy_inode(inode);
+ spin_unlock(&logfs_inode_lock);
+}
+
+void logfs_safe_iput(struct inode *inode, int is_cached)
+{
+ if (inode->i_ino == LOGFS_INO_MASTER)
+ return;
+ if (inode->i_ino == LOGFS_INO_SEGFILE)
+ return;
+
+ if (is_cached) {
+ logfs_destroy_inode(inode);
+ return;
+ }
+
+ iput(inode);
+}
+
+static void logfs_init_inode(struct super_block *sb, struct inode *inode)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ int i;
+
+ li->li_flags = 0;
+ li->li_height = 0;
+ li->li_used_bytes = 0;
+ li->li_block = NULL;
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = CURRENT_TIME;
+ inode->i_nlink = 1;
+ INIT_LIST_HEAD(&li->li_freeing_list);
+
+ for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+ li->li_data[i] = 0;
+
+ return;
+}
+
+static struct inode *logfs_alloc_inode(struct super_block *sb)
+{
+ struct logfs_inode *li;
+
+ li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
+ if (!li)
+ return NULL;
+ logfs_init_inode(sb, &li->vfs_inode);
+ return &li->vfs_inode;
+}
+
+/*
+ * In logfs inodes are written to an inode file. The inode file, like any
+ * other file, is managed with a inode. The inode file's inode, aka master
+ * inode, requires special handling in several respects. First, it cannot be
+ * written to the inode file, so it is stored in the journal instead.
+ *
+ * Secondly, this inode cannot be written back and destroyed before all other
+ * inodes have been written. The ordering is important. Linux' VFS is happily
+ * unaware of the ordering constraint and would ordinarily destroy the master
+ * inode at umount time while other inodes are still in use and dirty. Not
+ * good.
+ *
+ * So logfs makes sure the master inode is not written until all other inodes
+ * have been destroyed. Sadly, this method has another side-effect. The VFS
+ * will notice one remaining inode and print a frightening warning message.
+ * Worse, it is impossible to judge whether such a warning was caused by the
+ * master inode or any other inodes have leaked as well.
+ *
+ * Our attempt of solving this is with logfs_new_meta_inode() below. Its
+ * purpose is to create a new inode that will not trigger the warning if such
+ * an inode is still in use. An ugly hack, no doubt. Suggections for
+ * improvement are welcome.
+ */
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
+{
+ struct inode *inode;
+
+ inode = logfs_alloc_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ inode->i_mode = S_IFREG;
+ inode->i_ino = ino;
+ inode->i_sb = sb;
+
+ /* This is a blatant copy of alloc_inode code. We'd need alloc_inode
+ * to be nonstatic, alas. */
+ {
+ struct address_space * const mapping = &inode->i_data;
+
+ mapping->a_ops = &logfs_reg_aops;
+ mapping->host = inode;
+ mapping->flags = 0;
+ mapping_set_gfp_mask(mapping, GFP_NOFS);
+ mapping->assoc_mapping = NULL;
+ mapping->backing_dev_info = &default_backing_dev_info;
+ inode->i_mapping = mapping;
+ inode->i_nlink = 1;
+ }
+
+ return inode;
+}
+
+struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
+{
+ struct inode *inode;
+ int err;
+
+ inode = logfs_new_meta_inode(sb, ino);
+ if (IS_ERR(inode))
+ return inode;
+
+ err = logfs_read_inode(inode);
+ if (err) {
+ destroy_meta_inode(inode);
+ return ERR_PTR(err);
+ }
+ logfs_inode_setops(inode);
+ return inode;
+}
+
+static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ int ret;
+ long flags = WF_LOCK;
+
+ /* Can only happen if creat() failed. Safe to skip. */
+ if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
+ return 0;
+
+ ret = __logfs_write_inode(inode, flags);
+ LOGFS_BUG_ON(ret, inode->i_sb);
+ return ret;
+}
+
+void destroy_meta_inode(struct inode *inode)
+{
+ if (inode) {
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ logfs_clear_inode(inode);
+ kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
+ }
+}
+
+/* called with inode_lock held */
+static void logfs_drop_inode(struct inode *inode)
+{
+ struct logfs_super *super = logfs_super(inode->i_sb);
+ struct logfs_inode *li = logfs_inode(inode);
+
+ spin_lock(&logfs_inode_lock);
+ list_move(&li->li_freeing_list, &super->s_freeing_list);
+ spin_unlock(&logfs_inode_lock);
+ generic_drop_inode(inode);
+}
+
+static void logfs_set_ino_generation(struct super_block *sb,
+ struct inode *inode)
+{
+ struct logfs_super *super = logfs_super(sb);
+ u64 ino;
+
+ mutex_lock(&super->s_journal_mutex);
+ ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
+ super->s_last_ino = ino;
+ super->s_inos_till_wrap--;
+ if (super->s_inos_till_wrap < 0) {
+ super->s_last_ino = LOGFS_RESERVED_INOS;
+ super->s_generation++;
+ super->s_inos_till_wrap = INOS_PER_WRAP;
+ }
+ inode->i_ino = ino;
+ inode->i_generation = super->s_generation;
+ mutex_unlock(&super->s_journal_mutex);
+}
+
+struct inode *logfs_new_inode(struct inode *dir, int mode)
+{
+ struct super_block *sb = dir->i_sb;
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ logfs_init_inode(sb, inode);
+
+ /* inherit parent flags */
+ logfs_inode(inode)->li_flags |=
+ logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
+
+ inode->i_mode = mode;
+ logfs_set_ino_generation(sb, inode);
+
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+ if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ inode->i_mode |= S_ISGID;
+ }
+
+ logfs_inode_setops(inode);
+ insert_inode_hash(inode);
+
+ return inode;
+}
+
+static void logfs_init_once(void *_li)
+{
+ struct logfs_inode *li = _li;
+ int i;
+
+ li->li_flags = 0;
+ li->li_used_bytes = 0;
+ li->li_refcount = 1;
+ for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+ li->li_data[i] = 0;
+ inode_init_once(&li->vfs_inode);
+}
+
+static int logfs_sync_fs(struct super_block *sb, int wait)
+{
+ /* FIXME: write anchor */
+ logfs_super(sb)->s_devops->sync(sb);
+ return 0;
+}
+
+const struct super_operations logfs_super_operations = {
+ .alloc_inode = logfs_alloc_inode,
+ .clear_inode = logfs_clear_inode,
+ .delete_inode = logfs_delete_inode,
+ .destroy_inode = logfs_destroy_inode,
+ .drop_inode = logfs_drop_inode,
+ .write_inode = logfs_write_inode,
+ .statfs = logfs_statfs,
+ .sync_fs = logfs_sync_fs,
+};
+
+int logfs_init_inode_cache(void)
+{
+ logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
+ sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+ logfs_init_once);
+ if (!logfs_inode_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void logfs_destroy_inode_cache(void)
+{
+ kmem_cache_destroy(logfs_inode_cache);
+}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
new file mode 100644
index 00000000000..6ad30a4c905
--- /dev/null
+++ b/fs/logfs/journal.c
@@ -0,0 +1,883 @@
+/*
+ * fs/logfs/journal.c - journal handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+
+static void logfs_calc_free(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ u64 reserve, no_segs = super->s_no_segs;
+ s64 free;
+ int i;
+
+ /* superblock segments */
+ no_segs -= 2;
+ super->s_no_journal_segs = 0;
+ /* journal */
+ journal_for_each(i)
+ if (super->s_journal_seg[i]) {
+ no_segs--;
+ super->s_no_journal_segs++;
+ }
+
+ /* open segments plus one extra per level for GC */
+ no_segs -= 2 * super->s_total_levels;
+
+ free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
+ free -= super->s_used_bytes;
+ /* just a bit extra */
+ free -= super->s_total_levels * 4096;
+
+ /* Bad blocks are 'paid' for with speed reserve - the filesystem
+ * simply gets slower as bad blocks accumulate. Until the bad blocks
+ * exceed the speed reserve - then the filesystem gets smaller.
+ */
+ reserve = super->s_bad_segments + super->s_bad_seg_reserve;
+ reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
+ reserve = max(reserve, super->s_speed_reserve);
+ free -= reserve;
+ if (free < 0)
+ free = 0;
+
+ super->s_free_bytes = free;
+}
+
+static void reserve_sb_and_journal(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct btree_head32 *head = &super->s_reserved_segments;
+ int i, err;
+
+ err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
+ GFP_KERNEL);
+ BUG_ON(err);
+
+ err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
+ GFP_KERNEL);
+ BUG_ON(err);
+
+ journal_for_each(i) {
+ if (!super->s_journal_seg[i])
+ continue;
+ err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
+ GFP_KERNEL);
+ BUG_ON(err);
+ }
+}
+
+static void read_dynsb(struct super_block *sb,
+ struct logfs_je_dynsb *dynsb)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ super->s_gec = be64_to_cpu(dynsb->ds_gec);
+ super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper);
+ super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino);
+ super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir);
+ super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos);
+ super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes);
+ super->s_generation = be32_to_cpu(dynsb->ds_generation);
+}
+
+static void read_anchor(struct super_block *sb,
+ struct logfs_je_anchor *da)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct inode *inode = super->s_master_inode;
+ struct logfs_inode *li = logfs_inode(inode);
+ int i;
+
+ super->s_last_ino = be64_to_cpu(da->da_last_ino);
+ li->li_flags = 0;
+ li->li_height = da->da_height;
+ i_size_write(inode, be64_to_cpu(da->da_size));
+ li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
+
+ for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+ li->li_data[i] = be64_to_cpu(da->da_data[i]);
+}
+
+static void read_erasecount(struct super_block *sb,
+ struct logfs_je_journal_ec *ec)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i;
+
+ journal_for_each(i)
+ super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
+}
+
+static int read_area(struct super_block *sb, struct logfs_je_area *a)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_area *area = super->s_area[a->gc_level];
+ u64 ofs;
+ u32 writemask = ~(super->s_writesize - 1);
+
+ if (a->gc_level >= LOGFS_NO_AREAS)
+ return -EIO;
+ if (a->vim != VIM_DEFAULT)
+ return -EIO; /* TODO: close area and continue */
+
+ area->a_used_bytes = be32_to_cpu(a->used_bytes);
+ area->a_written_bytes = area->a_used_bytes & writemask;
+ area->a_segno = be32_to_cpu(a->segno);
+ if (area->a_segno)
+ area->a_is_open = 1;
+
+ ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+ if (super->s_writesize > 1)
+ logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
+ else
+ logfs_buf_recover(area, ofs, NULL, 0);
+ return 0;
+}
+
+static void *unpack(void *from, void *to)
+{
+ struct logfs_journal_header *jh = from;
+ void *data = from + sizeof(struct logfs_journal_header);
+ int err;
+ size_t inlen, outlen;
+
+ inlen = be16_to_cpu(jh->h_len);
+ outlen = be16_to_cpu(jh->h_datalen);
+
+ if (jh->h_compr == COMPR_NONE)
+ memcpy(to, data, inlen);
+ else {
+ err = logfs_uncompress(data, to, inlen, outlen);
+ BUG_ON(err);
+ }
+ return to;
+}
+
+static int __read_je_header(struct super_block *sb, u64 ofs,
+ struct logfs_journal_header *jh)
+{
+ struct logfs_super *super = logfs_super(sb);
+ size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
+ + MAX_JOURNAL_HEADER;
+ u16 type, len, datalen;
+ int err;
+
+ /* read header only */
+ err = wbuf_read(sb, ofs, sizeof(*jh), jh);
+ if (err)
+ return err;
+ type = be16_to_cpu(jh->h_type);
+ len = be16_to_cpu(jh->h_len);
+ datalen = be16_to_cpu(jh->h_datalen);
+ if (len > sb->s_blocksize)
+ return -EIO;
+ if ((type < JE_FIRST) || (type > JE_LAST))
+ return -EIO;
+ if (datalen > bufsize)
+ return -EIO;
+ return 0;
+}
+
+static int __read_je_payload(struct super_block *sb, u64 ofs,
+ struct logfs_journal_header *jh)
+{
+ u16 len;
+ int err;
+
+ len = be16_to_cpu(jh->h_len);
+ err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
+ if (err)
+ return err;
+ if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
+ /* Old code was confused. It forgot about the header length
+ * and stopped calculating the crc 16 bytes before the end
+ * of data - ick!
+ * FIXME: Remove this hack once the old code is fixed.
+ */
+ if (jh->h_crc == logfs_crc32(jh, len, 4))
+ WARN_ON_ONCE(1);
+ else
+ return -EIO;
+ }
+ return 0;
+}
+
+/*
+ * jh needs to be large enough to hold the complete entry, not just the header
+ */
+static int __read_je(struct super_block *sb, u64 ofs,
+ struct logfs_journal_header *jh)
+{
+ int err;
+
+ err = __read_je_header(sb, ofs, jh);
+ if (err)
+ return err;
+ return __read_je_payload(sb, ofs, jh);
+}
+
+static int read_je(struct super_block *sb, u64 ofs)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_journal_header *jh = super->s_compressed_je;
+ void *scratch = super->s_je;
+ u16 type, datalen;
+ int err;
+
+ err = __read_je(sb, ofs, jh);
+ if (err)
+ return err;
+ type = be16_to_cpu(jh->h_type);
+ datalen = be16_to_cpu(jh->h_datalen);
+
+ switch (type) {
+ case JE_DYNSB:
+ read_dynsb(sb, unpack(jh, scratch));
+ break;
+ case JE_ANCHOR:
+ read_anchor(sb, unpack(jh, scratch));
+ break;
+ case JE_ERASECOUNT:
+ read_erasecount(sb, unpack(jh, scratch));
+ break;
+ case JE_AREA:
+ read_area(sb, unpack(jh, scratch));
+ break;
+ case JE_OBJ_ALIAS:
+ err = logfs_load_object_aliases(sb, unpack(jh, scratch),
+ datalen);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+ return err;
+}
+
+static int logfs_read_segment(struct super_block *sb, u32 segno)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_journal_header *jh = super->s_compressed_je;
+ u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
+ u32 h_ofs, last_ofs = 0;
+ u16 len, datalen, last_len = 0;
+ int i, err;
+
+ /* search for most recent commit */
+ for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
+ ofs = seg_ofs + h_ofs;
+ err = __read_je_header(sb, ofs, jh);
+ if (err)
+ continue;
+ if (jh->h_type != cpu_to_be16(JE_COMMIT))
+ continue;
+ err = __read_je_payload(sb, ofs, jh);
+ if (err)
+ continue;
+ len = be16_to_cpu(jh->h_len);
+ datalen = be16_to_cpu(jh->h_datalen);
+ if ((datalen > sizeof(super->s_je_array)) ||
+ (datalen % sizeof(__be64)))
+ continue;
+ last_ofs = h_ofs;
+ last_len = datalen;
+ h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
+ }
+ /* read commit */
+ if (last_ofs == 0)
+ return -ENOENT;
+ ofs = seg_ofs + last_ofs;
+ log_journal("Read commit from %llx\n", ofs);
+ err = __read_je(sb, ofs, jh);
+ BUG_ON(err); /* We should have caught it in the scan loop already */
+ if (err)
+ return err;
+ /* uncompress */
+ unpack(jh, super->s_je_array);
+ super->s_no_je = last_len / sizeof(__be64);
+ /* iterate over array */
+ for (i = 0; i < super->s_no_je; i++) {
+ err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
+ if (err)
+ return err;
+ }
+ super->s_journal_area->a_segno = segno;
+ return 0;
+}
+
+static u64 read_gec(struct super_block *sb, u32 segno)
+{
+ struct logfs_segment_header sh;
+ __be32 crc;
+ int err;
+
+ if (!segno)
+ return 0;
+ err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
+ if (err)
+ return 0;
+ crc = logfs_crc32(&sh, sizeof(sh), 4);
+ if (crc != sh.crc) {
+ WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
+ /* Most likely it was just erased */
+ return 0;
+ }
+ return be64_to_cpu(sh.gec);
+}
+
+static int logfs_read_journal(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ u64 gec[LOGFS_JOURNAL_SEGS], max;
+ u32 segno;
+ int i, max_i;
+
+ max = 0;
+ max_i = -1;
+ journal_for_each(i) {
+ segno = super->s_journal_seg[i];
+ gec[i] = read_gec(sb, super->s_journal_seg[i]);
+ if (gec[i] > max) {
+ max = gec[i];
+ max_i = i;
+ }
+ }
+ if (max_i == -1)
+ return -EIO;
+ /* FIXME: Try older segments in case of error */
+ return logfs_read_segment(sb, super->s_journal_seg[max_i]);
+}
+
+/*
+ * First search the current segment (outer loop), then pick the next segment
+ * in the array, skipping any zero entries (inner loop).
+ */
+static void journal_get_free_segment(struct logfs_area *area)
+{
+ struct logfs_super *super = logfs_super(area->a_sb);
+ int i;
+
+ journal_for_each(i) {
+ if (area->a_segno != super->s_journal_seg[i])
+ continue;
+
+ do {
+ i++;
+ if (i == LOGFS_JOURNAL_SEGS)
+ i = 0;
+ } while (!super->s_journal_seg[i]);
+
+ area->a_segno = super->s_journal_seg[i];
+ area->a_erase_count = ++(super->s_journal_ec[i]);
+ log_journal("Journal now at %x (ec %x)\n", area->a_segno,
+ area->a_erase_count);
+ return;
+ }
+ BUG();
+}
+
+static void journal_get_erase_count(struct logfs_area *area)
+{
+ /* erase count is stored globally and incremented in
+ * journal_get_free_segment() - nothing to do here */
+}
+
+static int journal_erase_segment(struct logfs_area *area)
+{
+ struct super_block *sb = area->a_sb;
+ struct logfs_segment_header sh;
+ u64 ofs;
+ int err;
+
+ err = logfs_erase_segment(sb, area->a_segno, 1);
+ if (err)
+ return err;
+
+ sh.pad = 0;
+ sh.type = SEG_JOURNAL;
+ sh.level = 0;
+ sh.segno = cpu_to_be32(area->a_segno);
+ sh.ec = cpu_to_be32(area->a_erase_count);
+ sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
+ sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
+
+ /* This causes a bug in segment.c. Not yet. */
+ //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
+
+ ofs = dev_ofs(sb, area->a_segno, 0);
+ area->a_used_bytes = ALIGN(sizeof(sh), 16);
+ logfs_buf_write(area, ofs, &sh, sizeof(sh));
+ return 0;
+}
+
+static size_t __logfs_write_header(struct logfs_super *super,
+ struct logfs_journal_header *jh, size_t len, size_t datalen,
+ u16 type, u8 compr)
+{
+ jh->h_len = cpu_to_be16(len);
+ jh->h_type = cpu_to_be16(type);
+ jh->h_datalen = cpu_to_be16(datalen);
+ jh->h_compr = compr;
+ jh->h_pad[0] = 'H';
+ jh->h_pad[1] = 'E';
+ jh->h_pad[2] = 'A';
+ jh->h_pad[3] = 'D';
+ jh->h_pad[4] = 'R';
+ jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4);
+ return ALIGN(len, 16) + sizeof(*jh);
+}
+
+static size_t logfs_write_header(struct logfs_super *super,
+ struct logfs_journal_header *jh, size_t datalen, u16 type)
+{
+ size_t len = datalen;
+
+ return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
+}
+
+static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
+{
+ return LOGFS_JOURNAL_SEGS * sizeof(__be32);
+}
+
+static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
+ u16 *type, size_t *len)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_je_journal_ec *ec = _ec;
+ int i;
+
+ journal_for_each(i)
+ ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
+ *type = JE_ERASECOUNT;
+ *len = logfs_journal_erasecount_size(super);
+ return ec;
+}
+
+static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
+ size_t ignore2)
+{
+ struct logfs_shadow *shadow = _shadow;
+ struct super_block *sb = (void *)_sb;
+ struct logfs_super *super = logfs_super(sb);
+
+ /* consume new space */
+ super->s_free_bytes -= shadow->new_len;
+ super->s_used_bytes += shadow->new_len;
+ super->s_dirty_used_bytes -= shadow->new_len;
+
+ /* free up old space */
+ super->s_free_bytes += shadow->old_len;
+ super->s_used_bytes -= shadow->old_len;
+ super->s_dirty_free_bytes -= shadow->old_len;
+
+ logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
+ logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
+
+ log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
+ shadow->ino, shadow->bix, shadow->gc_level,
+ shadow->old_ofs, shadow->new_ofs,
+ shadow->old_len, shadow->new_len);
+ mempool_free(shadow, super->s_shadow_pool);
+}
+
+static void account_shadows(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct inode *inode = super->s_master_inode;
+ struct logfs_inode *li = logfs_inode(inode);
+ struct shadow_tree *tree = &super->s_shadow_tree;
+
+ btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
+ btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
+
+ if (li->li_block) {
+ /*
+ * We never actually use the structure, when attached to the
+ * master inode. But it is easier to always free it here than
+ * to have checks in several places elsewhere when allocating
+ * it.
+ */
+ li->li_block->ops->free_block(sb, li->li_block);
+ }
+ BUG_ON((s64)li->li_used_bytes < 0);
+}
+
+static void *__logfs_write_anchor(struct super_block *sb, void *_da,
+ u16 *type, size_t *len)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_je_anchor *da = _da;
+ struct inode *inode = super->s_master_inode;
+ struct logfs_inode *li = logfs_inode(inode);
+ int i;
+
+ da->da_height = li->li_height;
+ da->da_last_ino = cpu_to_be64(super->s_last_ino);
+ da->da_size = cpu_to_be64(i_size_read(inode));
+ da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
+ for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+ da->da_data[i] = cpu_to_be64(li->li_data[i]);
+ *type = JE_ANCHOR;
+ *len = sizeof(*da);
+ return da;
+}
+
+static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
+ u16 *type, size_t *len)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_je_dynsb *dynsb = _dynsb;
+
+ dynsb->ds_gec = cpu_to_be64(super->s_gec);
+ dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper);
+ dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
+ dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
+ dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
+ dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
+ dynsb->ds_generation = cpu_to_be32(super->s_generation);
+ *type = JE_DYNSB;
+ *len = sizeof(*dynsb);
+ return dynsb;
+}
+
+static void write_wbuf(struct super_block *sb, struct logfs_area *area,
+ void *wbuf)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping = super->s_mapping_inode->i_mapping;
+ u64 ofs;
+ pgoff_t index;
+ int page_ofs;
+ struct page *page;
+
+ ofs = dev_ofs(sb, area->a_segno,
+ area->a_used_bytes & ~(super->s_writesize - 1));
+ index = ofs >> PAGE_SHIFT;
+ page_ofs = ofs & (PAGE_SIZE - 1);
+
+ page = find_lock_page(mapping, index);
+ BUG_ON(!page);
+ memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
+ unlock_page(page);
+}
+
+static void *logfs_write_area(struct super_block *sb, void *_a,
+ u16 *type, size_t *len)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_area *area = super->s_area[super->s_sum_index];
+ struct logfs_je_area *a = _a;
+
+ a->vim = VIM_DEFAULT;
+ a->gc_level = super->s_sum_index;
+ a->used_bytes = cpu_to_be32(area->a_used_bytes);
+ a->segno = cpu_to_be32(area->a_segno);
+ if (super->s_writesize > 1)
+ write_wbuf(sb, area, a + 1);
+
+ *type = JE_AREA;
+ *len = sizeof(*a) + super->s_writesize;
+ return a;
+}
+
+static void *logfs_write_commit(struct super_block *sb, void *h,
+ u16 *type, size_t *len)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ *type = JE_COMMIT;
+ *len = super->s_no_je * sizeof(__be64);
+ return super->s_je_array;
+}
+
+static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
+ size_t len)
+{
+ struct logfs_super *super = logfs_super(sb);
+ void *header = super->s_compressed_je;
+ void *data = header + sizeof(struct logfs_journal_header);
+ ssize_t compr_len, pad_len;
+ u8 compr = COMPR_ZLIB;
+
+ if (len == 0)
+ return logfs_write_header(super, header, 0, type);
+
+ compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
+ if (compr_len < 0 || type == JE_ANCHOR) {
+ BUG_ON(len > sb->s_blocksize);
+ memcpy(data, buf, len);
+ compr_len = len;
+ compr = COMPR_NONE;
+ }
+
+ pad_len = ALIGN(compr_len, 16);
+ memset(data + compr_len, 0, pad_len - compr_len);
+
+ return __logfs_write_header(super, header, compr_len, len, type, compr);
+}
+
+static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
+ int must_pad)
+{
+ u32 writesize = logfs_super(area->a_sb)->s_writesize;
+ s32 ofs;
+ int ret;
+
+ ret = logfs_open_area(area, *bytes);
+ if (ret)
+ return -EAGAIN;
+
+ ofs = area->a_used_bytes;
+ area->a_used_bytes += *bytes;
+
+ if (must_pad) {
+ area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
+ *bytes = area->a_used_bytes - ofs;
+ }
+
+ return dev_ofs(area->a_sb, area->a_segno, ofs);
+}
+
+static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
+ size_t buf_len)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_area *area = super->s_journal_area;
+ struct logfs_journal_header *jh = super->s_compressed_je;
+ size_t len;
+ int must_pad = 0;
+ s64 ofs;
+
+ len = __logfs_write_je(sb, buf, type, buf_len);
+ if (jh->h_type == cpu_to_be16(JE_COMMIT))
+ must_pad = 1;
+
+ ofs = logfs_get_free_bytes(area, &len, must_pad);
+ if (ofs < 0)
+ return ofs;
+ logfs_buf_write(area, ofs, super->s_compressed_je, len);
+ super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
+ return 0;
+}
+
+static int logfs_write_je(struct super_block *sb,
+ void* (*write)(struct super_block *sb, void *scratch,
+ u16 *type, size_t *len))
+{
+ void *buf;
+ size_t len;
+ u16 type;
+
+ buf = write(sb, logfs_super(sb)->s_je, &type, &len);
+ return logfs_write_je_buf(sb, buf, type, len);
+}
+
+int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
+ level_t level, int child_no, __be64 val)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_obj_alias *oa = super->s_je;
+ int err = 0, fill = super->s_je_fill;
+
+ log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
+ fill, ino, bix, level, child_no, be64_to_cpu(val));
+ oa[fill].ino = cpu_to_be64(ino);
+ oa[fill].bix = cpu_to_be64(bix);
+ oa[fill].val = val;
+ oa[fill].level = (__force u8)level;
+ oa[fill].child_no = cpu_to_be16(child_no);
+ fill++;
+ if (fill >= sb->s_blocksize / sizeof(*oa)) {
+ err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
+ fill = 0;
+ }
+
+ super->s_je_fill = fill;
+ return err;
+}
+
+static int logfs_write_obj_aliases(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int err;
+
+ log_journal("logfs_write_obj_aliases: %d aliases to write\n",
+ super->s_no_object_aliases);
+ super->s_je_fill = 0;
+ err = logfs_write_obj_aliases_pagecache(sb);
+ if (err)
+ return err;
+
+ if (super->s_je_fill)
+ err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
+ super->s_je_fill
+ * sizeof(struct logfs_obj_alias));
+ return err;
+}
+
+/*
+ * Write all journal entries. The goto logic ensures that all journal entries
+ * are written whenever a new segment is used. It is ugly and potentially a
+ * bit wasteful, but robustness is more important. With this we can *always*
+ * erase all journal segments except the one containing the most recent commit.
+ */
+void logfs_write_anchor(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_area *area = super->s_journal_area;
+ int i, err;
+
+ if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
+ return;
+ super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
+
+ BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
+ mutex_lock(&super->s_journal_mutex);
+
+ /* Do this first or suffer corruption */
+ logfs_sync_segments(sb);
+ account_shadows(sb);
+
+again:
+ super->s_no_je = 0;
+ for_each_area(i) {
+ if (!super->s_area[i]->a_is_open)
+ continue;
+ super->s_sum_index = i;
+ err = logfs_write_je(sb, logfs_write_area);
+ if (err)
+ goto again;
+ }
+ err = logfs_write_obj_aliases(sb);
+ if (err)
+ goto again;
+ err = logfs_write_je(sb, logfs_write_erasecount);
+ if (err)
+ goto again;
+ err = logfs_write_je(sb, __logfs_write_anchor);
+ if (err)
+ goto again;
+ err = logfs_write_je(sb, logfs_write_dynsb);
+ if (err)
+ goto again;
+ /*
+ * Order is imperative. First we sync all writes, including the
+ * non-committed journal writes. Then we write the final commit and
+ * sync the current journal segment.
+ * There is a theoretical bug here. Syncing the journal segment will
+ * write a number of journal entries and the final commit. All these
+ * are written in a single operation. If the device layer writes the
+ * data back-to-front, the commit will precede the other journal
+ * entries, leaving a race window.
+ * Two fixes are possible. Preferred is to fix the device layer to
+ * ensure writes happen front-to-back. Alternatively we can insert
+ * another logfs_sync_area() super->s_devops->sync() combo before
+ * writing the commit.
+ */
+ /*
+ * On another subject, super->s_devops->sync is usually not necessary.
+ * Unless called from sys_sync or friends, a barrier would suffice.
+ */
+ super->s_devops->sync(sb);
+ err = logfs_write_je(sb, logfs_write_commit);
+ if (err)
+ goto again;
+ log_journal("Write commit to %llx\n",
+ be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
+ logfs_sync_area(area);
+ BUG_ON(area->a_used_bytes != area->a_written_bytes);
+ super->s_devops->sync(sb);
+
+ mutex_unlock(&super->s_journal_mutex);
+ return;
+}
+
+void do_logfs_journal_wl_pass(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_area *area = super->s_journal_area;
+ u32 segno, ec;
+ int i, err;
+
+ log_journal("Journal requires wear-leveling.\n");
+ /* Drop old segments */
+ journal_for_each(i)
+ if (super->s_journal_seg[i]) {
+ logfs_set_segment_unreserved(sb,
+ super->s_journal_seg[i],
+ super->s_journal_ec[i]);
+ super->s_journal_seg[i] = 0;
+ super->s_journal_ec[i] = 0;
+ }
+ /* Get new segments */
+ for (i = 0; i < super->s_no_journal_segs; i++) {
+ segno = get_best_cand(sb, &super->s_reserve_list, &ec);
+ super->s_journal_seg[i] = segno;
+ super->s_journal_ec[i] = ec;
+ logfs_set_segment_reserved(sb, segno);
+ }
+ /* Manually move journal_area */
+ area->a_segno = super->s_journal_seg[0];
+ area->a_is_open = 0;
+ area->a_used_bytes = 0;
+ /* Write journal */
+ logfs_write_anchor(sb);
+ /* Write superblocks */
+ err = logfs_write_sb(sb);
+ BUG_ON(err);
+}
+
+static const struct logfs_area_ops journal_area_ops = {
+ .get_free_segment = journal_get_free_segment,
+ .get_erase_count = journal_get_erase_count,
+ .erase_segment = journal_erase_segment,
+};
+
+int logfs_init_journal(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
+ + MAX_JOURNAL_HEADER;
+ int ret = -ENOMEM;
+
+ mutex_init(&super->s_journal_mutex);
+ btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
+
+ super->s_je = kzalloc(bufsize, GFP_KERNEL);
+ if (!super->s_je)
+ return ret;
+
+ super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
+ if (!super->s_compressed_je)
+ return ret;
+
+ super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
+ if (IS_ERR(super->s_master_inode))
+ return PTR_ERR(super->s_master_inode);
+
+ ret = logfs_read_journal(sb);
+ if (ret)
+ return -EIO;
+
+ reserve_sb_and_journal(sb);
+ logfs_calc_free(sb);
+
+ super->s_journal_area->a_ops = &journal_area_ops;
+ return 0;
+}
+
+void logfs_cleanup_journal(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
+ destroy_meta_inode(super->s_master_inode);
+ super->s_master_inode = NULL;
+
+ kfree(super->s_compressed_je);
+ kfree(super->s_je);
+}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
new file mode 100644
index 00000000000..12977943137
--- /dev/null
+++ b/fs/logfs/logfs.h
@@ -0,0 +1,724 @@
+/*
+ * fs/logfs/logfs.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Private header for logfs.
+ */
+#ifndef FS_LOGFS_LOGFS_H
+#define FS_LOGFS_LOGFS_H
+
+#undef __CHECK_ENDIAN__
+#define __CHECK_ENDIAN__
+
+#include <linux/btree.h>
+#include <linux/crc32.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/mtd.h>
+#include "logfs_abi.h"
+
+#define LOGFS_DEBUG_SUPER (0x0001)
+#define LOGFS_DEBUG_SEGMENT (0x0002)
+#define LOGFS_DEBUG_JOURNAL (0x0004)
+#define LOGFS_DEBUG_DIR (0x0008)
+#define LOGFS_DEBUG_FILE (0x0010)
+#define LOGFS_DEBUG_INODE (0x0020)
+#define LOGFS_DEBUG_READWRITE (0x0040)
+#define LOGFS_DEBUG_GC (0x0080)
+#define LOGFS_DEBUG_GC_NOISY (0x0100)
+#define LOGFS_DEBUG_ALIASES (0x0200)
+#define LOGFS_DEBUG_BLOCKMOVE (0x0400)
+#define LOGFS_DEBUG_ALL (0xffffffff)
+
+#define LOGFS_DEBUG (0x01)
+/*
+ * To enable specific log messages, simply define LOGFS_DEBUG to match any
+ * or all of the above.
+ */
+#ifndef LOGFS_DEBUG
+#define LOGFS_DEBUG (0)
+#endif
+
+#define log_cond(cond, fmt, arg...) do { \
+ if (cond) \
+ printk(KERN_DEBUG fmt, ##arg); \
+} while (0)
+
+#define log_super(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
+#define log_segment(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
+#define log_journal(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
+#define log_dir(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
+#define log_file(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
+#define log_inode(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
+#define log_readwrite(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
+#define log_gc(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
+#define log_gc_noisy(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
+#define log_aliases(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
+#define log_blockmove(fmt, arg...) \
+ log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
+
+#define PG_pre_locked PG_owner_priv_1
+#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags)
+#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags)
+#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
+
+/* FIXME: This should really be somewhere in the 64bit area. */
+#define LOGFS_LINK_MAX (1<<30)
+
+/* Read-only filesystem */
+#define LOGFS_SB_FLAG_RO 0x0001
+#define LOGFS_SB_FLAG_DIRTY 0x0002
+#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
+#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
+
+/* Write Control Flags */
+#define WF_LOCK 0x01 /* take write lock */
+#define WF_WRITE 0x02 /* write block */
+#define WF_DELETE 0x04 /* delete old block */
+
+typedef u8 __bitwise level_t;
+typedef u8 __bitwise gc_level_t;
+
+#define LEVEL(level) ((__force level_t)(level))
+#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
+
+#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
+ (__force level_t)((__force u8)(level) - 1) )
+
+/**
+ * struct logfs_area - area management information
+ *
+ * @a_sb: the superblock this area belongs to
+ * @a_is_open: 1 if the area is currently open, else 0
+ * @a_segno: segment number of area
+ * @a_written_bytes: number of bytes already written back
+ * @a_used_bytes: number of used bytes
+ * @a_ops: area operations (either journal or ostore)
+ * @a_erase_count: erase count
+ * @a_level: GC level
+ */
+struct logfs_area { /* a segment open for writing */
+ struct super_block *a_sb;
+ int a_is_open;
+ u32 a_segno;
+ u32 a_written_bytes;
+ u32 a_used_bytes;
+ const struct logfs_area_ops *a_ops;
+ u32 a_erase_count;
+ gc_level_t a_level;
+};
+
+/**
+ * struct logfs_area_ops - area operations
+ *
+ * @get_free_segment: fill area->ofs with the offset of a free segment
+ * @get_erase_count: fill area->erase_count (needs area->ofs)
+ * @erase_segment: erase and setup segment
+ */
+struct logfs_area_ops {
+ void (*get_free_segment)(struct logfs_area *area);
+ void (*get_erase_count)(struct logfs_area *area);
+ int (*erase_segment)(struct logfs_area *area);
+};
+
+/**
+ * struct logfs_device_ops - device access operations
+ *
+ * @readpage: read one page (mm page)
+ * @writeseg: write one segment. may be a partial segment
+ * @erase: erase one segment
+ * @read: read from the device
+ * @erase: erase part of the device
+ */
+struct logfs_device_ops {
+ struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
+ struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
+ int (*write_sb)(struct super_block *sb, struct page *page);
+ int (*readpage)(void *_sb, struct page *page);
+ void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
+ int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
+ int ensure_write);
+ void (*sync)(struct super_block *sb);
+ void (*put_device)(struct super_block *sb);
+};
+
+/**
+ * struct candidate_list - list of similar candidates
+ */
+struct candidate_list {
+ struct rb_root rb_tree;
+ int count;
+ int maxcount;
+ int sort_by_ec;
+};
+
+/**
+ * struct gc_candidate - "candidate" segment to be garbage collected next
+ *
+ * @list: list (either free of low)
+ * @segno: segment number
+ * @valid: number of valid bytes
+ * @erase_count: erase count of segment
+ * @dist: distance from tree root
+ *
+ * Candidates can be on two lists. The free list contains electees rather
+ * than candidates - segments that no longer contain any valid data. The
+ * low list contains candidates to be picked for GC. It should be kept
+ * short. It is not required to always pick a perfect candidate. In the
+ * worst case GC will have to move more data than absolutely necessary.
+ */
+struct gc_candidate {
+ struct rb_node rb_node;
+ struct candidate_list *list;
+ u32 segno;
+ u32 valid;
+ u32 erase_count;
+ u8 dist;
+};
+
+/**
+ * struct logfs_journal_entry - temporary structure used during journal scan
+ *
+ * @used:
+ * @version: normalized version
+ * @len: length
+ * @offset: offset
+ */
+struct logfs_journal_entry {
+ int used;
+ s16 version;
+ u16 len;
+ u16 datalen;
+ u64 offset;
+};
+
+enum transaction_state {
+ CREATE_1 = 1,
+ CREATE_2,
+ UNLINK_1,
+ UNLINK_2,
+ CROSS_RENAME_1,
+ CROSS_RENAME_2,
+ TARGET_RENAME_1,
+ TARGET_RENAME_2,
+ TARGET_RENAME_3
+};
+
+/**
+ * struct logfs_transaction - essential fields to support atomic dirops
+ *
+ * @ino: target inode
+ * @dir: inode of directory containing dentry
+ * @pos: pos of dentry in directory
+ */
+struct logfs_transaction {
+ enum transaction_state state;
+ u64 ino;
+ u64 dir;
+ u64 pos;
+};
+
+/**
+ * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
+ * @old_ofs: offset of old block on medium
+ * @new_ofs: offset of new block on medium
+ * @ino: inode number
+ * @bix: block index
+ * @old_len: size of old block, including header
+ * @new_len: size of new block, including header
+ * @level: block level
+ */
+struct logfs_shadow {
+ u64 old_ofs;
+ u64 new_ofs;
+ u64 ino;
+ u64 bix;
+ int old_len;
+ int new_len;
+ gc_level_t gc_level;
+};
+
+/**
+ * struct shadow_tree
+ * @new: shadows where old_ofs==0, indexed by new_ofs
+ * @old: shadows where old_ofs!=0, indexed by old_ofs
+ */
+struct shadow_tree {
+ struct btree_head64 new;
+ struct btree_head64 old;
+};
+
+struct object_alias_item {
+ struct list_head list;
+ __be64 val;
+ int child_no;
+};
+
+/**
+ * struct logfs_block - contains any block state
+ * @type: indirect block or inode
+ * @full: number of fully populated children
+ * @partial: number of partially populated children
+ *
+ * Most blocks are directly represented by page cache pages. But when a block
+ * becomes dirty, is part of a transaction, contains aliases or is otherwise
+ * special, a struct logfs_block is allocated to track the additional state.
+ * Inodes are very similar to indirect blocks, so they can also get one of
+ * these structures added when appropriate.
+ */
+#define BLOCK_INDIRECT 1 /* Indirect block */
+#define BLOCK_INODE 2 /* Inode */
+struct logfs_block_ops;
+struct logfs_block {
+ struct list_head alias_list;
+ struct list_head item_list;
+ struct super_block *sb;
+ u64 ino;
+ u64 bix;
+ level_t level;
+ struct page *page;
+ struct inode *inode;
+ struct logfs_transaction *ta;
+ unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
+ struct logfs_block_ops *ops;
+ int full;
+ int partial;
+ int reserved_bytes;
+};
+
+typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
+ level_t level, int child_no, __be64 val);
+struct logfs_block_ops {
+ void (*write_block)(struct logfs_block *block);
+ gc_level_t (*block_level)(struct logfs_block *block);
+ void (*free_block)(struct super_block *sb, struct logfs_block*block);
+ int (*write_alias)(struct super_block *sb,
+ struct logfs_block *block,
+ write_alias_t *write_one_alias);
+};
+
+struct logfs_super {
+ struct mtd_info *s_mtd; /* underlying device */
+ struct block_device *s_bdev; /* underlying device */
+ const struct logfs_device_ops *s_devops;/* device access */
+ struct inode *s_master_inode; /* inode file */
+ struct inode *s_segfile_inode; /* segment file */
+ struct inode *s_mapping_inode; /* device mapping */
+ atomic_t s_pending_writes; /* outstanting bios */
+ long s_flags;
+ mempool_t *s_btree_pool; /* for btree nodes */
+ mempool_t *s_alias_pool; /* aliases in segment.c */
+ u64 s_feature_incompat;
+ u64 s_feature_ro_compat;
+ u64 s_feature_compat;
+ u64 s_feature_flags;
+ u64 s_sb_ofs[2];
+ struct page *s_erase_page; /* for dev_bdev.c */
+ /* alias.c fields */
+ struct btree_head32 s_segment_alias; /* remapped segments */
+ int s_no_object_aliases;
+ struct list_head s_object_alias; /* remapped objects */
+ struct btree_head128 s_object_alias_tree; /* remapped objects */
+ struct mutex s_object_alias_mutex;
+ /* dir.c fields */
+ struct mutex s_dirop_mutex; /* for creat/unlink/rename */
+ u64 s_victim_ino; /* used for atomic dir-ops */
+ u64 s_rename_dir; /* source directory ino */
+ u64 s_rename_pos; /* position of source dd */
+ /* gc.c fields */
+ long s_segsize; /* size of a segment */
+ int s_segshift; /* log2 of segment size */
+ long s_segmask; /* 1 << s_segshift - 1 */
+ long s_no_segs; /* segments on device */
+ long s_no_journal_segs; /* segments used for journal */
+ long s_no_blocks; /* blocks per segment */
+ long s_writesize; /* minimum write size */
+ int s_writeshift; /* log2 of write size */
+ u64 s_size; /* filesystem size */
+ struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
+ u64 s_gec; /* global erase count */
+ u64 s_wl_gec_ostore; /* time of last wl event */
+ u64 s_wl_gec_journal; /* time of last wl event */
+ u64 s_sweeper; /* current sweeper pos */
+ u8 s_ifile_levels; /* max level of ifile */
+ u8 s_iblock_levels; /* max level of regular files */
+ u8 s_data_levels; /* # of segments to leaf block*/
+ u8 s_total_levels; /* sum of above three */
+ struct btree_head32 s_cand_tree; /* all candidates */
+ struct candidate_list s_free_list; /* 100% free segments */
+ struct candidate_list s_reserve_list; /* Bad segment reserve */
+ struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
+ struct candidate_list s_ec_list; /* wear level candidates */
+ struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
+ /* inode.c fields */
+ u64 s_last_ino; /* highest ino used */
+ long s_inos_till_wrap;
+ u32 s_generation; /* i_generation for new files */
+ struct list_head s_freeing_list; /* inodes being freed */
+ /* journal.c fields */
+ struct mutex s_journal_mutex;
+ void *s_je; /* journal entry to compress */
+ void *s_compressed_je; /* block to write to journal */
+ u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
+ u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
+ u64 s_last_version;
+ struct logfs_area *s_journal_area; /* open journal segment */
+ __be64 s_je_array[64];
+ int s_no_je;
+
+ int s_sum_index; /* for the 12 summaries */
+ struct shadow_tree s_shadow_tree;
+ int s_je_fill; /* index of current je */
+ /* readwrite.c fields */
+ struct mutex s_write_mutex;
+ int s_lock_count;
+ mempool_t *s_block_pool; /* struct logfs_block pool */
+ mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
+ /*
+ * Space accounting:
+ * - s_used_bytes specifies space used to store valid data objects.
+ * - s_dirty_used_bytes is space used to store non-committed data
+ * objects. Those objects have already been written themselves,
+ * but they don't become valid until all indirect blocks up to the
+ * journal have been written as well.
+ * - s_dirty_free_bytes is space used to store the old copy of a
+ * replaced object, as long as the replacement is non-committed.
+ * In other words, it is the amount of space freed when all dirty
+ * blocks are written back.
+ * - s_free_bytes is the amount of free space available for any
+ * purpose.
+ * - s_root_reserve is the amount of free space available only to
+ * the root user. Non-privileged users can no longer write once
+ * this watermark has been reached.
+ * - s_speed_reserve is space which remains unused to speed up
+ * garbage collection performance.
+ * - s_dirty_pages is the space reserved for currently dirty pages.
+ * It is a pessimistic estimate, so some/most will get freed on
+ * page writeback.
+ *
+ * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
+ */
+ u64 s_free_bytes;
+ u64 s_used_bytes;
+ u64 s_dirty_free_bytes;
+ u64 s_dirty_used_bytes;
+ u64 s_root_reserve;
+ u64 s_speed_reserve;
+ u64 s_dirty_pages;
+ /* Bad block handling:
+ * - s_bad_seg_reserve is a number of segments usually kept
+ * free. When encountering bad blocks, the affected segment's data
+ * is _temporarily_ moved to a reserved segment.
+ * - s_bad_segments is the number of known bad segments.
+ */
+ u32 s_bad_seg_reserve;
+ u32 s_bad_segments;
+};
+
+/**
+ * struct logfs_inode - in-memory inode
+ *
+ * @vfs_inode: struct inode
+ * @li_data: data pointers
+ * @li_used_bytes: number of used bytes
+ * @li_freeing_list: used to track inodes currently being freed
+ * @li_flags: inode flags
+ * @li_refcount: number of internal (GC-induced) references
+ */
+struct logfs_inode {
+ struct inode vfs_inode;
+ u64 li_data[LOGFS_EMBEDDED_FIELDS];
+ u64 li_used_bytes;
+ struct list_head li_freeing_list;
+ struct logfs_block *li_block;
+ u32 li_flags;
+ u8 li_height;
+ int li_refcount;
+};
+
+#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
+#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
+#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
+
+/* compr.c */
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
+int __init logfs_compr_init(void);
+void logfs_compr_exit(void);
+
+/* dev_bdev.c */
+#ifdef CONFIG_BLOCK
+int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+ const char *devname, struct vfsmount *mnt);
+#else
+static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+ const char *devname, struct vfsmount *mnt)
+{
+ return -ENODEV;
+}
+#endif
+
+/* dev_mtd.c */
+#ifdef CONFIG_MTD
+int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+ int mtdnr, struct vfsmount *mnt);
+#else
+static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+ int mtdnr, struct vfsmount *mnt)
+{
+ return -ENODEV;
+}
+#endif
+
+/* dir.c */
+extern const struct inode_operations logfs_symlink_iops;
+extern const struct inode_operations logfs_dir_iops;
+extern const struct file_operations logfs_dir_fops;
+int logfs_replay_journal(struct super_block *sb);
+
+/* file.c */
+extern const struct inode_operations logfs_reg_iops;
+extern const struct file_operations logfs_reg_fops;
+extern const struct address_space_operations logfs_reg_aops;
+int logfs_readpage(struct file *file, struct page *page);
+int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+ unsigned long arg);
+int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+
+/* gc.c */
+u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
+void logfs_gc_pass(struct super_block *sb);
+int logfs_check_areas(struct super_block *sb);
+int logfs_init_gc(struct super_block *sb);
+void logfs_cleanup_gc(struct super_block *sb);
+
+/* inode.c */
+extern const struct super_operations logfs_super_operations;
+struct inode *logfs_iget(struct super_block *sb, ino_t ino);
+struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
+void logfs_safe_iput(struct inode *inode, int cookie);
+struct inode *logfs_new_inode(struct inode *dir, int mode);
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
+struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
+int logfs_init_inode_cache(void);
+void logfs_destroy_inode_cache(void);
+void destroy_meta_inode(struct inode *inode);
+void logfs_set_blocks(struct inode *inode, u64 no);
+/* these logically belong into inode.c but actually reside in readwrite.c */
+int logfs_read_inode(struct inode *inode);
+int __logfs_write_inode(struct inode *inode, long flags);
+void logfs_delete_inode(struct inode *inode);
+void logfs_clear_inode(struct inode *inode);
+
+/* journal.c */
+void logfs_write_anchor(struct super_block *sb);
+int logfs_init_journal(struct super_block *sb);
+void logfs_cleanup_journal(struct super_block *sb);
+int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
+ level_t level, int child_no, __be64 val);
+void do_logfs_journal_wl_pass(struct super_block *sb);
+
+/* readwrite.c */
+pgoff_t logfs_pack_index(u64 bix, level_t level);
+void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
+int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
+ loff_t bix, long flags, struct shadow_tree *shadow_tree);
+int logfs_readpage_nolock(struct page *page);
+int logfs_write_buf(struct inode *inode, struct page *page, long flags);
+int logfs_delete(struct inode *inode, pgoff_t index,
+ struct shadow_tree *shadow_tree);
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
+ gc_level_t gc_level, long flags);
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
+ gc_level_t gc_level);
+int logfs_truncate(struct inode *inode, u64 size);
+u64 logfs_seek_hole(struct inode *inode, u64 bix);
+u64 logfs_seek_data(struct inode *inode, u64 bix);
+int logfs_open_segfile(struct super_block *sb);
+int logfs_init_rw(struct super_block *sb);
+void logfs_cleanup_rw(struct super_block *sb);
+void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
+void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
+void logfs_write_block(struct logfs_block *block, long flags);
+int logfs_write_obj_aliases_pagecache(struct super_block *sb);
+void logfs_get_segment_entry(struct super_block *sb, u32 segno,
+ struct logfs_segment_entry *se);
+void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
+void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
+ gc_level_t gc_level);
+void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
+void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
+struct logfs_block *__alloc_block(struct super_block *sb,
+ u64 ino, u64 bix, level_t level);
+void __free_block(struct super_block *sb, struct logfs_block *block);
+void btree_write_block(struct logfs_block *block);
+void initialize_block_counters(struct page *page, struct logfs_block *block,
+ __be64 *array, int page_is_empty);
+int logfs_exist_block(struct inode *inode, u64 bix);
+int get_page_reserve(struct inode *inode, struct page *page);
+extern struct logfs_block_ops indirect_block_ops;
+
+/* segment.c */
+int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
+int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
+int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
+ level_t level);
+int logfs_segment_write(struct inode *inode, struct page *page,
+ struct logfs_shadow *shadow);
+int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
+int logfs_load_object_aliases(struct super_block *sb,
+ struct logfs_obj_alias *oa, int count);
+void move_page_to_btree(struct page *page);
+int logfs_init_mapping(struct super_block *sb);
+void logfs_sync_area(struct logfs_area *area);
+void logfs_sync_segments(struct super_block *sb);
+
+/* area handling */
+int logfs_init_areas(struct super_block *sb);
+void logfs_cleanup_areas(struct super_block *sb);
+int logfs_open_area(struct logfs_area *area, size_t bytes);
+void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+ int use_filler);
+
+static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
+ void *buf, size_t len)
+{
+ __logfs_buf_write(area, ofs, buf, len, 0);
+}
+
+static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
+ void *buf, size_t len)
+{
+ __logfs_buf_write(area, ofs, buf, len, 1);
+}
+
+/* super.c */
+struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
+void emergency_read_end(struct page *page);
+void logfs_crash_dump(struct super_block *sb);
+void *memchr_inv(const void *s, int c, size_t n);
+int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
+int logfs_get_sb_device(struct file_system_type *type, int flags,
+ struct mtd_info *mtd, struct block_device *bdev,
+ const struct logfs_device_ops *devops, struct vfsmount *mnt);
+int logfs_check_ds(struct logfs_disk_super *ds);
+int logfs_write_sb(struct super_block *sb);
+
+static inline struct logfs_super *logfs_super(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline struct logfs_inode *logfs_inode(struct inode *inode)
+{
+ return container_of(inode, struct logfs_inode, vfs_inode);
+}
+
+static inline void logfs_set_ro(struct super_block *sb)
+{
+ logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
+}
+
+#define LOGFS_BUG(sb) do { \
+ struct super_block *__sb = sb; \
+ logfs_crash_dump(__sb); \
+ logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \
+ BUG(); \
+} while (0)
+
+#define LOGFS_BUG_ON(condition, sb) \
+ do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
+
+static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
+{
+ return cpu_to_be32(crc32(~0, data+skip, len-skip));
+}
+
+static inline u8 logfs_type(struct inode *inode)
+{
+ return (inode->i_mode >> 12) & 15;
+}
+
+static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
+{
+ return pos >> sb->s_blocksize_bits;
+}
+
+static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
+{
+ return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
+}
+
+static inline u32 seg_no(struct super_block *sb, u64 ofs)
+{
+ return ofs >> logfs_super(sb)->s_segshift;
+}
+
+static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
+{
+ return ofs & logfs_super(sb)->s_segmask;
+}
+
+static inline u64 seg_align(struct super_block *sb, u64 ofs)
+{
+ return ofs & ~logfs_super(sb)->s_segmask;
+}
+
+static inline struct logfs_block *logfs_block(struct page *page)
+{
+ return (void *)page->private;
+}
+
+static inline level_t shrink_level(gc_level_t __level)
+{
+ u8 level = (__force u8)__level;
+
+ if (level >= LOGFS_MAX_LEVELS)
+ level -= LOGFS_MAX_LEVELS;
+ return (__force level_t)level;
+}
+
+static inline gc_level_t expand_level(u64 ino, level_t __level)
+{
+ u8 level = (__force u8)__level;
+
+ if (ino == LOGFS_INO_MASTER) {
+ /* ifile has seperate areas */
+ level += LOGFS_MAX_LEVELS;
+ }
+ return (__force gc_level_t)level;
+}
+
+static inline int logfs_block_shift(struct super_block *sb, level_t level)
+{
+ level = shrink_level((__force gc_level_t)level);
+ return (__force int)level * (sb->s_blocksize_bits - 3);
+}
+
+static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
+{
+ return ~0ull << logfs_block_shift(sb, level);
+}
+
+static inline struct logfs_area *get_area(struct super_block *sb,
+ gc_level_t gc_level)
+{
+ return logfs_super(sb)->s_area[(__force u8)gc_level];
+}
+
+#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
new file mode 100644
index 00000000000..f674725663f
--- /dev/null
+++ b/fs/logfs/logfs_abi.h
@@ -0,0 +1,629 @@
+/*
+ * fs/logfs/logfs_abi.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Public header for logfs.
+ */
+#ifndef FS_LOGFS_LOGFS_ABI_H
+#define FS_LOGFS_LOGFS_ABI_H
+
+/* For out-of-kernel compiles */
+#ifndef BUILD_BUG_ON
+#define BUILD_BUG_ON(condition) /**/
+#endif
+
+#define SIZE_CHECK(type, size) \
+static inline void check_##type(void) \
+{ \
+ BUILD_BUG_ON(sizeof(struct type) != (size)); \
+}
+
+/*
+ * Throughout the logfs code, we're constantly dealing with blocks at
+ * various positions or offsets. To remove confusion, we stricly
+ * distinguish between a "position" - the logical position within a
+ * file and an "offset" - the physical location within the device.
+ *
+ * Any usage of the term offset for a logical location or position for
+ * a physical one is a bug and should get fixed.
+ */
+
+/*
+ * Block are allocated in one of several segments depending on their
+ * level. The following levels are used:
+ * 0 - regular data block
+ * 1 - i1 indirect blocks
+ * 2 - i2 indirect blocks
+ * 3 - i3 indirect blocks
+ * 4 - i4 indirect blocks
+ * 5 - i5 indirect blocks
+ * 6 - ifile data blocks
+ * 7 - ifile i1 indirect blocks
+ * 8 - ifile i2 indirect blocks
+ * 9 - ifile i3 indirect blocks
+ * 10 - ifile i4 indirect blocks
+ * 11 - ifile i5 indirect blocks
+ * Potential levels to be used in the future:
+ * 12 - gc recycled blocks, long-lived data
+ * 13 - replacement blocks, short-lived data
+ *
+ * Levels 1-11 are necessary for robust gc operations and help seperate
+ * short-lived metadata from longer-lived file data. In the future,
+ * file data should get seperated into several segments based on simple
+ * heuristics. Old data recycled during gc operation is expected to be
+ * long-lived. New data is of uncertain life expectancy. New data
+ * used to replace older blocks in existing files is expected to be
+ * short-lived.
+ */
+
+
+/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
+#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
+#define LOGFS_MAGIC_U32 0xc97e8168u
+
+/*
+ * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
+ * Sooner or later that should become configurable and the macros replaced
+ * by something superblock-dependent. Pointers in indirect blocks are and
+ * will remain 64bit.
+ *
+ * LOGFS_BLOCKSIZE - self-explaining
+ * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
+ * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
+ */
+#define LOGFS_BLOCKSIZE (4096ull)
+#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
+#define LOGFS_BLOCK_BITS (9)
+
+/*
+ * Number of blocks at various levels of indirection. There are 16 direct
+ * block pointers plus a single indirect pointer.
+ */
+#define I0_BLOCKS (16)
+#define I1_BLOCKS LOGFS_BLOCK_FACTOR
+#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
+#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
+#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
+#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
+
+#define INDIRECT_INDEX I0_BLOCKS
+#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
+
+/*
+ * Sizes at which files require another level of indirection. Files smaller
+ * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
+ * similar like ext2 fast symlinks.
+ *
+ * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
+ * direct pointers, else through the 1x indirect pointer and so forth.
+ */
+#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
+#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
+
+/*
+ * Each indirect block pointer must have this flag set, if all block pointers
+ * behind it are set, i.e. there is no hole hidden in the shadow of this
+ * indirect block pointer.
+ */
+#define LOGFS_FULLY_POPULATED (1ULL << 63)
+#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
+
+/*
+ * LogFS needs to seperate data into levels. Each level is defined as the
+ * maximal possible distance from the master inode (inode of the inode file).
+ * Data blocks reside on level 0, 1x indirect block on level 1, etc.
+ * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
+ * This effort is necessary to guarantee garbage collection to always make
+ * progress.
+ *
+ * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
+ * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
+ * the maximal number of levels for one file.
+ * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
+ * effectively stacked on top of each other.
+ */
+#define LOGFS_MAX_INDIRECT (5)
+#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
+#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
+
+/* Maximum size of filenames */
+#define LOGFS_MAX_NAMELEN (255)
+
+/* Number of segments in the primary journal. */
+#define LOGFS_JOURNAL_SEGS (16)
+
+/* Maximum number of free/erased/etc. segments in journal entries */
+#define MAX_CACHED_SEGS (64)
+
+
+/*
+ * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
+ * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
+ * its header,
+ * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
+ * its segment header and the padded space at the end when no further objects
+ * fit.
+ */
+#define LOGFS_OBJECT_HEADERSIZE (0x1c)
+#define LOGFS_SEGMENT_HEADERSIZE (0x18)
+#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
+#define LOGFS_SEGMENT_RESERVE \
+ (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
+
+/*
+ * Segment types:
+ * SEG_SUPER - Data or indirect block
+ * SEG_JOURNAL - Inode
+ * SEG_OSTORE - Dentry
+ */
+enum {
+ SEG_SUPER = 0x01,
+ SEG_JOURNAL = 0x02,
+ SEG_OSTORE = 0x03,
+};
+
+/**
+ * struct logfs_segment_header - per-segment header in the ostore
+ *
+ * @crc: crc32 of header (there is no data)
+ * @pad: unused, must be 0
+ * @type: segment type, see above
+ * @level: GC level for all objects in this segment
+ * @segno: segment number
+ * @ec: erase count for this segment
+ * @gec: global erase count at time of writing
+ */
+struct logfs_segment_header {
+ __be32 crc;
+ __be16 pad;
+ __u8 type;
+ __u8 level;
+ __be32 segno;
+ __be32 ec;
+ __be64 gec;
+};
+
+SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
+
+#define LOGFS_FEATURES_INCOMPAT (0ull)
+#define LOGFS_FEATURES_RO_COMPAT (0ull)
+#define LOGFS_FEATURES_COMPAT (0ull)
+
+/**
+ * struct logfs_disk_super - on-medium superblock
+ *
+ * @ds_magic: magic number, must equal LOGFS_MAGIC
+ * @ds_crc: crc32 of structure starting with the next field
+ * @ds_ifile_levels: maximum number of levels for ifile
+ * @ds_iblock_levels: maximum number of levels for regular files
+ * @ds_data_levels: number of seperate levels for data
+ * @pad0: reserved, must be 0
+ * @ds_feature_incompat: incompatible filesystem features
+ * @ds_feature_ro_compat: read-only compatible filesystem features
+ * @ds_feature_compat: compatible filesystem features
+ * @ds_flags: flags
+ * @ds_segment_shift: log2 of segment size
+ * @ds_block_shift: log2 of block size
+ * @ds_write_shift: log2 of write size
+ * @pad1: reserved, must be 0
+ * @ds_journal_seg: segments used by primary journal
+ * @ds_root_reserve: bytes reserved for the superuser
+ * @ds_speed_reserve: bytes reserved to speed up GC
+ * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
+ * @pad2: reserved, must be 0
+ * @pad3: reserved, must be 0
+ *
+ * Contains only read-only fields. Read-write fields like the amount of used
+ * space is tracked in the dynamic superblock, which is stored in the journal.
+ */
+struct logfs_disk_super {
+ struct logfs_segment_header ds_sh;
+ __be64 ds_magic;
+
+ __be32 ds_crc;
+ __u8 ds_ifile_levels;
+ __u8 ds_iblock_levels;
+ __u8 ds_data_levels;
+ __u8 ds_segment_shift;
+ __u8 ds_block_shift;
+ __u8 ds_write_shift;
+ __u8 pad0[6];
+
+ __be64 ds_filesystem_size;
+ __be32 ds_segment_size;
+ __be32 ds_bad_seg_reserve;
+
+ __be64 ds_feature_incompat;
+ __be64 ds_feature_ro_compat;
+
+ __be64 ds_feature_compat;
+ __be64 ds_feature_flags;
+
+ __be64 ds_root_reserve;
+ __be64 ds_speed_reserve;
+
+ __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS];
+
+ __be64 ds_super_ofs[2];
+ __be64 pad3[8];
+};
+
+SIZE_CHECK(logfs_disk_super, 256);
+
+/*
+ * Object types:
+ * OBJ_BLOCK - Data or indirect block
+ * OBJ_INODE - Inode
+ * OBJ_DENTRY - Dentry
+ */
+enum {
+ OBJ_BLOCK = 0x04,
+ OBJ_INODE = 0x05,
+ OBJ_DENTRY = 0x06,
+};
+
+/**
+ * struct logfs_object_header - per-object header in the ostore
+ *
+ * @crc: crc32 of header, excluding data_crc
+ * @len: length of data
+ * @type: object type, see above
+ * @compr: compression type
+ * @ino: inode number
+ * @bix: block index
+ * @data_crc: crc32 of payload
+ */
+struct logfs_object_header {
+ __be32 crc;
+ __be16 len;
+ __u8 type;
+ __u8 compr;
+ __be64 ino;
+ __be64 bix;
+ __be32 data_crc;
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
+
+/*
+ * Reserved inode numbers:
+ * LOGFS_INO_MASTER - master inode (for inode file)
+ * LOGFS_INO_ROOT - root directory
+ * LOGFS_INO_SEGFILE - per-segment used bytes and erase count
+ */
+enum {
+ LOGFS_INO_MAPPING = 0x00,
+ LOGFS_INO_MASTER = 0x01,
+ LOGFS_INO_ROOT = 0x02,
+ LOGFS_INO_SEGFILE = 0x03,
+ LOGFS_RESERVED_INOS = 0x10,
+};
+
+/*
+ * Inode flags. High bits should never be written to the medium. They are
+ * reserved for in-memory usage.
+ * Low bits should either remain in sync with the corresponding FS_*_FL or
+ * reuse slots that obviously don't make sense for logfs.
+ *
+ * LOGFS_IF_DIRTY Inode must be written back
+ * LOGFS_IF_ZOMBIE Inode has been deleted
+ * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
+ */
+#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
+#define LOGFS_IF_DIRTY 0x20000000
+#define LOGFS_IF_ZOMBIE 0x40000000
+#define LOGFS_IF_STILLBORN 0x80000000
+
+/* Flags available to chattr */
+#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
+#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
+/* Flags inherited from parent directory on file/directory creation */
+#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
+
+/**
+ * struct logfs_disk_inode - on-medium inode
+ *
+ * @di_mode: file mode
+ * @di_pad: reserved, must be 0
+ * @di_flags: inode flags, see above
+ * @di_uid: user id
+ * @di_gid: group id
+ * @di_ctime: change time
+ * @di_mtime: modify time
+ * @di_refcount: reference count (aka nlink or link count)
+ * @di_generation: inode generation, for nfs
+ * @di_used_bytes: number of bytes used
+ * @di_size: file size
+ * @di_data: data pointers
+ */
+struct logfs_disk_inode {
+ __be16 di_mode;
+ __u8 di_height;
+ __u8 di_pad;
+ __be32 di_flags;
+ __be32 di_uid;
+ __be32 di_gid;
+
+ __be64 di_ctime;
+ __be64 di_mtime;
+
+ __be64 di_atime;
+ __be32 di_refcount;
+ __be32 di_generation;
+
+ __be64 di_used_bytes;
+ __be64 di_size;
+
+ __be64 di_data[LOGFS_EMBEDDED_FIELDS];
+};
+
+SIZE_CHECK(logfs_disk_inode, 200);
+
+#define INODE_POINTER_OFS \
+ (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
+#define INODE_USED_OFS \
+ (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
+#define INODE_SIZE_OFS \
+ (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
+#define INODE_HEIGHT_OFS (0)
+
+/**
+ * struct logfs_disk_dentry - on-medium dentry structure
+ *
+ * @ino: inode number
+ * @namelen: length of file name
+ * @type: file type, identical to bits 12..15 of mode
+ * @name: file name
+ */
+/* FIXME: add 6 bytes of padding to remove the __packed */
+struct logfs_disk_dentry {
+ __be64 ino;
+ __be16 namelen;
+ __u8 type;
+ __u8 name[LOGFS_MAX_NAMELEN];
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_disk_dentry, 266);
+
+#define RESERVED 0xffffffff
+#define BADSEG 0xffffffff
+/**
+ * struct logfs_segment_entry - segment file entry
+ *
+ * @ec_level: erase count and level
+ * @valid: number of valid bytes
+ *
+ * Segment file contains one entry for every segment. ec_level contains the
+ * erasecount in the upper 28 bits and the level in the lower 4 bits. An
+ * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
+ * of valid bytes or RESERVED (-1 again) if the segment is used for either the
+ * superblock or the journal, or when the segment is bad.
+ */
+struct logfs_segment_entry {
+ __be32 ec_level;
+ __be32 valid;
+};
+
+SIZE_CHECK(logfs_segment_entry, 8);
+
+/**
+ * struct logfs_journal_header - header for journal entries (JEs)
+ *
+ * @h_crc: crc32 of journal entry
+ * @h_len: length of compressed journal entry,
+ * not including header
+ * @h_datalen: length of uncompressed data
+ * @h_type: JE type
+ * @h_compr: compression type
+ * @h_pad: reserved
+ */
+struct logfs_journal_header {
+ __be32 h_crc;
+ __be16 h_len;
+ __be16 h_datalen;
+ __be16 h_type;
+ __u8 h_compr;
+ __u8 h_pad[5];
+};
+
+SIZE_CHECK(logfs_journal_header, 16);
+
+/*
+ * Life expectency of data.
+ * VIM_DEFAULT - default vim
+ * VIM_SEGFILE - for segment file only - very short-living
+ * VIM_GC - GC'd data - likely long-living
+ */
+enum logfs_vim {
+ VIM_DEFAULT = 0,
+ VIM_SEGFILE = 1,
+};
+
+/**
+ * struct logfs_je_area - wbuf header
+ *
+ * @segno: segment number of area
+ * @used_bytes: number of bytes already used
+ * @gc_level: GC level
+ * @vim: life expectancy of data
+ *
+ * "Areas" are segments currently being used for writing. There is at least
+ * one area per GC level. Several may be used to seperate long-living from
+ * short-living data. If an area with unknown vim is encountered, it can
+ * simply be closed.
+ * The write buffer immediately follow this header.
+ */
+struct logfs_je_area {
+ __be32 segno;
+ __be32 used_bytes;
+ __u8 gc_level;
+ __u8 vim;
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_je_area, 10);
+
+#define MAX_JOURNAL_HEADER \
+ (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
+
+/**
+ * struct logfs_je_dynsb - dynamic superblock
+ *
+ * @ds_gec: global erase count
+ * @ds_sweeper: current position of GC "sweeper"
+ * @ds_rename_dir: source directory ino (see dir.c documentation)
+ * @ds_rename_pos: position of source dd (see dir.c documentation)
+ * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
+ * @ds_victim_ino: parent inode of victim (see dir.c)
+ * @ds_used_bytes: number of used bytes
+ */
+struct logfs_je_dynsb {
+ __be64 ds_gec;
+ __be64 ds_sweeper;
+
+ __be64 ds_rename_dir;
+ __be64 ds_rename_pos;
+
+ __be64 ds_victim_ino;
+ __be64 ds_victim_parent; /* XXX */
+
+ __be64 ds_used_bytes;
+ __be32 ds_generation;
+ __be32 pad;
+};
+
+SIZE_CHECK(logfs_je_dynsb, 64);
+
+/**
+ * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
+ *
+ * @da_size: size of inode file
+ * @da_last_ino: last created inode
+ * @da_used_bytes: number of bytes used
+ * @da_data: data pointers
+ */
+struct logfs_je_anchor {
+ __be64 da_size;
+ __be64 da_last_ino;
+
+ __be64 da_used_bytes;
+ u8 da_height;
+ u8 pad[7];
+
+ __be64 da_data[LOGFS_EMBEDDED_FIELDS];
+};
+
+SIZE_CHECK(logfs_je_anchor, 168);
+
+/**
+ * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
+ *
+ * @so_segment: segments used for 2nd journal
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_spillout {
+ __be64 so_segment[0];
+};
+
+SIZE_CHECK(logfs_je_spillout, 0);
+
+/**
+ * struct logfs_je_journal_ec - erase counts for all journal segments
+ *
+ * @ec: erase count
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_journal_ec {
+ __be32 ec[0];
+};
+
+SIZE_CHECK(logfs_je_journal_ec, 0);
+
+/**
+ * struct logfs_je_free_segments - list of free segmetns with erase count
+ */
+struct logfs_je_free_segments {
+ __be32 segno;
+ __be32 ec;
+};
+
+SIZE_CHECK(logfs_je_free_segments, 8);
+
+/**
+ * struct logfs_seg_alias - list of segment aliases
+ */
+struct logfs_seg_alias {
+ __be32 old_segno;
+ __be32 new_segno;
+};
+
+SIZE_CHECK(logfs_seg_alias, 8);
+
+/**
+ * struct logfs_obj_alias - list of object aliases
+ */
+struct logfs_obj_alias {
+ __be64 ino;
+ __be64 bix;
+ __be64 val;
+ u8 level;
+ u8 pad[5];
+ __be16 child_no;
+};
+
+SIZE_CHECK(logfs_obj_alias, 32);
+
+/**
+ * Compression types.
+ *
+ * COMPR_NONE - uncompressed
+ * COMPR_ZLIB - compressed with zlib
+ */
+enum {
+ COMPR_NONE = 0,
+ COMPR_ZLIB = 1,
+};
+
+/*
+ * Journal entries come in groups of 16. First group contains unique
+ * entries, next groups contain one entry per level
+ *
+ * JE_FIRST - smallest possible journal entry number
+ *
+ * JEG_BASE - base group, containing unique entries
+ * JE_COMMIT - commit entry, validates all previous entries
+ * JE_DYNSB - dynamic superblock, anything that ought to be in the
+ * superblock but cannot because it is read-write data
+ * JE_ANCHOR - anchor aka master inode aka inode file's inode
+ * JE_ERASECOUNT erasecounts for all journal segments
+ * JE_SPILLOUT - unused
+ * JE_SEG_ALIAS - aliases segments
+ * JE_AREA - area description
+ *
+ * JE_LAST - largest possible journal entry number
+ */
+enum {
+ JE_FIRST = 0x01,
+
+ JEG_BASE = 0x00,
+ JE_COMMIT = 0x02,
+ JE_DYNSB = 0x03,
+ JE_ANCHOR = 0x04,
+ JE_ERASECOUNT = 0x05,
+ JE_SPILLOUT = 0x06,
+ JE_OBJ_ALIAS = 0x0d,
+ JE_AREA = 0x0e,
+
+ JE_LAST = 0x0e,
+};
+
+#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 00000000000..7a23b3e7c0a
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2246 @@
+/*
+ * fs/logfs/readwrite.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ *
+ * Actually contains five sets of very similar functions:
+ * read read blocks from a file
+ * seek_hole find next hole
+ * seek_data find next data block
+ * valid check whether a block still belongs to a file
+ * write write blocks to a file
+ * delete delete a block (for directories and ifile)
+ * rewrite move existing blocks of a file to a new location (gc helper)
+ * truncate truncate a file
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+
+static u64 adjust_bix(u64 bix, level_t level)
+{
+ switch (level) {
+ case 0:
+ return bix;
+ case LEVEL(1):
+ return max_t(u64, bix, I0_BLOCKS);
+ case LEVEL(2):
+ return max_t(u64, bix, I1_BLOCKS);
+ case LEVEL(3):
+ return max_t(u64, bix, I2_BLOCKS);
+ case LEVEL(4):
+ return max_t(u64, bix, I3_BLOCKS);
+ case LEVEL(5):
+ return max_t(u64, bix, I4_BLOCKS);
+ default:
+ WARN_ON(1);
+ return bix;
+ }
+}
+
+static inline u64 maxbix(u8 height)
+{
+ return 1ULL << (LOGFS_BLOCK_BITS * height);
+}
+
+/**
+ * The inode address space is cut in two halves. Lower half belongs to data
+ * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
+ * set, the actual block index (bix) and level can be derived from the page
+ * index.
+ *
+ * The lowest three bits of the block index are set to 0 after packing and
+ * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored
+ * anyway this is harmless.
+ */
+#define ARCH_SHIFT (BITS_PER_LONG - 32)
+#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT)
+#define LEVEL_SHIFT (28 + ARCH_SHIFT)
+static inline pgoff_t first_indirect_block(void)
+{
+ return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
+}
+
+pgoff_t logfs_pack_index(u64 bix, level_t level)
+{
+ pgoff_t index;
+
+ BUG_ON(bix >= INDIRECT_BIT);
+ if (level == 0)
+ return bix;
+
+ index = INDIRECT_BIT;
+ index |= (__force long)level << LEVEL_SHIFT;
+ index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
+ return index;
+}
+
+void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
+{
+ u8 __level;
+
+ if (!(index & INDIRECT_BIT)) {
+ *bix = index;
+ *level = 0;
+ return;
+ }
+
+ __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
+ *level = LEVEL(__level);
+ *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
+ *bix = adjust_bix(*bix, *level);
+ return;
+}
+#undef ARCH_SHIFT
+#undef INDIRECT_BIT
+#undef LEVEL_SHIFT
+
+/*
+ * Time is stored as nanoseconds since the epoch.
+ */
+static struct timespec be64_to_timespec(__be64 betime)
+{
+ return ns_to_timespec(be64_to_cpu(betime));
+}
+
+static __be64 timespec_to_be64(struct timespec tsp)
+{
+ return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
+}
+
+static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ int i;
+
+ inode->i_mode = be16_to_cpu(di->di_mode);
+ li->li_height = di->di_height;
+ li->li_flags = be32_to_cpu(di->di_flags);
+ inode->i_uid = be32_to_cpu(di->di_uid);
+ inode->i_gid = be32_to_cpu(di->di_gid);
+ inode->i_size = be64_to_cpu(di->di_size);
+ logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
+ inode->i_atime = be64_to_timespec(di->di_atime);
+ inode->i_ctime = be64_to_timespec(di->di_ctime);
+ inode->i_mtime = be64_to_timespec(di->di_mtime);
+ inode->i_nlink = be32_to_cpu(di->di_refcount);
+ inode->i_generation = be32_to_cpu(di->di_generation);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFSOCK: /* fall through */
+ case S_IFBLK: /* fall through */
+ case S_IFCHR: /* fall through */
+ case S_IFIFO:
+ inode->i_rdev = be64_to_cpu(di->di_data[0]);
+ break;
+ case S_IFDIR: /* fall through */
+ case S_IFREG: /* fall through */
+ case S_IFLNK:
+ for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+ li->li_data[i] = be64_to_cpu(di->di_data[i]);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ int i;
+
+ di->di_mode = cpu_to_be16(inode->i_mode);
+ di->di_height = li->li_height;
+ di->di_pad = 0;
+ di->di_flags = cpu_to_be32(li->li_flags);
+ di->di_uid = cpu_to_be32(inode->i_uid);
+ di->di_gid = cpu_to_be32(inode->i_gid);
+ di->di_size = cpu_to_be64(i_size_read(inode));
+ di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
+ di->di_atime = timespec_to_be64(inode->i_atime);
+ di->di_ctime = timespec_to_be64(inode->i_ctime);
+ di->di_mtime = timespec_to_be64(inode->i_mtime);
+ di->di_refcount = cpu_to_be32(inode->i_nlink);
+ di->di_generation = cpu_to_be32(inode->i_generation);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFSOCK: /* fall through */
+ case S_IFBLK: /* fall through */
+ case S_IFCHR: /* fall through */
+ case S_IFIFO:
+ di->di_data[0] = cpu_to_be64(inode->i_rdev);
+ break;
+ case S_IFDIR: /* fall through */
+ case S_IFREG: /* fall through */
+ case S_IFLNK:
+ for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+ di->di_data[i] = cpu_to_be64(li->li_data[i]);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void __logfs_set_blocks(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct logfs_inode *li = logfs_inode(inode);
+
+ inode->i_blocks = ULONG_MAX;
+ if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
+ inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
+}
+
+void logfs_set_blocks(struct inode *inode, u64 bytes)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+
+ li->li_used_bytes = bytes;
+ __logfs_set_blocks(inode);
+}
+
+static void prelock_page(struct super_block *sb, struct page *page, int lock)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ BUG_ON(!PageLocked(page));
+ if (lock) {
+ BUG_ON(PagePreLocked(page));
+ SetPagePreLocked(page);
+ } else {
+ /* We are in GC path. */
+ if (PagePreLocked(page))
+ super->s_lock_count++;
+ else
+ SetPagePreLocked(page);
+ }
+}
+
+static void preunlock_page(struct super_block *sb, struct page *page, int lock)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ BUG_ON(!PageLocked(page));
+ if (lock)
+ ClearPagePreLocked(page);
+ else {
+ /* We are in GC path. */
+ BUG_ON(!PagePreLocked(page));
+ if (super->s_lock_count)
+ super->s_lock_count--;
+ else
+ ClearPagePreLocked(page);
+ }
+}
+
+/*
+ * Logfs is prone to an AB-BA deadlock where one task tries to acquire
+ * s_write_mutex with a locked page and GC tries to get that page while holding
+ * s_write_mutex.
+ * To solve this issue logfs will ignore the page lock iff the page in question
+ * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
+ * in addition to PG_locked.
+ */
+static void logfs_get_wblocks(struct super_block *sb, struct page *page,
+ int lock)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ if (page)
+ prelock_page(sb, page, lock);
+
+ if (lock) {
+ mutex_lock(&super->s_write_mutex);
+ logfs_gc_pass(sb);
+ /* FIXME: We also have to check for shadowed space
+ * and mempool fill grade */
+ }
+}
+
+static void logfs_put_wblocks(struct super_block *sb, struct page *page,
+ int lock)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ if (page)
+ preunlock_page(sb, page, lock);
+ /* Order matters - we must clear PG_pre_locked before releasing
+ * s_write_mutex or we could race against another task. */
+ if (lock)
+ mutex_unlock(&super->s_write_mutex);
+}
+
+static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
+ level_t level)
+{
+ return find_or_create_page(inode->i_mapping,
+ logfs_pack_index(bix, level), GFP_NOFS);
+}
+
+static void logfs_put_read_page(struct page *page)
+{
+ unlock_page(page);
+ page_cache_release(page);
+}
+
+static void logfs_lock_write_page(struct page *page)
+{
+ int loop = 0;
+
+ while (unlikely(!trylock_page(page))) {
+ if (loop++ > 0x1000) {
+ /* Has been observed once so far... */
+ printk(KERN_ERR "stack at %p\n", &loop);
+ BUG();
+ }
+ if (PagePreLocked(page)) {
+ /* Holder of page lock is waiting for us, it
+ * is safe to use this page. */
+ break;
+ }
+ /* Some other process has this page locked and has
+ * nothing to do with us. Wait for it to finish.
+ */
+ schedule();
+ }
+ BUG_ON(!PageLocked(page));
+}
+
+static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
+ level_t level)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index = logfs_pack_index(bix, level);
+ struct page *page;
+ int err;
+
+repeat:
+ page = find_get_page(mapping, index);
+ if (!page) {
+ page = __page_cache_alloc(GFP_NOFS);
+ if (!page)
+ return NULL;
+ err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
+ if (unlikely(err)) {
+ page_cache_release(page);
+ if (err == -EEXIST)
+ goto repeat;
+ return NULL;
+ }
+ } else logfs_lock_write_page(page);
+ BUG_ON(!PageLocked(page));
+ return page;
+}
+
+static void logfs_unlock_write_page(struct page *page)
+{
+ if (!PagePreLocked(page))
+ unlock_page(page);
+}
+
+static void logfs_put_write_page(struct page *page)
+{
+ logfs_unlock_write_page(page);
+ page_cache_release(page);
+}
+
+static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
+ int rw)
+{
+ if (rw == READ)
+ return logfs_get_read_page(inode, bix, level);
+ else
+ return logfs_get_write_page(inode, bix, level);
+}
+
+static void logfs_put_page(struct page *page, int rw)
+{
+ if (rw == READ)
+ logfs_put_read_page(page);
+ else
+ logfs_put_write_page(page);
+}
+
+static unsigned long __get_bits(u64 val, int skip, int no)
+{
+ u64 ret = val;
+
+ ret >>= skip * no;
+ ret <<= 64 - no;
+ ret >>= 64 - no;
+ return ret;
+}
+
+static unsigned long get_bits(u64 val, level_t skip)
+{
+ return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
+}
+
+static inline void init_shadow_tree(struct super_block *sb,
+ struct shadow_tree *tree)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ btree_init_mempool64(&tree->new, super->s_btree_pool);
+ btree_init_mempool64(&tree->old, super->s_btree_pool);
+}
+
+static void indirect_write_block(struct logfs_block *block)
+{
+ struct page *page;
+ struct inode *inode;
+ int ret;
+
+ page = block->page;
+ inode = page->mapping->host;
+ logfs_lock_write_page(page);
+ ret = logfs_write_buf(inode, page, 0);
+ logfs_unlock_write_page(page);
+ /*
+ * This needs some rework. Unless you want your filesystem to run
+ * completely synchronously (you don't), the filesystem will always
+ * report writes as 'successful' before the actual work has been
+ * done. The actual work gets done here and this is where any errors
+ * will show up. And there isn't much we can do about it, really.
+ *
+ * Some attempts to fix the errors (move from bad blocks, retry io,...)
+ * have already been done, so anything left should be either a broken
+ * device or a bug somewhere in logfs itself. Being relatively new,
+ * the odds currently favor a bug, so for now the line below isn't
+ * entirely tasteles.
+ */
+ BUG_ON(ret);
+}
+
+static void inode_write_block(struct logfs_block *block)
+{
+ struct inode *inode;
+ int ret;
+
+ inode = block->inode;
+ if (inode->i_ino == LOGFS_INO_MASTER)
+ logfs_write_anchor(inode->i_sb);
+ else {
+ ret = __logfs_write_inode(inode, 0);
+ /* see indirect_write_block comment */
+ BUG_ON(ret);
+ }
+}
+
+static gc_level_t inode_block_level(struct logfs_block *block)
+{
+ BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
+ return GC_LEVEL(LOGFS_MAX_LEVELS);
+}
+
+static gc_level_t indirect_block_level(struct logfs_block *block)
+{
+ struct page *page;
+ struct inode *inode;
+ u64 bix;
+ level_t level;
+
+ page = block->page;
+ inode = page->mapping->host;
+ logfs_unpack_index(page->index, &bix, &level);
+ return expand_level(inode->i_ino, level);
+}
+
+/*
+ * This silences a false, yet annoying gcc warning. I hate it when my editor
+ * jumps into bitops.h each time I recompile this file.
+ * TODO: Complain to gcc folks about this and upgrade compiler.
+ */
+static unsigned long fnb(const unsigned long *addr,
+ unsigned long size, unsigned long offset)
+{
+ return find_next_bit(addr, size, offset);
+}
+
+static __be64 inode_val0(struct inode *inode)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ u64 val;
+
+ /*
+ * Explicit shifting generates good code, but must match the format
+ * of the structure. Add some paranoia just in case.
+ */
+ BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
+ BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
+ BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
+
+ val = (u64)inode->i_mode << 48 |
+ (u64)li->li_height << 40 |
+ (u64)li->li_flags;
+ return cpu_to_be64(val);
+}
+
+static int inode_write_alias(struct super_block *sb,
+ struct logfs_block *block, write_alias_t *write_one_alias)
+{
+ struct inode *inode = block->inode;
+ struct logfs_inode *li = logfs_inode(inode);
+ unsigned long pos;
+ u64 ino , bix;
+ __be64 val;
+ level_t level;
+ int err;
+
+ for (pos = 0; ; pos++) {
+ pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+ if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
+ return 0;
+
+ switch (pos) {
+ case INODE_HEIGHT_OFS:
+ val = inode_val0(inode);
+ break;
+ case INODE_USED_OFS:
+ val = cpu_to_be64(li->li_used_bytes);;
+ break;
+ case INODE_SIZE_OFS:
+ val = cpu_to_be64(i_size_read(inode));
+ break;
+ case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
+ val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
+ break;
+ default:
+ BUG();
+ }
+
+ ino = LOGFS_INO_MASTER;
+ bix = inode->i_ino;
+ level = LEVEL(0);
+ err = write_one_alias(sb, ino, bix, level, pos, val);
+ if (err)
+ return err;
+ }
+}
+
+static int indirect_write_alias(struct super_block *sb,
+ struct logfs_block *block, write_alias_t *write_one_alias)
+{
+ unsigned long pos;
+ struct page *page = block->page;
+ u64 ino , bix;
+ __be64 *child, val;
+ level_t level;
+ int err;
+
+ for (pos = 0; ; pos++) {
+ pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+ if (pos >= LOGFS_BLOCK_FACTOR)
+ return 0;
+
+ ino = page->mapping->host->i_ino;
+ logfs_unpack_index(page->index, &bix, &level);
+ child = kmap_atomic(page, KM_USER0);
+ val = child[pos];
+ kunmap_atomic(child, KM_USER0);
+ err = write_one_alias(sb, ino, bix, level, pos, val);
+ if (err)
+ return err;
+ }
+}
+
+int logfs_write_obj_aliases_pagecache(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_block *block;
+ int err;
+
+ list_for_each_entry(block, &super->s_object_alias, alias_list) {
+ err = block->ops->write_alias(sb, block, write_alias_journal);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+void __free_block(struct super_block *sb, struct logfs_block *block)
+{
+ BUG_ON(!list_empty(&block->item_list));
+ list_del(&block->alias_list);
+ mempool_free(block, logfs_super(sb)->s_block_pool);
+}
+
+static void inode_free_block(struct super_block *sb, struct logfs_block *block)
+{
+ struct inode *inode = block->inode;
+
+ logfs_inode(inode)->li_block = NULL;
+ __free_block(sb, block);
+}
+
+static void indirect_free_block(struct super_block *sb,
+ struct logfs_block *block)
+{
+ ClearPagePrivate(block->page);
+ block->page->private = 0;
+ __free_block(sb, block);
+}
+
+
+static struct logfs_block_ops inode_block_ops = {
+ .write_block = inode_write_block,
+ .block_level = inode_block_level,
+ .free_block = inode_free_block,
+ .write_alias = inode_write_alias,
+};
+
+struct logfs_block_ops indirect_block_ops = {
+ .write_block = indirect_write_block,
+ .block_level = indirect_block_level,
+ .free_block = indirect_free_block,
+ .write_alias = indirect_write_alias,
+};
+
+struct logfs_block *__alloc_block(struct super_block *sb,
+ u64 ino, u64 bix, level_t level)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_block *block;
+
+ block = mempool_alloc(super->s_block_pool, GFP_NOFS);
+ memset(block, 0, sizeof(*block));
+ INIT_LIST_HEAD(&block->alias_list);
+ INIT_LIST_HEAD(&block->item_list);
+ block->sb = sb;
+ block->ino = ino;
+ block->bix = bix;
+ block->level = level;
+ return block;
+}
+
+static void alloc_inode_block(struct inode *inode)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ struct logfs_block *block;
+
+ if (li->li_block)
+ return;
+
+ block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
+ block->inode = inode;
+ li->li_block = block;
+ block->ops = &inode_block_ops;
+}
+
+void initialize_block_counters(struct page *page, struct logfs_block *block,
+ __be64 *array, int page_is_empty)
+{
+ u64 ptr;
+ int i, start;
+
+ block->partial = 0;
+ block->full = 0;
+ start = 0;
+ if (page->index < first_indirect_block()) {
+ /* Counters are pointless on level 0 */
+ return;
+ }
+ if (page->index == first_indirect_block()) {
+ /* Skip unused pointers */
+ start = I0_BLOCKS;
+ block->full = I0_BLOCKS;
+ }
+ if (!page_is_empty) {
+ for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
+ ptr = be64_to_cpu(array[i]);
+ if (ptr)
+ block->partial++;
+ if (ptr & LOGFS_FULLY_POPULATED)
+ block->full++;
+ }
+ }
+}
+
+static void alloc_data_block(struct inode *inode, struct page *page)
+{
+ struct logfs_block *block;
+ u64 bix;
+ level_t level;
+
+ if (PagePrivate(page))
+ return;
+
+ logfs_unpack_index(page->index, &bix, &level);
+ block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
+ block->page = page;
+ SetPagePrivate(page);
+ page->private = (unsigned long)block;
+ block->ops = &indirect_block_ops;
+}
+
+static void alloc_indirect_block(struct inode *inode, struct page *page,
+ int page_is_empty)
+{
+ struct logfs_block *block;
+ __be64 *array;
+
+ if (PagePrivate(page))
+ return;
+
+ alloc_data_block(inode, page);
+
+ block = logfs_block(page);
+ array = kmap_atomic(page, KM_USER0);
+ initialize_block_counters(page, block, array, page_is_empty);
+ kunmap_atomic(array, KM_USER0);
+}
+
+static void block_set_pointer(struct page *page, int index, u64 ptr)
+{
+ struct logfs_block *block = logfs_block(page);
+ __be64 *array;
+ u64 oldptr;
+
+ BUG_ON(!block);
+ array = kmap_atomic(page, KM_USER0);
+ oldptr = be64_to_cpu(array[index]);
+ array[index] = cpu_to_be64(ptr);
+ kunmap_atomic(array, KM_USER0);
+ SetPageUptodate(page);
+
+ block->full += !!(ptr & LOGFS_FULLY_POPULATED)
+ - !!(oldptr & LOGFS_FULLY_POPULATED);
+ block->partial += !!ptr - !!oldptr;
+}
+
+static u64 block_get_pointer(struct page *page, int index)
+{
+ __be64 *block;
+ u64 ptr;
+
+ block = kmap_atomic(page, KM_USER0);
+ ptr = be64_to_cpu(block[index]);
+ kunmap_atomic(block, KM_USER0);
+ return ptr;
+}
+
+static int logfs_read_empty(struct page *page)
+{
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ return 0;
+}
+
+static int logfs_read_direct(struct inode *inode, struct page *page)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ pgoff_t index = page->index;
+ u64 block;
+
+ block = li->li_data[index];
+ if (!block)
+ return logfs_read_empty(page);
+
+ return logfs_segment_read(inode, page, block, index, 0);
+}
+
+static int logfs_read_loop(struct inode *inode, struct page *page,
+ int rw_context)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ u64 bix, bofs = li->li_data[INDIRECT_INDEX];
+ level_t level, target_level;
+ int ret;
+ struct page *ipage;
+
+ logfs_unpack_index(page->index, &bix, &target_level);
+ if (!bofs)
+ return logfs_read_empty(page);
+
+ if (bix >= maxbix(li->li_height))
+ return logfs_read_empty(page);
+
+ for (level = LEVEL(li->li_height);
+ (__force u8)level > (__force u8)target_level;
+ level = SUBLEVEL(level)){
+ ipage = logfs_get_page(inode, bix, level, rw_context);
+ if (!ipage)
+ return -ENOMEM;
+
+ ret = logfs_segment_read(inode, ipage, bofs, bix, level);
+ if (ret) {
+ logfs_put_read_page(ipage);
+ return ret;
+ }
+
+ bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
+ logfs_put_page(ipage, rw_context);
+ if (!bofs)
+ return logfs_read_empty(page);
+ }
+
+ return logfs_segment_read(inode, page, bofs, bix, 0);
+}
+
+static int logfs_read_block(struct inode *inode, struct page *page,
+ int rw_context)
+{
+ pgoff_t index = page->index;
+
+ if (index < I0_BLOCKS)
+ return logfs_read_direct(inode, page);
+ return logfs_read_loop(inode, page, rw_context);
+}
+
+static int logfs_exist_loop(struct inode *inode, u64 bix)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ u64 bofs = li->li_data[INDIRECT_INDEX];
+ level_t level;
+ int ret;
+ struct page *ipage;
+
+ if (!bofs)
+ return 0;
+ if (bix >= maxbix(li->li_height))
+ return 0;
+
+ for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
+ ipage = logfs_get_read_page(inode, bix, level);
+ if (!ipage)
+ return -ENOMEM;
+
+ ret = logfs_segment_read(inode, ipage, bofs, bix, level);
+ if (ret) {
+ logfs_put_read_page(ipage);
+ return ret;
+ }
+
+ bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
+ logfs_put_read_page(ipage);
+ if (!bofs)
+ return 0;
+ }
+
+ return 1;
+}
+
+int logfs_exist_block(struct inode *inode, u64 bix)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+
+ if (bix < I0_BLOCKS)
+ return !!li->li_data[bix];
+ return logfs_exist_loop(inode, bix);
+}
+
+static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+
+ for (; bix < I0_BLOCKS; bix++)
+ if (data ^ (li->li_data[bix] == 0))
+ return bix;
+ return I0_BLOCKS;
+}
+
+static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ __be64 *rblock;
+ u64 increment, bofs = li->li_data[INDIRECT_INDEX];
+ level_t level;
+ int ret, slot;
+ struct page *page;
+
+ BUG_ON(!bofs);
+
+ for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
+ increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
+ page = logfs_get_read_page(inode, bix, level);
+ if (!page)
+ return bix;
+
+ ret = logfs_segment_read(inode, page, bofs, bix, level);
+ if (ret) {
+ logfs_put_read_page(page);
+ return bix;
+ }
+
+ slot = get_bits(bix, SUBLEVEL(level));
+ rblock = kmap_atomic(page, KM_USER0);
+ while (slot < LOGFS_BLOCK_FACTOR) {
+ if (data && (rblock[slot] != 0))
+ break;
+ if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
+ break;
+ slot++;
+ bix += increment;
+ bix &= ~(increment - 1);
+ }
+ if (slot >= LOGFS_BLOCK_FACTOR) {
+ kunmap_atomic(rblock, KM_USER0);
+ logfs_put_read_page(page);
+ return bix;
+ }
+ bofs = be64_to_cpu(rblock[slot]);
+ kunmap_atomic(rblock, KM_USER0);
+ logfs_put_read_page(page);
+ if (!bofs) {
+ BUG_ON(data);
+ return bix;
+ }
+ }
+ return bix;
+}
+
+/**
+ * logfs_seek_hole - find next hole starting at a given block index
+ * @inode: inode to search in
+ * @bix: block index to start searching
+ *
+ * Returns next hole. If the file doesn't contain any further holes, the
+ * block address next to eof is returned instead.
+ */
+u64 logfs_seek_hole(struct inode *inode, u64 bix)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+
+ if (bix < I0_BLOCKS) {
+ bix = seek_holedata_direct(inode, bix, 0);
+ if (bix < I0_BLOCKS)
+ return bix;
+ }
+
+ if (!li->li_data[INDIRECT_INDEX])
+ return bix;
+ else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
+ bix = maxbix(li->li_height);
+ else {
+ bix = seek_holedata_loop(inode, bix, 0);
+ if (bix < maxbix(li->li_height))
+ return bix;
+ /* Should not happen anymore. But if some port writes semi-
+ * corrupt images (as this one used to) we might run into it.
+ */
+ WARN_ON_ONCE(bix == maxbix(li->li_height));
+ }
+
+ return bix;
+}
+
+static u64 __logfs_seek_data(struct inode *inode, u64 bix)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+
+ if (bix < I0_BLOCKS) {
+ bix = seek_holedata_direct(inode, bix, 1);
+ if (bix < I0_BLOCKS)
+ return bix;
+ }
+
+ if (bix < maxbix(li->li_height)) {
+ if (!li->li_data[INDIRECT_INDEX])
+ bix = maxbix(li->li_height);
+ else
+ return seek_holedata_loop(inode, bix, 1);
+ }
+
+ return bix;
+}
+
+/**
+ * logfs_seek_data - find next data block after a given block index
+ * @inode: inode to search in
+ * @bix: block index to start searching
+ *
+ * Returns next data block. If the file doesn't contain any further data
+ * blocks, the last block in the file is returned instead.
+ */
+u64 logfs_seek_data(struct inode *inode, u64 bix)
+{
+ struct super_block *sb = inode->i_sb;
+ u64 ret, end;
+
+ ret = __logfs_seek_data(inode, bix);
+ end = i_size_read(inode) >> sb->s_blocksize_bits;
+ if (ret >= end)
+ ret = max(bix, end);
+ return ret;
+}
+
+static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
+{
+ return pure_ofs(li->li_data[bix]) == ofs;
+}
+
+static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
+ u64 ofs, u64 bofs)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ level_t level;
+ int ret;
+ struct page *page;
+
+ for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){
+ page = logfs_get_write_page(inode, bix, level);
+ BUG_ON(!page);
+
+ ret = logfs_segment_read(inode, page, bofs, bix, level);
+ if (ret) {
+ logfs_put_write_page(page);
+ return 0;
+ }
+
+ bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
+ logfs_put_write_page(page);
+ if (!bofs)
+ return 0;
+
+ if (pure_ofs(bofs) == ofs)
+ return 1;
+ }
+ return 0;
+}
+
+static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ u64 bofs = li->li_data[INDIRECT_INDEX];
+
+ if (!bofs)
+ return 0;
+
+ if (bix >= maxbix(li->li_height))
+ return 0;
+
+ if (pure_ofs(bofs) == ofs)
+ return 1;
+
+ return __logfs_is_valid_loop(inode, bix, ofs, bofs);
+}
+
+static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+
+ if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
+ return 0;
+
+ if (bix < I0_BLOCKS)
+ return logfs_is_valid_direct(li, bix, ofs);
+ return logfs_is_valid_loop(inode, bix, ofs);
+}
+
+/**
+ * logfs_is_valid_block - check whether this block is still valid
+ *
+ * @sb - superblock
+ * @ofs - block physical offset
+ * @ino - block inode number
+ * @bix - block index
+ * @level - block level
+ *
+ * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
+ * become invalid once the journal is written.
+ */
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
+ gc_level_t gc_level)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct inode *inode;
+ int ret, cookie;
+
+ /* Umount closes a segment with free blocks remaining. Those
+ * blocks are by definition invalid. */
+ if (ino == -1)
+ return 0;
+
+ LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
+
+ inode = logfs_safe_iget(sb, ino, &cookie);
+ if (IS_ERR(inode))
+ goto invalid;
+
+ ret = __logfs_is_valid_block(inode, bix, ofs);
+ logfs_safe_iput(inode, cookie);
+ if (ret)
+ return ret;
+
+invalid:
+ /* Block is nominally invalid, but may still sit in the shadow tree,
+ * waiting for a journal commit.
+ */
+ if (btree_lookup64(&super->s_shadow_tree.old, ofs))
+ return 2;
+ return 0;
+}
+
+int logfs_readpage_nolock(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ int ret = -EIO;
+
+ ret = logfs_read_block(inode, page, READ);
+
+ if (ret) {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ } else {
+ SetPageUptodate(page);
+ ClearPageError(page);
+ }
+ flush_dcache_page(page);
+
+ return ret;
+}
+
+static int logfs_reserve_bytes(struct inode *inode, int bytes)
+{
+ struct logfs_super *super = logfs_super(inode->i_sb);
+ u64 available = super->s_free_bytes + super->s_dirty_free_bytes
+ - super->s_dirty_used_bytes - super->s_dirty_pages;
+
+ if (!bytes)
+ return 0;
+
+ if (available < bytes)
+ return -ENOSPC;
+
+ if (available < bytes + super->s_root_reserve &&
+ !capable(CAP_SYS_RESOURCE))
+ return -ENOSPC;
+
+ return 0;
+}
+
+int get_page_reserve(struct inode *inode, struct page *page)
+{
+ struct logfs_super *super = logfs_super(inode->i_sb);
+ int ret;
+
+ if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+ return 0;
+
+ logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
+ ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
+ if (!ret) {
+ alloc_data_block(inode, page);
+ logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
+ super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
+ }
+ logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
+ return ret;
+}
+
+/*
+ * We are protected by write lock. Push victims up to superblock level
+ * and release transaction when appropriate.
+ */
+/* FIXME: This is currently called from the wrong spots. */
+static void logfs_handle_transaction(struct inode *inode,
+ struct logfs_transaction *ta)
+{
+ struct logfs_super *super = logfs_super(inode->i_sb);
+
+ if (!ta)
+ return;
+ logfs_inode(inode)->li_block->ta = NULL;
+
+ if (inode->i_ino != LOGFS_INO_MASTER) {
+ BUG(); /* FIXME: Yes, this needs more thought */
+ /* just remember the transaction until inode is written */
+ //BUG_ON(logfs_inode(inode)->li_transaction);
+ //logfs_inode(inode)->li_transaction = ta;
+ return;
+ }
+
+ switch (ta->state) {
+ case CREATE_1: /* fall through */
+ case UNLINK_1:
+ BUG_ON(super->s_victim_ino);
+ super->s_victim_ino = ta->ino;
+ break;
+ case CREATE_2: /* fall through */
+ case UNLINK_2:
+ BUG_ON(super->s_victim_ino != ta->ino);
+ super->s_victim_ino = 0;
+ /* transaction ends here - free it */
+ kfree(ta);
+ break;
+ case CROSS_RENAME_1:
+ BUG_ON(super->s_rename_dir);
+ BUG_ON(super->s_rename_pos);
+ super->s_rename_dir = ta->dir;
+ super->s_rename_pos = ta->pos;
+ break;
+ case CROSS_RENAME_2:
+ BUG_ON(super->s_rename_dir != ta->dir);
+ BUG_ON(super->s_rename_pos != ta->pos);
+ super->s_rename_dir = 0;
+ super->s_rename_pos = 0;
+ kfree(ta);
+ break;
+ case TARGET_RENAME_1:
+ BUG_ON(super->s_rename_dir);
+ BUG_ON(super->s_rename_pos);
+ BUG_ON(super->s_victim_ino);
+ super->s_rename_dir = ta->dir;
+ super->s_rename_pos = ta->pos;
+ super->s_victim_ino = ta->ino;
+ break;
+ case TARGET_RENAME_2:
+ BUG_ON(super->s_rename_dir != ta->dir);
+ BUG_ON(super->s_rename_pos != ta->pos);
+ BUG_ON(super->s_victim_ino != ta->ino);
+ super->s_rename_dir = 0;
+ super->s_rename_pos = 0;
+ break;
+ case TARGET_RENAME_3:
+ BUG_ON(super->s_rename_dir);
+ BUG_ON(super->s_rename_pos);
+ BUG_ON(super->s_victim_ino != ta->ino);
+ super->s_victim_ino = 0;
+ kfree(ta);
+ break;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Not strictly a reservation, but rather a check that we still have enough
+ * space to satisfy the write.
+ */
+static int logfs_reserve_blocks(struct inode *inode, int blocks)
+{
+ return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
+}
+
+struct write_control {
+ u64 ofs;
+ long flags;
+};
+
+static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
+ level_t level, u64 old_ofs)
+{
+ struct logfs_super *super = logfs_super(inode->i_sb);
+ struct logfs_shadow *shadow;
+
+ shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
+ memset(shadow, 0, sizeof(*shadow));
+ shadow->ino = inode->i_ino;
+ shadow->bix = bix;
+ shadow->gc_level = expand_level(inode->i_ino, level);
+ shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
+ return shadow;
+}
+
+static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
+{
+ struct logfs_super *super = logfs_super(inode->i_sb);
+
+ mempool_free(shadow, super->s_shadow_pool);
+}
+
+/**
+ * fill_shadow_tree - Propagate shadow tree changes due to a write
+ * @inode: Inode owning the page
+ * @page: Struct page that was written
+ * @shadow: Shadow for the current write
+ *
+ * Writes in logfs can result in two semi-valid objects. The old object
+ * is still valid as long as it can be reached by following pointers on
+ * the medium. Only when writes propagate all the way up to the journal
+ * has the new object safely replaced the old one.
+ *
+ * To handle this problem, a struct logfs_shadow is used to represent
+ * every single write. It is attached to the indirect block, which is
+ * marked dirty. When the indirect block is written, its shadows are
+ * handed up to the next indirect block (or inode). Untimately they
+ * will reach the master inode and be freed upon journal commit.
+ *
+ * This function handles a single step in the propagation. It adds the
+ * shadow for the current write to the tree, along with any shadows in
+ * the page's tree, in case it was an indirect block. If a page is
+ * written, the inode parameter is left NULL, if an inode is written,
+ * the page parameter is left NULL.
+ */
+static void fill_shadow_tree(struct inode *inode, struct page *page,
+ struct logfs_shadow *shadow)
+{
+ struct logfs_super *super = logfs_super(inode->i_sb);
+ struct logfs_block *block = logfs_block(page);
+ struct shadow_tree *tree = &super->s_shadow_tree;
+
+ if (PagePrivate(page)) {
+ if (block->alias_map)
+ super->s_no_object_aliases -= bitmap_weight(
+ block->alias_map, LOGFS_BLOCK_FACTOR);
+ logfs_handle_transaction(inode, block->ta);
+ block->ops->free_block(inode->i_sb, block);
+ }
+ if (shadow) {
+ if (shadow->old_ofs)
+ btree_insert64(&tree->old, shadow->old_ofs, shadow,
+ GFP_NOFS);
+ else
+ btree_insert64(&tree->new, shadow->new_ofs, shadow,
+ GFP_NOFS);
+
+ super->s_dirty_used_bytes += shadow->new_len;
+ super->s_dirty_free_bytes += shadow->old_len;
+ }
+}
+
+static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
+ long child_no)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
+ /* Aliases in the master inode are pointless. */
+ return;
+ }
+
+ if (!test_bit(child_no, block->alias_map)) {
+ set_bit(child_no, block->alias_map);
+ super->s_no_object_aliases++;
+ }
+ list_move_tail(&block->alias_list, &super->s_object_alias);
+}
+
+/*
+ * Object aliases can and often do change the size and occupied space of a
+ * file. So not only do we have to change the pointers, we also have to
+ * change inode->i_size and li->li_used_bytes. Which is done by setting
+ * another two object aliases for the inode itself.
+ */
+static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+
+ if (shadow->new_len == shadow->old_len)
+ return;
+
+ alloc_inode_block(inode);
+ li->li_used_bytes += shadow->new_len - shadow->old_len;
+ __logfs_set_blocks(inode);
+ logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
+ logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
+}
+
+static int logfs_write_i0(struct inode *inode, struct page *page,
+ struct write_control *wc)
+{
+ struct logfs_shadow *shadow;
+ u64 bix;
+ level_t level;
+ int full, err = 0;
+
+ logfs_unpack_index(page->index, &bix, &level);
+ if (wc->ofs == 0)
+ if (logfs_reserve_blocks(inode, 1))
+ return -ENOSPC;
+
+ shadow = alloc_shadow(inode, bix, level, wc->ofs);
+ if (wc->flags & WF_WRITE)
+ err = logfs_segment_write(inode, page, shadow);
+ if (wc->flags & WF_DELETE)
+ logfs_segment_delete(inode, shadow);
+ if (err) {
+ free_shadow(inode, shadow);
+ return err;
+ }
+
+ set_iused(inode, shadow);
+ full = 1;
+ if (level != 0) {
+ alloc_indirect_block(inode, page, 0);
+ full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
+ }
+ fill_shadow_tree(inode, page, shadow);
+ wc->ofs = shadow->new_ofs;
+ if (wc->ofs && full)
+ wc->ofs |= LOGFS_FULLY_POPULATED;
+ return 0;
+}
+
+static int logfs_write_direct(struct inode *inode, struct page *page,
+ long flags)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ struct write_control wc = {
+ .ofs = li->li_data[page->index],
+ .flags = flags,
+ };
+ int err;
+
+ alloc_inode_block(inode);
+
+ err = logfs_write_i0(inode, page, &wc);
+ if (err)
+ return err;
+
+ li->li_data[page->index] = wc.ofs;
+ logfs_set_alias(inode->i_sb, li->li_block,
+ page->index + INODE_POINTER_OFS);
+ return 0;
+}
+
+static int ptr_change(u64 ofs, struct page *page)
+{
+ struct logfs_block *block = logfs_block(page);
+ int empty0, empty1, full0, full1;
+
+ empty0 = ofs == 0;
+ empty1 = block->partial == 0;
+ if (empty0 != empty1)
+ return 1;
+
+ /* The !! is necessary to shrink result to int */
+ full0 = !!(ofs & LOGFS_FULLY_POPULATED);
+ full1 = block->full == LOGFS_BLOCK_FACTOR;
+ if (full0 != full1)
+ return 1;
+ return 0;
+}
+
+static int __logfs_write_rec(struct inode *inode, struct page *page,
+ struct write_control *this_wc,
+ pgoff_t bix, level_t target_level, level_t level)
+{
+ int ret, page_empty = 0;
+ int child_no = get_bits(bix, SUBLEVEL(level));
+ struct page *ipage;
+ struct write_control child_wc = {
+ .flags = this_wc->flags,
+ };
+
+ ipage = logfs_get_write_page(inode, bix, level);
+ if (!ipage)
+ return -ENOMEM;
+
+ if (this_wc->ofs) {
+ ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
+ if (ret)
+ goto out;
+ } else if (!PageUptodate(ipage)) {
+ page_empty = 1;
+ logfs_read_empty(ipage);
+ }
+
+ child_wc.ofs = block_get_pointer(ipage, child_no);
+
+ if ((__force u8)level-1 > (__force u8)target_level)
+ ret = __logfs_write_rec(inode, page, &child_wc, bix,
+ target_level, SUBLEVEL(level));
+ else
+ ret = logfs_write_i0(inode, page, &child_wc);
+
+ if (ret)
+ goto out;
+
+ alloc_indirect_block(inode, ipage, page_empty);
+ block_set_pointer(ipage, child_no, child_wc.ofs);
+ /* FIXME: first condition seems superfluous */
+ if (child_wc.ofs || logfs_block(ipage)->partial)
+ this_wc->flags |= WF_WRITE;
+ /* the condition on this_wc->ofs ensures that we won't consume extra
+ * space for indirect blocks in the future, which we cannot reserve */
+ if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
+ ret = logfs_write_i0(inode, ipage, this_wc);
+ else
+ logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
+out:
+ logfs_put_write_page(ipage);
+ return ret;
+}
+
+static int logfs_write_rec(struct inode *inode, struct page *page,
+ pgoff_t bix, level_t target_level, long flags)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ struct write_control wc = {
+ .ofs = li->li_data[INDIRECT_INDEX],
+ .flags = flags,
+ };
+ int ret;
+
+ alloc_inode_block(inode);
+
+ if (li->li_height > (__force u8)target_level)
+ ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
+ LEVEL(li->li_height));
+ else
+ ret = logfs_write_i0(inode, page, &wc);
+ if (ret)
+ return ret;
+
+ if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
+ li->li_data[INDIRECT_INDEX] = wc.ofs;
+ logfs_set_alias(inode->i_sb, li->li_block,
+ INDIRECT_INDEX + INODE_POINTER_OFS);
+ }
+ return ret;
+}
+
+void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+ alloc_inode_block(inode);
+ logfs_inode(inode)->li_block->ta = ta;
+}
+
+void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+ struct logfs_block *block = logfs_inode(inode)->li_block;
+
+ if (block && block->ta)
+ block->ta = NULL;
+}
+
+static int grow_inode(struct inode *inode, u64 bix, level_t level)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ u8 height = (__force u8)level;
+ struct page *page;
+ struct write_control wc = {
+ .flags = WF_WRITE,
+ };
+ int err;
+
+ BUG_ON(height > 5 || li->li_height > 5);
+ while (height > li->li_height || bix >= maxbix(li->li_height)) {
+ page = logfs_get_write_page(inode, I0_BLOCKS + 1,
+ LEVEL(li->li_height + 1));
+ if (!page)
+ return -ENOMEM;
+ logfs_read_empty(page);
+ alloc_indirect_block(inode, page, 1);
+ block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
+ err = logfs_write_i0(inode, page, &wc);
+ logfs_put_write_page(page);
+ if (err)
+ return err;
+ li->li_data[INDIRECT_INDEX] = wc.ofs;
+ wc.ofs = 0;
+ li->li_height++;
+ logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
+ }
+ return 0;
+}
+
+static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
+{
+ struct logfs_super *super = logfs_super(inode->i_sb);
+ pgoff_t index = page->index;
+ u64 bix;
+ level_t level;
+ int err;
+
+ flags |= WF_WRITE | WF_DELETE;
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+ logfs_unpack_index(index, &bix, &level);
+ if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+ super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
+
+ if (index < I0_BLOCKS)
+ return logfs_write_direct(inode, page, flags);
+
+ bix = adjust_bix(bix, level);
+ err = grow_inode(inode, bix, level);
+ if (err)
+ return err;
+ return logfs_write_rec(inode, page, bix, level, flags);
+}
+
+int logfs_write_buf(struct inode *inode, struct page *page, long flags)
+{
+ struct super_block *sb = inode->i_sb;
+ int ret;
+
+ logfs_get_wblocks(sb, page, flags & WF_LOCK);
+ ret = __logfs_write_buf(inode, page, flags);
+ logfs_put_wblocks(sb, page, flags & WF_LOCK);
+ return ret;
+}
+
+static int __logfs_delete(struct inode *inode, struct page *page)
+{
+ long flags = WF_DELETE;
+
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+ if (page->index < I0_BLOCKS)
+ return logfs_write_direct(inode, page, flags);
+ return logfs_write_rec(inode, page, page->index, 0, flags);
+}
+
+int logfs_delete(struct inode *inode, pgoff_t index,
+ struct shadow_tree *shadow_tree)
+{
+ struct super_block *sb = inode->i_sb;
+ struct page *page;
+ int ret;
+
+ page = logfs_get_read_page(inode, index, 0);
+ if (!page)
+ return -ENOMEM;
+
+ logfs_get_wblocks(sb, page, 1);
+ ret = __logfs_delete(inode, page);
+ logfs_put_wblocks(sb, page, 1);
+
+ logfs_put_read_page(page);
+
+ return ret;
+}
+
+/* Rewrite cannot mark the inode dirty but has to write it immediatly. */
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
+ gc_level_t gc_level, long flags)
+{
+ level_t level = shrink_level(gc_level);
+ struct page *page;
+ int err;
+
+ page = logfs_get_write_page(inode, bix, level);
+ if (!page)
+ return -ENOMEM;
+
+ err = logfs_segment_read(inode, page, ofs, bix, level);
+ if (!err) {
+ if (level != 0)
+ alloc_indirect_block(inode, page, 0);
+ err = logfs_write_buf(inode, page, flags);
+ }
+ logfs_put_write_page(page);
+ return err;
+}
+
+static int truncate_data_block(struct inode *inode, struct page *page,
+ u64 ofs, struct logfs_shadow *shadow, u64 size)
+{
+ loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
+ u64 bix;
+ level_t level;
+ int err;
+
+ /* Does truncation happen within this page? */
+ if (size <= pageofs || size - pageofs >= PAGE_SIZE)
+ return 0;
+
+ logfs_unpack_index(page->index, &bix, &level);
+ BUG_ON(level != 0);
+
+ err = logfs_segment_read(inode, page, ofs, bix, level);
+ if (err)
+ return err;
+
+ zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
+ return logfs_segment_write(inode, page, shadow);
+}
+
+static int logfs_truncate_i0(struct inode *inode, struct page *page,
+ struct write_control *wc, u64 size)
+{
+ struct logfs_shadow *shadow;
+ u64 bix;
+ level_t level;
+ int err = 0;
+
+ logfs_unpack_index(page->index, &bix, &level);
+ BUG_ON(level != 0);
+ shadow = alloc_shadow(inode, bix, level, wc->ofs);
+
+ err = truncate_data_block(inode, page, wc->ofs, shadow, size);
+ if (err) {
+ free_shadow(inode, shadow);
+ return err;
+ }
+
+ logfs_segment_delete(inode, shadow);
+ set_iused(inode, shadow);
+ fill_shadow_tree(inode, page, shadow);
+ wc->ofs = shadow->new_ofs;
+ return 0;
+}
+
+static int logfs_truncate_direct(struct inode *inode, u64 size)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ struct write_control wc;
+ struct page *page;
+ int e;
+ int err;
+
+ alloc_inode_block(inode);
+
+ for (e = I0_BLOCKS - 1; e >= 0; e--) {
+ if (size > (e+1) * LOGFS_BLOCKSIZE)
+ break;
+
+ wc.ofs = li->li_data[e];
+ if (!wc.ofs)
+ continue;
+
+ page = logfs_get_write_page(inode, e, 0);
+ if (!page)
+ return -ENOMEM;
+ err = logfs_segment_read(inode, page, wc.ofs, e, 0);
+ if (err) {
+ logfs_put_write_page(page);
+ return err;
+ }
+ err = logfs_truncate_i0(inode, page, &wc, size);
+ logfs_put_write_page(page);
+ if (err)
+ return err;
+
+ li->li_data[e] = wc.ofs;
+ }
+ return 0;
+}
+
+/* FIXME: these need to become per-sb once we support different blocksizes */
+static u64 __logfs_step[] = {
+ 1,
+ I1_BLOCKS,
+ I2_BLOCKS,
+ I3_BLOCKS,
+};
+
+static u64 __logfs_start_index[] = {
+ I0_BLOCKS,
+ I1_BLOCKS,
+ I2_BLOCKS,
+ I3_BLOCKS
+};
+
+static inline u64 logfs_step(level_t level)
+{
+ return __logfs_step[(__force u8)level];
+}
+
+static inline u64 logfs_factor(u8 level)
+{
+ return __logfs_step[level] * LOGFS_BLOCKSIZE;
+}
+
+static inline u64 logfs_start_index(level_t level)
+{
+ return __logfs_start_index[(__force u8)level];
+}
+
+static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
+{
+ logfs_unpack_index(index, bix, level);
+ if (*bix <= logfs_start_index(SUBLEVEL(*level)))
+ *bix = 0;
+}
+
+static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
+ struct write_control *this_wc, u64 size)
+{
+ int truncate_happened = 0;
+ int e, err = 0;
+ u64 bix, child_bix, next_bix;
+ level_t level;
+ struct page *page;
+ struct write_control child_wc = { /* FIXME: flags */ };
+
+ logfs_unpack_raw_index(ipage->index, &bix, &level);
+ err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
+ if (err)
+ return err;
+
+ for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
+ child_bix = bix + e * logfs_step(SUBLEVEL(level));
+ next_bix = child_bix + logfs_step(SUBLEVEL(level));
+ if (size > next_bix * LOGFS_BLOCKSIZE)
+ break;
+
+ child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
+ if (!child_wc.ofs)
+ continue;
+
+ page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
+ if (!page)
+ return -ENOMEM;
+
+ if ((__force u8)level > 1)
+ err = __logfs_truncate_rec(inode, page, &child_wc, size);
+ else
+ err = logfs_truncate_i0(inode, page, &child_wc, size);
+ logfs_put_write_page(page);
+ if (err)
+ return err;
+
+ truncate_happened = 1;
+ alloc_indirect_block(inode, ipage, 0);
+ block_set_pointer(ipage, e, child_wc.ofs);
+ }
+
+ if (!truncate_happened) {
+ printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
+ return 0;
+ }
+
+ this_wc->flags = WF_DELETE;
+ if (logfs_block(ipage)->partial)
+ this_wc->flags |= WF_WRITE;
+
+ return logfs_write_i0(inode, ipage, this_wc);
+}
+
+static int logfs_truncate_rec(struct inode *inode, u64 size)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ struct write_control wc = {
+ .ofs = li->li_data[INDIRECT_INDEX],
+ };
+ struct page *page;
+ int err;
+
+ alloc_inode_block(inode);
+
+ if (!wc.ofs)
+ return 0;
+
+ page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
+ if (!page)
+ return -ENOMEM;
+
+ err = __logfs_truncate_rec(inode, page, &wc, size);
+ logfs_put_write_page(page);
+ if (err)
+ return err;
+
+ if (li->li_data[INDIRECT_INDEX] != wc.ofs)
+ li->li_data[INDIRECT_INDEX] = wc.ofs;
+ return 0;
+}
+
+static int __logfs_truncate(struct inode *inode, u64 size)
+{
+ int ret;
+
+ if (size >= logfs_factor(logfs_inode(inode)->li_height))
+ return 0;
+
+ ret = logfs_truncate_rec(inode, size);
+ if (ret)
+ return ret;
+
+ return logfs_truncate_direct(inode, size);
+}
+
+int logfs_truncate(struct inode *inode, u64 size)
+{
+ struct super_block *sb = inode->i_sb;
+ int err;
+
+ logfs_get_wblocks(sb, NULL, 1);
+ err = __logfs_truncate(inode, size);
+ if (!err)
+ err = __logfs_write_inode(inode, 0);
+ logfs_put_wblocks(sb, NULL, 1);
+
+ if (!err)
+ err = vmtruncate(inode, size);
+
+ /* I don't trust error recovery yet. */
+ WARN_ON(err);
+ return err;
+}
+
+static void move_page_to_inode(struct inode *inode, struct page *page)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ struct logfs_block *block = logfs_block(page);
+
+ if (!block)
+ return;
+
+ log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
+ block->ino, block->bix, block->level);
+ BUG_ON(li->li_block);
+ block->ops = &inode_block_ops;
+ block->inode = inode;
+ li->li_block = block;
+
+ block->page = NULL;
+ page->private = 0;
+ ClearPagePrivate(page);
+}
+
+static void move_inode_to_page(struct page *page, struct inode *inode)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+ struct logfs_block *block = li->li_block;
+
+ if (!block)
+ return;
+
+ log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
+ block->ino, block->bix, block->level);
+ BUG_ON(PagePrivate(page));
+ block->ops = &indirect_block_ops;
+ block->page = page;
+ page->private = (unsigned long)block;
+ SetPagePrivate(page);
+
+ block->inode = NULL;
+ li->li_block = NULL;
+}
+
+int logfs_read_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct logfs_super *super = logfs_super(sb);
+ struct inode *master_inode = super->s_master_inode;
+ struct page *page;
+ struct logfs_disk_inode *di;
+ u64 ino = inode->i_ino;
+
+ if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
+ return -ENODATA;
+ if (!logfs_exist_block(master_inode, ino))
+ return -ENODATA;
+
+ page = read_cache_page(master_inode->i_mapping, ino,
+ (filler_t *)logfs_readpage, NULL);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ di = kmap_atomic(page, KM_USER0);
+ logfs_disk_to_inode(di, inode);
+ kunmap_atomic(di, KM_USER0);
+ move_page_to_inode(inode, page);
+ page_cache_release(page);
+ return 0;
+}
+
+/* Caller must logfs_put_write_page(page); */
+static struct page *inode_to_page(struct inode *inode)
+{
+ struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
+ struct logfs_disk_inode *di;
+ struct page *page;
+
+ BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
+
+ page = logfs_get_write_page(master_inode, inode->i_ino, 0);
+ if (!page)
+ return NULL;
+
+ di = kmap_atomic(page, KM_USER0);
+ logfs_inode_to_disk(inode, di);
+ kunmap_atomic(di, KM_USER0);
+ move_inode_to_page(page, inode);
+ return page;
+}
+
+/* Cheaper version of write_inode. All changes are concealed in
+ * aliases, which are moved back. No write to the medium happens.
+ */
+void logfs_clear_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct logfs_inode *li = logfs_inode(inode);
+ struct logfs_block *block = li->li_block;
+ struct page *page;
+
+ /* Only deleted files may be dirty at this point */
+ BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
+ if (!block)
+ return;
+ if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
+ block->ops->free_block(inode->i_sb, block);
+ return;
+ }
+
+ BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
+ page = inode_to_page(inode);
+ BUG_ON(!page); /* FIXME: Use emergency page */
+ logfs_put_write_page(page);
+}
+
+static int do_write_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct inode *master_inode = logfs_super(sb)->s_master_inode;
+ loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
+ struct page *page;
+ int err;
+
+ BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
+ /* FIXME: lock inode */
+
+ if (i_size_read(master_inode) < size)
+ i_size_write(master_inode, size);
+
+ /* TODO: Tell vfs this inode is clean now */
+
+ page = inode_to_page(inode);
+ if (!page)
+ return -ENOMEM;
+
+ /* FIXME: transaction is part of logfs_block now. Is that enough? */
+ err = logfs_write_buf(master_inode, page, 0);
+ logfs_put_write_page(page);
+ return err;
+}
+
+static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
+ int write,
+ void (*change_se)(struct logfs_segment_entry *, long),
+ long arg)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct inode *inode;
+ struct page *page;
+ struct logfs_segment_entry *se;
+ pgoff_t page_no;
+ int child_no;
+
+ page_no = segno >> (sb->s_blocksize_bits - 3);
+ child_no = segno & ((sb->s_blocksize >> 3) - 1);
+
+ inode = super->s_segfile_inode;
+ page = logfs_get_write_page(inode, page_no, 0);
+ BUG_ON(!page); /* FIXME: We need some reserve page for this case */
+ if (!PageUptodate(page))
+ logfs_read_block(inode, page, WRITE);
+
+ if (write)
+ alloc_indirect_block(inode, page, 0);
+ se = kmap_atomic(page, KM_USER0);
+ change_se(se + child_no, arg);
+ if (write) {
+ logfs_set_alias(sb, logfs_block(page), child_no);
+ BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
+ }
+ kunmap_atomic(se, KM_USER0);
+
+ logfs_put_write_page(page);
+}
+
+static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
+{
+ struct logfs_segment_entry *target = (void *)_target;
+
+ *target = *se;
+}
+
+void logfs_get_segment_entry(struct super_block *sb, u32 segno,
+ struct logfs_segment_entry *se)
+{
+ logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
+}
+
+static void __set_segment_used(struct logfs_segment_entry *se, long increment)
+{
+ u32 valid;
+
+ valid = be32_to_cpu(se->valid);
+ valid += increment;
+ se->valid = cpu_to_be32(valid);
+}
+
+void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
+{
+ struct logfs_super *super = logfs_super(sb);
+ u32 segno = ofs >> super->s_segshift;
+
+ if (!increment)
+ return;
+
+ logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
+}
+
+static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
+{
+ se->ec_level = cpu_to_be32(ec_level);
+}
+
+void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
+ gc_level_t gc_level)
+{
+ u32 ec_level = ec << 4 | (__force u8)gc_level;
+
+ logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
+}
+
+static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
+{
+ se->valid = cpu_to_be32(RESERVED);
+}
+
+void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
+{
+ logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
+}
+
+static void __set_segment_unreserved(struct logfs_segment_entry *se,
+ long ec_level)
+{
+ se->valid = 0;
+ se->ec_level = cpu_to_be32(ec_level);
+}
+
+void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
+{
+ u32 ec_level = ec << 4;
+
+ logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
+ ec_level);
+}
+
+int __logfs_write_inode(struct inode *inode, long flags)
+{
+ struct super_block *sb = inode->i_sb;
+ int ret;
+
+ logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
+ ret = do_write_inode(inode);
+ logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
+ return ret;
+}
+
+static int do_delete_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct inode *master_inode = logfs_super(sb)->s_master_inode;
+ struct page *page;
+ int ret;
+
+ page = logfs_get_write_page(master_inode, inode->i_ino, 0);
+ if (!page)
+ return -ENOMEM;
+
+ move_inode_to_page(page, inode);
+
+ logfs_get_wblocks(sb, page, 1);
+ ret = __logfs_delete(master_inode, page);
+ logfs_put_wblocks(sb, page, 1);
+
+ logfs_put_write_page(page);
+ return ret;
+}
+
+/*
+ * ZOMBIE inodes have already been deleted before and should remain dead,
+ * if it weren't for valid checking. No need to kill them again here.
+ */
+void logfs_delete_inode(struct inode *inode)
+{
+ struct logfs_inode *li = logfs_inode(inode);
+
+ if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
+ li->li_flags |= LOGFS_IF_ZOMBIE;
+ if (i_size_read(inode) > 0)
+ logfs_truncate(inode, 0);
+ do_delete_inode(inode);
+ }
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+}
+
+void btree_write_block(struct logfs_block *block)
+{
+ struct inode *inode;
+ struct page *page;
+ int err, cookie;
+
+ inode = logfs_safe_iget(block->sb, block->ino, &cookie);
+ page = logfs_get_write_page(inode, block->bix, block->level);
+
+ err = logfs_readpage_nolock(page);
+ BUG_ON(err);
+ BUG_ON(!PagePrivate(page));
+ BUG_ON(logfs_block(page) != block);
+ err = __logfs_write_buf(inode, page, 0);
+ BUG_ON(err);
+ BUG_ON(PagePrivate(page) || page->private);
+
+ logfs_put_write_page(page);
+ logfs_safe_iput(inode, cookie);
+}
+
+/**
+ * logfs_inode_write - write inode or dentry objects
+ *
+ * @inode: parent inode (ifile or directory)
+ * @buf: object to write (inode or dentry)
+ * @n: object size
+ * @_pos: object number (file position in blocks/objects)
+ * @flags: write flags
+ * @lock: 0 if write lock is already taken, 1 otherwise
+ * @shadow_tree: shadow below this inode
+ *
+ * FIXME: All caller of this put a 200-300 byte variable on the stack,
+ * only to call here and do a memcpy from that stack variable. A good
+ * example of wasted performance and stack space.
+ */
+int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
+ loff_t bix, long flags, struct shadow_tree *shadow_tree)
+{
+ loff_t pos = bix << inode->i_sb->s_blocksize_bits;
+ int err;
+ struct page *page;
+ void *pagebuf;
+
+ BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
+ BUG_ON(count > LOGFS_BLOCKSIZE);
+ page = logfs_get_write_page(inode, bix, 0);
+ if (!page)
+ return -ENOMEM;
+
+ pagebuf = kmap_atomic(page, KM_USER0);
+ memcpy(pagebuf, buf, count);
+ flush_dcache_page(page);
+ kunmap_atomic(pagebuf, KM_USER0);
+
+ if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
+ i_size_write(inode, pos + LOGFS_BLOCKSIZE);
+
+ err = logfs_write_buf(inode, page, flags);
+ logfs_put_write_page(page);
+ return err;
+}
+
+int logfs_open_segfile(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct inode *inode;
+
+ inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ super->s_segfile_inode = inode;
+ return 0;
+}
+
+int logfs_init_rw(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int min_fill = 3 * super->s_no_blocks;
+
+ INIT_LIST_HEAD(&super->s_object_alias);
+ mutex_init(&super->s_write_mutex);
+ super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
+ sizeof(struct logfs_block));
+ super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
+ sizeof(struct logfs_shadow));
+ return 0;
+}
+
+void logfs_cleanup_rw(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ destroy_meta_inode(super->s_segfile_inode);
+ if (super->s_block_pool)
+ mempool_destroy(super->s_block_pool);
+ if (super->s_shadow_pool)
+ mempool_destroy(super->s_shadow_pool);
+}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
new file mode 100644
index 00000000000..1a14f9910d5
--- /dev/null
+++ b/fs/logfs/segment.c
@@ -0,0 +1,927 @@
+/*
+ * fs/logfs/segment.c - Handling the Object Store
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Object store or ostore makes up the complete device with exception of
+ * the superblock and journal areas. Apart from its own metadata it stores
+ * three kinds of objects: inodes, dentries and blocks, both data and indirect.
+ */
+#include "logfs.h"
+
+static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct btree_head32 *head = &super->s_reserved_segments;
+ int err;
+
+ err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
+ if (err)
+ return err;
+ logfs_super(sb)->s_bad_segments++;
+ /* FIXME: write to journal */
+ return 0;
+}
+
+int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ super->s_gec++;
+
+ return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
+ super->s_segsize, ensure_erase);
+}
+
+static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
+{
+ s32 ofs;
+
+ logfs_open_area(area, bytes);
+
+ ofs = area->a_used_bytes;
+ area->a_used_bytes += bytes;
+ BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
+
+ return dev_ofs(area->a_sb, area->a_segno, ofs);
+}
+
+static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
+ int use_filler)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping = super->s_mapping_inode->i_mapping;
+ filler_t *filler = super->s_devops->readpage;
+ struct page *page;
+
+ BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+ if (use_filler)
+ page = read_cache_page(mapping, index, filler, sb);
+ else {
+ page = find_or_create_page(mapping, index, GFP_NOFS);
+ unlock_page(page);
+ }
+ return page;
+}
+
+void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+ int use_filler)
+{
+ pgoff_t index = ofs >> PAGE_SHIFT;
+ struct page *page;
+ long offset = ofs & (PAGE_SIZE-1);
+ long copylen;
+
+ /* Only logfs_wbuf_recover may use len==0 */
+ BUG_ON(!len && !use_filler);
+ do {
+ copylen = min((ulong)len, PAGE_SIZE - offset);
+
+ page = get_mapping_page(area->a_sb, index, use_filler);
+ SetPageUptodate(page);
+ BUG_ON(!page); /* FIXME: reserve a pool */
+ memcpy(page_address(page) + offset, buf, copylen);
+ SetPagePrivate(page);
+ page_cache_release(page);
+
+ buf += copylen;
+ len -= copylen;
+ offset = 0;
+ index++;
+ } while (len);
+}
+
+/*
+ * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
+ */
+static void pad_wbuf(struct logfs_area *area, int final)
+{
+ struct super_block *sb = area->a_sb;
+ struct logfs_super *super = logfs_super(sb);
+ struct page *page;
+ u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+ pgoff_t index = ofs >> PAGE_SHIFT;
+ long offset = ofs & (PAGE_SIZE-1);
+ u32 len = PAGE_SIZE - offset;
+
+ if (len == PAGE_SIZE) {
+ /* The math in this function can surely use some love */
+ len = 0;
+ }
+ if (len) {
+ BUG_ON(area->a_used_bytes >= super->s_segsize);
+
+ page = get_mapping_page(area->a_sb, index, 0);
+ BUG_ON(!page); /* FIXME: reserve a pool */
+ memset(page_address(page) + offset, 0xff, len);
+ SetPagePrivate(page);
+ page_cache_release(page);
+ }
+
+ if (!final)
+ return;
+
+ area->a_used_bytes += len;
+ for ( ; area->a_used_bytes < super->s_segsize;
+ area->a_used_bytes += PAGE_SIZE) {
+ /* Memset another page */
+ index++;
+ page = get_mapping_page(area->a_sb, index, 0);
+ BUG_ON(!page); /* FIXME: reserve a pool */
+ memset(page_address(page), 0xff, PAGE_SIZE);
+ SetPagePrivate(page);
+ page_cache_release(page);
+ }
+}
+
+/*
+ * We have to be careful with the alias tree. Since lookup is done by bix,
+ * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
+ * indirect blocks. So always use it through accessor functions.
+ */
+static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
+ level_t level)
+{
+ struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
+ pgoff_t index = logfs_pack_index(bix, level);
+
+ return btree_lookup128(head, ino, index);
+}
+
+static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
+ level_t level, void *val)
+{
+ struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
+ pgoff_t index = logfs_pack_index(bix, level);
+
+ return btree_insert128(head, ino, index, val, GFP_NOFS);
+}
+
+static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
+ write_alias_t *write_one_alias)
+{
+ struct object_alias_item *item;
+ int err;
+
+ list_for_each_entry(item, &block->item_list, list) {
+ err = write_alias_journal(sb, block->ino, block->bix,
+ block->level, item->child_no, item->val);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static gc_level_t btree_block_level(struct logfs_block *block)
+{
+ return expand_level(block->ino, block->level);
+}
+
+static struct logfs_block_ops btree_block_ops = {
+ .write_block = btree_write_block,
+ .block_level = btree_block_level,
+ .free_block = __free_block,
+ .write_alias = btree_write_alias,
+};
+
+int logfs_load_object_aliases(struct super_block *sb,
+ struct logfs_obj_alias *oa, int count)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_block *block;
+ struct object_alias_item *item;
+ u64 ino, bix;
+ level_t level;
+ int i, err;
+
+ super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
+ count /= sizeof(*oa);
+ for (i = 0; i < count; i++) {
+ item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
+ if (!item)
+ return -ENOMEM;
+ memset(item, 0, sizeof(*item));
+
+ super->s_no_object_aliases++;
+ item->val = oa[i].val;
+ item->child_no = be16_to_cpu(oa[i].child_no);
+
+ ino = be64_to_cpu(oa[i].ino);
+ bix = be64_to_cpu(oa[i].bix);
+ level = LEVEL(oa[i].level);
+
+ log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
+ ino, bix, level, item->child_no,
+ be64_to_cpu(item->val));
+ block = alias_tree_lookup(sb, ino, bix, level);
+ if (!block) {
+ block = __alloc_block(sb, ino, bix, level);
+ block->ops = &btree_block_ops;
+ err = alias_tree_insert(sb, ino, bix, level, block);
+ BUG_ON(err); /* mempool empty */
+ }
+ if (test_and_set_bit(item->child_no, block->alias_map)) {
+ printk(KERN_ERR"LogFS: Alias collision detected\n");
+ return -EIO;
+ }
+ list_move_tail(&block->alias_list, &super->s_object_alias);
+ list_add(&item->list, &block->item_list);
+ }
+ return 0;
+}
+
+static void kill_alias(void *_block, unsigned long ignore0,
+ u64 ignore1, u64 ignore2, size_t ignore3)
+{
+ struct logfs_block *block = _block;
+ struct super_block *sb = block->sb;
+ struct logfs_super *super = logfs_super(sb);
+ struct object_alias_item *item;
+
+ while (!list_empty(&block->item_list)) {
+ item = list_entry(block->item_list.next, typeof(*item), list);
+ list_del(&item->list);
+ mempool_free(item, super->s_alias_pool);
+ }
+ block->ops->free_block(sb, block);
+}
+
+static int obj_type(struct inode *inode, level_t level)
+{
+ if (level == 0) {
+ if (S_ISDIR(inode->i_mode))
+ return OBJ_DENTRY;
+ if (inode->i_ino == LOGFS_INO_MASTER)
+ return OBJ_INODE;
+ }
+ return OBJ_BLOCK;
+}
+
+static int obj_len(struct super_block *sb, int obj_type)
+{
+ switch (obj_type) {
+ case OBJ_DENTRY:
+ return sizeof(struct logfs_disk_dentry);
+ case OBJ_INODE:
+ return sizeof(struct logfs_disk_inode);
+ case OBJ_BLOCK:
+ return sb->s_blocksize;
+ default:
+ BUG();
+ }
+}
+
+static int __logfs_segment_write(struct inode *inode, void *buf,
+ struct logfs_shadow *shadow, int type, int len, int compr)
+{
+ struct logfs_area *area;
+ struct super_block *sb = inode->i_sb;
+ s64 ofs;
+ struct logfs_object_header h;
+ int acc_len;
+
+ if (shadow->gc_level == 0)
+ acc_len = len;
+ else
+ acc_len = obj_len(sb, type);
+
+ area = get_area(sb, shadow->gc_level);
+ ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
+ LOGFS_BUG_ON(ofs <= 0, sb);
+ /*
+ * Order is important. logfs_get_free_bytes(), by modifying the
+ * segment file, may modify the content of the very page we're about
+ * to write now. Which is fine, as long as the calculated crc and
+ * written data still match. So do the modifications _before_
+ * calculating the crc.
+ */
+
+ h.len = cpu_to_be16(len);
+ h.type = type;
+ h.compr = compr;
+ h.ino = cpu_to_be64(inode->i_ino);
+ h.bix = cpu_to_be64(shadow->bix);
+ h.crc = logfs_crc32(&h, sizeof(h) - 4, 4);
+ h.data_crc = logfs_crc32(buf, len, 0);
+
+ logfs_buf_write(area, ofs, &h, sizeof(h));
+ logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
+
+ shadow->new_ofs = ofs;
+ shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
+
+ return 0;
+}
+
+static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
+ struct logfs_shadow *shadow, int type, int len)
+{
+ struct super_block *sb = inode->i_sb;
+ void *compressor_buf = logfs_super(sb)->s_compressed_je;
+ ssize_t compr_len;
+ int ret;
+
+ mutex_lock(&logfs_super(sb)->s_journal_mutex);
+ compr_len = logfs_compress(buf, compressor_buf, len, len);
+
+ if (compr_len >= 0) {
+ ret = __logfs_segment_write(inode, compressor_buf, shadow,
+ type, compr_len, COMPR_ZLIB);
+ } else {
+ ret = __logfs_segment_write(inode, buf, shadow, type, len,
+ COMPR_NONE);
+ }
+ mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+ return ret;
+}
+
+/**
+ * logfs_segment_write - write data block to object store
+ * @inode: inode containing data
+ *
+ * Returns an errno or zero.
+ */
+int logfs_segment_write(struct inode *inode, struct page *page,
+ struct logfs_shadow *shadow)
+{
+ struct super_block *sb = inode->i_sb;
+ struct logfs_super *super = logfs_super(sb);
+ int do_compress, type, len;
+ int ret;
+ void *buf;
+
+ super->s_flags |= LOGFS_SB_FLAG_DIRTY;
+ BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
+ do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
+ if (shadow->gc_level != 0) {
+ /* temporarily disable compression for indirect blocks */
+ do_compress = 0;
+ }
+
+ type = obj_type(inode, shrink_level(shadow->gc_level));
+ len = obj_len(sb, type);
+ buf = kmap(page);
+ if (do_compress)
+ ret = logfs_segment_write_compress(inode, buf, shadow, type,
+ len);
+ else
+ ret = __logfs_segment_write(inode, buf, shadow, type, len,
+ COMPR_NONE);
+ kunmap(page);
+
+ log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
+ shadow->ino, shadow->bix, shadow->gc_level,
+ shadow->old_ofs, shadow->new_ofs,
+ shadow->old_len, shadow->new_len);
+ /* this BUG_ON did catch a locking bug. useful */
+ BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
+ return ret;
+}
+
+int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
+{
+ pgoff_t index = ofs >> PAGE_SHIFT;
+ struct page *page;
+ long offset = ofs & (PAGE_SIZE-1);
+ long copylen;
+
+ while (len) {
+ copylen = min((ulong)len, PAGE_SIZE - offset);
+
+ page = get_mapping_page(sb, index, 1);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ memcpy(buf, page_address(page) + offset, copylen);
+ page_cache_release(page);
+
+ buf += copylen;
+ len -= copylen;
+ offset = 0;
+ index++;
+ }
+ return 0;
+}
+
+/*
+ * The "position" of indirect blocks is ambiguous. It can be the position
+ * of any data block somewhere behind this indirect block. So we need to
+ * normalize the positions through logfs_block_mask() before comparing.
+ */
+static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
+{
+ return (pos1 & logfs_block_mask(sb, level)) !=
+ (pos2 & logfs_block_mask(sb, level));
+}
+
+#if 0
+static int read_seg_header(struct super_block *sb, u64 ofs,
+ struct logfs_segment_header *sh)
+{
+ __be32 crc;
+ int err;
+
+ err = wbuf_read(sb, ofs, sizeof(*sh), sh);
+ if (err)
+ return err;
+ crc = logfs_crc32(sh, sizeof(*sh), 4);
+ if (crc != sh->crc) {
+ printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
+ "got %x\n", ofs, be32_to_cpu(sh->crc),
+ be32_to_cpu(crc));
+ return -EIO;
+ }
+ return 0;
+}
+#endif
+
+static int read_obj_header(struct super_block *sb, u64 ofs,
+ struct logfs_object_header *oh)
+{
+ __be32 crc;
+ int err;
+
+ err = wbuf_read(sb, ofs, sizeof(*oh), oh);
+ if (err)
+ return err;
+ crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
+ if (crc != oh->crc) {
+ printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
+ "got %x\n", ofs, be32_to_cpu(oh->crc),
+ be32_to_cpu(crc));
+ return -EIO;
+ }
+ return 0;
+}
+
+static void move_btree_to_page(struct inode *inode, struct page *page,
+ __be64 *data)
+{
+ struct super_block *sb = inode->i_sb;
+ struct logfs_super *super = logfs_super(sb);
+ struct btree_head128 *head = &super->s_object_alias_tree;
+ struct logfs_block *block;
+ struct object_alias_item *item, *next;
+
+ if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
+ return;
+
+ block = btree_remove128(head, inode->i_ino, page->index);
+ if (!block)
+ return;
+
+ log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
+ block->ino, block->bix, block->level);
+ list_for_each_entry_safe(item, next, &block->item_list, list) {
+ data[item->child_no] = item->val;
+ list_del(&item->list);
+ mempool_free(item, super->s_alias_pool);
+ }
+ block->page = page;
+ SetPagePrivate(page);
+ page->private = (unsigned long)block;
+ block->ops = &indirect_block_ops;
+ initialize_block_counters(page, block, data, 0);
+}
+
+/*
+ * This silences a false, yet annoying gcc warning. I hate it when my editor
+ * jumps into bitops.h each time I recompile this file.
+ * TODO: Complain to gcc folks about this and upgrade compiler.
+ */
+static unsigned long fnb(const unsigned long *addr,
+ unsigned long size, unsigned long offset)
+{
+ return find_next_bit(addr, size, offset);
+}
+
+void move_page_to_btree(struct page *page)
+{
+ struct logfs_block *block = logfs_block(page);
+ struct super_block *sb = block->sb;
+ struct logfs_super *super = logfs_super(sb);
+ struct object_alias_item *item;
+ unsigned long pos;
+ __be64 *child;
+ int err;
+
+ if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
+ block->ops->free_block(sb, block);
+ return;
+ }
+ log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
+ block->ino, block->bix, block->level);
+ super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
+
+ for (pos = 0; ; pos++) {
+ pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+ if (pos >= LOGFS_BLOCK_FACTOR)
+ break;
+
+ item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
+ BUG_ON(!item); /* mempool empty */
+ memset(item, 0, sizeof(*item));
+
+ child = kmap_atomic(page, KM_USER0);
+ item->val = child[pos];
+ kunmap_atomic(child, KM_USER0);
+ item->child_no = pos;
+ list_add(&item->list, &block->item_list);
+ }
+ block->page = NULL;
+ ClearPagePrivate(page);
+ page->private = 0;
+ block->ops = &btree_block_ops;
+ err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
+ block);
+ BUG_ON(err); /* mempool empty */
+ ClearPageUptodate(page);
+}
+
+static int __logfs_segment_read(struct inode *inode, void *buf,
+ u64 ofs, u64 bix, level_t level)
+{
+ struct super_block *sb = inode->i_sb;
+ void *compressor_buf = logfs_super(sb)->s_compressed_je;
+ struct logfs_object_header oh;
+ __be32 crc;
+ u16 len;
+ int err, block_len;
+
+ block_len = obj_len(sb, obj_type(inode, level));
+ err = read_obj_header(sb, ofs, &oh);
+ if (err)
+ goto out_err;
+
+ err = -EIO;
+ if (be64_to_cpu(oh.ino) != inode->i_ino
+ || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
+ printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
+ "expected (%lx, %llx), got (%llx, %llx)\n",
+ ofs, inode->i_ino, bix,
+ be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
+ goto out_err;
+ }
+
+ len = be16_to_cpu(oh.len);
+
+ switch (oh.compr) {
+ case COMPR_NONE:
+ err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
+ if (err)
+ goto out_err;
+ crc = logfs_crc32(buf, len, 0);
+ if (crc != oh.data_crc) {
+ printk(KERN_ERR"LOGFS: uncompressed data crc error at "
+ "%llx: expected %x, got %x\n", ofs,
+ be32_to_cpu(oh.data_crc),
+ be32_to_cpu(crc));
+ goto out_err;
+ }
+ break;
+ case COMPR_ZLIB:
+ mutex_lock(&logfs_super(sb)->s_journal_mutex);
+ err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
+ compressor_buf);
+ if (err) {
+ mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+ goto out_err;
+ }
+ crc = logfs_crc32(compressor_buf, len, 0);
+ if (crc != oh.data_crc) {
+ printk(KERN_ERR"LOGFS: compressed data crc error at "
+ "%llx: expected %x, got %x\n", ofs,
+ be32_to_cpu(oh.data_crc),
+ be32_to_cpu(crc));
+ mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+ goto out_err;
+ }
+ err = logfs_uncompress(compressor_buf, buf, len, block_len);
+ mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+ if (err) {
+ printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
+ goto out_err;
+ }
+ break;
+ default:
+ LOGFS_BUG(sb);
+ err = -EIO;
+ goto out_err;
+ }
+ return 0;
+
+out_err:
+ logfs_set_ro(sb);
+ printk(KERN_ERR"LOGFS: device is read-only now\n");
+ LOGFS_BUG(sb);
+ return err;
+}
+
+/**
+ * logfs_segment_read - read data block from object store
+ * @inode: inode containing data
+ * @buf: data buffer
+ * @ofs: physical data offset
+ * @bix: block index
+ * @level: block level
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int logfs_segment_read(struct inode *inode, struct page *page,
+ u64 ofs, u64 bix, level_t level)
+{
+ int err;
+ void *buf;
+
+ if (PageUptodate(page))
+ return 0;
+
+ ofs &= ~LOGFS_FULLY_POPULATED;
+
+ buf = kmap(page);
+ err = __logfs_segment_read(inode, buf, ofs, bix, level);
+ if (!err) {
+ move_btree_to_page(inode, page, buf);
+ SetPageUptodate(page);
+ }
+ kunmap(page);
+ log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
+ inode->i_ino, bix, level, ofs, err);
+ return err;
+}
+
+int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
+{
+ struct super_block *sb = inode->i_sb;
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_object_header h;
+ u16 len;
+ int err;
+
+ super->s_flags |= LOGFS_SB_FLAG_DIRTY;
+ BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
+ BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
+ if (!shadow->old_ofs)
+ return 0;
+
+ log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
+ shadow->ino, shadow->bix, shadow->gc_level,
+ shadow->old_ofs, shadow->new_ofs,
+ shadow->old_len, shadow->new_len);
+ err = read_obj_header(sb, shadow->old_ofs, &h);
+ LOGFS_BUG_ON(err, sb);
+ LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
+ LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
+ shrink_level(shadow->gc_level)), sb);
+
+ if (shadow->gc_level == 0)
+ len = be16_to_cpu(h.len);
+ else
+ len = obj_len(sb, h.type);
+ shadow->old_len = len + sizeof(h);
+ return 0;
+}
+
+static void freeseg(struct super_block *sb, u32 segno)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping = super->s_mapping_inode->i_mapping;
+ struct page *page;
+ u64 ofs, start, end;
+
+ start = dev_ofs(sb, segno, 0);
+ end = dev_ofs(sb, segno + 1, 0);
+ for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
+ page = find_get_page(mapping, ofs >> PAGE_SHIFT);
+ if (!page)
+ continue;
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ }
+}
+
+int logfs_open_area(struct logfs_area *area, size_t bytes)
+{
+ struct super_block *sb = area->a_sb;
+ struct logfs_super *super = logfs_super(sb);
+ int err, closed = 0;
+
+ if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
+ return 0;
+
+ if (area->a_is_open) {
+ u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+ u32 len = super->s_segsize - area->a_written_bytes;
+
+ log_gc("logfs_close_area(%x)\n", area->a_segno);
+ pad_wbuf(area, 1);
+ super->s_devops->writeseg(area->a_sb, ofs, len);
+ freeseg(sb, area->a_segno);
+ closed = 1;
+ }
+
+ area->a_used_bytes = 0;
+ area->a_written_bytes = 0;
+again:
+ area->a_ops->get_free_segment(area);
+ area->a_ops->get_erase_count(area);
+
+ log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
+ err = area->a_ops->erase_segment(area);
+ if (err) {
+ printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
+ area->a_segno);
+ logfs_mark_segment_bad(sb, area->a_segno);
+ goto again;
+ }
+ area->a_is_open = 1;
+ return closed;
+}
+
+void logfs_sync_area(struct logfs_area *area)
+{
+ struct super_block *sb = area->a_sb;
+ struct logfs_super *super = logfs_super(sb);
+ u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+ u32 len = (area->a_used_bytes - area->a_written_bytes);
+
+ if (super->s_writesize)
+ len &= ~(super->s_writesize - 1);
+ if (len == 0)
+ return;
+ pad_wbuf(area, 0);
+ super->s_devops->writeseg(sb, ofs, len);
+ area->a_written_bytes += len;
+}
+
+void logfs_sync_segments(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i;
+
+ for_each_area(i)
+ logfs_sync_area(super->s_area[i]);
+}
+
+/*
+ * Pick a free segment to be used for this area. Effectively takes a
+ * candidate from the free list (not really a candidate anymore).
+ */
+static void ostore_get_free_segment(struct logfs_area *area)
+{
+ struct super_block *sb = area->a_sb;
+ struct logfs_super *super = logfs_super(sb);
+
+ if (super->s_free_list.count == 0) {
+ printk(KERN_ERR"LOGFS: ran out of free segments\n");
+ LOGFS_BUG(sb);
+ }
+
+ area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
+}
+
+static void ostore_get_erase_count(struct logfs_area *area)
+{
+ struct logfs_segment_entry se;
+ u32 ec_level;
+
+ logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
+ BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
+ se.valid == cpu_to_be32(RESERVED));
+
+ ec_level = be32_to_cpu(se.ec_level);
+ area->a_erase_count = (ec_level >> 4) + 1;
+}
+
+static int ostore_erase_segment(struct logfs_area *area)
+{
+ struct super_block *sb = area->a_sb;
+ struct logfs_segment_header sh;
+ u64 ofs;
+ int err;
+
+ err = logfs_erase_segment(sb, area->a_segno, 0);
+ if (err)
+ return err;
+
+ sh.pad = 0;
+ sh.type = SEG_OSTORE;
+ sh.level = (__force u8)area->a_level;
+ sh.segno = cpu_to_be32(area->a_segno);
+ sh.ec = cpu_to_be32(area->a_erase_count);
+ sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
+ sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
+
+ logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
+ area->a_level);
+
+ ofs = dev_ofs(sb, area->a_segno, 0);
+ area->a_used_bytes = sizeof(sh);
+ logfs_buf_write(area, ofs, &sh, sizeof(sh));
+ return 0;
+}
+
+static const struct logfs_area_ops ostore_area_ops = {
+ .get_free_segment = ostore_get_free_segment,
+ .get_erase_count = ostore_get_erase_count,
+ .erase_segment = ostore_erase_segment,
+};
+
+static void free_area(struct logfs_area *area)
+{
+ if (area)
+ freeseg(area->a_sb, area->a_segno);
+ kfree(area);
+}
+
+static struct logfs_area *alloc_area(struct super_block *sb)
+{
+ struct logfs_area *area;
+
+ area = kzalloc(sizeof(*area), GFP_KERNEL);
+ if (!area)
+ return NULL;
+
+ area->a_sb = sb;
+ return area;
+}
+
+static void map_invalidatepage(struct page *page, unsigned long l)
+{
+ BUG();
+}
+
+static int map_releasepage(struct page *page, gfp_t g)
+{
+ /* Don't release these pages */
+ return 0;
+}
+
+static const struct address_space_operations mapping_aops = {
+ .invalidatepage = map_invalidatepage,
+ .releasepage = map_releasepage,
+ .set_page_dirty = __set_page_dirty_nobuffers,
+};
+
+int logfs_init_mapping(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct address_space *mapping;
+ struct inode *inode;
+
+ inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ super->s_mapping_inode = inode;
+ mapping = inode->i_mapping;
+ mapping->a_ops = &mapping_aops;
+ /* Would it be possible to use __GFP_HIGHMEM as well? */
+ mapping_set_gfp_mask(mapping, GFP_NOFS);
+ return 0;
+}
+
+int logfs_init_areas(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i = -1;
+
+ super->s_alias_pool = mempool_create_kmalloc_pool(600,
+ sizeof(struct object_alias_item));
+ if (!super->s_alias_pool)
+ return -ENOMEM;
+
+ super->s_journal_area = alloc_area(sb);
+ if (!super->s_journal_area)
+ goto err;
+
+ for_each_area(i) {
+ super->s_area[i] = alloc_area(sb);
+ if (!super->s_area[i])
+ goto err;
+ super->s_area[i]->a_level = GC_LEVEL(i);
+ super->s_area[i]->a_ops = &ostore_area_ops;
+ }
+ btree_init_mempool128(&super->s_object_alias_tree,
+ super->s_btree_pool);
+ return 0;
+
+err:
+ for (i--; i >= 0; i--)
+ free_area(super->s_area[i]);
+ free_area(super->s_journal_area);
+ mempool_destroy(super->s_alias_pool);
+ return -ENOMEM;
+}
+
+void logfs_cleanup_areas(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i;
+
+ btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
+ for_each_area(i)
+ free_area(super->s_area[i]);
+ free_area(super->s_journal_area);
+ destroy_meta_inode(super->s_mapping_inode);
+}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
new file mode 100644
index 00000000000..c66beab78de
--- /dev/null
+++ b/fs/logfs/super.c
@@ -0,0 +1,650 @@
+/*
+ * fs/logfs/super.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Generally contains mount/umount code and also serves as a dump area for
+ * any functions that don't fit elsewhere and neither justify a file of their
+ * own.
+ */
+#include "logfs.h"
+#include <linux/bio.h>
+#include <linux/mtd/mtd.h>
+#include <linux/statfs.h>
+#include <linux/buffer_head.h>
+
+static DEFINE_MUTEX(emergency_mutex);
+static struct page *emergency_page;
+
+struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
+{
+ filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+ struct page *page;
+ int err;
+
+ page = read_cache_page(mapping, index, filler, NULL);
+ if (page)
+ return page;
+
+ /* No more pages available, switch to emergency page */
+ printk(KERN_INFO"Logfs: Using emergency page\n");
+ mutex_lock(&emergency_mutex);
+ err = filler(NULL, emergency_page);
+ if (err) {
+ mutex_unlock(&emergency_mutex);
+ printk(KERN_EMERG"Logfs: Error reading emergency page\n");
+ return ERR_PTR(err);
+ }
+ return emergency_page;
+}
+
+void emergency_read_end(struct page *page)
+{
+ if (page == emergency_page)
+ mutex_unlock(&emergency_mutex);
+ else
+ page_cache_release(page);
+}
+
+static void dump_segfile(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_segment_entry se;
+ u32 segno;
+
+ for (segno = 0; segno < super->s_no_segs; segno++) {
+ logfs_get_segment_entry(sb, segno, &se);
+ printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
+ be32_to_cpu(se.valid));
+ if (++segno < super->s_no_segs) {
+ logfs_get_segment_entry(sb, segno, &se);
+ printk(" %6x %8x", be32_to_cpu(se.ec_level),
+ be32_to_cpu(se.valid));
+ }
+ if (++segno < super->s_no_segs) {
+ logfs_get_segment_entry(sb, segno, &se);
+ printk(" %6x %8x", be32_to_cpu(se.ec_level),
+ be32_to_cpu(se.valid));
+ }
+ if (++segno < super->s_no_segs) {
+ logfs_get_segment_entry(sb, segno, &se);
+ printk(" %6x %8x", be32_to_cpu(se.ec_level),
+ be32_to_cpu(se.valid));
+ }
+ printk("\n");
+ }
+}
+
+/*
+ * logfs_crash_dump - dump debug information to device
+ *
+ * The LogFS superblock only occupies part of a segment. This function will
+ * write as much debug information as it can gather into the spare space.
+ */
+void logfs_crash_dump(struct super_block *sb)
+{
+ dump_segfile(sb);
+}
+
+/*
+ * TODO: move to lib/string.c
+ */
+/**
+ * memchr_inv - Find a character in an area of memory.
+ * @s: The memory area
+ * @c: The byte to search for
+ * @n: The size of the area.
+ *
+ * returns the address of the first character other than @c, or %NULL
+ * if the whole buffer contains just @c.
+ */
+void *memchr_inv(const void *s, int c, size_t n)
+{
+ const unsigned char *p = s;
+ while (n-- != 0)
+ if ((unsigned char)c != *p++)
+ return (void *)(p - 1);
+
+ return NULL;
+}
+
+/*
+ * FIXME: There should be a reserve for root, similar to ext2.
+ */
+int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct logfs_super *super = logfs_super(sb);
+
+ stats->f_type = LOGFS_MAGIC_U32;
+ stats->f_bsize = sb->s_blocksize;
+ stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3;
+ stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits;
+ stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits;
+ stats->f_files = 0;
+ stats->f_ffree = 0;
+ stats->f_namelen = LOGFS_MAX_NAMELEN;
+ return 0;
+}
+
+static int logfs_sb_set(struct super_block *sb, void *_super)
+{
+ struct logfs_super *super = _super;
+
+ sb->s_fs_info = super;
+ sb->s_mtd = super->s_mtd;
+ sb->s_bdev = super->s_bdev;
+ return 0;
+}
+
+static int logfs_sb_test(struct super_block *sb, void *_super)
+{
+ struct logfs_super *super = _super;
+ struct mtd_info *mtd = super->s_mtd;
+
+ if (mtd && sb->s_mtd == mtd)
+ return 1;
+ if (super->s_bdev && sb->s_bdev == super->s_bdev)
+ return 1;
+ return 0;
+}
+
+static void set_segment_header(struct logfs_segment_header *sh, u8 type,
+ u8 level, u32 segno, u32 ec)
+{
+ sh->pad = 0;
+ sh->type = type;
+ sh->level = level;
+ sh->segno = cpu_to_be32(segno);
+ sh->ec = cpu_to_be32(ec);
+ sh->gec = cpu_to_be64(segno);
+ sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
+}
+
+static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
+ u32 segno, u32 ec)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_segment_header *sh = &ds->ds_sh;
+ int i;
+
+ memset(ds, 0, sizeof(*ds));
+ set_segment_header(sh, SEG_SUPER, 0, segno, ec);
+
+ ds->ds_ifile_levels = super->s_ifile_levels;
+ ds->ds_iblock_levels = super->s_iblock_levels;
+ ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */
+ ds->ds_segment_shift = super->s_segshift;
+ ds->ds_block_shift = sb->s_blocksize_bits;
+ ds->ds_write_shift = super->s_writeshift;
+ ds->ds_filesystem_size = cpu_to_be64(super->s_size);
+ ds->ds_segment_size = cpu_to_be32(super->s_segsize);
+ ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve);
+ ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat);
+ ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat);
+ ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat);
+ ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags);
+ ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve);
+ ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve);
+ journal_for_each(i)
+ ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
+ ds->ds_magic = cpu_to_be64(LOGFS_MAGIC);
+ ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
+ LOGFS_SEGMENT_HEADERSIZE + 12);
+}
+
+static int write_one_sb(struct super_block *sb,
+ struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_disk_super *ds;
+ struct logfs_segment_entry se;
+ struct page *page;
+ u64 ofs;
+ u32 ec, segno;
+ int err;
+
+ page = find_sb(sb, &ofs);
+ if (!page)
+ return -EIO;
+ ds = page_address(page);
+ segno = seg_no(sb, ofs);
+ logfs_get_segment_entry(sb, segno, &se);
+ ec = be32_to_cpu(se.ec_level) >> 4;
+ ec++;
+ logfs_set_segment_erased(sb, segno, ec, 0);
+ logfs_write_ds(sb, ds, segno, ec);
+ err = super->s_devops->write_sb(sb, page);
+ page_cache_release(page);
+ return err;
+}
+
+int logfs_write_sb(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int err;
+
+ /* First superblock */
+ err = write_one_sb(sb, super->s_devops->find_first_sb);
+ if (err)
+ return err;
+
+ /* Last superblock */
+ err = write_one_sb(sb, super->s_devops->find_last_sb);
+ if (err)
+ return err;
+ return 0;
+}
+
+static int ds_cmp(const void *ds0, const void *ds1)
+{
+ size_t len = sizeof(struct logfs_disk_super);
+
+ /* We know the segment headers differ, so ignore them */
+ len -= LOGFS_SEGMENT_HEADERSIZE;
+ ds0 += LOGFS_SEGMENT_HEADERSIZE;
+ ds1 += LOGFS_SEGMENT_HEADERSIZE;
+ return memcmp(ds0, ds1, len);
+}
+
+static int logfs_recover_sb(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_disk_super _ds0, *ds0 = &_ds0;
+ struct logfs_disk_super _ds1, *ds1 = &_ds1;
+ int err, valid0, valid1;
+
+ /* read first superblock */
+ err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
+ if (err)
+ return err;
+ /* read last superblock */
+ err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
+ if (err)
+ return err;
+ valid0 = logfs_check_ds(ds0) == 0;
+ valid1 = logfs_check_ds(ds1) == 0;
+
+ if (!valid0 && valid1) {
+ printk(KERN_INFO"First superblock is invalid - fixing.\n");
+ return write_one_sb(sb, super->s_devops->find_first_sb);
+ }
+ if (valid0 && !valid1) {
+ printk(KERN_INFO"Last superblock is invalid - fixing.\n");
+ return write_one_sb(sb, super->s_devops->find_last_sb);
+ }
+ if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
+ printk(KERN_INFO"Superblocks don't match - fixing.\n");
+ return write_one_sb(sb, super->s_devops->find_last_sb);
+ }
+ /* If neither is valid now, something's wrong. Didn't we properly
+ * check them before?!? */
+ BUG_ON(!valid0 && !valid1);
+ return 0;
+}
+
+static int logfs_make_writeable(struct super_block *sb)
+{
+ int err;
+
+ /* Repair any broken superblock copies */
+ err = logfs_recover_sb(sb);
+ if (err)
+ return err;
+
+ /* Check areas for trailing unaccounted data */
+ err = logfs_check_areas(sb);
+ if (err)
+ return err;
+
+ err = logfs_open_segfile(sb);
+ if (err)
+ return err;
+
+ /* Do one GC pass before any data gets dirtied */
+ logfs_gc_pass(sb);
+
+ /* after all initializations are done, replay the journal
+ * for rw-mounts, if necessary */
+ err = logfs_replay_journal(sb);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct inode *rootdir;
+ int err;
+
+ /* root dir */
+ rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
+ if (IS_ERR(rootdir))
+ goto fail;
+
+ sb->s_root = d_alloc_root(rootdir);
+ if (!sb->s_root)
+ goto fail;
+
+ super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
+ if (!super->s_erase_page)
+ goto fail2;
+ memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
+
+ /* FIXME: check for read-only mounts */
+ err = logfs_make_writeable(sb);
+ if (err)
+ goto fail3;
+
+ log_super("LogFS: Finished mounting\n");
+ simple_set_mnt(mnt, sb);
+ return 0;
+
+fail3:
+ __free_page(super->s_erase_page);
+fail2:
+ iput(rootdir);
+fail:
+ iput(logfs_super(sb)->s_master_inode);
+ return -EIO;
+}
+
+int logfs_check_ds(struct logfs_disk_super *ds)
+{
+ struct logfs_segment_header *sh = &ds->ds_sh;
+
+ if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
+ return -EINVAL;
+ if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
+ return -EINVAL;
+ if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
+ LOGFS_SEGMENT_HEADERSIZE + 12))
+ return -EINVAL;
+ return 0;
+}
+
+static struct page *find_super_block(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct page *first, *last;
+
+ first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
+ if (!first || IS_ERR(first))
+ return NULL;
+ last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
+ if (!last || IS_ERR(first)) {
+ page_cache_release(first);
+ return NULL;
+ }
+
+ if (!logfs_check_ds(page_address(first))) {
+ page_cache_release(last);
+ return first;
+ }
+
+ /* First one didn't work, try the second superblock */
+ if (!logfs_check_ds(page_address(last))) {
+ page_cache_release(first);
+ return last;
+ }
+
+ /* Neither worked, sorry folks */
+ page_cache_release(first);
+ page_cache_release(last);
+ return NULL;
+}
+
+static int __logfs_read_sb(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct page *page;
+ struct logfs_disk_super *ds;
+ int i;
+
+ page = find_super_block(sb);
+ if (!page)
+ return -EIO;
+
+ ds = page_address(page);
+ super->s_size = be64_to_cpu(ds->ds_filesystem_size);
+ super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
+ super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
+ super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
+ super->s_segsize = 1 << ds->ds_segment_shift;
+ super->s_segmask = (1 << ds->ds_segment_shift) - 1;
+ super->s_segshift = ds->ds_segment_shift;
+ sb->s_blocksize = 1 << ds->ds_block_shift;
+ sb->s_blocksize_bits = ds->ds_block_shift;
+ super->s_writesize = 1 << ds->ds_write_shift;
+ super->s_writeshift = ds->ds_write_shift;
+ super->s_no_segs = super->s_size >> super->s_segshift;
+ super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
+ super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
+ super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
+ super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
+ super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);
+
+ journal_for_each(i)
+ super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);
+
+ super->s_ifile_levels = ds->ds_ifile_levels;
+ super->s_iblock_levels = ds->ds_iblock_levels;
+ super->s_data_levels = ds->ds_data_levels;
+ super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
+ + super->s_data_levels;
+ page_cache_release(page);
+ return 0;
+}
+
+static int logfs_read_sb(struct super_block *sb, int read_only)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int ret;
+
+ super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
+ if (!super->s_btree_pool)
+ return -ENOMEM;
+
+ btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
+ btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
+
+ ret = logfs_init_mapping(sb);
+ if (ret)
+ return ret;
+
+ ret = __logfs_read_sb(sb);
+ if (ret)
+ return ret;
+
+ if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
+ return -EIO;
+ if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
+ !read_only)
+ return -EIO;
+
+ mutex_init(&super->s_dirop_mutex);
+ mutex_init(&super->s_object_alias_mutex);
+ INIT_LIST_HEAD(&super->s_freeing_list);
+
+ ret = logfs_init_rw(sb);
+ if (ret)
+ return ret;
+
+ ret = logfs_init_areas(sb);
+ if (ret)
+ return ret;
+
+ ret = logfs_init_gc(sb);
+ if (ret)
+ return ret;
+
+ ret = logfs_init_journal(sb);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static void logfs_kill_sb(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+
+ log_super("LogFS: Start unmounting\n");
+ /* Alias entries slow down mount, so evict as many as possible */
+ sync_filesystem(sb);
+ logfs_write_anchor(sb);
+
+ /*
+ * From this point on alias entries are simply dropped - and any
+ * writes to the object store are considered bugs.
+ */
+ super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
+ log_super("LogFS: Now in shutdown\n");
+ generic_shutdown_super(sb);
+
+ BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
+
+ logfs_cleanup_gc(sb);
+ logfs_cleanup_journal(sb);
+ logfs_cleanup_areas(sb);
+ logfs_cleanup_rw(sb);
+ if (super->s_erase_page)
+ __free_page(super->s_erase_page);
+ super->s_devops->put_device(sb);
+ mempool_destroy(super->s_btree_pool);
+ mempool_destroy(super->s_alias_pool);
+ kfree(super);
+ log_super("LogFS: Finished unmounting\n");
+}
+
+int logfs_get_sb_device(struct file_system_type *type, int flags,
+ struct mtd_info *mtd, struct block_device *bdev,
+ const struct logfs_device_ops *devops, struct vfsmount *mnt)
+{
+ struct logfs_super *super;
+ struct super_block *sb;
+ int err = -ENOMEM;
+ static int mount_count;
+
+ log_super("LogFS: Start mount %x\n", mount_count++);
+ super = kzalloc(sizeof(*super), GFP_KERNEL);
+ if (!super)
+ goto err0;
+
+ super->s_mtd = mtd;
+ super->s_bdev = bdev;
+ err = -EINVAL;
+ sb = sget(type, logfs_sb_test, logfs_sb_set, super);
+ if (IS_ERR(sb))
+ goto err0;
+
+ if (sb->s_root) {
+ /* Device is already in use */
+ err = 0;
+ simple_set_mnt(mnt, sb);
+ goto err0;
+ }
+
+ super->s_devops = devops;
+
+ /*
+ * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
+ * only covers 16TB and the upper 8TB are used for indirect blocks.
+ * On 64bit system we could bump up the limit, but that would make
+ * the filesystem incompatible with 32bit systems.
+ */
+ sb->s_maxbytes = (1ull << 43) - 1;
+ sb->s_op = &logfs_super_operations;
+ sb->s_flags = flags | MS_NOATIME;
+
+ err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
+ if (err)
+ goto err1;
+
+ sb->s_flags |= MS_ACTIVE;
+ err = logfs_get_sb_final(sb, mnt);
+ if (err)
+ goto err1;
+ return 0;
+
+err1:
+ up_write(&sb->s_umount);
+ deactivate_super(sb);
+ return err;
+err0:
+ kfree(super);
+ //devops->put_device(sb);
+ return err;
+}
+
+static int logfs_get_sb(struct file_system_type *type, int flags,
+ const char *devname, void *data, struct vfsmount *mnt)
+{
+ ulong mtdnr;
+
+ if (!devname)
+ return logfs_get_sb_bdev(type, flags, devname, mnt);
+ if (strncmp(devname, "mtd", 3))
+ return logfs_get_sb_bdev(type, flags, devname, mnt);
+
+ {
+ char *garbage;
+ mtdnr = simple_strtoul(devname+3, &garbage, 0);
+ if (*garbage)
+ return -EINVAL;
+ }
+
+ return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+}
+
+static struct file_system_type logfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "logfs",
+ .get_sb = logfs_get_sb,
+ .kill_sb = logfs_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+
+};
+
+static int __init logfs_init(void)
+{
+ int ret;
+
+ emergency_page = alloc_pages(GFP_KERNEL, 0);
+ if (!emergency_page)
+ return -ENOMEM;
+
+ ret = logfs_compr_init();
+ if (ret)
+ goto out1;
+
+ ret = logfs_init_inode_cache();
+ if (ret)
+ goto out2;
+
+ return register_filesystem(&logfs_fs_type);
+out2:
+ logfs_compr_exit();
+out1:
+ __free_pages(emergency_page, 0);
+ return ret;
+}
+
+static void __exit logfs_exit(void)
+{
+ unregister_filesystem(&logfs_fs_type);
+ logfs_destroy_inode_cache();
+ logfs_compr_exit();
+ __free_pages(emergency_page, 0);
+}
+
+module_init(logfs_init);
+module_exit(logfs_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
+MODULE_DESCRIPTION("scalable flash filesystem");
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 74ea82d7216..756f8c93780 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -17,8 +17,10 @@
#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/vfs.h>
+#include <linux/writeback.h>
-static int minix_write_inode(struct inode * inode, int wait);
+static int minix_write_inode(struct inode *inode,
+ struct writeback_control *wbc);
static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
static int minix_remount (struct super_block * sb, int * flags, char * data);
@@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
return bh;
}
-static int minix_write_inode(struct inode *inode, int wait)
+static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err = 0;
struct buffer_head *bh;
@@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait)
bh = V2_minix_update_inode(inode);
if (!bh)
return -EIO;
- if (wait && buffer_dirty(bh)) {
+ if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
printk("IO error syncing minix inode [%s:%08lx]\n",
diff --git a/fs/mpage.c b/fs/mpage.c
index 42381bd6543..598d54e200e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -561,7 +561,7 @@ page_is_mapped:
if (page->index >= end_index) {
/*
* The page straddles i_size. It must be zeroed out on each
- * and every writepage invokation because it may be mmapped.
+ * and every writepage invocation because it may be mmapped.
* "A file is mapped in multiples of the page size. For a file
* that is not a multiple of the page size, the remaining memory
* is zeroed when mapped, and writes to that region are not
diff --git a/fs/namei.c b/fs/namei.c
index 0741c69b331..1c0fca6e899 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -19,7 +19,6 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
-#include <linux/quotaops.h>
#include <linux/pagemap.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
@@ -498,8 +497,6 @@ static int link_path_walk(const char *, struct nameidata *);
static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
{
- int res = 0;
- char *name;
if (IS_ERR(link))
goto fail;
@@ -510,22 +507,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
path_get(&nd->root);
}
- res = link_path_walk(link, nd);
- if (nd->depth || res || nd->last_type!=LAST_NORM)
- return res;
- /*
- * If it is an iterative symlinks resolution in open_namei() we
- * have to copy the last component. And all that crap because of
- * bloody create() on broken symlinks. Furrfu...
- */
- name = __getname();
- if (unlikely(!name)) {
- path_put(&nd->path);
- return -ENOMEM;
- }
- strcpy(name, nd->last.name);
- nd->last.name = name;
- return 0;
+ return link_path_walk(link, nd);
fail:
path_put(&nd->path);
return PTR_ERR(link);
@@ -547,10 +529,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
nd->path.dentry = path->dentry;
}
-static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
+static __always_inline int
+__do_follow_link(struct path *path, struct nameidata *nd, void **p)
{
int error;
- void *cookie;
struct dentry *dentry = path->dentry;
touch_atime(path->mnt, dentry);
@@ -562,9 +544,9 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
}
mntget(path->mnt);
nd->last_type = LAST_BIND;
- cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
- error = PTR_ERR(cookie);
- if (!IS_ERR(cookie)) {
+ *p = dentry->d_inode->i_op->follow_link(dentry, nd);
+ error = PTR_ERR(*p);
+ if (!IS_ERR(*p)) {
char *s = nd_get_link(nd);
error = 0;
if (s)
@@ -574,8 +556,6 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
if (error)
path_put(&nd->path);
}
- if (dentry->d_inode->i_op->put_link)
- dentry->d_inode->i_op->put_link(dentry, nd, cookie);
}
return error;
}
@@ -589,6 +569,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
*/
static inline int do_follow_link(struct path *path, struct nameidata *nd)
{
+ void *cookie;
int err = -ELOOP;
if (current->link_count >= MAX_NESTED_LINKS)
goto loop;
@@ -602,7 +583,9 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
current->link_count++;
current->total_link_count++;
nd->depth++;
- err = __do_follow_link(path, nd);
+ err = __do_follow_link(path, nd, &cookie);
+ if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
+ path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
path_put(path);
current->link_count--;
nd->depth--;
@@ -1375,22 +1358,6 @@ static inline int may_create(struct inode *dir, struct dentry *child)
return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}
-/*
- * O_DIRECTORY translates into forcing a directory lookup.
- */
-static inline int lookup_flags(unsigned int f)
-{
- unsigned long retval = LOOKUP_FOLLOW;
-
- if (f & O_NOFOLLOW)
- retval &= ~LOOKUP_FOLLOW;
-
- if (f & O_DIRECTORY)
- retval |= LOOKUP_DIRECTORY;
-
- return retval;
-}
-
/*
* p1 and p2 should be directories on the same fs.
*/
@@ -1448,7 +1415,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
- vfs_dq_init(dir);
error = dir->i_op->create(dir, dentry, mode, nd);
if (!error)
fsnotify_create(dir, dentry);
@@ -1590,129 +1556,132 @@ static int open_will_truncate(int flag, struct inode *inode)
return (flag & O_TRUNC);
}
-/*
- * Note that the low bits of the passed in "open_flag"
- * are not the same as in the local variable "flag". See
- * open_to_namei_flags() for more details.
- */
-struct file *do_filp_open(int dfd, const char *pathname,
- int open_flag, int mode, int acc_mode)
+static struct file *finish_open(struct nameidata *nd,
+ int open_flag, int acc_mode)
{
struct file *filp;
- struct nameidata nd;
- int error;
- struct path path;
- struct dentry *dir;
- int count = 0;
int will_truncate;
- int flag = open_to_namei_flags(open_flag);
- int force_reval = 0;
+ int error;
+ will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
+ if (will_truncate) {
+ error = mnt_want_write(nd->path.mnt);
+ if (error)
+ goto exit;
+ }
+ error = may_open(&nd->path, acc_mode, open_flag);
+ if (error) {
+ if (will_truncate)
+ mnt_drop_write(nd->path.mnt);
+ goto exit;
+ }
+ filp = nameidata_to_filp(nd);
+ if (!IS_ERR(filp)) {
+ error = ima_file_check(filp, acc_mode);
+ if (error) {
+ fput(filp);
+ filp = ERR_PTR(error);
+ }
+ }
+ if (!IS_ERR(filp)) {
+ if (will_truncate) {
+ error = handle_truncate(&nd->path);
+ if (error) {
+ fput(filp);
+ filp = ERR_PTR(error);
+ }
+ }
+ }
/*
- * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
- * check for O_DSYNC if the need any syncing at all we enforce it's
- * always set instead of having to deal with possibly weird behaviour
- * for malicious applications setting only __O_SYNC.
+ * It is now safe to drop the mnt write
+ * because the filp has had a write taken
+ * on its behalf.
*/
- if (open_flag & __O_SYNC)
- open_flag |= O_DSYNC;
-
- if (!acc_mode)
- acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+ if (will_truncate)
+ mnt_drop_write(nd->path.mnt);
+ return filp;
- /* O_TRUNC implies we need access checks for write permissions */
- if (flag & O_TRUNC)
- acc_mode |= MAY_WRITE;
+exit:
+ if (!IS_ERR(nd->intent.open.file))
+ release_open_intent(nd);
+ path_put(&nd->path);
+ return ERR_PTR(error);
+}
- /* Allow the LSM permission hook to distinguish append
- access from general write access. */
- if (flag & O_APPEND)
- acc_mode |= MAY_APPEND;
+static struct file *do_last(struct nameidata *nd, struct path *path,
+ int open_flag, int acc_mode,
+ int mode, const char *pathname,
+ int *want_dir)
+{
+ struct dentry *dir = nd->path.dentry;
+ struct file *filp;
+ int error = -EISDIR;
- /*
- * The simplest case - just a plain lookup.
- */
- if (!(flag & O_CREAT)) {
- filp = get_empty_filp();
-
- if (filp == NULL)
- return ERR_PTR(-ENFILE);
- nd.intent.open.file = filp;
- filp->f_flags = open_flag;
- nd.intent.open.flags = flag;
- nd.intent.open.create_mode = 0;
- error = do_path_lookup(dfd, pathname,
- lookup_flags(flag)|LOOKUP_OPEN, &nd);
- if (IS_ERR(nd.intent.open.file)) {
- if (error == 0) {
- error = PTR_ERR(nd.intent.open.file);
- path_put(&nd.path);
+ switch (nd->last_type) {
+ case LAST_DOTDOT:
+ follow_dotdot(nd);
+ dir = nd->path.dentry;
+ if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
+ if (!dir->d_op->d_revalidate(dir, nd)) {
+ error = -ESTALE;
+ goto exit;
}
- } else if (error)
- release_open_intent(&nd);
- if (error)
- return ERR_PTR(error);
+ }
+ /* fallthrough */
+ case LAST_DOT:
+ case LAST_ROOT:
+ if (open_flag & O_CREAT)
+ goto exit;
+ /* fallthrough */
+ case LAST_BIND:
+ audit_inode(pathname, dir);
goto ok;
}
- /*
- * Create - we need to know the parent.
- */
-reval:
- error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
- if (error)
- return ERR_PTR(error);
- if (force_reval)
- nd.flags |= LOOKUP_REVAL;
- error = path_walk(pathname, &nd);
- if (error) {
- if (nd.root.mnt)
- path_put(&nd.root);
- return ERR_PTR(error);
+ /* trailing slashes? */
+ if (nd->last.name[nd->last.len]) {
+ if (open_flag & O_CREAT)
+ goto exit;
+ *want_dir = 1;
}
- if (unlikely(!audit_dummy_context()))
- audit_inode(pathname, nd.path.dentry);
- /*
- * We have the parent and last component. First of all, check
- * that we are not asked to creat(2) an obvious directory - that
- * will not do.
- */
- error = -EISDIR;
- if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
- goto exit_parent;
+ /* just plain open? */
+ if (!(open_flag & O_CREAT)) {
+ error = do_lookup(nd, &nd->last, path);
+ if (error)
+ goto exit;
+ error = -ENOENT;
+ if (!path->dentry->d_inode)
+ goto exit_dput;
+ if (path->dentry->d_inode->i_op->follow_link)
+ return NULL;
+ error = -ENOTDIR;
+ if (*want_dir && !path->dentry->d_inode->i_op->lookup)
+ goto exit_dput;
+ path_to_nameidata(path, nd);
+ audit_inode(pathname, nd->path.dentry);
+ goto ok;
+ }
- error = -ENFILE;
- filp = get_empty_filp();
- if (filp == NULL)
- goto exit_parent;
- nd.intent.open.file = filp;
- filp->f_flags = open_flag;
- nd.intent.open.flags = flag;
- nd.intent.open.create_mode = mode;
- dir = nd.path.dentry;
- nd.flags &= ~LOOKUP_PARENT;
- nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
- if (flag & O_EXCL)
- nd.flags |= LOOKUP_EXCL;
+ /* OK, it's O_CREAT */
mutex_lock(&dir->d_inode->i_mutex);
- path.dentry = lookup_hash(&nd);
- path.mnt = nd.path.mnt;
-do_last:
- error = PTR_ERR(path.dentry);
- if (IS_ERR(path.dentry)) {
+ path->dentry = lookup_hash(nd);
+ path->mnt = nd->path.mnt;
+
+ error = PTR_ERR(path->dentry);
+ if (IS_ERR(path->dentry)) {
mutex_unlock(&dir->d_inode->i_mutex);
goto exit;
}
- if (IS_ERR(nd.intent.open.file)) {
- error = PTR_ERR(nd.intent.open.file);
+ if (IS_ERR(nd->intent.open.file)) {
+ error = PTR_ERR(nd->intent.open.file);
goto exit_mutex_unlock;
}
/* Negative dentry, just create the file */
- if (!path.dentry->d_inode) {
+ if (!path->dentry->d_inode) {
/*
* This write is needed to ensure that a
* ro->rw transition does not occur between
@@ -1720,18 +1689,16 @@ do_last:
* a permanent write count is taken through
* the 'struct file' in nameidata_to_filp().
*/
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(nd->path.mnt);
if (error)
goto exit_mutex_unlock;
- error = __open_namei_create(&nd, &path, open_flag, mode);
+ error = __open_namei_create(nd, path, open_flag, mode);
if (error) {
- mnt_drop_write(nd.path.mnt);
+ mnt_drop_write(nd->path.mnt);
goto exit;
}
- filp = nameidata_to_filp(&nd);
- mnt_drop_write(nd.path.mnt);
- if (nd.root.mnt)
- path_put(&nd.root);
+ filp = nameidata_to_filp(nd);
+ mnt_drop_write(nd->path.mnt);
if (!IS_ERR(filp)) {
error = ima_file_check(filp, acc_mode);
if (error) {
@@ -1746,150 +1713,181 @@ do_last:
* It already exists.
*/
mutex_unlock(&dir->d_inode->i_mutex);
- audit_inode(pathname, path.dentry);
+ audit_inode(pathname, path->dentry);
error = -EEXIST;
- if (flag & O_EXCL)
+ if (open_flag & O_EXCL)
goto exit_dput;
- if (__follow_mount(&path)) {
+ if (__follow_mount(path)) {
error = -ELOOP;
- if (flag & O_NOFOLLOW)
+ if (open_flag & O_NOFOLLOW)
goto exit_dput;
}
error = -ENOENT;
- if (!path.dentry->d_inode)
+ if (!path->dentry->d_inode)
goto exit_dput;
- if (path.dentry->d_inode->i_op->follow_link)
- goto do_link;
- path_to_nameidata(&path, &nd);
+ if (path->dentry->d_inode->i_op->follow_link)
+ return NULL;
+
+ path_to_nameidata(path, nd);
error = -EISDIR;
- if (S_ISDIR(path.dentry->d_inode->i_mode))
+ if (S_ISDIR(path->dentry->d_inode->i_mode))
goto exit;
ok:
+ filp = finish_open(nd, open_flag, acc_mode);
+ return filp;
+
+exit_mutex_unlock:
+ mutex_unlock(&dir->d_inode->i_mutex);
+exit_dput:
+ path_put_conditional(path, nd);
+exit:
+ if (!IS_ERR(nd->intent.open.file))
+ release_open_intent(nd);
+ path_put(&nd->path);
+ return ERR_PTR(error);
+}
+
+/*
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
+ */
+struct file *do_filp_open(int dfd, const char *pathname,
+ int open_flag, int mode, int acc_mode)
+{
+ struct file *filp;
+ struct nameidata nd;
+ int error;
+ struct path path;
+ int count = 0;
+ int flag = open_to_namei_flags(open_flag);
+ int force_reval = 0;
+ int want_dir = open_flag & O_DIRECTORY;
+
+ if (!(open_flag & O_CREAT))
+ mode = 0;
+
/*
- * Consider:
- * 1. may_open() truncates a file
- * 2. a rw->ro mount transition occurs
- * 3. nameidata_to_filp() fails due to
- * the ro mount.
- * That would be inconsistent, and should
- * be avoided. Taking this mnt write here
- * ensures that (2) can not occur.
+ * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
+ * check for O_DSYNC if the need any syncing at all we enforce it's
+ * always set instead of having to deal with possibly weird behaviour
+ * for malicious applications setting only __O_SYNC.
*/
- will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode);
- if (will_truncate) {
- error = mnt_want_write(nd.path.mnt);
- if (error)
- goto exit;
- }
- error = may_open(&nd.path, acc_mode, open_flag);
+ if (open_flag & __O_SYNC)
+ open_flag |= O_DSYNC;
+
+ if (!acc_mode)
+ acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+
+ /* O_TRUNC implies we need access checks for write permissions */
+ if (open_flag & O_TRUNC)
+ acc_mode |= MAY_WRITE;
+
+ /* Allow the LSM permission hook to distinguish append
+ access from general write access. */
+ if (open_flag & O_APPEND)
+ acc_mode |= MAY_APPEND;
+
+ /* find the parent */
+reval:
+ error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
+ if (error)
+ return ERR_PTR(error);
+ if (force_reval)
+ nd.flags |= LOOKUP_REVAL;
+
+ current->total_link_count = 0;
+ error = link_path_walk(pathname, &nd);
if (error) {
- if (will_truncate)
- mnt_drop_write(nd.path.mnt);
- goto exit;
- }
- filp = nameidata_to_filp(&nd);
- if (!IS_ERR(filp)) {
- error = ima_file_check(filp, acc_mode);
- if (error) {
- fput(filp);
- filp = ERR_PTR(error);
- }
+ filp = ERR_PTR(error);
+ goto out;
}
- if (!IS_ERR(filp)) {
- if (acc_mode & MAY_WRITE)
- vfs_dq_init(nd.path.dentry->d_inode);
+ if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
+ audit_inode(pathname, nd.path.dentry);
- if (will_truncate) {
- error = handle_truncate(&nd.path);
- if (error) {
- fput(filp);
- filp = ERR_PTR(error);
- }
- }
- }
/*
- * It is now safe to drop the mnt write
- * because the filp has had a write taken
- * on its behalf.
+ * We have the parent and last component.
*/
- if (will_truncate)
- mnt_drop_write(nd.path.mnt);
+
+ error = -ENFILE;
+ filp = get_empty_filp();
+ if (filp == NULL)
+ goto exit_parent;
+ nd.intent.open.file = filp;
+ filp->f_flags = open_flag;
+ nd.intent.open.flags = flag;
+ nd.intent.open.create_mode = mode;
+ nd.flags &= ~LOOKUP_PARENT;
+ nd.flags |= LOOKUP_OPEN;
+ if (open_flag & O_CREAT) {
+ nd.flags |= LOOKUP_CREATE;
+ if (open_flag & O_EXCL)
+ nd.flags |= LOOKUP_EXCL;
+ }
+ filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+ while (unlikely(!filp)) { /* trailing symlink */
+ struct path holder;
+ struct inode *inode = path.dentry->d_inode;
+ void *cookie;
+ error = -ELOOP;
+ /* S_ISDIR part is a temporary automount kludge */
+ if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
+ goto exit_dput;
+ if (count++ == 32)
+ goto exit_dput;
+ /*
+ * This is subtle. Instead of calling do_follow_link() we do
+ * the thing by hands. The reason is that this way we have zero
+ * link_count and path_walk() (called from ->follow_link)
+ * honoring LOOKUP_PARENT. After that we have the parent and
+ * last component, i.e. we are in the same situation as after
+ * the first path_walk(). Well, almost - if the last component
+ * is normal we get its copy stored in nd->last.name and we will
+ * have to putname() it when we are done. Procfs-like symlinks
+ * just set LAST_BIND.
+ */
+ nd.flags |= LOOKUP_PARENT;
+ error = security_inode_follow_link(path.dentry, &nd);
+ if (error)
+ goto exit_dput;
+ error = __do_follow_link(&path, &nd, &cookie);
+ if (unlikely(error)) {
+ /* nd.path had been dropped */
+ if (!IS_ERR(cookie) && inode->i_op->put_link)
+ inode->i_op->put_link(path.dentry, &nd, cookie);
+ path_put(&path);
+ release_open_intent(&nd);
+ filp = ERR_PTR(error);
+ goto out;
+ }
+ holder = path;
+ nd.flags &= ~LOOKUP_PARENT;
+ filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+ if (inode->i_op->put_link)
+ inode->i_op->put_link(holder.dentry, &nd, cookie);
+ path_put(&holder);
+ }
+out:
if (nd.root.mnt)
path_put(&nd.root);
+ if (filp == ERR_PTR(-ESTALE) && !force_reval) {
+ force_reval = 1;
+ goto reval;
+ }
return filp;
-exit_mutex_unlock:
- mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
path_put_conditional(&path, &nd);
-exit:
if (!IS_ERR(nd.intent.open.file))
release_open_intent(&nd);
exit_parent:
- if (nd.root.mnt)
- path_put(&nd.root);
path_put(&nd.path);
- return ERR_PTR(error);
-
-do_link:
- error = -ELOOP;
- if (flag & O_NOFOLLOW)
- goto exit_dput;
- /*
- * This is subtle. Instead of calling do_follow_link() we do the
- * thing by hands. The reason is that this way we have zero link_count
- * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
- * After that we have the parent and last component, i.e.
- * we are in the same situation as after the first path_walk().
- * Well, almost - if the last component is normal we get its copy
- * stored in nd->last.name and we will have to putname() it when we
- * are done. Procfs-like symlinks just set LAST_BIND.
- */
- nd.flags |= LOOKUP_PARENT;
- error = security_inode_follow_link(path.dentry, &nd);
- if (error)
- goto exit_dput;
- error = __do_follow_link(&path, &nd);
- path_put(&path);
- if (error) {
- /* Does someone understand code flow here? Or it is only
- * me so stupid? Anathema to whoever designed this non-sense
- * with "intent.open".
- */
- release_open_intent(&nd);
- if (nd.root.mnt)
- path_put(&nd.root);
- if (error == -ESTALE && !force_reval) {
- force_reval = 1;
- goto reval;
- }
- return ERR_PTR(error);
- }
- nd.flags &= ~LOOKUP_PARENT;
- if (nd.last_type == LAST_BIND)
- goto ok;
- error = -EISDIR;
- if (nd.last_type != LAST_NORM)
- goto exit;
- if (nd.last.name[nd.last.len]) {
- __putname(nd.last.name);
- goto exit;
- }
- error = -ELOOP;
- if (count++==32) {
- __putname(nd.last.name);
- goto exit;
- }
- dir = nd.path.dentry;
- mutex_lock(&dir->d_inode->i_mutex);
- path.dentry = lookup_hash(&nd);
- path.mnt = nd.path.mnt;
- __putname(nd.last.name);
- goto do_last;
+ filp = ERR_PTR(error);
+ goto out;
}
/**
@@ -1983,7 +1981,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
if (error)
return error;
- vfs_dq_init(dir);
error = dir->i_op->mknod(dir, dentry, mode, dev);
if (!error)
fsnotify_create(dir, dentry);
@@ -2082,7 +2079,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if (error)
return error;
- vfs_dq_init(dir);
error = dir->i_op->mkdir(dir, dentry, mode);
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -2168,8 +2164,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
if (!dir->i_op->rmdir)
return -EPERM;
- vfs_dq_init(dir);
-
mutex_lock(&dentry->d_inode->i_mutex);
dentry_unhash(dentry);
if (d_mountpoint(dentry))
@@ -2255,8 +2249,6 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
if (!dir->i_op->unlink)
return -EPERM;
- vfs_dq_init(dir);
-
mutex_lock(&dentry->d_inode->i_mutex);
if (d_mountpoint(dentry))
error = -EBUSY;
@@ -2369,7 +2361,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
if (error)
return error;
- vfs_dq_init(dir);
error = dir->i_op->symlink(dir, dentry, oldname);
if (!error)
fsnotify_create(dir, dentry);
@@ -2453,7 +2444,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
return error;
mutex_lock(&inode->i_mutex);
- vfs_dq_init(dir);
error = dir->i_op->link(old_dentry, dir, new_dentry);
mutex_unlock(&inode->i_mutex);
if (!error)
@@ -2554,7 +2544,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* e) conversion from fhandle to dentry may come in the wrong moment - when
* we are removing the target. Solution: we will have to grab ->i_mutex
* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- * ->i_mutex on parents, which works but leads to some truely excessive
+ * ->i_mutex on parents, which works but leads to some truly excessive
* locking].
*/
static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
@@ -2654,9 +2644,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!old_dir->i_op->rename)
return -EPERM;
- vfs_dq_init(old_dir);
- vfs_dq_init(new_dir);
-
old_name = fsnotify_oldname_init(old_dentry->d_name.name);
if (is_dir)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 73ab220354d..36dfdae9512 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,7 +118,6 @@ nfs4_callback_up(struct svc_serv *serv)
dprintk("NFS: Callback listener port = %u (af %u)\n",
nfs_callback_tcpport, PF_INET);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
ret = svc_create_xprt(serv, "tcp", PF_INET6,
nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
if (ret > 0) {
@@ -129,7 +128,6 @@ nfs4_callback_up(struct svc_serv *serv)
ret = 0;
else
goto out_err;
-#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
return svc_prepare_thread(serv, &serv->sv_pools[0]);
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index d4036be0b58..85a7cfd1b8d 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -119,6 +119,14 @@ struct cb_recallanyargs {
};
extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
+
+struct cb_recallslotargs {
+ struct sockaddr *crsa_addr;
+ uint32_t crsa_target_max_slots;
+};
+extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
+ void *dummy);
+
#endif /* CONFIG_NFS_V4_1 */
extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index defa9b4c470..84761b5bb8e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -143,44 +143,49 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
* Return success if the sequenceID is one more than what we last saw on
* this slot, accounting for wraparound. Increments the slot's sequence.
*
- * We don't yet implement a duplicate request cache, so at this time
- * we will log replays, and process them as if we had not seen them before,
- * but we don't bump the sequence in the slot. Not too worried about it,
+ * We don't yet implement a duplicate request cache, instead we set the
+ * back channel ca_maxresponsesize_cached to zero. This is OK for now
* since we only currently implement idempotent callbacks anyway.
*
* We have a single slot backchannel at this time, so we don't bother
* checking the used_slots bit array on the table. The lower layer guarantees
* a single outstanding callback request at a time.
*/
-static int
-validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
+static __be32
+validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
{
struct nfs4_slot *slot;
dprintk("%s enter. slotid %d seqid %d\n",
- __func__, slotid, seqid);
+ __func__, args->csa_slotid, args->csa_sequenceid);
- if (slotid > NFS41_BC_MAX_CALLBACKS)
+ if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS)
return htonl(NFS4ERR_BADSLOT);
- slot = tbl->slots + slotid;
+ slot = tbl->slots + args->csa_slotid;
dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
/* Normal */
- if (likely(seqid == slot->seq_nr + 1)) {
+ if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
slot->seq_nr++;
return htonl(NFS4_OK);
}
/* Replay */
- if (seqid == slot->seq_nr) {
- dprintk("%s seqid %d is a replay - no DRC available\n",
- __func__, seqid);
- return htonl(NFS4_OK);
+ if (args->csa_sequenceid == slot->seq_nr) {
+ dprintk("%s seqid %d is a replay\n",
+ __func__, args->csa_sequenceid);
+ /* Signal process_op to set this error on next op */
+ if (args->csa_cachethis == 0)
+ return htonl(NFS4ERR_RETRY_UNCACHED_REP);
+
+ /* The ca_maxresponsesize_cached is 0 with no DRC */
+ else if (args->csa_cachethis == 1)
+ return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
}
/* Wraparound */
- if (seqid == 1 && (slot->seq_nr + 1) == 0) {
+ if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
slot->seq_nr = 1;
return htonl(NFS4_OK);
}
@@ -225,27 +230,87 @@ validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
return NULL;
}
-/* FIXME: referring calls should be processed */
-unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
+/*
+ * For each referring call triple, check the session's slot table for
+ * a match. If the slot is in use and the sequence numbers match, the
+ * client is still waiting for a response to the original request.
+ */
+static bool referring_call_exists(struct nfs_client *clp,
+ uint32_t nrclists,
+ struct referring_call_list *rclists)
+{
+ bool status = 0;
+ int i, j;
+ struct nfs4_session *session;
+ struct nfs4_slot_table *tbl;
+ struct referring_call_list *rclist;
+ struct referring_call *ref;
+
+ /*
+ * XXX When client trunking is implemented, this becomes
+ * a session lookup from within the loop
+ */
+ session = clp->cl_session;
+ tbl = &session->fc_slot_table;
+
+ for (i = 0; i < nrclists; i++) {
+ rclist = &rclists[i];
+ if (memcmp(session->sess_id.data,
+ rclist->rcl_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN) != 0)
+ continue;
+
+ for (j = 0; j < rclist->rcl_nrefcalls; j++) {
+ ref = &rclist->rcl_refcalls[j];
+
+ dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u "
+ "slotid %u\n", __func__,
+ ((u32 *)&rclist->rcl_sessionid.data)[0],
+ ((u32 *)&rclist->rcl_sessionid.data)[1],
+ ((u32 *)&rclist->rcl_sessionid.data)[2],
+ ((u32 *)&rclist->rcl_sessionid.data)[3],
+ ref->rc_sequenceid, ref->rc_slotid);
+
+ spin_lock(&tbl->slot_tbl_lock);
+ status = (test_bit(ref->rc_slotid, tbl->used_slots) &&
+ tbl->slots[ref->rc_slotid].seq_nr ==
+ ref->rc_sequenceid);
+ spin_unlock(&tbl->slot_tbl_lock);
+ if (status)
+ goto out;
+ }
+ }
+
+out:
+ return status;
+}
+
+__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
struct cb_sequenceres *res)
{
struct nfs_client *clp;
- int i, status;
-
- for (i = 0; i < args->csa_nrclists; i++)
- kfree(args->csa_rclists[i].rcl_refcalls);
- kfree(args->csa_rclists);
+ int i;
+ __be32 status;
status = htonl(NFS4ERR_BADSESSION);
clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
if (clp == NULL)
goto out;
- status = validate_seqid(&clp->cl_session->bc_slot_table,
- args->csa_slotid, args->csa_sequenceid);
+ status = validate_seqid(&clp->cl_session->bc_slot_table, args);
if (status)
goto out_putclient;
+ /*
+ * Check for pending referring calls. If a match is found, a
+ * related callback was received before the response to the original
+ * call.
+ */
+ if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
+ status = htonl(NFS4ERR_DELAY);
+ goto out_putclient;
+ }
+
memcpy(&res->csr_sessionid, &args->csa_sessionid,
sizeof(res->csr_sessionid));
res->csr_sequenceid = args->csa_sequenceid;
@@ -256,15 +321,23 @@ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
out_putclient:
nfs_put_client(clp);
out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- res->csr_status = status;
- return res->csr_status;
+ for (i = 0; i < args->csa_nrclists; i++)
+ kfree(args->csa_rclists[i].rcl_refcalls);
+ kfree(args->csa_rclists);
+
+ if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP))
+ res->csr_status = 0;
+ else
+ res->csr_status = status;
+ dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
+ ntohl(status), ntohl(res->csr_status));
+ return status;
}
-unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
+__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
{
struct nfs_client *clp;
- int status;
+ __be32 status;
fmode_t flags = 0;
status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
@@ -289,4 +362,40 @@ out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
}
+
+/* Reduce the fore channel's max_slots to the target value */
+__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
+{
+ struct nfs_client *clp;
+ struct nfs4_slot_table *fc_tbl;
+ __be32 status;
+
+ status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ clp = nfs_find_client(args->crsa_addr, 4);
+ if (clp == NULL)
+ goto out;
+
+ dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+ args->crsa_target_max_slots);
+
+ fc_tbl = &clp->cl_session->fc_slot_table;
+
+ status = htonl(NFS4ERR_BAD_HIGH_SLOT);
+ if (args->crsa_target_max_slots > fc_tbl->max_slots ||
+ args->crsa_target_max_slots < 1)
+ goto out_putclient;
+
+ status = htonl(NFS4_OK);
+ if (args->crsa_target_max_slots == fc_tbl->max_slots)
+ goto out_putclient;
+
+ fc_tbl->target_max_slots = args->crsa_target_max_slots;
+ nfs41_handle_recall_slot(clp);
+out_putclient:
+ nfs_put_client(clp); /* balance nfs_find_client */
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+ return status;
+}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 8e1a2511c8b..a2b8b4df125 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -24,10 +24,14 @@
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
4 + 1 + 3)
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#endif /* CONFIG_NFS_V4_1 */
#define NFSDBG_FACILITY NFSDBG_CALLBACK
+/* Internal error code */
+#define NFS4ERR_RESOURCE_HDR 11050
+
typedef __be32 (*callback_process_op_t)(void *, void *);
typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
@@ -173,7 +177,7 @@ static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
__be32 *p;
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
- return htonl(NFS4ERR_RESOURCE);
+ return htonl(NFS4ERR_RESOURCE_HDR);
*op = ntohl(*p);
return 0;
}
@@ -215,10 +219,10 @@ out:
#if defined(CONFIG_NFS_V4_1)
-static unsigned decode_sessionid(struct xdr_stream *xdr,
+static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
- uint32_t *p;
+ __be32 *p;
int len = NFS4_MAX_SESSIONID_LEN;
p = read_buf(xdr, len);
@@ -229,12 +233,12 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
return 0;
}
-static unsigned decode_rc_list(struct xdr_stream *xdr,
+static __be32 decode_rc_list(struct xdr_stream *xdr,
struct referring_call_list *rc_list)
{
- uint32_t *p;
+ __be32 *p;
int i;
- unsigned status;
+ __be32 status;
status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
if (status)
@@ -267,13 +271,13 @@ out:
return status;
}
-static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp,
+static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
struct xdr_stream *xdr,
struct cb_sequenceargs *args)
{
- uint32_t *p;
+ __be32 *p;
int i;
- unsigned status;
+ __be32 status;
status = decode_sessionid(xdr, &args->csa_sessionid);
if (status)
@@ -327,11 +331,11 @@ out_free:
goto out;
}
-static unsigned decode_recallany_args(struct svc_rqst *rqstp,
+static __be32 decode_recallany_args(struct svc_rqst *rqstp,
struct xdr_stream *xdr,
struct cb_recallanyargs *args)
{
- uint32_t *p;
+ __be32 *p;
args->craa_addr = svc_addr(rqstp);
p = read_buf(xdr, 4);
@@ -346,6 +350,20 @@ static unsigned decode_recallany_args(struct svc_rqst *rqstp,
return 0;
}
+static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct cb_recallslotargs *args)
+{
+ __be32 *p;
+
+ args->crsa_addr = svc_addr(rqstp);
+ p = read_buf(xdr, 4);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
+ args->crsa_target_max_slots = ntohl(*p++);
+ return 0;
+}
+
#endif /* CONFIG_NFS_V4_1 */
static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -465,7 +483,7 @@ static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
p = xdr_reserve_space(xdr, 8);
if (unlikely(p == NULL))
- return htonl(NFS4ERR_RESOURCE);
+ return htonl(NFS4ERR_RESOURCE_HDR);
*p++ = htonl(op);
*p = res;
return 0;
@@ -499,10 +517,10 @@ out:
#if defined(CONFIG_NFS_V4_1)
-static unsigned encode_sessionid(struct xdr_stream *xdr,
+static __be32 encode_sessionid(struct xdr_stream *xdr,
const struct nfs4_sessionid *sid)
{
- uint32_t *p;
+ __be32 *p;
int len = NFS4_MAX_SESSIONID_LEN;
p = xdr_reserve_space(xdr, len);
@@ -513,11 +531,11 @@ static unsigned encode_sessionid(struct xdr_stream *xdr,
return 0;
}
-static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp,
+static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
struct xdr_stream *xdr,
const struct cb_sequenceres *res)
{
- uint32_t *p;
+ __be32 *p;
unsigned status = res->csr_status;
if (unlikely(status != 0))
@@ -554,6 +572,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_RECALL:
case OP_CB_SEQUENCE:
case OP_CB_RECALL_ANY:
+ case OP_CB_RECALL_SLOT:
*op = &callback_ops[op_nr];
break;
@@ -562,7 +581,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_NOTIFY:
case OP_CB_PUSH_DELEG:
case OP_CB_RECALLABLE_OBJ_AVAIL:
- case OP_CB_RECALL_SLOT:
case OP_CB_WANTS_CANCELLED:
case OP_CB_NOTIFY_LOCK:
return htonl(NFS4ERR_NOTSUPP);
@@ -602,20 +620,18 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
static __be32 process_op(uint32_t minorversion, int nop,
struct svc_rqst *rqstp,
struct xdr_stream *xdr_in, void *argp,
- struct xdr_stream *xdr_out, void *resp)
+ struct xdr_stream *xdr_out, void *resp, int* drc_status)
{
struct callback_op *op = &callback_ops[0];
- unsigned int op_nr = OP_CB_ILLEGAL;
+ unsigned int op_nr;
__be32 status;
long maxlen;
__be32 res;
dprintk("%s: start\n", __func__);
status = decode_op_hdr(xdr_in, &op_nr);
- if (unlikely(status)) {
- status = htonl(NFS4ERR_OP_ILLEGAL);
- goto out;
- }
+ if (unlikely(status))
+ return status;
dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
__func__, minorversion, nop, op_nr);
@@ -624,19 +640,32 @@ static __be32 process_op(uint32_t minorversion, int nop,
preprocess_nfs4_op(op_nr, &op);
if (status == htonl(NFS4ERR_OP_ILLEGAL))
op_nr = OP_CB_ILLEGAL;
-out:
+ if (status)
+ goto encode_hdr;
+
+ if (*drc_status) {
+ status = *drc_status;
+ goto encode_hdr;
+ }
+
maxlen = xdr_out->end - xdr_out->p;
if (maxlen > 0 && maxlen < PAGE_SIZE) {
- if (likely(status == 0 && op->decode_args != NULL))
- status = op->decode_args(rqstp, xdr_in, argp);
- if (likely(status == 0 && op->process_op != NULL))
+ status = op->decode_args(rqstp, xdr_in, argp);
+ if (likely(status == 0))
status = op->process_op(argp, resp);
} else
status = htonl(NFS4ERR_RESOURCE);
+ /* Only set by OP_CB_SEQUENCE processing */
+ if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
+ *drc_status = status;
+ status = 0;
+ }
+
+encode_hdr:
res = encode_op_hdr(xdr_out, op_nr, status);
- if (status == 0)
- status = res;
+ if (unlikely(res))
+ return res;
if (op->encode_res != NULL && status == 0)
status = op->encode_res(rqstp, xdr_out, resp);
dprintk("%s: done, status = %d\n", __func__, ntohl(status));
@@ -652,7 +681,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
struct cb_compound_hdr_res hdr_res = { NULL };
struct xdr_stream xdr_in, xdr_out;
__be32 *p;
- __be32 status;
+ __be32 status, drc_status = 0;
unsigned int nops = 0;
dprintk("%s: start\n", __func__);
@@ -672,11 +701,18 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
return rpc_system_err;
while (status == 0 && nops != hdr_arg.nops) {
- status = process_op(hdr_arg.minorversion, nops,
- rqstp, &xdr_in, argp, &xdr_out, resp);
+ status = process_op(hdr_arg.minorversion, nops, rqstp,
+ &xdr_in, argp, &xdr_out, resp, &drc_status);
nops++;
}
+ /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return
+ * resource error in cb_compound status without returning op */
+ if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) {
+ status = htonl(NFS4ERR_RESOURCE);
+ nops--;
+ }
+
*hdr_res.status = status;
*hdr_res.nops = htonl(nops);
dprintk("%s: done, status = %u\n", __func__, ntohl(status));
@@ -713,6 +749,11 @@ static struct callback_op callback_ops[] = {
.decode_args = (callback_decode_arg_t)decode_recallany_args,
.res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
},
+ [OP_CB_RECALL_SLOT] = {
+ .process_op = (callback_process_op_t)nfs4_callback_recallslot,
+ .decode_args = (callback_decode_arg_t)decode_recallslot_args,
+ .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
+ },
#endif /* CONFIG_NFS_V4_1 */
};
@@ -741,6 +782,7 @@ struct svc_version nfs4_callback_version1 = {
.vs_proc = nfs4_callback_procedures1,
.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
.vs_dispatch = NULL,
+ .vs_hidden = 1,
};
struct svc_version nfs4_callback_version4 = {
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ee77713ce68..2274f173733 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -164,30 +164,7 @@ error_0:
return ERR_PTR(err);
}
-static void nfs4_shutdown_client(struct nfs_client *clp)
-{
-#ifdef CONFIG_NFS_V4
- if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
- nfs4_kill_renewd(clp);
- BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners));
- if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
- nfs_idmap_delete(clp);
-
- rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
-#endif
-}
-
-/*
- * Destroy the NFS4 callback service
- */
-static void nfs4_destroy_callback(struct nfs_client *clp)
-{
#ifdef CONFIG_NFS_V4
- if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
- nfs_callback_down(clp->cl_minorversion);
-#endif /* CONFIG_NFS_V4 */
-}
-
/*
* Clears/puts all minor version specific parts from an nfs_client struct
* reverting it to minorversion 0.
@@ -202,9 +179,33 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
clp->cl_call_sync = _nfs4_call_sync;
#endif /* CONFIG_NFS_V4_1 */
+}
+/*
+ * Destroy the NFS4 callback service
+ */
+static void nfs4_destroy_callback(struct nfs_client *clp)
+{
+ if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
+ nfs_callback_down(clp->cl_minorversion);
+}
+
+static void nfs4_shutdown_client(struct nfs_client *clp)
+{
+ if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
+ nfs4_kill_renewd(clp);
+ nfs4_clear_client_minor_version(clp);
nfs4_destroy_callback(clp);
+ if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
+ nfs_idmap_delete(clp);
+
+ rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
}
+#else
+static void nfs4_shutdown_client(struct nfs_client *clp)
+{
+}
+#endif /* CONFIG_NFS_V4 */
/*
* Destroy a shared client record
@@ -213,7 +214,6 @@ static void nfs_free_client(struct nfs_client *clp)
{
dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
- nfs4_clear_client_minor_version(clp);
nfs4_shutdown_client(clp);
nfs_fscache_release_client_cookie(clp);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 944b627ec6e..69e7b814012 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -71,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
}
#endif
+static inline int nfs_have_delegated_attributes(struct inode *inode)
+{
+ return nfs_have_delegation(inode, FMODE_READ) &&
+ !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
+}
+
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3c7f03b669f..c6f2750648f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -560,7 +560,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
desc->entry = &my_entry;
nfs_block_sillyrename(dentry);
- res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
+ res = nfs_revalidate_mapping(inode, filp->f_mapping);
if (res < 0)
goto out;
@@ -1789,7 +1789,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
cache = nfs_access_search_rbtree(inode, cred);
if (cache == NULL)
goto out;
- if (!nfs_have_delegation(inode, FMODE_READ) &&
+ if (!nfs_have_delegated_attributes(inode) &&
!time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
goto out_stale;
res->jiffies = cache->jiffies;
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 95e1ca765d4..3f0cd4dfdda 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -36,6 +36,19 @@ struct nfs_dns_ent {
};
+static void nfs_dns_ent_update(struct cache_head *cnew,
+ struct cache_head *ckey)
+{
+ struct nfs_dns_ent *new;
+ struct nfs_dns_ent *key;
+
+ new = container_of(cnew, struct nfs_dns_ent, h);
+ key = container_of(ckey, struct nfs_dns_ent, h);
+
+ memcpy(&new->addr, &key->addr, key->addrlen);
+ new->addrlen = key->addrlen;
+}
+
static void nfs_dns_ent_init(struct cache_head *cnew,
struct cache_head *ckey)
{
@@ -49,8 +62,7 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
if (new->hostname) {
new->namelen = key->namelen;
- memcpy(&new->addr, &key->addr, key->addrlen);
- new->addrlen = key->addrlen;
+ nfs_dns_ent_update(cnew, ckey);
} else {
new->namelen = 0;
new->addrlen = 0;
@@ -234,7 +246,7 @@ static struct cache_detail nfs_dns_resolve = {
.cache_show = nfs_dns_show,
.match = nfs_dns_match,
.init = nfs_dns_ent_init,
- .update = nfs_dns_ent_init,
+ .update = nfs_dns_ent_update,
.alloc = nfs_dns_ent_alloc,
};
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 63f2071d644..ae8d02294e4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -123,11 +123,11 @@ nfs_file_open(struct inode *inode, struct file *filp)
filp->f_path.dentry->d_parent->d_name.name,
filp->f_path.dentry->d_name.name);
+ nfs_inc_stats(inode, NFSIOS_VFSOPEN);
res = nfs_check_flags(filp->f_flags);
if (res)
return res;
- nfs_inc_stats(inode, NFSIOS_VFSOPEN);
res = nfs_open(inode, filp);
return res;
}
@@ -237,9 +237,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
dentry->d_parent->d_name.name,
dentry->d_name.name);
+ nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
if ((file->f_mode & FMODE_WRITE) == 0)
return 0;
- nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
/* Flush writes to the server and return any errors */
return nfs_do_fsync(ctx, inode);
@@ -262,9 +262,11 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
(unsigned long) count, (unsigned long) pos);
result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
- nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
- if (!result)
+ if (!result) {
result = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ if (result > 0)
+ nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
+ }
return result;
}
@@ -282,8 +284,11 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
(unsigned long) count, (unsigned long long) *ppos);
res = nfs_revalidate_mapping(inode, filp->f_mapping);
- if (!res)
+ if (!res) {
res = generic_file_splice_read(filp, ppos, pipe, count, flags);
+ if (res > 0)
+ nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
+ }
return res;
}
@@ -596,6 +601,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
{
struct dentry * dentry = iocb->ki_filp->f_path.dentry;
struct inode * inode = dentry->d_inode;
+ unsigned long written = 0;
ssize_t result;
size_t count = iov_length(iov, nr_segs);
@@ -622,14 +628,18 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (!count)
goto out;
- nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ if (result > 0)
+ written = result;
+
/* Return error values for O_DSYNC and IS_SYNC() */
if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
if (err < 0)
result = err;
}
+ if (result > 0)
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
out:
return result;
@@ -644,6 +654,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
{
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
+ unsigned long written = 0;
ssize_t ret;
dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
@@ -654,14 +665,17 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
* The combination of splice and an O_APPEND destination is disallowed.
*/
- nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
-
ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
+ if (ret > 0)
+ written = ret;
+
if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
if (err < 0)
ret = err;
}
+ if (ret > 0)
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
return ret;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7570573bdb3..e358df75a6a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
return ino;
}
-int nfs_write_inode(struct inode *inode, int sync)
-{
- int ret;
-
- if (sync) {
- ret = filemap_fdatawait(inode->i_mapping);
- if (ret == 0)
- ret = nfs_commit_inode(inode, FLUSH_SYNC);
- } else
- ret = nfs_commit_inode(inode, 0);
- if (ret >= 0)
- return 0;
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
- return ret;
-}
-
void nfs_clear_inode(struct inode *inode)
{
/*
@@ -130,16 +114,12 @@ void nfs_clear_inode(struct inode *inode)
*/
int nfs_sync_mapping(struct address_space *mapping)
{
- int ret;
+ int ret = 0;
- if (mapping->nrpages == 0)
- return 0;
- unmap_mapping_range(mapping, 0, 0, 0);
- ret = filemap_write_and_wait(mapping);
- if (ret != 0)
- goto out;
- ret = nfs_wb_all(mapping->host);
-out:
+ if (mapping->nrpages != 0) {
+ unmap_mapping_range(mapping, 0, 0, 0);
+ ret = nfs_wb_all(mapping->host);
+ }
return ret;
}
@@ -511,17 +491,11 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
int err;
- /*
- * Flush out writes to the server in order to update c/mtime.
- *
- * Hold the i_mutex to suspend application writes temporarily;
- * this prevents long-running writing applications from blocking
- * nfs_wb_nocommit.
- */
+ /* Flush out writes to the server in order to update c/mtime. */
if (S_ISREG(inode->i_mode)) {
- mutex_lock(&inode->i_mutex);
- nfs_wb_nocommit(inode);
- mutex_unlock(&inode->i_mutex);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto out;
}
/*
@@ -545,6 +519,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
}
+out:
return err;
}
@@ -620,11 +595,6 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
__put_nfs_open_context(ctx, 0);
}
-static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
-{
- __put_nfs_open_context(ctx, 1);
-}
-
/*
* Ensure that mmap has a recent RPC credential for use when writing out
* shared pages
@@ -671,7 +641,7 @@ static void nfs_file_clear_open_context(struct file *filp)
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
spin_unlock(&inode->i_lock);
- put_nfs_open_context_sync(ctx);
+ __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
}
}
@@ -759,7 +729,7 @@ int nfs_attribute_timeout(struct inode *inode)
{
struct nfs_inode *nfsi = NFS_I(inode);
- if (nfs_have_delegation(inode, FMODE_READ))
+ if (nfs_have_delegated_attributes(inode))
return 0;
return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
}
@@ -779,7 +749,7 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
return __nfs_revalidate_inode(server, inode);
}
-static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
+static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -800,49 +770,10 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
return 0;
}
-static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
-{
- int ret = 0;
-
- mutex_lock(&inode->i_mutex);
- if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) {
- ret = nfs_sync_mapping(mapping);
- if (ret == 0)
- ret = nfs_invalidate_mapping_nolock(inode, mapping);
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
-/**
- * nfs_revalidate_mapping_nolock - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- */
-int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
- int ret = 0;
-
- if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
- || nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
- ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
- if (ret < 0)
- goto out;
- }
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
- ret = nfs_invalidate_mapping_nolock(inode, mapping);
-out:
- return ret;
-}
-
/**
* nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
* @mapping - pointer to mapping
- *
- * This version of the function will take the inode->i_mutex and attempt to
- * flush out all dirty data if it needs to invalidate the page cache.
*/
int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
{
@@ -1420,6 +1351,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
nfsi->npages = 0;
+ nfsi->ncommit = 0;
atomic_set(&nfsi->silly_count, 1);
INIT_HLIST_HEAD(&nfsi->silly_list);
init_waitqueue_head(&nfsi->waitqueue);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d23b3..11f82f03c5d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
extern struct workqueue_struct *nfsiod_workqueue;
extern struct inode *nfs_alloc_inode(struct super_block *sb);
extern void nfs_destroy_inode(struct inode *);
-extern int nfs_write_inode(struct inode *,int);
+extern int nfs_write_inode(struct inode *, struct writeback_control *);
extern void nfs_clear_inode(struct inode *);
#ifdef CONFIG_NFS_V4
extern void nfs4_clear_inode(struct inode *);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3f8881d1a05..24992f0a29f 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,14 +22,14 @@
#define NFSDBG_FACILITY NFSDBG_PROC
-/* A wrapper to handle the EJUKEBOX error message */
+/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */
static int
nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
{
int res;
do {
res = rpc_call_sync(clnt, msg, flags);
- if (res != -EJUKEBOX)
+ if (res != -EJUKEBOX && res != -EKEYEXPIRED)
break;
schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
@@ -42,9 +42,10 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
static int
nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
{
- if (task->tk_status != -EJUKEBOX)
+ if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED)
return 0;
- nfs_inc_stats(inode, NFSIOS_DELAY);
+ if (task->tk_status == -EJUKEBOX)
+ nfs_inc_stats(inode, NFSIOS_DELAY);
task->tk_status = 0;
rpc_restart_call(task);
rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0c6fda33d66..a187200a7aa 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -46,6 +46,7 @@ enum nfs4_client_state {
NFS4CLNT_DELEGRETURN,
NFS4CLNT_SESSION_RESET,
NFS4CLNT_SESSION_DRAINING,
+ NFS4CLNT_RECALL_SLOT,
};
/*
@@ -280,6 +281,7 @@ extern void nfs4_schedule_state_manager(struct nfs_client *);
extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
+extern void nfs41_handle_recall_slot(struct nfs_client *clp);
extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 84d83be25a9..f9254fb0c9d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -281,6 +281,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
}
case -NFS4ERR_GRACE:
case -NFS4ERR_DELAY:
+ case -EKEYEXPIRED:
ret = nfs4_delay(server->client, &exception->timeout);
if (ret != 0)
break;
@@ -418,7 +419,8 @@ static void nfs41_sequence_done(struct nfs_client *clp,
clp->cl_last_renewal = timestamp;
spin_unlock(&clp->cl_lock);
/* Check sequence flags */
- nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+ if (atomic_read(&clp->cl_count) > 1)
+ nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
}
out:
/* The session may be reset by one of the error handlers. */
@@ -1163,7 +1165,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
int err;
do {
err = _nfs4_do_open_reclaim(ctx, state);
- if (err != -NFS4ERR_DELAY)
+ if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
break;
nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
@@ -1582,6 +1584,7 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
goto out;
case -NFS4ERR_GRACE:
case -NFS4ERR_DELAY:
+ case -EKEYEXPIRED:
nfs4_handle_exception(server, err, &exception);
err = 0;
}
@@ -3145,10 +3148,19 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
* nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
* standalone procedure for queueing an asynchronous RENEW.
*/
+static void nfs4_renew_release(void *data)
+{
+ struct nfs_client *clp = data;
+
+ if (atomic_read(&clp->cl_count) > 1)
+ nfs4_schedule_state_renewal(clp);
+ nfs_put_client(clp);
+}
+
static void nfs4_renew_done(struct rpc_task *task, void *data)
{
- struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp;
- unsigned long timestamp = (unsigned long)data;
+ struct nfs_client *clp = data;
+ unsigned long timestamp = task->tk_start;
if (task->tk_status < 0) {
/* Unless we're shutting down, schedule state recovery! */
@@ -3164,6 +3176,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
static const struct rpc_call_ops nfs4_renew_ops = {
.rpc_call_done = nfs4_renew_done,
+ .rpc_release = nfs4_renew_release,
};
int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3174,8 +3187,10 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
.rpc_cred = cred,
};
+ if (!atomic_inc_not_zero(&clp->cl_count))
+ return -EIO;
return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
- &nfs4_renew_ops, (void *)jiffies);
+ &nfs4_renew_ops, clp);
}
int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3452,6 +3467,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
if (server)
nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE:
+ case -EKEYEXPIRED:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
task->tk_status = 0;
return -EAGAIN;
@@ -3564,6 +3580,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
case -NFS4ERR_RESOURCE:
/* The IBM lawyers misread another document! */
case -NFS4ERR_DELAY:
+ case -EKEYEXPIRED:
err = nfs4_delay(clp->cl_rpcclient, &timeout);
}
} while (err == 0);
@@ -4179,7 +4196,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
- if (err != -NFS4ERR_DELAY)
+ if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
break;
nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
@@ -4204,6 +4221,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
goto out;
case -NFS4ERR_GRACE:
case -NFS4ERR_DELAY:
+ case -EKEYEXPIRED:
nfs4_handle_exception(server, err, &exception);
err = 0;
}
@@ -4355,6 +4373,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
err = 0;
goto out;
case -NFS4ERR_DELAY:
+ case -EKEYEXPIRED:
break;
}
err = nfs4_handle_exception(server, err, &exception);
@@ -4500,7 +4519,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
- if (status != NFS4ERR_CLID_INUSE)
+ if (status != -NFS4ERR_CLID_INUSE)
break;
if (signalled())
@@ -4554,6 +4573,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case -NFS4ERR_DELAY:
case -NFS4ERR_GRACE:
+ case -EKEYEXPIRED:
dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
rpc_delay(task, NFS4_POLL_RETRY_MIN);
task->tk_status = 0;
@@ -4611,26 +4631,32 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
/*
* Reset a slot table
*/
-static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots,
- int old_max_slots, int ivalue)
+static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
+ int ivalue)
{
+ struct nfs4_slot *new = NULL;
int i;
int ret = 0;
- dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl);
+ dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
+ max_reqs, tbl->max_slots);
- /*
- * Until we have dynamic slot table adjustment, insist
- * upon the same slot table size
- */
- if (max_slots != old_max_slots) {
- dprintk("%s reset slot table does't match old\n",
- __func__);
- ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */
- goto out;
+ /* Does the newly negotiated max_reqs match the existing slot table? */
+ if (max_reqs != tbl->max_slots) {
+ ret = -ENOMEM;
+ new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
+ GFP_KERNEL);
+ if (!new)
+ goto out;
+ ret = 0;
+ kfree(tbl->slots);
}
spin_lock(&tbl->slot_tbl_lock);
- for (i = 0; i < max_slots; ++i)
+ if (new) {
+ tbl->slots = new;
+ tbl->max_slots = max_reqs;
+ }
+ for (i = 0; i < tbl->max_slots; ++i)
tbl->slots[i].seq_nr = ivalue;
spin_unlock(&tbl->slot_tbl_lock);
dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
@@ -4648,16 +4674,12 @@ static int nfs4_reset_slot_tables(struct nfs4_session *session)
int status;
status = nfs4_reset_slot_table(&session->fc_slot_table,
- session->fc_attrs.max_reqs,
- session->fc_slot_table.max_slots,
- 1);
+ session->fc_attrs.max_reqs, 1);
if (status)
return status;
status = nfs4_reset_slot_table(&session->bc_slot_table,
- session->bc_attrs.max_reqs,
- session->bc_slot_table.max_slots,
- 0);
+ session->bc_attrs.max_reqs, 0);
return status;
}
@@ -4798,16 +4820,14 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
args->fc_attrs.headerpadsz = 0;
args->fc_attrs.max_rqst_sz = mxrqst_sz;
args->fc_attrs.max_resp_sz = mxresp_sz;
- args->fc_attrs.max_resp_sz_cached = mxresp_sz;
args->fc_attrs.max_ops = NFS4_MAX_OPS;
args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
- "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
+ "max_ops=%u max_reqs=%u\n",
__func__,
args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
- args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops,
- args->fc_attrs.max_reqs);
+ args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
/* Back channel attributes */
args->bc_attrs.headerpadsz = 0;
@@ -5016,7 +5036,16 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
&res, args.sa_cache_this, 1);
}
-void nfs41_sequence_call_done(struct rpc_task *task, void *data)
+static void nfs41_sequence_release(void *data)
+{
+ struct nfs_client *clp = (struct nfs_client *)data;
+
+ if (atomic_read(&clp->cl_count) > 1)
+ nfs4_schedule_state_renewal(clp);
+ nfs_put_client(clp);
+}
+
+static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
{
struct nfs_client *clp = (struct nfs_client *)data;
@@ -5024,6 +5053,8 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
if (task->tk_status < 0) {
dprintk("%s ERROR %d\n", __func__, task->tk_status);
+ if (atomic_read(&clp->cl_count) == 1)
+ goto out;
if (_nfs4_async_handle_error(task, NULL, clp, NULL)
== -EAGAIN) {
@@ -5032,7 +5063,7 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
}
}
dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
-
+out:
kfree(task->tk_msg.rpc_argp);
kfree(task->tk_msg.rpc_resp);
@@ -5057,6 +5088,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
static const struct rpc_call_ops nfs41_sequence_ops = {
.rpc_call_done = nfs41_sequence_call_done,
.rpc_call_prepare = nfs41_sequence_prepare,
+ .rpc_release = nfs41_sequence_release,
};
static int nfs41_proc_async_sequence(struct nfs_client *clp,
@@ -5069,12 +5101,14 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
.rpc_cred = cred,
};
+ if (!atomic_inc_not_zero(&clp->cl_count))
+ return -EIO;
args = kzalloc(sizeof(*args), GFP_KERNEL);
- if (!args)
- return -ENOMEM;
res = kzalloc(sizeof(*res), GFP_KERNEL);
- if (!res) {
+ if (!args || !res) {
kfree(args);
+ kfree(res);
+ nfs_put_client(clp);
return -ENOMEM;
}
res->sr_slotid = NFS4_MAX_SLOT_TABLE;
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 0156c01c212..d87f10327b7 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -36,11 +36,6 @@
* as an rpc_task, not a real kernel thread, so it always runs in rpciod's
* context. There is one renewd per nfs_server.
*
- * TODO: If the send queue gets backlogged (e.g., if the server goes down),
- * we will keep filling the queue with periodic RENEW requests. We need a
- * mechanism for ensuring that if renewd successfully sends off a request,
- * then it only wakes up when the request is finished. Maybe use the
- * child task framework of the RPC layer?
*/
#include <linux/mm.h>
@@ -63,7 +58,7 @@ nfs4_renew_state(struct work_struct *work)
struct nfs_client *clp =
container_of(work, struct nfs_client, cl_renewd.work);
struct rpc_cred *cred;
- long lease, timeout;
+ long lease;
unsigned long last, now;
ops = nfs4_state_renewal_ops[clp->cl_minorversion];
@@ -75,7 +70,6 @@ nfs4_renew_state(struct work_struct *work)
lease = clp->cl_lease_time;
last = clp->cl_last_renewal;
now = jiffies;
- timeout = (2 * lease) / 3 + (long)last - (long)now;
/* Are we close to a lease timeout? */
if (time_after(now, last + lease/3)) {
cred = ops->get_state_renewal_cred_locked(clp);
@@ -90,19 +84,15 @@ nfs4_renew_state(struct work_struct *work)
/* Queue an asynchronous RENEW. */
ops->sched_state_renewal(clp, cred);
put_rpccred(cred);
+ goto out_exp;
}
- timeout = (2 * lease) / 3;
- spin_lock(&clp->cl_lock);
- } else
+ } else {
dprintk("%s: failed to call renewd. Reason: lease not expired \n",
__func__);
- if (timeout < 5 * HZ) /* safeguard */
- timeout = 5 * HZ;
- dprintk("%s: requeueing work. Lease period = %ld\n",
- __func__, (timeout + HZ - 1) / HZ);
- cancel_delayed_work(&clp->cl_renewd);
- schedule_delayed_work(&clp->cl_renewd, timeout);
- spin_unlock(&clp->cl_lock);
+ spin_unlock(&clp->cl_lock);
+ }
+ nfs4_schedule_state_renewal(clp);
+out_exp:
nfs_expire_unreferenced_delegations(clp);
out:
dprintk("%s: done\n", __func__);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c1e2733f4fa..6c5ed51f105 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1249,26 +1249,65 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
}
#ifdef CONFIG_NFS_V4_1
+void nfs41_handle_recall_slot(struct nfs_client *clp)
+{
+ set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
+ nfs4_schedule_state_recovery(clp);
+}
+
+static void nfs4_reset_all_state(struct nfs_client *clp)
+{
+ if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
+ clp->cl_boot_time = CURRENT_TIME;
+ nfs4_state_start_reclaim_nograce(clp);
+ nfs4_schedule_state_recovery(clp);
+ }
+}
+
+static void nfs41_handle_server_reboot(struct nfs_client *clp)
+{
+ if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
+ nfs4_state_start_reclaim_reboot(clp);
+ nfs4_schedule_state_recovery(clp);
+ }
+}
+
+static void nfs41_handle_state_revoked(struct nfs_client *clp)
+{
+ /* Temporary */
+ nfs4_reset_all_state(clp);
+}
+
+static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
+{
+ /* This will need to handle layouts too */
+ nfs_expire_all_delegations(clp);
+}
+
+static void nfs41_handle_cb_path_down(struct nfs_client *clp)
+{
+ nfs_expire_all_delegations(clp);
+ if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
+ nfs4_schedule_state_recovery(clp);
+}
+
void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
{
if (!flags)
return;
- else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) {
- set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
- nfs4_state_start_reclaim_reboot(clp);
- nfs4_schedule_state_recovery(clp);
- } else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
+ else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
+ nfs41_handle_server_reboot(clp);
+ else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
SEQ4_STATUS_ADMIN_STATE_REVOKED |
- SEQ4_STATUS_RECALLABLE_STATE_REVOKED |
- SEQ4_STATUS_LEASE_MOVED)) {
- set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
- nfs4_state_start_reclaim_nograce(clp);
- nfs4_schedule_state_recovery(clp);
- } else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
+ SEQ4_STATUS_LEASE_MOVED))
+ nfs41_handle_state_revoked(clp);
+ else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
+ nfs41_handle_recallable_state_revoked(clp);
+ else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
SEQ4_STATUS_BACKCHANNEL_FAULT |
SEQ4_STATUS_CB_PATH_DOWN_SESSION))
- nfs_expire_all_delegations(clp);
+ nfs41_handle_cb_path_down(clp);
}
static int nfs4_reset_session(struct nfs_client *clp)
@@ -1285,23 +1324,52 @@ static int nfs4_reset_session(struct nfs_client *clp)
memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
status = nfs4_proc_create_session(clp);
- if (status)
+ if (status) {
status = nfs4_recovery_handle_error(clp, status);
+ goto out;
+ }
+ /* create_session negotiated new slot table */
+ clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
-out:
- /*
- * Let the state manager reestablish state
- */
- if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
- status == 0)
+ /* Let the state manager reestablish state */
+ if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
nfs41_setup_state_renewal(clp);
-
+out:
return status;
}
+static int nfs4_recall_slot(struct nfs_client *clp)
+{
+ struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table;
+ struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs;
+ struct nfs4_slot *new, *old;
+ int i;
+
+ nfs4_begin_drain_session(clp);
+ new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
+ GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ spin_lock(&fc_tbl->slot_tbl_lock);
+ for (i = 0; i < fc_tbl->target_max_slots; i++)
+ new[i].seq_nr = fc_tbl->slots[i].seq_nr;
+ old = fc_tbl->slots;
+ fc_tbl->slots = new;
+ fc_tbl->max_slots = fc_tbl->target_max_slots;
+ fc_tbl->target_max_slots = 0;
+ fc_attrs->max_reqs = fc_tbl->max_slots;
+ spin_unlock(&fc_tbl->slot_tbl_lock);
+
+ kfree(old);
+ nfs4_end_drain_session(clp);
+ return 0;
+}
+
#else /* CONFIG_NFS_V4_1 */
static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
+static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
#endif /* CONFIG_NFS_V4_1 */
/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
@@ -1314,6 +1382,7 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
case -NFS4ERR_DELAY:
case -NFS4ERR_CLID_INUSE:
case -EAGAIN:
+ case -EKEYEXPIRED:
break;
case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
@@ -1397,6 +1466,15 @@ static void nfs4_state_manager(struct nfs_client *clp)
nfs_client_return_marked_delegations(clp);
continue;
}
+ /* Recall session slots */
+ if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
+ && nfs4_has_session(clp)) {
+ status = nfs4_recall_slot(clp);
+ if (status < 0)
+ goto out_error;
+ continue;
+ }
+
nfs4_clear_state_manager_bit(clp);
/* Did we race with an attempt to give us more work? */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5cd5184b56d..4d338be492c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1578,6 +1578,14 @@ static void encode_create_session(struct xdr_stream *xdr,
char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
uint32_t len;
struct nfs_client *clp = args->client;
+ u32 max_resp_sz_cached;
+
+ /*
+ * Assumes OPEN is the biggest non-idempotent compound.
+ * 2 is the verifier.
+ */
+ max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
+ RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
len = scnprintf(machine_name, sizeof(machine_name), "%s",
clp->cl_ipaddr);
@@ -1592,7 +1600,7 @@ static void encode_create_session(struct xdr_stream *xdr,
*p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
- *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */
+ *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */
*p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
*p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
*p++ = cpu_to_be32(0); /* rdmachannel_attrs */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a12c45b65dd..29d9d36cd5f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -112,12 +112,10 @@ void nfs_unlock_request(struct nfs_page *req)
*/
int nfs_set_page_tag_locked(struct nfs_page *req)
{
- struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
-
if (!nfs_lock_request_dontget(req))
return 0;
if (req->wb_page != NULL)
- radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+ radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
return 1;
}
@@ -126,10 +124,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
*/
void nfs_clear_page_tag_locked(struct nfs_page *req)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
- struct nfs_inode *nfsi = NFS_I(inode);
-
if (req->wb_page != NULL) {
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
spin_lock(&inode->i_lock);
radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
nfs_unlock_request(req);
@@ -142,16 +140,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
* nfs_clear_request - Free up all resources allocated to the request
* @req:
*
- * Release page resources associated with a write request after it
- * has completed.
+ * Release page and open context resources associated with a read/write
+ * request after it has completed.
*/
void nfs_clear_request(struct nfs_page *req)
{
struct page *page = req->wb_page;
+ struct nfs_open_context *ctx = req->wb_context;
+
if (page != NULL) {
page_cache_release(page);
req->wb_page = NULL;
}
+ if (ctx != NULL) {
+ put_nfs_open_context(ctx);
+ req->wb_context = NULL;
+ }
}
@@ -165,9 +169,8 @@ static void nfs_free_request(struct kref *kref)
{
struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
- /* Release struct file or cached credential */
+ /* Release struct file and open context */
nfs_clear_request(req);
- put_nfs_open_context(req->wb_context);
nfs_page_free(req);
}
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ef583854d8d..c752d944fe9 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -47,6 +47,39 @@
#define NFSDBG_FACILITY NFSDBG_PROC
/*
+ * wrapper to handle the -EKEYEXPIRED error message. This should generally
+ * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't
+ * support the NFSERR_JUKEBOX error code, but we handle this situation in the
+ * same way that we handle that error with NFSv3.
+ */
+static int
+nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
+{
+ int res;
+ do {
+ res = rpc_call_sync(clnt, msg, flags);
+ if (res != -EKEYEXPIRED)
+ break;
+ schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+ res = -ERESTARTSYS;
+ } while (!fatal_signal_pending(current));
+ return res;
+}
+
+#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags)
+
+static int
+nfs_async_handle_expired_key(struct rpc_task *task)
+{
+ if (task->tk_status != -EKEYEXPIRED)
+ return 0;
+ task->tk_status = 0;
+ rpc_restart_call(task);
+ rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+ return 1;
+}
+
+/*
* Bare-bones access to getattr: this is for nfs_read_super.
*/
static int
@@ -307,6 +340,8 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
{
+ if (nfs_async_handle_expired_key(task))
+ return 0;
nfs_mark_for_revalidate(dir);
return 1;
}
@@ -560,6 +595,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
{
+ if (nfs_async_handle_expired_key(task))
+ return -EAGAIN;
+
nfs_invalidate_atime(data->inode);
if (task->tk_status >= 0) {
nfs_refresh_inode(data->inode, data->res.fattr);
@@ -579,6 +617,9 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
{
+ if (nfs_async_handle_expired_key(task))
+ return -EAGAIN;
+
if (task->tk_status >= 0)
nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
return 0;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1afee4eea7..6baf9a39346 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2214,7 +2214,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
} else {
error = nfs_bdi_register(server);
if (error)
- goto error_splat_super;
+ goto error_splat_bdi;
}
if (!s->s_root) {
@@ -2256,6 +2256,9 @@ out_err_nosb:
error_splat_root:
dput(mntroot);
error_splat_super:
+ if (server && !s->s_root)
+ bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
deactivate_locked_super(s);
goto out;
}
@@ -2326,7 +2329,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
} else {
error = nfs_bdi_register(server);
if (error)
- goto error_splat_super;
+ goto error_splat_bdi;
}
if (!s->s_root) {
@@ -2363,6 +2366,9 @@ out_err_noserver:
return error;
error_splat_super:
+ if (server && !s->s_root)
+ bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
deactivate_locked_super(s);
dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
return error;
@@ -2578,7 +2584,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
} else {
error = nfs_bdi_register(server);
if (error)
- goto error_splat_super;
+ goto error_splat_bdi;
}
if (!s->s_root) {
@@ -2616,6 +2622,9 @@ out_free:
error_splat_root:
dput(mntroot);
error_splat_super:
+ if (server && !s->s_root)
+ bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
deactivate_locked_super(s);
goto out;
}
@@ -2811,7 +2820,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
} else {
error = nfs_bdi_register(server);
if (error)
- goto error_splat_super;
+ goto error_splat_bdi;
}
if (!s->s_root) {
@@ -2847,6 +2856,9 @@ out_err_noserver:
return error;
error_splat_super:
+ if (server && !s->s_root)
+ bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
deactivate_locked_super(s);
dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
return error;
@@ -2893,7 +2905,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
} else {
error = nfs_bdi_register(server);
if (error)
- goto error_splat_super;
+ goto error_splat_bdi;
}
if (!s->s_root) {
@@ -2929,6 +2941,9 @@ out_err_noserver:
return error;
error_splat_super:
+ if (server && !s->s_root)
+ bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
deactivate_locked_super(s);
dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
return error;
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 412738dbfbc..2ea9e5c27e5 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -50,7 +50,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
struct page *page;
void *err;
- err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping));
+ err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
if (err)
goto read_failed;
page = read_cache_page(&inode->i_data, 0,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d63d964a039..53ff70e2399 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -438,6 +438,7 @@ nfs_mark_request_commit(struct nfs_page *req)
radix_tree_tag_set(&nfsi->nfs_page_tree,
req->wb_index,
NFS_PAGE_TAG_COMMIT);
+ nfsi->ncommit++;
spin_unlock(&inode->i_lock);
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
@@ -501,57 +502,6 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
}
#endif
-/*
- * Wait for a request to complete.
- *
- * Interruptible by fatal signals only.
- */
-static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
- struct nfs_page *req;
- pgoff_t idx_end, next;
- unsigned int res = 0;
- int error;
-
- if (npages == 0)
- idx_end = ~0;
- else
- idx_end = idx_start + npages - 1;
-
- next = idx_start;
- while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) {
- if (req->wb_index > idx_end)
- break;
-
- next = req->wb_index + 1;
- BUG_ON(!NFS_WBACK_BUSY(req));
-
- kref_get(&req->wb_kref);
- spin_unlock(&inode->i_lock);
- error = nfs_wait_on_request(req);
- nfs_release_request(req);
- spin_lock(&inode->i_lock);
- if (error < 0)
- return error;
- res++;
- }
- return res;
-}
-
-static void nfs_cancel_commit_list(struct list_head *head)
-{
- struct nfs_page *req;
-
- while(!list_empty(head)) {
- req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_clear_request_commit(req);
- nfs_inode_remove_request(req);
- nfs_unlock_request(req);
- }
-}
-
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
static int
nfs_need_commit(struct nfs_inode *nfsi)
@@ -573,11 +523,17 @@ static int
nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ int ret;
if (!nfs_need_commit(nfsi))
return 0;
- return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
+ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
+ if (ret > 0)
+ nfsi->ncommit -= ret;
+ if (nfs_need_commit(NFS_I(inode)))
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ return ret;
}
#else
static inline int nfs_need_commit(struct nfs_inode *nfsi)
@@ -642,9 +598,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
spin_lock(&inode->i_lock);
}
- if (nfs_clear_request_commit(req))
- radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
- req->wb_index, NFS_PAGE_TAG_COMMIT);
+ if (nfs_clear_request_commit(req) &&
+ radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
+ req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL)
+ NFS_I(inode)->ncommit--;
/* Okay, the request matches. Update the region */
if (offset < req->wb_offset) {
@@ -1391,7 +1348,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
.rpc_release = nfs_commit_release,
};
-int nfs_commit_inode(struct inode *inode, int how)
+static int nfs_commit_inode(struct inode *inode, int how)
{
LIST_HEAD(head);
int res;
@@ -1406,92 +1363,51 @@ int nfs_commit_inode(struct inode *inode, int how)
}
return res;
}
-#else
-static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
-{
- return 0;
-}
-#endif
-long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
+static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
{
- struct inode *inode = mapping->host;
- pgoff_t idx_start, idx_end;
- unsigned int npages = 0;
- LIST_HEAD(head);
- int nocommit = how & FLUSH_NOCOMMIT;
- long pages, ret;
-
- /* FIXME */
- if (wbc->range_cyclic)
- idx_start = 0;
- else {
- idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
- idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
- if (idx_end > idx_start) {
- pgoff_t l_npages = 1 + idx_end - idx_start;
- npages = l_npages;
- if (sizeof(npages) != sizeof(l_npages) &&
- (pgoff_t)npages != l_npages)
- npages = 0;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int flags = FLUSH_SYNC;
+ int ret = 0;
+
+ /* Don't commit yet if this is a non-blocking flush and there are
+ * lots of outstanding writes for this mapping.
+ */
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ nfsi->ncommit <= (nfsi->npages >> 1))
+ goto out_mark_dirty;
+
+ if (wbc->nonblocking || wbc->for_background)
+ flags = 0;
+ ret = nfs_commit_inode(inode, flags);
+ if (ret >= 0) {
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (ret < wbc->nr_to_write)
+ wbc->nr_to_write -= ret;
+ else
+ wbc->nr_to_write = 0;
}
+ return 0;
}
- how &= ~FLUSH_NOCOMMIT;
- spin_lock(&inode->i_lock);
- do {
- ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
- if (ret != 0)
- continue;
- if (nocommit)
- break;
- pages = nfs_scan_commit(inode, &head, idx_start, npages);
- if (pages == 0)
- break;
- if (how & FLUSH_INVALIDATE) {
- spin_unlock(&inode->i_lock);
- nfs_cancel_commit_list(&head);
- ret = pages;
- spin_lock(&inode->i_lock);
- continue;
- }
- pages += nfs_scan_commit(inode, &head, 0, 0);
- spin_unlock(&inode->i_lock);
- ret = nfs_commit_list(inode, &head, how);
- spin_lock(&inode->i_lock);
-
- } while (ret >= 0);
- spin_unlock(&inode->i_lock);
+out_mark_dirty:
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
-
-static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
+#else
+static int nfs_commit_inode(struct inode *inode, int how)
{
- int ret;
-
- ret = nfs_writepages(mapping, wbc);
- if (ret < 0)
- goto out;
- ret = nfs_sync_mapping_wait(mapping, wbc, how);
- if (ret < 0)
- goto out;
return 0;
-out:
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- return ret;
}
-/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */
-static int nfs_write_mapping(struct address_space *mapping, int how)
+static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
{
- struct writeback_control wbc = {
- .bdi = mapping->backing_dev_info,
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
+ return 0;
+}
+#endif
- return __nfs_write_mapping(mapping, &wbc, how);
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ return nfs_commit_unstable_pages(inode, wbc);
}
/*
@@ -1499,37 +1415,26 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
*/
int nfs_wb_all(struct inode *inode)
{
- return nfs_write_mapping(inode->i_mapping, 0);
-}
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
-int nfs_wb_nocommit(struct inode *inode)
-{
- return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
+ return sync_inode(inode, &wbc);
}
int nfs_wb_page_cancel(struct inode *inode, struct page *page)
{
struct nfs_page *req;
- loff_t range_start = page_offset(page);
- loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
- struct writeback_control wbc = {
- .bdi = page->mapping->backing_dev_info,
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = range_start,
- .range_end = range_end,
- };
int ret = 0;
BUG_ON(!PageLocked(page));
for (;;) {
req = nfs_page_find_request(page);
if (req == NULL)
- goto out;
- if (test_bit(PG_CLEAN, &req->wb_flags)) {
- nfs_release_request(req);
break;
- }
if (nfs_lock_request_dontget(req)) {
nfs_inode_remove_request(req);
/*
@@ -1543,54 +1448,54 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
ret = nfs_wait_on_request(req);
nfs_release_request(req);
if (ret < 0)
- goto out;
+ break;
}
- if (!PagePrivate(page))
- return 0;
- ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE);
-out:
return ret;
}
-static int nfs_wb_page_priority(struct inode *inode, struct page *page,
- int how)
+/*
+ * Write back all requests on one page - we do this before reading it.
+ */
+int nfs_wb_page(struct inode *inode, struct page *page)
{
loff_t range_start = page_offset(page);
loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
struct writeback_control wbc = {
- .bdi = page->mapping->backing_dev_info,
.sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
+ .nr_to_write = 0,
.range_start = range_start,
.range_end = range_end,
};
+ struct nfs_page *req;
+ int need_commit;
int ret;
- do {
+ while(PagePrivate(page)) {
if (clear_page_dirty_for_io(page)) {
ret = nfs_writepage_locked(page, &wbc);
if (ret < 0)
goto out_error;
- } else if (!PagePrivate(page))
+ }
+ req = nfs_find_and_lock_request(page);
+ if (!req)
break;
- ret = nfs_sync_mapping_wait(page->mapping, &wbc, how);
- if (ret < 0)
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
goto out_error;
- } while (PagePrivate(page));
+ }
+ need_commit = test_bit(PG_CLEAN, &req->wb_flags);
+ nfs_clear_page_tag_locked(req);
+ if (need_commit) {
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret < 0)
+ goto out_error;
+ }
+ }
return 0;
out_error:
- __mark_inode_dirty(inode, I_DIRTY_PAGES);
return ret;
}
-/*
- * Write back all requests on one page - we do this before reading it.
- */
-int nfs_wb_page(struct inode *inode, struct page* page)
-{
- return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
-}
-
#ifdef CONFIG_MIGRATION
int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page)
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c6eed2a3b09..4bc22c763de 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -525,6 +525,8 @@ static struct rpc_cred *callback_cred;
int set_callback_cred(void)
{
+ if (callback_cred)
+ return 0;
callback_cred = rpc_lookup_machine_cred();
if (!callback_cred)
return -ENOMEM;
@@ -542,7 +544,8 @@ void do_probe_callback(struct nfs4_client *clp)
};
int status;
- status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
+ status = rpc_call_async(cb->cb_client, &msg,
+ RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
&nfsd4_cb_probe_ops, (void *)clp);
if (status) {
warn_no_callback_path(clp, status);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5a754f7b71e..98fb98e330b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -119,9 +119,7 @@ out_no_tfm:
static void
nfsd4_sync_rec_dir(void)
{
- mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
- nfsd_sync_dir(rec_dir.dentry);
- mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+ vfs_fsync(NULL, rec_dir.dentry, 0);
}
int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f19ed866c95..c97fddbd17d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1998,7 +1998,9 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access)
{
if (share_access & NFS4_SHARE_ACCESS_WRITE) {
drop_file_write_access(filp);
+ spin_lock(&filp->f_lock);
filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
+ spin_unlock(&filp->f_lock);
}
}
@@ -2480,8 +2482,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
}
memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
- if (nfsd4_has_session(&resp->cstate))
+ if (nfsd4_has_session(&resp->cstate)) {
open->op_stateowner->so_confirmed = 1;
+ nfsd4_create_clid_dir(open->op_stateowner->so_client);
+ }
/*
* Attempt to hand out a delegation. No error return, because the
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index bbf72d8f9fc..c47b4d7bafa 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1434,7 +1434,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
}
op->opnum = ntohl(*argp->p++);
- if (op->opnum >= OP_ACCESS && op->opnum < ops->nops)
+ if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
op->status = ops->decoders[op->opnum](argp, &op->u);
else {
op->opnum = OP_ILLEGAL;
@@ -1528,7 +1528,7 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
} } while (0);
/* Encode as an array of strings the string given with components
- * seperated @sep.
+ * separated @sep.
*/
static __be32 nfsd4_encode_components(char sep, char *components,
__be32 **pp, int *buflen)
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 2604c3e70ea..0f0e77f2012 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -988,6 +988,7 @@ static ssize_t __write_ports_delfd(char *buf)
static ssize_t __write_ports_addxprt(char *buf)
{
char transport[16];
+ struct svc_xprt *xprt;
int port, err;
if (sscanf(buf, "%15s %4u", transport, &port) != 2)
@@ -1002,13 +1003,24 @@ static ssize_t __write_ports_addxprt(char *buf)
err = svc_create_xprt(nfsd_serv, transport,
PF_INET, port, SVC_SOCK_ANONYMOUS);
- if (err < 0) {
- /* Give a reasonable perror msg for bad transport string */
- if (err == -ENOENT)
- err = -EPROTONOSUPPORT;
- return err;
- }
+ if (err < 0)
+ goto out_err;
+
+ err = svc_create_xprt(nfsd_serv, transport,
+ PF_INET6, port, SVC_SOCK_ANONYMOUS);
+ if (err < 0 && err != -EAFNOSUPPORT)
+ goto out_close;
return 0;
+out_close:
+ xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
+ if (xprt != NULL) {
+ svc_close_xprt(xprt);
+ svc_xprt_put(xprt);
+ }
+out_err:
+ /* Decrease the count, but don't shut down the service */
+ nfsd_serv->sv_nrthreads--;
+ return err;
}
/*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 15dc2deaac5..a11b0e8678e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -20,13 +20,14 @@
#include <linux/fcntl.h>
#include <linux/namei.h>
#include <linux/delay.h>
-#include <linux/quotaops.h>
#include <linux/fsnotify.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
#include <linux/jhash.h>
#include <linux/ima.h>
#include <asm/uaccess.h>
+#include <linux/exportfs.h>
+#include <linux/writeback.h>
#ifdef CONFIG_NFSD_V3
#include "xdr3.h"
@@ -271,6 +272,32 @@ out:
return err;
}
+/*
+ * Commit metadata changes to stable storage.
+ */
+static int
+commit_metadata(struct svc_fh *fhp)
+{
+ struct inode *inode = fhp->fh_dentry->d_inode;
+ const struct export_operations *export_ops = inode->i_sb->s_export_op;
+ int error = 0;
+
+ if (!EX_ISSYNC(fhp->fh_export))
+ return 0;
+
+ if (export_ops->commit_metadata) {
+ error = export_ops->commit_metadata(inode);
+ } else {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 0, /* metadata only */
+ };
+
+ error = sync_inode(inode, &wbc);
+ }
+
+ return error;
+}
/*
* Set various file attributes.
@@ -377,7 +404,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
put_write_access(inode);
goto out_nfserr;
}
- vfs_dq_init(inode);
}
/* sanitize the mode change */
@@ -745,8 +771,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
flags = O_RDWR|O_LARGEFILE;
else
flags = O_WRONLY|O_LARGEFILE;
-
- vfs_dq_init(inode);
}
*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
flags, current_cred());
@@ -771,43 +795,6 @@ nfsd_close(struct file *filp)
}
/*
- * Sync a file
- * As this calls fsync (not fdatasync) there is no need for a write_inode
- * after it.
- */
-static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
- const struct file_operations *fop)
-{
- struct inode *inode = dp->d_inode;
- int (*fsync) (struct file *, struct dentry *, int);
- int err;
-
- err = filemap_write_and_wait(inode->i_mapping);
- if (err == 0 && fop && (fsync = fop->fsync))
- err = fsync(filp, dp, 0);
- return err;
-}
-
-static int
-nfsd_sync(struct file *filp)
-{
- int err;
- struct inode *inode = filp->f_path.dentry->d_inode;
- dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
- mutex_lock(&inode->i_mutex);
- err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
- mutex_unlock(&inode->i_mutex);
-
- return err;
-}
-
-int
-nfsd_sync_dir(struct dentry *dp)
-{
- return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
-}
-
-/*
* Obtain the readahead parameters for the file
* specified by (dev, ino).
*/
@@ -1010,7 +997,7 @@ static int wait_for_concurrent_writes(struct file *file)
if (inode->i_state & I_DIRTY) {
dprintk("nfsd: write sync %d\n", task_pid_nr(current));
- err = nfsd_sync(file);
+ err = vfs_fsync(file, file->f_path.dentry, 0);
}
last_ino = inode->i_ino;
last_dev = inode->i_sb->s_dev;
@@ -1158,8 +1145,9 @@ out:
#ifdef CONFIG_NFSD_V3
/*
* Commit all pending writes to stable storage.
- * Strictly speaking, we could sync just the indicated file region here,
- * but there's currently no way we can ask the VFS to do so.
+ *
+ * Note: we only guarantee that data that lies within the range specified
+ * by the 'offset' and 'count' parameters will be synced.
*
* Unfortunately we cannot lock the file to make sure we return full WCC
* data to the client, as locking happens lower down in the filesystem.
@@ -1169,23 +1157,32 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
loff_t offset, unsigned long count)
{
struct file *file;
- __be32 err;
+ loff_t end = LLONG_MAX;
+ __be32 err = nfserr_inval;
- if ((u64)count > ~(u64)offset)
- return nfserr_inval;
+ if (offset < 0)
+ goto out;
+ if (count != 0) {
+ end = offset + (loff_t)count - 1;
+ if (end < offset)
+ goto out;
+ }
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
if (err)
- return err;
+ goto out;
if (EX_ISSYNC(fhp->fh_export)) {
- if (file->f_op && file->f_op->fsync) {
- err = nfserrno(nfsd_sync(file));
- } else {
+ int err2 = vfs_fsync_range(file, file->f_path.dentry,
+ offset, end, 0);
+
+ if (err2 != -EINVAL)
+ err = nfserrno(err2);
+ else
err = nfserr_notsupp;
- }
}
nfsd_close(file);
+out:
return err;
}
#endif /* CONFIG_NFSD_V3 */
@@ -1338,12 +1335,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out_nfserr;
}
- if (EX_ISSYNC(fhp->fh_export)) {
- err = nfserrno(nfsd_sync_dir(dentry));
- write_inode_now(dchild->d_inode, 1);
- }
+ err = nfsd_create_setattr(rqstp, resfhp, iap);
- err2 = nfsd_create_setattr(rqstp, resfhp, iap);
+ /*
+ * nfsd_setattr already committed the child. Transactional filesystems
+ * had a chance to commit changes for both parent and child
+ * simultaneously making the following commit_metadata a noop.
+ */
+ err2 = nfserrno(commit_metadata(fhp));
if (err2)
err = err2;
mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1375,7 +1374,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct dentry *dentry, *dchild = NULL;
struct inode *dirp;
__be32 err;
- __be32 err2;
int host_err;
__u32 v_mtime=0, v_atime=0;
@@ -1470,11 +1468,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (created)
*created = 1;
- if (EX_ISSYNC(fhp->fh_export)) {
- err = nfserrno(nfsd_sync_dir(dentry));
- /* setattr will sync the child (or not) */
- }
-
nfsd_check_ignore_resizing(iap);
if (createmode == NFS3_CREATE_EXCLUSIVE) {
@@ -1489,9 +1482,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
set_attr:
- err2 = nfsd_create_setattr(rqstp, resfhp, iap);
- if (err2)
- err = err2;
+ err = nfsd_create_setattr(rqstp, resfhp, iap);
+
+ /*
+ * nfsd_setattr already committed the child (and possibly also the parent).
+ */
+ if (!err)
+ err = nfserrno(commit_metadata(fhp));
mnt_drop_write(fhp->fh_export->ex_path.mnt);
/*
@@ -1606,12 +1603,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
} else
host_err = vfs_symlink(dentry->d_inode, dnew, path);
-
- if (!host_err) {
- if (EX_ISSYNC(fhp->fh_export))
- host_err = nfsd_sync_dir(dentry);
- }
err = nfserrno(host_err);
+ if (!err)
+ err = nfserrno(commit_metadata(fhp));
fh_unlock(fhp);
mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1673,11 +1667,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
}
host_err = vfs_link(dold, dirp, dnew);
if (!host_err) {
- if (EX_ISSYNC(ffhp->fh_export)) {
- err = nfserrno(nfsd_sync_dir(ddir));
- write_inode_now(dest, 1);
- }
- err = 0;
+ err = nfserrno(commit_metadata(ffhp));
+ if (!err)
+ err = nfserrno(commit_metadata(tfhp));
} else {
if (host_err == -EXDEV && rqstp->rq_vers == 2)
err = nfserr_acces;
@@ -1773,10 +1765,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
goto out_dput_new;
host_err = vfs_rename(fdir, odentry, tdir, ndentry);
- if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
- host_err = nfsd_sync_dir(tdentry);
+ if (!host_err) {
+ host_err = commit_metadata(tfhp);
if (!host_err)
- host_err = nfsd_sync_dir(fdentry);
+ host_err = commit_metadata(ffhp);
}
mnt_drop_write(ffhp->fh_export->ex_path.mnt);
@@ -1857,12 +1849,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
dput(rdentry);
- if (host_err)
- goto out_drop;
- if (EX_ISSYNC(fhp->fh_export))
- host_err = nfsd_sync_dir(dentry);
+ if (!host_err)
+ host_err = commit_metadata(fhp);
-out_drop:
mnt_drop_write(fhp->fh_export->ex_path.mnt);
out_nfserr:
err = nfserrno(host_err);
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index f4543ac4f56..5cccf874d69 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -42,7 +42,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
const struct buffer_head *, void *);
/**
- * nilfs_palloc_req - persistent alloctor request and reply
+ * nilfs_palloc_req - persistent allocator request and reply
* @pr_entry_nr: entry number (vblocknr or inode number)
* @pr_desc_bh: buffer head of the buffer containing block group descriptors
* @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 9d1e5de91af..01314675568 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -288,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
* @vblocknrs and @nitems.
*
* Return Value: On success, 0 is returned. On error, one of the following
- * nagative error codes is returned.
+ * negative error codes is returned.
*
* %-EIO - I/O error.
*
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 0092840492e..85c89dfc71f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -396,7 +396,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
/* next page is past the blocks we've got */
if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
nilfs_error(dir->i_sb, __func__,
- "dir %lu size %lld exceeds block cout %llu",
+ "dir %lu size %lld exceeds block count %llu",
dir->i_ino, dir->i_size,
(unsigned long long)dir->i_blocks);
goto out;
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e16a6664dfa..8880a9e281e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,10 +28,10 @@
* gcinodes), and this file provides lookup function of the dummy
* inodes and their buffer read function.
*
- * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it
+ * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
* has to treat blocks that belong to a same file but have different
* checkpoint numbers. To avoid interference among generations, dummy
- * inodes are managed separatly from actual inodes, and their lookup
+ * inodes are managed separately from actual inodes, and their lookup
* function (nilfs_gc_iget) is designed to be specified with a
* checkpoint number argument as well as an inode number.
*
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a2692bbc7b5..fc246dba112 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -292,7 +292,7 @@ void nilfs_free_private_page(struct page *page)
* @src: source page
* @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
*
- * This fuction is for both data pages and btnode pages. The dirty flag
+ * This function is for both data pages and btnode pages. The dirty flag
* should be treated by caller. The page must not be under i/o.
* Both src and dst page must be locked
*/
@@ -388,7 +388,7 @@ repeat:
}
/**
- * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache
+ * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
* @dmap: destination page cache
* @smap: source page cache
*
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index ab56fe44e37..636eaafd6ea 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -32,7 +32,7 @@
struct nilfs_write_info {
struct the_nilfs *nilfs;
struct bio *bio;
- int start, end; /* The region to be submitted */
+ int start, end; /* The region to be submitted */
int rest_blocks;
int max_pages;
int nr_vecs;
@@ -174,7 +174,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
}
/*
- * Setup segument summary
+ * Setup segment summary
*/
void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
{
@@ -470,8 +470,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
*
* %-ENOMEM - Insufficient memory available.
*/
-int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
- struct the_nilfs *nilfs)
+static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+ struct the_nilfs *nilfs)
{
struct nilfs_write_info wi;
struct buffer_head *bh;
@@ -514,7 +514,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
*
* %-EIO - I/O error
*/
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
+static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
{
int err = 0;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ada2f1b947a..69576a95e13 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -141,7 +141,7 @@ int nilfs_init_transaction_cache(void)
}
/**
- * nilfs_detroy_transaction_cache - destroy the cache for transaction info
+ * nilfs_destroy_transaction_cache - destroy the cache for transaction info
*
* nilfs_destroy_transaction_cache() frees the slab cache for the struct
* nilfs_transaction_info.
@@ -201,7 +201,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
* This function allocates a nilfs_transaction_info struct to keep context
* information on it. It is initialized and hooked onto the current task in
* the outermost call. If a pre-allocated struct is given to @ti, it is used
- * instead; othewise a new struct is assigned from a slab.
+ * instead; otherwise a new struct is assigned from a slab.
*
* When @vacancy_check flag is set, this function will check the amount of
* free space, and will wait for the GC to reclaim disk space if low capacity.
@@ -2214,7 +2214,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
}
/**
- * nilfs_secgtor_start_timer - set timer of background write
+ * nilfs_segctor_start_timer - set timer of background write
* @sci: nilfs_sc_info
*
* If the timer has already been set, it ignores the new request.
@@ -2854,7 +2854,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
* @sbi: nilfs_sb_info
*
* nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
- * initilizes it, and starts the segment constructor.
+ * initializes it, and starts the segment constructor.
*
* Return Value: On success, 0 is returned. On error, one of the following
* negative error code is returned.
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3155e0c7f41..82dfd6a686b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -30,7 +30,7 @@
#include "sb.h"
/**
- * struct nilfs_recovery_info - Recovery infomation
+ * struct nilfs_recovery_info - Recovery information
* @ri_need_recovery: Recovery status
* @ri_super_root: Block number of the last super root
* @ri_ri_cno: Number of the last checkpoint
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
*/
struct nilfs_cstage {
int scnt;
- unsigned flags;
+ unsigned flags;
struct nilfs_inode_info *dirty_file_ptr;
struct nilfs_inode_info *gc_inode_ptr;
};
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index b6c36d0cc33..3c6cc6005c2 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,7 +18,7 @@
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* Written by Koji Sato <koji@osrg.net>.
- * Rivised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
*/
#include <linux/kernel.h>
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 92579cc4c93..0cdbc5e7655 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -436,7 +436,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
/*
* Compute the overhead
*
- * When distributing meta data blocks outside semgent structure,
+ * When distributing meta data blocks outside segment structure,
* We must count them as the overhead.
*/
overhead = 0;
@@ -866,7 +866,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
if ((*flags & MS_RDONLY) &&
sbi->s_snapshot_cno != old_opts.snapshot_cno) {
printk(KERN_WARNING "NILFS (device %s): couldn't "
- "remount to a different snapshot. \n",
+ "remount to a different snapshot.\n",
sb->s_id);
err = -EINVAL;
goto restore_opts;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 92733d5651d..33871f7e4f0 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -386,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
- printk(KERN_ERR "NILFS: too short segment. \n");
+ printk(KERN_ERR "NILFS: too short segment.\n");
return -EINVAL;
}
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
deleted file mode 100644
index 37c11e19437..00000000000
--- a/fs/ntfs/ChangeLog
+++ /dev/null
@@ -1,1702 +0,0 @@
-ToDo/Notes:
- - Find and fix bugs.
- - The only places in the kernel where a file is resized are
- ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is
- held. Just have to be careful in read-/writepage and other helpers
- not running under i_mutex that we play nice. Also need to be careful
- with initialized_size extension in ntfs_file_write*() and writepage.
- UPDATE: The only things that need to be checked are the compressed
- write and the other attribute resize/write cases like index
- attributes, etc. For now none of these are implemented so are safe.
- - Implement filling in of holes in aops.c::ntfs_writepage() and its
- helpers.
- - Implement mft.c::sync_mft_mirror_umount(). We currently will just
- leave the volume dirty on umount if the final iput(vol->mft_ino)
- causes a write of any mirrored mft records due to the mft mirror
- inode having been discarded already. Whether this can actually ever
- happen is unclear however so it is worth waiting until someone hits
- the problem.
-
-2.1.29 - Fix a deadlock at mount time.
-
- - During mount the VFS holds s_umount lock on the superblock. So when
- we try to empty the journal $LogFile contents by calling
- ntfs_attr_set() when the machine does not have much memory and the
- journal is large ntfs_attr_set() results in the VM trying to balance
- dirty pages which in turn tries to that the s_umount lock and thus we
- get a deadlock. The solution is to not use ntfs_attr_set() and
- instead do the zeroing by hand at the block level rather than page
- cache level.
- - Fix sparse warnings.
-
-2.1.28 - Fix a deadlock.
-
- - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey
- Vlasov for the report and detailed analysis of the deadlock. The fix
- involved getting rid of ntfs_put_inode() altogether and hence NTFS no
- longer has a ->put_inode super operation.
-
-2.1.27 - Various bug fixes and cleanups.
-
- - Fix two compiler warnings on Alpha. Thanks to Andrew Morton for
- reporting them.
- - Fix an (innocent) off-by-one error in the runlist code.
- - Fix a buggette in an "should be impossible" case handling where we
- continued the attribute lookup loop instead of aborting it.
- - Use buffer_migrate_page() for the ->migratepage function of all ntfs
- address space operations.
- - Fix comparison of $MFT and $MFTMirr to not bail out when there are
- unused, invalid mft records which are the same in both $MFT and
- $MFTMirr.
- - Add support for sparse files which have a compression unit of 0.
- - Remove all the make_bad_inode() calls. This should only be called
- from read inode and new inode code paths.
- - Limit name length in fs/ntfs/unistr.c::ntfs_nlstoucs() to maximum
- allowed by NTFS, i.e. 255 Unicode characters, not including the
- terminating NULL (which is not stored on disk).
- - Improve comments on file attribute flags in fs/ntfs/layout.h.
- - Fix a bug in fs/ntfs/inode.c::ntfs_read_locked_index_inode() where we
- forgot to update a temporary variable so loading index inodes which
- have an index allocation attribute failed.
- - Add a missing call to flush_dcache_mft_record_page() in
- fs/ntfs/inode.c::ntfs_write_inode().
- - Handle the recently introduced -ENAMETOOLONG return value from
- fs/ntfs/unistr.c::ntfs_nlstoucs() in fs/ntfs/namei.c::ntfs_lookup().
- - Semaphore to mutex conversion. (Ingo Molnar)
-
-2.1.26 - Minor bug fixes and updates.
-
- - Fix a potential overflow in file.c where a cast to s64 was missing in
- a left shift of a page index.
- - The struct inode has had its i_sem semaphore changed to a mutex named
- i_mutex.
- - We have struct kmem_cache now so use it instead of the typedef
- kmem_cache_t. (Pekka Enberg)
- - Implement support for sector sizes above 512 bytes (up to the maximum
- supported by NTFS which is 4096 bytes).
- - Do more detailed reporting of why we cannot mount read-write by
- special casing the VOLUME_MODIFIED_BY_CHKDSK flag.
- - Miscellaneous updates to layout.h.
- - Cope with attribute list attribute having invalid flags. Windows
- copes with this and even chkdsk does not detect or fix this so we
- have to cope with it, too. Thanks to Pawel Kot for reporting the
- problem.
-
-2.1.25 - (Almost) fully implement write(2) and truncate(2).
-
- - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
- {__,}ntfs_cluster_free() to also take an optional attribute search
- context as argument. This allows calling these functions with the
- mft record mapped. Update all callers.
- - Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
- error handling by passing in the active search context when calling
- ntfs_cluster_free().
- - Change ntfs_cluster_alloc() to take an extra boolean parameter
- specifying whether the cluster are being allocated to extend an
- attribute or to fill a hole.
- - Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
- with @is_extension set to TRUE and remove the runlist terminator
- fixup code as this is now done by ntfs_cluster_alloc().
- - Change ntfs_attr_make_non_resident to take the attribute value size
- as an extra parameter. This is needed since we need to know the size
- before we can map the mft record and our callers always know it. The
- reason we cannot simply read the size from the vfs inode i_size is
- that this is not necessarily uptodate. This happens when
- ntfs_attr_make_non_resident() is called in the ->truncate call path.
- - Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks
- which is zero for a resident attribute but should no longer be zero
- once the attribute is non-resident as it then has real clusters
- allocated.
- - Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
- extend the allocation of an attributes. Optionally, the data size,
- but not the initialized size can be extended, too.
- - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
- uncompressed and unencrypted files and it never creates sparse files
- at least for the moment (making a file sparse requires us to modify
- its directory entries and we do not support directory operations at
- the moment). Also, support for highly fragmented files, i.e. ones
- whose data attribute is split across multiple extents, is severly
- limited. When such a case is encountered, EOPNOTSUPP is returned.
- - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
- the initial implementation of file truncation. Now both open(2)ing
- a file with the O_TRUNC flag and the {,f}truncate(2) system calls
- will resize a file appropriately. The limitations are that only
- uncompressed and unencrypted files are supported. Also, there is
- only very limited support for highly fragmented files (the ones whose
- $DATA attribute is split into multiple attribute extents).
- - In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
- and cond_resched() in the main loop as we could be dirtying a lot of
- pages and this ensures we play nice with the VM and the system as a
- whole.
- - Implement file operations ->write, ->aio_write, ->writev for regular
- files. This replaces the old use of generic_file_write(), et al and
- the address space operations ->prepare_write and ->commit_write.
- This means that both sparse and non-sparse (unencrypted and
- uncompressed) files can now be extended using the normal write(2)
- code path. There are two limitations at present and these are that
- we never create sparse files and that we only have limited support
- for highly fragmented files, i.e. ones whose data attribute is split
- across multiple extents. When such a case is encountered,
- EOPNOTSUPP is returned.
- - $EA attributes can be both resident and non-resident.
- - Use %z for size_t to fix compilation warnings. (Andrew Morton)
- - Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
- - Document extended attribute ($EA) NEED_EA flag. (Based on libntfs
- patch by Yura Pakhuchiy.)
-
-2.1.24 - Lots of bug fixes and support more clean journal states.
-
- - Support journals ($LogFile) which have been modified by chkdsk. This
- means users can boot into Windows after we marked the volume dirty.
- The Windows boot will run chkdsk and then reboot. The user can then
- immediately boot into Linux rather than having to do a full Windows
- boot first before rebooting into Linux and we will recognize such a
- journal and empty it as it is clean by definition. Note, this only
- works if chkdsk left the journal in an obviously clean state.
- - Support journals ($LogFile) with only one restart page as well as
- journals with two different restart pages. We sanity check both and
- either use the only sane one or the more recent one of the two in the
- case that both are valid.
- - Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
- ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
- hence cannot fail.
- - Use ntfs_malloc_nofs_nofail() in the two critical regions in
- fs/ntfs/runlist.c::ntfs_runlists_merge(). This means we no longer
- need to panic() if the allocation fails as it now cannot fail.
- - Fix two nasty runlist merging bugs that had gone unnoticed so far.
- Thanks to Stefano Picerno for the bug report.
- - Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
- - Fix handling of valid but empty mapping pairs array in
- fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
- - Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
- messages and include the inode number. Thanks to Yura Pakhuchiy for
- pointing this out.
- - Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
- length is zero.
- - Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
- specified hole into a runlist.
- - Fix a bug in fs/ntfs/index.c::ntfs_index_lookup(). When the returned
- index entry is in the index root, we forgot to set the @ir pointer in
- the index context. Thanks to Yura Pakhuchiy for finding this bug.
- - Remove bogus setting of PageError in ntfs_read_compressed_block().
- - Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
- - Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
- access to the allocated size in the ntfs inode with the size lock.
- - Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
- return LCN_ENOENT when there is no runlist and the allocated size is
- zero.
- - Fix load_attribute_list() to handle the case of a NULL runlist.
- - Fix handling of sparse attributes in ntfs_attr_make_non_resident().
- - Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
- to ensure that these functions are never called for compressed or
- encrypted attributes.
- - Fix cluster (de)allocators to work when the runlist is NULL and more
- importantly to take a locked runlist rather than them locking it
- which leads to lock reversal.
- - Truncate {a,c,m}time to the ntfs supported time granularity when
- updating the times in the inode in ntfs_setattr().
- - Fixup handling of sparse, compressed, and encrypted attributes in
- fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
- fs/ntfs/aops.c::ntfs_{read,write}page().
- - Make ntfs_write_block() not instantiate sparse blocks if they contain
- only zeroes.
- - Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
- lock protection over the buffer submission for i/o which allows the
- removal of the get_bh()/put_bh() pairs for each buffer.
- - Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
- where a concurrent truncate has truncated the runlist under our feet.
- - Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
- - In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
- in the first buffer head instead of a driver global spin lock to
- improve scalability.
- - Minor fix to error handling and error message display in
- fs/ntfs/aops.c::ntfs_prepare_nonresident_write().
- - Change the mount options {u,f,d}mask to always parse the number as
- an octal number to conform to how chmod(1) works, too. Thanks to
- Giuseppe Bilotta and Horst von Brand for pointing out the errors of
- my ways.
- - Fix various bugs in the runlist merging code. (Based on libntfs
- changes by Richard Russon.)
- - Fix sparse warnings that have crept in over time.
- - Change ntfs_cluster_free() to require a write locked runlist on entry
- since we otherwise get into a lock reversal deadlock if a read locked
- runlist is passed in. In the process also change it to take an ntfs
- inode instead of a vfs inode as parameter.
- - Fix the definition of the CHKD ntfs record magic. It had an off by
- two error causing it to be CHKB instead of CHKD.
- - Fix a stupid bug in __ntfs_bitmap_set_bits_in_run() which caused the
- count to become negative and hence we had a wild memset() scribbling
- all over the system's ram.
-
-2.1.23 - Implement extension of resident files and make writing safe as well as
- many bug fixes, cleanups, and enhancements...
-
- - Add printk rate limiting for ntfs_warning() and ntfs_error() when
- compiled without debug. This avoids a possible denial of service
- attack. Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
- out.
- - Fix compilation warnings on ia64. (Randy Dunlap)
- - Use i_size_{read,write}() instead of reading i_size by hand and cache
- the value where apropriate.
- - Add size_lock to the ntfs_inode structure. This is an rw spinlock
- and it locks against access to the inode sizes. Note, ->size_lock
- is also accessed from irq context so you must use the _irqsave and
- _irqrestore lock and unlock functions, respectively. Protect all
- accesses to allocated_size, initialized_size, and compressed_size.
- - Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers.
- - Implement extension of resident files in the regular file write code
- paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()). At present
- this only works until the data attribute becomes too big for the mft
- record after which we abort the write returning -EOPNOTSUPP from
- ntfs_prepare_write().
- - Add disable_sparse mount option together with a per volume sparse
- enable bit which is set appropriately and a per inode sparse disable
- bit which is preset on some system file inodes as appropriate.
- - Enforce that sparse support is disabled on NTFS volumes pre 3.0.
- - Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in
- the creation of the unmapped runlist element for the base attribute
- extent.
- - Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking
- helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist().
- This allows us to map runlist fragments with the runlist lock already
- held without having to drop and reacquire it around the call. Adapt
- all callers.
- - Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked
- runlist. This allows us to find runlist elements with the runlist
- lock already held without having to drop and reacquire it around the
- call. Adapt all callers.
- - Change time to u64 in time.h::ntfs2utc() as it otherwise generates a
- warning in the do_div() call on sparc32. Thanks to Meelis Roos for
- the report and analysis of the warning.
- - Fix a nasty runlist merge bug when merging two holes.
- - Set the ntfs_inode->allocated_size to the real allocated size in the
- mft record for resident attributes (fs/ntfs/inode.c).
- - Small readability cleanup to use "a" instead of "ctx->attr"
- everywhere (fs/ntfs/inode.c).
- - Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the
- definition of ntfs_export_ops from fs/ntfs/super.c to namei.c. Also,
- declare ntfs_export_ops in fs/ntfs/ntfs.h.
- - Correct sparse file handling. The compressed values need to be
- checked and set in the ntfs inode as done for compressed files and
- the compressed size needs to be used for vfs inode->i_blocks instead
- of the allocated size, again, as done for compressed files.
- - Add AT_EA in addition to AT_DATA to whitelist for being allowed to be
- non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident().
- - Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new
- write code.
- - Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after
- dropping the read lock and taking the write lock we were not checking
- whether someone else did not already do the work we wanted to do.
- - Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to
- ntfs_attr_find_vcn_nolock() and update all callers.
- - Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident().
- - Fix sign of various error return values to be negative in
- fs/ntfs/lcnalloc.c.
- - Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and
- handle the case where an attribute is converted from resident to
- non-resident by a concurrent file write.
- - Remove checks for NULL before calling kfree() since kfree() does the
- checking itself. (Jesper Juhl)
- - Some utilities modify the boot sector but do not update the checksum.
- Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to
- only emit a warning when the checksum is incorrect rather than
- refusing the mount. Thanks to Bernd Casimir for pointing this
- problem out.
- - Update attribute definition handling.
- - Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants.
- - Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000.
- - Use MAX_BUF_PER_PAGE instead of variable sized array allocation for
- better code generation and one less sparse warning in fs/ntfs/aops.c.
- - Remove spurious void pointer casts from fs/ntfs/. (Pekka Enberg)
- - Use C99 style structure initialization after memory allocation where
- possible (fs/ntfs/{attrib.c,index.c,super.c}). Thanks to Al Viro and
- Pekka Enberg.
- - Stamp the transaction log ($UsnJrnl), aka user space journal, if it
- is active on the volume and we are mounting read-write or remounting
- from read-only to read-write.
- - Fix a bug in address space operations error recovery code paths where
- if the runlist was not mapped at all and a mapping error occured we
- would leave the runlist locked on exit to the function so that the
- next access to the same file would try to take the lock and deadlock.
- - Detect the case when Windows has been suspended to disk on the volume
- to be mounted and if this is the case do not allow (re)mounting
- read-write. This is done by parsing hiberfil.sys if present.
- - Fix several occurences of a bug where we would perform 'var & ~const'
- with a 64-bit variable and a int, i.e. 32-bit, constant. This causes
- the higher order 32-bits of the 64-bit variable to be zeroed. To fix
- this cast the 'const' to the same 64-bit type as 'var'.
- - Change the runlist terminator of the newly allocated cluster(s) to
- LCN_ENOENT in ntfs_attr_make_non_resident(). Otherwise the runlist
- code gets confused.
- - Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs()
- and ntfs_mapping_pairs_build() to allow the runlist encoding to be
- partial which is desirable when filling holes in sparse attributes.
- Update all callers.
- - Change ntfs_map_runlist_nolock() to only decompress the mapping pairs
- if the requested vcn is inside it. Otherwise we get into problems
- when we try to map an out of bounds vcn because we then try to map
- the already mapped runlist fragment which causes
- ntfs_mapping_pairs_decompress() to fail and return error. Update
- ntfs_attr_find_vcn_nolock() accordingly.
- - Fix a nasty deadlock that appeared in recent kernels.
- The situation: VFS inode X on a mounted ntfs volume is dirty. For
- same inode X, the ntfs_inode is dirty and thus corresponding on-disk
- inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
- to the table of inodes, i.e. $MFT, inode 0.
- What happens:
- Process 1: sys_sync()/umount()/whatever... calls
- __sync_single_inode() for $MFT -> do_writepages() -> write_page for
- the dirty page containing the on-disk inode X, the page is now locked
- -> ntfs_write_mst_block() which clears PageUptodate() on the page to
- prevent anyone else getting hold of it whilst it does the write out.
- This is necessary as the on-disk inode needs "fixups" applied before
- the write to disk which are removed again after the write and
- PageUptodate is then set again. It then analyses the page looking
- for dirty on-disk inodes and when it finds one it calls
- ntfs_may_write_mft_record() to see if it is safe to write this
- on-disk inode. This then calls ilookup5() to check if the
- corresponding VFS inode is in icache(). This in turn calls ifind()
- which waits on the inode lock via wait_on_inode whilst holding the
- global inode_lock.
- Process 2: pdflush results in a call to __sync_single_inode for the
- same VFS inode X on the ntfs volume. This locks the inode (I_LOCK)
- then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
- read_cache_page() for the page (in page cache of table of inodes
- $MFT, inode 0) containing the on-disk inode. This page has
- PageUptodate() clear because of Process 1 (see above) so
- read_cache_page() blocks when it tries to take the page lock for the
- page so it can call ntfs_read_page().
- Thus Process 1 is holding the page lock on the page containing the
- on-disk inode X and it is waiting on the inode X to be unlocked in
- ifind() so it can write the page out and then unlock the page.
- And Process 2 is holding the inode lock on inode X and is waiting for
- the page to be unlocked so it can call ntfs_readpage() or discover
- that Process 1 set PageUptodate() again and use the page.
- Thus we have a deadlock due to ifind() waiting on the inode lock.
- The solution: The fix is to use the newly introduced
- ilookup5_nowait() which does not wait on the inode's lock and hence
- avoids the deadlock. This is safe as we do not care about the VFS
- inode and only use the fact that it is in the VFS inode cache and the
- fact that the vfs and ntfs inodes are one struct in memory to find
- the ntfs inode in memory if present. Also, the ntfs inode has its
- own locking so it does not matter if the vfs inode is locked.
- - Fix bug in mft record writing where we forgot to set the device in
- the buffers when mapping them after the VM had discarded them.
- Thanks to Martin MOKREJÃ… for the bug report.
-
-2.1.22 - Many bug and race fixes and error handling improvements.
-
- - Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
- - Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
- instead of void and provide a helper ntfs_truncate_vfs() for the
- vfs ->truncate method.
- - Add a new ntfs inode flag NInoTruncateFailed() and modify
- fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
- - Fix min_size and max_size definitions in ATTR_DEF structure in
- fs/ntfs/layout.h to be signed.
- - Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
- ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
- ntfs_attr_can_be_resident(), which in turn use the new private helper
- ntfs_attr_find_in_attrdef().
- - In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
- mapping->private_lock around the dirtying of the buffer heads
- analagous to the way it is done in __set_page_dirty_buffers().
- - Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
- mount time as this cannot work with the current implementation.
- - Check for location of attribute name and improve error handling in
- general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
- - In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
- i_size, i.e. race with truncate, invalidate the buffers on the page
- so that they become freeable and hence the page does not leak.
- - Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge(). (Adrian
- Bunk)
- - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
- a NULL pointer dereference in the error code path when a corrupt
- attribute was found. (Thanks to Domen Puncer for the bug report.)
- - Add MODULE_VERSION() to fs/ntfs/super.c.
- - Make several functions and variables static. (Adrian Bunk)
- - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
- buffers for the page if they are not present and then marks the
- buffers belonging to the ntfs record dirty. This causes the buffers
- to become busy and hence they are safe from removal until the page
- has been written out.
- - Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
- error handling code path that resulted in a BUG() due to trying to
- unmap an extent mft record when the mapping of it had failed and it
- thus was not mapped. (Thanks to Ken MacFerrin for the bug report.)
- - Drop the runlist lock after the vcn has been read in
- fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
- - Rewrite handling of multi sector transfer errors. We now do not set
- PageError() when such errors are detected in the async i/o handler
- fs/ntfs/aops.c::ntfs_end_buffer_async_read(). All users of mst
- protected attributes now check the magic of each ntfs record as they
- use it and act appropriately. This has the effect of making errors
- granular per ntfs record rather than per page which solves the case
- where we cannot access any of the ntfs records in a page when a
- single one of them had an mst error. (Thanks to Ken MacFerrin for
- the bug report.)
- - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
- where we failed to release i_mutex on the $Quota/$Q attribute inode.
- - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
- - Add mapping of unmapped buffers to all remaining code paths, i.e.
- fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
- and write_mft_record_nolock(). From now on we require that the
- complete runlist for the mft mirror is always mapped into memory.
- - Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
- - Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
- - Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
- resident attribute will be smaller than a page which makes the code
- simpler. Also make the code more tolerant to concurrent ->truncate.
-
-2.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
-
- - Implement extent mft record deallocation
- fs/ntfs/mft.c::ntfs_extent_mft_record_free().
- - Splitt runlist related functions off from attrib.[hc] to runlist.[hc].
- - Add vol->mft_data_pos and initialize it at mount time.
- - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
- ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
- ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
- ntfs_runlists_merge() and adapt all callers.
- - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
- ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
- and ntfs_mapping_pairs_build(), adapted from libntfs.
- - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
- static and add a declaration for it to lcnalloc.h.
- - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
- inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
- cluster bitmap lock for the duration of the call.
- - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
- - Implement the equivalent of memset() for an ntfs attribute in
- fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
- fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
- - Remove unnecessary casts from LCN_* constants.
- - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
- - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
- change MFT_RECORD to contain the NTFS 3.1+ specific fields.
- - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
- marks all buffers belonging to an ntfs record dirty, followed by
- marking the page the ntfs record is in dirty and also marking the vfs
- inode containing the ntfs record dirty (I_DIRTY_PAGES).
- - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
- new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
- longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
- - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
- include errors.
- - Move the typedefs for runlist_element and runlist from types.h to
- runlist.h and fix resulting include errors.
- - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
- - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
- mark_ntfs_record_dirty() which also changes the behaviour in that we
- now set the buffers belonging to the mft record dirty as well as the
- page itself.
- - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
- to cope with the fact that there now are dirty buffers in mft pages.
- - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
- mark_ntfs_record_dirty() and thus to set the buffers belonging to the
- mft record dirty as well as the page itself.
- - Fix compiler warnings on x86-64 in fs/ntfs/dir.c. (Randy Dunlap,
- slightly modified by me)
- - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
- the mft record is already locked and otherwise behaves the same way
- as fs/ntfs/mft.c::map_mft_record().
- - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
- writes the mft record if the buffers belonging to it are dirty.
- Otherwise we assume that it was written out by other means already.
- - Attempting to write outside initialized size is _not_ a bug so remove
- the bug check from fs/ntfs/aops.c::ntfs_write_mst_block(). It is in
- fact required to write outside initialized size when preparing to
- extend the initialized size.
- - Map the page instead of using page_address() before writing to it in
- fs/ntfs/aops.c::ntfs_mft_writepage().
- - Provide exclusion between opening an inode / mapping an mft record
- and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
- by setting the page not uptodate throughout ntfs_mft_writepage().
- - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
- to ensure noone can see the page whilst the mst fixups are applied.
- - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
- checks if an mft record may be written out safely obtaining any
- necessary locks in the process. This is used by
- fs/ntfs/aops.c::ntfs_write_mst_block().
- - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
- writing mft records and improve its error handling in the process.
- Now if any of the records in the page fail to be written out, all
- other records will be written out instead of aborting completely.
- - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
- - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
- ntfs_mst_aops for all inodes which are NInoMstProtected() and
- ntfs_aops for all other inodes.
- - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
- ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
- no longer require an ntfs inode to be present. Update all callers.
- - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
- - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
- to ensure noone can see the page whilst the mst fixups are applied.
- - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
- fs/ntfs/mft.c::try_map_mft_record().
- - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
- with the ntfs inode which contains the page rather than the ntfs
- inode the mft record of which is in the page.
- - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
- index inode bitmap inode release code from there to
- fs/ntfs/inode.c::ntfs_clear_big_inode(). (Thanks to Christoph
- Hellwig for spotting this.)
- - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
- inode semaphore around the code that sets ni->itype.index.bmp_ino to
- NULL and reorganize the code to optimize it a bit. (Thanks to
- Christoph Hellwig for spotting this.)
- - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
- ntfs inode as a parameter as this is confusing and misleading and the
- needed ntfs inode is available via NTFS_I(page->mapping->host).
- Adapt all callers to this change.
- - Modify fs/ntfs/mft.c::write_mft_record_nolock() and
- fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
- of the first buffer in a record and to take this as the ntfs record
- dirty state. We cannot look at the dirty state for subsequent
- buffers because we might be racing with
- fs/ntfs/aops.c::mark_ntfs_record_dirty().
- - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
- inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
- add a declaration for it to inode.h. Fix some compilation issues
- that resulted due to #includes and header file interdependencies.
- - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
- - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
- - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
- record sequence number if it is specified (i.e. not zero).
- - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
- functions used by it.
- - Update Documentation/filesystems/ntfs.txt with instructions on how to
- use the Device-Mapper driver with NTFS ftdisk/LDM raid. This removes
- the linear raid problem with the Software RAID / MD driver when one
- or more of the devices has an odd number of sectors.
-
-2.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
-
- - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
- where we did not clear ctx->al_entry but it was still set due to
- changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
- particular.
- - Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
- where we forgot to unmap the extent mft record when we had finished
- enumerating an attribute which caused a bug check to trigger when the
- VFS calls ->clear_inode.
-
-2.1.19 - Many cleanups, improvements, and a minor bug fix.
-
- - Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
- change the uid, gid, and mode of an inode as we do not support NTFS
- ACLs yet.
- - Remove BKL use from ntfs_setattr() syncing up with the rest of the
- kernel.
- - Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
- and ntfs_filldir() as per suggestion from Al Viro.
- - Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
- - Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
- inode size has changed and to only output an error if so.
- - Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
- - Add le{16,32,64} as well as sle{16,32,64} data types to
- fs/ntfs/types.h.
- - Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
- - Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
- respectively, to fs/ntfs/types.h.
- - Update endianness conversion macros in fs/ntfs/endian.h to use the
- new types as appropriate.
- - Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
- and index.c.
- - Add leMFT_REF data type to fs/ntfs/layout.h.
- - Update all NTFS header files with the new little endian data types.
- Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
- - Do proper type casting when using ntfs_is_*_recordp() in
- fs/ntfs/logfile.c, mft.c, and super.c.
- - Fix all the sparse bitwise warnings. Had to change all the typedef
- enums storing little endian values to simple enums plus a typedef for
- the datatype to make sparse happy.
- - Fix a bug found by the new sparse bitwise warnings where the default
- upcase table was defined as a pointer to wchar_t rather than ntfschar
- in fs/ntfs/ntfs.h and super.c.
- - Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
-
-2.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.
-
- - Remove vol->nr_mft_records as it was pretty meaningless and optimize
- the calculation of total/free inodes as used by statfs().
- - Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
- because the code itself is using the ntfs_lock semaphore which
- provides safe locking. (Ingo Molnar)
- - Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
- could occur in the future for when we start closing/freeing extent
- inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
- we free it.
- - Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
- find_external_attr() to ntfs_external_attr_find() to cleanup the
- namespace a bit and to be more consistent with libntfs.
- - Rename {{re,}init,get,put}_attr_search_ctx() to
- ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
- attr_search_context to ntfs_attr_search_ctx.
- - Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
- for the attribute list attribute itself.
- - Fix endianness bug in ntfs_external_attr_find().
- - Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
- if the attribute is not found, and -EIO on real error. In the case
- of -ENOENT, the search context is updated to describe the attribute
- before which the attribute being searched for would need to be
- inserted if such an action were to be desired and in the case of
- ntfs_external_attr_find() the search context is also updated to
- indicate the attribute list entry before which the attribute list
- entry of the attribute being searched for would need to be inserted
- if such an action were to be desired. Also make ntfs_find_attr()
- static and remove its prototype from attrib.h as it is not used
- anywhere other than attrib.c. Update ntfs_attr_lookup() and all
- callers of ntfs_{external,}attr_{find,lookup}() for the new return
- values.
- - Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().
-
-2.1.17 - Fix bugs in mount time error code paths and other updates.
-
- - Implement bitmap modification code (fs/ntfs/bitmap.[hc]). This
- includes functions to set/clear a single bit or a run of bits.
- - Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
- runlist element containing a particular vcn. It also takes care of
- mapping any needed runlist fragments.
- - Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
- - Load attribute definition table from $AttrDef at mount time.
- - Fix bugs in mount time error code paths involving (de)allocation of
- the default and volume upcase tables.
- - Remove ntfs_nr_mounts as it is no longer used.
-
-2.1.16 - Implement access time updates, file sync, async io, and read/writev.
-
- - Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
- This is done by setting the appropriate file operations pointers to
- the generic helper functions provided by mm/filemap.c.
- - Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
- and directories (fs/ntfs/dir.c).
- - Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
- Note, except for the root directory and any other system files opened
- by the user, the system files will not have their access times
- updated as they are only accessed at the inode level an hence the
- file level functions which cause the times to be updated are never
- invoked.
-
-2.1.15 - Invalidate quotas when (re)mounting read-write.
-
- - Add new element itype.index.collation_rule to the ntfs inode
- structure and set it appropriately in ntfs_read_locked_inode().
- - Implement a new inode type "index" to allow efficient access to the
- indices found in various system files and adapt inode handling
- accordingly (fs/ntfs/inode.[hc]). An index inode is essentially an
- attribute inode (NInoAttr() is true) with an attribute type of
- AT_INDEX_ALLOCATION. As such, it is no longer allowed to call
- ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
- there would be no way to distinguish between normal attribute inodes
- and index inodes. The function to obtain an index inode is
- ntfs_index_iget() and it uses the helper function
- ntfs_read_locked_index_inode(). Note, we do not overload
- ntfs_attr_iget() as indices consist of multiple attributes so using
- ntfs_attr_iget() to obtain an index inode would be confusing.
- - Ensure that there is no overflow when doing page->index <<
- PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
- - Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
- and ntfs_read_block().
- - Use case sensitive attribute lookups instead of case insensitive ones.
- - Lock all page cache pages belonging to mst protected attributes while
- accessing them to ensure we never see corrupt data while the page is
- under writeout.
- - Add framework for generic ntfs collation (fs/ntfs/collation.[hc]).
- We have ntfs_is_collation_rule_supported() to check if the collation
- rule you want to use is supported and ntfs_collation() which actually
- collates two data items. We currently only support COLLATION_BINARY
- and COLLATION_NTOFS_ULONG but support for other collation rules will
- be added as the need arises.
- - Add a new type, ntfs_index_context, to allow retrieval of an index
- entry using the corresponding index key. To get an index context,
- use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
- This also adds a new slab cache for the index contexts. To lookup a
- key in an index inode, use ntfs_index_lookup(). After modifying an
- index entry, call ntfs_index_entry_flush_dcache_page() followed by
- ntfs_index_entry_mark_dirty() to ensure the changes are written out
- to disk. For details see fs/ntfs/index.[hc]. Note, at present, if
- an index entry is in the index allocation attribute rather than the
- index root attribute it will not be written out (you will get a
- warning message about discarded changes instead).
- - Load the quota file ($Quota) and check if quota tracking is enabled
- and if so, mark the quotas out of date. This causes windows to
- rescan the volume on boot and update all quota entries.
- - Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
- It is simply set to __set_page_dirty_nobuffers() to make sure that
- running set_page_dirty() on a page containing mft/ntfs records will
- not affect the dirty state of the page buffers.
- - Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
- buffers that are inside the ntfs record in the page dirty after which
- it sets the page dirty. This allows ->writepage to only write the
- dirty index records rather than having to write all the records in
- the page. Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
- use this rather than __set_page_dirty_nobuffers().
- - Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
- writing of page cache pages belonging to mst protected attributes
- like the index allocation attribute in directory indices and other
- indices like $Quota/$Q, etc. This means that the quota is now marked
- out of date on all volumes rather than only on ones where the quota
- defaults entry is in the index root attribute of the $Quota/$Q index.
-
-2.1.14 - Fix an NFSd caused deadlock reported by several users.
-
- - Modify fs/ntfs/ntfs_readdir() to copy the index root attribute value
- to a buffer so that we can put the search context and unmap the mft
- record before calling the filldir() callback. We need to do this
- because of NFSd which calls ->lookup() from its filldir callback()
- and this causes NTFS to deadlock as ntfs_lookup() maps the mft record
- of the directory and since ntfs_readdir() has got it mapped already
- ntfs_lookup() deadlocks.
-
-2.1.13 - Enable overwriting of resident files and housekeeping of system files.
-
- - Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
- keeping the mft mirror in sync with the mft when mirrored mft records
- are written. The functions are write_mft_record{,_nolock}(). The
- implementation is quite rudimentary for now with lots of things not
- implemented yet but I am not sure any of them can actually occur so
- I will wait for people to hit each one and only then implement it.
- - Commit open system inodes at umount time. This should make it
- virtually impossible for sync_mft_mirror_umount() to ever be needed.
- - Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
- ntfs super operations. This gives us inode writing via the VFS inode
- dirty code paths. Note: Access time updates are not implemented yet.
- - Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
- fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
- finally enabling resident file overwrite! (-8 This also includes a
- placeholder for ->writepage (ntfs_mft_writepage()), which for now
- just redirties the page and returns. Also, at umount time, we for
- now throw away all mft data page cache pages after the last call to
- ntfs_commit_inode() in the hope that all inodes will have been
- written out by then and hence no dirty (meta)data will be lost. We
- also check for this case and emit an error message telling the user
- to run chkdsk.
- - Use set_page_writeback() and end_page_writeback() in the resident
- attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise
- the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
- page is clean.
- - Implement ntfs_mft_writepage() so it now checks if any of the mft
- records in the page are dirty and if so redirties the page and
- returns. Otherwise it just returns (after doing set_page_writeback(),
- unlock_page(), end_page_writeback() or the radix-tree tag
- PAGECACHE_TAG_DIRTY remains set even though the page is clean), thus
- alowing the VM to do with the page as it pleases. Also, at umount
- time, now only throw away dirty mft (meta)data pages if dirty inodes
- are present and ask the user to email us if they see this happening.
- - Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
- information flags (fs/ntfs/super.c).
- - Mark the volume dirty when (re)mounting read-write and mark it clean
- when unmounting or remounting read-only. If any volume errors are
- found, the volume is left marked dirty to force chkdsk to run.
- - Add code to set the NT4 compatibility flag when (re)mounting
- read-write for newer NTFS versions but leave it commented out for now
- since we do not make any modifications that are NTFS 1.2 specific yet
- and since setting this flag breaks Captive-NTFS which is not nice.
- This code must be enabled once we start writing NTFS 1.2 specific
- changes otherwise Windows NTFS driver might crash / cause corruption.
-
-2.1.12 - Fix the second fix to the decompression engine and some cleanups.
-
- - Add a new address space operations struct, ntfs_mst_aops, for mst
- protected attributes. This is because the default ntfs_aops do not
- make sense with mst protected data and were they to write anything to
- such an attribute they would cause data corruption so we provide
- ntfs_mst_aops which does not have any write related operations set.
- - Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
- includes an adapted ntfs_commit_inode() and an implementation of
- ntfs_write_inode() which for now just cleans dirty inodes without
- writing them (it does emit a warning that this is happening).
- - Undo the second decompression engine fix (see 2.1.9 release ChangeLog
- entry) as it was only fixing a theoretical bug but at the same time
- it badly broke the handling of sparse and uncompressed compression
- blocks.
-
-2.1.11 - Driver internal cleanups.
-
- - Only build logfile.o if building the driver with read-write support.
- - Really final white space cleanups.
- - Use generic_ffs() instead of ffs() in logfile.c which allows the
- log_page_size variable to be optimized by gcc into a constant.
- - Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte
- char as defined by POSIX and as found on some systems.
-
-2.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.
-
- - Finish off the white space cleanups (remove trailing spaces, etc).
- - Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
- the kludges around the first iget(). Instead of (re)setting ->s_op
- we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
- insert_inode_hash() / call ntfs_read_inode_mount() directly. This
- kills the need for second super_operations and allows to return error
- from ntfs_read_inode_mount() without resorting to ugly "poisoning"
- tricks. (Al Viro)
- - Force read-only (re)mounting if any of the following bits are set in
- the volume information flags:
- VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
- VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
- VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
- To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the
- above bits set so the test is made easy.
-
-2.1.9 - Fix two bugs in decompression engine.
-
- - Fix a bug where we would not always detect that we have reached the
- end of a compression block because we were ending at minus one byte
- which is effectively the same as being at the end. The fix is to
- check whether the uncompressed buffer has been fully filled and if so
- we assume we have reached the end of the compression block. A big
- thank you to Marcin Gibuła for the bug report, the assistance in
- tracking down the bug and testing the fix.
- - Fix a possible bug where when a compressed read is truncated to the
- end of the file, the offset inside the last page was not truncated.
-
-2.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.
-
- - Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
- - Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
- utc2ntfs() to work with struct timespec instead of time_t on the
- Linux UTC time side thus preserving the full precision of the NTFS
- time and only loosing up to 99 nano-seconds in the Linux UTC time.
- - Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
- static inline.
- - Remove unused ntfs_dirty_inode().
- - Cleanup super operations declaration in fs/ntfs/super.c.
- - Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
- - Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
- fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
- - Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
- fs/ntfs/inode.h so they can be used elsewhere.
- - Determine the mft mirror size as the number of mirrored mft records
- and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
- - Load the mft mirror at mount time and compare the mft records stored
- in it to the ones in the mft. Force a read-only mount if the two do
- not match (fs/ntfs/super.c).
- - Fix type casting related warnings on 64-bit architectures. Thanks
- to Meelis Roos for reporting them.
- - Move %L to %ll as %L is floating point and %ll is integer which is
- what we want.
- - Read the journal ($LogFile) and determine if the volume has been
- shutdown cleanly and force a read-only mount if not (fs/ntfs/super.c
- and fs/ntfs/logfile.c). This is a little bit of a crude check in
- that we only look at the restart areas and not at the actual log
- records so that there will be a very small number of cases where we
- think that a volume is dirty when in fact it is clean. This should
- only affect volumes that have not been shutdown cleanly and did not
- have any pending, non-check-pointed i/o.
- - If the $LogFile indicates a clean shutdown and a read-write (re)mount
- is requested, empty $LogFile by overwriting it with 0xff bytes to
- ensure that Windows cannot cause data corruption by replaying a stale
- journal after Linux has written to the volume.
-
-2.1.7 - Enable NFS exporting of mounted NTFS volumes.
-
- - Set i_generation in the VFS inode from the seq_no of the NTFS inode.
- - Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
- - Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
- default doesn't allow inode number 0 which is a valid inode on NTFS
- and even if it did allow that it uses iget() instead of ntfs_iget()
- which makes it useless for us.
- - Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
- default just returns -EACCES which is not very useful.
- - Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
- and set them up in the super block at mount time (super.c) this
- allows mounted NTFS volumes to be exported via NFS.
- - Add missing return -EOPNOTSUPP; in
- fs/ntfs/aops.c::ntfs_commit_nonresident_write().
- - Enforce no atime and no dir atime updates at mount/remount time as
- they are not implemented yet anyway.
- - Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
- after a NULL check. Thanks to Dave Jones for pointing this out.
-
-2.1.6 - Fix minor bug in handling of compressed directories.
-
- - Fix bug in handling of compressed directories. A compressed
- directory is not really compressed so when we set the ->i_blocks
- field of a compressed directory inode we were setting it from the
- non-existing field ni->itype.compressed.size which gave random
- results... For directories we now always use ni->allocated_size.
-
-2.1.5 - Fix minor bug in attribute list attribute handling.
-
- - Fix bug in attribute list handling. Actually it is not as much a bug
- as too much protection in that we were not allowing attribute lists
- which waste space on disk while Windows XP clearly allows it and in
- fact creates such attribute lists so our driver was failing.
- - Update NTFS documentation ready for 2.6 kernel release.
-
-2.1.4 - Reduce compiler requirements.
-
- - Remove all uses of unnamed structs and unions in the driver to make
- old and newer gcc versions happy. Makes it a bit uglier IMO but at
- least people will stop hassling me about it.
-
-2.1.3 - Important bug fixes in corner cases.
-
- - super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
- clusters. (Philipp Thomas)
- - attrib.c::load_attribute_list(): Fix bug when initialized_size is a
- multiple of the block_size but not the cluster size. (Szabolcs
- Szakacsits)
-
-2.1.2 - Important bug fixes aleviating the hangs in statfs.
-
- - Fix buggy free cluster and free inode determination logic.
-
-2.1.1 - Minor updates.
-
- - Add handling for initialized_size != data_size in compressed files.
- - Reduce function local stack usage from 0x3d4 bytes to just noise in
- fs/ntfs/upcase.c. (Randy Dunlap)
- - Remove compiler warnings for newer gcc.
- - Pages are no longer kmapped by mm/filemap.c::generic_file_write()
- around calls to ->{prepare,commit}_write. Adapt NTFS appropriately
- in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
- kmap_atomic(KM_USER0).
-
-2.1.0 - First steps towards write support: implement file overwrite.
-
- - Add configuration option for developmental write support with an
- appropriately scary configuration help text.
- - Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
- helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
- overwriting of existing files on ntfs. Note: Resident files are
- only written into memory, and not written out to disk at present, so
- avoid writing to files smaller than about 1kiB.
- - Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
- helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
- counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
- fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
- add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
- This enables write(2) based overwriting of existing files on ntfs.
- Note: As with mmap(2) based overwriting, resident files are only
- written into memory, and not written out to disk at present, so avoid
- writing to files smaller than about 1kiB.
- - Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
- ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
- files with the purpose of intercepting and aborting all i_size
- changes which we do not support yet. ntfs_truncate() actually only
- emits a warning message but AFAICS our interception of i_size changes
- elsewhere means ntfs_truncate() never gets called for i_size changes.
- It is only called from generic_file_write() when we fail in
- ntfs_prepare_{,nonresident_}write() in order to discard any
- instantiated buffers beyond i_size. Thus i_size is not actually
- changed so our warning message is enough. Unfortunately it is not
- possible to easily determine if i_size is being changed or not hence
- we just emit an appropriately worded error message.
-
-2.0.25 - Small bug fixes and cleanups.
-
- - Unlock the page in an out of memory error code path in
- fs/ntfs/aops.c::ntfs_read_block().
- - If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
- just unlock the page and return. (This can happen due to ->writepage
- clearing PageUptodate() during write out of MstProtected()
- attributes.
- - Remove leaked write code again.
-
-2.0.24 - Cleanups.
-
- - Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
- inside BUG_ON(). (Adam J. Richter)
- - Split logical OR expressions inside BUG_ON() into individual BUG_ON()
- calls for improved debugging. (Adam J. Richter)
- - Add errors flag to the ntfs volume state, accessed via
- NVol{,Set,Clear}Errors(vol).
- - Do not allow read-write remounts of read-only volumes with errors.
- - Clarify comment for ntfs file operation sendfile which was added by
- Christoph Hellwig a while ago (just using generic_file_sendfile())
- to say that ntfs ->sendfile is only used for the case where the
- source data is on the ntfs partition and the destination is
- somewhere else, i.e. nothing we need to concern ourselves with.
- - Add generic_file_write() as our ntfs file write operation.
-
-2.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).
-
- - Massive internal locking changes to mft record locking. Fixes lock
- recursion and replaces the mrec_lock read/write semaphore with a
- mutex. Also removes the now superfluous mft_count. This fixes several
- race conditions and deadlocks, especially in the future write code.
- - Fix ntfs over loopback for compressed files by adding an
- optimization barrier. (gcc was screwing up otherwise ?)
- - Miscellaneous cleanups all over the code and a fix or two in error
- handling code paths.
- Thanks go to Christoph Hellwig for pointing out the following two:
- - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
- - Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.
-
-2.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.
-
- - Change fs/ntfs/dir.c::ntfs_reddir() to only read/write ->f_pos once
- at entry/exit respectively.
- - Use C99 initializers for structures.
- - Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().
-
-2.0.21 - Check for, and refuse to work with too large files/directories/volumes.
-
- - Limit volume size at mount time to 2TiB on architectures where
- unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
- This is the most we can do without overflowing the 32-bit limit of
- the block device size imposed on us by sb_bread() and sb_getblk()
- for the time being.
- - Limit file/directory size at open() time to 16TiB on architectures
- where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
- fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
- overflowing the page cache page index.
-
-2.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.
-
- - Move the directory index bitmap to use an attribute inode instead of
- having special fields for it inside the ntfs inode structure. This
- means that the index bitmaps now use the page cache for i/o, too,
- and also as a side effect we get support for non-resident index
- bitmaps for free.
- - Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
- fix a page leak that manifested itself in some cases.
- - Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
- index bitmap inode on the final iput().
-
-2.0.19 - Fix race condition, improvements, and optimizations in i/o interface.
-
- - Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
- to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
- - Drop the "file" from ntfs_file_read_compressed_block().
- - Rename fs/ntfs/aops.c::ntfs_enb_buffer_read_async() to
- ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
- - Update ntfs_end_buffer_async_read() with the improved logic from
- its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
- further logic improvements to better determine when we set PageError.
- - Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
- check for the buffers being uptodate first in line with the updated
- fs/buffer.c::block_read_full_page(). This plugs a small race
- condition.
-
-2.0.18 - Fix race condition in reading of compressed files.
-
- - There was a narrow window between checking a buffer head for being
- uptodate and locking it in ntfs_file_read_compressed_block(). We now
- lock the buffer and then check whether it is uptodate or not.
-
-2.0.17 - Cleanups and optimizations - shrinking the ToDo list.
-
- - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
- code and update callers, i.e. ntfs_iget(), to pass that error code
- up instead of just using -EIO.
- - Modifications to super.c to ensure that both mount and remount
- cannot set any write related options when the driver is compiled
- read-only.
- - Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
- cache the current runlist element. This should improve performance
- when reading very large and/or very fragmented data.
-
-2.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.
-
- - Fix a stupid bug introduced in 2.0.15 where we were unmapping the
- wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
- - Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
- - Convert $MFT/$BITMAP access to attribute inode API and remove all
- remnants of the ugly mftbmp address space and operations hack. This
- means we finally have only one readpage function as well as only one
- async io completion handler. Yey! The mft bitmap is now just an
- attribute inode and is accessed from vol->mftbmp_ino just as if it
- were a normal file. Fake inodes rule. (-:
-
-2.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.
-
- - Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
- remounts to fail when the partition had an entry in /etc/fstab and
- the entry specified the nls= option.
- - Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
- expand all the helper functions NVolFoo(), NVolSetFoo(), and
- NVolClearFoo().
- - Move copyright statement from driver initialisation message to
- module description (fs/super.c). This makes the initialisation
- message fit on one line and fits in better with rest of kernel.
- - Update fs/ntfs/attrib.c::map_run_list() to work on both real and
- attribute inodes, and both for files and directories.
- - Implement fake attribute inodes allowing all attribute i/o to go via
- the page cache and to use all the normal vfs/mm functionality:
- - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
- to fs/ntfs/inode.c.
- - Add needed cleanup code to ntfs_clear_big_inode().
- - Merge address space operations for files and directories (aops.c),
- now just have ntfs_aops:
- - Rename:
- end_buffer_read_attr_async() -> ntfs_end_buffer_read_async(),
- ntfs_attr_read_block() -> ntfs_read_block(),
- ntfs_file_read_page() -> ntfs_readpage().
- - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
- attribute inodes, and both for files and directories.
- - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().
-
-2.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.
-
- - Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
- the locking out of super.c::get_nr_free_mft_records() and taking and
- dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
- - Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
- current userspace ntfs library code. This means that if a merge
- fails the original runlists are always left unmodified instead of
- being silently corrupted.
- - Misc typo fixes.
-
-2.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.
-
- - Remove nr_mft_bits and the now superfluous union with nr_mft_records
- from ntfs_volume structure.
- - Remove nr_lcn_bits and the now superfluous union with nr_clusters
- from ntfs_volume structure.
- - Use iget5_locked() and friends instead of conventional iget(). Wrap
- the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
- to use ntfs_iget(). Leave only one iget() call at mount time so we
- don't need an ntfs_iget_mount().
- - Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
- additional argument.
-
-2.0.12 - Initial cleanup of address space operations following 2.0.11 changes.
-
- - Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
- fs/ntfs/aops.c::end_buffer_read_file_async() into one function
- fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
- to determine whether to apply mst fixups or not.
- - Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
- and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
- fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
- fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
- the VFS readpage function prototype to the ntfs_attr_read_block()
- function prototype.
-
-2.0.11 - Initial preparations for fake inode based attribute i/o.
-
- - Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
- do some macro magic (adapted from include/linux/buffer_head.h) to
- expand all the helper functions NInoFoo(), NInoSetFoo(), and
- NInoClearFoo().
- - Add new flag to ntfs_inode_state_bits: NI_Sparse.
- - Add new fields to ntfs_inode structure to allow use of fake inodes
- for attribute i/o: type, name, name_len. Also add new state bits:
- NI_Attr, which, if set, indicates the inode is a fake inode, and
- NI_MstProtected, which, if set, indicates the attribute uses multi
- sector transfer protection, i.e. fixups need to be applied after
- reads and before/after writes.
- - Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
- ntfs_{new,clear,destroy}_extent_inode() and update callers.
- - Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
- instead of ntfs_destroy_extent_inode().
- - Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
- - Make all operations on ntfs inode state bits use the NIno* functions.
- - Set up the new ntfs inode fields and state bits in
- fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
- allocated memory to __ntfs_clear_inode().
- - Cleanup ntfs_inode structure a bit for better ordering of elements
- w.r.t. their size to allow better packing of the structure in memory.
-
-2.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.
-
- - Add check at mount time to verify that the number of inodes on the
- volume does not exceed 2^32 - 1, which is the maximum allowed for
- NTFS according to Microsoft.
- - Change mft_no member of ntfs_inode structure to be unsigned long.
- Update all users. This makes ntfs_inode->mft_no just a copy of struct
- inode->i_ino. But we can't just always use struct inode->i_ino and
- remove mft_no because extent inodes do not have an attached struct
- inode.
-
-2.0.9 - Decompression engine now uses a single buffer and other cleanups.
-
- - Change decompression engine to use a single buffer protected by a
- spin lock instead of per-CPU buffers. (Rusty Russell)
- - Do not update cb_pos when handling a partial final page during
- decompression of a sparse compression block, as the value is later
- reset without being read/used. (Rusty Russell)
- - Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
- Morton)
- - Change buffer size in ntfs_readdir()/ntfs_filldir() to use
- NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but
- it also makes everything safer so it is a good thing.
- - Miscellaneous minor cleanups to comments.
-
-2.0.8 - Major updates for handling of case sensitivity and dcache aliasing.
-
- Big thanks go to Al Viro and other inhabitants of #kernel for investing
- their time to discuss the case sensitivity and dcache aliasing issues.
-
- - Remove unused source file fs/ntfs/attraops.c.
- - Remove show_inodes mount option(s), thus dropping support for
- displaying of short file names.
- - Remove deprecated mount option posix.
- - Restore show_sys_files mount option.
- - Add new mount option case_sensitive, to determine if the driver
- treats file names as case sensitive or not. If case sensitive, create
- file names in the POSIX namespace. Otherwise create file names in the
- LONG/WIN32 namespace. Note, files remain accessible via their short
- file name, if it exists.
- - Remove really dumb logic bug in boot sector recovery code.
- - Fix dcache aliasing issues wrt short/long file names via changes
- to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
- fs/ntfs/namei.c::ntfs_lookup():
- - Add additional argument to ntfs_lookup_inode_by_name() in which we
- return information about the matching file name if the case is not
- matching or the match is a short file name. See comments above the
- function definition for details.
- - Change ntfs_lookup() to only create dcache entries for the correctly
- cased file name and only for the WIN32 namespace counterpart of DOS
- namespace file names. This ensures we have only one dentry per
- directory and also removes all dcache aliasing issues between short
- and long file names once we add write support. See comments above
- function for details.
- - Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().
-
-2.0.7 - Minor cleanups and updates for changes in core kernel code.
-
- - Remove much of the NULL struct element initializers.
- - Various updates to make compatible with recent kernels.
- - Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
- in fs/ntfs/ntfs.h instead.
- - Remove no longer needed KERNEL_VERSION checks. We are now in the
- kernel proper so they are no longer needed.
-
-2.0.6 - Major bugfix to make compatible with other kernel changes.
-
- - Initialize the mftbmp address space properly now that there are more
- fields in the struct address_space. This was leading to hangs and
- oopses on umount since 2.5.12 because of changes to other parts of
- the kernel. We probably want a kernel generic init_address_space()
- function...
- - Drop BKL from ntfs_readdir() after consultation with Al Viro. The
- only caller of ->readdir() is vfs_readdir() which holds i_mutex
- during the call, and i_mutex is sufficient protection against changes
- in the directory inode (including ->i_size).
- - Use generic_file_llseek() for directories (as opposed to
- default_llseek()) as this downs i_mutex instead of the BKL which is
- what we now need for exclusion against ->f_pos changes considering we
- no longer take the BKL in ntfs_readdir().
-
-2.0.5 - Major bugfix. Buffer overflow in extent inode handling.
-
- - No need to set old blocksize in super.c::ntfs_fill_super() as the
- VFS does so via invocation of deactivate_super() calling
- fs->fill_super() calling block_kill_super() which does it.
- - BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
- -> Do we really need it? I don't think so as we have exclusion on
- the directory ntfs_inode rw_semaphore mrec_lock. We mmight have to
- move the ->f_pos accesses under the mrec_lock though. Check this...
- - Fix really, really, really stupid buffer overflow in extent inode
- handling in mft.c::map_extent_mft_record().
-
-2.0.4 - Cleanups and updates for kernel 2.5.11.
-
- - Add documentation on how to use the MD driver to be able to use NTFS
- stripe and volume sets in Linux and generally cleanup documentation
- a bit.
- Remove all uses of kdev_t in favour of struct block_device *:
- - Change compress.c::ntfs_file_read_compressed_block() to use
- sb_getblk() instead of getblk().
- - Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
- of get_hardsect_size().
- - No need to get old blocksize in super.c::ntfs_fill_super() as
- fs/super.c::get_sb_bdev() already does this.
- - Set bh->b_bdev instead of bh->b_dev throughout aops.c.
-
-2.0.3 - Small bug fixes, cleanups, and performance improvements.
-
- - Remove some dead code from mft.c.
- - Optimize readpage and read_block functions throughout aops.c so that
- only initialized blocks are read. Non-initialized ones have their
- buffer head mapped, zeroed, and set up to date, without scheduling
- any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
- Thanks go to Andrew Morton for spotting the below:
- - Fix buglet in allocate_compression_buffers() error code path.
- - Call flush_dcache_page() after modifying page cache page contents in
- ntfs_file_readpage().
- - Check for existence of page buffers throughout aops.c before calling
- create_empty_buffers(). This happens when an I/O error occurs and the
- read is retried. (It also happens once writing is implemented so that
- needed doing anyway but I had left it for later...)
- - Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
- readpage and read_block functions. Reasoning same as above (i.e. I/O
- error retries and future write code paths.)
-
-2.0.2 - Minor updates and cleanups.
-
- - Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
- and cleanup the code a bit, removing the unused size parameter.
- - Change default fmask to 0177 and update documentation.
- - Change attrib.c::get_attr_search_ctx() to return the search context
- directly instead of taking the address of a pointer. A return value
- of NULL means the allocation failed. Updated all callers
- appropriately.
- - Update to 2.5.9 kernel (preserving backwards compatibility) by
- replacing all occurences of page->buffers with page_buffers(page).
- - Fix minor bugs in runlist merging, also minor cleanup.
- - Updates to bootsector layout and mft mirror contents descriptions.
- - Small bug fix in error detection in unistr.c and some cleanups.
- - Grow name buffer allocations in unistr.c in aligned mutlipled of 64
- bytes.
-
-2.0.1 - Minor updates.
-
- - Make default umask correspond to documentation.
- - Improve documentation.
- - Set default mode to include execute bit. The {u,f,d}mask can be used
- to take it away if desired. This allows binaries to be executed from
- a mounted ntfs partition.
-
-2.0.0 - New version number. Remove TNG from the name. Now in the kernel.
-
- - Add kill_super, just keeping up with the vfs changes in the kernel.
- - Repeat some changes from tng-0.0.8 that somehow got lost on the way
- from the CVS import into BitKeeper.
- - Begin to implement proper handling of allocated_size vs
- initialized_size vs data_size (i.e. i_size). Done are
- mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
- and attrib.c::load_attribute_list().
- - Lock the runlist in attrib.c::load_attribute_list() while using it.
- - Fix memory leak in ntfs_file_read_compressed_block() and generally
- clean up compress.c a little, removing some uncommented/unused debug
- code.
- - Tidy up dir.c a little bit.
- - Don't bother getting the runlist in inode.c::ntfs_read_inode().
- - Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
- creating aops.c::ntfs_mst_readpage(), improving the handling of
- holes and overflow in the process and implementing the correct
- equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
- I am aiming for correctness at the moment. Modularisation can come
- later.
- - Rename aops.c::end_buffer_read_index_async() to
- end_buffer_read_mst_async() and optimize the overflow checking and
- handling.
- - Use the host of the mftbmp address space mapping to hold the ntfs
- volume. This is needed so the async i/o completion handler can
- retrieve a pointer to the volume. Hopefully this will not cause
- problems elsewhere in the kernel... Otherwise will need to use a
- fake inode.
- - Complete implementation of proper handling of allocated_size vs
- initialized_size vs data_size (i.e. i_size) in whole driver.
- Basically aops.c is now completely rewritten.
- - Change NTFS driver name to just NTFS and set version number to 2.0.0
- to make a clear distinction from the old driver which is still on
- version 1.1.22.
-
-tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/
-
- - Replace bdevname(sb->s_dev) with sb->s_id.
- - Remove now superfluous new-line characters in all callers of
- ntfs_debug().
- - Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
- directories. Without this the "find" utility gets very upset which is
- fair enough as Linux/Unix do not support directory hard links.
- - Further runlist merging work. (Richard Russon)
- - Backwards compatibility for gcc-2.95. (Richard Russon)
- - Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
- - Convert to new filesystem declaration using ->ntfs_get_sb() and
- replacing ntfs_read_super() with ntfs_fill_super().
- - Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
- overflow on 32-bit architectures.
- - Cleanup upcase loading code to use ntfs_(un)map_page().
- - Disable/reenable preemtion in critical sections of compession engine.
- - Replace device size determination in ntfs_fill_super() with
- sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
- function super.c::get_nr_blocks().
- - Implement a mount time option (show_inodes) allowing choice of which
- types of inode names readdir() returns and modify ntfs_filldir()
- accordingly. There are several parameters to show_inodes:
- system: system files
- win32: long file names (including POSIX file names) [DEFAULT]
- long: same as win32
- dos: short file names only (excluding POSIX file names)
- short: same as dos
- posix: same as both win32 and dos
- all: all file names
- Note that the options are additive, i.e. specifying:
- -o show_inodes=system,show_inodes=win32,show_inodes=dos
- is the same as specifying:
- -o show_inodes=all
- Note that the "posix" and "all" options will show all directory
- names, BUT the link count on each directory inode entry is set to 1,
- due to Linux not supporting directory hard links. This may well
- confuse some userspace applications, since the directory names will
- have the same inode numbers. Thus it is NOT advisable to use the
- "posix" or "all" options. We provide them only for completeness sake.
- - Add copies of allocated_size, initialized_size, and compressed_size to
- the ntfs inode structure and set them up in
- inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
- for files and the index allocation attribute for directories.
- - Add copies of allocated_size and initialized_size to ntfs inode for
- $BITMAP attribute of large directories and set them up in
- inode.c::ntfs_read_inode().
- - Add copies of allocated_size and initialized_size to ntfs volume for
- $BITMAP attribute of $MFT and set them up in
- super.c::load_system_files().
- - Parse deprecated ntfs driver options (iocharset, show_sys_files,
- posix, and utf8) and tell user what the new options to use are. Note
- we still do support them but they will be removed with kernel 2.7.x.
- - Change all occurences of integer long long printf formatting to hex
- as printk() will not support long long integer format if/when the
- div64 patch goes into the kernel.
- - Make slab caches have stable names and change the names to what they
- were intended to be. These changes are required/made possible by the
- new slab cache name handling which removes the length limitation by
- requiring the caller of kmem_cache_create() to supply a stable name
- which is then referenced but not copied.
- - Rename run_list structure to run_list_element and create a new
- run_list structure containing a pointer to a run_list_element
- structure and a read/write semaphore. Adapt all users of runlists
- to new scheme and take and release the lock as needed. This fixes a
- nasty race as the run_list changes even when inodes are locked for
- reading and even when the inode isn't locked at all, so we really
- needed the serialization. We use a semaphore rather than a spinlock
- as memory allocations can sleep and doing everything GFP_ATOMIC
- would be silly.
- - Cleanup read_inode() removing all code checking for lowest_vcn != 0.
- This can never happen due to the nature of lookup_attr() and how we
- support attribute lists. If it did happen it would imply the inode
- being corrupt.
- - Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
- bad if found.
- - Update to 2.5.6-pre2 changes in struct address_space.
- - Use parent_ino() when accessing d_parent inode number in dir.c.
- - Import Sourceforge CVS repository into BitKeeper repository:
- http://linux-ntfs.bkbits.net/ntfs-tng-2.5
- - Update fs/Makefile, fs/Config.help, fs/Config.in, and
- Documentation/filesystems/ntfs.txt for NTFS TNG.
- - Create kernel configuration option controlling whether debugging
- is enabled or not.
- - Add the required export of end_buffer_io_sync() from the patches
- directory to the kernel code.
- - Update inode.c::ntfs_show_options() with show_inodes mount option.
- - Update errors mount option.
-
-tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!
-
- - Cleanup mft.c and it's debug/error output in particular. Fix a minor
- bug in mapping of extent inodes. Update all the comments to fit all
- the recent code changes.
- - Modify vcn_to_lcn() to cope with entirely unmapped runlists.
- - Cleanups in compress.c, mostly comments and folding help.
- - Implement attrib.c::map_run_list() as a generic helper.
- - Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
- thus making code shorter and enabling attribute list support.
- - Cleanup incorrect use of [su]64 with %L printf format specifier in
- all source files. Type casts to [unsigned] long long added to correct
- the mismatches (important for architectures which have long long not
- being 64 bits).
- - Merge async io completion handlers for directory indexes and $MFT
- data into one by setting the index_block_size{_bits} of the ntfs
- inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
- - Cleanup aops.c, update comments.
- - Make ntfs_file_get_block() use map_run_list() so all files now
- support attribute lists.
- - Make ntfs_dir_readpage() almost verbatim copy of
- block_read_full_page() by using ntfs_file_get_block() with only real
- difference being the use of our own async io completion handler
- rather than the default one, thus reducing the amount of code and
- automatically enabling attribute list support for directory indices.
- - Fix bug in load_attribute_list() - forgot to call brelse in error
- code path.
- - Change parameters to find_attr() and lookup_attr(). We no longer
- pass in the upcase table and its length. These can be gotten from
- ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
- - Cleanups in attrib.c.
- - Implement merging of runlists, attrib.c::merge_run_lists() and its
- helpers. (Richard Russon)
- - Attribute lists part 2, attribute extents and multi part runlists:
- enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
- further runlist parts via attrib.c::map_run_list().
- - Tiny endianness bug fix in decompress_mapping_pairs().
-
-tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.
-
- - Enable encrypted directories. (Their index root is marked encrypted
- to indicate that new files in that directory should be created
- encrypted.)
- - Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
- - Enable $Extend system directory. Most (if not all) extended system
- files do not have unnamed data attributes so ntfs_read_inode() had to
- special case them but that is ok, as the special casing recovery
- happens inside an error code path so there is zero slow down in the
- normal fast path. The special casing is done by introducing a new
- function inode.c::ntfs_is_extended_system_file() which checks if any
- of the hard links in the inode point to $Extend as being their parent
- directory and if they do we assume this is an extended system file.
- - Create a sysctl/proc interface to allow {dis,en}abling of debug output
- when compiled with -DDEBUG. Default is debug messages to be disabled.
- To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
- (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
- interface is enabled). Inspired by old ntfs driver.
- - Add debug_msgs insmod/kernel boot parameter to set whether debug
- messages are {dis,en}abled. This is useful to enable debug messages
- during ntfs initialization and is the only way to activate debugging
- when the sysctl interface is not enabled.
- - Cleanup debug output in various places.
- - Remove all dollar signs ($) from the source (except comments) to
- enable compilation on architectures whose gcc compiler does not
- support dollar signs in the names of variables/constants. Attribute
- types now start with AT_ instead of $ and $I30 is now just I30.
- - Cleanup ntfs_lookup() and add consistency check of sequence numbers.
- - Load complete runlist for $MFT/$BITMAP during mount and cleanup
- access functions. This means we now cope with $MFT/$BITMAP being
- spread accross several mft records.
- - Disable modification of mft_zone_multiplier on remount. We can always
- reenable this later on if we really want to, but we will need to make
- sure we readjust the mft_zone size / layout accordingly.
-
-tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.
-
- - Use sb_set_blocksize() instead of set_blocksize() and verify the
- return value.
- - Use sb_bread() instead of bread() throughout.
- - Add index_vcn_size{_bits} to ntfs_inode structure to store the size
- of a directory index block vcn. Apply resulting simplifications in
- dir.c everywhere.
- - Fix a small bug somewhere (but forgot what it was).
- - Change ntfs_{debug,error,warning} to enable gcc to do type checking
- on the printf-format parameter list and fix bugs reported by gcc
- as a result. (Richard Russon)
- - Move inode allocation strategy to Al's new stuff but maintain the
- divorce of ntfs_inode from struct inode. To achieve this we have two
- separate slab caches, one for big ntfs inodes containing a struct
- inode and pure ntfs inodes and at the same time fix some faulty
- error code paths in ntfs_read_inode().
- - Show mount options in proc (inode.c::ntfs_show_options()).
-
-tng-0.0.4 - Big changes, getting in line with Al Viro's comments.
-
- - Modified (un)map_mft_record functions to be common for read and write
- case. To specify which is which, added extra parameter at front of
- parameter list. Pass either READ or WRITE to this, each has the
- obvious meaning.
- - General cleanups to allow for easier folding in vi.
- - attrib.c::decompress_mapping_pairs() now accepts the old runlist
- argument, and invokes attrib.c::merge_run_lists() to merge the old
- and the new runlists.
- - Removed attrib.c::find_first_attr().
- - Implemented loading of attribute list and complete runlist for $MFT.
- This means we now cope with $MFT being spread across several mft
- records.
- - Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
- - Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
- - Make ntfs_volume be allocated via kmalloc() instead of using a slab
- cache. There are too little ntfs_volume structures at any one time
- to justify a private slab cache.
- - Fix bogus kmap() use in async io completion. Now use kmap_atomic().
- Use KM_BIO_IRQ on advice from IRC/kernel...
- - Use ntfs_map_page() in map_mft_record() and create ->readpage method
- for reading $MFT (ntfs_mft_readpage). In the process create dedicated
- address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
- removed the now superfluous exports from the kernel core patch.
- - Fix a bug where kfree() was used instead of ntfs_free().
- - Change map_mft_record() to take ntfs_inode as argument instead of
- vfs inode. Dito for unmap_mft_record(). Adapt all callers.
- - Add pointer to ntfs_volume to ntfs_inode.
- - Add mft record number and sequence number to ntfs_inode. Stop using
- i_ino and i_generation for in-driver purposes.
- - Implement attrib.c::merge_run_lists(). (Richard Russon)
- - Remove use of proper inodes by extent inodes. Move i_ino and
- i_generation to ntfs_inode to do this. Apply simplifications that
- result and remove iget_no_wait(), etc.
- - Pass ntfs_inode everywhere in the driver (used to be struct inode).
- - Add reference counting in ntfs_inode for the ntfs inode itself and
- for the mapped mft record.
- - Extend mft record mapping so we can (un)map extent mft records (new
- functions (un)map_extent_mft_record), and so mappings are reference
- counted and don't have to happen twice if already mapped - just ref
- count increases.
- - Add -o iocharset as alias to -o nls for backwards compatibility.
- - The latest core patch is now tiny. In fact just a single additional
- export is necessary over the base kernel.
-
-tng-0.0.3 - Cleanups, enhancements, bug fixes.
-
- - Work on attrib.c::decompress_mapping_pairs() to detect base extents
- and setup the runlist appropriately using knowledge provided by the
- sizes in the base attribute record.
- - Balance the get_/put_attr_search_ctx() calls so we don't leak memory
- any more.
- - Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
- page or use vmalloc depending on the amount of memory requested.
- - Cleanup error output. The __FUNCTION__ "(): " is now added
- automatically. Introduced a new header file debug.h to support this
- and also moved ntfs_debug() function into it.
- - Make reading of compressed files more intelligent and especially get
- rid of the vmalloc_nofs() from readpage(). This now uses per CPU
- buffers (allocated at first mount with cluster size <= 4kiB and
- deallocated on last umount with cluster size <= 4kiB), and
- asynchronous io for the compressed data using a list of buffer heads.
- Er, we use synchronous io as async io only works on whole pages
- covered by buffers and not on individual buffer heads...
- - Bug fix for reading compressed files with sparse compression blocks.
-
-tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
-
- - Fixed handling of directories when cluster size exceeds index block
- size.
- - Hide DOS only name space directory entries from readdir() but allow
- them in lookup(). This should fix the problem that Linux doesn't
- support directory hard links, while still allowing access to entries
- via their short file name. This also has the benefit of mimicking
- what Windows users are used to, so it is the ideal solution.
- - Implemented sync_page everywhere so no more hangs in D state when
- waiting for a page.
- - Stop using bforget() in favour of brelse().
- - Stop locking buffers unnecessarily.
- - Implemented compressed files (inode->mapping contains uncompressed
- data, raw compressed data is currently bread() into a vmalloc()ed
- memory buffer).
- - Enable compressed directories. (Their index root is marked compressed
- to indicate that new files in that directory should be created
- compressed.)
- - Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
- functions. (Thanks to Will Dyson for pointing this out.)
- - Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
- ntfs_sb_info) out of the common inode and super_block structures and
- started using the generic_ip and generic_sbp pointers instead. This
- makes ntfs entirely private with respect to the kernel tree.
- - Detect compiler version and abort with error message if gcc less than
- 2.96 is used.
- - Fix bug in name comparison function in unistr.c.
- - Implement attribute lists part 1, the infrastructure: search contexts
- and operations, find_external_attr(), lookup_attr()) and make the
- code use the infrastructure.
- - Fix stupid buffer overflow bug that became apparent on larger run
- list containing attributes.
- - Fix bugs in readdir() that became apparent on larger directories.
-
- The driver is now really useful and survives the test
- find . -type f -exec md5sum "{}" \;
- without any error messages on a over 1GiB sized partition with >16k
- files on it, including compressed files and directories and many files
- and directories with attribute lists.
-
-tng-0.0.1 - The first useful version.
-
- - Added ntfs_lookup().
- - Added default upcase generation and handling.
- - Added compile options to be shown on module init.
- - Many bug fixes that were "hidden" before.
- - Update to latest kernel.
- - Added ntfs_readdir().
- - Added file operations for mmap(), read(), open() and llseek(). We just
- use the generic ones. The whole point of going through implementing
- readpage() methods and where possible get_block() call backs is that
- this allows us to make use of the generic high level methods provided
- by the kernel.
-
- The driver is now actually useful! Yey. (-: It undoubtedly has got bugs
- though and it doesn't implement accesssing compressed files yet. Also,
- accessing files with attribute list attributes is not implemented yet
- either. But for small or simple filesystems it should work and allow
- you to list directories, use stat on directory entries and the file
- system, open, read, mmap and llseek around in files. A big mile stone
- has been reached!
-
-tng-0.0.0 - Initial version tag.
-
- Initial driver implementation. The driver can mount and umount simple
- NTFS filesystems (i.e. ones without attribute lists in the system
- files). If the mount fails there might be problems in the error handling
- code paths, so be warned. Otherwise it seems to be loading the system
- files nicely and the mft record read mapping/unmapping seems to be
- working nicely, too. Proof of inode metadata in the page cache and non-
- resident file unnamed stream data in the page cache concepts is thus
- complete.
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 5a9e34475e3..9173e82a45d 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1545,7 +1545,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
write_inode_now(bmp_vi, !datasync);
iput(bmp_vi);
}
- ret = ntfs_write_inode(vi, 1);
+ ret = __ntfs_write_inode(vi, 1);
write_inode_now(vi, !datasync);
err = sync_blockdev(vi->i_sb->s_bdev);
if (unlikely(err && !ret))
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 43179ddd336..b681c71d706 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2182,7 +2182,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
BUG_ON(S_ISDIR(vi->i_mode));
if (!datasync || !NInoNonResident(NTFS_I(vi)))
- ret = ntfs_write_inode(vi, 1);
+ ret = __ntfs_write_inode(vi, 1);
write_inode_now(vi, !datasync);
/*
* NOTE: If we were to use mapping->private_list (see ext2 and
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index dc2505abb6d..4b57fb1eac2 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2957,7 +2957,7 @@ out:
*
* Return 0 on success and -errno on error.
*/
-int ntfs_write_inode(struct inode *vi, int sync)
+int __ntfs_write_inode(struct inode *vi, int sync)
{
sle64 nt;
ntfs_inode *ni = NTFS_I(vi);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 117eaf8032a..9a113544605 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi);
extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
-extern int ntfs_write_inode(struct inode *vi, int sync);
+extern int __ntfs_write_inode(struct inode *vi, int sync);
static inline void ntfs_commit_inode(struct inode *vi)
{
if (!is_bad_inode(vi))
- ntfs_write_inode(vi, 1);
+ __ntfs_write_inode(vi, 1);
return;
}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 80b04770e8e..0de1db6cddb 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
#include <linux/vfs.h>
#include <linux/moduleparam.h>
#include <linux/smp_lock.h>
+#include <linux/bitmap.h>
#include "sysctl.h"
#include "logfile.h"
@@ -39,6 +40,7 @@
#include "dir.h"
#include "debug.h"
#include "index.h"
+#include "inode.h"
#include "aops.h"
#include "layout.h"
#include "malloc.h"
@@ -2457,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
static s64 get_nr_free_clusters(ntfs_volume *vol)
{
s64 nr_free = vol->nr_clusters;
- u32 *kaddr;
struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
struct page *page;
pgoff_t index, max_index;
@@ -2476,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
max_index, PAGE_CACHE_SIZE / 4);
for (index = 0; index < max_index; index++) {
- unsigned int i;
+ unsigned long *kaddr;
+
/*
* Read the page from page cache, getting it from backing store
* if necessary, and increment the use count.
@@ -2489,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
nr_free -= PAGE_CACHE_SIZE * 8;
continue;
}
- kaddr = (u32*)kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page, KM_USER0);
/*
- * For each 4 bytes, subtract the number of set bits. If this
+ * Subtract the number of set bits. If this
* is the last page and it is partial we don't really care as
* it just means we do a little extra work but it won't affect
* the result as all out of range bytes are set to zero by
* ntfs_readpage().
*/
- for (i = 0; i < PAGE_CACHE_SIZE / 4; i++)
- nr_free -= (s64)hweight32(kaddr[i]);
+ nr_free -= bitmap_weight(kaddr,
+ PAGE_CACHE_SIZE * BITS_PER_BYTE);
kunmap_atomic(kaddr, KM_USER0);
page_cache_release(page);
}
@@ -2537,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
s64 nr_free, const pgoff_t max_index)
{
- u32 *kaddr;
struct address_space *mapping = vol->mftbmp_ino->i_mapping;
struct page *page;
pgoff_t index;
@@ -2547,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
"0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
for (index = 0; index < max_index; index++) {
- unsigned int i;
+ unsigned long *kaddr;
+
/*
* Read the page from page cache, getting it from backing store
* if necessary, and increment the use count.
@@ -2560,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
nr_free -= PAGE_CACHE_SIZE * 8;
continue;
}
- kaddr = (u32*)kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page, KM_USER0);
/*
- * For each 4 bytes, subtract the number of set bits. If this
+ * Subtract the number of set bits. If this
* is the last page and it is partial we don't really care as
* it just means we do a little extra work but it won't affect
* the result as all out of range bytes are set to zero by
* ntfs_readpage().
*/
- for (i = 0; i < PAGE_CACHE_SIZE / 4; i++)
- nr_free -= (s64)hweight32(kaddr[i]);
+ nr_free -= bitmap_weight(kaddr,
+ PAGE_CACHE_SIZE * BITS_PER_BYTE);
kunmap_atomic(kaddr, KM_USER0);
page_cache_release(page);
}
@@ -2662,6 +2664,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
return 0;
}
+#ifdef NTFS_RW
+static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
+{
+ return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
+}
+#endif
+
/**
* The complete super operations.
*/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 2bbe1ecc08c..9f8bd913c51 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5713,7 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
goto out;
}
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(inode->i_sb, len));
ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
@@ -6936,7 +6936,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
goto bail;
}
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
@@ -7301,11 +7301,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
unsigned int page_end;
u64 phys;
- if (vfs_dq_alloc_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, 1))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
@@ -7381,7 +7380,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4c2a6d282c4..21441ddb550 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1764,10 +1764,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
wc->w_handle = handle;
- if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
- ret = -EDQUOT;
- goto out_commit;
+ if (clusters_to_alloc) {
+ ret = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
+ if (ret)
+ goto out_commit;
}
/*
* We don't want this to fail in ocfs2_write_end(), so do it
@@ -1810,7 +1811,7 @@ success:
return 0;
out_quota:
if (clusters_to_alloc)
- vfs_dq_free_space(inode,
+ dquot_free_space(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
out_commit:
ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index b39da877b12..3bb928a2bf7 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -136,7 +136,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
return mlog_mask_store(mlog_attr->mask, buf, count);
}
-static struct sysfs_ops mlog_attr_ops = {
+static const struct sysfs_ops mlog_attr_ops = {
.show = mlog_show,
.store = mlog_store,
};
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d8d0c65ac03..73e743eea2c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -72,9 +72,9 @@
#include "tcp_internal.h"
-#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
+#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
- NIPQUAD(sc->sc_node->nd_ipv4_address), \
+ &sc->sc_node->nd_ipv4_address, \
ntohs(sc->sc_node->nd_ipv4_port)
/*
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 765d66c7098..efd77d071c8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
goto out;
}
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(osb->sb,
- alloc + dx_alloc))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(dir, bytes_allocated);
+ dquot_free_space_nodirty(dir, bytes_allocated);
ocfs2_commit_trans(osb, handle);
@@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
if (extend) {
u32 offset = OCFS2_I(dir)->ip_clusters;
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(sb, 1))) {
- status = -EDQUOT;
+ status = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(sb, 1));
+ if (status)
goto bail;
- }
did_quota = 1;
status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
@@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
status = 0;
bail:
if (did_quota && status < 0)
- vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
+ dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
mlog_exit(status);
return status;
}
@@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
goto out;
}
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(dir->i_sb, 1));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
@@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(dir,
+ dquot_free_space_nodirty(dir,
ocfs2_clusters_to_bytes(dir->i_sb, 1));
ocfs2_commit_trans(osb, handle);
@@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
goto out;
}
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(osb->sb, 1))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
/*
@@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(dir,
+ dquot_free_space_nodirty(dir,
ocfs2_clusters_to_bytes(dir->i_sb, 1));
ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8298608d416..50c4ee805da 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1881,7 +1881,7 @@ out:
* ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
* flock() calls. The locking approach this requires is sufficiently
* different from all other cluster lock types that we implement a
- * seperate path to the "low-level" dlm calls. In particular:
+ * separate path to the "low-level" dlm calls. In particular:
*
* - No optimization of lock levels is done - we take at exactly
* what's been requested.
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 5328529e7fd..c562a7581cf 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -453,7 +453,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
if (i == -1) {
/*
* Holes can be larger than the maximum size of an
- * extent, so we return their lengths in a seperate
+ * extent, so we return their lengths in a separate
* field.
*/
if (hole_len) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5b52547d629..17947dc8341 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
+ if (file->f_mode & FMODE_WRITE)
+ dquot_initialize(inode);
+
spin_lock(&oi->ip_lock);
/* Check that the inode hasn't been wiped from disk by another
@@ -629,11 +632,10 @@ restart_all:
}
restarted_transaction:
- if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
- clusters_to_add))) {
- status = -EDQUOT;
+ status = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+ if (status)
goto leave;
- }
did_quota = 1;
/* reserve a write to the file entry early on - that we if we
@@ -674,7 +676,7 @@ restarted_transaction:
clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
spin_unlock(&OCFS2_I(inode)->ip_lock);
/* Release unused quota reservation */
- vfs_dq_free_space(inode,
+ dquot_free_space(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
did_quota = 0;
@@ -710,7 +712,7 @@ restarted_transaction:
leave:
if (status < 0 && did_quota)
- vfs_dq_free_space(inode,
+ dquot_free_space(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
if (handle) {
ocfs2_commit_trans(osb, handle);
@@ -978,6 +980,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
if (size_change) {
+ dquot_initialize(inode);
+
status = ocfs2_rw_lock(inode, 1);
if (status < 0) {
mlog_errno(status);
@@ -1020,7 +1024,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
/*
* Gather pointers to quota structures so that allocation /
* freeing of quota structures happens here and not inside
- * vfs_dq_transfer() where we have problems with lock ordering
+ * dquot_transfer() where we have problems with lock ordering
*/
if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1053,7 +1057,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
mlog_errno(status);
goto bail_unlock;
}
- status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ status = dquot_transfer(inode, attr);
if (status < 0)
goto bail_commit;
} else {
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 88459bdd1ff..278a223aae1 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -665,7 +665,7 @@ static int ocfs2_remove_inode(struct inode *inode,
}
ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
status = ocfs2_free_dinode(handle, inode_alloc_inode,
inode_alloc_bh, di);
@@ -971,6 +971,8 @@ void ocfs2_delete_inode(struct inode *inode)
goto bail;
}
+ dquot_initialize(inode);
+
if (!ocfs2_inode_is_valid_to_delete(inode)) {
/* It's probably not necessary to truncate_inode_pages
* here but we do it for safety anyway (it will most
@@ -1087,6 +1089,8 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
+ dquot_drop(inode);
+
/* To preven remote deletes we hold open lock before, now it
* is time to unlock PR and EX open locks. */
ocfs2_open_unlock(inode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 50fb26a6a5f..d9cd4e373a5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
} else
inode->i_gid = current_fsgid();
inode->i_mode = mode;
- vfs_dq_init(inode);
+ dquot_initialize(inode);
return inode;
}
@@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir,
(unsigned long)dev, dentry->d_name.len,
dentry->d_name.name);
+ dquot_initialize(dir);
+
/* get our super block */
osb = OCFS2_SB(dir->i_sb);
@@ -348,13 +350,9 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- /* We don't use standard VFS wrapper because we don't want vfs_dq_init
- * to be called. */
- if (sb_any_quota_active(osb->sb) &&
- osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
- status = -EDQUOT;
+ status = dquot_alloc_inode(inode);
+ if (status)
goto leave;
- }
did_quota_inode = 1;
mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
@@ -431,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir,
status = 0;
leave:
if (status < 0 && did_quota_inode)
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
@@ -636,6 +634,8 @@ static int ocfs2_link(struct dentry *old_dentry,
if (S_ISDIR(inode->i_mode))
return -EPERM;
+ dquot_initialize(dir);
+
err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
if (err < 0) {
if (err != -ENOENT)
@@ -791,6 +791,8 @@ static int ocfs2_unlink(struct inode *dir,
mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
dentry->d_name.len, dentry->d_name.name);
+ dquot_initialize(dir);
+
BUG_ON(dentry->d_parent->d_inode != dir);
mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1051,6 +1053,9 @@ static int ocfs2_rename(struct inode *old_dir,
old_dentry->d_name.len, old_dentry->d_name.name,
new_dentry->d_name.len, new_dentry->d_name.name);
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
osb = OCFS2_SB(old_dir->i_sb);
if (new_inode) {
@@ -1599,6 +1604,8 @@ static int ocfs2_symlink(struct inode *dir,
mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
dentry, symname, dentry->d_name.len, dentry->d_name.name);
+ dquot_initialize(dir);
+
sb = dir->i_sb;
osb = OCFS2_SB(sb);
@@ -1688,13 +1695,9 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
- /* We don't use standard VFS wrapper because we don't want vfs_dq_init
- * to be called. */
- if (sb_any_quota_active(osb->sb) &&
- osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
- status = -EDQUOT;
+ status = dquot_alloc_inode(inode);
+ if (status)
goto bail;
- }
did_quota_inode = 1;
mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
@@ -1716,11 +1719,10 @@ static int ocfs2_symlink(struct inode *dir,
u32 offset = 0;
inode->i_op = &ocfs2_symlink_inode_operations;
- if (vfs_dq_alloc_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, 1))) {
- status = -EDQUOT;
+ status = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (status)
goto bail;
- }
did_quota = 1;
status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
new_fe_bh,
@@ -1788,10 +1790,10 @@ static int ocfs2_symlink(struct inode *dir,
d_instantiate(dentry, inode);
bail:
if (status < 0 && did_quota)
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
@@ -2099,13 +2101,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
goto leave;
}
- /* We don't use standard VFS wrapper because we don't want vfs_dq_init
- * to be called. */
- if (sb_any_quota_active(osb->sb) &&
- osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
- status = -EDQUOT;
+ status = dquot_alloc_inode(inode);
+ if (status)
goto leave;
- }
did_quota_inode = 1;
inode->i_nlink = 0;
@@ -2140,7 +2138,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
insert_inode_hash(inode);
leave:
if (status < 0 && did_quota_inode)
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b437dc0c4ca..355f41d1d52 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -851,13 +851,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
}
const struct dquot_operations ocfs2_quota_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
.write_dquot = ocfs2_write_dquot,
.acquire_dquot = ocfs2_acquire_dquot,
.release_dquot = ocfs2_release_dquot,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 21f9e71223c..a6467f3d262 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -457,7 +457,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
break;
}
dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
- for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+ for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
qbh = NULL;
status = ocfs2_read_quota_block(lqinode,
ol_dqblk_block(sb, chunk, bit),
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index fb6aa7acf54..9e96921dffd 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4390,7 +4390,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
}
mutex_lock(&inode->i_mutex);
- vfs_dq_init(dir);
+ dquot_initialize(dir);
error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
mutex_unlock(&inode->i_mutex);
if (!error)
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index f3b7c1541f3..75d9b5ba1d4 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -11,6 +11,7 @@
#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/vmalloc.h>
+#include <linux/writeback.h>
#include <linux/crc-itu-t.h>
#include "omfs.h"
@@ -89,7 +90,7 @@ static void omfs_update_checksums(struct omfs_inode *oi)
oi->i_head.h_check_xor = xor;
}
-static int omfs_write_inode(struct inode *inode, int wait)
+static int __omfs_write_inode(struct inode *inode, int wait)
{
struct omfs_inode *oi;
struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
@@ -162,9 +163,14 @@ out:
return ret;
}
+static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
int omfs_sync_inode(struct inode *inode)
{
- return omfs_write_inode(inode, 1);
+ return __omfs_write_inode(inode, 1);
}
/*
diff --git a/fs/open.c b/fs/open.c
index e0b2d88b038..e17f54454b5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -8,7 +8,6 @@
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/fdtable.h>
-#include <linux/quotaops.h>
#include <linux/fsnotify.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -278,10 +277,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
error = locks_verify_truncate(inode, NULL, length);
if (!error)
error = security_path_truncate(&path, length, 0);
- if (!error) {
- vfs_dq_init(inode);
+ if (!error)
error = do_truncate(path.dentry, length, 0, NULL);
- }
put_write_and_out:
put_write_access(inode);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 18e20feee25..aa8637b8102 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -273,7 +273,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
rcu_read_lock(); /* FIXME: is this correct? */
qsize = atomic_read(&__task_cred(p)->user->sigpending);
rcu_read_unlock();
- qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
+ qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
}
@@ -420,7 +420,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
cutime = sig->cutime;
cstime = sig->cstime;
cgtime = sig->cgtime;
- rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
+ rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
/* add up live thread stats at the group level */
if (whole) {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 9580abeadeb..08f4d71dacd 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -291,19 +291,17 @@ static const struct inode_operations proc_file_inode_operations = {
* returns the struct proc_dir_entry for "/proc/tty/driver", and
* returns "serial" in residual.
*/
-static int xlate_proc_name(const char *name,
- struct proc_dir_entry **ret, const char **residual)
+static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
+ const char **residual)
{
const char *cp = name, *next;
struct proc_dir_entry *de;
int len;
- int rtn = 0;
de = *ret;
if (!de)
de = &proc_root;
- spin_lock(&proc_subdir_lock);
while (1) {
next = strchr(cp, '/');
if (!next)
@@ -315,16 +313,25 @@ static int xlate_proc_name(const char *name,
break;
}
if (!de) {
- rtn = -ENOENT;
- goto out;
+ WARN(1, "name '%s'\n", name);
+ return -ENOENT;
}
cp += len + 1;
}
*residual = cp;
*ret = de;
-out:
+ return 0;
+}
+
+static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
+ const char **residual)
+{
+ int rv;
+
+ spin_lock(&proc_subdir_lock);
+ rv = __xlate_proc_name(name, ret, residual);
spin_unlock(&proc_subdir_lock);
- return rtn;
+ return rv;
}
static DEFINE_IDA(proc_inum_ida);
@@ -797,11 +804,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
const char *fn = name;
int len;
- if (xlate_proc_name(name, &parent, &fn) != 0)
+ spin_lock(&proc_subdir_lock);
+ if (__xlate_proc_name(name, &parent, &fn) != 0) {
+ spin_unlock(&proc_subdir_lock);
return;
+ }
len = strlen(fn);
- spin_lock(&proc_subdir_lock);
for (p = &parent->subdir; *p; p=&(*p)->next ) {
if (proc_match(len, fn, *p)) {
de = *p;
@@ -811,8 +820,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
}
}
spin_unlock(&proc_subdir_lock);
- if (!de)
+ if (!de) {
+ WARN(1, "name '%s'\n", name);
return;
+ }
spin_lock(&de->pde_unload_lock);
/*
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f277c4a111c..183f8ff5f40 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -16,7 +16,7 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- unsigned long data, text, lib;
+ unsigned long data, text, lib, swap;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
/*
@@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+ swap = get_mm_counter(mm, MM_SWAPENTS);
seq_printf(m,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
@@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
- "VmPTE:\t%8lu kB\n",
+ "VmPTE:\t%8lu kB\n"
+ "VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+ swap << (PAGE_SHIFT-10));
}
unsigned long task_vsize(struct mm_struct *mm)
@@ -65,11 +68,11 @@ unsigned long task_vsize(struct mm_struct *mm)
int task_statm(struct mm_struct *mm, int *shared, int *text,
int *data, int *resident)
{
- *shared = get_mm_counter(mm, file_rss);
+ *shared = get_mm_counter(mm, MM_FILEPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm;
- *resident = *shared + get_mm_counter(mm, anon_rss);
+ *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
return mm->total_vm;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index ebf3440d28c..277575ddc05 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -201,7 +201,8 @@ static const char *qnx4_checkroot(struct super_block *sb)
rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
if (rootdir->di_fname != NULL) {
QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
- if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) {
+ if (!strcmp(rootdir->di_fname,
+ QNX4_BMNAME)) {
found = 1;
qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
if (!qnx4_sb(sb)->BitMap) {
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index efc02ebb8c7..dad7fb247dd 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -59,3 +59,8 @@ config QUOTACTL
bool
depends on XFS_QUOTA || QUOTA
default y
+
+config QUOTACTL_COMPAT
+ bool
+ depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
+ default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 68d4f6dc057..5f9e9e276af 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
obj-$(CONFIG_QFMT_V2) += quota_v2.o
obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
obj-$(CONFIG_QUOTACTL) += quota.o
+obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o
+obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
new file mode 100644
index 00000000000..fb1892fe3e5
--- /dev/null
+++ b/fs/quota/compat.c
@@ -0,0 +1,118 @@
+
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/quotaops.h>
+
+/*
+ * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
+ * and is necessary due to alignment problems.
+ */
+struct compat_if_dqblk {
+ compat_u64 dqb_bhardlimit;
+ compat_u64 dqb_bsoftlimit;
+ compat_u64 dqb_curspace;
+ compat_u64 dqb_ihardlimit;
+ compat_u64 dqb_isoftlimit;
+ compat_u64 dqb_curinodes;
+ compat_u64 dqb_btime;
+ compat_u64 dqb_itime;
+ compat_uint_t dqb_valid;
+};
+
+/* XFS structures */
+struct compat_fs_qfilestat {
+ compat_u64 dqb_bhardlimit;
+ compat_u64 qfs_nblks;
+ compat_uint_t qfs_nextents;
+};
+
+struct compat_fs_quota_stat {
+ __s8 qs_version;
+ __u16 qs_flags;
+ __s8 qs_pad;
+ struct compat_fs_qfilestat qs_uquota;
+ struct compat_fs_qfilestat qs_gquota;
+ compat_uint_t qs_incoredqs;
+ compat_int_t qs_btimelimit;
+ compat_int_t qs_itimelimit;
+ compat_int_t qs_rtbtimelimit;
+ __u16 qs_bwarnlimit;
+ __u16 qs_iwarnlimit;
+};
+
+asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
+ qid_t id, void __user *addr)
+{
+ unsigned int cmds;
+ struct if_dqblk __user *dqblk;
+ struct compat_if_dqblk __user *compat_dqblk;
+ struct fs_quota_stat __user *fsqstat;
+ struct compat_fs_quota_stat __user *compat_fsqstat;
+ compat_uint_t data;
+ u16 xdata;
+ long ret;
+
+ cmds = cmd >> SUBCMDSHIFT;
+
+ switch (cmds) {
+ case Q_GETQUOTA:
+ dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
+ compat_dqblk = addr;
+ ret = sys_quotactl(cmd, special, id, dqblk);
+ if (ret)
+ break;
+ if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
+ get_user(data, &dqblk->dqb_valid) ||
+ put_user(data, &compat_dqblk->dqb_valid))
+ ret = -EFAULT;
+ break;
+ case Q_SETQUOTA:
+ dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
+ compat_dqblk = addr;
+ ret = -EFAULT;
+ if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
+ get_user(data, &compat_dqblk->dqb_valid) ||
+ put_user(data, &dqblk->dqb_valid))
+ break;
+ ret = sys_quotactl(cmd, special, id, dqblk);
+ break;
+ case Q_XGETQSTAT:
+ fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
+ compat_fsqstat = addr;
+ ret = sys_quotactl(cmd, special, id, fsqstat);
+ if (ret)
+ break;
+ ret = -EFAULT;
+ /* Copying qs_version, qs_flags, qs_pad */
+ if (copy_in_user(compat_fsqstat, fsqstat,
+ offsetof(struct compat_fs_quota_stat, qs_uquota)))
+ break;
+ /* Copying qs_uquota */
+ if (copy_in_user(&compat_fsqstat->qs_uquota,
+ &fsqstat->qs_uquota,
+ sizeof(compat_fsqstat->qs_uquota)) ||
+ get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
+ put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
+ break;
+ /* Copying qs_gquota */
+ if (copy_in_user(&compat_fsqstat->qs_gquota,
+ &fsqstat->qs_gquota,
+ sizeof(compat_fsqstat->qs_gquota)) ||
+ get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
+ put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
+ break;
+ /* Copying the rest */
+ if (copy_in_user(&compat_fsqstat->qs_incoredqs,
+ &fsqstat->qs_incoredqs,
+ sizeof(struct compat_fs_quota_stat) -
+ offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
+ get_user(xdata, &fsqstat->qs_iwarnlimit) ||
+ put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
+ break;
+ ret = 0;
+ break;
+ default:
+ ret = sys_quotactl(cmd, special, id, addr);
+ }
+ return ret;
+}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3fc62b097be..e0b870f4749 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -100,9 +100,13 @@
*
* Any operation working on dquots via inode pointers must hold dqptr_sem. If
* operation is just reading pointers from inode (or not using them at all) the
- * read lock is enough. If pointers are altered function must hold write lock
- * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
- * for altering the flag i_mutex is also needed).
+ * read lock is enough. If pointers are altered function must hold write lock.
+ * Special care needs to be taken about S_NOQUOTA inode flag (marking that
+ * inode is a quota file). Functions adding pointers from inode to dquots have
+ * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they
+ * have to do all pointer modifications before dropping dqptr_sem. This makes
+ * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
+ * then drops all pointers to dquots from an inode.
*
* Each dquot has its dq_lock mutex. Locked dquots might not be referenced
* from inodes (dquot_alloc_space() and such don't check the dq_lock).
@@ -225,6 +229,9 @@ static struct hlist_head *dquot_hash;
struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);
+static qsize_t inode_get_rsv_space(struct inode *inode);
+static void __dquot_initialize(struct inode *inode, int type);
+
static inline unsigned int
hashfn(const struct super_block *sb, unsigned int id, int type)
{
@@ -564,7 +571,7 @@ out:
}
EXPORT_SYMBOL(dquot_scan_active);
-int vfs_quota_sync(struct super_block *sb, int type)
+int vfs_quota_sync(struct super_block *sb, int type, int wait)
{
struct list_head *dirty;
struct dquot *dquot;
@@ -609,6 +616,33 @@ int vfs_quota_sync(struct super_block *sb, int type)
spin_unlock(&dq_list_lock);
mutex_unlock(&dqopt->dqonoff_mutex);
+ if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
+ return 0;
+
+ /* This is not very clever (and fast) but currently I don't know about
+ * any other simple way of getting quota data to disk and we must get
+ * them there for userspace to be visible... */
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, 1);
+ sync_blockdev(sb->s_bdev);
+
+ /*
+ * Now when everything is written we can discard the pagecache so
+ * that userspace sees the changes.
+ */
+ mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (type != -1 && cnt != type)
+ continue;
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
+ mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
+ I_MUTEX_QUOTA);
+ truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
+ mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
+ }
+ mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+
return 0;
}
EXPORT_SYMBOL(vfs_quota_sync);
@@ -840,11 +874,14 @@ static int dqinit_needed(struct inode *inode, int type)
static void add_dquot_ref(struct super_block *sb, int type)
{
struct inode *inode, *old_inode = NULL;
+ int reserved = 0;
spin_lock(&inode_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
continue;
+ if (unlikely(inode_get_rsv_space(inode) > 0))
+ reserved = 1;
if (!atomic_read(&inode->i_writecount))
continue;
if (!dqinit_needed(inode, type))
@@ -854,7 +891,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
spin_unlock(&inode_lock);
iput(old_inode);
- sb->dq_op->initialize(inode, type);
+ __dquot_initialize(inode, type);
/* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the inode_lock.
* We cannot iput the inode now as we can be holding the last
@@ -865,6 +902,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
}
spin_unlock(&inode_lock);
iput(old_inode);
+
+ if (reserved) {
+ printk(KERN_WARNING "VFS (%s): Writes happened before quota"
+ " was turned on thus quota information is probably "
+ "inconsistent. Please run quotacheck(8).\n", sb->s_id);
+ }
}
/*
@@ -978,10 +1021,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
/*
* Claim reserved quota space
*/
-static void dquot_claim_reserved_space(struct dquot *dquot,
- qsize_t number)
+static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
{
- WARN_ON(dquot->dq_dqb.dqb_rsvspace < number);
+ if (dquot->dq_dqb.dqb_rsvspace < number) {
+ WARN_ON_ONCE(1);
+ number = dquot->dq_dqb.dqb_rsvspace;
+ }
dquot->dq_dqb.dqb_curspace += number;
dquot->dq_dqb.dqb_rsvspace -= number;
}
@@ -989,7 +1034,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot,
static inline
void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
{
- dquot->dq_dqb.dqb_rsvspace -= number;
+ if (dquot->dq_dqb.dqb_rsvspace >= number)
+ dquot->dq_dqb.dqb_rsvspace -= number;
+ else {
+ WARN_ON_ONCE(1);
+ dquot->dq_dqb.dqb_rsvspace = 0;
+ }
}
static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
@@ -1131,13 +1181,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
*warntype = QUOTA_NL_NOWARN;
if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
test_bit(DQ_FAKE_B, &dquot->dq_flags))
- return QUOTA_OK;
+ return 0;
if (dquot->dq_dqb.dqb_ihardlimit &&
newinodes > dquot->dq_dqb.dqb_ihardlimit &&
!ignore_hardlimit(dquot)) {
*warntype = QUOTA_NL_IHARDWARN;
- return NO_QUOTA;
+ return -EDQUOT;
}
if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1146,7 +1196,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
get_seconds() >= dquot->dq_dqb.dqb_itime &&
!ignore_hardlimit(dquot)) {
*warntype = QUOTA_NL_ISOFTLONGWARN;
- return NO_QUOTA;
+ return -EDQUOT;
}
if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1157,7 +1207,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
}
- return QUOTA_OK;
+ return 0;
}
/* needs dq_data_lock */
@@ -1169,7 +1219,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
*warntype = QUOTA_NL_NOWARN;
if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
test_bit(DQ_FAKE_B, &dquot->dq_flags))
- return QUOTA_OK;
+ return 0;
tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
+ space;
@@ -1179,7 +1229,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
!ignore_hardlimit(dquot)) {
if (!prealloc)
*warntype = QUOTA_NL_BHARDWARN;
- return NO_QUOTA;
+ return -EDQUOT;
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1189,7 +1239,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
!ignore_hardlimit(dquot)) {
if (!prealloc)
*warntype = QUOTA_NL_BSOFTLONGWARN;
- return NO_QUOTA;
+ return -EDQUOT;
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1205,10 +1255,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
* We don't allow preallocation to exceed softlimit so exceeding will
* be always printed
*/
- return NO_QUOTA;
+ return -EDQUOT;
}
- return QUOTA_OK;
+ return 0;
}
static int info_idq_free(struct dquot *dquot, qsize_t inodes)
@@ -1242,25 +1292,32 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
return QUOTA_NL_BHARDBELOW;
return QUOTA_NL_NOWARN;
}
+
/*
- * Initialize quota pointers in inode
- * We do things in a bit complicated way but by that we avoid calling
- * dqget() and thus filesystem callbacks under dqptr_sem.
+ * Initialize quota pointers in inode
+ *
+ * We do things in a bit complicated way but by that we avoid calling
+ * dqget() and thus filesystem callbacks under dqptr_sem.
+ *
+ * It is better to call this function outside of any transaction as it
+ * might need a lot of space in journal for dquot structure allocation.
*/
-int dquot_initialize(struct inode *inode, int type)
+static void __dquot_initialize(struct inode *inode, int type)
{
unsigned int id = 0;
- int cnt, ret = 0;
- struct dquot *got[MAXQUOTAS] = { NULL, NULL };
+ int cnt;
+ struct dquot *got[MAXQUOTAS];
struct super_block *sb = inode->i_sb;
+ qsize_t rsv;
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
- if (IS_NOQUOTA(inode))
- return 0;
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+ return;
/* First get references to structures we might need. */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ got[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
switch (cnt) {
@@ -1275,7 +1332,6 @@ int dquot_initialize(struct inode *inode, int type)
}
down_write(&sb_dqopt(sb)->dqptr_sem);
- /* Having dqptr_sem we know NOQUOTA flags can't be altered... */
if (IS_NOQUOTA(inode))
goto out_err;
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1287,20 +1343,31 @@ int dquot_initialize(struct inode *inode, int type)
if (!inode->i_dquot[cnt]) {
inode->i_dquot[cnt] = got[cnt];
got[cnt] = NULL;
+ /*
+ * Make quota reservation system happy if someone
+ * did a write before quota was turned on
+ */
+ rsv = inode_get_rsv_space(inode);
+ if (unlikely(rsv))
+ dquot_resv_space(inode->i_dquot[cnt], rsv);
}
}
out_err:
up_write(&sb_dqopt(sb)->dqptr_sem);
/* Drop unused references */
dqput_all(got);
- return ret;
+}
+
+void dquot_initialize(struct inode *inode)
+{
+ __dquot_initialize(inode, -1);
}
EXPORT_SYMBOL(dquot_initialize);
/*
* Release all quotas referenced by inode
*/
-int dquot_drop(struct inode *inode)
+static void __dquot_drop(struct inode *inode)
{
int cnt;
struct dquot *put[MAXQUOTAS];
@@ -1312,32 +1379,31 @@ int dquot_drop(struct inode *inode)
}
up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
dqput_all(put);
- return 0;
}
-EXPORT_SYMBOL(dquot_drop);
-/* Wrapper to remove references to quota structures from inode */
-void vfs_dq_drop(struct inode *inode)
-{
- /* Here we can get arbitrary inode from clear_inode() so we have
- * to be careful. OTOH we don't need locking as quota operations
- * are allowed to change only at mount time */
- if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op
- && inode->i_sb->dq_op->drop) {
- int cnt;
- /* Test before calling to rule out calls from proc and such
- * where we are not allowed to block. Note that this is
- * actually reliable test even without the lock - the caller
- * must assure that nobody can come after the DQUOT_DROP and
- * add quota pointers back anyway */
- for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- if (inode->i_dquot[cnt])
- break;
- if (cnt < MAXQUOTAS)
- inode->i_sb->dq_op->drop(inode);
- }
-}
-EXPORT_SYMBOL(vfs_dq_drop);
+void dquot_drop(struct inode *inode)
+{
+ int cnt;
+
+ if (IS_NOQUOTA(inode))
+ return;
+
+ /*
+ * Test before calling to rule out calls from proc and such
+ * where we are not allowed to block. Note that this is
+ * actually reliable test even without the lock - the caller
+ * must assure that nobody can come after the DQUOT_DROP and
+ * add quota pointers back anyway.
+ */
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (inode->i_dquot[cnt])
+ break;
+ }
+
+ if (cnt < MAXQUOTAS)
+ __dquot_drop(inode);
+}
+EXPORT_SYMBOL(dquot_drop);
/*
* inode_reserved_space is managed internally by quota, and protected by
@@ -1351,28 +1417,30 @@ static qsize_t *inode_reserved_space(struct inode * inode)
return inode->i_sb->dq_op->get_reserved_space(inode);
}
-static void inode_add_rsv_space(struct inode *inode, qsize_t number)
+void inode_add_rsv_space(struct inode *inode, qsize_t number)
{
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) += number;
spin_unlock(&inode->i_lock);
}
+EXPORT_SYMBOL(inode_add_rsv_space);
-
-static void inode_claim_rsv_space(struct inode *inode, qsize_t number)
+void inode_claim_rsv_space(struct inode *inode, qsize_t number)
{
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) -= number;
__inode_add_bytes(inode, number);
spin_unlock(&inode->i_lock);
}
+EXPORT_SYMBOL(inode_claim_rsv_space);
-static void inode_sub_rsv_space(struct inode *inode, qsize_t number)
+void inode_sub_rsv_space(struct inode *inode, qsize_t number)
{
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) -= number;
spin_unlock(&inode->i_lock);
}
+EXPORT_SYMBOL(inode_sub_rsv_space);
static qsize_t inode_get_rsv_space(struct inode *inode)
{
@@ -1404,38 +1472,34 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
}
/*
- * Following four functions update i_blocks+i_bytes fields and
- * quota information (together with appropriate checks)
- * NOTE: We absolutely rely on the fact that caller dirties
- * the inode (usually macros in quotaops.h care about this) and
- * holds a handle for the current transaction so that dquot write and
- * inode write go into the same transaction.
+ * This functions updates i_blocks+i_bytes fields and quota information
+ * (together with appropriate checks).
+ *
+ * NOTE: We absolutely rely on the fact that caller dirties the inode
+ * (usually helpers in quotaops.h care about this) and holds a handle for
+ * the current transaction so that dquot write and inode write go into the
+ * same transaction.
*/
/*
* This operation can block, but only after everything is updated
*/
int __dquot_alloc_space(struct inode *inode, qsize_t number,
- int warn, int reserve)
+ int warn, int reserve)
{
- int cnt, ret = QUOTA_OK;
+ int cnt, ret = 0;
char warntype[MAXQUOTAS];
/*
* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex
*/
- if (IS_NOQUOTA(inode)) {
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
inode_incr_space(inode, number, reserve);
goto out;
}
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode)) {
- inode_incr_space(inode, number, reserve);
- goto out_unlock;
- }
-
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1443,9 +1507,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!inode->i_dquot[cnt])
continue;
- if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt)
- == NO_QUOTA) {
- ret = NO_QUOTA;
+ ret = check_bdq(inode->i_dquot[cnt], number, !warn,
+ warntype+cnt);
+ if (ret) {
spin_unlock(&dq_data_lock);
goto out_flush_warn;
}
@@ -1466,61 +1530,45 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
mark_all_dquot_dirty(inode->i_dquot);
out_flush_warn:
flush_warnings(inode->i_dquot, warntype);
-out_unlock:
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
out:
return ret;
}
-
-int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
-{
- return __dquot_alloc_space(inode, number, warn, 0);
-}
-EXPORT_SYMBOL(dquot_alloc_space);
-
-int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
-{
- return __dquot_alloc_space(inode, number, warn, 1);
-}
-EXPORT_SYMBOL(dquot_reserve_space);
+EXPORT_SYMBOL(__dquot_alloc_space);
/*
* This operation can block, but only after everything is updated
*/
-int dquot_alloc_inode(const struct inode *inode, qsize_t number)
+int dquot_alloc_inode(const struct inode *inode)
{
- int cnt, ret = NO_QUOTA;
+ int cnt, ret = 0;
char warntype[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
- if (IS_NOQUOTA(inode))
- return QUOTA_OK;
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+ return 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warntype[cnt] = QUOTA_NL_NOWARN;
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode)) {
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- return QUOTA_OK;
- }
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!inode->i_dquot[cnt])
continue;
- if (check_idq(inode->i_dquot[cnt], number, warntype+cnt)
- == NO_QUOTA)
+ ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt);
+ if (ret)
goto warn_put_all;
}
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!inode->i_dquot[cnt])
continue;
- dquot_incr_inodes(inode->i_dquot[cnt], number);
+ dquot_incr_inodes(inode->i_dquot[cnt], 1);
}
- ret = QUOTA_OK;
+
warn_put_all:
spin_unlock(&dq_data_lock);
- if (ret == QUOTA_OK)
+ if (ret == 0)
mark_all_dquot_dirty(inode->i_dquot);
flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1528,23 +1576,19 @@ warn_put_all:
}
EXPORT_SYMBOL(dquot_alloc_inode);
-int dquot_claim_space(struct inode *inode, qsize_t number)
+/*
+ * Convert in-memory reserved quotas to real consumed quotas
+ */
+int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
int cnt;
- int ret = QUOTA_OK;
- if (IS_NOQUOTA(inode)) {
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
inode_claim_rsv_space(inode, number);
- goto out;
+ return 0;
}
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode)) {
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- inode_claim_rsv_space(inode, number);
- goto out;
- }
-
spin_lock(&dq_data_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1557,33 +1601,26 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
spin_unlock(&dq_data_lock);
mark_all_dquot_dirty(inode->i_dquot);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-out:
- return ret;
+ return 0;
}
-EXPORT_SYMBOL(dquot_claim_space);
+EXPORT_SYMBOL(dquot_claim_space_nodirty);
/*
* This operation can block, but only after everything is updated
*/
-int __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
+void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
{
unsigned int cnt;
char warntype[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
- if (IS_NOQUOTA(inode)) {
-out_sub:
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
inode_decr_space(inode, number, reserve);
- return QUOTA_OK;
+ return;
}
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- /* Now recheck reliably when holding dqptr_sem */
- if (IS_NOQUOTA(inode)) {
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- goto out_sub;
- }
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!inode->i_dquot[cnt])
@@ -1603,56 +1640,34 @@ out_sub:
out_unlock:
flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- return QUOTA_OK;
-}
-
-int dquot_free_space(struct inode *inode, qsize_t number)
-{
- return __dquot_free_space(inode, number, 0);
}
-EXPORT_SYMBOL(dquot_free_space);
-
-/*
- * Release reserved quota space
- */
-void dquot_release_reserved_space(struct inode *inode, qsize_t number)
-{
- __dquot_free_space(inode, number, 1);
-
-}
-EXPORT_SYMBOL(dquot_release_reserved_space);
+EXPORT_SYMBOL(__dquot_free_space);
/*
* This operation can block, but only after everything is updated
*/
-int dquot_free_inode(const struct inode *inode, qsize_t number)
+void dquot_free_inode(const struct inode *inode)
{
unsigned int cnt;
char warntype[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
- if (IS_NOQUOTA(inode))
- return QUOTA_OK;
+ if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+ return;
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- /* Now recheck reliably when holding dqptr_sem */
- if (IS_NOQUOTA(inode)) {
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- return QUOTA_OK;
- }
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!inode->i_dquot[cnt])
continue;
- warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number);
- dquot_decr_inodes(inode->i_dquot[cnt], number);
+ warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1);
+ dquot_decr_inodes(inode->i_dquot[cnt], 1);
}
spin_unlock(&dq_data_lock);
mark_all_dquot_dirty(inode->i_dquot);
flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- return QUOTA_OK;
}
EXPORT_SYMBOL(dquot_free_inode);
@@ -1662,37 +1677,31 @@ EXPORT_SYMBOL(dquot_free_inode);
* This operation can block, but only after everything is updated
* A transaction must be started when entering this function.
*/
-int dquot_transfer(struct inode *inode, struct iattr *iattr)
+static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask)
{
qsize_t space, cur_space;
qsize_t rsv_space = 0;
struct dquot *transfer_from[MAXQUOTAS];
struct dquot *transfer_to[MAXQUOTAS];
- int cnt, ret = QUOTA_OK;
- int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
- chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
+ int cnt, ret = 0;
char warntype_to[MAXQUOTAS];
char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
if (IS_NOQUOTA(inode))
- return QUOTA_OK;
+ return 0;
/* Initialize the arrays */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
transfer_from[cnt] = NULL;
transfer_to[cnt] = NULL;
warntype_to[cnt] = QUOTA_NL_NOWARN;
}
- if (chuid)
- transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid,
- USRQUOTA);
- if (chgid)
- transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid,
- GRPQUOTA);
-
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (mask & (1 << cnt))
+ transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
+ }
down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
- /* Now recheck reliably when holding dqptr_sem */
if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
goto put_all;
@@ -1706,9 +1715,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
if (!transfer_to[cnt])
continue;
transfer_from[cnt] = inode->i_dquot[cnt];
- if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
- NO_QUOTA || check_bdq(transfer_to[cnt], space, 0,
- warntype_to + cnt) == NO_QUOTA)
+ ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
+ if (ret)
+ goto over_quota;
+ ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt);
+ if (ret)
goto over_quota;
}
@@ -1762,22 +1773,32 @@ over_quota:
/* Clear dquot pointers we don't want to dqput() */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
transfer_from[cnt] = NULL;
- ret = NO_QUOTA;
goto warn_put_all;
}
-EXPORT_SYMBOL(dquot_transfer);
-/* Wrapper for transferring ownership of an inode */
-int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
+/* Wrapper for transferring ownership of an inode for uid/gid only
+ * Called from FSXXX_setattr()
+ */
+int dquot_transfer(struct inode *inode, struct iattr *iattr)
{
+ qid_t chid[MAXQUOTAS];
+ unsigned long mask = 0;
+
+ if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) {
+ mask |= 1 << USRQUOTA;
+ chid[USRQUOTA] = iattr->ia_uid;
+ }
+ if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) {
+ mask |= 1 << GRPQUOTA;
+ chid[GRPQUOTA] = iattr->ia_gid;
+ }
if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
- vfs_dq_init(inode);
- if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
- return 1;
+ dquot_initialize(inode);
+ return __dquot_transfer(inode, chid, mask);
}
return 0;
}
-EXPORT_SYMBOL(vfs_dq_transfer);
+EXPORT_SYMBOL(dquot_transfer);
/*
* Write info of quota file to disk
@@ -1798,13 +1819,6 @@ EXPORT_SYMBOL(dquot_commit_info);
* Definitions of diskquota operations.
*/
const struct dquot_operations dquot_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
.write_dquot = dquot_commit,
.acquire_dquot = dquot_acquire,
.release_dquot = dquot_release,
@@ -1815,6 +1829,20 @@ const struct dquot_operations dquot_operations = {
};
/*
+ * Generic helper for ->open on filesystems supporting disk quotas.
+ */
+int dquot_file_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = generic_file_open(inode, file);
+ if (!error && (file->f_mode & FMODE_WRITE))
+ dquot_initialize(inode);
+ return error;
+}
+EXPORT_SYMBOL(dquot_file_open);
+
+/*
* Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
*/
int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
@@ -1993,11 +2021,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
}
if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
- /* As we bypass the pagecache we must now flush the inode so
- * that we see all the changes from userspace... */
- write_inode_now(inode, 1);
- /* And now flush the block cache so that kernel sees the
- * changes */
+ /* As we bypass the pagecache we must now flush all the
+ * dirty data and invalidate caches so that kernel sees
+ * changes from userspace. It is not enough to just flush
+ * the quota file since if blocksize < pagesize, invalidation
+ * of the cache could fail because of other unrelated dirty
+ * data */
+ sync_filesystem(sb);
invalidate_bdev(sb->s_bdev);
}
mutex_lock(&dqopt->dqonoff_mutex);
@@ -2010,14 +2040,16 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
/* We don't want quota and atime on quota files (deadlocks
* possible) Also nobody should write to the file - we use
* special IO operations which ignore the immutable bit. */
- down_write(&dqopt->dqptr_sem);
mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
S_NOQUOTA);
inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
mutex_unlock(&inode->i_mutex);
- up_write(&dqopt->dqptr_sem);
- sb->dq_op->drop(inode);
+ /*
+ * When S_NOQUOTA is set, remove dquot references as no more
+ * references can be added
+ */
+ __dquot_drop(inode);
}
error = -EIO;
@@ -2053,14 +2085,12 @@ out_file_init:
iput(inode);
out_lock:
if (oldflags != -1) {
- down_write(&dqopt->dqptr_sem);
mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
/* Set the flags back (in the case of accidental quotaon()
* on a wrong file we don't want to mess up the flags) */
inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
inode->i_flags |= oldflags;
mutex_unlock(&inode->i_mutex);
- up_write(&dqopt->dqptr_sem);
}
mutex_unlock(&dqopt->dqonoff_mutex);
out_fmt:
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
new file mode 100644
index 00000000000..2663ed90fb0
--- /dev/null
+++ b/fs/quota/netlink.c
@@ -0,0 +1,95 @@
+
+#include <linux/cred.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/quotaops.h>
+#include <linux/sched.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+/* Netlink family structure for quota */
+static struct genl_family quota_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "VFS_DQUOT",
+ .version = 1,
+ .maxattr = QUOTA_NL_A_MAX,
+};
+
+/**
+ * quota_send_warning - Send warning to userspace about exceeded quota
+ * @type: The quota type: USRQQUOTA, GRPQUOTA,...
+ * @id: The user or group id of the quota that was exceeded
+ * @dev: The device on which the fs is mounted (sb->s_dev)
+ * @warntype: The type of the warning: QUOTA_NL_...
+ *
+ * This can be used by filesystems (including those which don't use
+ * dquot) to send a message to userspace relating to quota limits.
+ *
+ */
+
+void quota_send_warning(short type, unsigned int id, dev_t dev,
+ const char warntype)
+{
+ static atomic_t seq;
+ struct sk_buff *skb;
+ void *msg_head;
+ int ret;
+ int msg_size = 4 * nla_total_size(sizeof(u32)) +
+ 2 * nla_total_size(sizeof(u64));
+
+ /* We have to allocate using GFP_NOFS as we are called from a
+ * filesystem performing write and thus further recursion into
+ * the fs to free some data could cause deadlocks. */
+ skb = genlmsg_new(msg_size, GFP_NOFS);
+ if (!skb) {
+ printk(KERN_ERR
+ "VFS: Not enough memory to send quota warning.\n");
+ return;
+ }
+ msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+ &quota_genl_family, 0, QUOTA_NL_C_WARNING);
+ if (!msg_head) {
+ printk(KERN_ERR
+ "VFS: Cannot store netlink header in quota warning.\n");
+ goto err_out;
+ }
+ ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
+ if (ret)
+ goto attr_err_out;
+ genlmsg_end(skb, msg_head);
+
+ genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+ return;
+attr_err_out:
+ printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
+err_out:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL(quota_send_warning);
+
+static int __init quota_init(void)
+{
+ if (genl_register_family(&quota_genl_family) != 0)
+ printk(KERN_ERR
+ "VFS: Failed to create quota netlink interface.\n");
+ return 0;
+};
+
+module_init(quota_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ee91e275695..95388f9b735 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,7 +10,6 @@
#include <linux/slab.h>
#include <asm/current.h>
#include <asm/uaccess.h>
-#include <linux/compat.h>
#include <linux/kernel.h>
#include <linux/security.h>
#include <linux/syscalls.h>
@@ -18,220 +17,205 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/types.h>
-#include <net/netlink.h>
-#include <net/genetlink.h>
+#include <linux/writeback.h>
-/* Check validity of generic quotactl commands */
-static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
- qid_t id)
+static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
+ qid_t id)
{
- if (type >= MAXQUOTAS)
- return -EINVAL;
- if (!sb && cmd != Q_SYNC)
- return -ENODEV;
- /* Is operation supported? */
- if (sb && !sb->s_qcop)
- return -ENOSYS;
-
switch (cmd) {
- case Q_GETFMT:
- break;
- case Q_QUOTAON:
- if (!sb->s_qcop->quota_on)
- return -ENOSYS;
- break;
- case Q_QUOTAOFF:
- if (!sb->s_qcop->quota_off)
- return -ENOSYS;
- break;
- case Q_SETINFO:
- if (!sb->s_qcop->set_info)
- return -ENOSYS;
- break;
- case Q_GETINFO:
- if (!sb->s_qcop->get_info)
- return -ENOSYS;
- break;
- case Q_SETQUOTA:
- if (!sb->s_qcop->set_dqblk)
- return -ENOSYS;
- break;
- case Q_GETQUOTA:
- if (!sb->s_qcop->get_dqblk)
- return -ENOSYS;
- break;
- case Q_SYNC:
- if (sb && !sb->s_qcop->quota_sync)
- return -ENOSYS;
+ /* these commands do not require any special privilegues */
+ case Q_GETFMT:
+ case Q_SYNC:
+ case Q_GETINFO:
+ case Q_XGETQSTAT:
+ case Q_XQUOTASYNC:
+ break;
+ /* allow to query information for dquots we "own" */
+ case Q_GETQUOTA:
+ case Q_XGETQUOTA:
+ if ((type == USRQUOTA && current_euid() == id) ||
+ (type == GRPQUOTA && in_egroup_p(id)))
break;
- default:
- return -EINVAL;
+ /*FALLTHROUGH*/
+ default:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
}
- /* Is quota turned on for commands which need it? */
- switch (cmd) {
- case Q_GETFMT:
- case Q_GETINFO:
- case Q_SETINFO:
- case Q_SETQUOTA:
- case Q_GETQUOTA:
- /* This is just an informative test so we are satisfied
- * without the lock */
- if (!sb_has_quota_active(sb, type))
- return -ESRCH;
- }
+ return security_quotactl(cmd, type, id, sb);
+}
- /* Check privileges */
- if (cmd == Q_GETQUOTA) {
- if (((type == USRQUOTA && current_euid() != id) ||
- (type == GRPQUOTA && !in_egroup_p(id))) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
+static int quota_sync_all(int type)
+{
+ struct super_block *sb;
+ int ret;
+
+ if (type >= MAXQUOTAS)
+ return -EINVAL;
+ ret = security_quotactl(Q_SYNC, type, 0, NULL);
+ if (ret)
+ return ret;
+
+ spin_lock(&sb_lock);
+restart:
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (!sb->s_qcop || !sb->s_qcop->quota_sync)
+ continue;
+
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ down_read(&sb->s_umount);
+ if (sb->s_root)
+ sb->s_qcop->quota_sync(sb, type, 1);
+ up_read(&sb->s_umount);
+ spin_lock(&sb_lock);
+ if (__put_super_and_need_restart(sb))
+ goto restart;
}
- else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO)
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ spin_unlock(&sb_lock);
return 0;
}
-/* Check validity of XFS Quota Manager commands */
-static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd,
- qid_t id)
+static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
+ void __user *addr)
{
- if (type >= XQM_MAXQUOTAS)
- return -EINVAL;
- if (!sb)
- return -ENODEV;
- if (!sb->s_qcop)
- return -ENOSYS;
+ char *pathname;
+ int ret = -ENOSYS;
+
+ pathname = getname(addr);
+ if (IS_ERR(pathname))
+ return PTR_ERR(pathname);
+ if (sb->s_qcop->quota_on)
+ ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
+ putname(pathname);
+ return ret;
+}
- switch (cmd) {
- case Q_XQUOTAON:
- case Q_XQUOTAOFF:
- case Q_XQUOTARM:
- if (!sb->s_qcop->set_xstate)
- return -ENOSYS;
- break;
- case Q_XGETQSTAT:
- if (!sb->s_qcop->get_xstate)
- return -ENOSYS;
- break;
- case Q_XSETQLIM:
- if (!sb->s_qcop->set_xquota)
- return -ENOSYS;
- break;
- case Q_XGETQUOTA:
- if (!sb->s_qcop->get_xquota)
- return -ENOSYS;
- break;
- case Q_XQUOTASYNC:
- if (!sb->s_qcop->quota_sync)
- return -ENOSYS;
- break;
- default:
- return -EINVAL;
- }
+static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
+{
+ __u32 fmt;
- /* Check privileges */
- if (cmd == Q_XGETQUOTA) {
- if (((type == XQM_USRQUOTA && current_euid() != id) ||
- (type == XQM_GRPQUOTA && !in_egroup_p(id))) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
- } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ down_read(&sb_dqopt(sb)->dqptr_sem);
+ if (!sb_has_quota_active(sb, type)) {
+ up_read(&sb_dqopt(sb)->dqptr_sem);
+ return -ESRCH;
}
+ fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
+ up_read(&sb_dqopt(sb)->dqptr_sem);
+ if (copy_to_user(addr, &fmt, sizeof(fmt)))
+ return -EFAULT;
+ return 0;
+}
+static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
+{
+ struct if_dqinfo info;
+ int ret;
+
+ if (!sb_has_quota_active(sb, type))
+ return -ESRCH;
+ if (!sb->s_qcop->get_info)
+ return -ENOSYS;
+ ret = sb->s_qcop->get_info(sb, type, &info);
+ if (!ret && copy_to_user(addr, &info, sizeof(info)))
+ return -EFAULT;
+ return ret;
+}
+
+static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
+{
+ struct if_dqinfo info;
+
+ if (copy_from_user(&info, addr, sizeof(info)))
+ return -EFAULT;
+ if (!sb_has_quota_active(sb, type))
+ return -ESRCH;
+ if (!sb->s_qcop->set_info)
+ return -ENOSYS;
+ return sb->s_qcop->set_info(sb, type, &info);
+}
+
+static int quota_getquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
+{
+ struct if_dqblk idq;
+ int ret;
+
+ if (!sb_has_quota_active(sb, type))
+ return -ESRCH;
+ if (!sb->s_qcop->get_dqblk)
+ return -ENOSYS;
+ ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
+ if (ret)
+ return ret;
+ if (copy_to_user(addr, &idq, sizeof(idq)))
+ return -EFAULT;
return 0;
}
-static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
- qid_t id)
+static int quota_setquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
{
- int error;
-
- if (XQM_COMMAND(cmd))
- error = xqm_quotactl_valid(sb, type, cmd, id);
- else
- error = generic_quotactl_valid(sb, type, cmd, id);
- if (!error)
- error = security_quotactl(cmd, type, id, sb);
- return error;
+ struct if_dqblk idq;
+
+ if (copy_from_user(&idq, addr, sizeof(idq)))
+ return -EFAULT;
+ if (!sb_has_quota_active(sb, type))
+ return -ESRCH;
+ if (!sb->s_qcop->set_dqblk)
+ return -ENOSYS;
+ return sb->s_qcop->set_dqblk(sb, type, id, &idq);
}
-#ifdef CONFIG_QUOTA
-void sync_quota_sb(struct super_block *sb, int type)
+static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
{
- int cnt;
+ __u32 flags;
- if (!sb->s_qcop->quota_sync)
- return;
+ if (copy_from_user(&flags, addr, sizeof(flags)))
+ return -EFAULT;
+ if (!sb->s_qcop->set_xstate)
+ return -ENOSYS;
+ return sb->s_qcop->set_xstate(sb, flags, cmd);
+}
- sb->s_qcop->quota_sync(sb, type);
+static int quota_getxstate(struct super_block *sb, void __user *addr)
+{
+ struct fs_quota_stat fqs;
+ int ret;
- if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
- return;
- /* This is not very clever (and fast) but currently I don't know about
- * any other simple way of getting quota data to disk and we must get
- * them there for userspace to be visible... */
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, 1);
- sync_blockdev(sb->s_bdev);
+ if (!sb->s_qcop->get_xstate)
+ return -ENOSYS;
+ ret = sb->s_qcop->get_xstate(sb, &fqs);
+ if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
+ return -EFAULT;
+ return ret;
+}
- /*
- * Now when everything is written we can discard the pagecache so
- * that userspace sees the changes.
- */
- mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (type != -1 && cnt != type)
- continue;
- if (!sb_has_quota_active(sb, cnt))
- continue;
- mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
- I_MUTEX_QUOTA);
- truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
- mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
- }
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+static int quota_setxquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
+{
+ struct fs_disk_quota fdq;
+
+ if (copy_from_user(&fdq, addr, sizeof(fdq)))
+ return -EFAULT;
+ if (!sb->s_qcop->set_xquota)
+ return -ENOSYS;
+ return sb->s_qcop->set_xquota(sb, type, id, &fdq);
}
-#endif
-static void sync_dquots(int type)
+static int quota_getxquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
{
- struct super_block *sb;
- int cnt;
+ struct fs_disk_quota fdq;
+ int ret;
- spin_lock(&sb_lock);
-restart:
- list_for_each_entry(sb, &super_blocks, s_list) {
- /* This test just improves performance so it needn't be
- * reliable... */
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (type != -1 && type != cnt)
- continue;
- if (!sb_has_quota_active(sb, cnt))
- continue;
- if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
- list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
- continue;
- break;
- }
- if (cnt == MAXQUOTAS)
- continue;
- sb->s_count++;
- spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- if (sb->s_root)
- sync_quota_sb(sb, type);
- up_read(&sb->s_umount);
- spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
- }
- spin_unlock(&sb_lock);
+ if (!sb->s_qcop->get_xquota)
+ return -ENOSYS;
+ ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
+ if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
+ return -EFAULT;
+ return ret;
}
/* Copy parameters and call proper function */
@@ -240,117 +224,55 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
{
int ret;
+ if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS))
+ return -EINVAL;
+ if (!sb->s_qcop)
+ return -ENOSYS;
+
+ ret = check_quotactl_permission(sb, type, cmd, id);
+ if (ret < 0)
+ return ret;
+
switch (cmd) {
- case Q_QUOTAON: {
- char *pathname;
-
- pathname = getname(addr);
- if (IS_ERR(pathname))
- return PTR_ERR(pathname);
- ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
- putname(pathname);
- return ret;
- }
- case Q_QUOTAOFF:
- return sb->s_qcop->quota_off(sb, type, 0);
-
- case Q_GETFMT: {
- __u32 fmt;
-
- down_read(&sb_dqopt(sb)->dqptr_sem);
- if (!sb_has_quota_active(sb, type)) {
- up_read(&sb_dqopt(sb)->dqptr_sem);
- return -ESRCH;
- }
- fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
- up_read(&sb_dqopt(sb)->dqptr_sem);
- if (copy_to_user(addr, &fmt, sizeof(fmt)))
- return -EFAULT;
- return 0;
- }
- case Q_GETINFO: {
- struct if_dqinfo info;
-
- ret = sb->s_qcop->get_info(sb, type, &info);
- if (ret)
- return ret;
- if (copy_to_user(addr, &info, sizeof(info)))
- return -EFAULT;
- return 0;
- }
- case Q_SETINFO: {
- struct if_dqinfo info;
-
- if (copy_from_user(&info, addr, sizeof(info)))
- return -EFAULT;
- return sb->s_qcop->set_info(sb, type, &info);
- }
- case Q_GETQUOTA: {
- struct if_dqblk idq;
-
- ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
- if (ret)
- return ret;
- if (copy_to_user(addr, &idq, sizeof(idq)))
- return -EFAULT;
- return 0;
- }
- case Q_SETQUOTA: {
- struct if_dqblk idq;
-
- if (copy_from_user(&idq, addr, sizeof(idq)))
- return -EFAULT;
- return sb->s_qcop->set_dqblk(sb, type, id, &idq);
- }
- case Q_SYNC:
- if (sb)
- sync_quota_sb(sb, type);
- else
- sync_dquots(type);
- return 0;
-
- case Q_XQUOTAON:
- case Q_XQUOTAOFF:
- case Q_XQUOTARM: {
- __u32 flags;
-
- if (copy_from_user(&flags, addr, sizeof(flags)))
- return -EFAULT;
- return sb->s_qcop->set_xstate(sb, flags, cmd);
- }
- case Q_XGETQSTAT: {
- struct fs_quota_stat fqs;
-
- if ((ret = sb->s_qcop->get_xstate(sb, &fqs)))
- return ret;
- if (copy_to_user(addr, &fqs, sizeof(fqs)))
- return -EFAULT;
- return 0;
- }
- case Q_XSETQLIM: {
- struct fs_disk_quota fdq;
-
- if (copy_from_user(&fdq, addr, sizeof(fdq)))
- return -EFAULT;
- return sb->s_qcop->set_xquota(sb, type, id, &fdq);
- }
- case Q_XGETQUOTA: {
- struct fs_disk_quota fdq;
-
- ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
- if (ret)
- return ret;
- if (copy_to_user(addr, &fdq, sizeof(fdq)))
- return -EFAULT;
- return 0;
- }
- case Q_XQUOTASYNC:
- return sb->s_qcop->quota_sync(sb, type);
- /* We never reach here unless validity check is broken */
- default:
- BUG();
+ case Q_QUOTAON:
+ return quota_quotaon(sb, type, cmd, id, addr);
+ case Q_QUOTAOFF:
+ if (!sb->s_qcop->quota_off)
+ return -ENOSYS;
+ return sb->s_qcop->quota_off(sb, type, 0);
+ case Q_GETFMT:
+ return quota_getfmt(sb, type, addr);
+ case Q_GETINFO:
+ return quota_getinfo(sb, type, addr);
+ case Q_SETINFO:
+ return quota_setinfo(sb, type, addr);
+ case Q_GETQUOTA:
+ return quota_getquota(sb, type, id, addr);
+ case Q_SETQUOTA:
+ return quota_setquota(sb, type, id, addr);
+ case Q_SYNC:
+ if (!sb->s_qcop->quota_sync)
+ return -ENOSYS;
+ return sb->s_qcop->quota_sync(sb, type, 1);
+ case Q_XQUOTAON:
+ case Q_XQUOTAOFF:
+ case Q_XQUOTARM:
+ return quota_setxstate(sb, cmd, addr);
+ case Q_XGETQSTAT:
+ return quota_getxstate(sb, addr);
+ case Q_XSETQLIM:
+ return quota_setxquota(sb, type, id, addr);
+ case Q_XGETQUOTA:
+ return quota_getxquota(sb, type, id, addr);
+ case Q_XQUOTASYNC:
+ /* caller already holds s_umount */
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+ writeback_inodes_sb(sb);
+ return 0;
+ default:
+ return -EINVAL;
}
- return 0;
}
/*
@@ -397,224 +319,23 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
cmds = cmd >> SUBCMDSHIFT;
type = cmd & SUBCMDMASK;
- if (cmds != Q_SYNC || special) {
- sb = quotactl_block(special);
- if (IS_ERR(sb))
- return PTR_ERR(sb);
+ /*
+ * As a special case Q_SYNC can be called without a specific device.
+ * It will iterate all superblocks that have quota enabled and call
+ * the sync action on each of them.
+ */
+ if (!special) {
+ if (cmds == Q_SYNC)
+ return quota_sync_all(type);
+ return -ENODEV;
}
- ret = check_quotactl_valid(sb, type, cmds, id);
- if (ret >= 0)
- ret = do_quotactl(sb, type, cmds, id, addr);
- if (sb)
- drop_super(sb);
+ sb = quotactl_block(special);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
- return ret;
-}
-
-#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT)
-/*
- * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
- * and is necessary due to alignment problems.
- */
-struct compat_if_dqblk {
- compat_u64 dqb_bhardlimit;
- compat_u64 dqb_bsoftlimit;
- compat_u64 dqb_curspace;
- compat_u64 dqb_ihardlimit;
- compat_u64 dqb_isoftlimit;
- compat_u64 dqb_curinodes;
- compat_u64 dqb_btime;
- compat_u64 dqb_itime;
- compat_uint_t dqb_valid;
-};
-
-/* XFS structures */
-struct compat_fs_qfilestat {
- compat_u64 dqb_bhardlimit;
- compat_u64 qfs_nblks;
- compat_uint_t qfs_nextents;
-};
-
-struct compat_fs_quota_stat {
- __s8 qs_version;
- __u16 qs_flags;
- __s8 qs_pad;
- struct compat_fs_qfilestat qs_uquota;
- struct compat_fs_qfilestat qs_gquota;
- compat_uint_t qs_incoredqs;
- compat_int_t qs_btimelimit;
- compat_int_t qs_itimelimit;
- compat_int_t qs_rtbtimelimit;
- __u16 qs_bwarnlimit;
- __u16 qs_iwarnlimit;
-};
-
-asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
- qid_t id, void __user *addr)
-{
- unsigned int cmds;
- struct if_dqblk __user *dqblk;
- struct compat_if_dqblk __user *compat_dqblk;
- struct fs_quota_stat __user *fsqstat;
- struct compat_fs_quota_stat __user *compat_fsqstat;
- compat_uint_t data;
- u16 xdata;
- long ret;
+ ret = do_quotactl(sb, type, cmds, id, addr);
- cmds = cmd >> SUBCMDSHIFT;
-
- switch (cmds) {
- case Q_GETQUOTA:
- dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
- compat_dqblk = addr;
- ret = sys_quotactl(cmd, special, id, dqblk);
- if (ret)
- break;
- if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
- get_user(data, &dqblk->dqb_valid) ||
- put_user(data, &compat_dqblk->dqb_valid))
- ret = -EFAULT;
- break;
- case Q_SETQUOTA:
- dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
- compat_dqblk = addr;
- ret = -EFAULT;
- if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
- get_user(data, &compat_dqblk->dqb_valid) ||
- put_user(data, &dqblk->dqb_valid))
- break;
- ret = sys_quotactl(cmd, special, id, dqblk);
- break;
- case Q_XGETQSTAT:
- fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
- compat_fsqstat = addr;
- ret = sys_quotactl(cmd, special, id, fsqstat);
- if (ret)
- break;
- ret = -EFAULT;
- /* Copying qs_version, qs_flags, qs_pad */
- if (copy_in_user(compat_fsqstat, fsqstat,
- offsetof(struct compat_fs_quota_stat, qs_uquota)))
- break;
- /* Copying qs_uquota */
- if (copy_in_user(&compat_fsqstat->qs_uquota,
- &fsqstat->qs_uquota,
- sizeof(compat_fsqstat->qs_uquota)) ||
- get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
- put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
- break;
- /* Copying qs_gquota */
- if (copy_in_user(&compat_fsqstat->qs_gquota,
- &fsqstat->qs_gquota,
- sizeof(compat_fsqstat->qs_gquota)) ||
- get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
- put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
- break;
- /* Copying the rest */
- if (copy_in_user(&compat_fsqstat->qs_incoredqs,
- &fsqstat->qs_incoredqs,
- sizeof(struct compat_fs_quota_stat) -
- offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
- get_user(xdata, &fsqstat->qs_iwarnlimit) ||
- put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
- break;
- ret = 0;
- break;
- default:
- ret = sys_quotactl(cmd, special, id, addr);
- }
+ drop_super(sb);
return ret;
}
-#endif
-
-
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-
-/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = "VFS_DQUOT",
- .version = 1,
- .maxattr = QUOTA_NL_A_MAX,
-};
-
-/**
- * quota_send_warning - Send warning to userspace about exceeded quota
- * @type: The quota type: USRQQUOTA, GRPQUOTA,...
- * @id: The user or group id of the quota that was exceeded
- * @dev: The device on which the fs is mounted (sb->s_dev)
- * @warntype: The type of the warning: QUOTA_NL_...
- *
- * This can be used by filesystems (including those which don't use
- * dquot) to send a message to userspace relating to quota limits.
- *
- */
-
-void quota_send_warning(short type, unsigned int id, dev_t dev,
- const char warntype)
-{
- static atomic_t seq;
- struct sk_buff *skb;
- void *msg_head;
- int ret;
- int msg_size = 4 * nla_total_size(sizeof(u32)) +
- 2 * nla_total_size(sizeof(u64));
-
- /* We have to allocate using GFP_NOFS as we are called from a
- * filesystem performing write and thus further recursion into
- * the fs to free some data could cause deadlocks. */
- skb = genlmsg_new(msg_size, GFP_NOFS);
- if (!skb) {
- printk(KERN_ERR
- "VFS: Not enough memory to send quota warning.\n");
- return;
- }
- msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
- &quota_genl_family, 0, QUOTA_NL_C_WARNING);
- if (!msg_head) {
- printk(KERN_ERR
- "VFS: Cannot store netlink header in quota warning.\n");
- goto err_out;
- }
- ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
- if (ret)
- goto attr_err_out;
- ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
- if (ret)
- goto attr_err_out;
- genlmsg_end(skb, msg_head);
-
- genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
- return;
-attr_err_out:
- printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
-err_out:
- kfree_skb(skb);
-}
-EXPORT_SYMBOL(quota_send_warning);
-
-static int __init quota_init(void)
-{
- if (genl_register_family(&quota_genl_family) != 0)
- printk(KERN_ERR
- "VFS: Failed to create quota netlink interface.\n");
- return 0;
-};
-
-module_init(quota_init);
-#endif
-
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 65c87276117..483442e66ed 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -169,7 +169,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
return 0; // No free blocks in this bitmap
}
- /* search for a first zero bit -- beggining of a window */
+ /* search for a first zero bit -- beginning of a window */
*beg = reiserfs_find_next_zero_le_bit
((unsigned long *)(bh->b_data), boundary, *beg);
@@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
journal_mark_dirty(th, s, sbh);
if (for_unformatted)
- vfs_dq_free_block_nodirty(inode, 1);
+ dquot_free_block_nodirty(inode, 1);
}
void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
amount_needed, hint->inode->i_uid);
#endif
quota_ret =
- vfs_dq_alloc_block_nodirty(hint->inode, amount_needed);
+ dquot_alloc_block_nodirty(hint->inode, amount_needed);
if (quota_ret) /* Quota exceeded? */
return QUOTA_EXCEEDED;
if (hint->preallocate && hint->prealloc_size) {
@@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
"reiserquota: allocating (prealloc) %d blocks id=%u",
hint->prealloc_size, hint->inode->i_uid);
#endif
- quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode,
+ quota_ret = dquot_prealloc_block_nodirty(hint->inode,
hint->prealloc_size);
if (quota_ret)
hint->preallocate = hint->prealloc_size = 0;
@@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
hint->inode->i_uid);
#endif
/* Free not allocated blocks */
- vfs_dq_free_block_nodirty(hint->inode,
+ dquot_free_block_nodirty(hint->inode,
amount_needed + hint->prealloc_size -
nr_allocated);
}
@@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
REISERFS_I(hint->inode)->i_prealloc_count,
hint->inode->i_uid);
#endif
- vfs_dq_free_block_nodirty(hint->inode, amount_needed +
+ dquot_free_block_nodirty(hint->inode, amount_needed +
hint->prealloc_size - nr_allocated -
REISERFS_I(hint->inode)->
i_prealloc_count);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index da2dba082e2..1d9c12714c5 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -289,7 +289,7 @@ const struct file_operations reiserfs_file_operations = {
.compat_ioctl = reiserfs_compat_ioctl,
#endif
.mmap = reiserfs_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.release = reiserfs_file_release,
.fsync = reiserfs_sync_file,
.aio_read = generic_file_aio_read,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 2df0f5c7c60..d1da94b82d8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -34,6 +34,9 @@ void reiserfs_delete_inode(struct inode *inode)
int depth;
int err;
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
truncate_inode_pages(&inode->i_data, 0);
depth = reiserfs_write_lock_once(inode->i_sb);
@@ -54,7 +57,7 @@ void reiserfs_delete_inode(struct inode *inode)
* after delete_object so that quota updates go into the same transaction as
* stat data deletion */
if (!err)
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
if (journal_end(&th, inode->i_sb, jbegin_count))
goto out;
@@ -1615,7 +1618,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
** to properly mark inodes for datasync and such, but only actually
** does something when called for a synchronous update.
*/
-int reiserfs_write_inode(struct inode *inode, int do_sync)
+int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct reiserfs_transaction_handle th;
int jbegin_count = 1;
@@ -1627,7 +1630,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync)
** inode needs to reach disk for safety, and they can safely be
** ignored because the altered inode has already been logged.
*/
- if (do_sync && !(current->flags & PF_MEMALLOC)) {
+ if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
reiserfs_write_lock(inode->i_sb);
if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
reiserfs_update_sd(&th, inode);
@@ -1765,10 +1768,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
BUG_ON(!th->t_trans_id);
- if (vfs_dq_alloc_inode(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto out_end_trans;
- }
if (!dir->i_nlink) {
err = -EPERM;
goto out_bad_inode;
@@ -1959,12 +1962,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
INODE_PKEY(inode)->k_objectid = 0;
/* Quota change must be inside a transaction for journaling */
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
out_end_trans:
journal_end(th, th->t_super, th->t_blocks_allocated);
/* Drop can be outside and it needs more credits so it's better to have it outside */
- vfs_dq_drop(inode);
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
make_bad_inode(inode);
@@ -3073,6 +3076,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
depth = reiserfs_write_lock_once(inode->i_sb);
if (attr->ia_valid & ATTR_SIZE) {
+ dquot_initialize(inode);
+
/* version 2 items will be caught by the s_maxbytes check
** done for us in vmtruncate
*/
@@ -3134,8 +3139,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
jbegin_count);
if (error)
goto out;
- error =
- vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
journal_end(&th, inode->i_sb,
jbegin_count);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 9d4dcf0b07c..96e4cbbfaa1 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
*/
static int drop_new_inode(struct inode *inode)
{
- vfs_dq_drop(inode);
+ dquot_drop(inode);
make_bad_inode(inode);
inode->i_flags |= S_NOQUOTA;
iput(inode);
@@ -554,7 +554,7 @@ static int drop_new_inode(struct inode *inode)
}
/* utility function that does setup for reiserfs_new_inode.
-** vfs_dq_init needs lots of credits so it's better to have it
+** dquot_initialize needs lots of credits so it's better to have it
** outside of a transaction, so we had to pull some bits of
** reiserfs_new_inode out into this func.
*/
@@ -577,7 +577,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
} else {
inode->i_gid = current_fsgid();
}
- vfs_dq_init(inode);
+ dquot_initialize(inode);
return 0;
}
@@ -594,6 +594,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
struct reiserfs_transaction_handle th;
struct reiserfs_security_handle security;
+ dquot_initialize(dir);
+
if (!(inode = new_inode(dir->i_sb))) {
return -ENOMEM;
}
@@ -666,6 +668,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
if (!new_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
if (!(inode = new_inode(dir->i_sb))) {
return -ENOMEM;
}
@@ -739,6 +743,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
+ dquot_initialize(dir);
+
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
/* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */
REISERFS_I(dir)->new_packing_locality = 1;
@@ -842,6 +848,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
+ dquot_initialize(dir);
+
reiserfs_write_lock(dir->i_sb);
retval = journal_begin(&th, dir->i_sb, jbegin_count);
if (retval)
@@ -923,6 +931,8 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
unsigned long savelink;
int depth;
+ dquot_initialize(dir);
+
inode = dentry->d_inode;
/* in this transaction we can be doing at max two balancings and update
@@ -1024,6 +1034,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
+ dquot_initialize(parent_dir);
+
if (!(inode = new_inode(parent_dir->i_sb))) {
return -ENOMEM;
}
@@ -1111,6 +1123,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
JOURNAL_PER_BALANCE_CNT * 3 +
2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
+ dquot_initialize(dir);
+
reiserfs_write_lock(dir->i_sb);
if (inode->i_nlink >= REISERFS_LINK_MAX) {
//FIXME: sd_nlink is 32 bit for new files
@@ -1235,6 +1249,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
JOURNAL_PER_BALANCE_CNT * 3 + 5 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_inode = old_dentry->d_inode;
new_dentry_inode = new_dentry->d_inode;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 5fa7118f04e..313d39d639e 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1299,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
"reiserquota delete_item(): freeing %u, id=%u type=%c",
quota_cut_bytes, inode->i_uid, head2type(&s_ih));
#endif
- vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
+ dquot_free_space_nodirty(inode, quota_cut_bytes);
/* Return deleted body length */
return ret_value;
@@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
quota_cut_bytes, inode->i_uid,
key2type(key));
#endif
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
quota_cut_bytes);
}
break;
@@ -1733,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
"reiserquota cut_from_item(): freeing %u id=%u type=%c",
quota_cut_bytes, inode->i_uid, '?');
#endif
- vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
+ dquot_free_space_nodirty(inode, quota_cut_bytes);
return ret_value;
}
@@ -1968,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
key2type(&(key->on_disk_key)));
#endif
- if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) {
+ retval = dquot_alloc_space_nodirty(inode, pasted_size);
+ if (retval) {
pathrelse(search_path);
- return -EDQUOT;
+ return retval;
}
init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
pasted_size);
@@ -2024,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
pasted_size, inode->i_uid,
key2type(&(key->on_disk_key)));
#endif
- vfs_dq_free_space_nodirty(inode, pasted_size);
+ dquot_free_space_nodirty(inode, pasted_size);
return retval;
}
@@ -2062,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
#endif
/* We can't dirty inode here. It would be immediately written but
* appropriate stat item isn't inserted yet... */
- if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) {
+ retval = dquot_alloc_space_nodirty(inode, quota_bytes);
+ if (retval) {
pathrelse(path);
- return -EDQUOT;
+ return retval;
}
}
init_tb_struct(th, &s_ins_balance, th->t_super, path,
@@ -2113,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
quota_bytes, inode->i_uid, head2type(ih));
#endif
if (inode)
- vfs_dq_free_space_nodirty(inode, quota_bytes);
+ dquot_free_space_nodirty(inode, quota_bytes);
return retval;
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b4a7dd03bdb..04bf5d791bd 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -246,7 +246,7 @@ static int finish_unfinished(struct super_block *s)
retval = remove_save_link_only(s, &save_link_key, 0);
continue;
}
- vfs_dq_init(inode);
+ dquot_initialize(inode);
if (truncate && S_ISDIR(inode->i_mode)) {
/* We got a truncate request for a dir which is impossible.
@@ -578,6 +578,11 @@ out:
reiserfs_write_unlock_once(inode->i_sb, lock_depth);
}
+static void reiserfs_clear_inode(struct inode *inode)
+{
+ dquot_drop(inode);
+}
+
#ifdef CONFIG_QUOTA
static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
size_t, loff_t);
@@ -590,6 +595,7 @@ static const struct super_operations reiserfs_sops = {
.destroy_inode = reiserfs_destroy_inode,
.write_inode = reiserfs_write_inode,
.dirty_inode = reiserfs_dirty_inode,
+ .clear_inode = reiserfs_clear_inode,
.delete_inode = reiserfs_delete_inode,
.put_super = reiserfs_put_super,
.write_super = reiserfs_write_super,
@@ -616,13 +622,6 @@ static int reiserfs_write_info(struct super_block *, int);
static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
static const struct dquot_operations reiserfs_quota_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
.write_dquot = reiserfs_write_dquot,
.acquire_dquot = reiserfs_acquire_dquot,
.release_dquot = reiserfs_release_dquot,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 81f09fab8ae..37d034ca7d9 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -61,7 +61,6 @@
static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
{
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- vfs_dq_init(dir);
return dir->i_op->create(dir, dentry, mode, NULL);
}
#endif
@@ -69,7 +68,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- vfs_dq_init(dir);
return dir->i_op->mkdir(dir, dentry, mode);
}
@@ -81,7 +79,6 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
{
int error;
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- vfs_dq_init(dir);
reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
I_MUTEX_CHILD, dir->i_sb);
@@ -97,7 +94,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
{
int error;
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- vfs_dq_init(dir);
reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
I_MUTEX_CHILD, dir->i_sb);
diff --git a/fs/select.c b/fs/select.c
index fd38ce2e32e..500a669f779 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -691,6 +691,23 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
}
#endif /* HAVE_SET_RESTORE_SIGMASK */
+#ifdef __ARCH_WANT_SYS_OLD_SELECT
+struct sel_arg_struct {
+ unsigned long n;
+ fd_set __user *inp, *outp, *exp;
+ struct timeval __user *tvp;
+};
+
+SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
+{
+ struct sel_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+}
+#endif
+
struct poll_list {
struct poll_list *next;
int len;
@@ -821,7 +838,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct poll_list *walk = head;
unsigned long todo = nfds;
- if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ if (nfds > rlimit(RLIMIT_NOFILE))
return -EINVAL;
len = min_t(unsigned int, nfds, N_STACK_PPS);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5afd554efad..e1f437be6c3 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -734,7 +734,7 @@ EXPORT_SYMBOL(seq_hlist_start_head);
* seq_hlist_next - move to the next position of the hlist
* @v: the current iterator
* @head: the head of the hlist
- * @pos: the current posision
+ * @ppos: the current position
*
* Called at seq_file->op->next().
*/
@@ -800,7 +800,7 @@ EXPORT_SYMBOL(seq_hlist_start_head_rcu);
* seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
* @v: the current iterator
* @head: the head of the hlist
- * @pos: the current posision
+ * @ppos: the current position
*
* Called at seq_file->op->next().
*
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 70e3244fa30..df8a19ef870 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,4 +4,4 @@
obj-$(CONFIG_SQUASHFS) += squashfs.o
squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o
+squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2a796031034..1cb0d81b164 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -29,15 +29,14 @@
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/slab.h>
-#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
-#include <linux/zlib.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+#include "decompressor.h"
/*
* Read the metadata block length, this is stored in the first two
@@ -153,72 +152,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
}
if (compressed) {
- int zlib_err = 0, zlib_init = 0;
-
- /*
- * Uncompress block.
- */
-
- mutex_lock(&msblk->read_data_mutex);
-
- msblk->stream.avail_out = 0;
- msblk->stream.avail_in = 0;
-
- bytes = length;
- do {
- if (msblk->stream.avail_in == 0 && k < b) {
- avail = min(bytes, msblk->devblksize - offset);
- bytes -= avail;
- wait_on_buffer(bh[k]);
- if (!buffer_uptodate(bh[k]))
- goto release_mutex;
-
- if (avail == 0) {
- offset = 0;
- put_bh(bh[k++]);
- continue;
- }
-
- msblk->stream.next_in = bh[k]->b_data + offset;
- msblk->stream.avail_in = avail;
- offset = 0;
- }
-
- if (msblk->stream.avail_out == 0 && page < pages) {
- msblk->stream.next_out = buffer[page++];
- msblk->stream.avail_out = PAGE_CACHE_SIZE;
- }
-
- if (!zlib_init) {
- zlib_err = zlib_inflateInit(&msblk->stream);
- if (zlib_err != Z_OK) {
- ERROR("zlib_inflateInit returned"
- " unexpected result 0x%x,"
- " srclength %d\n", zlib_err,
- srclength);
- goto release_mutex;
- }
- zlib_init = 1;
- }
-
- zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
-
- if (msblk->stream.avail_in == 0 && k < b)
- put_bh(bh[k++]);
- } while (zlib_err == Z_OK);
-
- if (zlib_err != Z_STREAM_END) {
- ERROR("zlib_inflate error, data probably corrupt\n");
- goto release_mutex;
- }
-
- zlib_err = zlib_inflateEnd(&msblk->stream);
- if (zlib_err != Z_OK) {
- ERROR("zlib_inflate error, data probably corrupt\n");
- goto release_mutex;
- }
- length = msblk->stream.total_out;
- mutex_unlock(&msblk->read_data_mutex);
+ length = squashfs_decompress(msblk, buffer, bh, b, offset,
+ length, srclength, pages);
+ if (length < 0)
+ goto read_failure;
} else {
/*
* Block is uncompressed.
@@ -255,9 +192,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
kfree(bh);
return length;
-release_mutex:
- mutex_unlock(&msblk->read_data_mutex);
-
block_release:
for (; k < b; k++)
put_bh(bh[k]);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 40c98fa6b5d..57314bee905 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -51,7 +51,6 @@
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
-#include <linux/zlib.h>
#include <linux/pagemap.h>
#include "squashfs_fs.h"
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
new file mode 100644
index 00000000000..157478da6ac
--- /dev/null
+++ b/fs/squashfs/decompressor.c
@@ -0,0 +1,68 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.c
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file (and decompressor.h) implements a decompressor framework for
+ * Squashfs, allowing multiple decompressors to be easily supported
+ */
+
+static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
+ NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
+};
+
+static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
+ NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
+};
+
+static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
+ NULL, NULL, NULL, 0, "unknown", 0
+};
+
+static const struct squashfs_decompressor *decompressor[] = {
+ &squashfs_zlib_comp_ops,
+ &squashfs_lzma_unsupported_comp_ops,
+ &squashfs_lzo_unsupported_comp_ops,
+ &squashfs_unknown_comp_ops
+};
+
+
+const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
+{
+ int i;
+
+ for (i = 0; decompressor[i]->id; i++)
+ if (id == decompressor[i]->id)
+ break;
+
+ return decompressor[i];
+}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
new file mode 100644
index 00000000000..7425f80783f
--- /dev/null
+++ b/fs/squashfs/decompressor.h
@@ -0,0 +1,55 @@
+#ifndef DECOMPRESSOR_H
+#define DECOMPRESSOR_H
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.h
+ */
+
+struct squashfs_decompressor {
+ void *(*init)(struct squashfs_sb_info *);
+ void (*free)(void *);
+ int (*decompress)(struct squashfs_sb_info *, void **,
+ struct buffer_head **, int, int, int, int, int);
+ int id;
+ char *name;
+ int supported;
+};
+
+static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
+{
+ return msblk->decompressor->init(msblk);
+}
+
+static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
+ void *s)
+{
+ if (msblk->decompressor)
+ msblk->decompressor->free(s);
+}
+
+static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
+ void **buffer, struct buffer_head **bh, int b, int offset, int length,
+ int srclength, int pages)
+{
+ return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
+ length, srclength, pages);
+}
+#endif
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 566b0eaed86..12b933ac658 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -30,7 +30,6 @@
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/slab.h>
-#include <linux/zlib.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 2b1b8fe5e03..7f93d5a9ee0 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -39,7 +39,6 @@
#include <linux/vfs.h>
#include <linux/dcache.h>
#include <linux/exportfs.h>
-#include <linux/zlib.h>
#include <linux/slab.h>
#include "squashfs_fs.h"
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 717767d831d..a25c5060bdc 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -47,7 +47,6 @@
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/mutex.h>
-#include <linux/zlib.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index b5a2c15bbbc..7c90bbd6879 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -36,7 +36,6 @@
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/slab.h>
-#include <linux/zlib.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 3795b837ba2..b7f64bcd2b7 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -34,7 +34,6 @@
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/slab.h>
-#include <linux/zlib.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 9101dbde39e..49daaf669e4 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,7 +40,6 @@
#include <linux/fs.h>
#include <linux/vfs.h>
-#include <linux/zlib.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 9e398653b22..5266bd8ad93 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,7 +57,6 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/dcache.h>
-#include <linux/zlib.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 0e9feb6adf7..fe2587af551 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
u64, int);
extern int squashfs_read_table(struct super_block *, void *, u64, int);
+/* decompressor.c */
+extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
+
/* export.c */
extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
unsigned int);
@@ -71,7 +74,7 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
extern int squashfs_read_inode(struct inode *, long long);
/*
- * Inodes and files operations
+ * Inodes, files and decompressor operations
*/
/* dir.c */
@@ -88,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops;
/* symlink.c */
extern const struct address_space_operations squashfs_symlink_aops;
+
+/* zlib_wrapper.c */
+extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 283daafc568..79024245ea0 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -183,8 +183,6 @@
#define SQUASHFS_MAX_FILE_SIZE (1LL << \
(SQUASHFS_MAX_FILE_SIZE_LOG - 2))
-#define SQUASHFS_MARKER_BYTE 0xff
-
/* meta index cache */
#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
#define SQUASHFS_META_ENTRIES 127
@@ -211,7 +209,9 @@ struct meta_index {
/*
* definitions for structures on disk
*/
-#define ZLIB_COMPRESSION 1
+#define ZLIB_COMPRESSION 1
+#define LZMA_COMPRESSION 2
+#define LZO_COMPRESSION 3
struct squashfs_super_block {
__le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index c8c65614dd1..2e77dc547e2 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -52,25 +52,25 @@ struct squashfs_cache_entry {
};
struct squashfs_sb_info {
- int devblksize;
- int devblksize_log2;
- struct squashfs_cache *block_cache;
- struct squashfs_cache *fragment_cache;
- struct squashfs_cache *read_page;
- int next_meta_index;
- __le64 *id_table;
- __le64 *fragment_index;
- unsigned int *fragment_index_2;
- struct mutex read_data_mutex;
- struct mutex meta_index_mutex;
- struct meta_index *meta_index;
- z_stream stream;
- __le64 *inode_lookup_table;
- u64 inode_table;
- u64 directory_table;
- unsigned int block_size;
- unsigned short block_log;
- long long bytes_used;
- unsigned int inodes;
+ const struct squashfs_decompressor *decompressor;
+ int devblksize;
+ int devblksize_log2;
+ struct squashfs_cache *block_cache;
+ struct squashfs_cache *fragment_cache;
+ struct squashfs_cache *read_page;
+ int next_meta_index;
+ __le64 *id_table;
+ __le64 *fragment_index;
+ struct mutex read_data_mutex;
+ struct mutex meta_index_mutex;
+ struct meta_index *meta_index;
+ void *stream;
+ __le64 *inode_lookup_table;
+ u64 inode_table;
+ u64 directory_table;
+ unsigned int block_size;
+ unsigned short block_log;
+ long long bytes_used;
+ unsigned int inodes;
};
#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6c197ef53ad..3550aec2f65 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,34 +35,41 @@
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/zlib.h>
#include <linux/magic.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+#include "decompressor.h"
static struct file_system_type squashfs_fs_type;
static const struct super_operations squashfs_super_ops;
-static int supported_squashfs_filesystem(short major, short minor, short comp)
+static const struct squashfs_decompressor *supported_squashfs_filesystem(short
+ major, short minor, short id)
{
+ const struct squashfs_decompressor *decompressor;
+
if (major < SQUASHFS_MAJOR) {
ERROR("Major/Minor mismatch, older Squashfs %d.%d "
"filesystems are unsupported\n", major, minor);
- return -EINVAL;
+ return NULL;
} else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
ERROR("Major/Minor mismatch, trying to mount newer "
"%d.%d filesystem\n", major, minor);
ERROR("Please update your kernel\n");
- return -EINVAL;
+ return NULL;
}
- if (comp != ZLIB_COMPRESSION)
- return -EINVAL;
+ decompressor = squashfs_lookup_decompressor(id);
+ if (!decompressor->supported) {
+ ERROR("Filesystem uses \"%s\" compression. This is not "
+ "supported\n", decompressor->name);
+ return NULL;
+ }
- return 0;
+ return decompressor;
}
@@ -87,13 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
}
msblk = sb->s_fs_info;
- msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
- GFP_KERNEL);
- if (msblk->stream.workspace == NULL) {
- ERROR("Failed to allocate zlib workspace\n");
- goto failure;
- }
-
sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
if (sblk == NULL) {
ERROR("Failed to allocate squashfs_super_block\n");
@@ -120,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ err = -EINVAL;
+
/* Check it is a SQUASHFS superblock */
sb->s_magic = le32_to_cpu(sblk->s_magic);
if (sb->s_magic != SQUASHFS_MAGIC) {
if (!silent)
ERROR("Can't find a SQUASHFS superblock on %s\n",
bdevname(sb->s_bdev, b));
- err = -EINVAL;
goto failed_mount;
}
- /* Check the MAJOR & MINOR versions and compression type */
- err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
+ /* Check the MAJOR & MINOR versions and lookup compression type */
+ msblk->decompressor = supported_squashfs_filesystem(
+ le16_to_cpu(sblk->s_major),
le16_to_cpu(sblk->s_minor),
le16_to_cpu(sblk->compression));
- if (err < 0)
+ if (msblk->decompressor == NULL)
goto failed_mount;
- err = -EINVAL;
-
/*
* Check if there's xattrs in the filesystem. These are not
* supported in this version, so warn that they will be ignored.
@@ -205,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
err = -ENOMEM;
+ msblk->stream = squashfs_decompressor_init(msblk);
+ if (msblk->stream == NULL)
+ goto failed_mount;
+
msblk->block_cache = squashfs_cache_init("metadata",
SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
if (msblk->block_cache == NULL)
@@ -292,17 +296,16 @@ failed_mount:
squashfs_cache_delete(msblk->block_cache);
squashfs_cache_delete(msblk->fragment_cache);
squashfs_cache_delete(msblk->read_page);
+ squashfs_decompressor_free(msblk, msblk->stream);
kfree(msblk->inode_lookup_table);
kfree(msblk->fragment_index);
kfree(msblk->id_table);
- kfree(msblk->stream.workspace);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
kfree(sblk);
return err;
failure:
- kfree(msblk->stream.workspace);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
return -ENOMEM;
@@ -346,10 +349,10 @@ static void squashfs_put_super(struct super_block *sb)
squashfs_cache_delete(sbi->block_cache);
squashfs_cache_delete(sbi->fragment_cache);
squashfs_cache_delete(sbi->read_page);
+ squashfs_decompressor_free(sbi, sbi->stream);
kfree(sbi->id_table);
kfree(sbi->fragment_index);
kfree(sbi->meta_index);
- kfree(sbi->stream.workspace);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 83d87880aac..e80be2022a7 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -36,7 +36,6 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/pagemap.h>
-#include <linux/zlib.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
new file mode 100644
index 00000000000..4dd70e04333
--- /dev/null
+++ b/fs/squashfs/zlib_wrapper.c
@@ -0,0 +1,150 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * zlib_wrapper.c
+ */
+
+
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "decompressor.h"
+
+static void *zlib_init(struct squashfs_sb_info *dummy)
+{
+ z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
+ if (stream == NULL)
+ goto failed;
+ stream->workspace = kmalloc(zlib_inflate_workspacesize(),
+ GFP_KERNEL);
+ if (stream->workspace == NULL)
+ goto failed;
+
+ return stream;
+
+failed:
+ ERROR("Failed to allocate zlib workspace\n");
+ kfree(stream);
+ return NULL;
+}
+
+
+static void zlib_free(void *strm)
+{
+ z_stream *stream = strm;
+
+ if (stream)
+ kfree(stream->workspace);
+ kfree(stream);
+}
+
+
+static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
+ struct buffer_head **bh, int b, int offset, int length, int srclength,
+ int pages)
+{
+ int zlib_err = 0, zlib_init = 0;
+ int avail, bytes, k = 0, page = 0;
+ z_stream *stream = msblk->stream;
+
+ mutex_lock(&msblk->read_data_mutex);
+
+ stream->avail_out = 0;
+ stream->avail_in = 0;
+
+ bytes = length;
+ do {
+ if (stream->avail_in == 0 && k < b) {
+ avail = min(bytes, msblk->devblksize - offset);
+ bytes -= avail;
+ wait_on_buffer(bh[k]);
+ if (!buffer_uptodate(bh[k]))
+ goto release_mutex;
+
+ if (avail == 0) {
+ offset = 0;
+ put_bh(bh[k++]);
+ continue;
+ }
+
+ stream->next_in = bh[k]->b_data + offset;
+ stream->avail_in = avail;
+ offset = 0;
+ }
+
+ if (stream->avail_out == 0 && page < pages) {
+ stream->next_out = buffer[page++];
+ stream->avail_out = PAGE_CACHE_SIZE;
+ }
+
+ if (!zlib_init) {
+ zlib_err = zlib_inflateInit(stream);
+ if (zlib_err != Z_OK) {
+ ERROR("zlib_inflateInit returned unexpected "
+ "result 0x%x, srclength %d\n",
+ zlib_err, srclength);
+ goto release_mutex;
+ }
+ zlib_init = 1;
+ }
+
+ zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
+
+ if (stream->avail_in == 0 && k < b)
+ put_bh(bh[k++]);
+ } while (zlib_err == Z_OK);
+
+ if (zlib_err != Z_STREAM_END) {
+ ERROR("zlib_inflate error, data probably corrupt\n");
+ goto release_mutex;
+ }
+
+ zlib_err = zlib_inflateEnd(stream);
+ if (zlib_err != Z_OK) {
+ ERROR("zlib_inflate error, data probably corrupt\n");
+ goto release_mutex;
+ }
+
+ mutex_unlock(&msblk->read_data_mutex);
+ return stream->total_out;
+
+release_mutex:
+ mutex_unlock(&msblk->read_data_mutex);
+
+ for (; k < b; k++)
+ put_bh(bh[k]);
+
+ return -EIO;
+}
+
+const struct squashfs_decompressor squashfs_zlib_comp_ops = {
+ .init = zlib_init,
+ .free = zlib_free,
+ .decompress = zlib_uncompress,
+ .id = ZLIB_COMPRESSION,
+ .name = "zlib",
+ .supported = 1
+};
+
diff --git a/fs/sync.c b/fs/sync.c
index 418727a2a23..f557d71cb09 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -34,14 +34,14 @@ static int __sync_filesystem(struct super_block *sb, int wait)
if (!sb->s_bdi)
return 0;
- /* Avoid doing twice syncing and cache pruning for quota sync */
- if (!wait) {
- writeout_quota_sb(sb, -1);
- writeback_inodes_sb(sb);
- } else {
- sync_quota_sb(sb, -1);
+ if (sb->s_qcop && sb->s_qcop->quota_sync)
+ sb->s_qcop->quota_sync(sb, -1, wait);
+
+ if (wait)
sync_inodes_sb(sb);
- }
+ else
+ writeback_inodes_sb(sb);
+
if (sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, wait);
return __sync_blockdev(sb->s_bdev, wait);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index a0a500af24a..e9d293593e5 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -54,14 +54,14 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
int rc;
/* need attr_sd for attr, its parent for kobj */
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return -ENODEV;
rc = -EIO;
if (attr->read)
rc = attr->read(kobj, attr, buffer, off, count);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return rc;
}
@@ -125,14 +125,14 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
int rc;
/* need attr_sd for attr, its parent for kobj */
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return -ENODEV;
rc = -EIO;
if (attr->write)
rc = attr->write(kobj, attr, buffer, offset, count);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return rc;
}
@@ -184,12 +184,12 @@ static void bin_vma_open(struct vm_area_struct *vma)
if (!bb->vm_ops || !bb->vm_ops->open)
return;
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return;
bb->vm_ops->open(vma);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
}
static void bin_vma_close(struct vm_area_struct *vma)
@@ -201,12 +201,12 @@ static void bin_vma_close(struct vm_area_struct *vma)
if (!bb->vm_ops || !bb->vm_ops->close)
return;
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return;
bb->vm_ops->close(vma);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
}
static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -219,12 +219,12 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!bb->vm_ops || !bb->vm_ops->fault)
return VM_FAULT_SIGBUS;
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return VM_FAULT_SIGBUS;
ret = bb->vm_ops->fault(vma, vmf);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return ret;
}
@@ -241,12 +241,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!bb->vm_ops->page_mkwrite)
return 0;
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return VM_FAULT_SIGBUS;
ret = bb->vm_ops->page_mkwrite(vma, vmf);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return ret;
}
@@ -261,12 +261,12 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
if (!bb->vm_ops || !bb->vm_ops->access)
return -EINVAL;
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return -EINVAL;
ret = bb->vm_ops->access(vma, addr, buf, len, write);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return ret;
}
@@ -281,12 +281,12 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
if (!bb->vm_ops || !bb->vm_ops->set_policy)
return 0;
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return -EINVAL;
ret = bb->vm_ops->set_policy(vma, new);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return ret;
}
@@ -301,12 +301,12 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
if (!bb->vm_ops || !bb->vm_ops->get_policy)
return vma->vm_policy;
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return vma->vm_policy;
pol = bb->vm_ops->get_policy(vma, addr);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return pol;
}
@@ -321,12 +321,12 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
if (!bb->vm_ops || !bb->vm_ops->migrate)
return 0;
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return 0;
ret = bb->vm_ops->migrate(vma, from, to, flags);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return ret;
}
#endif
@@ -356,7 +356,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
/* need attr_sd for attr, its parent for kobj */
rc = -ENODEV;
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
goto out_unlock;
rc = -EINVAL;
@@ -384,7 +384,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
bb->vm_ops = vma->vm_ops;
vma->vm_ops = &bin_vm_ops;
out_put:
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
out_unlock:
mutex_unlock(&bb->mutex);
@@ -399,7 +399,7 @@ static int open(struct inode * inode, struct file * file)
int error;
/* binary file operations requires both @sd and its parent */
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return -ENODEV;
error = -EACCES;
@@ -426,11 +426,11 @@ static int open(struct inode * inode, struct file * file)
mutex_unlock(&sysfs_bin_lock);
/* open succeeded, put active references */
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return 0;
err_out:
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
kfree(bb);
return error;
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 699f371b9f1..590717861c7 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -93,7 +93,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
* RETURNS:
* Pointer to @sd on success, NULL on failure.
*/
-static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
+struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
{
if (unlikely(!sd))
return NULL;
@@ -124,7 +124,7 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
* Put an active reference to @sd. This function is noop if @sd
* is NULL.
*/
-static void sysfs_put_active(struct sysfs_dirent *sd)
+void sysfs_put_active(struct sysfs_dirent *sd)
{
struct completion *cmpl;
int v;
@@ -145,45 +145,6 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
}
/**
- * sysfs_get_active_two - get active references to sysfs_dirent and parent
- * @sd: sysfs_dirent of interest
- *
- * Get active reference to @sd and its parent. Parent's active
- * reference is grabbed first. This function is noop if @sd is
- * NULL.
- *
- * RETURNS:
- * Pointer to @sd on success, NULL on failure.
- */
-struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd)
-{
- if (sd) {
- if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent)))
- return NULL;
- if (unlikely(!sysfs_get_active(sd))) {
- sysfs_put_active(sd->s_parent);
- return NULL;
- }
- }
- return sd;
-}
-
-/**
- * sysfs_put_active_two - put active references to sysfs_dirent and parent
- * @sd: sysfs_dirent of interest
- *
- * Put active references to @sd and its parent. This function is
- * noop if @sd is NULL.
- */
-void sysfs_put_active_two(struct sysfs_dirent *sd)
-{
- if (sd) {
- sysfs_put_active(sd);
- sysfs_put_active(sd->s_parent);
- }
-}
-
-/**
* sysfs_deactivate - deactivate sysfs_dirent
* @sd: sysfs_dirent to deactivate
*
@@ -195,6 +156,10 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
int v;
BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
+
+ if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
+ return;
+
sd->s_sibling = (void *)&wait;
rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
@@ -354,7 +319,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
atomic_set(&sd->s_count, 1);
atomic_set(&sd->s_active, 0);
- sysfs_dirent_init_lockdep(sd);
sd->s_name = name;
sd->s_mode = mode;
@@ -681,7 +645,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
}
/* attach dentry and inode */
- inode = sysfs_get_inode(sd);
+ inode = sysfs_get_inode(dir->i_sb, sd);
if (!inode) {
ret = ERR_PTR(-ENOMEM);
goto out_unlock;
@@ -837,11 +801,46 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd)
return (sd->s_mode >> 12) & 15;
}
+static int sysfs_dir_release(struct inode *inode, struct file *filp)
+{
+ sysfs_put(filp->private_data);
+ return 0;
+}
+
+static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd,
+ ino_t ino, struct sysfs_dirent *pos)
+{
+ if (pos) {
+ int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
+ pos->s_parent == parent_sd &&
+ ino == pos->s_ino;
+ sysfs_put(pos);
+ if (valid)
+ return pos;
+ }
+ pos = NULL;
+ if ((ino > 1) && (ino < INT_MAX)) {
+ pos = parent_sd->s_dir.children;
+ while (pos && (ino > pos->s_ino))
+ pos = pos->s_sibling;
+ }
+ return pos;
+}
+
+static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd,
+ ino_t ino, struct sysfs_dirent *pos)
+{
+ pos = sysfs_dir_pos(parent_sd, ino, pos);
+ if (pos)
+ pos = pos->s_sibling;
+ return pos;
+}
+
static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
struct dentry *dentry = filp->f_path.dentry;
struct sysfs_dirent * parent_sd = dentry->d_fsdata;
- struct sysfs_dirent *pos;
+ struct sysfs_dirent *pos = filp->private_data;
ino_t ino;
if (filp->f_pos == 0) {
@@ -857,29 +856,31 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
filp->f_pos++;
}
- if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) {
- mutex_lock(&sysfs_mutex);
-
- /* Skip the dentries we have already reported */
- pos = parent_sd->s_dir.children;
- while (pos && (filp->f_pos > pos->s_ino))
- pos = pos->s_sibling;
-
- for ( ; pos; pos = pos->s_sibling) {
- const char * name;
- int len;
-
- name = pos->s_name;
- len = strlen(name);
- filp->f_pos = ino = pos->s_ino;
+ mutex_lock(&sysfs_mutex);
+ for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
+ pos;
+ pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) {
+ const char * name;
+ unsigned int type;
+ int len, ret;
+
+ name = pos->s_name;
+ len = strlen(name);
+ ino = pos->s_ino;
+ type = dt_type(pos);
+ filp->f_pos = ino;
+ filp->private_data = sysfs_get(pos);
- if (filldir(dirent, name, len, filp->f_pos, ino,
- dt_type(pos)) < 0)
- break;
- }
- if (!pos)
- filp->f_pos = INT_MAX;
mutex_unlock(&sysfs_mutex);
+ ret = filldir(dirent, name, len, filp->f_pos, ino, type);
+ mutex_lock(&sysfs_mutex);
+ if (ret < 0)
+ break;
+ }
+ mutex_unlock(&sysfs_mutex);
+ if ((filp->f_pos > 1) && !pos) { /* EOF */
+ filp->f_pos = INT_MAX;
+ filp->private_data = NULL;
}
return 0;
}
@@ -888,5 +889,6 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
const struct file_operations sysfs_dir_operations = {
.read = generic_read_dir,
.readdir = sysfs_readdir,
+ .release = sysfs_dir_release,
.llseek = generic_file_llseek,
};
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dc30d9e3168..e222b258274 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -53,7 +53,7 @@ struct sysfs_buffer {
size_t count;
loff_t pos;
char * page;
- struct sysfs_ops * ops;
+ const struct sysfs_ops * ops;
struct mutex mutex;
int needs_read_fill;
int event;
@@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
{
struct sysfs_dirent *attr_sd = dentry->d_fsdata;
struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- struct sysfs_ops * ops = buffer->ops;
+ const struct sysfs_ops * ops = buffer->ops;
int ret = 0;
ssize_t count;
@@ -85,13 +85,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
return -ENOMEM;
/* need attr_sd for attr and ops, its parent for kobj */
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return -ENODEV;
buffer->event = atomic_read(&attr_sd->s_attr.open->event);
count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
/*
* The code works fine with PAGE_SIZE return but it's likely to
@@ -199,16 +199,16 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
{
struct sysfs_dirent *attr_sd = dentry->d_fsdata;
struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- struct sysfs_ops * ops = buffer->ops;
+ const struct sysfs_ops * ops = buffer->ops;
int rc;
/* need attr_sd for attr and ops, its parent for kobj */
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return -ENODEV;
rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return rc;
}
@@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
struct sysfs_buffer *buffer;
- struct sysfs_ops *ops;
+ const struct sysfs_ops *ops;
int error = -EACCES;
char *p;
@@ -344,7 +344,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
memmove(last_sysfs_file, p, strlen(p) + 1);
/* need attr_sd for attr and ops, its parent for kobj */
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
return -ENODEV;
/* every kobject with an attribute needs a ktype assigned */
@@ -393,13 +393,13 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
goto err_free;
/* open succeeded, put active references */
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return 0;
err_free:
kfree(buffer);
err_out:
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
return error;
}
@@ -437,12 +437,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
struct sysfs_open_dirent *od = attr_sd->s_attr.open;
/* need parent for the kobj, grab both */
- if (!sysfs_get_active_two(attr_sd))
+ if (!sysfs_get_active(attr_sd))
goto trigger;
poll_wait(filp, &od->poll, wait);
- sysfs_put_active_two(attr_sd);
+ sysfs_put_active(attr_sd);
if (buffer->event != atomic_read(&od->event))
goto trigger;
@@ -509,6 +509,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
if (!sd)
return -ENOMEM;
sd->s_attr.attr = (void *)attr;
+ sysfs_dirent_init_lockdep(sd);
sysfs_addrm_start(&acxt, dir_sd);
rc = sysfs_add_one(&acxt, sd);
@@ -542,6 +543,18 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
}
+int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
+{
+ int err = 0;
+ int i;
+
+ for (i = 0; ptr[i] && !err; i++)
+ err = sysfs_create_file(kobj, ptr[i]);
+ if (err)
+ while (--i >= 0)
+ sysfs_remove_file(kobj, ptr[i]);
+ return err;
+}
/**
* sysfs_add_file_to_group - add an attribute file to a pre-existing group.
@@ -614,6 +627,12 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
sysfs_hash_and_remove(kobj->sd, attr->name);
}
+void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
+{
+ int i;
+ for (i = 0; ptr[i]; i++)
+ sysfs_remove_file(kobj, ptr[i]);
+}
/**
* sysfs_remove_file_from_group - remove an attribute file from a group.
@@ -732,3 +751,5 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
EXPORT_SYMBOL_GPL(sysfs_create_file);
EXPORT_SYMBOL_GPL(sysfs_remove_file);
+EXPORT_SYMBOL_GPL(sysfs_remove_files);
+EXPORT_SYMBOL_GPL(sysfs_create_files);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 6a06a1d1ea7..082daaecac1 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -111,20 +111,20 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
if (!sd)
return -EINVAL;
+ mutex_lock(&sysfs_mutex);
error = inode_change_ok(inode, iattr);
if (error)
- return error;
+ goto out;
iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
error = inode_setattr(inode, iattr);
if (error)
- return error;
+ goto out;
- mutex_lock(&sysfs_mutex);
error = sysfs_sd_setattr(sd, iattr);
+out:
mutex_unlock(&sysfs_mutex);
-
return error;
}
@@ -283,6 +283,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
/**
* sysfs_get_inode - get inode for sysfs_dirent
+ * @sb: super block
* @sd: sysfs_dirent to allocate inode for
*
* Get inode for @sd. If such inode doesn't exist, a new inode
@@ -295,11 +296,11 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
* RETURNS:
* Pointer to allocated inode on success, NULL on failure.
*/
-struct inode * sysfs_get_inode(struct sysfs_dirent *sd)
+struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
{
struct inode *inode;
- inode = iget_locked(sysfs_sb, sd->s_ino);
+ inode = iget_locked(sb, sd->s_ino);
if (inode && (inode->i_state & I_NEW))
sysfs_init_inode(sd, inode);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 49749955cca..0cb10884a2f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,7 +23,6 @@
static struct vfsmount *sysfs_mount;
-struct super_block * sysfs_sb = NULL;
struct kmem_cache *sysfs_dir_cachep;
static const struct super_operations sysfs_ops = {
@@ -50,11 +49,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = SYSFS_MAGIC;
sb->s_op = &sysfs_ops;
sb->s_time_gran = 1;
- sysfs_sb = sb;
/* get root inode, initialize and unlock it */
mutex_lock(&sysfs_mutex);
- inode = sysfs_get_inode(&sysfs_root);
+ inode = sysfs_get_inode(sb, &sysfs_root);
mutex_unlock(&sysfs_mutex);
if (!inode) {
pr_debug("sysfs: could not get root inode\n");
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5eff49fa41..1b9a3a1e8a1 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -123,6 +123,44 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
sysfs_hash_and_remove(parent_sd, name);
}
+/**
+ * sysfs_rename_link - rename symlink in object's directory.
+ * @kobj: object we're acting for.
+ * @targ: object we're pointing to.
+ * @old: previous name of the symlink.
+ * @new: new name of the symlink.
+ *
+ * A helper function for the common rename symlink idiom.
+ */
+int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
+ const char *old, const char *new)
+{
+ struct sysfs_dirent *parent_sd, *sd = NULL;
+ int result;
+
+ if (!kobj)
+ parent_sd = &sysfs_root;
+ else
+ parent_sd = kobj->sd;
+
+ result = -ENOENT;
+ sd = sysfs_get_dirent(parent_sd, old);
+ if (!sd)
+ goto out;
+
+ result = -EINVAL;
+ if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
+ goto out;
+ if (sd->s_symlink.target_sd->s_dir.kobj != targ)
+ goto out;
+
+ result = sysfs_rename(sd, parent_sd, new);
+
+out:
+ sysfs_put(sd);
+ return result;
+}
+
static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
struct sysfs_dirent *target_sd, char *path)
{
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index cdd9377a6e0..30f5a44fb5d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -66,8 +66,8 @@ struct sysfs_dirent {
};
unsigned int s_flags;
+ unsigned short s_mode;
ino_t s_ino;
- umode_t s_mode;
struct sysfs_inode_attrs *s_iattr;
};
@@ -79,6 +79,7 @@ struct sysfs_dirent {
#define SYSFS_KOBJ_BIN_ATTR 0x0004
#define SYSFS_KOBJ_LINK 0x0008
#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
+#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK
#define SYSFS_FLAG_REMOVED 0x0200
@@ -91,9 +92,12 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define sysfs_dirent_init_lockdep(sd) \
do { \
- static struct lock_class_key __key; \
+ struct attribute *attr = sd->s_attr.attr; \
+ struct lock_class_key *key = attr->key; \
+ if (!key) \
+ key = &attr->skey; \
\
- lockdep_init_map(&sd->dep_map, "s_active", &__key, 0); \
+ lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
} while(0)
#else
#define sysfs_dirent_init_lockdep(sd) do {} while(0)
@@ -111,7 +115,6 @@ struct sysfs_addrm_cxt {
* mount.c
*/
extern struct sysfs_dirent sysfs_root;
-extern struct super_block *sysfs_sb;
extern struct kmem_cache *sysfs_dir_cachep;
/*
@@ -124,8 +127,8 @@ extern const struct file_operations sysfs_dir_operations;
extern const struct inode_operations sysfs_dir_inode_operations;
struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
-struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
-void sysfs_put_active_two(struct sysfs_dirent *sd);
+struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
+void sysfs_put_active(struct sysfs_dirent *sd);
void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
struct sysfs_dirent *parent_sd);
int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
@@ -168,7 +171,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
/*
* inode.c
*/
-struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
+struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
void sysfs_delete_inode(struct inode *inode);
int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
int sysfs_permission(struct inode *inode, int mask);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9824743832a..4573734d723 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -26,6 +26,7 @@
#include <linux/init.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
+#include <linux/writeback.h>
#include <linux/namei.h>
#include <asm/byteorder.h>
#include "sysv.h"
@@ -246,7 +247,7 @@ bad_inode:
return ERR_PTR(-EIO);
}
-int sysv_write_inode(struct inode *inode, int wait)
+static int __sysv_write_inode(struct inode *inode, int wait)
{
struct super_block * sb = inode->i_sb;
struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait)
return 0;
}
+int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
int sysv_sync_inode(struct inode *inode)
{
- return sysv_write_inode(inode, 1);
+ return __sysv_write_inode(inode, 1);
}
static void sysv_delete_inode(struct inode *inode)
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 53786eb5cf6..94cb9b4d76c 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
/* inode.c */
extern struct inode *sysv_iget(struct super_block *, unsigned int);
-extern int sysv_write_inode(struct inode *, int);
+extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
extern int sysv_sync_inode(struct inode *);
extern void sysv_set_inode(struct inode *, dev_t);
extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 552fb0111ff..401e503d44a 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (release)
ubifs_release_budget(c, &ino_req);
if (IS_SYNC(old_inode))
- err = old_inode->i_sb->s_op->write_inode(old_inode, 1);
+ err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
return err;
out_cancel:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 16a6444330e..e26c02ab6cd 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1011,7 +1011,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
/* Is the page fully inside @i_size? */
if (page->index < end_index) {
if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
- err = inode->i_sb->s_op->write_inode(inode, 1);
+ err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
goto out_unlock;
/*
@@ -1039,7 +1039,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
kunmap_atomic(kaddr, KM_USER0);
if (i_size > synced_i_size) {
- err = inode->i_sb->s_op->write_inode(inode, 1);
+ err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
goto out_unlock;
}
@@ -1242,7 +1242,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
if (release)
ubifs_release_budget(c, &req);
if (IS_SYNC(inode))
- err = inode->i_sb->s_op->write_inode(inode, 1);
+ err = inode->i_sb->s_op->write_inode(inode, NULL);
return err;
out:
@@ -1316,7 +1316,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
* the inode unless this is a 'datasync()' call.
*/
if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
- err = inode->i_sb->s_op->write_inode(inode, 1);
+ err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
return err;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 43f9d19a6f3..4d2f2157dd3 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode)
/*
* Note, Linux write-back code calls this without 'i_mutex'.
*/
-static int ubifs_write_inode(struct inode *inode, int wait)
+static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err = 0;
struct ubifs_info *c = inode->i_sb->s_fs_info;
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b2d96f45c12..19626e2491c 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -31,55 +31,8 @@
#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
-#define udf_find_first_one_bit(addr, size) find_first_one_bit(addr, size)
#define udf_find_next_one_bit(addr, size, offset) \
- find_next_one_bit(addr, size, offset)
-
-#define leBPL_to_cpup(x) leNUM_to_cpup(BITS_PER_LONG, x)
-#define leNUM_to_cpup(x, y) xleNUM_to_cpup(x, y)
-#define xleNUM_to_cpup(x, y) (le ## x ## _to_cpup(y))
-#define uintBPL_t uint(BITS_PER_LONG)
-#define uint(x) xuint(x)
-#define xuint(x) __le ## x
-
-static inline int find_next_one_bit(void *addr, int size, int offset)
-{
- uintBPL_t *p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG);
- int result = offset & ~(BITS_PER_LONG - 1);
- unsigned long tmp;
-
- if (offset >= size)
- return size;
- size -= result;
- offset &= (BITS_PER_LONG - 1);
- if (offset) {
- tmp = leBPL_to_cpup(p++);
- tmp &= ~0UL << offset;
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG - 1)) {
- tmp = leBPL_to_cpup(p++);
- if (tmp)
- goto found_middle;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = leBPL_to_cpup(p);
-found_first:
- tmp &= ~0UL >> (BITS_PER_LONG - size);
-found_middle:
- return result + ffz(~tmp);
-}
-
-#define find_first_one_bit(addr, size)\
- find_next_one_bit((addr), (size), 0)
+ ext2_find_next_bit(addr, size, offset)
static int read_block_bitmap(struct super_block *sb,
struct udf_bitmap *bitmap, unsigned int block,
@@ -208,7 +161,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
((char *)bh->b_data)[(bit + i) >> 3]);
} else {
if (inode)
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
udf_add_free_space(sb, sbi->s_partition, 1);
}
}
@@ -260,11 +213,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
while (bit < (sb->s_blocksize << 3) && block_count > 0) {
if (!udf_test_bit(bit, bh->b_data))
goto out;
- else if (vfs_dq_prealloc_block(inode, 1))
+ else if (dquot_prealloc_block(inode, 1))
goto out;
else if (!udf_clear_bit(bit, bh->b_data)) {
udf_debug("bit already cleared for block %d\n", bit);
- vfs_dq_free_block(inode, 1);
+ dquot_free_block(inode, 1);
goto out;
}
block_count--;
@@ -390,10 +343,14 @@ got_block:
/*
* Check quota for allocation of this block.
*/
- if (inode && vfs_dq_alloc_block(inode, 1)) {
- mutex_unlock(&sbi->s_alloc_mutex);
- *err = -EDQUOT;
- return 0;
+ if (inode) {
+ int ret = dquot_alloc_block(inode, 1);
+
+ if (ret) {
+ mutex_unlock(&sbi->s_alloc_mutex);
+ *err = ret;
+ return 0;
+ }
}
newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
@@ -449,7 +406,7 @@ static void udf_table_free_blocks(struct super_block *sb,
/* We do this up front - There are some error conditions that
could occure, but.. oh well */
if (inode)
- vfs_dq_free_block(inode, count);
+ dquot_free_block(inode, count);
udf_add_free_space(sb, sbi->s_partition, count);
start = bloc->logicalBlockNum + offset;
@@ -694,7 +651,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
epos.offset -= adsize;
alloc_count = (elen >> sb->s_blocksize_bits);
- if (inode && vfs_dq_prealloc_block(inode,
+ if (inode && dquot_prealloc_block(inode,
alloc_count > block_count ? block_count : alloc_count))
alloc_count = 0;
else if (alloc_count > block_count) {
@@ -797,12 +754,13 @@ static int udf_table_new_block(struct super_block *sb,
newblock = goal_eloc.logicalBlockNum;
goal_eloc.logicalBlockNum++;
goal_elen -= sb->s_blocksize;
-
- if (inode && vfs_dq_alloc_block(inode, 1)) {
- brelse(goal_epos.bh);
- mutex_unlock(&sbi->s_alloc_mutex);
- *err = -EDQUOT;
- return 0;
+ if (inode) {
+ *err = dquot_alloc_block(inode, 1);
+ if (*err) {
+ brelse(goal_epos.bh);
+ mutex_unlock(&sbi->s_alloc_mutex);
+ return 0;
+ }
}
if (goal_elen)
diff --git a/fs/udf/file.c b/fs/udf/file.c
index f311d509b6a..1eb06774ed9 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,6 +34,7 @@
#include <linux/errno.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
+#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/aio.h>
@@ -207,7 +208,7 @@ const struct file_operations udf_file_operations = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.ioctl = udf_ioctl,
- .open = generic_file_open,
+ .open = dquot_file_open,
.mmap = generic_file_mmap,
.write = do_sync_write,
.aio_write = udf_file_aio_write,
@@ -217,6 +218,29 @@ const struct file_operations udf_file_operations = {
.llseek = generic_file_llseek,
};
+static int udf_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = dentry->d_inode;
+ int error;
+
+ error = inode_change_ok(inode, iattr);
+ if (error)
+ return error;
+
+ if (iattr->ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
+
+ if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
+ (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
+ error = dquot_transfer(inode, iattr);
+ if (error)
+ return error;
+ }
+
+ return inode_setattr(inode, iattr);
+}
+
const struct inode_operations udf_file_inode_operations = {
- .truncate = udf_truncate,
+ .truncate = udf_truncate,
+ .setattr = udf_setattr,
};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index c10fa39f97e..fb68c9cd0c3 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode)
* Note: we must free any quota before locking the superblock,
* as writing the quota to disk may need the lock as well.
*/
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
clear_inode(inode);
@@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
struct super_block *sb = dir->i_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
struct inode *inode;
- int block;
+ int block, ret;
uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
struct udf_inode_info *iinfo;
struct udf_inode_info *dinfo = UDF_I(dir);
@@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
insert_inode_hash(inode);
mark_inode_dirty(inode);
- if (vfs_dq_alloc_inode(inode)) {
- vfs_dq_drop(inode);
+ dquot_initialize(inode);
+ ret = dquot_alloc_inode(inode);
+ if (ret) {
+ dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
iput(inode);
- *err = -EDQUOT;
+ *err = ret;
return NULL;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 378a7592257..bb863fe579a 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,6 +36,7 @@
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
@@ -70,6 +71,9 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
void udf_delete_inode(struct inode *inode)
{
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -102,12 +106,14 @@ void udf_clear_inode(struct inode *inode)
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
inode->i_size != iinfo->i_lenExtents) {
printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
- "inode size %llu different from extent lenght %llu. "
+ "inode size %llu different from extent length %llu. "
"Filesystem need not be standards compliant.\n",
inode->i_sb->s_id, inode->i_ino, inode->i_mode,
(unsigned long long)inode->i_size,
(unsigned long long)iinfo->i_lenExtents);
}
+
+ dquot_drop(inode);
kfree(iinfo->i_ext.i_data);
iinfo->i_ext.i_data = NULL;
}
@@ -1373,12 +1379,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
return mode;
}
-int udf_write_inode(struct inode *inode, int sync)
+int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int ret;
lock_kernel();
- ret = udf_update_inode(inode, sync);
+ ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
unlock_kernel();
return ret;
@@ -1402,20 +1408,19 @@ static int udf_update_inode(struct inode *inode, int do_sync)
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
struct udf_inode_info *iinfo = UDF_I(inode);
- bh = udf_tread(inode->i_sb,
- udf_get_lb_pblock(inode->i_sb,
- &iinfo->i_location, 0));
+ bh = udf_tgetblk(inode->i_sb,
+ udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
if (!bh) {
- udf_debug("bread failure\n");
- return -EIO;
+ udf_debug("getblk failure\n");
+ return -ENOMEM;
}
- memset(bh->b_data, 0x00, inode->i_sb->s_blocksize);
-
+ lock_buffer(bh);
+ memset(bh->b_data, 0, inode->i_sb->s_blocksize);
fe = (struct fileEntry *)bh->b_data;
efe = (struct extendedFileEntry *)bh->b_data;
- if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
+ if (iinfo->i_use) {
struct unallocSpaceEntry *use =
(struct unallocSpaceEntry *)bh->b_data;
@@ -1423,20 +1428,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
sizeof(struct unallocSpaceEntry));
+ use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
+ use->descTag.tagLocation =
+ cpu_to_le32(iinfo->i_location.logicalBlockNum);
crclen = sizeof(struct unallocSpaceEntry) +
iinfo->i_lenAlloc - sizeof(struct tag);
- use->descTag.tagLocation = cpu_to_le32(
- iinfo->i_location.
- logicalBlockNum);
use->descTag.descCRCLength = cpu_to_le16(crclen);
use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
sizeof(struct tag),
crclen));
use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
- mark_buffer_dirty(bh);
- brelse(bh);
- return err;
+ goto out;
}
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
@@ -1591,18 +1594,21 @@ static int udf_update_inode(struct inode *inode, int do_sync)
fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number);
fe->descTag.tagLocation = cpu_to_le32(
iinfo->i_location.logicalBlockNum);
- crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
- sizeof(struct tag);
+ crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag);
fe->descTag.descCRCLength = cpu_to_le16(crclen);
fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
crclen));
fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
+out:
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
/* write the data blocks */
mark_buffer_dirty(bh);
if (do_sync) {
sync_dirty_buffer(bh);
- if (buffer_req(bh) && !buffer_uptodate(bh)) {
+ if (buffer_write_io_error(bh)) {
printk(KERN_WARNING "IO error syncing udf inode "
"[%s:%08lx]\n", inode->i_sb->s_id,
inode->i_ino);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 7c56ff00cd5..db423ab078b 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -563,6 +563,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
int err;
struct udf_inode_info *iinfo;
+ dquot_initialize(dir);
+
lock_kernel();
inode = udf_new_inode(dir, mode, &err);
if (!inode) {
@@ -616,6 +618,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
if (!old_valid_dev(rdev))
return -EINVAL;
+ dquot_initialize(dir);
+
lock_kernel();
err = -EIO;
inode = udf_new_inode(dir, mode, &err);
@@ -662,6 +666,8 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct udf_inode_info *dinfo = UDF_I(dir);
struct udf_inode_info *iinfo;
+ dquot_initialize(dir);
+
lock_kernel();
err = -EMLINK;
if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -799,6 +805,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
struct fileIdentDesc *fi, cfi;
struct kernel_lb_addr tloc;
+ dquot_initialize(dir);
+
retval = -ENOENT;
lock_kernel();
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -845,6 +853,8 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
struct fileIdentDesc cfi;
struct kernel_lb_addr tloc;
+ dquot_initialize(dir);
+
retval = -ENOENT;
lock_kernel();
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -899,6 +909,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
struct buffer_head *bh;
struct udf_inode_info *iinfo;
+ dquot_initialize(dir);
+
lock_kernel();
inode = udf_new_inode(dir, S_IFLNK, &err);
if (!inode)
@@ -1069,6 +1081,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
int err;
struct buffer_head *bh;
+ dquot_initialize(dir);
+
lock_kernel();
if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
unlock_kernel();
@@ -1131,6 +1145,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct kernel_lb_addr tloc;
struct udf_inode_info *old_iinfo = UDF_I(old_inode);
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
lock_kernel();
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
if (ofi) {
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8d46f4294ee..4223ac855da 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -142,7 +142,7 @@ extern void udf_truncate(struct inode *);
extern void udf_read_inode(struct inode *);
extern void udf_delete_inode(struct inode *);
extern void udf_clear_inode(struct inode *);
-extern int udf_write_inode(struct inode *, int);
+extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
extern long udf_block_map(struct inode *, sector_t);
extern int udf_extend_file(struct inode *, struct extent_position *,
struct kernel_long_ad *, sector_t);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 54c16ec95df..5cfa4d85ccf 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
"bit already cleared for fragment %u", i);
}
- vfs_dq_free_block(inode, count);
+ dquot_free_block(inode, count);
fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
@@ -195,7 +195,7 @@ do_more:
ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
ufs_clusteracct (sb, ucpi, blkno, 1);
- vfs_dq_free_block(inode, uspi->s_fpb);
+ dquot_free_block(inode, uspi->s_fpb);
fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
uspi->cs_total.cs_nbfree++;
@@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned cgno, fragno, fragoff, count, fragsize, i;
+ int ret;
UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
(unsigned long long)fragment, oldcount, newcount);
@@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
for (i = oldcount; i < newcount; i++)
ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
- if (vfs_dq_alloc_block(inode, count)) {
- *err = -EDQUOT;
+ ret = dquot_alloc_block(inode, count);
+ if (ret) {
+ *err = ret;
return 0;
}
@@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
struct ufs_cylinder_group * ucg;
unsigned oldcg, i, j, k, allocsize;
u64 result;
+ int ret;
UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -664,7 +667,7 @@ cg_found:
for (i = count; i < uspi->s_fpb; i++)
ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
i = uspi->s_fpb - count;
- vfs_dq_free_block(inode, i);
+ dquot_free_block(inode, i);
fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
uspi->cs_total.cs_nffree += i;
@@ -676,8 +679,9 @@ cg_found:
result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
if (result == INVBLOCK)
return 0;
- if (vfs_dq_alloc_block(inode, count)) {
- *err = -EDQUOT;
+ ret = dquot_alloc_block(inode, count);
+ if (ret) {
+ *err = ret;
return 0;
}
for (i = 0; i < count; i++)
@@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
struct ufs_super_block_first * usb1;
struct ufs_cylinder_group * ucg;
u64 result, blkno;
+ int ret;
UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
@@ -747,8 +752,9 @@ gotit:
ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
ufs_clusteracct (sb, ucpi, blkno, -1);
- if (vfs_dq_alloc_block(inode, uspi->s_fpb)) {
- *err = -EDQUOT;
+ ret = dquot_alloc_block(inode, uspi->s_fpb);
+ if (ret) {
+ *err = ret;
return INVBLOCK;
}
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 73655c61240..a8962cecde5 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,6 +24,7 @@
*/
#include <linux/fs.h>
+#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -40,7 +41,7 @@ const struct file_operations ufs_file_operations = {
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
- .open = generic_file_open,
+ .open = dquot_file_open,
.fsync = simple_fsync,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3527c00fef0..230ecf60802 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode)
is_directory = S_ISDIR(inode->i_mode);
- vfs_dq_free_inode(inode);
- vfs_dq_drop(inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
clear_inode (inode);
@@ -355,9 +355,10 @@ cg_found:
unlock_super (sb);
- if (vfs_dq_alloc_inode(inode)) {
- vfs_dq_drop(inode);
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err) {
+ dquot_drop(inode);
goto fail_without_unlock;
}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7cf33379fd4..80b68c3702d 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -36,6 +36,8 @@
#include <linux/mm.h>
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -890,11 +892,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
return 0;
}
-int ufs_write_inode (struct inode * inode, int wait)
+int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int ret;
lock_kernel();
- ret = ufs_update_inode (inode, wait);
+ ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
unlock_kernel();
return ret;
}
@@ -908,6 +910,9 @@ void ufs_delete_inode (struct inode * inode)
{
loff_t old_i_size;
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 4c26d9e8bc9..118556243e7 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,6 +30,7 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/smp_lock.h>
+#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -84,6 +85,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
int err;
UFSD("BEGIN\n");
+
+ dquot_initialize(dir);
+
inode = ufs_new_inode(dir, mode);
err = PTR_ERR(inode);
@@ -107,6 +111,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
if (!old_valid_dev(rdev))
return -EINVAL;
+
+ dquot_initialize(dir);
+
inode = ufs_new_inode(dir, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
@@ -131,6 +138,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
if (l > sb->s_blocksize)
goto out_notlocked;
+ dquot_initialize(dir);
+
lock_kernel();
inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
err = PTR_ERR(inode);
@@ -176,6 +185,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
return -EMLINK;
}
+ dquot_initialize(dir);
+
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
atomic_inc(&inode->i_count);
@@ -193,6 +204,8 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
if (dir->i_nlink >= UFS_LINK_MAX)
goto out;
+ dquot_initialize(dir);
+
lock_kernel();
inode_inc_link_count(dir);
@@ -237,6 +250,8 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
struct page *page;
int err = -ENOENT;
+ dquot_initialize(dir);
+
de = ufs_find_entry(dir, &dentry->d_name, &page);
if (!de)
goto out;
@@ -281,6 +296,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ufs_dir_entry *old_de;
int err = -ENOENT;
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
+
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 143c20bfb04..14743d935a9 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1016,6 +1016,9 @@ magic_found:
case UFS_FSSTABLE:
UFSD("fs is stable\n");
break;
+ case UFS_FSLOG:
+ UFSD("fs is logging fs\n");
+ break;
case UFS_FSOSF1:
UFSD("fs is DEC OSF/1\n");
break;
@@ -1432,6 +1435,11 @@ static void destroy_inodecache(void)
kmem_cache_destroy(ufs_inode_cachep);
}
+static void ufs_clear_inode(struct inode *inode)
+{
+ dquot_drop(inode);
+}
+
#ifdef CONFIG_QUOTA
static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
@@ -1442,6 +1450,7 @@ static const struct super_operations ufs_super_ops = {
.destroy_inode = ufs_destroy_inode,
.write_inode = ufs_write_inode,
.delete_inode = ufs_delete_inode,
+ .clear_inode = ufs_clear_inode,
.put_super = ufs_put_super,
.write_super = ufs_write_super,
.sync_fs = ufs_sync_fs,
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 41dd431ce22..d3b6270cb37 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,6 +44,7 @@
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/sched.h>
+#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -517,9 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ error = dquot_transfer(inode, attr);
+ if (error)
+ return error;
+ }
if (ia_valid & ATTR_SIZE &&
attr->ia_size != i_size_read(inode)) {
loff_t old_i_size = inode->i_size;
+
+ dquot_initialize(inode);
+
error = vmtruncate(inode, attr->ia_size);
if (error)
return error;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 01d0e2a3b23..43f9f5d5670 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
/* inode.c */
extern struct inode *ufs_iget(struct super_block *, unsigned long);
-extern int ufs_write_inode (struct inode *, int);
+extern int ufs_write_inode (struct inode *, struct writeback_control *);
extern int ufs_sync_inode (struct inode *);
extern void ufs_delete_inode (struct inode *);
extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 54bde1895a8..6943ec677c0 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -138,6 +138,7 @@ typedef __u16 __bitwise __fs16;
#define UFS_USEEFT ((__u16)65535)
+/* fs_clean values */
#define UFS_FSOK 0x7c269d38
#define UFS_FSACTIVE ((__s8)0x00)
#define UFS_FSCLEAN ((__s8)0x01)
@@ -145,6 +146,11 @@ typedef __u16 __bitwise __fs16;
#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */
#define UFS_FSBAD ((__s8)0xff)
+/* Solaris-specific fs_clean values */
+#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */
+#define UFS_FSLOG ((__s8)0xfd) /* logging fs */
+#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */
+
/* From here to next blank line, s_flags for ufs_sb_info */
/* directory entry encoding */
#define UFS_DE_MASK 0x00000010 /* mask for the following */
@@ -227,11 +233,16 @@ typedef __u16 __bitwise __fs16;
*/
#define ufs_cbtocylno(bno) \
((bno) * uspi->s_nspf / uspi->s_spc)
-#define ufs_cbtorpos(bno) \
+#define ufs_cbtorpos(bno) \
+ ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \
+ (((((bno) * uspi->s_nspf % uspi->s_spc) % \
+ uspi->s_nsect) * \
+ uspi->s_nrpos) / uspi->s_nsect) \
+ : \
((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \
* uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \
% uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \
- * uspi->s_nrpos) / uspi->s_npsect)
+ * uspi->s_nrpos) / uspi->s_npsect))
/*
* The following macros optimize certain frequently calculated
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5c5a366aa33..b4769e40e8b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -105,7 +105,6 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
xfs_globals.o \
xfs_ioctl.o \
xfs_iops.o \
- xfs_lrw.o \
xfs_super.o \
xfs_sync.o \
xfs_xattr.o)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 66abe36c121..99628508cb1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -39,6 +39,7 @@
#include "xfs_iomap.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
+#include "xfs_bmap.h"
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
@@ -163,14 +164,17 @@ xfs_ioend_new_eof(
}
/*
- * Update on-disk file size now that data has been written to disk.
- * The current in-memory file size is i_size. If a write is beyond
- * eof i_new_size will be the intended file size until i_size is
- * updated. If this write does not extend all the way to the valid
- * file size then restrict this update to the end of the write.
+ * Update on-disk file size now that data has been written to disk. The
+ * current in-memory file size is i_size. If a write is beyond eof i_new_size
+ * will be the intended file size until i_size is updated. If this write does
+ * not extend all the way to the valid file size then restrict this update to
+ * the end of the write.
+ *
+ * This function does not block as blocking on the inode lock in IO completion
+ * can lead to IO completion order dependency deadlocks.. If it can't get the
+ * inode ilock it will return EAGAIN. Callers must handle this.
*/
-
-STATIC void
+STATIC int
xfs_setfilesize(
xfs_ioend_t *ioend)
{
@@ -181,16 +185,40 @@ xfs_setfilesize(
ASSERT(ioend->io_type != IOMAP_READ);
if (unlikely(ioend->io_error))
- return;
+ return 0;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ return EAGAIN;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
isize = xfs_ioend_new_eof(ioend);
if (isize) {
ip->i_d.di_size = isize;
- xfs_mark_inode_dirty_sync(ip);
+ xfs_mark_inode_dirty(ip);
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Schedule IO completion handling on a xfsdatad if this was
+ * the final hold on this ioend. If we are asked to wait,
+ * flush the workqueue.
+ */
+STATIC void
+xfs_finish_ioend(
+ xfs_ioend_t *ioend,
+ int wait)
+{
+ if (atomic_dec_and_test(&ioend->io_remaining)) {
+ struct workqueue_struct *wq;
+
+ wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
+ xfsconvertd_workqueue : xfsdatad_workqueue;
+ queue_work(wq, &ioend->io_work);
+ if (wait)
+ flush_workqueue(wq);
+ }
}
/*
@@ -198,11 +226,11 @@ xfs_setfilesize(
*/
STATIC void
xfs_end_io(
- struct work_struct *work)
+ struct work_struct *work)
{
- xfs_ioend_t *ioend =
- container_of(work, xfs_ioend_t, io_work);
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ int error = 0;
/*
* For unwritten extents we need to issue transactions to convert a
@@ -210,7 +238,6 @@ xfs_end_io(
*/
if (ioend->io_type == IOMAP_UNWRITTEN &&
likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
- int error;
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
@@ -222,30 +249,23 @@ xfs_end_io(
* We might have to update the on-disk file size after extending
* writes.
*/
- if (ioend->io_type != IOMAP_READ)
- xfs_setfilesize(ioend);
- xfs_destroy_ioend(ioend);
-}
-
-/*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
- */
-STATIC void
-xfs_finish_ioend(
- xfs_ioend_t *ioend,
- int wait)
-{
- if (atomic_dec_and_test(&ioend->io_remaining)) {
- struct workqueue_struct *wq;
-
- wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
- xfsconvertd_workqueue : xfsdatad_workqueue;
- queue_work(wq, &ioend->io_work);
- if (wait)
- flush_workqueue(wq);
+ if (ioend->io_type != IOMAP_READ) {
+ error = xfs_setfilesize(ioend);
+ ASSERT(!error || error == EAGAIN);
}
+
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend, 0);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ } else
+ xfs_destroy_ioend(ioend);
}
/*
@@ -341,7 +361,7 @@ xfs_submit_ioend_bio(
* but don't update the inode size until I/O completion.
*/
if (xfs_ioend_new_eof(ioend))
- xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
+ xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
WRITE_SYNC_PLUG : WRITE, bio);
@@ -874,6 +894,125 @@ xfs_cluster_write(
}
}
+STATIC void
+xfs_vm_invalidatepage(
+ struct page *page,
+ unsigned long offset)
+{
+ trace_xfs_invalidatepage(page->mapping->host, page, offset);
+ block_invalidatepage(page, offset);
+}
+
+/*
+ * If the page has delalloc buffers on it, we need to punch them out before we
+ * invalidate the page. If we don't, we leave a stale delalloc mapping on the
+ * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
+ * is done on that same region - the delalloc extent is returned when none is
+ * supposed to be there.
+ *
+ * We prevent this by truncating away the delalloc regions on the page before
+ * invalidating it. Because they are delalloc, we can do this without needing a
+ * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
+ * truncation without a transaction as there is no space left for block
+ * reservation (typically why we see a ENOSPC in writeback).
+ *
+ * This is not a performance critical path, so for now just do the punching a
+ * buffer head at a time.
+ */
+STATIC void
+xfs_aops_discard_page(
+ struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct buffer_head *bh, *head;
+ loff_t offset = page_offset(page);
+ ssize_t len = 1 << inode->i_blkbits;
+
+ if (!xfs_is_delayed_page(page, IOMAP_DELAY))
+ goto out_invalidate;
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ goto out_invalidate;
+
+ xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+ "page discard on page %p, inode 0x%llx, offset %llu.",
+ page, ip->i_ino, offset);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ bh = head = page_buffers(page);
+ do {
+ int done;
+ xfs_fileoff_t offset_fsb;
+ xfs_bmbt_irec_t imap;
+ int nimaps = 1;
+ int error;
+ xfs_fsblock_t firstblock;
+ xfs_bmap_free_t flist;
+
+ if (!buffer_delay(bh))
+ goto next_buffer;
+
+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+
+ /*
+ * Map the range first and check that it is a delalloc extent
+ * before trying to unmap the range. Otherwise we will be
+ * trying to remove a real extent (which requires a
+ * transaction) or a hole, which is probably a bad idea...
+ */
+ error = xfs_bmapi(NULL, ip, offset_fsb, 1,
+ XFS_BMAPI_ENTIRE, NULL, 0, &imap,
+ &nimaps, NULL, NULL);
+
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+ "page discard failed delalloc mapping lookup.");
+ }
+ break;
+ }
+ if (!nimaps) {
+ /* nothing there */
+ goto next_buffer;
+ }
+ if (imap.br_startblock != DELAYSTARTBLOCK) {
+ /* been converted, ignore */
+ goto next_buffer;
+ }
+ WARN_ON(imap.br_blockcount == 0);
+
+ /*
+ * Note: while we initialise the firstblock/flist pair, they
+ * should never be used because blocks should never be
+ * allocated or freed for a delalloc extent and hence we need
+ * don't cancel or finish them after the xfs_bunmapi() call.
+ */
+ xfs_bmap_init(&flist, &firstblock);
+ error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
+ &flist, NULL, &done);
+
+ ASSERT(!flist.xbf_count && !flist.xbf_first);
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+ "page discard unable to remove delalloc mapping.");
+ }
+ break;
+ }
+next_buffer:
+ offset += len;
+
+ } while ((bh = bh->b_this_page) != head);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_invalidate:
+ xfs_vm_invalidatepage(page, 0);
+ return;
+}
+
/*
* Calling this without startio set means we are being asked to make a dirty
* page ready for freeing it's buffers. When called with startio set then
@@ -1125,7 +1264,7 @@ error:
*/
if (err != -EAGAIN) {
if (!unmapped)
- block_invalidatepage(page, 0);
+ xfs_aops_discard_page(page);
ClearPageUptodate(page);
}
return err;
@@ -1535,15 +1674,6 @@ xfs_vm_readpages(
return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}
-STATIC void
-xfs_vm_invalidatepage(
- struct page *page,
- unsigned long offset)
-{
- trace_xfs_invalidatepage(page->mapping->host, page, offset);
- block_invalidatepage(page, offset);
-}
-
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readpages = xfs_vm_readpages,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 6f76ba85f19..bd111b7e1da 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -168,75 +168,6 @@ test_page_region(
}
/*
- * Mapping of multi-page buffers into contiguous virtual space
- */
-
-typedef struct a_list {
- void *vm_addr;
- struct a_list *next;
-} a_list_t;
-
-static a_list_t *as_free_head;
-static int as_list_len;
-static DEFINE_SPINLOCK(as_lock);
-
-/*
- * Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
- void *addr)
-{
- a_list_t *aentry;
-
-#ifdef CONFIG_XEN
- /*
- * Xen needs to be able to make sure it can get an exclusive
- * RO mapping of pages it wants to turn into a pagetable. If
- * a newly allocated page is also still being vmap()ed by xfs,
- * it will cause pagetable construction to fail. This is a
- * quick workaround to always eagerly unmap pages so that Xen
- * is happy.
- */
- vunmap(addr);
- return;
-#endif
-
- aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
- if (likely(aentry)) {
- spin_lock(&as_lock);
- aentry->next = as_free_head;
- aentry->vm_addr = addr;
- as_free_head = aentry;
- as_list_len++;
- spin_unlock(&as_lock);
- } else {
- vunmap(addr);
- }
-}
-
-STATIC void
-purge_addresses(void)
-{
- a_list_t *aentry, *old;
-
- if (as_free_head == NULL)
- return;
-
- spin_lock(&as_lock);
- aentry = as_free_head;
- as_free_head = NULL;
- as_list_len = 0;
- spin_unlock(&as_lock);
-
- while ((old = aentry) != NULL) {
- vunmap(aentry->vm_addr);
- aentry = aentry->next;
- kfree(old);
- }
-}
-
-/*
* Internal xfs_buf_t object manipulation
*/
@@ -337,7 +268,8 @@ xfs_buf_free(
uint i;
if (xfs_buf_is_vmapped(bp))
- free_address(bp->b_addr - bp->b_offset);
+ vm_unmap_ram(bp->b_addr - bp->b_offset,
+ bp->b_page_count);
for (i = 0; i < bp->b_page_count; i++) {
struct page *page = bp->b_pages[i];
@@ -457,10 +389,8 @@ _xfs_buf_map_pages(
bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
bp->b_flags |= XBF_MAPPED;
} else if (flags & XBF_MAPPED) {
- if (as_list_len > 64)
- purge_addresses();
- bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
- VM_MAP, PAGE_KERNEL);
+ bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+ -1, PAGE_KERNEL);
if (unlikely(bp->b_addr == NULL))
return -ENOMEM;
bp->b_addr += bp->b_offset;
@@ -1955,9 +1885,6 @@ xfsbufd(
xfs_buf_iostrategy(bp);
count++;
}
-
- if (as_list_len > 0)
- purge_addresses();
if (count)
blk_run_address_space(target->bt_mapping);
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 87b8cbd23d4..846b75aeb2a 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,6 +29,7 @@
#include "xfs_vnodeops.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
/*
* Note that we only accept fileids which are long enough rather than allow
@@ -215,9 +216,28 @@ xfs_fs_get_parent(
return d_obtain_alias(VFS_I(cip));
}
+STATIC int
+xfs_fs_nfs_commit_metadata(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ int error = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip)) {
+ error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
+ XFS_LOG_SYNC, NULL);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ return error;
+}
+
const struct export_operations xfs_export_operations = {
.encode_fh = xfs_fs_encode_fh,
.fh_to_dentry = xfs_fs_fh_to_dentry,
.fh_to_parent = xfs_fs_fh_to_parent,
.get_parent = xfs_fs_get_parent,
+ .commit_metadata = xfs_fs_nfs_commit_metadata,
};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e4caeb28ce2..42dd3bcfba6 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -16,6 +16,7 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_fs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
@@ -34,52 +35,279 @@
#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_rw.h"
#include "xfs_vnodeops.h"
#include "xfs_da_btree.h"
#include "xfs_ioctl.h"
+#include "xfs_trace.h"
#include <linux/dcache.h>
static const struct vm_operations_struct xfs_file_vm_ops;
-STATIC ssize_t
-xfs_file_aio_read(
- struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+/*
+ * xfs_iozero
+ *
+ * xfs_iozero clears the specified range of buffer supplied,
+ * and marks all the affected blocks as valid and modified. If
+ * an affected block is not allocated, it will be allocated. If
+ * an affected block is not completely overwritten, and is not
+ * valid before the operation, it will be read from disk before
+ * being partially zeroed.
+ */
+STATIC int
+xfs_iozero(
+ struct xfs_inode *ip, /* inode */
+ loff_t pos, /* offset in file */
+ size_t count) /* size of data to zero */
{
- struct file *file = iocb->ki_filp;
- int ioflags = 0;
+ struct page *page;
+ struct address_space *mapping;
+ int status;
- BUG_ON(iocb->ki_pos != pos);
- if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
- if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
- return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
- nr_segs, &iocb->ki_pos, ioflags);
+ mapping = VFS_I(ip)->i_mapping;
+ do {
+ unsigned offset, bytes;
+ void *fsdata;
+
+ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ bytes = PAGE_CACHE_SIZE - offset;
+ if (bytes > count)
+ bytes = count;
+
+ status = pagecache_write_begin(NULL, mapping, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (status)
+ break;
+
+ zero_user(page, offset, bytes);
+
+ status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+ page, fsdata);
+ WARN_ON(status <= 0); /* can't return less than zero! */
+ pos += bytes;
+ count -= bytes;
+ status = 0;
+ } while (count);
+
+ return (-status);
+}
+
+STATIC int
+xfs_file_fsync(
+ struct file *file,
+ struct dentry *dentry,
+ int datasync)
+{
+ struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ struct xfs_trans *tp;
+ int error = 0;
+ int log_flushed = 0;
+
+ xfs_itrace_entry(ip);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -XFS_ERROR(EIO);
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+ /*
+ * We always need to make sure that the required inode state is safe on
+ * disk. The inode might be clean but we still might need to force the
+ * log because of committed transactions that haven't hit the disk yet.
+ * Likewise, there could be unflushed non-transactional changes to the
+ * inode core that have to go to disk and this requires us to issue
+ * a synchronous transaction to capture these changes correctly.
+ *
+ * This code relies on the assumption that if the i_update_core field
+ * of the inode is clear and the inode is unpinned then it is clean
+ * and no action is required.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+ /*
+ * First check if the VFS inode is marked dirty. All the dirtying
+ * of non-transactional updates no goes through mark_inode_dirty*,
+ * which allows us to distinguish beteeen pure timestamp updates
+ * and i_size updates which need to be caught for fdatasync.
+ * After that also theck for the dirty state in the XFS inode, which
+ * might gets cleared when the inode gets written out via the AIL
+ * or xfs_iflush_cluster.
+ */
+ if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
+ ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
+ ip->i_update_core) {
+ /*
+ * Kick off a transaction to log the inode core to get the
+ * updates. The sync transaction will also force the log.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0,
+ XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return -error;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ /*
+ * Note - it's possible that we might have pushed ourselves out
+ * of the way during trans_reserve which would flush the inode.
+ * But there's no guarantee that the inode buffer has actually
+ * gone out yet (it's delwri). Plus the buffer could be pinned
+ * anyway if it's part of an inode in another recent
+ * transaction. So we play it safe and fire off the
+ * transaction anyway.
+ */
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ihold(tp, ip);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_set_sync(tp);
+ error = _xfs_trans_commit(tp, 0, &log_flushed);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ } else {
+ /*
+ * Timestamps/size haven't changed since last inode flush or
+ * inode transaction commit. That means either nothing got
+ * written or a transaction committed which caught the updates.
+ * If the latter happened and the transaction hasn't hit the
+ * disk yet, the inode will be still be pinned. If it is,
+ * force the log.
+ */
+ if (xfs_ipincount(ip)) {
+ error = _xfs_log_force_lsn(ip->i_mount,
+ ip->i_itemp->ili_last_lsn,
+ XFS_LOG_SYNC, &log_flushed);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ }
+
+ if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
+ /*
+ * If the log write didn't issue an ordered tag we need
+ * to flush the disk cache for the data device now.
+ */
+ if (!log_flushed)
+ xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
+
+ /*
+ * If this inode is on the RT dev we need to flush that
+ * cache as well.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
+ }
+
+ return -error;
}
STATIC ssize_t
-xfs_file_aio_write(
+xfs_file_aio_read(
struct kiocb *iocb,
- const struct iovec *iov,
+ const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos)
{
struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ size_t size = 0;
+ ssize_t ret = 0;
int ioflags = 0;
+ xfs_fsize_t n;
+ unsigned long seg;
+
+ XFS_STATS_INC(xs_read_calls);
BUG_ON(iocb->ki_pos != pos);
+
if (unlikely(file->f_flags & O_DIRECT))
ioflags |= IO_ISDIRECT;
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= IO_INVIS;
- return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs,
- &iocb->ki_pos, ioflags);
+
+ /* START copy & waste from filemap.c */
+ for (seg = 0; seg < nr_segs; seg++) {
+ const struct iovec *iv = &iovp[seg];
+
+ /*
+ * If any segment has a negative length, or the cumulative
+ * length ever wraps negative then return -EINVAL.
+ */
+ size += iv->iov_len;
+ if (unlikely((ssize_t)(size|iv->iov_len) < 0))
+ return XFS_ERROR(-EINVAL);
+ }
+ /* END copy & waste from filemap.c */
+
+ if (unlikely(ioflags & IO_ISDIRECT)) {
+ xfs_buftarg_t *target =
+ XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+ if ((iocb->ki_pos & target->bt_smask) ||
+ (size & target->bt_smask)) {
+ if (iocb->ki_pos == ip->i_size)
+ return 0;
+ return -XFS_ERROR(EINVAL);
+ }
+ }
+
+ n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
+ if (n <= 0 || size == 0)
+ return 0;
+
+ if (n < size)
+ size = n;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ if (unlikely(ioflags & IO_ISDIRECT))
+ mutex_lock(&inode->i_mutex);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+ if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
+ int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
+ int iolock = XFS_IOLOCK_SHARED;
+
+ ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
+ dmflags, &iolock);
+ if (ret) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ if (unlikely(ioflags & IO_ISDIRECT))
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ if (unlikely(ioflags & IO_ISDIRECT)) {
+ if (inode->i_mapping->nrpages) {
+ ret = -xfs_flushinval_pages(ip,
+ (iocb->ki_pos & PAGE_CACHE_MASK),
+ -1, FI_REMAPF_LOCKED);
+ }
+ mutex_unlock(&inode->i_mutex);
+ if (ret) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return ret;
+ }
+ }
+
+ trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+
+ ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_read_bytes, ret);
+
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return ret;
}
STATIC ssize_t
@@ -87,16 +315,44 @@ xfs_file_splice_read(
struct file *infilp,
loff_t *ppos,
struct pipe_inode_info *pipe,
- size_t len,
+ size_t count,
unsigned int flags)
{
+ struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
int ioflags = 0;
+ ssize_t ret;
+
+ XFS_STATS_INC(xs_read_calls);
if (infilp->f_mode & FMODE_NOCMTIME)
ioflags |= IO_INVIS;
- return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
- infilp, ppos, pipe, len, flags, ioflags);
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+ if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
+ int iolock = XFS_IOLOCK_SHARED;
+ int error;
+
+ error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
+ FILP_DELAY_FLAG(infilp), &iolock);
+ if (error) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return -error;
+ }
+ }
+
+ trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_read_bytes, ret);
+
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return ret;
}
STATIC ssize_t
@@ -104,16 +360,538 @@ xfs_file_splice_write(
struct pipe_inode_info *pipe,
struct file *outfilp,
loff_t *ppos,
- size_t len,
+ size_t count,
unsigned int flags)
{
+ struct inode *inode = outfilp->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t isize, new_size;
int ioflags = 0;
+ ssize_t ret;
+
+ XFS_STATS_INC(xs_write_calls);
if (outfilp->f_mode & FMODE_NOCMTIME)
ioflags |= IO_INVIS;
- return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
- pipe, outfilp, ppos, len, flags, ioflags);
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
+ int iolock = XFS_IOLOCK_EXCL;
+ int error;
+
+ error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
+ FILP_DELAY_FLAG(outfilp), &iolock);
+ if (error) {
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return -error;
+ }
+ }
+
+ new_size = *ppos + count;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (new_size > ip->i_size)
+ ip->i_new_size = new_size;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
+
+ ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_write_bytes, ret);
+
+ isize = i_size_read(inode);
+ if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
+ *ppos = isize;
+
+ if (*ppos > ip->i_size) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (*ppos > ip->i_size)
+ ip->i_size = *ppos;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ if (ip->i_new_size) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ ip->i_new_size = 0;
+ if (ip->i_d.di_size > ip->i_size)
+ ip->i_d.di_size = ip->i_size;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return ret;
+}
+
+/*
+ * This routine is called to handle zeroing any space in the last
+ * block of the file that is beyond the EOF. We do this since the
+ * size is being increased without writing anything to that block
+ * and we don't want anyone to read the garbage on the disk.
+ */
+STATIC int /* error (positive) */
+xfs_zero_last_block(
+ xfs_inode_t *ip,
+ xfs_fsize_t offset,
+ xfs_fsize_t isize)
+{
+ xfs_fileoff_t last_fsb;
+ xfs_mount_t *mp = ip->i_mount;
+ int nimaps;
+ int zero_offset;
+ int zero_len;
+ int error = 0;
+ xfs_bmbt_irec_t imap;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ zero_offset = XFS_B_FSB_OFFSET(mp, isize);
+ if (zero_offset == 0) {
+ /*
+ * There are no extra bytes in the last block on disk to
+ * zero, so return.
+ */
+ return 0;
+ }
+
+ last_fsb = XFS_B_TO_FSBT(mp, isize);
+ nimaps = 1;
+ error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
+ &nimaps, NULL, NULL);
+ if (error) {
+ return error;
+ }
+ ASSERT(nimaps > 0);
+ /*
+ * If the block underlying isize is just a hole, then there
+ * is nothing to zero.
+ */
+ if (imap.br_startblock == HOLESTARTBLOCK) {
+ return 0;
+ }
+ /*
+ * Zero the part of the last block beyond the EOF, and write it
+ * out sync. We need to drop the ilock while we do this so we
+ * don't deadlock when the buffer cache calls back to us.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ zero_len = mp->m_sb.sb_blocksize - zero_offset;
+ if (isize + zero_len > offset)
+ zero_len = offset - isize;
+ error = xfs_iozero(ip, isize, zero_len);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ ASSERT(error >= 0);
+ return error;
+}
+
+/*
+ * Zero any on disk space between the current EOF and the new,
+ * larger EOF. This handles the normal case of zeroing the remainder
+ * of the last block in the file and the unusual case of zeroing blocks
+ * out beyond the size of the file. This second case only happens
+ * with fixed size extents and when the system crashes before the inode
+ * size was updated but after blocks were allocated. If fill is set,
+ * then any holes in the range are filled and zeroed. If not, the holes
+ * are left alone as holes.
+ */
+
+int /* error (positive) */
+xfs_zero_eof(
+ xfs_inode_t *ip,
+ xfs_off_t offset, /* starting I/O offset */
+ xfs_fsize_t isize) /* current inode size */
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_fileoff_t start_zero_fsb;
+ xfs_fileoff_t end_zero_fsb;
+ xfs_fileoff_t zero_count_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_fileoff_t zero_off;
+ xfs_fsize_t zero_len;
+ int nimaps;
+ int error = 0;
+ xfs_bmbt_irec_t imap;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ ASSERT(offset > isize);
+
+ /*
+ * First handle zeroing the block on which isize resides.
+ * We only zero a part of that block so it is handled specially.
+ */
+ error = xfs_zero_last_block(ip, offset, isize);
+ if (error) {
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ return error;
+ }
+
+ /*
+ * Calculate the range between the new size and the old
+ * where blocks needing to be zeroed may exist. To get the
+ * block where the last byte in the file currently resides,
+ * we need to subtract one from the size and truncate back
+ * to a block boundary. We subtract 1 in case the size is
+ * exactly on a block boundary.
+ */
+ last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
+ start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+ end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
+ ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
+ if (last_fsb == end_zero_fsb) {
+ /*
+ * The size was only incremented on its last block.
+ * We took care of that above, so just return.
+ */
+ return 0;
+ }
+
+ ASSERT(start_zero_fsb <= end_zero_fsb);
+ while (start_zero_fsb <= end_zero_fsb) {
+ nimaps = 1;
+ zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+ error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
+ 0, NULL, 0, &imap, &nimaps, NULL, NULL);
+ if (error) {
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ return error;
+ }
+ ASSERT(nimaps > 0);
+
+ if (imap.br_state == XFS_EXT_UNWRITTEN ||
+ imap.br_startblock == HOLESTARTBLOCK) {
+ /*
+ * This loop handles initializing pages that were
+ * partially initialized by the code below this
+ * loop. It basically zeroes the part of the page
+ * that sits on a hole and sets the page as P_HOLE
+ * and calls remapf if it is a mapped file.
+ */
+ start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+ ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+ continue;
+ }
+
+ /*
+ * There are blocks we need to zero.
+ * Drop the inode lock while we're doing the I/O.
+ * We'll still have the iolock to protect us.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
+ zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
+
+ if ((zero_off + zero_len) > offset)
+ zero_len = offset - zero_off;
+
+ error = xfs_iozero(ip, zero_off, zero_len);
+ if (error) {
+ goto out_lock;
+ }
+
+ start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+ ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ }
+
+ return 0;
+
+out_lock:
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ ASSERT(error >= 0);
+ return error;
+}
+
+STATIC ssize_t
+xfs_file_aio_write(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0, error = 0;
+ int ioflags = 0;
+ xfs_fsize_t isize, new_size;
+ int iolock;
+ int eventsent = 0;
+ size_t ocount = 0, count;
+ int need_i_mutex;
+
+ XFS_STATS_INC(xs_write_calls);
+
+ BUG_ON(iocb->ki_pos != pos);
+
+ if (unlikely(file->f_flags & O_DIRECT))
+ ioflags |= IO_ISDIRECT;
+ if (file->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
+
+ error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+ if (error)
+ return error;
+
+ count = ocount;
+ if (count == 0)
+ return 0;
+
+ xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+relock:
+ if (ioflags & IO_ISDIRECT) {
+ iolock = XFS_IOLOCK_SHARED;
+ need_i_mutex = 0;
+ } else {
+ iolock = XFS_IOLOCK_EXCL;
+ need_i_mutex = 1;
+ mutex_lock(&inode->i_mutex);
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
+
+start:
+ error = -generic_write_checks(file, &pos, &count,
+ S_ISBLK(inode->i_mode));
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+ goto out_unlock_mutex;
+ }
+
+ if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
+ !(ioflags & IO_INVIS) && !eventsent)) {
+ int dmflags = FILP_DELAY_FLAG(file);
+
+ if (need_i_mutex)
+ dmflags |= DM_FLAGS_IMUX;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
+ pos, count, dmflags, &iolock);
+ if (error) {
+ goto out_unlock_internal;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ eventsent = 1;
+
+ /*
+ * The iolock was dropped and reacquired in XFS_SEND_DATA
+ * so we have to recheck the size when appending.
+ * We will only "goto start;" once, since having sent the
+ * event prevents another call to XFS_SEND_DATA, which is
+ * what allows the size to change in the first place.
+ */
+ if ((file->f_flags & O_APPEND) && pos != ip->i_size)
+ goto start;
+ }
+
+ if (ioflags & IO_ISDIRECT) {
+ xfs_buftarg_t *target =
+ XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+
+ if ((pos & target->bt_smask) || (count & target->bt_smask)) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+ return XFS_ERROR(-EINVAL);
+ }
+
+ if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+ iolock = XFS_IOLOCK_EXCL;
+ need_i_mutex = 1;
+ mutex_lock(&inode->i_mutex);
+ xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
+ goto start;
+ }
+ }
+
+ new_size = pos + count;
+ if (new_size > ip->i_size)
+ ip->i_new_size = new_size;
+
+ if (likely(!(ioflags & IO_INVIS)))
+ file_update_time(file);
+
+ /*
+ * If the offset is beyond the size of the file, we have a couple
+ * of things to do. First, if there is already space allocated
+ * we need to either create holes or zero the disk or ...
+ *
+ * If there is a page where the previous size lands, we need
+ * to zero it out up to the new size.
+ */
+
+ if (pos > ip->i_size) {
+ error = xfs_zero_eof(ip, pos, ip->i_size);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out_unlock_internal;
+ }
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ /*
+ * If we're writing the file then make sure to clear the
+ * setuid and setgid bits if the process is not being run
+ * by root. This keeps people from modifying setuid and
+ * setgid binaries.
+ */
+ error = -file_remove_suid(file);
+ if (unlikely(error))
+ goto out_unlock_internal;
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+
+ if ((ioflags & IO_ISDIRECT)) {
+ if (mapping->nrpages) {
+ WARN_ON(need_i_mutex == 0);
+ error = xfs_flushinval_pages(ip,
+ (pos & PAGE_CACHE_MASK),
+ -1, FI_REMAPF_LOCKED);
+ if (error)
+ goto out_unlock_internal;
+ }
+
+ if (need_i_mutex) {
+ /* demote the lock now the cached pages are gone */
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ mutex_unlock(&inode->i_mutex);
+
+ iolock = XFS_IOLOCK_SHARED;
+ need_i_mutex = 0;
+ }
+
+ trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
+ ret = generic_file_direct_write(iocb, iovp,
+ &nr_segs, pos, &iocb->ki_pos, count, ocount);
+
+ /*
+ * direct-io write to a hole: fall through to buffered I/O
+ * for completing the rest of the request.
+ */
+ if (ret >= 0 && ret != count) {
+ XFS_STATS_ADD(xs_write_bytes, ret);
+
+ pos += ret;
+ count -= ret;
+
+ ioflags &= ~IO_ISDIRECT;
+ xfs_iunlock(ip, iolock);
+ goto relock;
+ }
+ } else {
+ int enospc = 0;
+ ssize_t ret2 = 0;
+
+write_retry:
+ trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
+ ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
+ pos, &iocb->ki_pos, count, ret);
+ /*
+ * if we just got an ENOSPC, flush the inode now we
+ * aren't holding any page locks and retry *once*
+ */
+ if (ret2 == -ENOSPC && !enospc) {
+ error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+ if (error)
+ goto out_unlock_internal;
+ enospc = 1;
+ goto write_retry;
+ }
+ ret = ret2;
+ }
+
+ current->backing_dev_info = NULL;
+
+ isize = i_size_read(inode);
+ if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
+ iocb->ki_pos = isize;
+
+ if (iocb->ki_pos > ip->i_size) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (iocb->ki_pos > ip->i_size)
+ ip->i_size = iocb->ki_pos;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ if (ret == -ENOSPC &&
+ DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
+ xfs_iunlock(ip, iolock);
+ if (need_i_mutex)
+ mutex_unlock(&inode->i_mutex);
+ error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
+ DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
+ 0, 0, 0); /* Delay flag intentionally unused */
+ if (need_i_mutex)
+ mutex_lock(&inode->i_mutex);
+ xfs_ilock(ip, iolock);
+ if (error)
+ goto out_unlock_internal;
+ goto start;
+ }
+
+ error = -ret;
+ if (ret <= 0)
+ goto out_unlock_internal;
+
+ XFS_STATS_ADD(xs_write_bytes, ret);
+
+ /* Handle various SYNC-type writes */
+ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+ loff_t end = pos + ret - 1;
+ int error2;
+
+ xfs_iunlock(ip, iolock);
+ if (need_i_mutex)
+ mutex_unlock(&inode->i_mutex);
+
+ error2 = filemap_write_and_wait_range(mapping, pos, end);
+ if (!error)
+ error = error2;
+ if (need_i_mutex)
+ mutex_lock(&inode->i_mutex);
+ xfs_ilock(ip, iolock);
+
+ error2 = -xfs_file_fsync(file, file->f_path.dentry,
+ (file->f_flags & __O_SYNC) ? 0 : 1);
+ if (!error)
+ error = error2;
+ }
+
+ out_unlock_internal:
+ if (ip->i_new_size) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ ip->i_new_size = 0;
+ /*
+ * If this was a direct or synchronous I/O that failed (such
+ * as ENOSPC) then part of the I/O may have been written to
+ * disk before the error occured. In this case the on-disk
+ * file size may have been adjusted beyond the in-memory file
+ * size and now needs to be truncated back.
+ */
+ if (ip->i_d.di_size > ip->i_size)
+ ip->i_d.di_size = ip->i_size;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+ xfs_iunlock(ip, iolock);
+ out_unlock_mutex:
+ if (need_i_mutex)
+ mutex_unlock(&inode->i_mutex);
+ return -error;
}
STATIC int
@@ -160,28 +938,6 @@ xfs_file_release(
return -xfs_release(XFS_I(inode));
}
-/*
- * We ignore the datasync flag here because a datasync is effectively
- * identical to an fsync. That is, datasync implies that we need to write
- * only the metadata needed to be able to access the data that is written
- * if we crash after the call completes. Hence if we are writing beyond
- * EOF we have to log the inode size change as well, which makes it a
- * full fsync. If we don't write beyond EOF, the inode core will be
- * clean in memory and so we don't need to log the inode, just like
- * fsync.
- */
-STATIC int
-xfs_file_fsync(
- struct file *file,
- struct dentry *dentry,
- int datasync)
-{
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
-
- xfs_iflags_clear(ip, XFS_ITRUNCATED);
- return -xfs_fsync(ip);
-}
-
STATIC int
xfs_file_readdir(
struct file *filp,
@@ -203,9 +959,9 @@ xfs_file_readdir(
*
* Try to give it an estimate that's good enough, maybe at some
* point we can change the ->readdir prototype to include the
- * buffer size.
+ * buffer size. For now we use the current glibc buffer size.
*/
- bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size);
+ bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
error = xfs_readdir(ip, dirent, bufsize,
(xfs_off_t *)&filp->f_pos, filldir);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e8566bbf0f0..61a99608731 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -91,6 +91,16 @@ xfs_mark_inode_dirty_sync(
mark_inode_dirty_sync(inode);
}
+void
+xfs_mark_inode_dirty(
+ xfs_inode_t *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
+ mark_inode_dirty(inode);
+}
+
/*
* Change the requested timestamp in the given inode.
* We don't lock across timestamp updates, and we don't log them but
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 5af0c81ca1a..facfb323a70 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -88,7 +88,6 @@
#include <xfs_super.h>
#include <xfs_globals.h>
#include <xfs_fs_subr.h>
-#include <xfs_lrw.h>
#include <xfs_buf.h>
/*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
deleted file mode 100644
index eac6f80d786..00000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ /dev/null
@@ -1,796 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_attr.h"
-#include "xfs_inode_item.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_iomap.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-
-#include <linux/capability.h>
-#include <linux/writeback.h>
-
-
-/*
- * xfs_iozero
- *
- * xfs_iozero clears the specified range of buffer supplied,
- * and marks all the affected blocks as valid and modified. If
- * an affected block is not allocated, it will be allocated. If
- * an affected block is not completely overwritten, and is not
- * valid before the operation, it will be read from disk before
- * being partially zeroed.
- */
-STATIC int
-xfs_iozero(
- struct xfs_inode *ip, /* inode */
- loff_t pos, /* offset in file */
- size_t count) /* size of data to zero */
-{
- struct page *page;
- struct address_space *mapping;
- int status;
-
- mapping = VFS_I(ip)->i_mapping;
- do {
- unsigned offset, bytes;
- void *fsdata;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
-
- zero_user(page, offset, bytes);
-
- status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
- page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
- pos += bytes;
- count -= bytes;
- status = 0;
- } while (count);
-
- return (-status);
-}
-
-ssize_t /* bytes read, or (-) error */
-xfs_read(
- xfs_inode_t *ip,
- struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned int segs,
- loff_t *offset,
- int ioflags)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- xfs_mount_t *mp = ip->i_mount;
- size_t size = 0;
- ssize_t ret = 0;
- xfs_fsize_t n;
- unsigned long seg;
-
-
- XFS_STATS_INC(xs_read_calls);
-
- /* START copy & waste from filemap.c */
- for (seg = 0; seg < segs; seg++) {
- const struct iovec *iv = &iovp[seg];
-
- /*
- * If any segment has a negative length, or the cumulative
- * length ever wraps negative then return -EINVAL.
- */
- size += iv->iov_len;
- if (unlikely((ssize_t)(size|iv->iov_len) < 0))
- return XFS_ERROR(-EINVAL);
- }
- /* END copy & waste from filemap.c */
-
- if (unlikely(ioflags & IO_ISDIRECT)) {
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((*offset & target->bt_smask) ||
- (size & target->bt_smask)) {
- if (*offset == ip->i_size) {
- return (0);
- }
- return -XFS_ERROR(EINVAL);
- }
- }
-
- n = XFS_MAXIOFFSET(mp) - *offset;
- if ((n <= 0) || (size == 0))
- return 0;
-
- if (n < size)
- size = n;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- if (unlikely(ioflags & IO_ISDIRECT))
- mutex_lock(&inode->i_mutex);
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
- if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
- int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
- int iolock = XFS_IOLOCK_SHARED;
-
- ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
- dmflags, &iolock);
- if (ret) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- if (unlikely(ioflags & IO_ISDIRECT))
- mutex_unlock(&inode->i_mutex);
- return ret;
- }
- }
-
- if (unlikely(ioflags & IO_ISDIRECT)) {
- if (inode->i_mapping->nrpages)
- ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
- -1, FI_REMAPF_LOCKED);
- mutex_unlock(&inode->i_mutex);
- if (ret) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- return ret;
- }
- }
-
- trace_xfs_file_read(ip, size, *offset, ioflags);
-
- iocb->ki_pos = *offset;
- ret = generic_file_aio_read(iocb, iovp, segs, *offset);
- if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
-
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- return ret;
-}
-
-ssize_t
-xfs_splice_read(
- xfs_inode_t *ip,
- struct file *infilp,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t count,
- int flags,
- int ioflags)
-{
- xfs_mount_t *mp = ip->i_mount;
- ssize_t ret;
-
- XFS_STATS_INC(xs_read_calls);
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
- if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
- int iolock = XFS_IOLOCK_SHARED;
- int error;
-
- error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
- FILP_DELAY_FLAG(infilp), &iolock);
- if (error) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- return -error;
- }
- }
-
- trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
-
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
-
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- return ret;
-}
-
-ssize_t
-xfs_splice_write(
- xfs_inode_t *ip,
- struct pipe_inode_info *pipe,
- struct file *outfilp,
- loff_t *ppos,
- size_t count,
- int flags,
- int ioflags)
-{
- xfs_mount_t *mp = ip->i_mount;
- ssize_t ret;
- struct inode *inode = outfilp->f_mapping->host;
- xfs_fsize_t isize, new_size;
-
- XFS_STATS_INC(xs_write_calls);
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
- if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
- int iolock = XFS_IOLOCK_EXCL;
- int error;
-
- error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
- FILP_DELAY_FLAG(outfilp), &iolock);
- if (error) {
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return -error;
- }
- }
-
- new_size = *ppos + count;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (new_size > ip->i_size)
- ip->i_new_size = new_size;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-
- ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- isize = i_size_read(inode);
- if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
- *ppos = isize;
-
- if (*ppos > ip->i_size) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (*ppos > ip->i_size)
- ip->i_size = *ppos;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- if (ip->i_new_size) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ip->i_new_size = 0;
- if (ip->i_d.di_size > ip->i_size)
- ip->i_d.di_size = ip->i_size;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return ret;
-}
-
-/*
- * This routine is called to handle zeroing any space in the last
- * block of the file that is beyond the EOF. We do this since the
- * size is being increased without writing anything to that block
- * and we don't want anyone to read the garbage on the disk.
- */
-STATIC int /* error (positive) */
-xfs_zero_last_block(
- xfs_inode_t *ip,
- xfs_fsize_t offset,
- xfs_fsize_t isize)
-{
- xfs_fileoff_t last_fsb;
- xfs_mount_t *mp = ip->i_mount;
- int nimaps;
- int zero_offset;
- int zero_len;
- int error = 0;
- xfs_bmbt_irec_t imap;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- zero_offset = XFS_B_FSB_OFFSET(mp, isize);
- if (zero_offset == 0) {
- /*
- * There are no extra bytes in the last block on disk to
- * zero, so return.
- */
- return 0;
- }
-
- last_fsb = XFS_B_TO_FSBT(mp, isize);
- nimaps = 1;
- error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
- &nimaps, NULL, NULL);
- if (error) {
- return error;
- }
- ASSERT(nimaps > 0);
- /*
- * If the block underlying isize is just a hole, then there
- * is nothing to zero.
- */
- if (imap.br_startblock == HOLESTARTBLOCK) {
- return 0;
- }
- /*
- * Zero the part of the last block beyond the EOF, and write it
- * out sync. We need to drop the ilock while we do this so we
- * don't deadlock when the buffer cache calls back to us.
- */
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- zero_len = mp->m_sb.sb_blocksize - zero_offset;
- if (isize + zero_len > offset)
- zero_len = offset - isize;
- error = xfs_iozero(ip, isize, zero_len);
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ASSERT(error >= 0);
- return error;
-}
-
-/*
- * Zero any on disk space between the current EOF and the new,
- * larger EOF. This handles the normal case of zeroing the remainder
- * of the last block in the file and the unusual case of zeroing blocks
- * out beyond the size of the file. This second case only happens
- * with fixed size extents and when the system crashes before the inode
- * size was updated but after blocks were allocated. If fill is set,
- * then any holes in the range are filled and zeroed. If not, the holes
- * are left alone as holes.
- */
-
-int /* error (positive) */
-xfs_zero_eof(
- xfs_inode_t *ip,
- xfs_off_t offset, /* starting I/O offset */
- xfs_fsize_t isize) /* current inode size */
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t start_zero_fsb;
- xfs_fileoff_t end_zero_fsb;
- xfs_fileoff_t zero_count_fsb;
- xfs_fileoff_t last_fsb;
- xfs_fileoff_t zero_off;
- xfs_fsize_t zero_len;
- int nimaps;
- int error = 0;
- xfs_bmbt_irec_t imap;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- ASSERT(offset > isize);
-
- /*
- * First handle zeroing the block on which isize resides.
- * We only zero a part of that block so it is handled specially.
- */
- error = xfs_zero_last_block(ip, offset, isize);
- if (error) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- return error;
- }
-
- /*
- * Calculate the range between the new size and the old
- * where blocks needing to be zeroed may exist. To get the
- * block where the last byte in the file currently resides,
- * we need to subtract one from the size and truncate back
- * to a block boundary. We subtract 1 in case the size is
- * exactly on a block boundary.
- */
- last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
- start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
- ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
- if (last_fsb == end_zero_fsb) {
- /*
- * The size was only incremented on its last block.
- * We took care of that above, so just return.
- */
- return 0;
- }
-
- ASSERT(start_zero_fsb <= end_zero_fsb);
- while (start_zero_fsb <= end_zero_fsb) {
- nimaps = 1;
- zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
- error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
- 0, NULL, 0, &imap, &nimaps, NULL, NULL);
- if (error) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- return error;
- }
- ASSERT(nimaps > 0);
-
- if (imap.br_state == XFS_EXT_UNWRITTEN ||
- imap.br_startblock == HOLESTARTBLOCK) {
- /*
- * This loop handles initializing pages that were
- * partially initialized by the code below this
- * loop. It basically zeroes the part of the page
- * that sits on a hole and sets the page as P_HOLE
- * and calls remapf if it is a mapped file.
- */
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- continue;
- }
-
- /*
- * There are blocks we need to zero.
- * Drop the inode lock while we're doing the I/O.
- * We'll still have the iolock to protect us.
- */
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
- zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
- if ((zero_off + zero_len) > offset)
- zero_len = offset - zero_off;
-
- error = xfs_iozero(ip, zero_off, zero_len);
- if (error) {
- goto out_lock;
- }
-
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- }
-
- return 0;
-
-out_lock:
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ASSERT(error >= 0);
- return error;
-}
-
-ssize_t /* bytes written, or (-) error */
-xfs_write(
- struct xfs_inode *xip,
- struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned int nsegs,
- loff_t *offset,
- int ioflags)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- unsigned long segs = nsegs;
- xfs_mount_t *mp;
- ssize_t ret = 0, error = 0;
- xfs_fsize_t isize, new_size;
- int iolock;
- int eventsent = 0;
- size_t ocount = 0, count;
- loff_t pos;
- int need_i_mutex;
-
- XFS_STATS_INC(xs_write_calls);
-
- error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
- if (error)
- return error;
-
- count = ocount;
- pos = *offset;
-
- if (count == 0)
- return 0;
-
- mp = xip->i_mount;
-
- xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
-relock:
- if (ioflags & IO_ISDIRECT) {
- iolock = XFS_IOLOCK_SHARED;
- need_i_mutex = 0;
- } else {
- iolock = XFS_IOLOCK_EXCL;
- need_i_mutex = 1;
- mutex_lock(&inode->i_mutex);
- }
-
- xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
-
-start:
- error = -generic_write_checks(file, &pos, &count,
- S_ISBLK(inode->i_mode));
- if (error) {
- xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
- goto out_unlock_mutex;
- }
-
- if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
- !(ioflags & IO_INVIS) && !eventsent)) {
- int dmflags = FILP_DELAY_FLAG(file);
-
- if (need_i_mutex)
- dmflags |= DM_FLAGS_IMUX;
-
- xfs_iunlock(xip, XFS_ILOCK_EXCL);
- error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
- pos, count, dmflags, &iolock);
- if (error) {
- goto out_unlock_internal;
- }
- xfs_ilock(xip, XFS_ILOCK_EXCL);
- eventsent = 1;
-
- /*
- * The iolock was dropped and reacquired in XFS_SEND_DATA
- * so we have to recheck the size when appending.
- * We will only "goto start;" once, since having sent the
- * event prevents another call to XFS_SEND_DATA, which is
- * what allows the size to change in the first place.
- */
- if ((file->f_flags & O_APPEND) && pos != xip->i_size)
- goto start;
- }
-
- if (ioflags & IO_ISDIRECT) {
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(xip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
-
- if ((pos & target->bt_smask) || (count & target->bt_smask)) {
- xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
- return XFS_ERROR(-EINVAL);
- }
-
- if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
- xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
- iolock = XFS_IOLOCK_EXCL;
- need_i_mutex = 1;
- mutex_lock(&inode->i_mutex);
- xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
- goto start;
- }
- }
-
- new_size = pos + count;
- if (new_size > xip->i_size)
- xip->i_new_size = new_size;
-
- if (likely(!(ioflags & IO_INVIS)))
- file_update_time(file);
-
- /*
- * If the offset is beyond the size of the file, we have a couple
- * of things to do. First, if there is already space allocated
- * we need to either create holes or zero the disk or ...
- *
- * If there is a page where the previous size lands, we need
- * to zero it out up to the new size.
- */
-
- if (pos > xip->i_size) {
- error = xfs_zero_eof(xip, pos, xip->i_size);
- if (error) {
- xfs_iunlock(xip, XFS_ILOCK_EXCL);
- goto out_unlock_internal;
- }
- }
- xfs_iunlock(xip, XFS_ILOCK_EXCL);
-
- /*
- * If we're writing the file then make sure to clear the
- * setuid and setgid bits if the process is not being run
- * by root. This keeps people from modifying setuid and
- * setgid binaries.
- */
- error = -file_remove_suid(file);
- if (unlikely(error))
- goto out_unlock_internal;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
-
- if ((ioflags & IO_ISDIRECT)) {
- if (mapping->nrpages) {
- WARN_ON(need_i_mutex == 0);
- error = xfs_flushinval_pages(xip,
- (pos & PAGE_CACHE_MASK),
- -1, FI_REMAPF_LOCKED);
- if (error)
- goto out_unlock_internal;
- }
-
- if (need_i_mutex) {
- /* demote the lock now the cached pages are gone */
- xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
- mutex_unlock(&inode->i_mutex);
-
- iolock = XFS_IOLOCK_SHARED;
- need_i_mutex = 0;
- }
-
- trace_xfs_file_direct_write(xip, count, *offset, ioflags);
- ret = generic_file_direct_write(iocb, iovp,
- &segs, pos, offset, count, ocount);
-
- /*
- * direct-io write to a hole: fall through to buffered I/O
- * for completing the rest of the request.
- */
- if (ret >= 0 && ret != count) {
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- pos += ret;
- count -= ret;
-
- ioflags &= ~IO_ISDIRECT;
- xfs_iunlock(xip, iolock);
- goto relock;
- }
- } else {
- int enospc = 0;
- ssize_t ret2 = 0;
-
-write_retry:
- trace_xfs_file_buffered_write(xip, count, *offset, ioflags);
- ret2 = generic_file_buffered_write(iocb, iovp, segs,
- pos, offset, count, ret);
- /*
- * if we just got an ENOSPC, flush the inode now we
- * aren't holding any page locks and retry *once*
- */
- if (ret2 == -ENOSPC && !enospc) {
- error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
- if (error)
- goto out_unlock_internal;
- enospc = 1;
- goto write_retry;
- }
- ret = ret2;
- }
-
- current->backing_dev_info = NULL;
-
- isize = i_size_read(inode);
- if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
- *offset = isize;
-
- if (*offset > xip->i_size) {
- xfs_ilock(xip, XFS_ILOCK_EXCL);
- if (*offset > xip->i_size)
- xip->i_size = *offset;
- xfs_iunlock(xip, XFS_ILOCK_EXCL);
- }
-
- if (ret == -ENOSPC &&
- DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
- xfs_iunlock(xip, iolock);
- if (need_i_mutex)
- mutex_unlock(&inode->i_mutex);
- error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
- DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
- 0, 0, 0); /* Delay flag intentionally unused */
- if (need_i_mutex)
- mutex_lock(&inode->i_mutex);
- xfs_ilock(xip, iolock);
- if (error)
- goto out_unlock_internal;
- goto start;
- }
-
- error = -ret;
- if (ret <= 0)
- goto out_unlock_internal;
-
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- /* Handle various SYNC-type writes */
- if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- loff_t end = pos + ret - 1;
- int error2;
-
- xfs_iunlock(xip, iolock);
- if (need_i_mutex)
- mutex_unlock(&inode->i_mutex);
-
- error2 = filemap_write_and_wait_range(mapping, pos, end);
- if (!error)
- error = error2;
- if (need_i_mutex)
- mutex_lock(&inode->i_mutex);
- xfs_ilock(xip, iolock);
-
- error2 = xfs_fsync(xip);
- if (!error)
- error = error2;
- }
-
- out_unlock_internal:
- if (xip->i_new_size) {
- xfs_ilock(xip, XFS_ILOCK_EXCL);
- xip->i_new_size = 0;
- /*
- * If this was a direct or synchronous I/O that failed (such
- * as ENOSPC) then part of the I/O may have been written to
- * disk before the error occured. In this case the on-disk
- * file size may have been adjusted beyond the in-memory file
- * size and now needs to be truncated back.
- */
- if (xip->i_d.di_size > xip->i_size)
- xip->i_d.di_size = xip->i_size;
- xfs_iunlock(xip, XFS_ILOCK_EXCL);
- }
- xfs_iunlock(xip, iolock);
- out_unlock_mutex:
- if (need_i_mutex)
- mutex_unlock(&inode->i_mutex);
- return -error;
-}
-
-/*
- * If the underlying (data/log/rt) device is readonly, there are some
- * operations that cannot proceed.
- */
-int
-xfs_dev_is_read_only(
- xfs_mount_t *mp,
- char *message)
-{
- if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
- xfs_readonly_buftarg(mp->m_logdev_targp) ||
- (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
- cmn_err(CE_NOTE,
- "XFS: %s required on read-only device.", message);
- cmn_err(CE_NOTE,
- "XFS: write access unavailable, cannot proceed.");
- return EROFS;
- }
- return 0;
-}
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
deleted file mode 100644
index 342ae8c0d01..00000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_LRW_H__
-#define __XFS_LRW_H__
-
-struct xfs_mount;
-struct xfs_inode;
-struct xfs_buf;
-
-extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
-
-extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
-
-#endif /* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 3d4a0c84d63..1947514ce1a 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -44,20 +44,6 @@ xfs_quota_type(int type)
}
STATIC int
-xfs_fs_quota_sync(
- struct super_block *sb,
- int type)
-{
- struct xfs_mount *mp = XFS_M(sb);
-
- if (sb->s_flags & MS_RDONLY)
- return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
- return -xfs_sync_data(mp, 0);
-}
-
-STATIC int
xfs_fs_get_xstate(
struct super_block *sb,
struct fs_quota_stat *fqs)
@@ -82,8 +68,6 @@ xfs_fs_set_xstate(
return -EROFS;
if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
return -ENOSYS;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
if (uflags & XFS_QUOTA_UDQ_ACCT)
flags |= XFS_UQUOTA_ACCT;
@@ -144,14 +128,11 @@ xfs_fs_set_xquota(
return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
return -ESRCH;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
}
const struct quotactl_ops xfs_quotactl_operations = {
- .quota_sync = xfs_fs_quota_sync,
.get_xstate = xfs_fs_get_xstate,
.set_xstate = xfs_fs_set_xstate,
.get_xquota = xfs_fs_get_xquota,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 25ea2408118..71345a370d9 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1063,7 +1063,7 @@ xfs_log_inode(
STATIC int
xfs_fs_write_inode(
struct inode *inode,
- int sync)
+ struct writeback_control *wbc)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1074,11 +1074,7 @@ xfs_fs_write_inode(
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- if (sync) {
- error = xfs_wait_on_pages(ip, 0, -1);
- if (error)
- goto out;
-
+ if (wbc->sync_mode == WB_SYNC_ALL) {
/*
* Make sure the inode has hit stable storage. By using the
* log and the fsync transactions we reduce the IOs we have
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a9f6d20aff4..05cd85317f6 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -607,7 +607,8 @@ xfssyncd(
set_freezable();
timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
for (;;) {
- timeleft = schedule_timeout_interruptible(timeleft);
+ if (list_empty(&mp->m_sync_list))
+ timeleft = schedule_timeout_interruptible(timeleft);
/* swsusp */
try_to_freeze();
if (kthread_should_stop() && list_empty(&mp->m_sync_list))
@@ -627,8 +628,7 @@ xfssyncd(
list_add_tail(&mp->m_sync_work.w_list,
&mp->m_sync_list);
}
- list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
- list_move(&work->w_list, &tmp);
+ list_splice_init(&mp->m_sync_list, &tmp);
spin_unlock(&mp->m_sync_lock);
list_for_each_entry_safe(work, n, &tmp, w_list) {
@@ -688,12 +688,12 @@ xfs_inode_set_reclaim_tag(
struct xfs_perag *pag;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- read_lock(&pag->pag_ici_lock);
+ write_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
__xfs_inode_set_reclaim_tag(pag, ip);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
+ write_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
}
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 856eb3c8d60..5a107601e96 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -52,22 +52,6 @@
#include "quota/xfs_dquot.h"
/*
- * Format fsblock number into a static buffer & return it.
- */
-STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno)
-{
- static char rval[50];
-
- if (bno == NULLFSBLOCK)
- sprintf(rval, "NULLFSBLOCK");
- else if (isnullstartblock(bno))
- sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno));
- else
- sprintf(rval, "%lld", (xfs_dfsbno_t)bno);
- return rval;
-}
-
-/*
* We include this last to have the helpers above available for the trace
* event implementations.
*/
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index a4574dcf506..fcaa62f0799 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -197,13 +197,13 @@ TRACE_EVENT(xfs_iext_insert,
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
- "offset %lld block %s count %lld flag %d caller %pf",
+ "offset %lld block %lld count %lld flag %d caller %pf",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
(long)__entry->idx,
__entry->startoff,
- xfs_fmtfsblock(__entry->startblock),
+ (__int64_t)__entry->startblock,
__entry->blockcount,
__entry->state,
(char *)__entry->caller_ip)
@@ -241,13 +241,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
- "offset %lld block %s count %lld flag %d caller %pf",
+ "offset %lld block %lld count %lld flag %d caller %pf",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
(long)__entry->idx,
__entry->startoff,
- xfs_fmtfsblock(__entry->startblock),
+ (__int64_t)__entry->startblock,
__entry->blockcount,
__entry->state,
(char *)__entry->caller_ip)
@@ -593,7 +593,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
TP_ARGS(dqp),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(__be32, id)
+ __field(u32, id)
__field(unsigned, flags)
__field(unsigned, nrefs)
__field(unsigned long long, res_bcount)
@@ -606,7 +606,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
), \
TP_fast_assign(
__entry->dev = dqp->q_mount->m_super->s_dev;
- __entry->id = dqp->q_core.d_id;
+ __entry->id = be32_to_cpu(dqp->q_core.d_id);
__entry->flags = dqp->dq_flags;
__entry->nrefs = dqp->q_nrefs;
__entry->res_bcount = dqp->q_res_bcount;
@@ -622,10 +622,10 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
be64_to_cpu(dqp->q_core.d_ino_softlimit);
),
TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
- "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] "
- "icnt 0x%llx [hard 0x%llx | soft 0x%llx]",
+ "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
+ "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
MAJOR(__entry->dev), MINOR(__entry->dev),
- be32_to_cpu(__entry->id),
+ __entry->id,
__print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
__entry->nrefs,
__entry->res_bcount,
@@ -881,7 +881,7 @@ TRACE_EVENT(name, \
), \
TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
"offset 0x%llx count %zd flags %s " \
- "startoff 0x%llx startblock %s blockcount 0x%llx", \
+ "startoff 0x%llx startblock %lld blockcount 0x%llx", \
MAJOR(__entry->dev), MINOR(__entry->dev), \
__entry->ino, \
__entry->size, \
@@ -890,7 +890,7 @@ TRACE_EVENT(name, \
__entry->count, \
__print_flags(__entry->flags, "|", BMAPI_FLAGS), \
__entry->startoff, \
- xfs_fmtfsblock(__entry->startblock), \
+ (__int64_t)__entry->startblock, \
__entry->blockcount) \
)
DEFINE_IOMAP_EVENT(xfs_iomap_enter);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 1869fb97381..5c11e4d1701 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2550,22 +2550,134 @@ xfs_bmap_rtalloc(
}
STATIC int
+xfs_bmap_btalloc_nullfb(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_perag *pag;
+ xfs_agnumber_t ag, startag;
+ int notinit = 0;
+ int error;
+
+ if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ else
+ args->type = XFS_ALLOCTYPE_START_BNO;
+ args->total = ap->total;
+
+ /*
+ * Search for an allocation group with a single extent large enough
+ * for the request. If one isn't found, then adjust the minimum
+ * allocation size to the largest space found.
+ */
+ startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (startag == NULLAGNUMBER)
+ startag = ag = 0;
+
+ pag = xfs_perag_get(mp, ag);
+ while (*blen < ap->alen) {
+ if (!pag->pagf_init) {
+ error = xfs_alloc_pagf_init(mp, args->tp, ag,
+ XFS_ALLOC_FLAG_TRYLOCK);
+ if (error) {
+ xfs_perag_put(pag);
+ return error;
+ }
+ }
+
+ /*
+ * See xfs_alloc_fix_freelist...
+ */
+ if (pag->pagf_init) {
+ xfs_extlen_t longest;
+ longest = xfs_alloc_longest_free_extent(mp, pag);
+ if (*blen < longest)
+ *blen = longest;
+ } else
+ notinit = 1;
+
+ if (xfs_inode_is_filestream(ap->ip)) {
+ if (*blen >= ap->alen)
+ break;
+
+ if (ap->userdata) {
+ /*
+ * If startag is an invalid AG, we've
+ * come here once before and
+ * xfs_filestream_new_ag picked the
+ * best currently available.
+ *
+ * Don't continue looping, since we
+ * could loop forever.
+ */
+ if (startag == NULLAGNUMBER)
+ break;
+
+ error = xfs_filestream_new_ag(ap, &ag);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ /* loop again to set 'blen'*/
+ startag = NULLAGNUMBER;
+ pag = xfs_perag_get(mp, ag);
+ continue;
+ }
+ }
+ if (++ag == mp->m_sb.sb_agcount)
+ ag = 0;
+ if (ag == startag)
+ break;
+ xfs_perag_put(pag);
+ pag = xfs_perag_get(mp, ag);
+ }
+ xfs_perag_put(pag);
+
+ /*
+ * Since the above loop did a BUF_TRYLOCK, it is
+ * possible that there is space for this request.
+ */
+ if (notinit || *blen < ap->minlen)
+ args->minlen = ap->minlen;
+ /*
+ * If the best seen length is less than the request
+ * length, use the best as the minimum.
+ */
+ else if (*blen < ap->alen)
+ args->minlen = *blen;
+ /*
+ * Otherwise we've seen an extent as big as alen,
+ * use that as the minimum.
+ */
+ else
+ args->minlen = ap->alen;
+
+ /*
+ * set the failure fallback case to look in the selected
+ * AG as the stream may have moved.
+ */
+ if (xfs_inode_is_filestream(ap->ip))
+ ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+
+ return 0;
+}
+
+STATIC int
xfs_bmap_btalloc(
xfs_bmalloca_t *ap) /* bmap alloc argument struct */
{
xfs_mount_t *mp; /* mount point structure */
xfs_alloctype_t atype = 0; /* type for allocation routines */
xfs_extlen_t align; /* minimum allocation alignment */
- xfs_agnumber_t ag;
xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
- xfs_agnumber_t startag;
+ xfs_agnumber_t ag;
xfs_alloc_arg_t args;
xfs_extlen_t blen;
xfs_extlen_t nextminlen = 0;
- xfs_perag_t *pag;
int nullfb; /* true if ap->firstblock isn't set */
int isaligned;
- int notinit;
int tryagain;
int error;
@@ -2612,103 +2724,9 @@ xfs_bmap_btalloc(
args.firstblock = ap->firstblock;
blen = 0;
if (nullfb) {
- if (ap->userdata && xfs_inode_is_filestream(ap->ip))
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- else
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.total = ap->total;
-
- /*
- * Search for an allocation group with a single extent
- * large enough for the request.
- *
- * If one isn't found, then adjust the minimum allocation
- * size to the largest space found.
- */
- startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
- if (startag == NULLAGNUMBER)
- startag = ag = 0;
- notinit = 0;
- pag = xfs_perag_get(mp, ag);
- while (blen < ap->alen) {
- if (!pag->pagf_init &&
- (error = xfs_alloc_pagf_init(mp, args.tp,
- ag, XFS_ALLOC_FLAG_TRYLOCK))) {
- xfs_perag_put(pag);
- return error;
- }
- /*
- * See xfs_alloc_fix_freelist...
- */
- if (pag->pagf_init) {
- xfs_extlen_t longest;
- longest = xfs_alloc_longest_free_extent(mp, pag);
- if (blen < longest)
- blen = longest;
- } else
- notinit = 1;
-
- if (xfs_inode_is_filestream(ap->ip)) {
- if (blen >= ap->alen)
- break;
-
- if (ap->userdata) {
- /*
- * If startag is an invalid AG, we've
- * come here once before and
- * xfs_filestream_new_ag picked the
- * best currently available.
- *
- * Don't continue looping, since we
- * could loop forever.
- */
- if (startag == NULLAGNUMBER)
- break;
-
- error = xfs_filestream_new_ag(ap, &ag);
- xfs_perag_put(pag);
- if (error)
- return error;
-
- /* loop again to set 'blen'*/
- startag = NULLAGNUMBER;
- pag = xfs_perag_get(mp, ag);
- continue;
- }
- }
- if (++ag == mp->m_sb.sb_agcount)
- ag = 0;
- if (ag == startag)
- break;
- xfs_perag_put(pag);
- pag = xfs_perag_get(mp, ag);
- }
- xfs_perag_put(pag);
- /*
- * Since the above loop did a BUF_TRYLOCK, it is
- * possible that there is space for this request.
- */
- if (notinit || blen < ap->minlen)
- args.minlen = ap->minlen;
- /*
- * If the best seen length is less than the request
- * length, use the best as the minimum.
- */
- else if (blen < ap->alen)
- args.minlen = blen;
- /*
- * Otherwise we've seen an extent as big as alen,
- * use that as the minimum.
- */
- else
- args.minlen = ap->alen;
-
- /*
- * set the failure fallback case to look in the selected
- * AG as the stream may have moved.
- */
- if (xfs_inode_is_filestream(ap->ip))
- ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+ error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+ if (error)
+ return error;
} else if (ap->low) {
if (xfs_inode_is_filestream(ap->ip))
args.type = XFS_ALLOCTYPE_FIRST_AG;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f52ac276277..7cf7220e7d5 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -292,7 +292,8 @@ typedef struct xfs_bstat {
__s32 bs_extents; /* number of extents */
__u32 bs_gen; /* generation count */
__u16 bs_projid; /* project id */
- unsigned char bs_pad[14]; /* pad space, unused */
+ __u16 bs_forkoff; /* inode fork offset in bytes */
+ unsigned char bs_pad[12]; /* pad space, unused */
__u32 bs_dmevmask; /* DMIG event mask */
__u16 bs_dmstate; /* DMIG state info */
__u16 bs_aextents; /* attribute number of extents */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e281eb4a1c4..6845db90818 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -190,13 +190,12 @@ xfs_iget_cache_hit(
trace_xfs_iget_reclaim(ip);
/*
- * We need to set XFS_INEW atomically with clearing the
- * reclaimable tag so that we do have an indicator of the
- * inode still being initialized.
+ * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+ * from stomping over us while we recycle the inode. We can't
+ * clear the radix tree reclaimable tag yet as it requires
+ * pag_ici_lock to be held exclusive.
*/
- ip->i_flags |= XFS_INEW;
- ip->i_flags &= ~XFS_IRECLAIMABLE;
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ ip->i_flags |= XFS_IRECLAIM;
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
@@ -216,7 +215,15 @@ xfs_iget_cache_hit(
trace_xfs_iget_reclaim(ip);
goto out_error;
}
+
+ write_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
+ ip->i_flags |= XFS_INEW;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
inode->i_state = I_NEW;
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
} else {
/* If the VFS inode is being torn down, pause and try again. */
if (!igrab(inode)) {
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index fa31360046d..0ffd5644704 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2439,75 +2439,31 @@ xfs_idestroy_fork(
}
/*
- * Increment the pin count of the given buffer.
- * This value is protected by ipinlock spinlock in the mount structure.
+ * This is called to unpin an inode. The caller must have the inode locked
+ * in at least shared mode so that the buffer cannot be subsequently pinned
+ * once someone is waiting for it to be unpinned.
*/
-void
-xfs_ipin(
- xfs_inode_t *ip)
-{
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- atomic_inc(&ip->i_pincount);
-}
-
-/*
- * Decrement the pin count of the given inode, and wake up
- * anyone in xfs_iwait_unpin() if the count goes to 0. The
- * inode must have been previously pinned with a call to xfs_ipin().
- */
-void
-xfs_iunpin(
- xfs_inode_t *ip)
-{
- ASSERT(atomic_read(&ip->i_pincount) > 0);
-
- if (atomic_dec_and_test(&ip->i_pincount))
- wake_up(&ip->i_ipin_wait);
-}
-
-/*
- * This is called to unpin an inode. It can be directed to wait or to return
- * immediately without waiting for the inode to be unpinned. The caller must
- * have the inode locked in at least shared mode so that the buffer cannot be
- * subsequently pinned once someone is waiting for it to be unpinned.
- */
-STATIC void
-__xfs_iunpin_wait(
- xfs_inode_t *ip,
- int wait)
+static void
+xfs_iunpin_nowait(
+ struct xfs_inode *ip)
{
- xfs_inode_log_item_t *iip = ip->i_itemp;
-
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- if (atomic_read(&ip->i_pincount) == 0)
- return;
/* Give the log a push to start the unpinning I/O */
- if (iip && iip->ili_last_lsn)
- xfs_log_force_lsn(ip->i_mount, iip->ili_last_lsn, 0);
- else
- xfs_log_force(ip->i_mount, 0);
+ xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
- if (wait)
- wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
}
void
xfs_iunpin_wait(
- xfs_inode_t *ip)
+ struct xfs_inode *ip)
{
- __xfs_iunpin_wait(ip, 1);
-}
-
-static inline void
-xfs_iunpin_nowait(
- xfs_inode_t *ip)
-{
- __xfs_iunpin_wait(ip, 0);
+ if (xfs_ipincount(ip)) {
+ xfs_iunpin_nowait(ip);
+ wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
+ }
}
-
/*
* xfs_iextents_copy()
*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 6c912b02759..9965e40a461 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -471,8 +471,6 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
void xfs_iext_realloc(xfs_inode_t *, int, int);
-void xfs_ipin(xfs_inode_t *);
-void xfs_iunpin(xfs_inode_t *);
void xfs_iunpin_wait(xfs_inode_t *);
int xfs_iflush(xfs_inode_t *, uint);
void xfs_ichgtime(xfs_inode_t *, int);
@@ -480,6 +478,7 @@ void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
void xfs_synchronize_times(xfs_inode_t *);
+void xfs_mark_inode_dirty(xfs_inode_t *);
void xfs_mark_inode_dirty_sync(xfs_inode_t *);
#define IHOLD(ip) \
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d4dc063111f..7bfea854015 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -535,23 +535,23 @@ xfs_inode_item_format(
/*
* This is called to pin the inode associated with the inode log
- * item in memory so it cannot be written out. Do this by calling
- * xfs_ipin() to bump the pin count in the inode while holding the
- * inode pin lock.
+ * item in memory so it cannot be written out.
*/
STATIC void
xfs_inode_item_pin(
xfs_inode_log_item_t *iip)
{
ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
- xfs_ipin(iip->ili_inode);
+
+ atomic_inc(&iip->ili_inode->i_pincount);
}
/*
* This is called to unpin the inode associated with the inode log
* item which was previously pinned with a call to xfs_inode_item_pin().
- * Just call xfs_iunpin() on the inode to do this.
+ *
+ * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
*/
/* ARGSUSED */
STATIC void
@@ -559,7 +559,11 @@ xfs_inode_item_unpin(
xfs_inode_log_item_t *iip,
int stale)
{
- xfs_iunpin(iip->ili_inode);
+ struct xfs_inode *ip = iip->ili_inode;
+
+ ASSERT(atomic_read(&ip->i_pincount) > 0);
+ if (atomic_dec_and_test(&ip->i_pincount))
+ wake_up(&ip->i_ipin_wait);
}
/* ARGSUSED */
@@ -568,7 +572,7 @@ xfs_inode_item_unpin_remove(
xfs_inode_log_item_t *iip,
xfs_trans_t *tp)
{
- xfs_iunpin(iip->ili_inode);
+ xfs_inode_item_unpin(iip, 0);
}
/*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 3af02314c60..b1b801e4a28 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -106,6 +106,7 @@ xfs_bulkstat_one_iget(
buf->bs_dmevmask = dic->di_dmevmask;
buf->bs_dmstate = dic->di_dmstate;
buf->bs_aextents = dic->di_anextents;
+ buf->bs_forkoff = XFS_IFORK_BOFF(ip);
switch (dic->di_format) {
case XFS_DINODE_FMT_DEV:
@@ -176,6 +177,7 @@ xfs_bulkstat_one_dinode(
buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
buf->bs_aextents = be16_to_cpu(dic->di_anextents);
+ buf->bs_forkoff = XFS_DFORK_BOFF(dic);
switch (dic->di_format) {
case XFS_DINODE_FMT_DEV:
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4f16be4b6ee..e8fba92d7cd 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -60,7 +60,7 @@ STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
STATIC void xlog_dealloc_log(xlog_t *log);
STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
- int nentries, xfs_log_ticket_t tic,
+ int nentries, struct xlog_ticket *tic,
xfs_lsn_t *start_lsn,
xlog_in_core_t **commit_iclog,
uint flags);
@@ -243,14 +243,14 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
* out when the next write occurs.
*/
xfs_lsn_t
-xfs_log_done(xfs_mount_t *mp,
- xfs_log_ticket_t xtic,
- void **iclog,
- uint flags)
+xfs_log_done(
+ struct xfs_mount *mp,
+ struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog,
+ uint flags)
{
- xlog_t *log = mp->m_log;
- xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic;
- xfs_lsn_t lsn = 0;
+ struct log *log = mp->m_log;
+ xfs_lsn_t lsn = 0;
if (XLOG_FORCED_SHUTDOWN(log) ||
/*
@@ -258,8 +258,7 @@ xfs_log_done(xfs_mount_t *mp,
* If we get an error, just continue and give back the log ticket.
*/
(((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
- (xlog_commit_record(mp, ticket,
- (xlog_in_core_t **)iclog, &lsn)))) {
+ (xlog_commit_record(mp, ticket, iclog, &lsn)))) {
lsn = (xfs_lsn_t) -1;
if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
flags |= XFS_LOG_REL_PERM_RESERV;
@@ -289,7 +288,7 @@ xfs_log_done(xfs_mount_t *mp,
}
return lsn;
-} /* xfs_log_done */
+}
/*
* Attaches a new iclog I/O completion callback routine during
@@ -298,11 +297,11 @@ xfs_log_done(xfs_mount_t *mp,
* executing the callback at an appropriate time.
*/
int
-xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
- void *iclog_hndl, /* iclog to hang callback off */
- xfs_log_callback_t *cb)
+xfs_log_notify(
+ struct xfs_mount *mp,
+ struct xlog_in_core *iclog,
+ xfs_log_callback_t *cb)
{
- xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
int abortflg;
spin_lock(&iclog->ic_callback_lock);
@@ -316,16 +315,14 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
}
spin_unlock(&iclog->ic_callback_lock);
return abortflg;
-} /* xfs_log_notify */
+}
int
-xfs_log_release_iclog(xfs_mount_t *mp,
- void *iclog_hndl)
+xfs_log_release_iclog(
+ struct xfs_mount *mp,
+ struct xlog_in_core *iclog)
{
- xlog_t *log = mp->m_log;
- xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
-
- if (xlog_state_release_iclog(log, iclog)) {
+ if (xlog_state_release_iclog(mp->m_log, iclog)) {
xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
return EIO;
}
@@ -344,17 +341,18 @@ xfs_log_release_iclog(xfs_mount_t *mp,
* reservation, we prevent over allocation problems.
*/
int
-xfs_log_reserve(xfs_mount_t *mp,
- int unit_bytes,
- int cnt,
- xfs_log_ticket_t *ticket,
- __uint8_t client,
- uint flags,
- uint t_type)
+xfs_log_reserve(
+ struct xfs_mount *mp,
+ int unit_bytes,
+ int cnt,
+ struct xlog_ticket **ticket,
+ __uint8_t client,
+ uint flags,
+ uint t_type)
{
- xlog_t *log = mp->m_log;
- xlog_ticket_t *internal_ticket;
- int retval = 0;
+ struct log *log = mp->m_log;
+ struct xlog_ticket *internal_ticket;
+ int retval = 0;
ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
@@ -367,7 +365,7 @@ xfs_log_reserve(xfs_mount_t *mp,
if (*ticket != NULL) {
ASSERT(flags & XFS_LOG_PERM_RESERV);
- internal_ticket = (xlog_ticket_t *)*ticket;
+ internal_ticket = *ticket;
trace_xfs_log_reserve(log, internal_ticket);
@@ -519,7 +517,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
xlog_in_core_t *first_iclog;
#endif
xfs_log_iovec_t reg[1];
- xfs_log_ticket_t tic = NULL;
+ xlog_ticket_t *tic = NULL;
xfs_lsn_t lsn;
int error;
@@ -656,24 +654,24 @@ xfs_log_unmount(xfs_mount_t *mp)
* transaction occur with one call to xfs_log_write().
*/
int
-xfs_log_write(xfs_mount_t * mp,
- xfs_log_iovec_t reg[],
- int nentries,
- xfs_log_ticket_t tic,
- xfs_lsn_t *start_lsn)
+xfs_log_write(
+ struct xfs_mount *mp,
+ struct xfs_log_iovec reg[],
+ int nentries,
+ struct xlog_ticket *tic,
+ xfs_lsn_t *start_lsn)
{
- int error;
- xlog_t *log = mp->m_log;
+ struct log *log = mp->m_log;
+ int error;
if (XLOG_FORCED_SHUTDOWN(log))
return XFS_ERROR(EIO);
- if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
+ error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0);
+ if (error)
xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
- }
return error;
-} /* xfs_log_write */
-
+}
void
xfs_log_move_tail(xfs_mount_t *mp,
@@ -1642,16 +1640,16 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
* bytes have been written out.
*/
STATIC int
-xlog_write(xfs_mount_t * mp,
- xfs_log_iovec_t reg[],
- int nentries,
- xfs_log_ticket_t tic,
- xfs_lsn_t *start_lsn,
- xlog_in_core_t **commit_iclog,
- uint flags)
+xlog_write(
+ struct xfs_mount *mp,
+ struct xfs_log_iovec reg[],
+ int nentries,
+ struct xlog_ticket *ticket,
+ xfs_lsn_t *start_lsn,
+ struct xlog_in_core **commit_iclog,
+ uint flags)
{
xlog_t *log = mp->m_log;
- xlog_ticket_t *ticket = (xlog_ticket_t *)tic;
xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */
xlog_op_header_t *logop_head; /* ptr to log operation header */
__psint_t ptr; /* copy address into data region */
@@ -1765,7 +1763,7 @@ xlog_write(xfs_mount_t * mp,
default:
xfs_fs_cmn_err(CE_WARN, mp,
"Bad XFS transaction clientid 0x%x in ticket 0x%p",
- logop_head->oh_clientid, tic);
+ logop_head->oh_clientid, ticket);
return XFS_ERROR(EIO);
}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 7074be9d13e..97a24c7795a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -110,8 +110,6 @@ typedef struct xfs_log_iovec {
uint i_type; /* type of region */
} xfs_log_iovec_t;
-typedef void* xfs_log_ticket_t;
-
/*
* Structure used to pass callback function and the function's argument
* to the log manager.
@@ -126,10 +124,12 @@ typedef struct xfs_log_callback {
#ifdef __KERNEL__
/* Log manager interfaces */
struct xfs_mount;
+struct xlog_in_core;
struct xlog_ticket;
+
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
- xfs_log_ticket_t ticket,
- void **iclog,
+ struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog,
uint flags);
int _xfs_log_force(struct xfs_mount *mp,
uint flags,
@@ -151,21 +151,21 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
void xfs_log_move_tail(struct xfs_mount *mp,
xfs_lsn_t tail_lsn);
int xfs_log_notify(struct xfs_mount *mp,
- void *iclog,
+ struct xlog_in_core *iclog,
xfs_log_callback_t *callback_entry);
int xfs_log_release_iclog(struct xfs_mount *mp,
- void *iclog_hndl);
+ struct xlog_in_core *iclog);
int xfs_log_reserve(struct xfs_mount *mp,
int length,
int count,
- xfs_log_ticket_t *ticket,
+ struct xlog_ticket **ticket,
__uint8_t clientid,
uint flags,
uint t_type);
int xfs_log_write(struct xfs_mount *mp,
xfs_log_iovec_t region[],
int nentries,
- xfs_log_ticket_t ticket,
+ struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn);
int xfs_log_unmount_write(struct xfs_mount *mp);
void xfs_log_unmount(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6afaaeb2950..e79b56b4bca 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1097,13 +1097,15 @@ xfs_default_resblks(xfs_mount_t *mp)
__uint64_t resblks;
/*
- * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
- * This may drive us straight to ENOSPC on mount, but that implies
- * we were already there on the last unmount. Warn if this occurs.
+ * We default to 5% or 8192 fsbs of space reserved, whichever is
+ * smaller. This is intended to cover concurrent allocation
+ * transactions when we initially hit enospc. These each require a 4
+ * block reservation. Hence by default we cover roughly 2000 concurrent
+ * allocation reservations.
*/
resblks = mp->m_sb.sb_dblocks;
do_div(resblks, 20);
- resblks = min_t(__uint64_t, resblks, 1024);
+ resblks = min_t(__uint64_t, resblks, 8192);
return resblks;
}
@@ -1417,6 +1419,9 @@ xfs_mountfs(
* when at ENOSPC. This is needed for operations like create with
* attr, unwritten extent conversion at ENOSPC, etc. Data allocations
* are not allowed to use this reserved space.
+ *
+ * This may drive us straight to ENOSPC on mount, but that implies
+ * we were already there on the last unmount. Warn if this occurs.
*/
if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
resblks = xfs_default_resblks(mp);
@@ -1725,26 +1730,30 @@ xfs_mod_incore_sb_unlocked(
lcounter += rem;
}
} else { /* Taking blocks away */
-
lcounter += delta;
+ if (lcounter >= 0) {
+ mp->m_sb.sb_fdblocks = lcounter +
+ XFS_ALLOC_SET_ASIDE(mp);
+ return 0;
+ }
- /*
- * If were out of blocks, use any available reserved blocks if
- * were allowed to.
- */
+ /*
+ * We are out of blocks, use any available reserved
+ * blocks if were allowed to.
+ */
+ if (!rsvd)
+ return XFS_ERROR(ENOSPC);
- if (lcounter < 0) {
- if (rsvd) {
- lcounter = (long long)mp->m_resblks_avail + delta;
- if (lcounter < 0) {
- return XFS_ERROR(ENOSPC);
- }
- mp->m_resblks_avail = lcounter;
- return 0;
- } else { /* not reserved */
- return XFS_ERROR(ENOSPC);
- }
+ lcounter = (long long)mp->m_resblks_avail + delta;
+ if (lcounter >= 0) {
+ mp->m_resblks_avail = lcounter;
+ return 0;
}
+ printk_once(KERN_WARNING
+ "Filesystem \"%s\": reserve blocks depleted! "
+ "Consider increasing reserve pool size.",
+ mp->m_fsname);
+ return XFS_ERROR(ENOSPC);
}
mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -2052,6 +2061,26 @@ xfs_mount_log_sb(
return error;
}
+/*
+ * If the underlying (data/log/rt) device is readonly, there are some
+ * operations that cannot proceed.
+ */
+int
+xfs_dev_is_read_only(
+ struct xfs_mount *mp,
+ char *message)
+{
+ if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
+ xfs_readonly_buftarg(mp->m_logdev_targp) ||
+ (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
+ cmn_err(CE_NOTE,
+ "XFS: %s required on read-only device.", message);
+ cmn_err(CE_NOTE,
+ "XFS: write access unavailable, cannot proceed.");
+ return EROFS;
+ }
+ return 0;
+}
#ifdef HAVE_PERCPU_SB
/*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 14dafd60823..4fa0bc7b983 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -436,6 +436,8 @@ extern void xfs_freesb(xfs_mount_t *);
extern int xfs_fs_writable(xfs_mount_t *);
extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
+extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
+
extern int xfs_dmops_get(struct xfs_mount *);
extern void xfs_dmops_put(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index be942d4e332..f73e358bae8 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -796,7 +796,7 @@ _xfs_trans_commit(
int sync;
#define XFS_TRANS_LOGVEC_COUNT 16
xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
- void *commit_iclog;
+ struct xlog_in_core *commit_iclog;
int shutdown;
commit_lsn = -1;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c93e3a10285..79c8bab9dff 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -910,7 +910,7 @@ typedef struct xfs_trans {
unsigned int t_blk_res_used; /* # of resvd blocks used */
unsigned int t_rtx_res; /* # of rt extents resvd */
unsigned int t_rtx_res_used; /* # of resvd rt extents used */
- xfs_log_ticket_t t_ticket; /* log mgr ticket */
+ struct xlog_ticket *t_ticket; /* log mgr ticket */
xfs_lsn_t t_lsn; /* log seq num of start of
* transaction. */
xfs_lsn_t t_commit_lsn; /* log seq num of end of
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 5ffd544434e..fb586360d1c 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -46,6 +46,65 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
xfs_daddr_t, int);
+/*
+ * Add the locked buffer to the transaction.
+ *
+ * The buffer must be locked, and it cannot be associated with any
+ * transaction.
+ *
+ * If the buffer does not yet have a buf log item associated with it,
+ * then allocate one for it. Then add the buf item to the transaction.
+ */
+STATIC void
+_xfs_trans_bjoin(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ int reset_recur)
+{
+ struct xfs_buf_log_item *bip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
+
+ /*
+ * The xfs_buf_log_item pointer is stored in b_fsprivate. If
+ * it doesn't have one yet, then allocate one and initialize it.
+ * The checks to see if one is there are in xfs_buf_item_init().
+ */
+ xfs_buf_item_init(bp, tp->t_mountp);
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+ if (reset_recur)
+ bip->bli_recur = 0;
+
+ /*
+ * Take a reference for this transaction on the buf item.
+ */
+ atomic_inc(&bip->bli_refcount);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
+
+ /*
+ * Initialize b_fsprivate2 so we can find it with incore_match()
+ * in xfs_trans_get_buf() and friends above.
+ */
+ XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+}
+
+void
+xfs_trans_bjoin(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ _xfs_trans_bjoin(tp, bp, 0);
+ trace_xfs_trans_bjoin(bp->b_fspriv);
+}
/*
* Get and lock the buffer for the caller if it is not already
@@ -132,40 +191,8 @@ xfs_trans_get_buf(xfs_trans_t *tp,
ASSERT(!XFS_BUF_GETERROR(bp));
- /*
- * The xfs_buf_log_item pointer is stored in b_fsprivate. If
- * it doesn't have one yet, then allocate one and initialize it.
- * The checks to see if one is there are in xfs_buf_item_init().
- */
- xfs_buf_item_init(bp, tp->t_mountp);
-
- /*
- * Set the recursion count for the buffer within this transaction
- * to 0.
- */
- bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
- ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
- bip->bli_recur = 0;
-
- /*
- * Take a reference for this transaction on the buf item.
- */
- atomic_inc(&bip->bli_refcount);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
-
- /*
- * Initialize b_fsprivate2 so we can find it with incore_match()
- * above.
- */
- XFS_BUF_SET_FSPRIVATE2(bp, tp);
-
- trace_xfs_trans_get_buf(bip);
+ _xfs_trans_bjoin(tp, bp, 1);
+ trace_xfs_trans_get_buf(bp->b_fspriv);
return (bp);
}
@@ -210,44 +237,11 @@ xfs_trans_getsb(xfs_trans_t *tp,
}
bp = xfs_getsb(mp, flags);
- if (bp == NULL) {
+ if (bp == NULL)
return NULL;
- }
-
- /*
- * The xfs_buf_log_item pointer is stored in b_fsprivate. If
- * it doesn't have one yet, then allocate one and initialize it.
- * The checks to see if one is there are in xfs_buf_item_init().
- */
- xfs_buf_item_init(bp, mp);
-
- /*
- * Set the recursion count for the buffer within this transaction
- * to 0.
- */
- bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
- ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
- bip->bli_recur = 0;
-
- /*
- * Take a reference for this transaction on the buf item.
- */
- atomic_inc(&bip->bli_refcount);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
-
- /*
- * Initialize b_fsprivate2 so we can find it with incore_match()
- * above.
- */
- XFS_BUF_SET_FSPRIVATE2(bp, tp);
- trace_xfs_trans_getsb(bip);
+ _xfs_trans_bjoin(tp, bp, 1);
+ trace_xfs_trans_getsb(bp->b_fspriv);
return (bp);
}
@@ -425,40 +419,9 @@ xfs_trans_read_buf(
if (XFS_FORCED_SHUTDOWN(mp))
goto shutdown_abort;
- /*
- * The xfs_buf_log_item pointer is stored in b_fsprivate. If
- * it doesn't have one yet, then allocate one and initialize it.
- * The checks to see if one is there are in xfs_buf_item_init().
- */
- xfs_buf_item_init(bp, tp->t_mountp);
+ _xfs_trans_bjoin(tp, bp, 1);
+ trace_xfs_trans_read_buf(bp->b_fspriv);
- /*
- * Set the recursion count for the buffer within this transaction
- * to 0.
- */
- bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
- ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
- bip->bli_recur = 0;
-
- /*
- * Take a reference for this transaction on the buf item.
- */
- atomic_inc(&bip->bli_refcount);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
-
- /*
- * Initialize b_fsprivate2 so we can find it with incore_match()
- * above.
- */
- XFS_BUF_SET_FSPRIVATE2(bp, tp);
-
- trace_xfs_trans_read_buf(bip);
*bpp = bp;
return 0;
@@ -623,53 +586,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
}
/*
- * Add the locked buffer to the transaction.
- * The buffer must be locked, and it cannot be associated with any
- * transaction.
- *
- * If the buffer does not yet have a buf log item associated with it,
- * then allocate one for it. Then add the buf item to the transaction.
- */
-void
-xfs_trans_bjoin(xfs_trans_t *tp,
- xfs_buf_t *bp)
-{
- xfs_buf_log_item_t *bip;
-
- ASSERT(XFS_BUF_ISBUSY(bp));
- ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
-
- /*
- * The xfs_buf_log_item pointer is stored in b_fsprivate. If
- * it doesn't have one yet, then allocate one and initialize it.
- * The checks to see if one is there are in xfs_buf_item_init().
- */
- xfs_buf_item_init(bp, tp->t_mountp);
- bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
- ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-
- /*
- * Take a reference for this transaction on the buf item.
- */
- atomic_inc(&bip->bli_refcount);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
-
- /*
- * Initialize b_fsprivate2 so we can find it with incore_match()
- * in xfs_trans_get_buf() and friends above.
- */
- XFS_BUF_SET_FSPRIVATE2(bp, tp);
-
- trace_xfs_trans_bjoin(bip);
-}
-
-/*
* Mark the buffer as not needing to be unlocked when the buf item's
* IOP_UNLOCK() routine is called. The buffer must already be locked
* and associated with the given transaction.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ddd2c5d1b85..9d376be0ea3 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -584,113 +584,6 @@ xfs_readlink(
}
/*
- * xfs_fsync
- *
- * This is called to sync the inode and its data out to disk. We need to hold
- * the I/O lock while flushing the data, and the inode lock while flushing the
- * inode. The inode lock CANNOT be held while flushing the data, so acquire
- * after we're done with that.
- */
-int
-xfs_fsync(
- xfs_inode_t *ip)
-{
- xfs_trans_t *tp;
- int error = 0;
- int log_flushed = 0;
-
- xfs_itrace_entry(ip);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return XFS_ERROR(EIO);
-
- /*
- * We always need to make sure that the required inode state is safe on
- * disk. The inode might be clean but we still might need to force the
- * log because of committed transactions that haven't hit the disk yet.
- * Likewise, there could be unflushed non-transactional changes to the
- * inode core that have to go to disk and this requires us to issue
- * a synchronous transaction to capture these changes correctly.
- *
- * This code relies on the assumption that if the update_* fields
- * of the inode are clear and the inode is unpinned then it is clean
- * and no action is required.
- */
- xfs_ilock(ip, XFS_ILOCK_SHARED);
-
- if (!ip->i_update_core) {
- /*
- * Timestamps/size haven't changed since last inode flush or
- * inode transaction commit. That means either nothing got
- * written or a transaction committed which caught the updates.
- * If the latter happened and the transaction hasn't hit the
- * disk yet, the inode will be still be pinned. If it is,
- * force the log.
- */
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip)) {
- if (ip->i_itemp->ili_last_lsn) {
- error = _xfs_log_force_lsn(ip->i_mount,
- ip->i_itemp->ili_last_lsn,
- XFS_LOG_SYNC, &log_flushed);
- } else {
- error = _xfs_log_force(ip->i_mount,
- XFS_LOG_SYNC, &log_flushed);
- }
- }
- } else {
- /*
- * Kick off a transaction to log the inode core to get the
- * updates. The sync transaction will also force the log.
- */
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0,
- XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- /*
- * Note - it's possible that we might have pushed ourselves out
- * of the way during trans_reserve which would flush the inode.
- * But there's no guarantee that the inode buffer has actually
- * gone out yet (it's delwri). Plus the buffer could be pinned
- * anyway if it's part of an inode in another recent
- * transaction. So we play it safe and fire off the
- * transaction anyway.
- */
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
- error = _xfs_trans_commit(tp, 0, &log_flushed);
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
- /*
- * If the log write didn't issue an ordered tag we need
- * to flush the disk cache for the data device now.
- */
- if (!log_flushed)
- xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
-
- /*
- * If this inode is on the RT dev we need to flush that
- * cache as well.
- */
- if (XFS_IS_REALTIME_INODE(ip))
- xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
- }
-
- return error;
-}
-
-/*
* Flags for xfs_free_eofblocks
*/
#define XFS_FREE_EOF_TRYLOCK (1<<0)
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 774f40729ca..d8dfa8d0dad 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -21,7 +21,6 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_fsync(struct xfs_inode *ip);
int xfs_release(struct xfs_inode *ip);
int xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
@@ -50,18 +49,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
-ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
- const struct iovec *iovp, unsigned int segs,
- loff_t *offset, int ioflags);
-ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp,
- loff_t *ppos, struct pipe_inode_info *pipe, size_t count,
- int flags, int ioflags);
-ssize_t xfs_splice_write(struct xfs_inode *ip,
- struct pipe_inode_info *pipe, struct file *outfilp,
- loff_t *ppos, size_t count, int flags, int ioflags);
-ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb,
- const struct iovec *iovp, unsigned int nsegs,
- loff_t *offset, int ioflags);
int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int flags, struct xfs_iomap *iomapp, int *niomaps);
void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
@@ -72,4 +59,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
xfs_off_t last, uint64_t flags, int fiopt);
int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
+int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+
#endif /* _XFS_VNODEOPS_H */